Major refactoring: UUID-based storage, streaming architecture, and CLI improvements
This commit represents a significant architectural overhaul of vaultik.

Database Schema Changes:
- Switch files table to use UUID primary keys instead of path-based keys
- Add UUID primary keys to blobs table for immediate chunk association
- Update all foreign key relationships to use UUIDs
- Add comprehensive schema documentation in DATAMODEL.md
- Add SQLite busy timeout handling for concurrent operations (see the busy-timeout sketch after this message)

Streaming and Performance Improvements:
- Implement true streaming blob packing without intermediate storage
- Add streaming chunk processing to reduce memory usage
- Improve progress reporting with real-time metrics
- Add upload metrics tracking in new uploads table

CLI Refactoring:
- Restructure CLI to use subcommands: snapshot create/list/purge/verify
- Add store info command for S3 configuration display
- Add custom duration parser supporting days/weeks/months/years (see the parser sketch after this message)
- Remove old backup.go in favor of enhanced snapshot.go
- Add --cron flag for silent operation

Configuration Changes:
- Remove unused index_prefix configuration option
- Add support for snapshot pruning retention policies
- Improve configuration validation and error messages

Testing Improvements:
- Add comprehensive repository tests with edge cases
- Add cascade delete debugging tests
- Fix concurrent operation tests to use SQLite busy timeout
- Remove tolerance for SQLITE_BUSY errors in tests

Documentation:
- Add MIT LICENSE file
- Update README with new command structure
- Add comprehensive DATAMODEL.md explaining database schema
- Update DESIGN.md with UUID-based architecture

Other Changes:
- Add test-config.yml for testing
- Update Makefile with better test output formatting
- Fix various race conditions in concurrent operations
- Improve error handling throughout
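The SQLite busy-timeout change itself is not shown in this diff, so the following is only a minimal sketch of the general technique, assuming a database/sql setup with the pure-Go modernc.org/sqlite driver; the driver choice and the openWithBusyTimeout helper are illustrative assumptions, not vaultik's actual code.

package main

import (
	"database/sql"
	"log"

	_ "modernc.org/sqlite" // assumed driver; vaultik's actual driver is not shown in this diff
)

// openWithBusyTimeout is a hypothetical helper: it opens a SQLite database
// and sets busy_timeout so concurrent writers wait (here up to 5 seconds)
// instead of failing immediately with SQLITE_BUSY when the database is locked.
func openWithBusyTimeout(path string) (*sql.DB, error) {
	db, err := sql.Open("sqlite", path)
	if err != nil {
		return nil, err
	}
	if _, err := db.Exec("PRAGMA busy_timeout = 5000"); err != nil {
		_ = db.Close()
		return nil, err
	}
	return db, nil
}

func main() {
	db, err := openWithBusyTimeout("vaultik.db")
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()
}

Likewise, the custom duration parser is only named above; a plausible sketch of a parser accepting d/w/m/y suffixes could look like the following, where the ParseRetention name and the 30-day month / 365-day year approximations are assumptions rather than vaultik's implementation.

package main

import (
	"fmt"
	"strconv"
	"strings"
	"time"
)

// ParseRetention (hypothetical name) parses strings such as "7d", "4w",
// "6m" or "1y" into a time.Duration; time.ParseDuration stops at hours,
// which is why retention policies need a small custom parser like this.
func ParseRetention(s string) (time.Duration, error) {
	s = strings.TrimSpace(s)
	if len(s) < 2 {
		return 0, fmt.Errorf("invalid duration %q", s)
	}
	n, err := strconv.Atoi(s[:len(s)-1])
	if err != nil {
		return 0, fmt.Errorf("invalid duration %q: %w", s, err)
	}
	day := 24 * time.Hour
	switch s[len(s)-1] {
	case 'd':
		return time.Duration(n) * day, nil
	case 'w':
		return time.Duration(n) * 7 * day, nil
	case 'm':
		return time.Duration(n) * 30 * day, nil // approximate month
	case 'y':
		return time.Duration(n) * 365 * day, nil // approximate year
	default:
		return 0, fmt.Errorf("unknown unit %q in %q", s[len(s)-1], s)
	}
}

func main() {
	d, _ := ParseRetention("2w")
	fmt.Println(d) // 336h0m0s
}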
internal/database/repository_edge_cases_test.go · 543 lines added · Normal file

@@ -0,0 +1,543 @@
package database

import (
	"context"
	"fmt"
	"strings"
	"testing"
	"time"
)

// TestFileRepositoryEdgeCases tests edge cases for file repository
func TestFileRepositoryEdgeCases(t *testing.T) {
	db, cleanup := setupTestDB(t)
	defer cleanup()

	ctx := context.Background()
	repo := NewFileRepository(db)

	tests := []struct {
		name    string
		file    *File
		wantErr bool
		errMsg  string
	}{
		{
			name: "empty path",
			file: &File{
				Path:  "",
				MTime: time.Now(),
				CTime: time.Now(),
				Size:  1024,
				Mode:  0644,
				UID:   1000,
				GID:   1000,
			},
			wantErr: false, // Empty strings are allowed, only NULL is not allowed
		},
		{
			name: "very long path",
			file: &File{
				Path:  "/" + strings.Repeat("a", 4096),
				MTime: time.Now(),
				CTime: time.Now(),
				Size:  1024,
				Mode:  0644,
				UID:   1000,
				GID:   1000,
			},
			wantErr: false,
		},
		{
			name: "path with special characters",
			file: &File{
				Path:  "/test/file with spaces and 特殊文字.txt",
				MTime: time.Now(),
				CTime: time.Now(),
				Size:  1024,
				Mode:  0644,
				UID:   1000,
				GID:   1000,
			},
			wantErr: false,
		},
		{
			name: "zero size file",
			file: &File{
				Path:  "/empty.txt",
				MTime: time.Now(),
				CTime: time.Now(),
				Size:  0,
				Mode:  0644,
				UID:   1000,
				GID:   1000,
			},
			wantErr: false,
		},
		{
			name: "symlink with target",
			file: &File{
				Path:       "/link",
				MTime:      time.Now(),
				CTime:      time.Now(),
				Size:       0,
				Mode:       0777 | 0120000, // symlink mode
				UID:        1000,
				GID:        1000,
				LinkTarget: "/target",
			},
			wantErr: false,
		},
	}

	for i, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			// Add a unique suffix to paths to avoid UNIQUE constraint violations
			if tt.file.Path != "" {
				tt.file.Path = fmt.Sprintf("%s_%d_%d", tt.file.Path, i, time.Now().UnixNano())
			}

			err := repo.Create(ctx, nil, tt.file)
			if (err != nil) != tt.wantErr {
				t.Errorf("Create() error = %v, wantErr %v", err, tt.wantErr)
			}
			if err != nil && tt.errMsg != "" && !strings.Contains(err.Error(), tt.errMsg) {
				t.Errorf("Create() error = %v, want error containing %q", err, tt.errMsg)
			}
		})
	}
}

// TestDuplicateHandling tests handling of duplicate entries
func TestDuplicateHandling(t *testing.T) {
	db, cleanup := setupTestDB(t)
	defer cleanup()

	ctx := context.Background()
	repos := NewRepositories(db)

	// Test duplicate file paths - Create uses UPSERT logic
	t.Run("duplicate file paths", func(t *testing.T) {
		file1 := &File{
			Path:  "/duplicate.txt",
			MTime: time.Now(),
			CTime: time.Now(),
			Size:  1024,
			Mode:  0644,
			UID:   1000,
			GID:   1000,
		}
		file2 := &File{
			Path:  "/duplicate.txt", // Same path
			MTime: time.Now().Add(time.Hour),
			CTime: time.Now().Add(time.Hour),
			Size:  2048,
			Mode:  0644,
			UID:   1000,
			GID:   1000,
		}

		err := repos.Files.Create(ctx, nil, file1)
		if err != nil {
			t.Fatalf("failed to create file1: %v", err)
		}
		originalID := file1.ID

		// Create with same path should update the existing record (UPSERT behavior)
		err = repos.Files.Create(ctx, nil, file2)
		if err != nil {
			t.Fatalf("failed to create file2: %v", err)
		}

		// Verify the file was updated, not duplicated
		retrievedFile, err := repos.Files.GetByPath(ctx, "/duplicate.txt")
		if err != nil {
			t.Fatalf("failed to retrieve file: %v", err)
		}

		// The file should have been updated with file2's data
		if retrievedFile.Size != 2048 {
			t.Errorf("expected size 2048, got %d", retrievedFile.Size)
		}

		// ID might be different due to the UPSERT
		if retrievedFile.ID != file2.ID {
			t.Logf("File ID changed from %s to %s during upsert", originalID, retrievedFile.ID)
		}
	})

	// Test duplicate chunk hashes
	t.Run("duplicate chunk hashes", func(t *testing.T) {
		chunk := &Chunk{
			ChunkHash: "duplicate-chunk",
			SHA256:    "duplicate-sha",
			Size:      1024,
		}

		err := repos.Chunks.Create(ctx, nil, chunk)
		if err != nil {
			t.Fatalf("failed to create chunk: %v", err)
		}

		// Creating the same chunk again should be idempotent (ON CONFLICT DO NOTHING)
		err = repos.Chunks.Create(ctx, nil, chunk)
		if err != nil {
			t.Errorf("duplicate chunk creation should be idempotent, got error: %v", err)
		}
	})

	// Test duplicate file-chunk mappings
	t.Run("duplicate file-chunk mappings", func(t *testing.T) {
		file := &File{
			Path:  "/test-dup-fc.txt",
			MTime: time.Now(),
			CTime: time.Now(),
			Size:  1024,
			Mode:  0644,
			UID:   1000,
			GID:   1000,
		}
		err := repos.Files.Create(ctx, nil, file)
		if err != nil {
			t.Fatal(err)
		}

		chunk := &Chunk{
			ChunkHash: "test-chunk-dup",
			SHA256:    "test-sha-dup",
			Size:      1024,
		}
		err = repos.Chunks.Create(ctx, nil, chunk)
		if err != nil {
			t.Fatal(err)
		}

		fc := &FileChunk{
			FileID:    file.ID,
			Idx:       0,
			ChunkHash: chunk.ChunkHash,
		}

		err = repos.FileChunks.Create(ctx, nil, fc)
		if err != nil {
			t.Fatal(err)
		}

		// Creating the same mapping again should be idempotent
		err = repos.FileChunks.Create(ctx, nil, fc)
		if err != nil {
			t.Error("file-chunk creation should be idempotent")
		}
	})
}

// TestNullHandling tests handling of NULL values
func TestNullHandling(t *testing.T) {
	db, cleanup := setupTestDB(t)
	defer cleanup()

	ctx := context.Background()
	repos := NewRepositories(db)

	// Test file with no link target
	t.Run("file without link target", func(t *testing.T) {
		file := &File{
			Path:       "/regular.txt",
			MTime:      time.Now(),
			CTime:      time.Now(),
			Size:       1024,
			Mode:       0644,
			UID:        1000,
			GID:        1000,
			LinkTarget: "", // Should be stored as NULL
		}

		err := repos.Files.Create(ctx, nil, file)
		if err != nil {
			t.Fatal(err)
		}

		retrieved, err := repos.Files.GetByID(ctx, file.ID)
		if err != nil {
			t.Fatal(err)
		}

		if retrieved.LinkTarget != "" {
			t.Errorf("expected empty link target, got %q", retrieved.LinkTarget)
		}
	})

	// Test snapshot with NULL completed_at
	t.Run("incomplete snapshot", func(t *testing.T) {
		snapshot := &Snapshot{
			ID:          "incomplete-test",
			Hostname:    "test-host",
			StartedAt:   time.Now(),
			CompletedAt: nil, // Should remain NULL until completed
		}

		err := repos.Snapshots.Create(ctx, nil, snapshot)
		if err != nil {
			t.Fatal(err)
		}

		retrieved, err := repos.Snapshots.GetByID(ctx, snapshot.ID)
		if err != nil {
			t.Fatal(err)
		}

		if retrieved.CompletedAt != nil {
			t.Error("expected nil CompletedAt for incomplete snapshot")
		}
	})

	// Test blob with NULL uploaded_ts
	t.Run("blob not uploaded", func(t *testing.T) {
		blob := &Blob{
			ID:         "not-uploaded",
			Hash:       "test-hash",
			CreatedTS:  time.Now(),
			UploadedTS: nil, // Not uploaded yet
		}

		err := repos.Blobs.Create(ctx, nil, blob)
		if err != nil {
			t.Fatal(err)
		}

		retrieved, err := repos.Blobs.GetByID(ctx, blob.ID)
		if err != nil {
			t.Fatal(err)
		}

		if retrieved.UploadedTS != nil {
			t.Error("expected nil UploadedTS for non-uploaded blob")
		}
	})
}

// TestLargeDatasets tests operations with large amounts of data
func TestLargeDatasets(t *testing.T) {
	if testing.Short() {
		t.Skip("skipping large dataset test in short mode")
	}

	db, cleanup := setupTestDB(t)
	defer cleanup()

	ctx := context.Background()
	repos := NewRepositories(db)

	// Create a snapshot
	snapshot := &Snapshot{
		ID:        "large-dataset-test",
		Hostname:  "test-host",
		StartedAt: time.Now(),
	}
	err := repos.Snapshots.Create(ctx, nil, snapshot)
	if err != nil {
		t.Fatal(err)
	}

	// Create many files
	const fileCount = 1000
	fileIDs := make([]string, fileCount)

	t.Run("create many files", func(t *testing.T) {
		start := time.Now()
		for i := 0; i < fileCount; i++ {
			file := &File{
				Path:  fmt.Sprintf("/large/file%05d.txt", i),
				MTime: time.Now(),
				CTime: time.Now(),
				Size:  int64(i * 1024),
				Mode:  0644,
				UID:   uint32(1000 + (i % 10)),
				GID:   uint32(1000 + (i % 10)),
			}
			err := repos.Files.Create(ctx, nil, file)
			if err != nil {
				t.Fatalf("failed to create file %d: %v", i, err)
			}
			fileIDs[i] = file.ID

			// Add half to snapshot
			if i%2 == 0 {
				err = repos.Snapshots.AddFileByID(ctx, nil, snapshot.ID, file.ID)
				if err != nil {
					t.Fatal(err)
				}
			}
		}
		t.Logf("Created %d files in %v", fileCount, time.Since(start))
	})

	// Test ListByPrefix performance
	t.Run("list by prefix performance", func(t *testing.T) {
		start := time.Now()
		files, err := repos.Files.ListByPrefix(ctx, "/large/")
		if err != nil {
			t.Fatal(err)
		}
		if len(files) != fileCount {
			t.Errorf("expected %d files, got %d", fileCount, len(files))
		}
		t.Logf("Listed %d files in %v", len(files), time.Since(start))
	})

	// Test orphaned cleanup performance
	t.Run("orphaned cleanup performance", func(t *testing.T) {
		start := time.Now()
		err := repos.Files.DeleteOrphaned(ctx)
		if err != nil {
			t.Fatal(err)
		}
		t.Logf("Cleaned up orphaned files in %v", time.Since(start))

		// Verify correct number remain
		files, err := repos.Files.ListByPrefix(ctx, "/large/")
		if err != nil {
			t.Fatal(err)
		}
		if len(files) != fileCount/2 {
			t.Errorf("expected %d files after cleanup, got %d", fileCount/2, len(files))
		}
	})
}

// TestErrorPropagation tests that errors are properly propagated
func TestErrorPropagation(t *testing.T) {
	db, cleanup := setupTestDB(t)
	defer cleanup()

	ctx := context.Background()
	repos := NewRepositories(db)

	// Test GetByID with non-existent ID
	t.Run("GetByID non-existent", func(t *testing.T) {
		file, err := repos.Files.GetByID(ctx, "non-existent-uuid")
		if err != nil {
			t.Errorf("GetByID should not return error for non-existent ID, got: %v", err)
		}
		if file != nil {
			t.Error("expected nil file for non-existent ID")
		}
	})

	// Test GetByPath with non-existent path
	t.Run("GetByPath non-existent", func(t *testing.T) {
		file, err := repos.Files.GetByPath(ctx, "/non/existent/path.txt")
		if err != nil {
			t.Errorf("GetByPath should not return error for non-existent path, got: %v", err)
		}
		if file != nil {
			t.Error("expected nil file for non-existent path")
		}
	})

	// Test invalid foreign key reference
	t.Run("invalid foreign key", func(t *testing.T) {
		fc := &FileChunk{
			FileID:    "non-existent-file-id",
			Idx:       0,
			ChunkHash: "some-chunk",
		}
		err := repos.FileChunks.Create(ctx, nil, fc)
		if err == nil {
			// Fatal rather than Error: the err.Error() call below would panic on a nil error
			t.Fatal("expected error for invalid foreign key")
		}
		if !strings.Contains(err.Error(), "FOREIGN KEY") {
			t.Errorf("expected foreign key error, got: %v", err)
		}
	})
}

// TestQueryInjection tests that the system is safe from SQL injection
func TestQueryInjection(t *testing.T) {
	db, cleanup := setupTestDB(t)
	defer cleanup()

	ctx := context.Background()
	repos := NewRepositories(db)

	// Test various injection attempts
	injectionTests := []string{
		"'; DROP TABLE files; --",
		"' OR '1'='1",
		"'; DELETE FROM files WHERE '1'='1'; --",
		`test'); DROP TABLE files; --`,
	}

	for _, injection := range injectionTests {
		t.Run("injection attempt", func(t *testing.T) {
			// Try injection in file path
			file := &File{
				Path:  injection,
				MTime: time.Now(),
				CTime: time.Now(),
				Size:  1024,
				Mode:  0644,
				UID:   1000,
				GID:   1000,
			}
			_ = repos.Files.Create(ctx, nil, file)
			// Should either succeed (treating as normal string) or fail with constraint
			// but should NOT execute the injected SQL

			// Verify tables still exist
			var count int
			err := db.conn.QueryRow("SELECT COUNT(*) FROM files").Scan(&count)
			if err != nil {
				t.Fatal("files table was damaged by injection")
			}
		})
	}
}

// TestTimezoneHandling tests that times are properly handled in UTC
func TestTimezoneHandling(t *testing.T) {
	db, cleanup := setupTestDB(t)
	defer cleanup()

	ctx := context.Background()
	repos := NewRepositories(db)

	// Create file with specific timezone
	loc, err := time.LoadLocation("America/New_York")
	if err != nil {
		t.Skip("timezone not available")
	}

	// Use Truncate to remove sub-second precision since we store as Unix timestamps
	nyTime := time.Now().In(loc).Truncate(time.Second)
	file := &File{
		Path:  "/timezone-test.txt",
		MTime: nyTime,
		CTime: nyTime,
		Size:  1024,
		Mode:  0644,
		UID:   1000,
		GID:   1000,
	}

	err = repos.Files.Create(ctx, nil, file)
	if err != nil {
		t.Fatal(err)
	}

	// Retrieve and verify times are in UTC
	retrieved, err := repos.Files.GetByID(ctx, file.ID)
	if err != nil {
		t.Fatal(err)
	}

	// Check that times are equivalent (same instant)
	if !retrieved.MTime.Equal(nyTime) {
		t.Error("time was not preserved correctly")
	}

	// Check that retrieved time is in UTC
	if retrieved.MTime.Location() != time.UTC {
		t.Error("retrieved time is not in UTC")
	}
}
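Usage note: these tests follow standard go test conventions, so, assuming the module layout implied by the file path above, go test ./internal/database/ -v runs them all, and go test -short ./internal/database/ skips the 1000-file TestLargeDatasets case via testing.Short().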