Fix FK constraint errors in batched file insertion

Generate file UUIDs upfront in checkFileInMemory() rather than
deferring to Files.Create(). This ensures file_chunks and chunk_files
records have valid FileID values when constructed during file
processing, before the batch insert transaction.

Root cause: For new files, file.ID was empty when building the
fileChunks and chunkFiles slices. The ID was only generated later
in Files.Create(), but by then the slices already had empty FileID
values, causing FK constraint failures.

Also adds PROCESS.md documenting the snapshot creation lifecycle,
database transactions, and FK dependency ordering.
This commit is contained in:
2025-12-19 19:48:48 +07:00
parent 899448e1da
commit 43a69c2cfb
3 changed files with 692 additions and 61 deletions

View File

@@ -15,6 +15,7 @@ import (
"git.eeqj.de/sneak/vaultik/internal/log"
"git.eeqj.de/sneak/vaultik/internal/storage"
"github.com/dustin/go-humanize"
"github.com/google/uuid"
"github.com/spf13/afero"
)
@@ -25,6 +26,13 @@ type FileToProcess struct {
File *database.File
}
// pendingFileData holds all data needed to commit a file to the database
// as a single unit during a batched flush: the file row itself plus the
// file_chunks and chunk_files association rows built during chunking.
type pendingFileData struct {
// file is the record for the files table; its ID must already be set so
// the association slices below carry a valid FileID (FK requirement).
file *database.File
// fileChunks are the per-index file -> chunk-hash mappings for this file.
fileChunks []database.FileChunk
// chunkFiles are the chunk-hash -> byte-range mappings within this file.
chunkFiles []database.ChunkFile
}
// Scanner scans directories and populates the database with file and chunk information
type Scanner struct {
fs afero.Fs
@@ -42,6 +50,10 @@ type Scanner struct {
knownChunks map[string]struct{}
knownChunksMu sync.RWMutex
// Pending file data buffer for batch insertion
pendingFiles []pendingFileData
pendingFilesMu sync.Mutex
// Mutex for coordinating blob creation
packerMu sync.Mutex // Blocks chunk production during blob creation
@@ -49,6 +61,11 @@ type Scanner struct {
scanCtx context.Context
}
const (
// fileBatchSize is the number of pending files buffered before
// addPendingFile triggers a flush to the database in one transaction.
fileBatchSize = 100
)
// ScannerConfig contains configuration for the scanner
type ScannerConfig struct {
FS afero.Fs
@@ -286,6 +303,73 @@ func (s *Scanner) addKnownChunk(hash string) {
s.knownChunksMu.Unlock()
}
// addPendingFile queues a file (and its chunk associations) for batched
// database insertion. Once the buffer holds fileBatchSize entries, the
// whole batch is written out in a single transaction.
func (s *Scanner) addPendingFile(ctx context.Context, data pendingFileData) error {
	s.pendingFilesMu.Lock()
	s.pendingFiles = append(s.pendingFiles, data)
	full := len(s.pendingFiles) >= fileBatchSize
	s.pendingFilesMu.Unlock()

	// Flush outside the buffer lock; flushPendingFiles briefly re-acquires
	// it to swap the slice out before starting the transaction.
	if !full {
		return nil
	}
	return s.flushPendingFiles(ctx)
}
// flushPendingFiles drains the pending-file buffer and persists every
// entry inside one database transaction: the file row, its refreshed
// chunk associations, and its membership in the current snapshot.
func (s *Scanner) flushPendingFiles(ctx context.Context) error {
	// Swap the buffer out under the lock so producers can keep queuing
	// while the (potentially slow) transaction runs.
	s.pendingFilesMu.Lock()
	batch := s.pendingFiles
	s.pendingFiles = nil
	s.pendingFilesMu.Unlock()

	if len(batch) == 0 {
		return nil
	}

	return s.repos.WithTx(ctx, func(txCtx context.Context, tx *sql.Tx) error {
		for i := range batch {
			entry := &batch[i]

			// The file row must exist before its association rows are
			// inserted (FK ordering).
			if err := s.repos.Files.Create(txCtx, tx, entry.file); err != nil {
				return fmt.Errorf("creating file record: %w", err)
			}

			// Drop stale associations so changed file content does not
			// leave orphaned chunk links behind.
			if err := s.repos.FileChunks.DeleteByFileID(txCtx, tx, entry.file.ID); err != nil {
				return fmt.Errorf("deleting old file chunks: %w", err)
			}
			if err := s.repos.ChunkFiles.DeleteByFileID(txCtx, tx, entry.file.ID); err != nil {
				return fmt.Errorf("deleting old chunk files: %w", err)
			}

			// Recreate the file->chunk and chunk->file mappings.
			for j := range entry.fileChunks {
				if err := s.repos.FileChunks.Create(txCtx, tx, &entry.fileChunks[j]); err != nil {
					return fmt.Errorf("creating file chunk: %w", err)
				}
			}
			for j := range entry.chunkFiles {
				if err := s.repos.ChunkFiles.Create(txCtx, tx, &entry.chunkFiles[j]); err != nil {
					return fmt.Errorf("creating chunk file: %w", err)
				}
			}

			// Record the file as belonging to the current snapshot.
			if err := s.repos.Snapshots.AddFileByID(txCtx, tx, s.snapshotID, entry.file.ID); err != nil {
				return fmt.Errorf("adding file to snapshot: %w", err)
			}
		}
		return nil
	})
}
// flushAllPending writes out any buffered data that has not yet reached a
// batch boundary; call it at the end of processing so no partial batch is
// left unwritten. Only file data is buffered here, so this delegates
// directly to flushPendingFiles.
func (s *Scanner) flushAllPending(ctx context.Context) error {
return s.flushPendingFiles(ctx)
}
// ScanPhaseResult contains the results of the scan phase
type ScanPhaseResult struct {
FilesToProcess []*FileToProcess
@@ -429,8 +513,21 @@ func (s *Scanner) checkFileInMemory(path string, info os.FileInfo, knownFiles ma
gid = stat.Gid()
}
// Create file record
// Check against in-memory map first to get existing ID if available
existingFile, exists := knownFiles[path]
// Create file record with ID set upfront
// For new files, generate UUID immediately so it's available for chunk associations
// For existing files, reuse the existing ID
var fileID string
if exists {
fileID = existingFile.ID
} else {
fileID = uuid.New().String()
}
file := &database.File{
ID: fileID,
Path: path,
MTime: info.ModTime(),
CTime: info.ModTime(), // afero doesn't provide ctime
@@ -440,16 +537,11 @@ func (s *Scanner) checkFileInMemory(path string, info os.FileInfo, knownFiles ma
GID: gid,
}
// Check against in-memory map
existingFile, exists := knownFiles[path]
// New file - needs processing
if !exists {
// New file
return file, true
}
// Reuse existing ID
file.ID = existingFile.ID
// Check if file has changed
if existingFile.Size != file.Size ||
existingFile.MTime.Unix() != file.MTime.Unix() ||
@@ -585,6 +677,11 @@ func (s *Scanner) processPhase(ctx context.Context, filesToProcess []*FileToProc
}
}
// Flush any remaining pending chunks and files to database
if err := s.flushAllPending(ctx); err != nil {
return fmt.Errorf("flushing pending database operations: %w", err)
}
// Final flush (outside any transaction)
s.packerMu.Lock()
if err := s.packer.Flush(); err != nil {
@@ -779,20 +876,18 @@ func (s *Scanner) processFileStreaming(ctx context.Context, fileToProcess *FileT
// Check if chunk already exists (fast in-memory lookup)
chunkExists := s.chunkExists(chunk.Hash)
// Store chunk if new
// Store chunk in database if new (must happen before packer.AddChunk
// because packer creates blob_chunk entries that reference chunks)
if !chunkExists {
err := s.repos.WithTx(ctx, func(txCtx context.Context, tx *sql.Tx) error {
dbChunk := &database.Chunk{
ChunkHash: chunk.Hash,
Size: chunk.Size,
}
if err := s.repos.Chunks.Create(txCtx, tx, dbChunk); err != nil {
return fmt.Errorf("creating chunk: %w", err)
}
return nil
return s.repos.Chunks.Create(txCtx, tx, dbChunk)
})
if err != nil {
return fmt.Errorf("storing chunk: %w", err)
return fmt.Errorf("creating chunk: %w", err)
}
// Add to in-memory cache for fast duplicate detection
s.addKnownChunk(chunk.Hash)
@@ -871,56 +966,30 @@ func (s *Scanner) processFileStreaming(ctx context.Context, fileToProcess *FileT
"file_hash", fileHash,
"chunks", len(chunks))
// Store file record, chunk associations, and snapshot association in database
// This happens AFTER successful chunking to avoid orphaned records on interruption
err = s.repos.WithTx(ctx, func(txCtx context.Context, tx *sql.Tx) error {
// Create or update the file record
// Files.Create uses INSERT OR REPLACE, so it handles both new and changed files
if err := s.repos.Files.Create(txCtx, tx, fileToProcess.File); err != nil {
return fmt.Errorf("creating file record: %w", err)
// Build file data for batch insertion
// Update chunk associations with the file ID
fileChunks := make([]database.FileChunk, len(chunks))
chunkFiles := make([]database.ChunkFile, len(chunks))
for i, ci := range chunks {
fileChunks[i] = database.FileChunk{
FileID: fileToProcess.File.ID,
Idx: ci.fileChunk.Idx,
ChunkHash: ci.fileChunk.ChunkHash,
}
// Delete any existing file_chunks and chunk_files for this file
// This ensures old chunks are no longer associated when file content changes
if err := s.repos.FileChunks.DeleteByFileID(txCtx, tx, fileToProcess.File.ID); err != nil {
return fmt.Errorf("deleting old file chunks: %w", err)
}
if err := s.repos.ChunkFiles.DeleteByFileID(txCtx, tx, fileToProcess.File.ID); err != nil {
return fmt.Errorf("deleting old chunk files: %w", err)
chunkFiles[i] = database.ChunkFile{
ChunkHash: ci.fileChunk.ChunkHash,
FileID: fileToProcess.File.ID,
FileOffset: ci.offset,
Length: ci.size,
}
}
// Update chunk associations with the file ID (now that we have it)
for i := range chunks {
chunks[i].fileChunk.FileID = fileToProcess.File.ID
}
for _, ci := range chunks {
// Create file-chunk mapping
if err := s.repos.FileChunks.Create(txCtx, tx, &ci.fileChunk); err != nil {
return fmt.Errorf("creating file chunk: %w", err)
}
// Create chunk-file mapping
chunkFile := &database.ChunkFile{
ChunkHash: ci.fileChunk.ChunkHash,
FileID: fileToProcess.File.ID,
FileOffset: ci.offset,
Length: ci.size,
}
if err := s.repos.ChunkFiles.Create(txCtx, tx, chunkFile); err != nil {
return fmt.Errorf("creating chunk file: %w", err)
}
}
// Add file to snapshot
if err := s.repos.Snapshots.AddFileByID(txCtx, tx, s.snapshotID, fileToProcess.File.ID); err != nil {
return fmt.Errorf("adding file to snapshot: %w", err)
}
return nil
// Queue file for batch insertion
return s.addPendingFile(ctx, pendingFileData{
file: fileToProcess.File,
fileChunks: fileChunks,
chunkFiles: chunkFiles,
})
return err
}
// GetProgress returns the progress reporter for this scanner