Batch transactions per blob for improved performance
Previously, each chunk and blob_chunk was inserted in a separate transaction, leading to ~560k+ transactions for large backups. This change batches all database operations per blob:

- Chunks are queued in packer.pendingChunks during file processing
- When a blob finalizes, one transaction inserts all chunks and blob_chunks, and updates the blob record
- The scanner tracks pending chunk hashes to know which files can be flushed
- Files are flushed once all of their chunks are committed to the DB
- The database is consistent after each blob finalize

This reduces the transaction count from O(chunks) to O(blobs); for a 614k-file / 44GB backup that means ~50-100 transactions instead of ~560k.
This commit is contained in:
@@ -50,7 +50,13 @@ type Scanner struct {
|
||||
knownChunks map[string]struct{}
|
||||
knownChunksMu sync.RWMutex
|
||||
|
||||
// Pending chunk hashes - chunks that have been added to packer but not yet committed to DB
|
||||
// When a blob finalizes, the committed chunks are removed from this set
|
||||
pendingChunkHashes map[string]struct{}
|
||||
pendingChunkHashesMu sync.Mutex
|
||||
|
||||
// Pending file data buffer for batch insertion
|
||||
// Files are flushed when all their chunks have been committed to DB
|
||||
pendingFiles []pendingFileData
|
||||
pendingFilesMu sync.Mutex
|
||||
|
||||
@@ -61,11 +67,6 @@ type Scanner struct {
|
||||
scanCtx context.Context
|
||||
}
|
||||
|
||||
const (
	// fileBatchSize bounds how many pending file records are accumulated
	// before a batched database write.
	// NOTE(review): after this change, files are flushed on blob finalize
	// rather than at a size threshold — confirm this constant is still
	// referenced anywhere.
	fileBatchSize = 100
)
|
||||
|
||||
// ScannerConfig contains configuration for the scanner
|
||||
type ScannerConfig struct {
|
||||
FS afero.Fs
|
||||
@@ -120,15 +121,16 @@ func NewScanner(cfg ScannerConfig) *Scanner {
|
||||
}
|
||||
|
||||
return &Scanner{
|
||||
fs: cfg.FS,
|
||||
chunker: chunker.NewChunker(cfg.ChunkSize),
|
||||
packer: packer,
|
||||
repos: cfg.Repositories,
|
||||
storage: cfg.Storage,
|
||||
maxBlobSize: cfg.MaxBlobSize,
|
||||
compressionLevel: cfg.CompressionLevel,
|
||||
ageRecipient: strings.Join(cfg.AgeRecipients, ","),
|
||||
progress: progress,
|
||||
fs: cfg.FS,
|
||||
chunker: chunker.NewChunker(cfg.ChunkSize),
|
||||
packer: packer,
|
||||
repos: cfg.Repositories,
|
||||
storage: cfg.Storage,
|
||||
maxBlobSize: cfg.MaxBlobSize,
|
||||
compressionLevel: cfg.CompressionLevel,
|
||||
ageRecipient: strings.Join(cfg.AgeRecipients, ","),
|
||||
progress: progress,
|
||||
pendingChunkHashes: make(map[string]struct{}),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -303,17 +305,37 @@ func (s *Scanner) addKnownChunk(hash string) {
|
||||
s.knownChunksMu.Unlock()
|
||||
}
|
||||
|
||||
// addPendingFile adds a file to the pending buffer and flushes if needed
|
||||
func (s *Scanner) addPendingFile(ctx context.Context, data pendingFileData) error {
|
||||
// addPendingChunkHash marks a chunk as pending (not yet committed to DB)
|
||||
func (s *Scanner) addPendingChunkHash(hash string) {
|
||||
s.pendingChunkHashesMu.Lock()
|
||||
s.pendingChunkHashes[hash] = struct{}{}
|
||||
s.pendingChunkHashesMu.Unlock()
|
||||
}
|
||||
|
||||
// removePendingChunkHashes removes committed chunk hashes from the pending set
|
||||
func (s *Scanner) removePendingChunkHashes(hashes []string) {
|
||||
s.pendingChunkHashesMu.Lock()
|
||||
for _, hash := range hashes {
|
||||
delete(s.pendingChunkHashes, hash)
|
||||
}
|
||||
s.pendingChunkHashesMu.Unlock()
|
||||
}
|
||||
|
||||
// isChunkPending returns true if the chunk is still pending (not yet committed to DB)
|
||||
func (s *Scanner) isChunkPending(hash string) bool {
|
||||
s.pendingChunkHashesMu.Lock()
|
||||
_, pending := s.pendingChunkHashes[hash]
|
||||
s.pendingChunkHashesMu.Unlock()
|
||||
return pending
|
||||
}
|
||||
|
||||
// addPendingFile adds a file to the pending buffer
|
||||
// Files are NOT auto-flushed here - they are flushed when their chunks are committed
|
||||
// (in handleBlobReady after blob finalize)
|
||||
func (s *Scanner) addPendingFile(_ context.Context, data pendingFileData) {
|
||||
s.pendingFilesMu.Lock()
|
||||
s.pendingFiles = append(s.pendingFiles, data)
|
||||
needsFlush := len(s.pendingFiles) >= fileBatchSize
|
||||
s.pendingFilesMu.Unlock()
|
||||
|
||||
if needsFlush {
|
||||
return s.flushPendingFiles(ctx)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// flushPendingFiles writes all pending files to the database in a single transaction
|
||||
@@ -370,6 +392,80 @@ func (s *Scanner) flushAllPending(ctx context.Context) error {
|
||||
return s.flushPendingFiles(ctx)
|
||||
}
|
||||
|
||||
// flushCompletedPendingFiles flushes only files whose chunks are all committed to DB
|
||||
// Files with pending chunks are kept in the queue for later flushing
|
||||
func (s *Scanner) flushCompletedPendingFiles(ctx context.Context) error {
|
||||
s.pendingFilesMu.Lock()
|
||||
|
||||
// Separate files into complete (can flush) and incomplete (keep pending)
|
||||
var canFlush []pendingFileData
|
||||
var stillPending []pendingFileData
|
||||
|
||||
for _, data := range s.pendingFiles {
|
||||
allChunksCommitted := true
|
||||
for _, fc := range data.fileChunks {
|
||||
if s.isChunkPending(fc.ChunkHash) {
|
||||
allChunksCommitted = false
|
||||
break
|
||||
}
|
||||
}
|
||||
if allChunksCommitted {
|
||||
canFlush = append(canFlush, data)
|
||||
} else {
|
||||
stillPending = append(stillPending, data)
|
||||
}
|
||||
}
|
||||
|
||||
s.pendingFiles = stillPending
|
||||
s.pendingFilesMu.Unlock()
|
||||
|
||||
if len(canFlush) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
log.Debug("Flushing completed files after blob finalize",
|
||||
"files_to_flush", len(canFlush),
|
||||
"files_still_pending", len(stillPending))
|
||||
|
||||
// Flush the complete files
|
||||
return s.repos.WithTx(ctx, func(txCtx context.Context, tx *sql.Tx) error {
|
||||
for _, data := range canFlush {
|
||||
// Create or update the file record
|
||||
if err := s.repos.Files.Create(txCtx, tx, data.file); err != nil {
|
||||
return fmt.Errorf("creating file record: %w", err)
|
||||
}
|
||||
|
||||
// Delete any existing file_chunks and chunk_files for this file
|
||||
if err := s.repos.FileChunks.DeleteByFileID(txCtx, tx, data.file.ID); err != nil {
|
||||
return fmt.Errorf("deleting old file chunks: %w", err)
|
||||
}
|
||||
if err := s.repos.ChunkFiles.DeleteByFileID(txCtx, tx, data.file.ID); err != nil {
|
||||
return fmt.Errorf("deleting old chunk files: %w", err)
|
||||
}
|
||||
|
||||
// Create file-chunk mappings
|
||||
for i := range data.fileChunks {
|
||||
if err := s.repos.FileChunks.Create(txCtx, tx, &data.fileChunks[i]); err != nil {
|
||||
return fmt.Errorf("creating file chunk: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
// Create chunk-file mappings
|
||||
for i := range data.chunkFiles {
|
||||
if err := s.repos.ChunkFiles.Create(txCtx, tx, &data.chunkFiles[i]); err != nil {
|
||||
return fmt.Errorf("creating chunk file: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
// Add file to snapshot
|
||||
if err := s.repos.Snapshots.AddFileByID(txCtx, tx, s.snapshotID, data.file.ID); err != nil {
|
||||
return fmt.Errorf("adding file to snapshot: %w", err)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
})
|
||||
}
|
||||
|
||||
// ScanPhaseResult contains the results of the scan phase
|
||||
type ScanPhaseResult struct {
|
||||
FilesToProcess []*FileToProcess
|
||||
@@ -677,12 +773,8 @@ func (s *Scanner) processPhase(ctx context.Context, filesToProcess []*FileToProc
|
||||
}
|
||||
}
|
||||
|
||||
// Flush any remaining pending chunks and files to database
|
||||
if err := s.flushAllPending(ctx); err != nil {
|
||||
return fmt.Errorf("flushing pending database operations: %w", err)
|
||||
}
|
||||
|
||||
// Final flush (outside any transaction)
|
||||
// Final packer flush first - this commits remaining chunks to DB
|
||||
// and handleBlobReady will flush files whose chunks are now committed
|
||||
s.packerMu.Lock()
|
||||
if err := s.packer.Flush(); err != nil {
|
||||
s.packerMu.Unlock()
|
||||
@@ -690,6 +782,12 @@ func (s *Scanner) processPhase(ctx context.Context, filesToProcess []*FileToProc
|
||||
}
|
||||
s.packerMu.Unlock()
|
||||
|
||||
// Flush any remaining pending files (e.g., files with only pre-existing chunks
|
||||
// that didn't trigger a blob finalize)
|
||||
if err := s.flushAllPending(ctx); err != nil {
|
||||
return fmt.Errorf("flushing remaining pending files: %w", err)
|
||||
}
|
||||
|
||||
// If no storage configured, store any remaining blobs locally
|
||||
if s.storage == nil {
|
||||
blobs := s.packer.GetFinishedBlobs()
|
||||
@@ -836,7 +934,20 @@ func (s *Scanner) handleBlobReady(blobWithReader *blob.BlobWithReader) error {
|
||||
}
|
||||
}
|
||||
|
||||
return err
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Chunks from this blob are now committed to DB - remove from pending set
|
||||
s.removePendingChunkHashes(blobWithReader.InsertedChunkHashes)
|
||||
|
||||
// Flush files whose chunks are now all committed
|
||||
// This maintains database consistency after each blob
|
||||
if err := s.flushCompletedPendingFiles(dbCtx); err != nil {
|
||||
return fmt.Errorf("flushing completed files: %w", err)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// processFileStreaming processes a file by streaming chunks directly to the packer
|
||||
@@ -876,21 +987,14 @@ func (s *Scanner) processFileStreaming(ctx context.Context, fileToProcess *FileT
|
||||
// Check if chunk already exists (fast in-memory lookup)
|
||||
chunkExists := s.chunkExists(chunk.Hash)
|
||||
|
||||
// Store chunk in database if new (must happen before packer.AddChunk
|
||||
// because packer creates blob_chunk entries that reference chunks)
|
||||
// Queue new chunks for batch insert when blob finalizes
|
||||
// This dramatically reduces transaction overhead
|
||||
if !chunkExists {
|
||||
err := s.repos.WithTx(ctx, func(txCtx context.Context, tx *sql.Tx) error {
|
||||
dbChunk := &database.Chunk{
|
||||
ChunkHash: chunk.Hash,
|
||||
Size: chunk.Size,
|
||||
}
|
||||
return s.repos.Chunks.Create(txCtx, tx, dbChunk)
|
||||
})
|
||||
if err != nil {
|
||||
return fmt.Errorf("creating chunk: %w", err)
|
||||
}
|
||||
// Add to in-memory cache for fast duplicate detection
|
||||
s.packer.AddPendingChunk(chunk.Hash, chunk.Size)
|
||||
// Add to in-memory cache immediately for fast duplicate detection
|
||||
s.addKnownChunk(chunk.Hash)
|
||||
// Track as pending until blob finalizes and commits to DB
|
||||
s.addPendingChunkHash(chunk.Hash)
|
||||
}
|
||||
|
||||
// Track file chunk association for later storage
|
||||
@@ -985,11 +1089,13 @@ func (s *Scanner) processFileStreaming(ctx context.Context, fileToProcess *FileT
|
||||
}
|
||||
|
||||
// Queue file for batch insertion
|
||||
return s.addPendingFile(ctx, pendingFileData{
|
||||
// Files will be flushed when their chunks are committed (after blob finalize)
|
||||
s.addPendingFile(ctx, pendingFileData{
|
||||
file: fileToProcess.File,
|
||||
fileChunks: fileChunks,
|
||||
chunkFiles: chunkFiles,
|
||||
})
|
||||
return nil
|
||||
}
|
||||
|
||||
// GetProgress returns the progress reporter for this scanner
|
||||
|
||||
Reference in New Issue
Block a user