Batch transactions per blob for improved performance
Previously, each chunk and blob_chunk was inserted in a separate transaction, leading to ~560k+ transactions for large backups. This change batches all database operations per blob:

- Chunks are queued in packer.pendingChunks during file processing
- When a blob finalizes, one transaction inserts all chunks and blob_chunks, and updates the blob record
- The scanner tracks pending chunk hashes to know which files can be flushed
- Files are flushed once all of their chunks are committed to the DB
- The database is consistent after each blob finalize

This reduces the transaction count from O(chunks) to O(blobs); for a 614k-file / 44GB backup that means ~50-100 transactions instead of ~560k.
This commit is contained in:
@@ -50,7 +50,13 @@ type Scanner struct {
|
||||
knownChunks map[string]struct{}
|
||||
knownChunksMu sync.RWMutex
|
||||
|
||||
// Pending chunk hashes - chunks that have been added to packer but not yet committed to DB
|
||||
// When a blob finalizes, the committed chunks are removed from this set
|
||||
pendingChunkHashes map[string]struct{}
|
||||
pendingChunkHashesMu sync.Mutex
|
||||
|
||||
// Pending file data buffer for batch insertion
|
||||
// Files are flushed when all their chunks have been committed to DB
|
||||
pendingFiles []pendingFileData
|
||||
pendingFilesMu sync.Mutex
|
||||
|
||||
@@ -61,11 +67,6 @@ type Scanner struct {
|
||||
scanCtx context.Context
|
||||
}
|
||||
|
||||
const (
	// fileBatchSize bounds how many pending file records are accumulated
	// before a batched database write.
	// NOTE(review): after this change, files are flushed on blob finalize
	// rather than at a size threshold — confirm this constant is still
	// referenced anywhere.
	fileBatchSize = 100
)
|
||||
|
||||
// ScannerConfig contains configuration for the scanner
|
||||
type ScannerConfig struct {
|
||||
FS afero.Fs
|
||||
@@ -120,15 +121,16 @@ func NewScanner(cfg ScannerConfig) *Scanner {
|
||||
}
|
||||
|
||||
return &Scanner{
|
||||
fs: cfg.FS,
|
||||
chunker: chunker.NewChunker(cfg.ChunkSize),
|
||||
packer: packer,
|
||||
repos: cfg.Repositories,
|
||||
storage: cfg.Storage,
|
||||
maxBlobSize: cfg.MaxBlobSize,
|
||||
compressionLevel: cfg.CompressionLevel,
|
||||
ageRecipient: strings.Join(cfg.AgeRecipients, ","),
|
||||
progress: progress,
|
||||
fs: cfg.FS,
|
||||
chunker: chunker.NewChunker(cfg.ChunkSize),
|
||||
packer: packer,
|
||||
repos: cfg.Repositories,
|
||||
storage: cfg.Storage,
|
||||
maxBlobSize: cfg.MaxBlobSize,
|
||||
compressionLevel: cfg.CompressionLevel,
|
||||
ageRecipient: strings.Join(cfg.AgeRecipients, ","),
|
||||
progress: progress,
|
||||
pendingChunkHashes: make(map[string]struct{}),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -303,17 +305,37 @@ func (s *Scanner) addKnownChunk(hash string) {
|
||||
s.knownChunksMu.Unlock()
|
||||
}
|
||||
|
||||
// addPendingFile adds a file to the pending buffer and flushes if needed
|
||||
func (s *Scanner) addPendingFile(ctx context.Context, data pendingFileData) error {
|
||||
// addPendingChunkHash marks a chunk as pending (not yet committed to DB)
|
||||
func (s *Scanner) addPendingChunkHash(hash string) {
|
||||
s.pendingChunkHashesMu.Lock()
|
||||
s.pendingChunkHashes[hash] = struct{}{}
|
||||
s.pendingChunkHashesMu.Unlock()
|
||||
}
|
||||
|
||||
// removePendingChunkHashes removes committed chunk hashes from the pending set
|
||||
func (s *Scanner) removePendingChunkHashes(hashes []string) {
|
||||
s.pendingChunkHashesMu.Lock()
|
||||
for _, hash := range hashes {
|
||||
delete(s.pendingChunkHashes, hash)
|
||||
}
|
||||
s.pendingChunkHashesMu.Unlock()
|
||||
}
|
||||
|
||||
// isChunkPending returns true if the chunk is still pending (not yet committed to DB)
|
||||
func (s *Scanner) isChunkPending(hash string) bool {
|
||||
s.pendingChunkHashesMu.Lock()
|
||||
_, pending := s.pendingChunkHashes[hash]
|
||||
s.pendingChunkHashesMu.Unlock()
|
||||
return pending
|
||||
}
|
||||
|
||||
// addPendingFile adds a file to the pending buffer
|
||||
// Files are NOT auto-flushed here - they are flushed when their chunks are committed
|
||||
// (in handleBlobReady after blob finalize)
|
||||
func (s *Scanner) addPendingFile(_ context.Context, data pendingFileData) {
|
||||
s.pendingFilesMu.Lock()
|
||||
s.pendingFiles = append(s.pendingFiles, data)
|
||||
needsFlush := len(s.pendingFiles) >= fileBatchSize
|
||||
s.pendingFilesMu.Unlock()
|
||||
|
||||
if needsFlush {
|
||||
return s.flushPendingFiles(ctx)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// flushPendingFiles writes all pending files to the database in a single transaction
|
||||
@@ -370,6 +392,80 @@ func (s *Scanner) flushAllPending(ctx context.Context) error {
|
||||
return s.flushPendingFiles(ctx)
|
||||
}
|
||||
|
||||
// flushCompletedPendingFiles flushes only files whose chunks are all committed to DB
|
||||
// Files with pending chunks are kept in the queue for later flushing
|
||||
func (s *Scanner) flushCompletedPendingFiles(ctx context.Context) error {
|
||||
s.pendingFilesMu.Lock()
|
||||
|
||||
// Separate files into complete (can flush) and incomplete (keep pending)
|
||||
var canFlush []pendingFileData
|
||||
var stillPending []pendingFileData
|
||||
|
||||
for _, data := range s.pendingFiles {
|
||||
allChunksCommitted := true
|
||||
for _, fc := range data.fileChunks {
|
||||
if s.isChunkPending(fc.ChunkHash) {
|
||||
allChunksCommitted = false
|
||||
break
|
||||
}
|
||||
}
|
||||
if allChunksCommitted {
|
||||
canFlush = append(canFlush, data)
|
||||
} else {
|
||||
stillPending = append(stillPending, data)
|
||||
}
|
||||
}
|
||||
|
||||
s.pendingFiles = stillPending
|
||||
s.pendingFilesMu.Unlock()
|
||||
|
||||
if len(canFlush) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
log.Debug("Flushing completed files after blob finalize",
|
||||
"files_to_flush", len(canFlush),
|
||||
"files_still_pending", len(stillPending))
|
||||
|
||||
// Flush the complete files
|
||||
return s.repos.WithTx(ctx, func(txCtx context.Context, tx *sql.Tx) error {
|
||||
for _, data := range canFlush {
|
||||
// Create or update the file record
|
||||
if err := s.repos.Files.Create(txCtx, tx, data.file); err != nil {
|
||||
return fmt.Errorf("creating file record: %w", err)
|
||||
}
|
||||
|
||||
// Delete any existing file_chunks and chunk_files for this file
|
||||
if err := s.repos.FileChunks.DeleteByFileID(txCtx, tx, data.file.ID); err != nil {
|
||||
return fmt.Errorf("deleting old file chunks: %w", err)
|
||||
}
|
||||
if err := s.repos.ChunkFiles.DeleteByFileID(txCtx, tx, data.file.ID); err != nil {
|
||||
return fmt.Errorf("deleting old chunk files: %w", err)
|
||||
}
|
||||
|
||||
// Create file-chunk mappings
|
||||
for i := range data.fileChunks {
|
||||
if err := s.repos.FileChunks.Create(txCtx, tx, &data.fileChunks[i]); err != nil {
|
||||
return fmt.Errorf("creating file chunk: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
// Create chunk-file mappings
|
||||
for i := range data.chunkFiles {
|
||||
if err := s.repos.ChunkFiles.Create(txCtx, tx, &data.chunkFiles[i]); err != nil {
|
||||
return fmt.Errorf("creating chunk file: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
// Add file to snapshot
|
||||
if err := s.repos.Snapshots.AddFileByID(txCtx, tx, s.snapshotID, data.file.ID); err != nil {
|
||||
return fmt.Errorf("adding file to snapshot: %w", err)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
})
|
||||
}
|
||||
|
||||
// ScanPhaseResult contains the results of the scan phase
|
||||
type ScanPhaseResult struct {
|
||||
FilesToProcess []*FileToProcess
|
||||
@@ -677,12 +773,8 @@ func (s *Scanner) processPhase(ctx context.Context, filesToProcess []*FileToProc
|
||||
}
|
||||
}
|
||||
|
||||
// Flush any remaining pending chunks and files to database
|
||||
if err := s.flushAllPending(ctx); err != nil {
|
||||
return fmt.Errorf("flushing pending database operations: %w", err)
|
||||
}
|
||||
|
||||
// Final flush (outside any transaction)
|
||||
// Final packer flush first - this commits remaining chunks to DB
|
||||
// and handleBlobReady will flush files whose chunks are now committed
|
||||
s.packerMu.Lock()
|
||||
if err := s.packer.Flush(); err != nil {
|
||||
s.packerMu.Unlock()
|
||||
@@ -690,6 +782,12 @@ func (s *Scanner) processPhase(ctx context.Context, filesToProcess []*FileToProc
|
||||
}
|
||||
s.packerMu.Unlock()
|
||||
|
||||
// Flush any remaining pending files (e.g., files with only pre-existing chunks
|
||||
// that didn't trigger a blob finalize)
|
||||
if err := s.flushAllPending(ctx); err != nil {
|
||||
return fmt.Errorf("flushing remaining pending files: %w", err)
|
||||
}
|
||||
|
||||
// If no storage configured, store any remaining blobs locally
|
||||
if s.storage == nil {
|
||||
blobs := s.packer.GetFinishedBlobs()
|
||||
@@ -836,7 +934,20 @@ func (s *Scanner) handleBlobReady(blobWithReader *blob.BlobWithReader) error {
|
||||
}
|
||||
}
|
||||
|
||||
return err
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Chunks from this blob are now committed to DB - remove from pending set
|
||||
s.removePendingChunkHashes(blobWithReader.InsertedChunkHashes)
|
||||
|
||||
// Flush files whose chunks are now all committed
|
||||
// This maintains database consistency after each blob
|
||||
if err := s.flushCompletedPendingFiles(dbCtx); err != nil {
|
||||
return fmt.Errorf("flushing completed files: %w", err)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// processFileStreaming processes a file by streaming chunks directly to the packer
|
||||
@@ -876,21 +987,14 @@ func (s *Scanner) processFileStreaming(ctx context.Context, fileToProcess *FileT
|
||||
// Check if chunk already exists (fast in-memory lookup)
|
||||
chunkExists := s.chunkExists(chunk.Hash)
|
||||
|
||||
// Store chunk in database if new (must happen before packer.AddChunk
|
||||
// because packer creates blob_chunk entries that reference chunks)
|
||||
// Queue new chunks for batch insert when blob finalizes
|
||||
// This dramatically reduces transaction overhead
|
||||
if !chunkExists {
|
||||
err := s.repos.WithTx(ctx, func(txCtx context.Context, tx *sql.Tx) error {
|
||||
dbChunk := &database.Chunk{
|
||||
ChunkHash: chunk.Hash,
|
||||
Size: chunk.Size,
|
||||
}
|
||||
return s.repos.Chunks.Create(txCtx, tx, dbChunk)
|
||||
})
|
||||
if err != nil {
|
||||
return fmt.Errorf("creating chunk: %w", err)
|
||||
}
|
||||
// Add to in-memory cache for fast duplicate detection
|
||||
s.packer.AddPendingChunk(chunk.Hash, chunk.Size)
|
||||
// Add to in-memory cache immediately for fast duplicate detection
|
||||
s.addKnownChunk(chunk.Hash)
|
||||
// Track as pending until blob finalizes and commits to DB
|
||||
s.addPendingChunkHash(chunk.Hash)
|
||||
}
|
||||
|
||||
// Track file chunk association for later storage
|
||||
@@ -985,11 +1089,13 @@ func (s *Scanner) processFileStreaming(ctx context.Context, fileToProcess *FileT
|
||||
}
|
||||
|
||||
// Queue file for batch insertion
|
||||
return s.addPendingFile(ctx, pendingFileData{
|
||||
// Files will be flushed when their chunks are committed (after blob finalize)
|
||||
s.addPendingFile(ctx, pendingFileData{
|
||||
file: fileToProcess.File,
|
||||
fileChunks: fileChunks,
|
||||
chunkFiles: chunkFiles,
|
||||
})
|
||||
return nil
|
||||
}
|
||||
|
||||
// GetProgress returns the progress reporter for this scanner
|
||||
|
||||
Reference in New Issue
Block a user