Batch transactions per blob for improved performance

Previously, each chunk and blob_chunk row was inserted in its own
transaction, which produced over 560k transactions for large backups.
This change batches all database operations per blob:

- Chunks are queued in packer.pendingChunks during file processing
- When a blob finalizes, a single transaction inserts all chunks and
  blob_chunks and updates the blob record
- Scanner tracks pending chunk hashes to know which files can be flushed
- Files are flushed when all their chunks are committed to DB (see the
  sketch after this message)
- Database is consistent after each blob finalize

This reduces the transaction count from O(chunks) to O(blobs): for a
614k-file / 44GB backup that means roughly 50-100 transactions instead
of ~560k, one per blob rather than one per chunk insert.
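
The scanner's flush tracking is not shown in the diff below; here is a
minimal sketch of the idea, with hypothetical names (flushTracker, Track,
OnBlobCommitted, the pending map) standing in for the actual scanner types:

package scanner // hypothetical package, for illustration

import "sync"

// flushTracker maps each file to the set of its chunk hashes that have
// not yet been committed to the database.
type flushTracker struct {
	mu      sync.Mutex
	pending map[string]map[string]bool // file path -> pending chunk hashes
}

// Track records that a file is waiting on the given chunk hash.
func (t *flushTracker) Track(path, hash string) {
	t.mu.Lock()
	defer t.mu.Unlock()
	if t.pending[path] == nil {
		t.pending[path] = make(map[string]bool)
	}
	t.pending[path][hash] = true
}

// OnBlobCommitted would be called with InsertedChunkHashes after a blob's
// transaction commits; it returns the files whose chunks are now all in
// the DB and which can therefore be flushed.
func (t *flushTracker) OnBlobCommitted(hashes []string) []string {
	t.mu.Lock()
	defer t.mu.Unlock()
	committed := make(map[string]bool, len(hashes))
	for _, h := range hashes {
		committed[h] = true
	}
	var flushable []string
	for path, chunks := range t.pending {
		for h := range chunks {
			if committed[h] {
				delete(chunks, h)
			}
		}
		if len(chunks) == 0 {
			flushable = append(flushable, path)
			delete(t.pending, path)
		}
	}
	return flushable
}

Whatever the real structure looks like, the invariant is the one stated
above: a file is flushed only after every one of its chunks has been
committed in some blob's transaction.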
2025-12-23 19:07:26 +07:00
parent f2c120f026
commit 05286bed01
2 changed files with 218 additions and 67 deletions


@@ -47,6 +47,12 @@ type PackerConfig struct {
 	Fs afero.Fs // Filesystem for temporary files
 }
 
+// PendingChunk represents a chunk waiting to be inserted into the database.
+type PendingChunk struct {
+	Hash string
+	Size int64
+}
+
 // Packer accumulates chunks and packs them into blobs.
 // It handles compression, encryption, and coordination with the database
 // to track blob metadata. Packer is thread-safe.
@@ -64,6 +70,9 @@ type Packer struct {
 	// Current blob being packed
 	currentBlob   *blobInProgress
 	finishedBlobs []*FinishedBlob // Only used if no handler provided
+
+	// Pending chunks to be inserted when blob finalizes
+	pendingChunks []PendingChunk
 }
 
 // blobInProgress represents a blob being assembled
@@ -114,8 +123,9 @@ type BlobChunkRef struct {
 // BlobWithReader wraps a FinishedBlob with its data reader
 type BlobWithReader struct {
 	*FinishedBlob
-	Reader   io.ReadSeeker
-	TempFile afero.File // Optional, only set for disk-based blobs
+	Reader              io.ReadSeeker
+	TempFile            afero.File // Optional, only set for disk-based blobs
+	InsertedChunkHashes []string   // Chunk hashes that were inserted to DB with this blob
 }
 
 // NewPacker creates a new blob packer that accumulates chunks into blobs.
@@ -152,6 +162,15 @@ func (p *Packer) SetBlobHandler(handler BlobHandler) {
 	p.blobHandler = handler
 }
 
+// AddPendingChunk queues a chunk to be inserted into the database when the
+// current blob is finalized. This batches chunk inserts to reduce transaction
+// overhead. Thread-safe.
+func (p *Packer) AddPendingChunk(hash string, size int64) {
+	p.mu.Lock()
+	defer p.mu.Unlock()
+	p.pendingChunks = append(p.pendingChunks, PendingChunk{Hash: hash, Size: size})
+}
+
 // AddChunk adds a chunk to the current blob being packed.
 // If adding the chunk would exceed MaxBlobSize, returns ErrBlobSizeLimitExceeded.
 // In this case, the caller should finalize the current blob and retry.
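
How the two entry points combine is implied rather than shown in this
diff; a sketch of a plausible call site, assuming a ChunkRef with a Data
field (hypothetical, as are packNewChunk and its arguments):

// Hypothetical caller for a newly seen chunk; only AddPendingChunk and
// AddChunk are real API from this diff, the rest is illustrative.
func packNewChunk(p *Packer, hash string, data []byte) error {
	// Queue the chunks-table row; it is written when the current blob
	// finalizes, inside that blob's single transaction.
	p.AddPendingChunk(hash, int64(len(data)))
	// Pack the bytes; if this returns ErrBlobSizeLimitExceeded, the
	// caller finalizes the current blob and retries, per AddChunk's
	// doc comment.
	return p.AddChunk(&ChunkRef{Hash: hash, Data: data})
}

Presumably the queueing is kept separate from AddChunk so that chunks
already present in the database can skip the queue entirely.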
@@ -314,23 +333,9 @@ func (p *Packer) addChunkToCurrentBlob(chunk *ChunkRef) error {
 	p.currentBlob.chunks = append(p.currentBlob.chunks, chunkInfo)
 	p.currentBlob.chunkSet[chunk.Hash] = true
 
-	// Store blob-chunk association in database immediately
-	if p.repos != nil {
-		blobChunk := &database.BlobChunk{
-			BlobID:    p.currentBlob.id,
-			ChunkHash: chunk.Hash,
-			Offset:    offset,
-			Length:    chunkSize,
-		}
-		err := p.repos.WithTx(context.Background(), func(ctx context.Context, tx *sql.Tx) error {
-			return p.repos.BlobChunks.Create(ctx, tx, blobChunk)
-		})
-		if err != nil {
-			log.Error("Failed to store blob-chunk association in database", "error", err,
-				"blob_id", p.currentBlob.id, "chunk_hash", chunk.Hash)
-			// Continue anyway - we can reconstruct this later if needed
-		}
-	}
+	// Note: blob_chunk records are inserted in batch when blob is finalized
+	// to reduce transaction overhead. The chunk info is already stored in
+	// p.currentBlob.chunks for later insertion.
 
 	// Update total size
 	p.currentBlob.size += chunkSize
@@ -392,16 +397,49 @@ func (p *Packer) finalizeCurrentBlob() error {
 		})
 	}
 
-	// Update blob record in database with hash and sizes
+	// Get pending chunks (will be inserted to DB and reported to handler)
+	chunksToInsert := p.pendingChunks
+	p.pendingChunks = nil // Clear pending list
+
+	// Insert pending chunks, blob_chunks, and update blob in a single transaction
 	if p.repos != nil {
 		err := p.repos.WithTx(context.Background(), func(ctx context.Context, tx *sql.Tx) error {
+			// First insert all pending chunks (required for blob_chunks FK)
+			for _, chunk := range chunksToInsert {
+				dbChunk := &database.Chunk{
+					ChunkHash: chunk.Hash,
+					Size:      chunk.Size,
+				}
+				if err := p.repos.Chunks.Create(ctx, tx, dbChunk); err != nil {
+					return fmt.Errorf("creating chunk: %w", err)
+				}
+			}
+
+			// Insert all blob_chunk records in batch
+			for _, chunk := range p.currentBlob.chunks {
+				blobChunk := &database.BlobChunk{
+					BlobID:    p.currentBlob.id,
+					ChunkHash: chunk.Hash,
+					Offset:    chunk.Offset,
+					Length:    chunk.Size,
+				}
+				if err := p.repos.BlobChunks.Create(ctx, tx, blobChunk); err != nil {
+					return fmt.Errorf("creating blob_chunk: %w", err)
+				}
+			}
+
+			// Update blob record with final hash and sizes
 			return p.repos.Blobs.UpdateFinished(ctx, tx, p.currentBlob.id, blobHash,
 				p.currentBlob.size, finalSize)
 		})
 		if err != nil {
 			p.cleanupTempFile()
-			return fmt.Errorf("updating blob record: %w", err)
+			return fmt.Errorf("finalizing blob transaction: %w", err)
 		}
+
+		log.Debug("Committed blob transaction",
+			"chunks_inserted", len(chunksToInsert),
+			"blob_chunks_inserted", len(p.currentBlob.chunks))
 	}
 
 	// Create finished blob
@@ -424,6 +462,12 @@ func (p *Packer) finalizeCurrentBlob() error {
"ratio", fmt.Sprintf("%.2f", compressionRatio),
"duration", time.Since(p.currentBlob.startTime))
// Collect inserted chunk hashes for the scanner to track
var insertedChunkHashes []string
for _, chunk := range chunksToInsert {
insertedChunkHashes = append(insertedChunkHashes, chunk.Hash)
}
// Call blob handler if set
if p.blobHandler != nil {
// Reset file position for handler
@@ -434,9 +478,10 @@ func (p *Packer) finalizeCurrentBlob() error {
 
 		// Create a blob reader that includes the data stream
 		blobWithReader := &BlobWithReader{
-			FinishedBlob: finished,
-			Reader:       p.currentBlob.tempFile,
-			TempFile:     p.currentBlob.tempFile,
+			FinishedBlob:        finished,
+			Reader:              p.currentBlob.tempFile,
+			TempFile:            p.currentBlob.tempFile,
+			InsertedChunkHashes: insertedChunkHashes,
 		}
 
 		if err := p.blobHandler(blobWithReader); err != nil {
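
The diff cuts off inside the handler invocation. For completeness, a
sketch of a BlobHandler that consumes InsertedChunkHashes and drives the
flushTracker sketched earlier (uploadBlob, tracker, and markFileFlushed
are illustrative stand-ins; only SetBlobHandler and BlobWithReader come
from this diff):

p.SetBlobHandler(func(b *BlobWithReader) error {
	// Upload the packed blob data; the packer has already reset the
	// reader position before invoking the handler.
	if err := uploadBlob(b.Reader); err != nil {
		return err
	}
	// These chunk rows were committed in the blob's transaction, so any
	// file whose chunks are now all in the DB can be flushed.
	for _, path := range tracker.OnBlobCommitted(b.InsertedChunkHashes) {
		markFileFlushed(path)
	}
	return nil
})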