Fix FK constraint errors in batched file insertion

Generate file UUIDs upfront in checkFileInMemory() rather than
deferring to Files.Create(). This ensures file_chunks and chunk_files
records have valid FileID values when constructed during file
processing, before the batch insert transaction.

Root cause: For new files, file.ID was empty when building the
fileChunks and chunkFiles slices. The ID was only generated later
in Files.Create(), but by then the slices already had empty FileID
values, causing FK constraint failures.

Also adds PROCESS.md documenting the snapshot creation lifecycle,
database transactions, and FK dependency ordering.
This commit is contained in:
2025-12-19 19:48:48 +07:00
parent 899448e1da
commit 43a69c2cfb
3 changed files with 692 additions and 61 deletions

View File

@@ -15,6 +15,7 @@ import (
"git.eeqj.de/sneak/vaultik/internal/log"
"git.eeqj.de/sneak/vaultik/internal/storage"
"github.com/dustin/go-humanize"
"github.com/google/uuid"
"github.com/spf13/afero"
)
@@ -25,6 +26,13 @@ type FileToProcess struct {
File *database.File
}
// pendingFileData holds all data needed to commit a file to the database
// as a single unit during a batched flush: the file row itself plus the
// file_chunks and chunk_files association rows built during chunking.
type pendingFileData struct {
// file is the record for the files table; its ID must already be set so
// the association slices below carry a valid FileID (FK requirement).
file *database.File
// fileChunks are the per-index file -> chunk-hash mappings for this file.
fileChunks []database.FileChunk
// chunkFiles are the chunk-hash -> byte-range mappings within this file.
chunkFiles []database.ChunkFile
}
// Scanner scans directories and populates the database with file and chunk information
type Scanner struct {
fs afero.Fs
@@ -42,6 +50,10 @@ type Scanner struct {
knownChunks map[string]struct{}
knownChunksMu sync.RWMutex
// Pending file data buffer for batch insertion
pendingFiles []pendingFileData
pendingFilesMu sync.Mutex
// Mutex for coordinating blob creation
packerMu sync.Mutex // Blocks chunk production during blob creation
@@ -49,6 +61,11 @@ type Scanner struct {
scanCtx context.Context
}
const (
// fileBatchSize is the number of pending files buffered before
// addPendingFile triggers a flush to the database in one transaction.
fileBatchSize = 100
)
// ScannerConfig contains configuration for the scanner
type ScannerConfig struct {
FS afero.Fs
@@ -286,6 +303,73 @@ func (s *Scanner) addKnownChunk(hash string) {
s.knownChunksMu.Unlock()
}
// addPendingFile queues a file (and its chunk associations) for batched
// database insertion. Once the buffer holds fileBatchSize entries, the
// whole batch is written out in a single transaction.
func (s *Scanner) addPendingFile(ctx context.Context, data pendingFileData) error {
	s.pendingFilesMu.Lock()
	s.pendingFiles = append(s.pendingFiles, data)
	full := len(s.pendingFiles) >= fileBatchSize
	s.pendingFilesMu.Unlock()

	// Flush outside the buffer lock; flushPendingFiles briefly re-acquires
	// it to swap the slice out before starting the transaction.
	if !full {
		return nil
	}
	return s.flushPendingFiles(ctx)
}
// flushPendingFiles drains the pending-file buffer and persists every
// entry inside one database transaction: the file row, its refreshed
// chunk associations, and its membership in the current snapshot.
func (s *Scanner) flushPendingFiles(ctx context.Context) error {
	// Swap the buffer out under the lock so producers can keep queuing
	// while the (potentially slow) transaction runs.
	s.pendingFilesMu.Lock()
	batch := s.pendingFiles
	s.pendingFiles = nil
	s.pendingFilesMu.Unlock()

	if len(batch) == 0 {
		return nil
	}

	return s.repos.WithTx(ctx, func(txCtx context.Context, tx *sql.Tx) error {
		for i := range batch {
			entry := &batch[i]

			// The file row must exist before its association rows are
			// inserted (FK ordering).
			if err := s.repos.Files.Create(txCtx, tx, entry.file); err != nil {
				return fmt.Errorf("creating file record: %w", err)
			}

			// Drop stale associations so changed file content does not
			// leave orphaned chunk links behind.
			if err := s.repos.FileChunks.DeleteByFileID(txCtx, tx, entry.file.ID); err != nil {
				return fmt.Errorf("deleting old file chunks: %w", err)
			}
			if err := s.repos.ChunkFiles.DeleteByFileID(txCtx, tx, entry.file.ID); err != nil {
				return fmt.Errorf("deleting old chunk files: %w", err)
			}

			// Recreate the file->chunk and chunk->file mappings.
			for j := range entry.fileChunks {
				if err := s.repos.FileChunks.Create(txCtx, tx, &entry.fileChunks[j]); err != nil {
					return fmt.Errorf("creating file chunk: %w", err)
				}
			}
			for j := range entry.chunkFiles {
				if err := s.repos.ChunkFiles.Create(txCtx, tx, &entry.chunkFiles[j]); err != nil {
					return fmt.Errorf("creating chunk file: %w", err)
				}
			}

			// Record the file as belonging to the current snapshot.
			if err := s.repos.Snapshots.AddFileByID(txCtx, tx, s.snapshotID, entry.file.ID); err != nil {
				return fmt.Errorf("adding file to snapshot: %w", err)
			}
		}
		return nil
	})
}
// flushAllPending writes out any buffered data that has not yet reached a
// batch boundary; call it at the end of processing so no partial batch is
// left unwritten. Only file data is buffered here, so this delegates
// directly to flushPendingFiles.
func (s *Scanner) flushAllPending(ctx context.Context) error {
return s.flushPendingFiles(ctx)
}
// ScanPhaseResult contains the results of the scan phase
type ScanPhaseResult struct {
FilesToProcess []*FileToProcess
@@ -429,8 +513,21 @@ func (s *Scanner) checkFileInMemory(path string, info os.FileInfo, knownFiles ma
gid = stat.Gid()
}
// Create file record
// Check against in-memory map first to get existing ID if available
existingFile, exists := knownFiles[path]
// Create file record with ID set upfront
// For new files, generate UUID immediately so it's available for chunk associations
// For existing files, reuse the existing ID
var fileID string
if exists {
fileID = existingFile.ID
} else {
fileID = uuid.New().String()
}
file := &database.File{
ID: fileID,
Path: path,
MTime: info.ModTime(),
CTime: info.ModTime(), // afero doesn't provide ctime
@@ -440,16 +537,11 @@ func (s *Scanner) checkFileInMemory(path string, info os.FileInfo, knownFiles ma
GID: gid,
}
// Check against in-memory map
existingFile, exists := knownFiles[path]
// New file - needs processing
if !exists {
// New file
return file, true
}
// Reuse existing ID
file.ID = existingFile.ID
// Check if file has changed
if existingFile.Size != file.Size ||
existingFile.MTime.Unix() != file.MTime.Unix() ||
@@ -585,6 +677,11 @@ func (s *Scanner) processPhase(ctx context.Context, filesToProcess []*FileToProc
}
}
// Flush any remaining pending chunks and files to database
if err := s.flushAllPending(ctx); err != nil {
return fmt.Errorf("flushing pending database operations: %w", err)
}
// Final flush (outside any transaction)
s.packerMu.Lock()
if err := s.packer.Flush(); err != nil {
@@ -779,20 +876,18 @@ func (s *Scanner) processFileStreaming(ctx context.Context, fileToProcess *FileT
// Check if chunk already exists (fast in-memory lookup)
chunkExists := s.chunkExists(chunk.Hash)
// Store chunk if new
// Store chunk in database if new (must happen before packer.AddChunk
// because packer creates blob_chunk entries that reference chunks)
if !chunkExists {
err := s.repos.WithTx(ctx, func(txCtx context.Context, tx *sql.Tx) error {
dbChunk := &database.Chunk{
ChunkHash: chunk.Hash,
Size: chunk.Size,
}
if err := s.repos.Chunks.Create(txCtx, tx, dbChunk); err != nil {
return fmt.Errorf("creating chunk: %w", err)
}
return nil
return s.repos.Chunks.Create(txCtx, tx, dbChunk)
})
if err != nil {
return fmt.Errorf("storing chunk: %w", err)
return fmt.Errorf("creating chunk: %w", err)
}
// Add to in-memory cache for fast duplicate detection
s.addKnownChunk(chunk.Hash)
@@ -871,56 +966,30 @@ func (s *Scanner) processFileStreaming(ctx context.Context, fileToProcess *FileT
"file_hash", fileHash,
"chunks", len(chunks))
// Store file record, chunk associations, and snapshot association in database
// This happens AFTER successful chunking to avoid orphaned records on interruption
err = s.repos.WithTx(ctx, func(txCtx context.Context, tx *sql.Tx) error {
// Create or update the file record
// Files.Create uses INSERT OR REPLACE, so it handles both new and changed files
if err := s.repos.Files.Create(txCtx, tx, fileToProcess.File); err != nil {
return fmt.Errorf("creating file record: %w", err)
// Build file data for batch insertion
// Update chunk associations with the file ID
fileChunks := make([]database.FileChunk, len(chunks))
chunkFiles := make([]database.ChunkFile, len(chunks))
for i, ci := range chunks {
fileChunks[i] = database.FileChunk{
FileID: fileToProcess.File.ID,
Idx: ci.fileChunk.Idx,
ChunkHash: ci.fileChunk.ChunkHash,
}
// Delete any existing file_chunks and chunk_files for this file
// This ensures old chunks are no longer associated when file content changes
if err := s.repos.FileChunks.DeleteByFileID(txCtx, tx, fileToProcess.File.ID); err != nil {
return fmt.Errorf("deleting old file chunks: %w", err)
}
if err := s.repos.ChunkFiles.DeleteByFileID(txCtx, tx, fileToProcess.File.ID); err != nil {
return fmt.Errorf("deleting old chunk files: %w", err)
chunkFiles[i] = database.ChunkFile{
ChunkHash: ci.fileChunk.ChunkHash,
FileID: fileToProcess.File.ID,
FileOffset: ci.offset,
Length: ci.size,
}
}
// Update chunk associations with the file ID (now that we have it)
for i := range chunks {
chunks[i].fileChunk.FileID = fileToProcess.File.ID
}
for _, ci := range chunks {
// Create file-chunk mapping
if err := s.repos.FileChunks.Create(txCtx, tx, &ci.fileChunk); err != nil {
return fmt.Errorf("creating file chunk: %w", err)
}
// Create chunk-file mapping
chunkFile := &database.ChunkFile{
ChunkHash: ci.fileChunk.ChunkHash,
FileID: fileToProcess.File.ID,
FileOffset: ci.offset,
Length: ci.size,
}
if err := s.repos.ChunkFiles.Create(txCtx, tx, chunkFile); err != nil {
return fmt.Errorf("creating chunk file: %w", err)
}
}
// Add file to snapshot
if err := s.repos.Snapshots.AddFileByID(txCtx, tx, s.snapshotID, fileToProcess.File.ID); err != nil {
return fmt.Errorf("adding file to snapshot: %w", err)
}
return nil
// Queue file for batch insertion
return s.addPendingFile(ctx, pendingFileData{
file: fileToProcess.File,
fileChunks: fileChunks,
chunkFiles: chunkFiles,
})
return err
}
// GetProgress returns the progress reporter for this scanner