Add exclude patterns, snapshot prune, and other improvements

- Implement exclude patterns with anchored pattern support:
  - Patterns starting with / only match from root of source dir
  - Unanchored patterns match anywhere in path
  - Support for glob patterns (*.log, .*, **/*.pack)
  - Directory patterns skip entire subtrees
  - Add gobwas/glob dependency for pattern matching
  - Add 16 comprehensive tests for exclude functionality

- Add snapshot prune command to clean orphaned data:
  - Removes incomplete snapshots from database
  - Cleans orphaned files, chunks, and blobs
  - Runs automatically at backup start for consistency

- Add snapshot remove command for deleting snapshots

- Add VAULTIK_AGE_SECRET_KEY environment variable support

- Fix duplicate fx module provider in restore command

- Change snapshot ID format to hostname_YYYY-MM-DDTHH:MM:SSZ
This commit is contained in:
2026-01-01 05:42:56 -08:00
parent 05286bed01
commit 2afd54d693
23 changed files with 1769 additions and 98 deletions

View File

@@ -3,8 +3,10 @@ package snapshot
import (
"context"
"database/sql"
"errors"
"fmt"
"os"
"path/filepath"
"strings"
"sync"
"time"
@@ -15,6 +17,7 @@ import (
"git.eeqj.de/sneak/vaultik/internal/log"
"git.eeqj.de/sneak/vaultik/internal/storage"
"github.com/dustin/go-humanize"
"github.com/gobwas/glob"
"github.com/google/uuid"
"github.com/spf13/afero"
)
@@ -33,6 +36,13 @@ type pendingFileData struct {
chunkFiles []database.ChunkFile
}
// compiledPattern holds a compiled glob pattern and whether it's anchored
type compiledPattern struct {
	pattern  glob.Glob // Matcher compiled by compileExcludePatterns with '/' as the separator
	anchored bool      // If true, only matches from root of source dir
	original string    // Pattern exactly as written in the config, kept for logging/diagnostics
}
// Scanner scans directories and populates the database with file and chunk information
type Scanner struct {
fs afero.Fs
@@ -43,7 +53,9 @@ type Scanner struct {
maxBlobSize int64
compressionLevel int
ageRecipient string
snapshotID string // Current snapshot being processed
snapshotID string // Current snapshot being processed
exclude []string // Glob patterns for files/directories to exclude
compiledExclude []compiledPattern // Compiled glob patterns
progress *ProgressReporter
// In-memory cache of known chunk hashes for fast existence checks
@@ -77,6 +89,7 @@ type ScannerConfig struct {
CompressionLevel int
AgeRecipients []string // Optional, empty means no encryption
EnableProgress bool // Enable progress reporting
Exclude []string // Glob patterns for files/directories to exclude
}
// ScanResult contains the results of a scan operation
@@ -120,6 +133,9 @@ func NewScanner(cfg ScannerConfig) *Scanner {
progress = NewProgressReporter()
}
// Compile exclude patterns
compiledExclude := compileExcludePatterns(cfg.Exclude)
return &Scanner{
fs: cfg.FS,
chunker: chunker.NewChunker(cfg.ChunkSize),
@@ -129,6 +145,8 @@ func NewScanner(cfg ScannerConfig) *Scanner {
maxBlobSize: cfg.MaxBlobSize,
compressionLevel: cfg.CompressionLevel,
ageRecipient: strings.Join(cfg.AgeRecipients, ","),
exclude: cfg.Exclude,
compiledExclude: compiledExclude,
progress: progress,
pendingChunkHashes: make(map[string]struct{}),
}
@@ -314,11 +332,14 @@ func (s *Scanner) addPendingChunkHash(hash string) {
// removePendingChunkHashes drops the given committed chunk hashes from the
// in-memory pending set, so subsequent lookups treat them as persisted to
// the database. Safe for concurrent use; the set is guarded by
// pendingChunkHashesMu.
func (s *Scanner) removePendingChunkHashes(hashes []string) {
	n := len(hashes)
	log.Debug("removePendingChunkHashes: starting", "count", n)
	began := time.Now()

	s.pendingChunkHashesMu.Lock()
	for i := range hashes {
		delete(s.pendingChunkHashes, hashes[i])
	}
	s.pendingChunkHashesMu.Unlock()

	log.Debug("removePendingChunkHashes: done", "count", n, "duration", time.Since(began))
}
// isChunkPending returns true if the chunk is still pending (not yet committed to DB)
@@ -395,12 +416,19 @@ func (s *Scanner) flushAllPending(ctx context.Context) error {
// flushCompletedPendingFiles flushes only files whose chunks are all committed to DB
// Files with pending chunks are kept in the queue for later flushing
func (s *Scanner) flushCompletedPendingFiles(ctx context.Context) error {
flushStart := time.Now()
log.Debug("flushCompletedPendingFiles: starting")
log.Debug("flushCompletedPendingFiles: acquiring pendingFilesMu lock")
s.pendingFilesMu.Lock()
log.Debug("flushCompletedPendingFiles: acquired lock", "pending_files", len(s.pendingFiles))
// Separate files into complete (can flush) and incomplete (keep pending)
var canFlush []pendingFileData
var stillPending []pendingFileData
log.Debug("flushCompletedPendingFiles: checking which files can flush")
checkStart := time.Now()
for _, data := range s.pendingFiles {
allChunksCommitted := true
for _, fc := range data.fileChunks {
@@ -415,11 +443,14 @@ func (s *Scanner) flushCompletedPendingFiles(ctx context.Context) error {
stillPending = append(stillPending, data)
}
}
log.Debug("flushCompletedPendingFiles: check done", "duration", time.Since(checkStart), "can_flush", len(canFlush), "still_pending", len(stillPending))
s.pendingFiles = stillPending
s.pendingFilesMu.Unlock()
log.Debug("flushCompletedPendingFiles: released lock")
if len(canFlush) == 0 {
log.Debug("flushCompletedPendingFiles: nothing to flush")
return nil
}
@@ -427,43 +458,85 @@ func (s *Scanner) flushCompletedPendingFiles(ctx context.Context) error {
"files_to_flush", len(canFlush),
"files_still_pending", len(stillPending))
// Flush the complete files
return s.repos.WithTx(ctx, func(txCtx context.Context, tx *sql.Tx) error {
for _, data := range canFlush {
// Create or update the file record
if err := s.repos.Files.Create(txCtx, tx, data.file); err != nil {
return fmt.Errorf("creating file record: %w", err)
}
// Collect all data for batch operations
log.Debug("flushCompletedPendingFiles: collecting data for batch ops")
collectStart := time.Now()
var allFileChunks []database.FileChunk
var allChunkFiles []database.ChunkFile
var allFileIDs []string
var allFiles []*database.File
// Delete any existing file_chunks and chunk_files for this file
if err := s.repos.FileChunks.DeleteByFileID(txCtx, tx, data.file.ID); err != nil {
return fmt.Errorf("deleting old file chunks: %w", err)
}
if err := s.repos.ChunkFiles.DeleteByFileID(txCtx, tx, data.file.ID); err != nil {
return fmt.Errorf("deleting old chunk files: %w", err)
}
for _, data := range canFlush {
allFileChunks = append(allFileChunks, data.fileChunks...)
allChunkFiles = append(allChunkFiles, data.chunkFiles...)
allFileIDs = append(allFileIDs, data.file.ID)
allFiles = append(allFiles, data.file)
}
log.Debug("flushCompletedPendingFiles: collected data",
"duration", time.Since(collectStart),
"file_chunks", len(allFileChunks),
"chunk_files", len(allChunkFiles),
"files", len(allFiles))
// Create file-chunk mappings
for i := range data.fileChunks {
if err := s.repos.FileChunks.Create(txCtx, tx, &data.fileChunks[i]); err != nil {
return fmt.Errorf("creating file chunk: %w", err)
}
}
// Flush the complete files using batch operations
log.Debug("flushCompletedPendingFiles: starting transaction")
txStart := time.Now()
err := s.repos.WithTx(ctx, func(txCtx context.Context, tx *sql.Tx) error {
log.Debug("flushCompletedPendingFiles: inside transaction")
// Create chunk-file mappings
for i := range data.chunkFiles {
if err := s.repos.ChunkFiles.Create(txCtx, tx, &data.chunkFiles[i]); err != nil {
return fmt.Errorf("creating chunk file: %w", err)
}
}
// Add file to snapshot
if err := s.repos.Snapshots.AddFileByID(txCtx, tx, s.snapshotID, data.file.ID); err != nil {
return fmt.Errorf("adding file to snapshot: %w", err)
}
// Batch delete old file_chunks and chunk_files
log.Debug("flushCompletedPendingFiles: deleting old file_chunks")
opStart := time.Now()
if err := s.repos.FileChunks.DeleteByFileIDs(txCtx, tx, allFileIDs); err != nil {
return fmt.Errorf("batch deleting old file chunks: %w", err)
}
log.Debug("flushCompletedPendingFiles: deleted file_chunks", "duration", time.Since(opStart))
log.Debug("flushCompletedPendingFiles: deleting old chunk_files")
opStart = time.Now()
if err := s.repos.ChunkFiles.DeleteByFileIDs(txCtx, tx, allFileIDs); err != nil {
return fmt.Errorf("batch deleting old chunk files: %w", err)
}
log.Debug("flushCompletedPendingFiles: deleted chunk_files", "duration", time.Since(opStart))
// Batch create/update file records
log.Debug("flushCompletedPendingFiles: creating files")
opStart = time.Now()
if err := s.repos.Files.CreateBatch(txCtx, tx, allFiles); err != nil {
return fmt.Errorf("batch creating file records: %w", err)
}
log.Debug("flushCompletedPendingFiles: created files", "duration", time.Since(opStart))
// Batch insert file_chunks
log.Debug("flushCompletedPendingFiles: inserting file_chunks")
opStart = time.Now()
if err := s.repos.FileChunks.CreateBatch(txCtx, tx, allFileChunks); err != nil {
return fmt.Errorf("batch creating file chunks: %w", err)
}
log.Debug("flushCompletedPendingFiles: inserted file_chunks", "duration", time.Since(opStart))
// Batch insert chunk_files
log.Debug("flushCompletedPendingFiles: inserting chunk_files")
opStart = time.Now()
if err := s.repos.ChunkFiles.CreateBatch(txCtx, tx, allChunkFiles); err != nil {
return fmt.Errorf("batch creating chunk files: %w", err)
}
log.Debug("flushCompletedPendingFiles: inserted chunk_files", "duration", time.Since(opStart))
// Batch add files to snapshot
log.Debug("flushCompletedPendingFiles: adding files to snapshot")
opStart = time.Now()
if err := s.repos.Snapshots.AddFilesByIDBatch(txCtx, tx, s.snapshotID, allFileIDs); err != nil {
return fmt.Errorf("batch adding files to snapshot: %w", err)
}
log.Debug("flushCompletedPendingFiles: added files to snapshot", "duration", time.Since(opStart))
log.Debug("flushCompletedPendingFiles: transaction complete")
return nil
})
log.Debug("flushCompletedPendingFiles: transaction done", "duration", time.Since(txStart))
log.Debug("flushCompletedPendingFiles: total duration", "duration", time.Since(flushStart))
return err
}
// ScanPhaseResult contains the results of the scan phase
@@ -504,6 +577,14 @@ func (s *Scanner) scanPhase(ctx context.Context, path string, result *ScanResult
default:
}
// Check exclude patterns - for directories, skip the entire subtree
if s.shouldExclude(filePath, path) {
if info.IsDir() {
return filepath.SkipDir
}
return nil
}
// Skip non-regular files for processing (but still count them)
if !info.Mode().IsRegular() {
return nil
@@ -730,6 +811,12 @@ func (s *Scanner) processPhase(ctx context.Context, filesToProcess []*FileToProc
// Process file in streaming fashion
if err := s.processFileStreaming(ctx, fileToProcess, result); err != nil {
// Handle files that were deleted between scan and process phases
if errors.Is(err, os.ErrNotExist) {
log.Warn("File was deleted during backup, skipping", "path", fileToProcess.Path)
result.FilesSkipped++
continue
}
return fmt.Errorf("processing file %s: %w", fileToProcess.Path, err)
}
@@ -939,14 +1026,19 @@ func (s *Scanner) handleBlobReady(blobWithReader *blob.BlobWithReader) error {
}
// Chunks from this blob are now committed to DB - remove from pending set
log.Debug("handleBlobReady: removing pending chunk hashes")
s.removePendingChunkHashes(blobWithReader.InsertedChunkHashes)
log.Debug("handleBlobReady: removed pending chunk hashes")
// Flush files whose chunks are now all committed
// This maintains database consistency after each blob
log.Debug("handleBlobReady: calling flushCompletedPendingFiles")
if err := s.flushCompletedPendingFiles(dbCtx); err != nil {
return fmt.Errorf("flushing completed files: %w", err)
}
log.Debug("handleBlobReady: flushCompletedPendingFiles returned")
log.Debug("handleBlobReady: complete")
return nil
}
@@ -1135,6 +1227,105 @@ func (s *Scanner) detectDeletedFilesFromMap(ctx context.Context, knownFiles map[
return nil
}
// compileExcludePatterns compiles the exclude patterns into glob matchers.
//
// Each pattern may be:
//   - anchored (leading "/"): matched only from the root of the backup
//     source directory (see shouldExclude)
//   - unanchored: matched anywhere in the path
//
// A trailing "/" (directory indicator) is stripped before compiling.
// Empty patterns, patterns that normalize to the empty string (e.g. "/"),
// and patterns that fail to compile are skipped; problems are logged so
// misconfigurations are visible rather than silently ignored.
func compileExcludePatterns(patterns []string) []compiledPattern {
	var compiled []compiledPattern
	for _, p := range patterns {
		if p == "" {
			continue
		}

		// Check if pattern is anchored (starts with /)
		anchored := strings.HasPrefix(p, "/")
		pattern := p
		if anchored {
			pattern = p[1:] // Remove leading /
		}

		// Remove trailing slash if present (directory indicator)
		pattern = strings.TrimSuffix(pattern, "/")

		// A pattern such as "/" normalizes to the empty string. An empty
		// glob only matches the empty string, and shouldExclude never tests
		// an empty relative path, so the matcher would be dead weight —
		// skip it and tell the user.
		if pattern == "" {
			log.Warn("Empty exclude pattern after normalization, skipping", "pattern", p)
			continue
		}

		// Compile the glob with '/' as the separator so '*' does not cross
		// directory boundaries; use '**' to span multiple components.
		g, err := glob.Compile(pattern, '/')
		if err != nil {
			log.Warn("Invalid exclude pattern, skipping", "pattern", p, "error", err)
			continue
		}

		compiled = append(compiled, compiledPattern{
			pattern:  g,
			anchored: anchored,
			original: p,
		})
	}
	return compiled
}
// shouldExclude reports whether filePath should be excluded from the backup.
// filePath is the full path to the candidate file or directory; rootPath is
// the root of the backup source directory. Matching is performed against the
// slash-normalized path of filePath relative to rootPath.
func (s *Scanner) shouldExclude(filePath, rootPath string) bool {
	if len(s.compiledExclude) == 0 {
		return false
	}

	rel, err := filepath.Rel(rootPath, filePath)
	if err != nil {
		// No relative path can be computed; err on the side of inclusion.
		return false
	}
	// Never exclude the root directory itself.
	if rel == "." {
		return false
	}

	rel = filepath.ToSlash(rel)
	segments := strings.Split(rel, "/")

	for _, cp := range s.compiledExclude {
		if cp.anchored {
			// Anchored: the pattern must match starting at the root. Try
			// every leading portion of the path, the full path included —
			// matching a parent directory excludes its whole subtree.
			for end := 1; end <= len(segments); end++ {
				if cp.pattern.Match(strings.Join(segments[:end], "/")) {
					return true
				}
			}
		} else {
			// Unanchored: the pattern may match anywhere — either a single
			// path component (e.g. ".git") or any trailing sub-path that
			// starts at some component (i == 0 covers the full path).
			for i, seg := range segments {
				if cp.pattern.Match(seg) {
					return true
				}
				if cp.pattern.Match(strings.Join(segments[i:], "/")) {
					return true
				}
			}
		}
	}
	return false
}
// formatNumber formats a number with comma separators
func formatNumber(n int) string {
if n < 1000 {