Add exclude patterns, snapshot prune, and other improvements
- Implement exclude patterns with anchored pattern support:
  - Patterns starting with / only match from the root of the source dir
  - Unanchored patterns match anywhere in the path
  - Support for glob patterns (*.log, .*, **/*.pack)
  - Directory patterns skip entire subtrees
  - Add gobwas/glob dependency for pattern matching
  - Add 16 comprehensive tests for exclude functionality
- Add snapshot prune command to clean orphaned data:
  - Removes incomplete snapshots from the database
  - Cleans orphaned files, chunks, and blobs
  - Runs automatically at backup start for consistency
- Add snapshot remove command for deleting snapshots
- Add VAULTIK_AGE_SECRET_KEY environment variable support
- Fix duplicate fx module provider in restore command
- Change snapshot ID format to hostname_YYYY-MM-DDTHH:MM:SSZ
This commit is contained in:
@@ -132,3 +132,80 @@ func (r *ChunkFileRepository) DeleteByFileID(ctx context.Context, tx *sql.Tx, fi
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// DeleteByFileIDs deletes all chunk_files for multiple files in a single statement.
|
||||
func (r *ChunkFileRepository) DeleteByFileIDs(ctx context.Context, tx *sql.Tx, fileIDs []string) error {
|
||||
if len(fileIDs) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Batch at 500 to stay within SQLite's variable limit
|
||||
const batchSize = 500
|
||||
|
||||
for i := 0; i < len(fileIDs); i += batchSize {
|
||||
end := i + batchSize
|
||||
if end > len(fileIDs) {
|
||||
end = len(fileIDs)
|
||||
}
|
||||
batch := fileIDs[i:end]
|
||||
|
||||
query := "DELETE FROM chunk_files WHERE file_id IN (?" + repeatPlaceholder(len(batch)-1) + ")"
|
||||
args := make([]interface{}, len(batch))
|
||||
for j, id := range batch {
|
||||
args[j] = id
|
||||
}
|
||||
|
||||
var err error
|
||||
if tx != nil {
|
||||
_, err = tx.ExecContext(ctx, query, args...)
|
||||
} else {
|
||||
_, err = r.db.ExecWithLog(ctx, query, args...)
|
||||
}
|
||||
if err != nil {
|
||||
return fmt.Errorf("batch deleting chunk_files: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// CreateBatch inserts multiple chunk_files in a single statement for efficiency.
|
||||
func (r *ChunkFileRepository) CreateBatch(ctx context.Context, tx *sql.Tx, cfs []ChunkFile) error {
|
||||
if len(cfs) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Each ChunkFile has 4 values, so batch at 200 to be safe with SQLite's variable limit
|
||||
const batchSize = 200
|
||||
|
||||
for i := 0; i < len(cfs); i += batchSize {
|
||||
end := i + batchSize
|
||||
if end > len(cfs) {
|
||||
end = len(cfs)
|
||||
}
|
||||
batch := cfs[i:end]
|
||||
|
||||
query := "INSERT INTO chunk_files (chunk_hash, file_id, file_offset, length) VALUES "
|
||||
args := make([]interface{}, 0, len(batch)*4)
|
||||
for j, cf := range batch {
|
||||
if j > 0 {
|
||||
query += ", "
|
||||
}
|
||||
query += "(?, ?, ?, ?)"
|
||||
args = append(args, cf.ChunkHash, cf.FileID, cf.FileOffset, cf.Length)
|
||||
}
|
||||
query += " ON CONFLICT(chunk_hash, file_id) DO NOTHING"
|
||||
|
||||
var err error
|
||||
if tx != nil {
|
||||
_, err = tx.ExecContext(ctx, query, args...)
|
||||
} else {
|
||||
_, err = r.db.ExecWithLog(ctx, query, args...)
|
||||
}
|
||||
if err != nil {
|
||||
return fmt.Errorf("batch inserting chunk_files: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -205,6 +205,15 @@ func NewTestDB() (*DB, error) {
|
||||
return New(context.Background(), ":memory:")
|
||||
}
|
||||
|
||||
// repeatPlaceholder generates a string of ", ?" repeated n times for IN clause construction.
// For example, repeatPlaceholder(2) returns ", ?, ?".
func repeatPlaceholder(n int) string {
	if n <= 0 {
		return ""
	}
	var b strings.Builder
	b.Grow(3 * n) // each repetition is ", ?" (3 bytes)
	for i := 0; i < n; i++ {
		b.WriteString(", ?")
	}
	return b.String()
}
|
||||
|
||||
// LogSQL logs SQL queries and their arguments when debug mode is enabled.
|
||||
// Debug mode is activated by setting the GODEBUG environment variable to include "vaultik".
|
||||
// This is useful for troubleshooting database operations and understanding query patterns.
|
||||
|
||||
@@ -157,6 +157,86 @@ func (r *FileChunkRepository) DeleteByFileID(ctx context.Context, tx *sql.Tx, fi
|
||||
return nil
|
||||
}
|
||||
|
||||
// DeleteByFileIDs deletes all chunks for multiple files in a single statement.
|
||||
func (r *FileChunkRepository) DeleteByFileIDs(ctx context.Context, tx *sql.Tx, fileIDs []string) error {
|
||||
if len(fileIDs) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Batch at 500 to stay within SQLite's variable limit
|
||||
const batchSize = 500
|
||||
|
||||
for i := 0; i < len(fileIDs); i += batchSize {
|
||||
end := i + batchSize
|
||||
if end > len(fileIDs) {
|
||||
end = len(fileIDs)
|
||||
}
|
||||
batch := fileIDs[i:end]
|
||||
|
||||
query := "DELETE FROM file_chunks WHERE file_id IN (?" + repeatPlaceholder(len(batch)-1) + ")"
|
||||
args := make([]interface{}, len(batch))
|
||||
for j, id := range batch {
|
||||
args[j] = id
|
||||
}
|
||||
|
||||
var err error
|
||||
if tx != nil {
|
||||
_, err = tx.ExecContext(ctx, query, args...)
|
||||
} else {
|
||||
_, err = r.db.ExecWithLog(ctx, query, args...)
|
||||
}
|
||||
if err != nil {
|
||||
return fmt.Errorf("batch deleting file_chunks: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// CreateBatch inserts multiple file_chunks in a single statement for efficiency.
|
||||
// Batches are automatically split to stay within SQLite's variable limit.
|
||||
func (r *FileChunkRepository) CreateBatch(ctx context.Context, tx *sql.Tx, fcs []FileChunk) error {
|
||||
if len(fcs) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
// SQLite has a limit on variables (typically 999 or 32766).
|
||||
// Each FileChunk has 3 values, so batch at 300 to be safe.
|
||||
const batchSize = 300
|
||||
|
||||
for i := 0; i < len(fcs); i += batchSize {
|
||||
end := i + batchSize
|
||||
if end > len(fcs) {
|
||||
end = len(fcs)
|
||||
}
|
||||
batch := fcs[i:end]
|
||||
|
||||
// Build the query with multiple value sets
|
||||
query := "INSERT INTO file_chunks (file_id, idx, chunk_hash) VALUES "
|
||||
args := make([]interface{}, 0, len(batch)*3)
|
||||
for j, fc := range batch {
|
||||
if j > 0 {
|
||||
query += ", "
|
||||
}
|
||||
query += "(?, ?, ?)"
|
||||
args = append(args, fc.FileID, fc.Idx, fc.ChunkHash)
|
||||
}
|
||||
query += " ON CONFLICT(file_id, idx) DO NOTHING"
|
||||
|
||||
var err error
|
||||
if tx != nil {
|
||||
_, err = tx.ExecContext(ctx, query, args...)
|
||||
} else {
|
||||
_, err = r.db.ExecWithLog(ctx, query, args...)
|
||||
}
|
||||
if err != nil {
|
||||
return fmt.Errorf("batch inserting file_chunks: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// GetByFile is an alias for GetByPath for compatibility
|
||||
func (r *FileChunkRepository) GetByFile(ctx context.Context, path string) ([]*FileChunk, error) {
|
||||
LogSQL("GetByFile", "Starting", path)
|
||||
|
||||
@@ -302,6 +302,55 @@ func (r *FileRepository) ListByPrefix(ctx context.Context, prefix string) ([]*Fi
|
||||
return files, rows.Err()
|
||||
}
|
||||
|
||||
// CreateBatch inserts or updates multiple files in a single statement for efficiency.
|
||||
// File IDs must be pre-generated before calling this method.
|
||||
func (r *FileRepository) CreateBatch(ctx context.Context, tx *sql.Tx, files []*File) error {
|
||||
if len(files) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Each File has 9 values, so batch at 100 to be safe with SQLite's variable limit
|
||||
const batchSize = 100
|
||||
|
||||
for i := 0; i < len(files); i += batchSize {
|
||||
end := i + batchSize
|
||||
if end > len(files) {
|
||||
end = len(files)
|
||||
}
|
||||
batch := files[i:end]
|
||||
|
||||
query := `INSERT INTO files (id, path, mtime, ctime, size, mode, uid, gid, link_target) VALUES `
|
||||
args := make([]interface{}, 0, len(batch)*9)
|
||||
for j, f := range batch {
|
||||
if j > 0 {
|
||||
query += ", "
|
||||
}
|
||||
query += "(?, ?, ?, ?, ?, ?, ?, ?, ?)"
|
||||
args = append(args, f.ID, f.Path, f.MTime.Unix(), f.CTime.Unix(), f.Size, f.Mode, f.UID, f.GID, f.LinkTarget)
|
||||
}
|
||||
query += ` ON CONFLICT(path) DO UPDATE SET
|
||||
mtime = excluded.mtime,
|
||||
ctime = excluded.ctime,
|
||||
size = excluded.size,
|
||||
mode = excluded.mode,
|
||||
uid = excluded.uid,
|
||||
gid = excluded.gid,
|
||||
link_target = excluded.link_target`
|
||||
|
||||
var err error
|
||||
if tx != nil {
|
||||
_, err = tx.ExecContext(ctx, query, args...)
|
||||
} else {
|
||||
_, err = r.db.ExecWithLog(ctx, query, args...)
|
||||
}
|
||||
if err != nil {
|
||||
return fmt.Errorf("batch inserting files: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// DeleteOrphaned deletes files that are not referenced by any snapshot
|
||||
func (r *FileRepository) DeleteOrphaned(ctx context.Context) error {
|
||||
query := `
|
||||
|
||||
@@ -28,6 +28,9 @@ CREATE TABLE IF NOT EXISTS file_chunks (
|
||||
FOREIGN KEY (chunk_hash) REFERENCES chunks(chunk_hash)
|
||||
);
|
||||
|
||||
-- Index for efficient chunk lookups (used in orphan detection)
|
||||
CREATE INDEX IF NOT EXISTS idx_file_chunks_chunk_hash ON file_chunks(chunk_hash);
|
||||
|
||||
-- Chunks table: stores unique content-defined chunks
|
||||
CREATE TABLE IF NOT EXISTS chunks (
|
||||
chunk_hash TEXT PRIMARY KEY,
|
||||
@@ -56,6 +59,9 @@ CREATE TABLE IF NOT EXISTS blob_chunks (
|
||||
FOREIGN KEY (chunk_hash) REFERENCES chunks(chunk_hash)
|
||||
);
|
||||
|
||||
-- Index for efficient chunk lookups (used in orphan detection)
|
||||
CREATE INDEX IF NOT EXISTS idx_blob_chunks_chunk_hash ON blob_chunks(chunk_hash);
|
||||
|
||||
-- Chunk files table: reverse mapping of chunks to files
|
||||
CREATE TABLE IF NOT EXISTS chunk_files (
|
||||
chunk_hash TEXT NOT NULL,
|
||||
@@ -67,6 +73,9 @@ CREATE TABLE IF NOT EXISTS chunk_files (
|
||||
FOREIGN KEY (file_id) REFERENCES files(id) ON DELETE CASCADE
|
||||
);
|
||||
|
||||
-- Index for efficient file lookups (used in orphan detection)
|
||||
CREATE INDEX IF NOT EXISTS idx_chunk_files_file_id ON chunk_files(file_id);
|
||||
|
||||
-- Snapshots table: tracks backup snapshots
|
||||
CREATE TABLE IF NOT EXISTS snapshots (
|
||||
id TEXT PRIMARY KEY,
|
||||
@@ -96,6 +105,9 @@ CREATE TABLE IF NOT EXISTS snapshot_files (
|
||||
FOREIGN KEY (file_id) REFERENCES files(id)
|
||||
);
|
||||
|
||||
-- Index for efficient file lookups (used in orphan detection)
|
||||
CREATE INDEX IF NOT EXISTS idx_snapshot_files_file_id ON snapshot_files(file_id);
|
||||
|
||||
-- Snapshot blobs table: maps snapshots to blobs
|
||||
CREATE TABLE IF NOT EXISTS snapshot_blobs (
|
||||
snapshot_id TEXT NOT NULL,
|
||||
@@ -106,6 +118,9 @@ CREATE TABLE IF NOT EXISTS snapshot_blobs (
|
||||
FOREIGN KEY (blob_id) REFERENCES blobs(id)
|
||||
);
|
||||
|
||||
-- Index for efficient blob lookups (used in orphan detection)
|
||||
CREATE INDEX IF NOT EXISTS idx_snapshot_blobs_blob_id ON snapshot_blobs(blob_id);
|
||||
|
||||
-- Uploads table: tracks blob upload metrics
|
||||
CREATE TABLE IF NOT EXISTS uploads (
|
||||
blob_hash TEXT PRIMARY KEY,
|
||||
@@ -115,4 +130,7 @@ CREATE TABLE IF NOT EXISTS uploads (
|
||||
duration_ms INTEGER NOT NULL,
|
||||
FOREIGN KEY (blob_hash) REFERENCES blobs(blob_hash),
|
||||
FOREIGN KEY (snapshot_id) REFERENCES snapshots(id)
|
||||
);
|
||||
|
||||
-- Index for efficient snapshot lookups
|
||||
CREATE INDEX IF NOT EXISTS idx_uploads_snapshot_id ON uploads(snapshot_id);
|
||||
@@ -289,6 +289,46 @@ func (r *SnapshotRepository) AddFileByID(ctx context.Context, tx *sql.Tx, snapsh
|
||||
return nil
|
||||
}
|
||||
|
||||
// AddFilesByIDBatch adds multiple files to a snapshot in batched inserts
|
||||
func (r *SnapshotRepository) AddFilesByIDBatch(ctx context.Context, tx *sql.Tx, snapshotID string, fileIDs []string) error {
|
||||
if len(fileIDs) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Each entry has 2 values, so batch at 400 to be safe
|
||||
const batchSize = 400
|
||||
|
||||
for i := 0; i < len(fileIDs); i += batchSize {
|
||||
end := i + batchSize
|
||||
if end > len(fileIDs) {
|
||||
end = len(fileIDs)
|
||||
}
|
||||
batch := fileIDs[i:end]
|
||||
|
||||
query := "INSERT OR IGNORE INTO snapshot_files (snapshot_id, file_id) VALUES "
|
||||
args := make([]interface{}, 0, len(batch)*2)
|
||||
for j, fileID := range batch {
|
||||
if j > 0 {
|
||||
query += ", "
|
||||
}
|
||||
query += "(?, ?)"
|
||||
args = append(args, snapshotID, fileID)
|
||||
}
|
||||
|
||||
var err error
|
||||
if tx != nil {
|
||||
_, err = tx.ExecContext(ctx, query, args...)
|
||||
} else {
|
||||
_, err = r.db.ExecWithLog(ctx, query, args...)
|
||||
}
|
||||
if err != nil {
|
||||
return fmt.Errorf("batch adding files to snapshot: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// AddBlob adds a blob to a snapshot
|
||||
func (r *SnapshotRepository) AddBlob(ctx context.Context, tx *sql.Tx, snapshotID string, blobID string, blobHash string) error {
|
||||
query := `
|
||||
|
||||
Reference in New Issue
Block a user