diff --git a/Makefile b/Makefile
index b05efe3..9af1d21 100644
--- a/Makefile
+++ b/Makefile
@@ -59,4 +59,8 @@ test-coverage:
# Run integration tests
test-integration:
- go test -v -tags=integration ./...
\ No newline at end of file
+ go test -v -tags=integration ./...
+
+local:
+ VAULTIK_CONFIG=$(HOME)/etc/vaultik/config.yml ./vaultik snapshot --debug list 2>&1
+ VAULTIK_CONFIG=$(HOME)/etc/vaultik/config.yml ./vaultik snapshot --debug create 2>&1
diff --git a/README.md b/README.md
index df5b7a5..39aa066 100644
--- a/README.md
+++ b/README.md
@@ -1,21 +1,22 @@
# vaultik (ваултик)
-`vaultik` is a incremental backup daemon written in Go. It
-encrypts data using an `age` public key and uploads each encrypted blob
-directly to a remote S3-compatible object store. It requires no private
-keys, secrets, or credentials stored on the backed-up system.
+WIP: pre-1.0, some functions may not be fully implemented yet
+
+`vaultik` is an incremental backup daemon written in Go. It encrypts data
+using an `age` public key and uploads each encrypted blob directly to a
+remote S3-compatible object store. It requires no private keys, secrets, or
+credentials (other than those required to PUT to encrypted object storage,
+such as S3 API keys) stored on the backed-up system.
It includes table-stakes features such as:
-* modern authenticated encryption
+* modern encryption (the excellent `age`)
* deduplication
* incremental backups
* modern multithreaded zstd compression with configurable levels
* content-addressed immutable storage
-* local state tracking in standard SQLite database
-* inotify-based change detection
-* streaming processing of all data to not require lots of ram or temp file
-  storage
+* local state tracking in a standard SQLite database, enabling write-only
+  incremental backups to the destination
* no mutable remote metadata
* no plaintext file paths or metadata stored in remote
* does not create huge numbers of small files (to keep S3 operation counts
@@ -27,12 +28,12 @@ It includes table-stakes features such as:
content-addressable chunk map of changed files using deterministic chunking.
Each chunk is streamed into a blob packer. Blobs are compressed with
`zstd`, encrypted with `age`, and uploaded directly to remote storage under a
-content-addressed S3 path.
+content-addressed S3 path. At the end, a pruned, snapshot-specific SQLite
+database of metadata is created, encrypted, and uploaded alongside the
+blobs.
No plaintext file contents ever hit disk. No private key or secret
-passphrase is needed or stored locally. All encrypted data is
-streaming-processed and immediately discarded once uploaded. Metadata is
-encrypted and pushed with the same mechanism.
+passphrase is needed or stored locally.
## why
@@ -42,6 +43,7 @@ Existing backup software fails under one or more of these conditions:
compromises encrypted backups in the case of host system compromise
* Depends on symmetric encryption unsuitable for zero-trust environments
* Creates one-blob-per-file, which results in excessive S3 operation counts
+* Is slow
`vaultik` addresses these by using:
diff --git a/TODO.md b/TODO.md
index 6cc3cb4..1908f91 100644
--- a/TODO.md
+++ b/TODO.md
@@ -40,6 +40,28 @@ Reorganize commands to provide better visibility into stored data and snapshots.
- `--deep` mode: Downloads each blob and verifies its hash matches the stored hash
- **Stub implementation for now**
+- `vaultik snapshot remove <snapshot-id>` (alias: `rm`)
+ - Removes a snapshot and any blobs that become orphaned
+ - Algorithm:
+ 1. 
Validate target snapshot exists in storage + 2. List all snapshots in storage + 3. Download manifests from all OTHER snapshots to build "in-use" blob set + 4. Download target snapshot's manifest to get its blob hashes + 5. Identify orphaned blobs: target blobs NOT in the in-use set + 6. Delete orphaned blobs from storage + 7. Delete snapshot metadata using existing `deleteSnapshot()` helper + - Flags: + - `--force` / `-f`: Skip confirmation prompt + - `--dry-run`: Show what would be deleted without deleting + - Files to modify: + - `internal/cli/snapshot.go`: Add `newSnapshotRemoveCommand()` + - `internal/vaultik/snapshot.go`: Add `RemoveSnapshot()` method + - Reuse existing code: + - Snapshot enumeration pattern from `PruneBlobs()` in `prune.go` + - `v.downloadManifest(snapshotID)` for manifest downloading + - Blob path format: `blobs/{hash[:2]}/{hash[2:4]}/{hash}` + - `v.deleteSnapshot(snapshotID)` for metadata deletion + ### Implementation Notes 1. **No Decryption Required**: All commands work with unencrypted blob manifests diff --git a/cmd/vaultik/main.go b/cmd/vaultik/main.go index 3c7f470..90f1412 100644 --- a/cmd/vaultik/main.go +++ b/cmd/vaultik/main.go @@ -1,9 +1,41 @@ package main import ( + "os" + "runtime" + "runtime/pprof" + "git.eeqj.de/sneak/vaultik/internal/cli" ) func main() { + // CPU profiling: set VAULTIK_CPUPROFILE=/path/to/cpu.prof + if cpuProfile := os.Getenv("VAULTIK_CPUPROFILE"); cpuProfile != "" { + f, err := os.Create(cpuProfile) + if err != nil { + panic("could not create CPU profile: " + err.Error()) + } + defer func() { _ = f.Close() }() + if err := pprof.StartCPUProfile(f); err != nil { + panic("could not start CPU profile: " + err.Error()) + } + defer pprof.StopCPUProfile() + } + + // Memory profiling: set VAULTIK_MEMPROFILE=/path/to/mem.prof + if memProfile := os.Getenv("VAULTIK_MEMPROFILE"); memProfile != "" { + defer func() { + f, err := os.Create(memProfile) + if err != nil { + panic("could not create memory profile: " + err.Error()) + } + defer func() { _ = f.Close() }() + runtime.GC() // get up-to-date statistics + if err := pprof.WriteHeapProfile(f); err != nil { + panic("could not write memory profile: " + err.Error()) + } + }() + } + cli.CLIEntry() } diff --git a/go.mod b/go.mod index 37f362d..7c3f353 100644 --- a/go.mod +++ b/go.mod @@ -68,6 +68,7 @@ require ( github.com/go-openapi/jsonpointer v0.21.0 // indirect github.com/go-openapi/jsonreference v0.20.2 // indirect github.com/go-openapi/swag v0.23.0 // indirect + github.com/gobwas/glob v0.2.3 // indirect github.com/gogo/protobuf v1.3.2 // indirect github.com/golang-jwt/jwt/v5 v5.2.2 // indirect github.com/golang/protobuf v1.5.4 // indirect diff --git a/go.sum b/go.sum index 75c59bc..d836bd5 100644 --- a/go.sum +++ b/go.sum @@ -149,6 +149,8 @@ github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1v github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8= github.com/go-test/deep v1.0.2 h1:onZX1rnHT3Wv6cqNgYyFOOlgVKJrksuCMCRvJStbMYw= github.com/go-test/deep v1.0.2/go.mod h1:wGDj63lr65AM2AQyKZd/NYHGb0R+1RLqB8NKt3aSFNA= +github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y= +github.com/gobwas/glob v0.2.3/go.mod h1:d3Ez4x06l9bZtSvzIay5+Yzi0fmZzPgnTbPcKjJAkT8= github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= github.com/gogo/protobuf v1.1.1/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ= github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= diff 
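The README's blob pipeline (chunk, compress with `zstd`, encrypt with `age`, address by hash) can be sketched in a few lines. This is a minimal illustration, not vaultik's packer: it assumes `filippo.io/age` and `github.com/klauspost/compress/zstd` (both consistent with the features listed), buffers in memory where the real code streams, and hashes the sealed blob to derive the content address; whether the address covers ciphertext or plaintext is not shown in this diff.

```go
package blobsketch

import (
	"bytes"
	"crypto/sha256"
	"encoding/hex"

	"filippo.io/age"
	"github.com/klauspost/compress/zstd"
)

// packBlob compresses plaintext with zstd, encrypts the compressed stream to
// an age recipient, and returns the SHA-256 of the sealed blob as its
// content address. Hypothetical helper; the real packer streams instead.
func packBlob(plain []byte, rcpt age.Recipient) (string, []byte, error) {
	var sealed bytes.Buffer
	encw, err := age.Encrypt(&sealed, rcpt)
	if err != nil {
		return "", nil, err
	}
	zw, err := zstd.NewWriter(encw) // compression level is configurable in vaultik
	if err != nil {
		return "", nil, err
	}
	if _, err := zw.Write(plain); err != nil {
		return "", nil, err
	}
	if err := zw.Close(); err != nil { // flush the compressed stream
		return "", nil, err
	}
	if err := encw.Close(); err != nil { // finalize the age ciphertext
		return "", nil, err
	}
	sum := sha256.Sum256(sealed.Bytes())
	return hex.EncodeToString(sum[:]), sealed.Bytes(), nil
}
```

With the key layout noted in TODO.md, such a hash would map to an object key of the form `blobs/{hash[:2]}/{hash[2:4]}/{hash}`.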
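The orphan-detection core of the `snapshot remove` algorithm above is a set difference over manifests. A sketch under assumed names: `Manifest` and the download callback stand in for the real types in `internal/vaultik`, which may differ.

```go
package prunesketch

import (
	"context"
	"fmt"
)

// Manifest stands in for the per-snapshot blob list; hypothetical type.
type Manifest struct {
	BlobHashes []string
}

// orphanedBlobs implements steps 2-5: blobs referenced only by the target
// snapshot are safe to delete once the target's metadata is removed.
func orphanedBlobs(ctx context.Context, target string, all []string,
	download func(ctx context.Context, id string) (*Manifest, error)) ([]string, error) {
	inUse := make(map[string]struct{})
	for _, id := range all {
		if id == target {
			continue // the target itself does not pin its own blobs
		}
		m, err := download(ctx, id)
		if err != nil {
			return nil, fmt.Errorf("downloading manifest for %s: %w", id, err)
		}
		for _, h := range m.BlobHashes {
			inUse[h] = struct{}{}
		}
	}
	tm, err := download(ctx, target)
	if err != nil {
		return nil, err
	}
	var orphaned []string
	for _, h := range tm.BlobHashes {
		if _, ok := inUse[h]; !ok {
			orphaned = append(orphaned, h)
		}
	}
	return orphaned, nil
}
```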
--git a/internal/chunker/chunker.go b/internal/chunker/chunker.go index b162b26..3995990 100644 --- a/internal/chunker/chunker.go +++ b/internal/chunker/chunker.go @@ -6,8 +6,6 @@ import ( "fmt" "io" "os" - - "github.com/jotfs/fastcdc-go" ) // Chunk represents a single chunk of data produced by the content-defined chunking algorithm. @@ -48,16 +46,8 @@ func NewChunker(avgChunkSize int64) *Chunker { // reasonably sized inputs. For large files or streams, use ChunkReaderStreaming instead. // Returns an error if chunking fails or if reading from the input fails. func (c *Chunker) ChunkReader(r io.Reader) ([]Chunk, error) { - opts := fastcdc.Options{ - MinSize: c.minChunkSize, - AverageSize: c.avgChunkSize, - MaxSize: c.maxChunkSize, - } - - chunker, err := fastcdc.NewChunker(r, opts) - if err != nil { - return nil, fmt.Errorf("creating chunker: %w", err) - } + chunker := AcquireReusableChunker(r, c.minChunkSize, c.avgChunkSize, c.maxChunkSize) + defer chunker.Release() var chunks []Chunk offset := int64(0) @@ -74,7 +64,7 @@ func (c *Chunker) ChunkReader(r io.Reader) ([]Chunk, error) { // Calculate hash hash := sha256.Sum256(chunk.Data) - // Make a copy of the data since FastCDC reuses the buffer + // Make a copy of the data since the chunker reuses the buffer chunkData := make([]byte, len(chunk.Data)) copy(chunkData, chunk.Data) @@ -107,16 +97,8 @@ func (c *Chunker) ChunkReaderStreaming(r io.Reader, callback ChunkCallback) (str fileHasher := sha256.New() teeReader := io.TeeReader(r, fileHasher) - opts := fastcdc.Options{ - MinSize: c.minChunkSize, - AverageSize: c.avgChunkSize, - MaxSize: c.maxChunkSize, - } - - chunker, err := fastcdc.NewChunker(teeReader, opts) - if err != nil { - return "", fmt.Errorf("creating chunker: %w", err) - } + chunker := AcquireReusableChunker(teeReader, c.minChunkSize, c.avgChunkSize, c.maxChunkSize) + defer chunker.Release() offset := int64(0) @@ -132,13 +114,12 @@ func (c *Chunker) ChunkReaderStreaming(r io.Reader, callback ChunkCallback) (str // Calculate chunk hash hash := sha256.Sum256(chunk.Data) - // Make a copy of the data since FastCDC reuses the buffer - chunkData := make([]byte, len(chunk.Data)) - copy(chunkData, chunk.Data) - + // Pass the data directly - caller must process it before we call Next() again + // (chunker reuses its internal buffer, but since we process synchronously + // and completely before continuing, no copy is needed) if err := callback(Chunk{ Hash: hex.EncodeToString(hash[:]), - Data: chunkData, + Data: chunk.Data, Offset: offset, Size: int64(len(chunk.Data)), }); err != nil { diff --git a/internal/chunker/fastcdc.go b/internal/chunker/fastcdc.go new file mode 100644 index 0000000..54c87c7 --- /dev/null +++ b/internal/chunker/fastcdc.go @@ -0,0 +1,265 @@ +package chunker + +import ( + "io" + "math" + "sync" +) + +// ReusableChunker implements FastCDC with reusable buffers to minimize allocations. +// Unlike the upstream fastcdc-go library which allocates a new buffer per file, +// this implementation uses sync.Pool to reuse buffers across files. +type ReusableChunker struct { + minSize int + maxSize int + normSize int + bufSize int + + maskS uint64 + maskL uint64 + + rd io.Reader + + buf []byte + cursor int + offset int + eof bool +} + +// reusableChunkerPool pools ReusableChunker instances to avoid allocations. +var reusableChunkerPool = sync.Pool{ + New: func() interface{} { + return &ReusableChunker{} + }, +} + +// bufferPools contains pools for different buffer sizes. +// Key is the buffer size. 
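The copy removed from `ChunkReaderStreaming` above tightens the callback contract: `chunk.Data` aliases the chunker's internal buffer and is invalidated once the next chunk is read. A sketch of both sides of that contract; the callback signature is inferred from the calls in the diff, and `uploadChunk` is a hypothetical consumer.

```go
package chunkersketch

import (
	"io"

	"git.eeqj.de/sneak/vaultik/internal/chunker"
)

// uploadChunk is a hypothetical consumer standing in for the blob packer.
func uploadChunk(hash string, data []byte) error { _, _ = hash, data; return nil }

// Safe: chunk.Data is fully consumed before the callback returns, so the
// chunker may reuse its buffer on the next iteration.
func chunkFile(c *chunker.Chunker, f io.Reader) (string, error) {
	return c.ChunkReaderStreaming(f, func(chunk chunker.Chunk) error {
		return uploadChunk(chunk.Hash, chunk.Data)
	})
}

// Retaining data past the callback requires an explicit copy, which is
// exactly what the old implementation did unconditionally for every chunk.
func collectChunks(c *chunker.Chunker, f io.Reader) ([][]byte, error) {
	var retained [][]byte
	_, err := c.ChunkReaderStreaming(f, func(chunk chunker.Chunk) error {
		buf := make([]byte, len(chunk.Data))
		copy(buf, chunk.Data) // chunk.Data is invalid after the next read
		retained = append(retained, buf)
		return nil
	})
	return retained, err
}
```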
+var bufferPools = sync.Map{} + +func getBuffer(size int) []byte { + poolI, _ := bufferPools.LoadOrStore(size, &sync.Pool{ + New: func() interface{} { + buf := make([]byte, size) + return &buf + }, + }) + pool := poolI.(*sync.Pool) + return *pool.Get().(*[]byte) +} + +func putBuffer(buf []byte) { + size := cap(buf) + poolI, ok := bufferPools.Load(size) + if ok { + pool := poolI.(*sync.Pool) + b := buf[:size] + pool.Put(&b) + } +} + +// FastCDCChunk represents a chunk from the FastCDC algorithm. +type FastCDCChunk struct { + Offset int + Length int + Data []byte + Fingerprint uint64 +} + +// AcquireReusableChunker gets a chunker from the pool and initializes it for the given reader. +func AcquireReusableChunker(rd io.Reader, minSize, avgSize, maxSize int) *ReusableChunker { + c := reusableChunkerPool.Get().(*ReusableChunker) + + bufSize := maxSize * 2 + + // Reuse buffer if it's the right size, otherwise get a new one + if c.buf == nil || cap(c.buf) != bufSize { + if c.buf != nil { + putBuffer(c.buf) + } + c.buf = getBuffer(bufSize) + } else { + // Restore buffer to full capacity (may have been truncated by previous EOF) + c.buf = c.buf[:cap(c.buf)] + } + + bits := int(math.Round(math.Log2(float64(avgSize)))) + normalization := 2 + smallBits := bits + normalization + largeBits := bits - normalization + + c.minSize = minSize + c.maxSize = maxSize + c.normSize = avgSize + c.bufSize = bufSize + c.maskS = (1 << smallBits) - 1 + c.maskL = (1 << largeBits) - 1 + c.rd = rd + c.cursor = bufSize + c.offset = 0 + c.eof = false + + return c +} + +// Release returns the chunker to the pool for reuse. +func (c *ReusableChunker) Release() { + c.rd = nil + reusableChunkerPool.Put(c) +} + +func (c *ReusableChunker) fillBuffer() error { + n := len(c.buf) - c.cursor + if n >= c.maxSize { + return nil + } + + // Move all data after the cursor to the start of the buffer + copy(c.buf[:n], c.buf[c.cursor:]) + c.cursor = 0 + + if c.eof { + c.buf = c.buf[:n] + return nil + } + + // Restore buffer to full capacity for reading + c.buf = c.buf[:c.bufSize] + + // Fill the rest of the buffer + m, err := io.ReadFull(c.rd, c.buf[n:]) + if err == io.EOF || err == io.ErrUnexpectedEOF { + c.buf = c.buf[:n+m] + c.eof = true + } else if err != nil { + return err + } + return nil +} + +// Next returns the next chunk or io.EOF when done. +// The returned Data slice is only valid until the next call to Next. 
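A typical acquire/iterate/release loop for this API looks like the following; the chunk sizes are illustrative and `process` is a placeholder.

```go
package chunkersketch

import (
	"io"

	"git.eeqj.de/sneak/vaultik/internal/chunker"
)

// process is a placeholder for whatever consumes chunk bytes.
func process(data []byte) error { _ = data; return nil }

// chunkStream shows the acquire/iterate/release pattern; sizes are examples.
func chunkStream(r io.Reader) error {
	ck := chunker.AcquireReusableChunker(r, 64*1024, 256*1024, 1024*1024)
	// Release returns the chunker, buffer still attached, to the pool.
	defer ck.Release()
	for {
		chunk, err := ck.Next()
		if err == io.EOF {
			return nil
		}
		if err != nil {
			return err
		}
		// chunk.Data aliases the pooled buffer and is valid only until the
		// next call to Next; copy it if it must be retained.
		if err := process(chunk.Data); err != nil {
			return err
		}
	}
}
```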
+func (c *ReusableChunker) Next() (FastCDCChunk, error) { + if err := c.fillBuffer(); err != nil { + return FastCDCChunk{}, err + } + if len(c.buf) == 0 { + return FastCDCChunk{}, io.EOF + } + + length, fp := c.nextChunk(c.buf[c.cursor:]) + + chunk := FastCDCChunk{ + Offset: c.offset, + Length: length, + Data: c.buf[c.cursor : c.cursor+length], + Fingerprint: fp, + } + + c.cursor += length + c.offset += chunk.Length + + return chunk, nil +} + +func (c *ReusableChunker) nextChunk(data []byte) (int, uint64) { + fp := uint64(0) + i := c.minSize + + if len(data) <= c.minSize { + return len(data), fp + } + + n := min(len(data), c.maxSize) + + for ; i < min(n, c.normSize); i++ { + fp = (fp << 1) + table[data[i]] + if (fp & c.maskS) == 0 { + return i + 1, fp + } + } + + for ; i < n; i++ { + fp = (fp << 1) + table[data[i]] + if (fp & c.maskL) == 0 { + return i + 1, fp + } + } + + return i, fp +} + +func min(a, b int) int { + if a < b { + return a + } + return b +} + +// 256 random uint64s for the rolling hash function (from FastCDC paper) +var table = [256]uint64{ + 0xe80e8d55032474b3, 0x11b25b61f5924e15, 0x03aa5bd82a9eb669, 0xc45a153ef107a38c, + 0xeac874b86f0f57b9, 0xa5ccedec95ec79c7, 0xe15a3320ad42ac0a, 0x5ed3583fa63cec15, + 0xcd497bf624a4451d, 0xf9ade5b059683605, 0x773940c03fb11ca1, 0xa36b16e4a6ae15b2, + 0x67afd1adb5a89eac, 0xc44c75ee32f0038e, 0x2101790f365c0967, 0x76415c64a222fc4a, + 0x579929249a1e577a, 0xe4762fc41fdbf750, 0xea52198e57dfcdcc, 0xe2535aafe30b4281, + 0xcb1a1bd6c77c9056, 0x5a1aa9bfc4612a62, 0x15a728aef8943eb5, 0x2f8f09738a8ec8d9, + 0x200f3dec9fac8074, 0x0fa9a7b1e0d318df, 0x06c0804ffd0d8e3a, 0x630cbc412669dd25, + 0x10e34f85f4b10285, 0x2a6fe8164b9b6410, 0xcacb57d857d55810, 0x77f8a3a36ff11b46, + 0x66af517e0dc3003e, 0x76c073c789b4009a, 0x853230dbb529f22a, 0x1e9e9c09a1f77e56, + 0x1e871223802ee65d, 0x37fe4588718ff813, 0x10088539f30db464, 0x366f7470b80b72d1, + 0x33f2634d9a6b31db, 0xd43917751d69ea18, 0xa0f492bc1aa7b8de, 0x3f94e5a8054edd20, + 0xedfd6e25eb8b1dbf, 0x759517a54f196a56, 0xe81d5006ec7b6b17, 0x8dd8385fa894a6b7, + 0x45f4d5467b0d6f91, 0xa1f894699de22bc8, 0x33829d09ef93e0fe, 0x3e29e250caed603c, + 0xf7382cba7f63a45e, 0x970f95412bb569d1, 0xc7fcea456d356b4b, 0x723042513f3e7a57, + 0x17ae7688de3596f1, 0x27ac1fcd7cd23c1a, 0xf429beeb78b3f71f, 0xd0780692fb93a3f9, + 0x9f507e28a7c9842f, 0x56001ad536e433ae, 0x7e1dd1ecf58be306, 0x15fee353aa233fc6, + 0xb033a0730b7638e8, 0xeb593ad6bd2406d1, 0x7c86502574d0f133, 0xce3b008d4ccb4be7, + 0xf8566e3d383594c8, 0xb2c261e9b7af4429, 0xf685e7e253799dbb, 0x05d33ed60a494cbc, + 0xeaf88d55a4cb0d1a, 0x3ee9368a902415a1, 0x8980fe6a8493a9a4, 0x358ed008cb448631, + 0xd0cb7e37b46824b8, 0xe9bc375c0bc94f84, 0xea0bf1d8e6b55bb3, 0xb66a60d0f9f6f297, + 0x66db2cc4807b3758, 0x7e4e014afbca8b4d, 0xa5686a4938b0c730, 0xa5f0d7353d623316, + 0x26e38c349242d5e8, 0xeeefa80a29858e30, 0x8915cb912aa67386, 0x4b957a47bfc420d4, + 0xbb53d051a895f7e1, 0x09f5e3235f6911ce, 0x416b98e695cfb7ce, 0x97a08183344c5c86, + 0xbf68e0791839a861, 0xea05dde59ed3ed56, 0x0ca732280beda160, 0xac748ed62fe7f4e2, + 0xc686da075cf6e151, 0xe1ba5658f4af05c8, 0xe9ff09fbeb67cc35, 0xafaea9470323b28d, + 0x0291e8db5bb0ac2a, 0x342072a9bbee77ae, 0x03147eed6b3d0a9c, 0x21379d4de31dbadb, + 0x2388d965226fb986, 0x52c96988bfebabfa, 0xa6fc29896595bc2d, 0x38fa4af70aa46b8b, + 0xa688dd13939421ee, 0x99d5275d9b1415da, 0x453d31bb4fe73631, 0xde51debc1fbe3356, + 0x75a3c847a06c622f, 0xe80e32755d272579, 0x5444052250d8ec0d, 0x8f17dfda19580a3b, + 0xf6b3e9363a185e42, 0x7a42adec6868732f, 0x32cb6a07629203a2, 0x1eca8957defe56d9, + 0x9fa85e4bc78ff9ed, 
0x20ff07224a499ca7, 0x3fa6295ff9682c70, 0xe3d5b1e3ce993eff, + 0xa341209362e0b79a, 0x64bd9eae5712ffe8, 0xceebb537babbd12a, 0x5586ef404315954f, + 0x46c3085c938ab51a, 0xa82ccb9199907cee, 0x8c51b6690a3523c8, 0xc4dbd4c9ae518332, + 0x979898dbb23db7b2, 0x1b5b585e6f672a9d, 0xce284da7c4903810, 0x841166e8bb5f1c4f, + 0xb7d884a3fceca7d0, 0xa76468f5a4572374, 0xc10c45f49ee9513d, 0x68f9a5663c1908c9, + 0x0095a13476a6339d, 0xd1d7516ffbe9c679, 0xfd94ab0c9726f938, 0x627468bbdb27c959, + 0xedc3f8988e4a8c9a, 0x58efd33f0dfaa499, 0x21e37d7e2ef4ac8b, 0x297f9ab5586259c6, + 0xda3ba4dc6cb9617d, 0xae11d8d9de2284d2, 0xcfeed88cb3729865, 0xefc2f9e4f03e2633, + 0x8226393e8f0855a4, 0xd6e25fd7acf3a767, 0x435784c3bfd6d14a, 0xf97142e6343fe757, + 0xd73b9fe826352f85, 0x6c3ac444b5b2bd76, 0xd8e88f3e9fd4a3fd, 0x31e50875c36f3460, + 0xa824f1bf88cf4d44, 0x54a4d2c8f5f25899, 0xbff254637ce3b1e6, 0xa02cfe92561b3caa, + 0x7bedb4edee9f0af7, 0x879c0620ac49a102, 0xa12c4ccd23b332e7, 0x09a5ff47bf94ed1e, + 0x7b62f43cd3046fa0, 0xaa3af0476b9c2fb9, 0x22e55301abebba8e, 0x3a6035c42747bd58, + 0x1705373106c8ec07, 0xb1f660de828d0628, 0x065fe82d89ca563d, 0xf555c2d8074d516d, + 0x6bb6c186b423ee99, 0x54a807be6f3120a8, 0x8a3c7fe2f88860b8, 0xbeffc344f5118e81, + 0xd686e80b7d1bd268, 0x661aef4ef5e5e88b, 0x5bf256c654cd1dda, 0x9adb1ab85d7640f4, + 0x68449238920833a2, 0x843279f4cebcb044, 0xc8710cdefa93f7bb, 0x236943294538f3e6, + 0x80d7d136c486d0b4, 0x61653956b28851d3, 0x3f843be9a9a956b5, 0xf73cfbbf137987e5, + 0xcf0cb6dee8ceac2c, 0x50c401f52f185cae, 0xbdbe89ce735c4c1c, 0xeef3ade9c0570bc7, + 0xbe8b066f8f64cbf6, 0x5238d6131705dcb9, 0x20219086c950e9f6, 0x634468d9ed74de02, + 0x0aba4b3d705c7fa5, 0x3374416f725a6672, 0xe7378bdf7beb3bc6, 0x0f7b6a1b1cee565b, + 0x234e4c41b0c33e64, 0x4efa9a0c3f21fe28, 0x1167fc551643e514, 0x9f81a69d3eb01fa4, + 0xdb75c22b12306ed0, 0xe25055d738fc9686, 0x9f9f167a3f8507bb, 0x195f8336d3fbe4d3, + 0x8442b6feffdcb6f6, 0x1e07ed24746ffde9, 0x140e31462d555266, 0x8bd0ce515ae1406e, + 0x2c0be0042b5584b3, 0x35a23d0e15d45a60, 0xc14f1ba147d9bc83, 0xbbf168691264b23f, + 0xad2cc7b57e589ade, 0x9501963154c7815c, 0x9664afa6b8d67d47, 0x7f9e5101fea0a81c, + 0x45ecffb610d25bfd, 0x3157f7aecf9b6ab3, 0xc43ca6f88d87501d, 0x9576ff838dee38dc, + 0x93f21afe0ce1c7d7, 0xceac699df343d8f9, 0x2fec49e29f03398d, 0x8805ccd5730281ed, + 0xf9fc16fc750a8e59, 0x35308cc771adf736, 0x4a57b7c9ee2b7def, 0x03a4c6cdc937a02a, + 0x6c9a8a269fc8c4fc, 0x4681decec7a03f43, 0x342eecded1353ef9, 0x8be0552d8413a867, + 0xc7b4ac51beda8be8, 0xebcc64fb719842c0, 0xde8e4c7fb6d40c1c, 0xcc8263b62f9738b1, + 0xd3cfc0f86511929a, 0x466024ce8bb226ea, 0x459ff690253a3c18, 0x98b27e9d91284c9c, + 0x75c3ae8aa3af373d, 0xfbf8f8e79a866ffc, 0x32327f59d0662799, 0x8228b57e729e9830, + 0x065ceb7a18381b58, 0xd2177671a31dc5ff, 0x90cd801f2f8701f9, 0x9d714428471c65fe, +} diff --git a/internal/cli/restore.go b/internal/cli/restore.go index fa3d396..cd112e3 100644 --- a/internal/cli/restore.go +++ b/internal/cli/restore.go @@ -8,7 +8,6 @@ import ( "git.eeqj.de/sneak/vaultik/internal/database" "git.eeqj.de/sneak/vaultik/internal/globals" "git.eeqj.de/sneak/vaultik/internal/log" - "git.eeqj.de/sneak/vaultik/internal/snapshot" "git.eeqj.de/sneak/vaultik/internal/storage" "github.com/spf13/cobra" "go.uber.org/fx" @@ -60,7 +59,6 @@ The age_secret_key must be configured in the config file for decryption.`, Debug: rootFlags.Debug, }, Modules: []fx.Option{ - snapshot.Module, fx.Provide(fx.Annotate( func(g *globals.Globals, cfg *config.Config, repos *database.Repositories, storer storage.Storer, db *database.DB, shutdowner fx.Shutdowner) *RestoreApp { 
@@ -113,7 +111,7 @@ The age_secret_key must be configured in the config file for decryption.`,
func (app *RestoreApp) runRestore(ctx context.Context, snapshotID string, opts *RestoreOptions) error {
	// Check for age_secret_key
	if app.Config.AgeSecretKey == "" {
-		return fmt.Errorf("age_secret_key missing from config - required for restore")
+		return fmt.Errorf("age_secret_key required for restore - set in config file or VAULTIK_AGE_SECRET_KEY environment variable")
	}
	log.Info("Starting restore operation",
diff --git a/internal/cli/snapshot.go b/internal/cli/snapshot.go
index 7cdd255..4e9eb47 100644
--- a/internal/cli/snapshot.go
+++ b/internal/cli/snapshot.go
@@ -24,6 +24,8 @@ func NewSnapshotCommand() *cobra.Command {
	cmd.AddCommand(newSnapshotListCommand())
	cmd.AddCommand(newSnapshotPurgeCommand())
	cmd.AddCommand(newSnapshotVerifyCommand())
+	cmd.AddCommand(newSnapshotRemoveCommand())
+	cmd.AddCommand(newSnapshotPruneCommand())
	return cmd
}
@@ -281,3 +283,123 @@ func newSnapshotVerifyCommand() *cobra.Command {
	return cmd
}
+
+// newSnapshotRemoveCommand creates the 'snapshot remove' subcommand
+func newSnapshotRemoveCommand() *cobra.Command {
+	opts := &vaultik.RemoveOptions{}
+
+	cmd := &cobra.Command{
+		Use:     "remove <snapshot-id>",
+		Aliases: []string{"rm"},
+		Short:   "Remove a snapshot and its orphaned blobs",
+		Long: `Removes a snapshot and any blobs that are no longer referenced by other snapshots.
+
+This command downloads manifests from all other snapshots to determine which blobs
+are still in use, then deletes any blobs that would become orphaned.`,
+		Args: cobra.ExactArgs(1),
+		RunE: func(cmd *cobra.Command, args []string) error {
+			snapshotID := args[0]
+
+			// Use unified config resolution
+			configPath, err := ResolveConfigPath()
+			if err != nil {
+				return err
+			}
+
+			rootFlags := GetRootFlags()
+			return RunWithApp(cmd.Context(), AppOptions{
+				ConfigPath: configPath,
+				LogOptions: log.LogOptions{
+					Verbose: rootFlags.Verbose,
+					Debug:   rootFlags.Debug,
+				},
+				Modules: []fx.Option{},
+				Invokes: []fx.Option{
+					fx.Invoke(func(v *vaultik.Vaultik, lc fx.Lifecycle) {
+						lc.Append(fx.Hook{
+							OnStart: func(ctx context.Context) error {
+								go func() {
+									if _, err := v.RemoveSnapshot(snapshotID, opts); err != nil {
+										if err != context.Canceled {
+											log.Error("Failed to remove snapshot", "error", err)
+											os.Exit(1)
+										}
+									}
+									if err := v.Shutdowner.Shutdown(); err != nil {
+										log.Error("Failed to shutdown", "error", err)
+									}
+								}()
+								return nil
+							},
+							OnStop: func(ctx context.Context) error {
+								v.Cancel()
+								return nil
+							},
+						})
+					}),
+				},
+			})
+		},
+	}
+
+	cmd.Flags().BoolVarP(&opts.Force, "force", "f", false, "Skip confirmation prompt")
+	cmd.Flags().BoolVar(&opts.DryRun, "dry-run", false, "Show what would be deleted without deleting")
+
+	return cmd
+}
+
+// newSnapshotPruneCommand creates the 'snapshot prune' subcommand
+func newSnapshotPruneCommand() *cobra.Command {
+	cmd := &cobra.Command{
+		Use:   "prune",
+		Short: "Remove orphaned data from local database",
+		Long: `Removes orphaned files, chunks, and blobs from the local database. 
+ +This cleans up data that is no longer referenced by any snapshot, which can +accumulate from incomplete backups or deleted snapshots.`, + Args: cobra.NoArgs, + RunE: func(cmd *cobra.Command, args []string) error { + // Use unified config resolution + configPath, err := ResolveConfigPath() + if err != nil { + return err + } + + rootFlags := GetRootFlags() + return RunWithApp(cmd.Context(), AppOptions{ + ConfigPath: configPath, + LogOptions: log.LogOptions{ + Verbose: rootFlags.Verbose, + Debug: rootFlags.Debug, + }, + Modules: []fx.Option{}, + Invokes: []fx.Option{ + fx.Invoke(func(v *vaultik.Vaultik, lc fx.Lifecycle) { + lc.Append(fx.Hook{ + OnStart: func(ctx context.Context) error { + go func() { + if _, err := v.PruneDatabase(); err != nil { + if err != context.Canceled { + log.Error("Failed to prune database", "error", err) + os.Exit(1) + } + } + if err := v.Shutdowner.Shutdown(); err != nil { + log.Error("Failed to shutdown", "error", err) + } + }() + return nil + }, + OnStop: func(ctx context.Context) error { + v.Cancel() + return nil + }, + }) + }), + }, + }) + }, + } + + return cmd +} diff --git a/internal/config/config.go b/internal/config/config.go index 03c872a..d1b5364 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -146,6 +146,11 @@ func Load(path string) (*Config, error) { cfg.IndexPath = expandTilde(envIndexPath) } + // Check for environment variable override for AgeSecretKey + if envAgeSecretKey := os.Getenv("VAULTIK_AGE_SECRET_KEY"); envAgeSecretKey != "" { + cfg.AgeSecretKey = envAgeSecretKey + } + // Get hostname if not set if cfg.Hostname == "" { hostname, err := os.Hostname() diff --git a/internal/database/chunk_files.go b/internal/database/chunk_files.go index cb03d85..e967322 100644 --- a/internal/database/chunk_files.go +++ b/internal/database/chunk_files.go @@ -132,3 +132,80 @@ func (r *ChunkFileRepository) DeleteByFileID(ctx context.Context, tx *sql.Tx, fi return nil } + +// DeleteByFileIDs deletes all chunk_files for multiple files in a single statement. +func (r *ChunkFileRepository) DeleteByFileIDs(ctx context.Context, tx *sql.Tx, fileIDs []string) error { + if len(fileIDs) == 0 { + return nil + } + + // Batch at 500 to stay within SQLite's variable limit + const batchSize = 500 + + for i := 0; i < len(fileIDs); i += batchSize { + end := i + batchSize + if end > len(fileIDs) { + end = len(fileIDs) + } + batch := fileIDs[i:end] + + query := "DELETE FROM chunk_files WHERE file_id IN (?" + repeatPlaceholder(len(batch)-1) + ")" + args := make([]interface{}, len(batch)) + for j, id := range batch { + args[j] = id + } + + var err error + if tx != nil { + _, err = tx.ExecContext(ctx, query, args...) + } else { + _, err = r.db.ExecWithLog(ctx, query, args...) + } + if err != nil { + return fmt.Errorf("batch deleting chunk_files: %w", err) + } + } + + return nil +} + +// CreateBatch inserts multiple chunk_files in a single statement for efficiency. 
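These batch helpers all follow one pattern: cap rows-per-statement so that rows times placeholders-per-row stays under SQLite's variable limit (999 by default in older builds, per the comments later in this diff). A generic splitter expressing that shared shape; a sketch, not code from the repository:

```go
package dbsketch

// inBatches calls fn over consecutive sub-slices of at most size elements.
// With k placeholders per row, choose size so that size*k stays at or below
// 999 (SQLITE_MAX_VARIABLE_NUMBER's historical default).
func inBatches[T any](items []T, size int, fn func(batch []T) error) error {
	for i := 0; i < len(items); i += size {
		end := i + size
		if end > len(items) {
			end = len(items)
		}
		if err := fn(items[i:end]); err != nil {
			return err
		}
	}
	return nil
}
```

The concrete sizes chosen in this diff (500 for one-placeholder deletes, 400 for two-column inserts, 300 for three, 200 for four, 100 for nine) all satisfy that bound.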
+func (r *ChunkFileRepository) CreateBatch(ctx context.Context, tx *sql.Tx, cfs []ChunkFile) error { + if len(cfs) == 0 { + return nil + } + + // Each ChunkFile has 4 values, so batch at 200 to be safe with SQLite's variable limit + const batchSize = 200 + + for i := 0; i < len(cfs); i += batchSize { + end := i + batchSize + if end > len(cfs) { + end = len(cfs) + } + batch := cfs[i:end] + + query := "INSERT INTO chunk_files (chunk_hash, file_id, file_offset, length) VALUES " + args := make([]interface{}, 0, len(batch)*4) + for j, cf := range batch { + if j > 0 { + query += ", " + } + query += "(?, ?, ?, ?)" + args = append(args, cf.ChunkHash, cf.FileID, cf.FileOffset, cf.Length) + } + query += " ON CONFLICT(chunk_hash, file_id) DO NOTHING" + + var err error + if tx != nil { + _, err = tx.ExecContext(ctx, query, args...) + } else { + _, err = r.db.ExecWithLog(ctx, query, args...) + } + if err != nil { + return fmt.Errorf("batch inserting chunk_files: %w", err) + } + } + + return nil +} diff --git a/internal/database/database.go b/internal/database/database.go index 0fd7b49..c90f7d7 100644 --- a/internal/database/database.go +++ b/internal/database/database.go @@ -205,6 +205,15 @@ func NewTestDB() (*DB, error) { return New(context.Background(), ":memory:") } +// repeatPlaceholder generates a string of ", ?" repeated n times for IN clause construction. +// For example, repeatPlaceholder(2) returns ", ?, ?". +func repeatPlaceholder(n int) string { + if n <= 0 { + return "" + } + return strings.Repeat(", ?", n) +} + // LogSQL logs SQL queries and their arguments when debug mode is enabled. // Debug mode is activated by setting the GODEBUG environment variable to include "vaultik". // This is useful for troubleshooting database operations and understanding query patterns. diff --git a/internal/database/file_chunks.go b/internal/database/file_chunks.go index e94d71c..e3c22cc 100644 --- a/internal/database/file_chunks.go +++ b/internal/database/file_chunks.go @@ -157,6 +157,86 @@ func (r *FileChunkRepository) DeleteByFileID(ctx context.Context, tx *sql.Tx, fi return nil } +// DeleteByFileIDs deletes all chunks for multiple files in a single statement. +func (r *FileChunkRepository) DeleteByFileIDs(ctx context.Context, tx *sql.Tx, fileIDs []string) error { + if len(fileIDs) == 0 { + return nil + } + + // Batch at 500 to stay within SQLite's variable limit + const batchSize = 500 + + for i := 0; i < len(fileIDs); i += batchSize { + end := i + batchSize + if end > len(fileIDs) { + end = len(fileIDs) + } + batch := fileIDs[i:end] + + query := "DELETE FROM file_chunks WHERE file_id IN (?" + repeatPlaceholder(len(batch)-1) + ")" + args := make([]interface{}, len(batch)) + for j, id := range batch { + args[j] = id + } + + var err error + if tx != nil { + _, err = tx.ExecContext(ctx, query, args...) + } else { + _, err = r.db.ExecWithLog(ctx, query, args...) + } + if err != nil { + return fmt.Errorf("batch deleting file_chunks: %w", err) + } + } + + return nil +} + +// CreateBatch inserts multiple file_chunks in a single statement for efficiency. +// Batches are automatically split to stay within SQLite's variable limit. +func (r *FileChunkRepository) CreateBatch(ctx context.Context, tx *sql.Tx, fcs []FileChunk) error { + if len(fcs) == 0 { + return nil + } + + // SQLite has a limit on variables (typically 999 or 32766). + // Each FileChunk has 3 values, so batch at 300 to be safe. 
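For concreteness, here is what the `IN`-clause construction used by the delete helpers produces, reusing the `repeatPlaceholder` helper defined above; this small program is illustrative only.

```go
package main

import (
	"fmt"
	"strings"
)

// repeatPlaceholder as defined in internal/database/database.go above.
func repeatPlaceholder(n int) string {
	if n <= 0 {
		return ""
	}
	return strings.Repeat(", ?", n)
}

func main() {
	ids := []string{"f1", "f2", "f3"}
	query := "DELETE FROM file_chunks WHERE file_id IN (?" +
		repeatPlaceholder(len(ids)-1) + ")"
	fmt.Println(query)
	// Output: DELETE FROM file_chunks WHERE file_id IN (?, ?, ?)
}
```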
+ const batchSize = 300 + + for i := 0; i < len(fcs); i += batchSize { + end := i + batchSize + if end > len(fcs) { + end = len(fcs) + } + batch := fcs[i:end] + + // Build the query with multiple value sets + query := "INSERT INTO file_chunks (file_id, idx, chunk_hash) VALUES " + args := make([]interface{}, 0, len(batch)*3) + for j, fc := range batch { + if j > 0 { + query += ", " + } + query += "(?, ?, ?)" + args = append(args, fc.FileID, fc.Idx, fc.ChunkHash) + } + query += " ON CONFLICT(file_id, idx) DO NOTHING" + + var err error + if tx != nil { + _, err = tx.ExecContext(ctx, query, args...) + } else { + _, err = r.db.ExecWithLog(ctx, query, args...) + } + if err != nil { + return fmt.Errorf("batch inserting file_chunks: %w", err) + } + } + + return nil +} + // GetByFile is an alias for GetByPath for compatibility func (r *FileChunkRepository) GetByFile(ctx context.Context, path string) ([]*FileChunk, error) { LogSQL("GetByFile", "Starting", path) diff --git a/internal/database/files.go b/internal/database/files.go index c0b1584..d6c8dcb 100644 --- a/internal/database/files.go +++ b/internal/database/files.go @@ -302,6 +302,55 @@ func (r *FileRepository) ListByPrefix(ctx context.Context, prefix string) ([]*Fi return files, rows.Err() } +// CreateBatch inserts or updates multiple files in a single statement for efficiency. +// File IDs must be pre-generated before calling this method. +func (r *FileRepository) CreateBatch(ctx context.Context, tx *sql.Tx, files []*File) error { + if len(files) == 0 { + return nil + } + + // Each File has 9 values, so batch at 100 to be safe with SQLite's variable limit + const batchSize = 100 + + for i := 0; i < len(files); i += batchSize { + end := i + batchSize + if end > len(files) { + end = len(files) + } + batch := files[i:end] + + query := `INSERT INTO files (id, path, mtime, ctime, size, mode, uid, gid, link_target) VALUES ` + args := make([]interface{}, 0, len(batch)*9) + for j, f := range batch { + if j > 0 { + query += ", " + } + query += "(?, ?, ?, ?, ?, ?, ?, ?, ?)" + args = append(args, f.ID, f.Path, f.MTime.Unix(), f.CTime.Unix(), f.Size, f.Mode, f.UID, f.GID, f.LinkTarget) + } + query += ` ON CONFLICT(path) DO UPDATE SET + mtime = excluded.mtime, + ctime = excluded.ctime, + size = excluded.size, + mode = excluded.mode, + uid = excluded.uid, + gid = excluded.gid, + link_target = excluded.link_target` + + var err error + if tx != nil { + _, err = tx.ExecContext(ctx, query, args...) + } else { + _, err = r.db.ExecWithLog(ctx, query, args...) 
+ } + if err != nil { + return fmt.Errorf("batch inserting files: %w", err) + } + } + + return nil +} + // DeleteOrphaned deletes files that are not referenced by any snapshot func (r *FileRepository) DeleteOrphaned(ctx context.Context) error { query := ` diff --git a/internal/database/schema.sql b/internal/database/schema.sql index f28f791..f32521b 100644 --- a/internal/database/schema.sql +++ b/internal/database/schema.sql @@ -28,6 +28,9 @@ CREATE TABLE IF NOT EXISTS file_chunks ( FOREIGN KEY (chunk_hash) REFERENCES chunks(chunk_hash) ); +-- Index for efficient chunk lookups (used in orphan detection) +CREATE INDEX IF NOT EXISTS idx_file_chunks_chunk_hash ON file_chunks(chunk_hash); + -- Chunks table: stores unique content-defined chunks CREATE TABLE IF NOT EXISTS chunks ( chunk_hash TEXT PRIMARY KEY, @@ -56,6 +59,9 @@ CREATE TABLE IF NOT EXISTS blob_chunks ( FOREIGN KEY (chunk_hash) REFERENCES chunks(chunk_hash) ); +-- Index for efficient chunk lookups (used in orphan detection) +CREATE INDEX IF NOT EXISTS idx_blob_chunks_chunk_hash ON blob_chunks(chunk_hash); + -- Chunk files table: reverse mapping of chunks to files CREATE TABLE IF NOT EXISTS chunk_files ( chunk_hash TEXT NOT NULL, @@ -67,6 +73,9 @@ CREATE TABLE IF NOT EXISTS chunk_files ( FOREIGN KEY (file_id) REFERENCES files(id) ON DELETE CASCADE ); +-- Index for efficient file lookups (used in orphan detection) +CREATE INDEX IF NOT EXISTS idx_chunk_files_file_id ON chunk_files(file_id); + -- Snapshots table: tracks backup snapshots CREATE TABLE IF NOT EXISTS snapshots ( id TEXT PRIMARY KEY, @@ -96,6 +105,9 @@ CREATE TABLE IF NOT EXISTS snapshot_files ( FOREIGN KEY (file_id) REFERENCES files(id) ); +-- Index for efficient file lookups (used in orphan detection) +CREATE INDEX IF NOT EXISTS idx_snapshot_files_file_id ON snapshot_files(file_id); + -- Snapshot blobs table: maps snapshots to blobs CREATE TABLE IF NOT EXISTS snapshot_blobs ( snapshot_id TEXT NOT NULL, @@ -106,6 +118,9 @@ CREATE TABLE IF NOT EXISTS snapshot_blobs ( FOREIGN KEY (blob_id) REFERENCES blobs(id) ); +-- Index for efficient blob lookups (used in orphan detection) +CREATE INDEX IF NOT EXISTS idx_snapshot_blobs_blob_id ON snapshot_blobs(blob_id); + -- Uploads table: tracks blob upload metrics CREATE TABLE IF NOT EXISTS uploads ( blob_hash TEXT PRIMARY KEY, @@ -115,4 +130,7 @@ CREATE TABLE IF NOT EXISTS uploads ( duration_ms INTEGER NOT NULL, FOREIGN KEY (blob_hash) REFERENCES blobs(blob_hash), FOREIGN KEY (snapshot_id) REFERENCES snapshots(id) -); \ No newline at end of file +); + +-- Index for efficient snapshot lookups +CREATE INDEX IF NOT EXISTS idx_uploads_snapshot_id ON uploads(snapshot_id); \ No newline at end of file diff --git a/internal/database/snapshots.go b/internal/database/snapshots.go index c6e721b..25ba49a 100644 --- a/internal/database/snapshots.go +++ b/internal/database/snapshots.go @@ -289,6 +289,46 @@ func (r *SnapshotRepository) AddFileByID(ctx context.Context, tx *sql.Tx, snapsh return nil } +// AddFilesByIDBatch adds multiple files to a snapshot in batched inserts +func (r *SnapshotRepository) AddFilesByIDBatch(ctx context.Context, tx *sql.Tx, snapshotID string, fileIDs []string) error { + if len(fileIDs) == 0 { + return nil + } + + // Each entry has 2 values, so batch at 400 to be safe + const batchSize = 400 + + for i := 0; i < len(fileIDs); i += batchSize { + end := i + batchSize + if end > len(fileIDs) { + end = len(fileIDs) + } + batch := fileIDs[i:end] + + query := "INSERT OR IGNORE INTO snapshot_files (snapshot_id, file_id) VALUES " 
+ args := make([]interface{}, 0, len(batch)*2) + for j, fileID := range batch { + if j > 0 { + query += ", " + } + query += "(?, ?)" + args = append(args, snapshotID, fileID) + } + + var err error + if tx != nil { + _, err = tx.ExecContext(ctx, query, args...) + } else { + _, err = r.db.ExecWithLog(ctx, query, args...) + } + if err != nil { + return fmt.Errorf("batch adding files to snapshot: %w", err) + } + } + + return nil +} + // AddBlob adds a blob to a snapshot func (r *SnapshotRepository) AddBlob(ctx context.Context, tx *sql.Tx, snapshotID string, blobID string, blobHash string) error { query := ` diff --git a/internal/snapshot/exclude_test.go b/internal/snapshot/exclude_test.go new file mode 100644 index 0000000..6de6fc2 --- /dev/null +++ b/internal/snapshot/exclude_test.go @@ -0,0 +1,453 @@ +package snapshot_test + +import ( + "context" + "database/sql" + "path/filepath" + "testing" + "time" + + "git.eeqj.de/sneak/vaultik/internal/database" + "git.eeqj.de/sneak/vaultik/internal/log" + "git.eeqj.de/sneak/vaultik/internal/snapshot" + "github.com/spf13/afero" + "github.com/stretchr/testify/require" +) + +func setupExcludeTestFS(t *testing.T) afero.Fs { + t.Helper() + + // Create in-memory filesystem + fs := afero.NewMemMapFs() + + // Create test directory structure: + // /backup/ + // file1.txt (should be backed up) + // file2.log (should be excluded if *.log is in patterns) + // .git/ + // config (should be excluded if .git is in patterns) + // objects/ + // pack/ + // data.pack (should be excluded if .git is in patterns) + // src/ + // main.go (should be backed up) + // test.go (should be backed up) + // node_modules/ + // package/ + // index.js (should be excluded if node_modules is in patterns) + // cache/ + // temp.dat (should be excluded if cache/ is in patterns) + // build/ + // output.bin (should be excluded if build is in patterns) + // docs/ + // readme.md (should be backed up) + // .DS_Store (should be excluded if .DS_Store is in patterns) + // thumbs.db (should be excluded if thumbs.db is in patterns) + + files := map[string]string{ + "/backup/file1.txt": "content1", + "/backup/file2.log": "log content", + "/backup/.git/config": "git config", + "/backup/.git/objects/pack/data.pack": "pack data", + "/backup/src/main.go": "package main", + "/backup/src/test.go": "package main_test", + "/backup/node_modules/package/index.js": "module.exports = {}", + "/backup/cache/temp.dat": "cached data", + "/backup/build/output.bin": "binary data", + "/backup/docs/readme.md": "# Documentation", + "/backup/.DS_Store": "ds store data", + "/backup/thumbs.db": "thumbs data", + "/backup/src/.hidden": "hidden file", + "/backup/important.log.bak": "backup of log", + } + + testTime := time.Date(2024, 1, 1, 12, 0, 0, 0, time.UTC) + for path, content := range files { + dir := filepath.Dir(path) + err := fs.MkdirAll(dir, 0755) + require.NoError(t, err) + err = afero.WriteFile(fs, path, []byte(content), 0644) + require.NoError(t, err) + err = fs.Chtimes(path, testTime, testTime) + require.NoError(t, err) + } + + return fs +} + +func createTestScanner(t *testing.T, fs afero.Fs, excludePatterns []string) (*snapshot.Scanner, *database.Repositories, func()) { + t.Helper() + + // Initialize logger + log.Initialize(log.Config{}) + + // Create test database + db, err := database.NewTestDB() + require.NoError(t, err) + + repos := database.NewRepositories(db) + + scanner := snapshot.NewScanner(snapshot.ScannerConfig{ + FS: fs, + ChunkSize: 64 * 1024, + Repositories: repos, + MaxBlobSize: 1024 * 1024, + 
CompressionLevel: 3, + AgeRecipients: []string{"age1ql3z7hjy54pw3hyww5ayyfg7zqgvc7w3j2elw8zmrj2kg5sfn9aqmcac8p"}, + Exclude: excludePatterns, + }) + + cleanup := func() { + _ = db.Close() + } + + return scanner, repos, cleanup +} + +func createSnapshotRecord(t *testing.T, ctx context.Context, repos *database.Repositories, snapshotID string) { + t.Helper() + err := repos.WithTx(ctx, func(ctx context.Context, tx *sql.Tx) error { + snap := &database.Snapshot{ + ID: snapshotID, + Hostname: "test-host", + VaultikVersion: "test", + StartedAt: time.Now(), + CompletedAt: nil, + FileCount: 0, + ChunkCount: 0, + BlobCount: 0, + TotalSize: 0, + BlobSize: 0, + CompressionRatio: 1.0, + } + return repos.Snapshots.Create(ctx, tx, snap) + }) + require.NoError(t, err) +} + +func TestExcludePatterns_ExcludeGitDirectory(t *testing.T) { + fs := setupExcludeTestFS(t) + scanner, repos, cleanup := createTestScanner(t, fs, []string{".git"}) + defer cleanup() + require.NotNil(t, scanner) + + ctx := context.Background() + createSnapshotRecord(t, ctx, repos, "test-snapshot") + + result, err := scanner.Scan(ctx, "/backup", "test-snapshot") + require.NoError(t, err) + + // Should have scanned files but NOT .git directory contents + // Expected: file1.txt, file2.log, src/main.go, src/test.go, node_modules/package/index.js, + // cache/temp.dat, build/output.bin, docs/readme.md, .DS_Store, thumbs.db, + // src/.hidden, important.log.bak + // Excluded: .git/config, .git/objects/pack/data.pack + require.Equal(t, 12, result.FilesScanned, "Should exclude .git directory contents") +} + +func TestExcludePatterns_ExcludeByExtension(t *testing.T) { + fs := setupExcludeTestFS(t) + scanner, repos, cleanup := createTestScanner(t, fs, []string{"*.log"}) + defer cleanup() + require.NotNil(t, scanner) + + ctx := context.Background() + createSnapshotRecord(t, ctx, repos, "test-snapshot") + + result, err := scanner.Scan(ctx, "/backup", "test-snapshot") + require.NoError(t, err) + + // Should exclude file2.log but NOT important.log.bak (different extension) + // Total files: 14, excluded: 1 (file2.log) + require.Equal(t, 13, result.FilesScanned, "Should exclude *.log files") +} + +func TestExcludePatterns_ExcludeNodeModules(t *testing.T) { + fs := setupExcludeTestFS(t) + scanner, repos, cleanup := createTestScanner(t, fs, []string{"node_modules"}) + defer cleanup() + require.NotNil(t, scanner) + + ctx := context.Background() + createSnapshotRecord(t, ctx, repos, "test-snapshot") + + result, err := scanner.Scan(ctx, "/backup", "test-snapshot") + require.NoError(t, err) + + // Should exclude node_modules/package/index.js + // Total files: 14, excluded: 1 + require.Equal(t, 13, result.FilesScanned, "Should exclude node_modules directory") +} + +func TestExcludePatterns_MultiplePatterns(t *testing.T) { + fs := setupExcludeTestFS(t) + scanner, repos, cleanup := createTestScanner(t, fs, []string{".git", "node_modules", "*.log", ".DS_Store", "thumbs.db", "cache", "build"}) + defer cleanup() + require.NotNil(t, scanner) + + ctx := context.Background() + createSnapshotRecord(t, ctx, repos, "test-snapshot") + + result, err := scanner.Scan(ctx, "/backup", "test-snapshot") + require.NoError(t, err) + + // Should only have: file1.txt, src/main.go, src/test.go, docs/readme.md, src/.hidden, important.log.bak + // Excluded: .git/*, node_modules/*, *.log (file2.log), .DS_Store, thumbs.db, cache/*, build/* + require.Equal(t, 6, result.FilesScanned, "Should exclude multiple patterns") +} + +func TestExcludePatterns_NoExclusions(t *testing.T) { + fs := 
setupExcludeTestFS(t) + scanner, repos, cleanup := createTestScanner(t, fs, []string{}) + defer cleanup() + require.NotNil(t, scanner) + + ctx := context.Background() + createSnapshotRecord(t, ctx, repos, "test-snapshot") + + result, err := scanner.Scan(ctx, "/backup", "test-snapshot") + require.NoError(t, err) + + // Should scan all 14 files + require.Equal(t, 14, result.FilesScanned, "Should scan all files when no exclusions") +} + +func TestExcludePatterns_ExcludeHiddenFiles(t *testing.T) { + fs := setupExcludeTestFS(t) + scanner, repos, cleanup := createTestScanner(t, fs, []string{".*"}) + defer cleanup() + require.NotNil(t, scanner) + + ctx := context.Background() + createSnapshotRecord(t, ctx, repos, "test-snapshot") + + result, err := scanner.Scan(ctx, "/backup", "test-snapshot") + require.NoError(t, err) + + // Should exclude: .git/*, .DS_Store, src/.hidden + // Total files: 14, excluded: 4 (.git/config, .git/objects/pack/data.pack, .DS_Store, src/.hidden) + require.Equal(t, 10, result.FilesScanned, "Should exclude hidden files and directories") +} + +func TestExcludePatterns_DoubleStarGlob(t *testing.T) { + fs := setupExcludeTestFS(t) + scanner, repos, cleanup := createTestScanner(t, fs, []string{"**/*.pack"}) + defer cleanup() + require.NotNil(t, scanner) + + ctx := context.Background() + createSnapshotRecord(t, ctx, repos, "test-snapshot") + + result, err := scanner.Scan(ctx, "/backup", "test-snapshot") + require.NoError(t, err) + + // Should exclude .git/objects/pack/data.pack + // Total files: 14, excluded: 1 + require.Equal(t, 13, result.FilesScanned, "Should exclude **/*.pack files") +} + +func TestExcludePatterns_ExactFileName(t *testing.T) { + fs := setupExcludeTestFS(t) + scanner, repos, cleanup := createTestScanner(t, fs, []string{"thumbs.db", ".DS_Store"}) + defer cleanup() + require.NotNil(t, scanner) + + ctx := context.Background() + createSnapshotRecord(t, ctx, repos, "test-snapshot") + + result, err := scanner.Scan(ctx, "/backup", "test-snapshot") + require.NoError(t, err) + + // Should exclude thumbs.db and .DS_Store + // Total files: 14, excluded: 2 + require.Equal(t, 12, result.FilesScanned, "Should exclude exact file names") +} + +func TestExcludePatterns_CaseSensitive(t *testing.T) { + // Pattern matching should be case-sensitive + fs := setupExcludeTestFS(t) + scanner, repos, cleanup := createTestScanner(t, fs, []string{"THUMBS.DB"}) + defer cleanup() + require.NotNil(t, scanner) + + ctx := context.Background() + createSnapshotRecord(t, ctx, repos, "test-snapshot") + + result, err := scanner.Scan(ctx, "/backup", "test-snapshot") + require.NoError(t, err) + + // Case-sensitive matching: THUMBS.DB should NOT match thumbs.db + // All 14 files should be scanned + require.Equal(t, 14, result.FilesScanned, "Pattern matching should be case-sensitive") +} + +func TestExcludePatterns_DirectoryWithTrailingSlash(t *testing.T) { + fs := setupExcludeTestFS(t) + // Some users might add trailing slashes to directory patterns + scanner, repos, cleanup := createTestScanner(t, fs, []string{"cache/", "build/"}) + defer cleanup() + require.NotNil(t, scanner) + + ctx := context.Background() + createSnapshotRecord(t, ctx, repos, "test-snapshot") + + result, err := scanner.Scan(ctx, "/backup", "test-snapshot") + require.NoError(t, err) + + // Should exclude cache/temp.dat and build/output.bin + // Total files: 14, excluded: 2 + require.Equal(t, 12, result.FilesScanned, "Should handle directory patterns with trailing slashes") +} + +func TestExcludePatterns_PatternInSubdirectory(t 
*testing.T) { + fs := setupExcludeTestFS(t) + // Exclude .hidden file specifically in src directory + scanner, repos, cleanup := createTestScanner(t, fs, []string{"src/.hidden"}) + defer cleanup() + require.NotNil(t, scanner) + + ctx := context.Background() + createSnapshotRecord(t, ctx, repos, "test-snapshot") + + result, err := scanner.Scan(ctx, "/backup", "test-snapshot") + require.NoError(t, err) + + // Should exclude only src/.hidden + // Total files: 14, excluded: 1 + require.Equal(t, 13, result.FilesScanned, "Should exclude specific subdirectory files") +} + +// setupAnchoredTestFS creates a filesystem for testing anchored patterns +// Source dir: /backup +// Structure: +// +// /backup/ +// projectname/ +// file.txt (should be excluded with /projectname) +// otherproject/ +// projectname/ +// file.txt (should NOT be excluded with /projectname, only with projectname) +// src/ +// file.go +func setupAnchoredTestFS(t *testing.T) afero.Fs { + t.Helper() + + fs := afero.NewMemMapFs() + + files := map[string]string{ + "/backup/projectname/file.txt": "root project file", + "/backup/otherproject/projectname/file.txt": "nested project file", + "/backup/src/file.go": "source file", + "/backup/file.txt": "root file", + } + + testTime := time.Date(2024, 1, 1, 12, 0, 0, 0, time.UTC) + for path, content := range files { + dir := filepath.Dir(path) + err := fs.MkdirAll(dir, 0755) + require.NoError(t, err) + err = afero.WriteFile(fs, path, []byte(content), 0644) + require.NoError(t, err) + err = fs.Chtimes(path, testTime, testTime) + require.NoError(t, err) + } + + return fs +} + +func TestExcludePatterns_AnchoredPattern(t *testing.T) { + // Pattern starting with / should only match from root of source dir + fs := setupAnchoredTestFS(t) + scanner, repos, cleanup := createTestScanner(t, fs, []string{"/projectname"}) + defer cleanup() + require.NotNil(t, scanner) + + ctx := context.Background() + createSnapshotRecord(t, ctx, repos, "test-snapshot") + + result, err := scanner.Scan(ctx, "/backup", "test-snapshot") + require.NoError(t, err) + + // /projectname should ONLY exclude /backup/projectname/file.txt (1 file) + // /backup/otherproject/projectname/file.txt should NOT be excluded + // Total files: 4, excluded: 1 + require.Equal(t, 3, result.FilesScanned, "Anchored pattern /projectname should only match at root of source dir") +} + +func TestExcludePatterns_UnanchoredPattern(t *testing.T) { + // Pattern without leading / should match anywhere in path + fs := setupAnchoredTestFS(t) + scanner, repos, cleanup := createTestScanner(t, fs, []string{"projectname"}) + defer cleanup() + require.NotNil(t, scanner) + + ctx := context.Background() + createSnapshotRecord(t, ctx, repos, "test-snapshot") + + result, err := scanner.Scan(ctx, "/backup", "test-snapshot") + require.NoError(t, err) + + // projectname (without /) should exclude BOTH: + // - /backup/projectname/file.txt + // - /backup/otherproject/projectname/file.txt + // Total files: 4, excluded: 2 + require.Equal(t, 2, result.FilesScanned, "Unanchored pattern should match anywhere in path") +} + +func TestExcludePatterns_AnchoredPatternWithGlob(t *testing.T) { + // Anchored pattern with glob + fs := setupAnchoredTestFS(t) + scanner, repos, cleanup := createTestScanner(t, fs, []string{"/src/*.go"}) + defer cleanup() + require.NotNil(t, scanner) + + ctx := context.Background() + createSnapshotRecord(t, ctx, repos, "test-snapshot") + + result, err := scanner.Scan(ctx, "/backup", "test-snapshot") + require.NoError(t, err) + + // /src/*.go should exclude 
/backup/src/file.go + // Total files: 4, excluded: 1 + require.Equal(t, 3, result.FilesScanned, "Anchored pattern with glob should work") +} + +func TestExcludePatterns_AnchoredPatternFile(t *testing.T) { + // Anchored pattern for exact file at root + fs := setupAnchoredTestFS(t) + scanner, repos, cleanup := createTestScanner(t, fs, []string{"/file.txt"}) + defer cleanup() + require.NotNil(t, scanner) + + ctx := context.Background() + createSnapshotRecord(t, ctx, repos, "test-snapshot") + + result, err := scanner.Scan(ctx, "/backup", "test-snapshot") + require.NoError(t, err) + + // /file.txt should ONLY exclude /backup/file.txt + // NOT /backup/projectname/file.txt or /backup/otherproject/projectname/file.txt + // Total files: 4, excluded: 1 + require.Equal(t, 3, result.FilesScanned, "Anchored pattern for file should only match at root") +} + +func TestExcludePatterns_UnanchoredPatternFile(t *testing.T) { + // Unanchored pattern for file should match anywhere + fs := setupAnchoredTestFS(t) + scanner, repos, cleanup := createTestScanner(t, fs, []string{"file.txt"}) + defer cleanup() + require.NotNil(t, scanner) + + ctx := context.Background() + createSnapshotRecord(t, ctx, repos, "test-snapshot") + + result, err := scanner.Scan(ctx, "/backup", "test-snapshot") + require.NoError(t, err) + + // file.txt should exclude ALL file.txt files: + // - /backup/file.txt + // - /backup/projectname/file.txt + // - /backup/otherproject/projectname/file.txt + // Total files: 4, excluded: 3 + require.Equal(t, 1, result.FilesScanned, "Unanchored pattern for file should match anywhere") +} diff --git a/internal/snapshot/module.go b/internal/snapshot/module.go index 9beea61..cfa3c57 100644 --- a/internal/snapshot/module.go +++ b/internal/snapshot/module.go @@ -38,6 +38,7 @@ func provideScannerFactory(cfg *config.Config, repos *database.Repositories, sto CompressionLevel: cfg.CompressionLevel, AgeRecipients: cfg.AgeRecipients, EnableProgress: params.EnableProgress, + Exclude: cfg.Exclude, }) } } diff --git a/internal/snapshot/scanner.go b/internal/snapshot/scanner.go index 242abf1..deed74e 100644 --- a/internal/snapshot/scanner.go +++ b/internal/snapshot/scanner.go @@ -3,8 +3,10 @@ package snapshot import ( "context" "database/sql" + "errors" "fmt" "os" + "path/filepath" "strings" "sync" "time" @@ -15,6 +17,7 @@ import ( "git.eeqj.de/sneak/vaultik/internal/log" "git.eeqj.de/sneak/vaultik/internal/storage" "github.com/dustin/go-humanize" + "github.com/gobwas/glob" "github.com/google/uuid" "github.com/spf13/afero" ) @@ -33,6 +36,13 @@ type pendingFileData struct { chunkFiles []database.ChunkFile } +// compiledPattern holds a compiled glob pattern and whether it's anchored +type compiledPattern struct { + pattern glob.Glob + anchored bool // If true, only matches from root of source dir + original string +} + // Scanner scans directories and populates the database with file and chunk information type Scanner struct { fs afero.Fs @@ -43,7 +53,9 @@ type Scanner struct { maxBlobSize int64 compressionLevel int ageRecipient string - snapshotID string // Current snapshot being processed + snapshotID string // Current snapshot being processed + exclude []string // Glob patterns for files/directories to exclude + compiledExclude []compiledPattern // Compiled glob patterns progress *ProgressReporter // In-memory cache of known chunk hashes for fast existence checks @@ -77,6 +89,7 @@ type ScannerConfig struct { CompressionLevel int AgeRecipients []string // Optional, empty means no encryption EnableProgress bool // Enable 
progress reporting + Exclude []string // Glob patterns for files/directories to exclude } // ScanResult contains the results of a scan operation @@ -120,6 +133,9 @@ func NewScanner(cfg ScannerConfig) *Scanner { progress = NewProgressReporter() } + // Compile exclude patterns + compiledExclude := compileExcludePatterns(cfg.Exclude) + return &Scanner{ fs: cfg.FS, chunker: chunker.NewChunker(cfg.ChunkSize), @@ -129,6 +145,8 @@ func NewScanner(cfg ScannerConfig) *Scanner { maxBlobSize: cfg.MaxBlobSize, compressionLevel: cfg.CompressionLevel, ageRecipient: strings.Join(cfg.AgeRecipients, ","), + exclude: cfg.Exclude, + compiledExclude: compiledExclude, progress: progress, pendingChunkHashes: make(map[string]struct{}), } @@ -314,11 +332,14 @@ func (s *Scanner) addPendingChunkHash(hash string) { // removePendingChunkHashes removes committed chunk hashes from the pending set func (s *Scanner) removePendingChunkHashes(hashes []string) { + log.Debug("removePendingChunkHashes: starting", "count", len(hashes)) + start := time.Now() s.pendingChunkHashesMu.Lock() for _, hash := range hashes { delete(s.pendingChunkHashes, hash) } s.pendingChunkHashesMu.Unlock() + log.Debug("removePendingChunkHashes: done", "count", len(hashes), "duration", time.Since(start)) } // isChunkPending returns true if the chunk is still pending (not yet committed to DB) @@ -395,12 +416,19 @@ func (s *Scanner) flushAllPending(ctx context.Context) error { // flushCompletedPendingFiles flushes only files whose chunks are all committed to DB // Files with pending chunks are kept in the queue for later flushing func (s *Scanner) flushCompletedPendingFiles(ctx context.Context) error { + flushStart := time.Now() + log.Debug("flushCompletedPendingFiles: starting") + + log.Debug("flushCompletedPendingFiles: acquiring pendingFilesMu lock") s.pendingFilesMu.Lock() + log.Debug("flushCompletedPendingFiles: acquired lock", "pending_files", len(s.pendingFiles)) // Separate files into complete (can flush) and incomplete (keep pending) var canFlush []pendingFileData var stillPending []pendingFileData + log.Debug("flushCompletedPendingFiles: checking which files can flush") + checkStart := time.Now() for _, data := range s.pendingFiles { allChunksCommitted := true for _, fc := range data.fileChunks { @@ -415,11 +443,14 @@ func (s *Scanner) flushCompletedPendingFiles(ctx context.Context) error { stillPending = append(stillPending, data) } } + log.Debug("flushCompletedPendingFiles: check done", "duration", time.Since(checkStart), "can_flush", len(canFlush), "still_pending", len(stillPending)) s.pendingFiles = stillPending s.pendingFilesMu.Unlock() + log.Debug("flushCompletedPendingFiles: released lock") if len(canFlush) == 0 { + log.Debug("flushCompletedPendingFiles: nothing to flush") return nil } @@ -427,43 +458,85 @@ func (s *Scanner) flushCompletedPendingFiles(ctx context.Context) error { "files_to_flush", len(canFlush), "files_still_pending", len(stillPending)) - // Flush the complete files - return s.repos.WithTx(ctx, func(txCtx context.Context, tx *sql.Tx) error { - for _, data := range canFlush { - // Create or update the file record - if err := s.repos.Files.Create(txCtx, tx, data.file); err != nil { - return fmt.Errorf("creating file record: %w", err) - } + // Collect all data for batch operations + log.Debug("flushCompletedPendingFiles: collecting data for batch ops") + collectStart := time.Now() + var allFileChunks []database.FileChunk + var allChunkFiles []database.ChunkFile + var allFileIDs []string + var allFiles []*database.File - 
// Delete any existing file_chunks and chunk_files for this file - if err := s.repos.FileChunks.DeleteByFileID(txCtx, tx, data.file.ID); err != nil { - return fmt.Errorf("deleting old file chunks: %w", err) - } - if err := s.repos.ChunkFiles.DeleteByFileID(txCtx, tx, data.file.ID); err != nil { - return fmt.Errorf("deleting old chunk files: %w", err) - } + for _, data := range canFlush { + allFileChunks = append(allFileChunks, data.fileChunks...) + allChunkFiles = append(allChunkFiles, data.chunkFiles...) + allFileIDs = append(allFileIDs, data.file.ID) + allFiles = append(allFiles, data.file) + } + log.Debug("flushCompletedPendingFiles: collected data", + "duration", time.Since(collectStart), + "file_chunks", len(allFileChunks), + "chunk_files", len(allChunkFiles), + "files", len(allFiles)) - // Create file-chunk mappings - for i := range data.fileChunks { - if err := s.repos.FileChunks.Create(txCtx, tx, &data.fileChunks[i]); err != nil { - return fmt.Errorf("creating file chunk: %w", err) - } - } + // Flush the complete files using batch operations + log.Debug("flushCompletedPendingFiles: starting transaction") + txStart := time.Now() + err := s.repos.WithTx(ctx, func(txCtx context.Context, tx *sql.Tx) error { + log.Debug("flushCompletedPendingFiles: inside transaction") - // Create chunk-file mappings - for i := range data.chunkFiles { - if err := s.repos.ChunkFiles.Create(txCtx, tx, &data.chunkFiles[i]); err != nil { - return fmt.Errorf("creating chunk file: %w", err) - } - } - - // Add file to snapshot - if err := s.repos.Snapshots.AddFileByID(txCtx, tx, s.snapshotID, data.file.ID); err != nil { - return fmt.Errorf("adding file to snapshot: %w", err) - } + // Batch delete old file_chunks and chunk_files + log.Debug("flushCompletedPendingFiles: deleting old file_chunks") + opStart := time.Now() + if err := s.repos.FileChunks.DeleteByFileIDs(txCtx, tx, allFileIDs); err != nil { + return fmt.Errorf("batch deleting old file chunks: %w", err) } + log.Debug("flushCompletedPendingFiles: deleted file_chunks", "duration", time.Since(opStart)) + + log.Debug("flushCompletedPendingFiles: deleting old chunk_files") + opStart = time.Now() + if err := s.repos.ChunkFiles.DeleteByFileIDs(txCtx, tx, allFileIDs); err != nil { + return fmt.Errorf("batch deleting old chunk files: %w", err) + } + log.Debug("flushCompletedPendingFiles: deleted chunk_files", "duration", time.Since(opStart)) + + // Batch create/update file records + log.Debug("flushCompletedPendingFiles: creating files") + opStart = time.Now() + if err := s.repos.Files.CreateBatch(txCtx, tx, allFiles); err != nil { + return fmt.Errorf("batch creating file records: %w", err) + } + log.Debug("flushCompletedPendingFiles: created files", "duration", time.Since(opStart)) + + // Batch insert file_chunks + log.Debug("flushCompletedPendingFiles: inserting file_chunks") + opStart = time.Now() + if err := s.repos.FileChunks.CreateBatch(txCtx, tx, allFileChunks); err != nil { + return fmt.Errorf("batch creating file chunks: %w", err) + } + log.Debug("flushCompletedPendingFiles: inserted file_chunks", "duration", time.Since(opStart)) + + // Batch insert chunk_files + log.Debug("flushCompletedPendingFiles: inserting chunk_files") + opStart = time.Now() + if err := s.repos.ChunkFiles.CreateBatch(txCtx, tx, allChunkFiles); err != nil { + return fmt.Errorf("batch creating chunk files: %w", err) + } + log.Debug("flushCompletedPendingFiles: inserted chunk_files", "duration", time.Since(opStart)) + + // Batch add files to snapshot + 
log.Debug("flushCompletedPendingFiles: adding files to snapshot") + opStart = time.Now() + if err := s.repos.Snapshots.AddFilesByIDBatch(txCtx, tx, s.snapshotID, allFileIDs); err != nil { + return fmt.Errorf("batch adding files to snapshot: %w", err) + } + log.Debug("flushCompletedPendingFiles: added files to snapshot", "duration", time.Since(opStart)) + + log.Debug("flushCompletedPendingFiles: transaction complete") return nil }) + log.Debug("flushCompletedPendingFiles: transaction done", "duration", time.Since(txStart)) + log.Debug("flushCompletedPendingFiles: total duration", "duration", time.Since(flushStart)) + return err } // ScanPhaseResult contains the results of the scan phase @@ -504,6 +577,14 @@ func (s *Scanner) scanPhase(ctx context.Context, path string, result *ScanResult default: } + // Check exclude patterns - for directories, skip the entire subtree + if s.shouldExclude(filePath, path) { + if info.IsDir() { + return filepath.SkipDir + } + return nil + } + // Skip non-regular files for processing (but still count them) if !info.Mode().IsRegular() { return nil @@ -730,6 +811,12 @@ func (s *Scanner) processPhase(ctx context.Context, filesToProcess []*FileToProc // Process file in streaming fashion if err := s.processFileStreaming(ctx, fileToProcess, result); err != nil { + // Handle files that were deleted between scan and process phases + if errors.Is(err, os.ErrNotExist) { + log.Warn("File was deleted during backup, skipping", "path", fileToProcess.Path) + result.FilesSkipped++ + continue + } return fmt.Errorf("processing file %s: %w", fileToProcess.Path, err) } @@ -939,14 +1026,19 @@ func (s *Scanner) handleBlobReady(blobWithReader *blob.BlobWithReader) error { } // Chunks from this blob are now committed to DB - remove from pending set + log.Debug("handleBlobReady: removing pending chunk hashes") s.removePendingChunkHashes(blobWithReader.InsertedChunkHashes) + log.Debug("handleBlobReady: removed pending chunk hashes") // Flush files whose chunks are now all committed // This maintains database consistency after each blob + log.Debug("handleBlobReady: calling flushCompletedPendingFiles") if err := s.flushCompletedPendingFiles(dbCtx); err != nil { return fmt.Errorf("flushing completed files: %w", err) } + log.Debug("handleBlobReady: flushCompletedPendingFiles returned") + log.Debug("handleBlobReady: complete") return nil } @@ -1135,6 +1227,105 @@ func (s *Scanner) detectDeletedFilesFromMap(ctx context.Context, knownFiles map[ return nil } +// compileExcludePatterns compiles the exclude patterns into glob matchers +func compileExcludePatterns(patterns []string) []compiledPattern { + var compiled []compiledPattern + for _, p := range patterns { + if p == "" { + continue + } + + // Check if pattern is anchored (starts with /) + anchored := strings.HasPrefix(p, "/") + pattern := p + if anchored { + pattern = p[1:] // Remove leading / + } + + // Remove trailing slash if present (directory indicator) + pattern = strings.TrimSuffix(pattern, "/") + + // Compile the glob pattern + // For patterns without path separators, we need to match them as components + // e.g., ".git" should match ".git" anywhere in the path + g, err := glob.Compile(pattern, '/') + if err != nil { + log.Warn("Invalid exclude pattern, skipping", "pattern", p, "error", err) + continue + } + + compiled = append(compiled, compiledPattern{ + pattern: g, + anchored: anchored, + original: p, + }) + } + return compiled +} + +// shouldExclude checks if a path should be excluded based on exclude patterns +// filePath is 
the full path to the file +// rootPath is the root of the backup source directory +func (s *Scanner) shouldExclude(filePath, rootPath string) bool { + if len(s.compiledExclude) == 0 { + return false + } + + // Get the relative path from root + relPath, err := filepath.Rel(rootPath, filePath) + if err != nil { + return false + } + + // Never exclude the root directory itself + if relPath == "." { + return false + } + + // Normalize path separators + relPath = filepath.ToSlash(relPath) + + // Check each pattern + for _, cp := range s.compiledExclude { + if cp.anchored { + // Anchored pattern: must match from the root + // Match the relative path directly + if cp.pattern.Match(relPath) { + return true + } + // Also check if any prefix of the path matches (for directory patterns) + parts := strings.Split(relPath, "/") + for i := 1; i <= len(parts); i++ { + prefix := strings.Join(parts[:i], "/") + if cp.pattern.Match(prefix) { + return true + } + } + } else { + // Unanchored pattern: can match anywhere in path + // Check the full relative path + if cp.pattern.Match(relPath) { + return true + } + // Check each path component and subpath + parts := strings.Split(relPath, "/") + for i := range parts { + // Match individual component (e.g., ".git" matches ".git" directory) + if cp.pattern.Match(parts[i]) { + return true + } + // Match subpath from this component onwards + subpath := strings.Join(parts[i:], "/") + if cp.pattern.Match(subpath) { + return true + } + } + } + } + + return false +} + // formatNumber formats a number with comma separators func formatNumber(n int) string { if n < 1000 { diff --git a/internal/snapshot/snapshot.go b/internal/snapshot/snapshot.go index 9e6052a..ec97ea2 100644 --- a/internal/snapshot/snapshot.go +++ b/internal/snapshot/snapshot.go @@ -46,6 +46,7 @@ import ( "io" "os/exec" "path/filepath" + "strings" "time" "git.eeqj.de/sneak/vaultik/internal/blobgen" @@ -91,7 +92,12 @@ func (sm *SnapshotManager) SetFilesystem(fs afero.Fs) { // CreateSnapshot creates a new snapshot record in the database at the start of a backup func (sm *SnapshotManager) CreateSnapshot(ctx context.Context, hostname, version, gitRevision string) (string, error) { - snapshotID := fmt.Sprintf("%s-%s", hostname, time.Now().UTC().Format("20060102-150405Z")) + // Use short hostname (strip domain if present) + shortHostname := hostname + if idx := strings.Index(hostname, "."); idx != -1 { + shortHostname = hostname[:idx] + } + snapshotID := fmt.Sprintf("%s_%s", shortHostname, time.Now().UTC().Format("2006-01-02T15:04:05Z")) snapshot := &database.Snapshot{ ID: snapshotID, @@ -688,15 +694,16 @@ func (sm *SnapshotManager) deleteSnapshot(ctx context.Context, snapshotID string // Clean up orphaned data log.Debug("Cleaning up orphaned records in main database") - if err := sm.cleanupOrphanedData(ctx); err != nil { + if err := sm.CleanupOrphanedData(ctx); err != nil { return fmt.Errorf("cleaning up orphaned data: %w", err) } return nil } -// cleanupOrphanedData removes files, chunks, and blobs that are no longer referenced by any snapshot -func (sm *SnapshotManager) cleanupOrphanedData(ctx context.Context) error { +// CleanupOrphanedData removes files, chunks, and blobs that are no longer referenced by any snapshot. +// This should be called periodically to clean up data from deleted or incomplete snapshots. +func (sm *SnapshotManager) CleanupOrphanedData(ctx context.Context) error { // Order is important to respect foreign key constraints: // 1. Delete orphaned files (will cascade delete file_chunks) // 2. 
Delete orphaned blobs (will cascade delete blob_chunks for deleted blobs) diff --git a/internal/vaultik/snapshot.go b/internal/vaultik/snapshot.go index f6a865f..67dbd7e 100644 --- a/internal/vaultik/snapshot.go +++ b/internal/vaultik/snapshot.go @@ -43,8 +43,12 @@ func (v *Vaultik) CreateSnapshot(opts *SnapshotCreateOptions) error { // CRITICAL: This MUST succeed. If we fail to clean up incomplete snapshots, // the deduplication logic will think files from the incomplete snapshot were // already backed up and skip them, resulting in data loss. - if err := v.SnapshotManager.CleanupIncompleteSnapshots(v.ctx, hostname); err != nil { - return fmt.Errorf("cleanup incomplete snapshots: %w", err) + // + // Prune the database before starting: delete incomplete snapshots and orphaned data. + // This ensures the database is consistent before we start a new snapshot. + // Since we use locking, only one vaultik instance accesses the DB at a time. + if _, err := v.PruneDatabase(); err != nil { + return fmt.Errorf("prune database: %w", err) } if opts.Daemon { @@ -633,21 +637,23 @@ func (v *Vaultik) deleteSnapshot(snapshotID string) error { } } - // Then, delete from local database - // Delete related records first to avoid foreign key constraints - if err := v.Repositories.Snapshots.DeleteSnapshotFiles(v.ctx, snapshotID); err != nil { - log.Error("Failed to delete snapshot files", "snapshot_id", snapshotID, "error", err) - } - if err := v.Repositories.Snapshots.DeleteSnapshotBlobs(v.ctx, snapshotID); err != nil { - log.Error("Failed to delete snapshot blobs", "snapshot_id", snapshotID, "error", err) - } - if err := v.Repositories.Snapshots.DeleteSnapshotUploads(v.ctx, snapshotID); err != nil { - log.Error("Failed to delete snapshot uploads", "snapshot_id", snapshotID, "error", err) - } + // Then, delete from local database (if we have a local database) + if v.Repositories != nil { + // Delete related records first to avoid foreign key constraints + if err := v.Repositories.Snapshots.DeleteSnapshotFiles(v.ctx, snapshotID); err != nil { + log.Error("Failed to delete snapshot files", "snapshot_id", snapshotID, "error", err) + } + if err := v.Repositories.Snapshots.DeleteSnapshotBlobs(v.ctx, snapshotID); err != nil { + log.Error("Failed to delete snapshot blobs", "snapshot_id", snapshotID, "error", err) + } + if err := v.Repositories.Snapshots.DeleteSnapshotUploads(v.ctx, snapshotID); err != nil { + log.Error("Failed to delete snapshot uploads", "snapshot_id", snapshotID, "error", err) + } - // Now delete the snapshot itself - if err := v.Repositories.Snapshots.Delete(v.ctx, snapshotID); err != nil { - return fmt.Errorf("deleting snapshot from database: %w", err) + // Now delete the snapshot itself + if err := v.Repositories.Snapshots.Delete(v.ctx, snapshotID); err != nil { + return fmt.Errorf("deleting snapshot from database: %w", err) + } } return nil @@ -699,3 +705,277 @@ func (v *Vaultik) syncWithRemote() error { return nil } + +// RemoveOptions contains options for the snapshot remove command +type RemoveOptions struct { + Force bool + DryRun bool +} + +// RemoveResult contains the result of a snapshot removal +type RemoveResult struct { + SnapshotID string + BlobsDeleted int + BytesFreed int64 + BlobsFailed int +} + +// RemoveSnapshot removes a snapshot and any blobs that become orphaned +func (v *Vaultik) RemoveSnapshot(snapshotID string, opts *RemoveOptions) (*RemoveResult, error) { + log.Info("Starting snapshot removal", "snapshot_id", snapshotID) + + result := &RemoveResult{ + SnapshotID: 
snapshotID,
+ }
+
+ // Step 1: List all snapshots in storage
+ log.Info("Listing remote snapshots")
+ objectCh := v.Storage.ListStream(v.ctx, "metadata/")
+
+ var allSnapshotIDs []string
+ targetExists := false
+ for object := range objectCh {
+ if object.Err != nil {
+ return nil, fmt.Errorf("listing remote snapshots: %w", object.Err)
+ }
+
+ // Extract snapshot ID from paths like metadata/hostname_2024-01-15T14:30:52Z/
+ parts := strings.Split(object.Key, "/")
+ if len(parts) >= 2 && parts[0] == "metadata" && parts[1] != "" {
+ if strings.HasSuffix(object.Key, "/") || strings.Contains(object.Key, "/manifest.json.zst") {
+ sid := parts[1]
+ // Only add unique snapshot IDs
+ found := false
+ for _, id := range allSnapshotIDs {
+ if id == sid {
+ found = true
+ break
+ }
+ }
+ if !found {
+ allSnapshotIDs = append(allSnapshotIDs, sid)
+ if sid == snapshotID {
+ targetExists = true
+ }
+ }
+ }
+ }
+ }
+
+ if !targetExists {
+ return nil, fmt.Errorf("snapshot not found: %s", snapshotID)
+ }
+
+ log.Info("Found snapshots", "total", len(allSnapshotIDs))
+
+ // Step 2: Download target snapshot's manifest
+ log.Info("Downloading target manifest", "snapshot_id", snapshotID)
+ targetManifest, err := v.downloadManifest(snapshotID)
+ if err != nil {
+ return nil, fmt.Errorf("downloading target manifest: %w", err)
+ }
+
+ // Build set of target blob hashes with sizes
+ targetBlobs := make(map[string]int64) // hash -> size
+ for _, blob := range targetManifest.Blobs {
+ targetBlobs[blob.Hash] = blob.CompressedSize
+ }
+ log.Info("Target snapshot has blobs", "count", len(targetBlobs))
+
+ // Step 3: Download manifests from all OTHER snapshots to build "in-use" set
+ inUseBlobs := make(map[string]bool)
+ otherCount := 0
+
+ for _, sid := range allSnapshotIDs {
+ if sid == snapshotID {
+ continue // Skip target snapshot
+ }
+
+ log.Debug("Processing manifest", "snapshot_id", sid)
+ manifest, err := v.downloadManifest(sid)
+ if err != nil {
+ log.Error("Failed to download manifest", "snapshot_id", sid, "error", err)
+ continue
+ }
+
+ for _, blob := range manifest.Blobs {
+ inUseBlobs[blob.Hash] = true
+ }
+ otherCount++
+ }
+
+ log.Info("Processed other manifests", "count", otherCount, "in_use_blobs", len(inUseBlobs))
+
+ // Step 4: Find orphaned blobs (in target but not in use by others)
+ var orphanedBlobs []string
+ var totalSize int64
+ for hash, size := range targetBlobs {
+ if !inUseBlobs[hash] {
+ orphanedBlobs = append(orphanedBlobs, hash)
+ totalSize += size
+ }
+ }
+
+ log.Info("Found orphaned blobs",
+ "count", len(orphanedBlobs),
+ "total_size", humanize.Bytes(uint64(totalSize)),
+ )
+
+ // Show summary
+ _, _ = fmt.Fprintf(v.Stdout, "\nSnapshot: %s\n", snapshotID)
+ _, _ = fmt.Fprintf(v.Stdout, "Blobs in snapshot: %d\n", len(targetBlobs))
+ _, _ = fmt.Fprintf(v.Stdout, "Orphaned blobs to delete: %d (%s)\n", len(orphanedBlobs), humanize.Bytes(uint64(totalSize)))
+
+ if opts.DryRun {
+ _, _ = fmt.Fprintln(v.Stdout, "\n[Dry run - no changes made]")
+ return result, nil
+ }
+
+ // Confirm unless --force is used
+ if !opts.Force {
+ _, _ = fmt.Fprintf(v.Stdout, "\nDelete snapshot and %d orphaned blob(s)? [y/N] ", len(orphanedBlobs))
+ var confirm string
+ if _, err := fmt.Fscanln(v.Stdin, &confirm); err != nil {
+ _, _ = fmt.Fprintln(v.Stdout, "Cancelled")
+ return result, nil
+ }
+ if strings.ToLower(confirm) != "y" {
+ _, _ = fmt.Fprintln(v.Stdout, "Cancelled")
+ return result, nil
+ }
+ }
+
+ // Step 5: Delete orphaned blobs
+ if len(orphanedBlobs) > 0 {
+ log.Info("Deleting orphaned blobs")
+ for i, hash := range orphanedBlobs {
+ blobPath := fmt.Sprintf("blobs/%s/%s/%s", hash[:2], hash[2:4], hash)
+
+ if err := v.Storage.Delete(v.ctx, blobPath); err != nil {
+ log.Error("Failed to delete blob", "hash", hash, "error", err)
+ result.BlobsFailed++
+ continue
+ }
+
+ result.BlobsDeleted++
+ result.BytesFreed += targetBlobs[hash]
+
+ // Progress update every 100 blobs
+ if (i+1)%100 == 0 || i == len(orphanedBlobs)-1 {
+ log.Info("Deletion progress",
+ "deleted", i+1,
+ "total", len(orphanedBlobs),
+ "percent", fmt.Sprintf("%.1f%%", float64(i+1)/float64(len(orphanedBlobs))*100),
+ )
+ }
+ }
+ }
+
+ // Step 6: Delete snapshot metadata
+ log.Info("Deleting snapshot metadata")
+ if err := v.deleteSnapshot(snapshotID); err != nil {
+ return result, fmt.Errorf("deleting snapshot metadata: %w", err)
+ }
+
+ // Print summary
+ _, _ = fmt.Fprintf(v.Stdout, "\nRemoved snapshot %s\n", snapshotID)
+ _, _ = fmt.Fprintf(v.Stdout, " Blobs deleted: %d\n", result.BlobsDeleted)
+ _, _ = fmt.Fprintf(v.Stdout, " Storage freed: %s\n", humanize.Bytes(uint64(result.BytesFreed)))
+ if result.BlobsFailed > 0 {
+ _, _ = fmt.Fprintf(v.Stdout, " Blobs failed: %d\n", result.BlobsFailed)
+ }
+
+ return result, nil
+}
+
+// PruneResult contains statistics about the prune operation
+type PruneResult struct {
+ SnapshotsDeleted int64
+ FilesDeleted int64
+ ChunksDeleted int64
+ BlobsDeleted int64
+}
+
+// PruneDatabase removes incomplete snapshots and orphaned files, chunks,
+// and blobs from the local database. This ensures database consistency
+// either before starting a new backup, or on demand via the prune command.
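+//
+// A minimal caller sketch (illustrative only, not part of this change;
+// assumes an initialized *Vaultik with an open local database):
+//
+//	res, err := v.PruneDatabase()
+//	if err != nil {
+//		return fmt.Errorf("prune database: %w", err)
+//	}
+//	log.Info("prune finished",
+//		"snapshots", res.SnapshotsDeleted,
+//		"files", res.FilesDeleted,
+//		"chunks", res.ChunksDeleted,
+//		"blobs", res.BlobsDeleted)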
+func (v *Vaultik) PruneDatabase() (*PruneResult, error) { + log.Info("Pruning database: removing incomplete snapshots and orphaned data") + + result := &PruneResult{} + + // First, delete any incomplete snapshots + incompleteSnapshots, err := v.Repositories.Snapshots.GetIncompleteSnapshots(v.ctx) + if err != nil { + return nil, fmt.Errorf("getting incomplete snapshots: %w", err) + } + + for _, snapshot := range incompleteSnapshots { + log.Info("Deleting incomplete snapshot", "snapshot_id", snapshot.ID) + // Delete related records first + if err := v.Repositories.Snapshots.DeleteSnapshotFiles(v.ctx, snapshot.ID); err != nil { + log.Error("Failed to delete snapshot files", "snapshot_id", snapshot.ID, "error", err) + } + if err := v.Repositories.Snapshots.DeleteSnapshotBlobs(v.ctx, snapshot.ID); err != nil { + log.Error("Failed to delete snapshot blobs", "snapshot_id", snapshot.ID, "error", err) + } + if err := v.Repositories.Snapshots.DeleteSnapshotUploads(v.ctx, snapshot.ID); err != nil { + log.Error("Failed to delete snapshot uploads", "snapshot_id", snapshot.ID, "error", err) + } + if err := v.Repositories.Snapshots.Delete(v.ctx, snapshot.ID); err != nil { + log.Error("Failed to delete snapshot", "snapshot_id", snapshot.ID, "error", err) + } else { + result.SnapshotsDeleted++ + } + } + + // Get counts before cleanup for reporting + fileCountBefore, _ := v.getTableCount("files") + chunkCountBefore, _ := v.getTableCount("chunks") + blobCountBefore, _ := v.getTableCount("blobs") + + // Run the cleanup + if err := v.SnapshotManager.CleanupOrphanedData(v.ctx); err != nil { + return nil, fmt.Errorf("cleanup orphaned data: %w", err) + } + + // Get counts after cleanup + fileCountAfter, _ := v.getTableCount("files") + chunkCountAfter, _ := v.getTableCount("chunks") + blobCountAfter, _ := v.getTableCount("blobs") + + result.FilesDeleted = fileCountBefore - fileCountAfter + result.ChunksDeleted = chunkCountBefore - chunkCountAfter + result.BlobsDeleted = blobCountBefore - blobCountAfter + + log.Info("Prune complete", + "incomplete_snapshots", result.SnapshotsDeleted, + "orphaned_files", result.FilesDeleted, + "orphaned_chunks", result.ChunksDeleted, + "orphaned_blobs", result.BlobsDeleted, + ) + + // Print summary + _, _ = fmt.Fprintf(v.Stdout, "Prune complete:\n") + _, _ = fmt.Fprintf(v.Stdout, " Incomplete snapshots removed: %d\n", result.SnapshotsDeleted) + _, _ = fmt.Fprintf(v.Stdout, " Orphaned files removed: %d\n", result.FilesDeleted) + _, _ = fmt.Fprintf(v.Stdout, " Orphaned chunks removed: %d\n", result.ChunksDeleted) + _, _ = fmt.Fprintf(v.Stdout, " Orphaned blobs removed: %d\n", result.BlobsDeleted) + + return result, nil +} + +// getTableCount returns the count of rows in a table +func (v *Vaultik) getTableCount(tableName string) (int64, error) { + if v.DB == nil { + return 0, nil + } + + var count int64 + query := fmt.Sprintf("SELECT COUNT(*) FROM %s", tableName) + err := v.DB.Conn().QueryRowContext(v.ctx, query).Scan(&count) + if err != nil { + return 0, err + } + return count, nil +} diff --git a/internal/vaultik/vaultik.go b/internal/vaultik/vaultik.go index 1fbcb4d..27cd5e2 100644 --- a/internal/vaultik/vaultik.go +++ b/internal/vaultik/vaultik.go @@ -1,6 +1,7 @@ package vaultik import ( + "bytes" "context" "fmt" "io" @@ -122,3 +123,34 @@ func (v *Vaultik) GetDecryptor() (*crypto.Decryptor, error) { func (v *Vaultik) GetFilesystem() afero.Fs { return v.Fs } + +// TestVaultik wraps a Vaultik with captured stdout/stderr for testing +type TestVaultik struct { + *Vaultik + Stdout 
*bytes.Buffer + Stderr *bytes.Buffer + Stdin *bytes.Buffer +} + +// NewForTesting creates a minimal Vaultik instance for testing purposes. +// Only the Storage field is populated; other fields are nil. +// Returns a TestVaultik that captures stdout/stderr in buffers. +func NewForTesting(storage storage.Storer) *TestVaultik { + ctx, cancel := context.WithCancel(context.Background()) + stdout := &bytes.Buffer{} + stderr := &bytes.Buffer{} + stdin := &bytes.Buffer{} + return &TestVaultik{ + Vaultik: &Vaultik{ + Storage: storage, + ctx: ctx, + cancel: cancel, + Stdout: stdout, + Stderr: stderr, + Stdin: stdin, + }, + Stdout: stdout, + Stderr: stderr, + Stdin: stdin, + } +}
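
A minimal usage sketch for the new `NewForTesting` constructor follows. It is illustrative only: `newFakeStorer` is a hypothetical in-memory `storage.Storer` test double that this change does not provide, assumed here to hold a manifest under `metadata/` for the named snapshot.

```go
package vaultik_test

import (
	"testing"

	"git.eeqj.de/sneak/vaultik/internal/vaultik"
)

func TestRemoveSnapshotDryRun(t *testing.T) {
	// newFakeStorer is a hypothetical in-memory storage.Storer test double,
	// assumed to be seeded with a manifest for the snapshot named below.
	tv := vaultik.NewForTesting(newFakeStorer())

	// A dry-run removal writes its summary to the captured Stdout buffer
	// and deletes nothing from storage.
	res, err := tv.RemoveSnapshot("myhost_2024-01-15T14:30:52Z",
		&vaultik.RemoveOptions{DryRun: true})
	if err != nil {
		t.Fatalf("remove: %v", err)
	}
	if res.BlobsDeleted != 0 {
		t.Errorf("dry run deleted %d blobs, want 0", res.BlobsDeleted)
	}
	t.Log(tv.Stdout.String()) // inspect the human-readable summary
}
```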