Major refactoring: UUID-based storage, streaming architecture, and CLI improvements
This commit represents a significant architectural overhaul of vaultik:

Database Schema Changes:
- Switch files table to use UUID primary keys instead of path-based keys
- Add UUID primary keys to blobs table for immediate chunk association
- Update all foreign key relationships to use UUIDs
- Add comprehensive schema documentation in DATAMODEL.md
- Add SQLite busy timeout handling for concurrent operations

Streaming and Performance Improvements:
- Implement true streaming blob packing without intermediate storage
- Add streaming chunk processing to reduce memory usage
- Improve progress reporting with real-time metrics
- Add upload metrics tracking in new uploads table

CLI Refactoring:
- Restructure CLI to use subcommands: snapshot create/list/purge/verify
- Add store info command for S3 configuration display
- Add custom duration parser supporting days/weeks/months/years
- Remove old backup.go in favor of enhanced snapshot.go
- Add --cron flag for silent operation

Configuration Changes:
- Remove unused index_prefix configuration option
- Add support for snapshot pruning retention policies
- Improve configuration validation and error messages

Testing Improvements:
- Add comprehensive repository tests with edge cases
- Add cascade delete debugging tests
- Fix concurrent operation tests to use SQLite busy timeout
- Remove tolerance for SQLITE_BUSY errors in tests

Documentation:
- Add MIT LICENSE file
- Update README with new command structure
- Add comprehensive DATAMODEL.md explaining database schema
- Update DESIGN.md with UUID-based architecture

Other Changes:
- Add test-config.yml for testing
- Update Makefile with better test output formatting
- Fix various race conditions in concurrent operations
- Improve error handling throughout
This commit is contained in:
@@ -10,7 +10,9 @@ import (
|
||||
"github.com/jotfs/fastcdc-go"
|
||||
)
|
||||
|
||||
// Chunk represents a single chunk of data
|
||||
// Chunk represents a single chunk of data produced by the content-defined chunking algorithm.
|
||||
// Each chunk is identified by its SHA256 hash and contains the raw data along with
|
||||
// its position and size information from the original file.
|
||||
type Chunk struct {
|
||||
Hash string // Content hash of the chunk
|
||||
Data []byte // Chunk data
|
||||
@@ -18,14 +20,20 @@ type Chunk struct {
|
||||
Size int64 // Size of the chunk
|
||||
}
|
||||
|
||||
// Chunker provides content-defined chunking using FastCDC
|
||||
// Chunker provides content-defined chunking using the FastCDC algorithm.
|
||||
// It splits data into variable-sized chunks based on content patterns, ensuring
|
||||
// that identical data sequences produce identical chunks regardless of their
|
||||
// position in the file. This enables efficient deduplication.
|
||||
type Chunker struct {
|
||||
avgChunkSize int
|
||||
minChunkSize int
|
||||
maxChunkSize int
|
||||
}
|
||||
|
||||
// NewChunker creates a new chunker with the specified average chunk size
|
||||
// NewChunker creates a new chunker with the specified average chunk size.
|
||||
// The actual chunk sizes will vary between avgChunkSize/4 and avgChunkSize*4
|
||||
// as recommended by the FastCDC algorithm. Typical values for avgChunkSize
|
||||
// are 64KB (65536), 256KB (262144), or 1MB (1048576).
|
||||
func NewChunker(avgChunkSize int64) *Chunker {
|
||||
// FastCDC recommends min = avg/4 and max = avg*4
|
||||
return &Chunker{
|
||||
@@ -35,7 +43,10 @@ func NewChunker(avgChunkSize int64) *Chunker {
|
||||
}
|
||||
}
|
||||
|
||||
// ChunkReader splits the reader into content-defined chunks
|
||||
// ChunkReader splits the reader into content-defined chunks and returns all chunks at once.
|
||||
// This method loads all chunk data into memory, so it should only be used for
|
||||
// reasonably sized inputs. For large files or streams, use ChunkReaderStreaming instead.
|
||||
// Returns an error if chunking fails or if reading from the input fails.
|
||||
func (c *Chunker) ChunkReader(r io.Reader) ([]Chunk, error) {
|
||||
opts := fastcdc.Options{
|
||||
MinSize: c.minChunkSize,
|
||||
@@ -80,20 +91,31 @@ func (c *Chunker) ChunkReader(r io.Reader) ([]Chunk, error) {
|
||||
return chunks, nil
|
||||
}
|
||||
|
||||
// ChunkCallback is called for each chunk as it's processed
|
||||
// ChunkCallback is a function called for each chunk as it's processed.
|
||||
// The callback receives a Chunk containing the hash, data, offset, and size.
|
||||
// If the callback returns an error, chunk processing stops and the error is propagated.
|
||||
type ChunkCallback func(chunk Chunk) error
|
||||
|
||||
// ChunkReaderStreaming splits the reader into chunks and calls the callback for each
|
||||
func (c *Chunker) ChunkReaderStreaming(r io.Reader, callback ChunkCallback) error {
|
||||
// ChunkReaderStreaming splits the reader into chunks and calls the callback for each chunk.
|
||||
// This is the preferred method for processing large files or streams as it doesn't
|
||||
// accumulate all chunks in memory. The callback is invoked for each chunk as it's
|
||||
// produced, allowing for streaming processing and immediate storage or transmission.
|
||||
// Returns the SHA256 hash of the entire file content and an error if chunking fails,
|
||||
// reading fails, or if the callback returns an error.
|
||||
func (c *Chunker) ChunkReaderStreaming(r io.Reader, callback ChunkCallback) (string, error) {
|
||||
// Create a tee reader to calculate full file hash while chunking
|
||||
fileHasher := sha256.New()
|
||||
teeReader := io.TeeReader(r, fileHasher)
|
||||
|
||||
opts := fastcdc.Options{
|
||||
MinSize: c.minChunkSize,
|
||||
AverageSize: c.avgChunkSize,
|
||||
MaxSize: c.maxChunkSize,
|
||||
}
|
||||
|
||||
chunker, err := fastcdc.NewChunker(r, opts)
|
||||
chunker, err := fastcdc.NewChunker(teeReader, opts)
|
||||
if err != nil {
|
||||
return fmt.Errorf("creating chunker: %w", err)
|
||||
return "", fmt.Errorf("creating chunker: %w", err)
|
||||
}
|
||||
|
||||
offset := int64(0)
|
||||
@@ -104,10 +126,10 @@ func (c *Chunker) ChunkReaderStreaming(r io.Reader, callback ChunkCallback) erro
|
||||
break
|
||||
}
|
||||
if err != nil {
|
||||
return fmt.Errorf("reading chunk: %w", err)
|
||||
return "", fmt.Errorf("reading chunk: %w", err)
|
||||
}
|
||||
|
||||
// Calculate hash
|
||||
// Calculate chunk hash
|
||||
hash := sha256.Sum256(chunk.Data)
|
||||
|
||||
// Make a copy of the data since FastCDC reuses the buffer
|
||||
@@ -120,16 +142,20 @@ func (c *Chunker) ChunkReaderStreaming(r io.Reader, callback ChunkCallback) erro
|
||||
Offset: offset,
|
||||
Size: int64(len(chunk.Data)),
|
||||
}); err != nil {
|
||||
return fmt.Errorf("callback error: %w", err)
|
||||
return "", fmt.Errorf("callback error: %w", err)
|
||||
}
|
||||
|
||||
offset += int64(len(chunk.Data))
|
||||
}
|
||||
|
||||
return nil
|
||||
// Return the full file hash
|
||||
return hex.EncodeToString(fileHasher.Sum(nil)), nil
|
||||
}
|
||||
|
||||
// ChunkFile splits a file into content-defined chunks
|
||||
// ChunkFile splits a file into content-defined chunks by reading the entire file.
|
||||
// This is a convenience method that opens the file and passes it to ChunkReader.
|
||||
// For large files, consider using ChunkReaderStreaming with a file handle instead.
|
||||
// Returns an error if the file cannot be opened or if chunking fails.
|
||||
func (c *Chunker) ChunkFile(path string) ([]Chunk, error) {
|
||||
file, err := os.Open(path)
|
||||
if err != nil {
|
||||
|
||||
Reference in New Issue
Block a user