Major refactoring: UUID-based storage, streaming architecture, and CLI improvements
This commit represents a significant architectural overhaul of vaultik:

Database Schema Changes:
- Switch files table to use UUID primary keys instead of path-based keys
- Add UUID primary keys to blobs table for immediate chunk association
- Update all foreign key relationships to use UUIDs
- Add comprehensive schema documentation in DATAMODEL.md
- Add SQLite busy timeout handling for concurrent operations

Streaming and Performance Improvements:
- Implement true streaming blob packing without intermediate storage
- Add streaming chunk processing to reduce memory usage
- Improve progress reporting with real-time metrics
- Add upload metrics tracking in new uploads table

CLI Refactoring:
- Restructure CLI to use subcommands: snapshot create/list/purge/verify
- Add store info command for S3 configuration display
- Add custom duration parser supporting days/weeks/months/years
- Remove old backup.go in favor of enhanced snapshot.go
- Add --cron flag for silent operation

Configuration Changes:
- Remove unused index_prefix configuration option
- Add support for snapshot pruning retention policies
- Improve configuration validation and error messages

Testing Improvements:
- Add comprehensive repository tests with edge cases
- Add cascade delete debugging tests
- Fix concurrent operation tests to use SQLite busy timeout
- Remove tolerance for SQLITE_BUSY errors in tests

Documentation:
- Add MIT LICENSE file
- Update README with new command structure
- Add comprehensive DATAMODEL.md explaining database schema
- Update DESIGN.md with UUID-based architecture

Other Changes:
- Add test-config.yml for testing
- Update Makefile with better test output formatting
- Fix various race conditions in concurrent operations
- Improve error handling throughout
@@ -1,3 +1,17 @@
+// Package blob handles the creation of blobs - the final storage units for Vaultik.
+// A blob is a large file (up to 10GB) containing many compressed and encrypted chunks
+// from multiple source files. Blobs are content-addressed, meaning their filename
+// is derived from the SHA256 hash of their compressed and encrypted content.
+//
+// The blob creation process:
+// 1. Chunks are accumulated from multiple files
+// 2. The collection is compressed using zstd
+// 3. The compressed data is encrypted using age
+// 4. The encrypted blob is hashed to create its content-addressed name
+// 5. The blob is uploaded to S3 using the hash as the filename
+//
+// This design optimizes storage efficiency by batching many small chunks into
+// larger blobs, reducing the number of S3 operations and associated costs.
 package blob
 
 import (
@@ -20,19 +34,25 @@ import (
 	"github.com/klauspost/compress/zstd"
 )
 
-// BlobHandler is called when a blob is finalized
+// BlobHandler is a callback function invoked when a blob is finalized and ready for upload.
+// The handler receives a BlobWithReader containing the blob metadata and a reader for
+// the compressed and encrypted blob content. The handler is responsible for uploading
+// the blob to storage and cleaning up any temporary files.
 type BlobHandler func(blob *BlobWithReader) error
 
-// PackerConfig holds configuration for creating a Packer
+// PackerConfig holds configuration for creating a Packer.
+// All fields except BlobHandler are required.
 type PackerConfig struct {
-	MaxBlobSize      int64
-	CompressionLevel int
-	Encryptor        Encryptor              // Required - blobs are always encrypted
-	Repositories     *database.Repositories // For creating blob records
-	BlobHandler      BlobHandler            // Optional - called when blob is ready
+	MaxBlobSize      int64                  // Maximum size of a blob before forcing finalization
+	CompressionLevel int                    // Zstd compression level (1-19, higher = better compression)
+	Encryptor        Encryptor              // Age encryptor for blob encryption (required)
+	Repositories     *database.Repositories // Database repositories for tracking blob metadata
+	BlobHandler      BlobHandler            // Optional callback when blob is ready for upload
 }
 
-// Packer combines chunks into blobs with compression and encryption
+// Packer accumulates chunks and packs them into blobs.
+// It handles compression, encryption, and coordination with the database
+// to track blob metadata. Packer is thread-safe.
 type Packer struct {
 	maxBlobSize      int64
 	compressionLevel int
@@ -69,10 +89,13 @@ type blobInProgress struct {
 	compressedSize int64 // Current compressed size (estimated)
 }
 
-// ChunkRef represents a chunk to be added to a blob
+// ChunkRef represents a chunk to be added to a blob.
+// The Hash is the content-addressed identifier (SHA256) of the chunk,
+// and Data contains the raw chunk bytes. After adding to a blob,
+// the Data can be safely discarded as it's written to the blob immediately.
 type ChunkRef struct {
-	Hash string
-	Data []byte
+	Hash string // SHA256 hash of the chunk data
+	Data []byte // Raw chunk content
 }
 
 // chunkInfo tracks chunk metadata in a blob
@@ -107,7 +130,9 @@ type BlobWithReader struct {
 	TempFile *os.File // Optional, only set for disk-based blobs
 }
 
-// NewPacker creates a new blob packer
+// NewPacker creates a new blob packer that accumulates chunks into blobs.
+// The packer will automatically finalize blobs when they reach MaxBlobSize.
+// Returns an error if required configuration fields are missing or invalid.
 func NewPacker(cfg PackerConfig) (*Packer, error) {
 	if cfg.Encryptor == nil {
 		return nil, fmt.Errorf("encryptor is required - blobs must be encrypted")
@@ -125,15 +150,21 @@ func NewPacker(cfg PackerConfig) (*Packer, error) {
 	}, nil
}
 
-// SetBlobHandler sets the handler to be called when a blob is finalized
+// SetBlobHandler sets the handler to be called when a blob is finalized.
+// The handler is responsible for uploading the blob to storage.
+// If no handler is set, finalized blobs are stored in memory and can be
+// retrieved with GetFinishedBlobs().
 func (p *Packer) SetBlobHandler(handler BlobHandler) {
 	p.mu.Lock()
 	defer p.mu.Unlock()
 	p.blobHandler = handler
 }
 
-// AddChunk adds a chunk to the current blob
-// Returns ErrBlobSizeLimitExceeded if adding the chunk would exceed the size limit
+// AddChunk adds a chunk to the current blob being packed.
+// If adding the chunk would exceed MaxBlobSize, returns ErrBlobSizeLimitExceeded.
+// In this case, the caller should finalize the current blob and retry.
+// The chunk data is written immediately and can be garbage collected after this call.
+// Thread-safe.
 func (p *Packer) AddChunk(chunk *ChunkRef) error {
 	p.mu.Lock()
 	defer p.mu.Unlock()
@@ -166,7 +197,10 @@ func (p *Packer) AddChunk(chunk *ChunkRef) error {
 	return nil
 }
 
-// Flush finalizes any pending blob
+// Flush finalizes any in-progress blob, compressing, encrypting, and hashing it.
+// This should be called after all chunks have been added to ensure no data is lost.
+// If a BlobHandler is set, it will be called with the finalized blob.
+// Thread-safe.
 func (p *Packer) Flush() error {
 	p.mu.Lock()
 	defer p.mu.Unlock()
@@ -180,8 +214,12 @@ func (p *Packer) Flush() error {
 	return nil
 }
 
-// FinalizeBlob finalizes the current blob being assembled
-// Caller must handle retrying the chunk that triggered size limit
+// FinalizeBlob finalizes the current blob being assembled.
+// This compresses the accumulated chunks, encrypts the result, and computes
+// the content-addressed hash. The finalized blob is either passed to the
+// BlobHandler (if set) or stored internally.
+// Caller must handle retrying any chunk that triggered size limit exceeded.
+// Not thread-safe - caller must hold the lock.
 func (p *Packer) FinalizeBlob() error {
 	p.mu.Lock()
 	defer p.mu.Unlock()
@@ -193,7 +231,10 @@ func (p *Packer) FinalizeBlob() error {
 	return p.finalizeCurrentBlob()
 }
 
-// GetFinishedBlobs returns all completed blobs and clears the list
+// GetFinishedBlobs returns all completed blobs and clears the internal list.
+// This is only used when no BlobHandler is set. After calling this method,
+// the caller is responsible for uploading the blobs to storage.
+// Thread-safe.
 func (p *Packer) GetFinishedBlobs() []*FinishedBlob {
 	p.mu.Lock()
 	defer p.mu.Unlock()
@@ -212,8 +253,8 @@ func (p *Packer) startNewBlob() error {
 	if p.repos != nil {
 		blob := &database.Blob{
 			ID:               blobID,
-			Hash:             "", // Will be set when finalized
-			CreatedTS:        time.Now(),
+			Hash:             "temp-placeholder-" + blobID, // Temporary placeholder until finalized
+			CreatedTS:        time.Now().UTC(),
 			FinishedTS:       nil,
 			UncompressedSize: 0,
 			CompressedSize:   0,
@@ -237,7 +278,7 @@ func (p *Packer) startNewBlob() error {
 		id:        blobID,
 		chunks:    make([]*chunkInfo, 0),
 		chunkSet:  make(map[string]bool),
-		startTime: time.Now(),
+		startTime: time.Now().UTC(),
 		tempFile:  tempFile,
 		hasher:    sha256.New(),
 		size:      0,