Major refactoring: UUID-based storage, streaming architecture, and CLI improvements

This commit represents a significant architectural overhaul of vaultik:

Database Schema Changes:
- Switch files table to use UUID primary keys instead of path-based keys
- Add UUID primary keys to blobs table for immediate chunk association
- Update all foreign key relationships to use UUIDs
- Add comprehensive schema documentation in DATAMODEL.md
- Add SQLite busy timeout handling for concurrent operations

Streaming and Performance Improvements:
- Implement true streaming blob packing without intermediate storage
- Add streaming chunk processing to reduce memory usage
- Improve progress reporting with real-time metrics
- Add upload metrics tracking in new uploads table

CLI Refactoring:
- Restructure CLI to use subcommands: snapshot create/list/purge/verify
- Add store info command for S3 configuration display
- Add custom duration parser supporting days/weeks/months/years
- Remove old backup.go in favor of enhanced snapshot.go
- Add --cron flag for silent operation

Configuration Changes:
- Remove unused index_prefix configuration option
- Add support for snapshot pruning retention policies
- Improve configuration validation and error messages

Testing Improvements:
- Add comprehensive repository tests with edge cases
- Add cascade delete debugging tests
- Fix concurrent operation tests to use SQLite busy timeout
- Remove tolerance for SQLITE_BUSY errors in tests

Documentation:
- Add MIT LICENSE file
- Update README with new command structure
- Add comprehensive DATAMODEL.md explaining database schema
- Update DESIGN.md with UUID-based architecture

Other Changes:
- Add test-config.yml for testing
- Update Makefile with better test output formatting
- Fix various race conditions in concurrent operations
- Improve error handling throughout
This commit is contained in:
2025-07-22 14:54:37 +02:00
parent 86b533d6ee
commit 78af626759
54 changed files with 5525 additions and 1109 deletions

View File

@@ -1,3 +1,17 @@
// Package blob handles the creation of blobs - the final storage units for Vaultik.
// A blob is a large file (up to 10GB) containing many compressed and encrypted chunks
// from multiple source files. Blobs are content-addressed, meaning their filename
// is derived from the SHA256 hash of their compressed and encrypted content.
//
// The blob creation process:
// 1. Chunks are accumulated from multiple files
// 2. The collection is compressed using zstd
// 3. The compressed data is encrypted using age
// 4. The encrypted blob is hashed to create its content-addressed name
// 5. The blob is uploaded to S3 using the hash as the filename
//
// This design optimizes storage efficiency by batching many small chunks into
// larger blobs, reducing the number of S3 operations and associated costs.
package blob
import (
@@ -20,19 +34,25 @@ import (
"github.com/klauspost/compress/zstd"
)
// BlobHandler is called when a blob is finalized
// BlobHandler is a callback function invoked when a blob is finalized and ready for upload.
// The handler receives a BlobWithReader containing the blob metadata and a reader for
// the compressed and encrypted blob content. The handler is responsible for uploading
// the blob to storage and cleaning up any temporary files.
type BlobHandler func(blob *BlobWithReader) error
// PackerConfig holds configuration for creating a Packer
// PackerConfig holds configuration for creating a Packer.
// Encryptor is required; BlobHandler is optional, and Repositories may be nil
// (blob metadata tracking is skipped when it is).
type PackerConfig struct {
MaxBlobSize int64
CompressionLevel int
Encryptor Encryptor // Required - blobs are always encrypted
Repositories *database.Repositories // For creating blob records
BlobHandler BlobHandler // Optional - called when blob is ready
MaxBlobSize int64 // Maximum size of a blob before forcing finalization
CompressionLevel int // Zstd compression level (1-19, higher = better compression)
Encryptor Encryptor // Age encryptor for blob encryption (required)
Repositories *database.Repositories // Database repositories for tracking blob metadata
BlobHandler BlobHandler // Optional callback when blob is ready for upload
}
// Packer combines chunks into blobs with compression and encryption
// Packer accumulates chunks and packs them into blobs.
// It handles compression, encryption, and coordination with the database
// to track blob metadata. Packer is thread-safe.
type Packer struct {
maxBlobSize int64
compressionLevel int
@@ -69,10 +89,13 @@ type blobInProgress struct {
compressedSize int64 // Current compressed size (estimated)
}
// ChunkRef represents a chunk to be added to a blob
// ChunkRef represents a chunk to be added to a blob.
// The Hash is the content-addressed identifier (SHA256) of the chunk,
// and Data contains the raw chunk bytes. After adding to a blob,
// the Data can be safely discarded as it's written to the blob immediately.
type ChunkRef struct {
Hash string
Data []byte
Hash string // SHA256 hash of the chunk data
Data []byte // Raw chunk content
}
// chunkInfo tracks chunk metadata in a blob
@@ -107,7 +130,9 @@ type BlobWithReader struct {
TempFile *os.File // Optional, only set for disk-based blobs
}
// NewPacker creates a new blob packer
// NewPacker creates a new blob packer that accumulates chunks into blobs.
// The packer will automatically finalize blobs when they reach MaxBlobSize.
// Returns an error if required configuration fields are missing or invalid.
func NewPacker(cfg PackerConfig) (*Packer, error) {
if cfg.Encryptor == nil {
return nil, fmt.Errorf("encryptor is required - blobs must be encrypted")
@@ -125,15 +150,21 @@ func NewPacker(cfg PackerConfig) (*Packer, error) {
}, nil
}
// SetBlobHandler sets the handler to be called when a blob is finalized.
// The handler is responsible for uploading the blob to storage.
// If no handler is set, finalized blobs are stored in memory and can be
// retrieved with GetFinishedBlobs().
// Safe for concurrent use: the assignment is guarded by p.mu.
func (p *Packer) SetBlobHandler(handler BlobHandler) {
	p.mu.Lock()
	defer p.mu.Unlock()
	p.blobHandler = handler
}
// AddChunk adds a chunk to the current blob
// Returns ErrBlobSizeLimitExceeded if adding the chunk would exceed the size limit
// AddChunk adds a chunk to the current blob being packed.
// If adding the chunk would exceed MaxBlobSize, returns ErrBlobSizeLimitExceeded.
// In this case, the caller should finalize the current blob and retry.
// The chunk data is written immediately and can be garbage collected after this call.
// Thread-safe.
func (p *Packer) AddChunk(chunk *ChunkRef) error {
p.mu.Lock()
defer p.mu.Unlock()
@@ -166,7 +197,10 @@ func (p *Packer) AddChunk(chunk *ChunkRef) error {
return nil
}
// Flush finalizes any pending blob
// Flush finalizes any in-progress blob, compressing, encrypting, and hashing it.
// This should be called after all chunks have been added to ensure no data is lost.
// If a BlobHandler is set, it will be called with the finalized blob.
// Thread-safe.
func (p *Packer) Flush() error {
p.mu.Lock()
defer p.mu.Unlock()
@@ -180,8 +214,12 @@ func (p *Packer) Flush() error {
return nil
}
// FinalizeBlob finalizes the current blob being assembled
// Caller must handle retrying the chunk that triggered size limit
// FinalizeBlob finalizes the current blob being assembled.
// This compresses the accumulated chunks, encrypts the result, and computes
// the content-addressed hash. The finalized blob is either passed to the
// BlobHandler (if set) or stored internally.
// Caller must handle retrying any chunk that triggered size limit exceeded.
// Thread-safe: FinalizeBlob acquires p.mu itself — do not call it while
// already holding the lock, or it will deadlock.
func (p *Packer) FinalizeBlob() error {
p.mu.Lock()
defer p.mu.Unlock()
@@ -193,7 +231,10 @@ func (p *Packer) FinalizeBlob() error {
return p.finalizeCurrentBlob()
}
// GetFinishedBlobs returns all completed blobs and clears the list
// GetFinishedBlobs returns all completed blobs and clears the internal list.
// This is only used when no BlobHandler is set. After calling this method,
// the caller is responsible for uploading the blobs to storage.
// Thread-safe.
func (p *Packer) GetFinishedBlobs() []*FinishedBlob {
p.mu.Lock()
defer p.mu.Unlock()
@@ -212,8 +253,8 @@ func (p *Packer) startNewBlob() error {
if p.repos != nil {
blob := &database.Blob{
ID: blobID,
Hash: "", // Will be set when finalized
CreatedTS: time.Now(),
Hash: "temp-placeholder-" + blobID, // Temporary placeholder until finalized
CreatedTS: time.Now().UTC(),
FinishedTS: nil,
UncompressedSize: 0,
CompressedSize: 0,
@@ -237,7 +278,7 @@ func (p *Packer) startNewBlob() error {
id: blobID,
chunks: make([]*chunkInfo, 0),
chunkSet: make(map[string]bool),
startTime: time.Now(),
startTime: time.Now().UTC(),
tempFile: tempFile,
hasher: sha256.New(),
size: 0,