Major refactoring: UUID-based storage, streaming architecture, and CLI improvements
This commit represents a significant architectural overhaul of vaultik:

Database Schema Changes:
- Switch files table to use UUID primary keys instead of path-based keys
- Add UUID primary keys to blobs table for immediate chunk association
- Update all foreign key relationships to use UUIDs
- Add comprehensive schema documentation in DATAMODEL.md
- Add SQLite busy timeout handling for concurrent operations

Streaming and Performance Improvements:
- Implement true streaming blob packing without intermediate storage
- Add streaming chunk processing to reduce memory usage
- Improve progress reporting with real-time metrics
- Add upload metrics tracking in new uploads table

CLI Refactoring:
- Restructure CLI to use subcommands: snapshot create/list/purge/verify
- Add store info command for S3 configuration display
- Add custom duration parser supporting days/weeks/months/years
- Remove old backup.go in favor of enhanced snapshot.go
- Add --cron flag for silent operation

Configuration Changes:
- Remove unused index_prefix configuration option
- Add support for snapshot pruning retention policies
- Improve configuration validation and error messages

Testing Improvements:
- Add comprehensive repository tests with edge cases
- Add cascade delete debugging tests
- Fix concurrent operation tests to use SQLite busy timeout
- Remove tolerance for SQLITE_BUSY errors in tests

Documentation:
- Add MIT LICENSE file
- Update README with new command structure
- Add comprehensive DATAMODEL.md explaining database schema
- Update DESIGN.md with UUID-based architecture

Other Changes:
- Add test-config.yml for testing
- Update Makefile with better test output formatting
- Fix various race conditions in concurrent operations
- Improve error handling throughout
This commit is contained in:
@@ -10,7 +10,9 @@ import (
|
||||
"github.com/jotfs/fastcdc-go"
|
||||
)
|
||||
|
||||
// Chunk represents a single chunk of data
|
||||
// Chunk represents a single chunk of data produced by the content-defined chunking algorithm.
|
||||
// Each chunk is identified by its SHA256 hash and contains the raw data along with
|
||||
// its position and size information from the original file.
|
||||
type Chunk struct {
|
||||
Hash string // Content hash of the chunk
|
||||
Data []byte // Chunk data
|
||||
@@ -18,14 +20,20 @@ type Chunk struct {
|
||||
Size int64 // Size of the chunk
|
||||
}
|
||||
|
||||
// Chunker provides content-defined chunking using FastCDC
|
||||
// Chunker provides content-defined chunking using the FastCDC algorithm.
|
||||
// It splits data into variable-sized chunks based on content patterns, ensuring
|
||||
// that identical data sequences produce identical chunks regardless of their
|
||||
// position in the file. This enables efficient deduplication.
|
||||
type Chunker struct {
|
||||
avgChunkSize int
|
||||
minChunkSize int
|
||||
maxChunkSize int
|
||||
}
|
||||
|
||||
// NewChunker creates a new chunker with the specified average chunk size
|
||||
// NewChunker creates a new chunker with the specified average chunk size.
|
||||
// The actual chunk sizes will vary between avgChunkSize/4 and avgChunkSize*4
|
||||
// as recommended by the FastCDC algorithm. Typical values for avgChunkSize
|
||||
// are 64KB (65536), 256KB (262144), or 1MB (1048576).
|
||||
func NewChunker(avgChunkSize int64) *Chunker {
|
||||
// FastCDC recommends min = avg/4 and max = avg*4
|
||||
return &Chunker{
|
||||
@@ -35,7 +43,10 @@ func NewChunker(avgChunkSize int64) *Chunker {
|
||||
}
|
||||
}
|
||||
|
||||
// ChunkReader splits the reader into content-defined chunks
|
||||
// ChunkReader splits the reader into content-defined chunks and returns all chunks at once.
|
||||
// This method loads all chunk data into memory, so it should only be used for
|
||||
// reasonably sized inputs. For large files or streams, use ChunkReaderStreaming instead.
|
||||
// Returns an error if chunking fails or if reading from the input fails.
|
||||
func (c *Chunker) ChunkReader(r io.Reader) ([]Chunk, error) {
|
||||
opts := fastcdc.Options{
|
||||
MinSize: c.minChunkSize,
|
||||
@@ -80,20 +91,31 @@ func (c *Chunker) ChunkReader(r io.Reader) ([]Chunk, error) {
|
||||
return chunks, nil
|
||||
}
|
||||
|
||||
// ChunkCallback is called for each chunk as it's processed
|
||||
// ChunkCallback is a function called for each chunk as it's processed.
|
||||
// The callback receives a Chunk containing the hash, data, offset, and size.
|
||||
// If the callback returns an error, chunk processing stops and the error is propagated.
|
||||
type ChunkCallback func(chunk Chunk) error
|
||||
|
||||
// ChunkReaderStreaming splits the reader into chunks and calls the callback for each
|
||||
func (c *Chunker) ChunkReaderStreaming(r io.Reader, callback ChunkCallback) error {
|
||||
// ChunkReaderStreaming splits the reader into chunks and calls the callback for each chunk.
|
||||
// This is the preferred method for processing large files or streams as it doesn't
|
||||
// accumulate all chunks in memory. The callback is invoked for each chunk as it's
|
||||
// produced, allowing for streaming processing and immediate storage or transmission.
|
||||
// Returns the SHA256 hash of the entire file content and an error if chunking fails,
|
||||
// reading fails, or if the callback returns an error.
|
||||
func (c *Chunker) ChunkReaderStreaming(r io.Reader, callback ChunkCallback) (string, error) {
|
||||
// Create a tee reader to calculate full file hash while chunking
|
||||
fileHasher := sha256.New()
|
||||
teeReader := io.TeeReader(r, fileHasher)
|
||||
|
||||
opts := fastcdc.Options{
|
||||
MinSize: c.minChunkSize,
|
||||
AverageSize: c.avgChunkSize,
|
||||
MaxSize: c.maxChunkSize,
|
||||
}
|
||||
|
||||
chunker, err := fastcdc.NewChunker(r, opts)
|
||||
chunker, err := fastcdc.NewChunker(teeReader, opts)
|
||||
if err != nil {
|
||||
return fmt.Errorf("creating chunker: %w", err)
|
||||
return "", fmt.Errorf("creating chunker: %w", err)
|
||||
}
|
||||
|
||||
offset := int64(0)
|
||||
@@ -104,10 +126,10 @@ func (c *Chunker) ChunkReaderStreaming(r io.Reader, callback ChunkCallback) erro
|
||||
break
|
||||
}
|
||||
if err != nil {
|
||||
return fmt.Errorf("reading chunk: %w", err)
|
||||
return "", fmt.Errorf("reading chunk: %w", err)
|
||||
}
|
||||
|
||||
// Calculate hash
|
||||
// Calculate chunk hash
|
||||
hash := sha256.Sum256(chunk.Data)
|
||||
|
||||
// Make a copy of the data since FastCDC reuses the buffer
|
||||
@@ -120,16 +142,20 @@ func (c *Chunker) ChunkReaderStreaming(r io.Reader, callback ChunkCallback) erro
|
||||
Offset: offset,
|
||||
Size: int64(len(chunk.Data)),
|
||||
}); err != nil {
|
||||
return fmt.Errorf("callback error: %w", err)
|
||||
return "", fmt.Errorf("callback error: %w", err)
|
||||
}
|
||||
|
||||
offset += int64(len(chunk.Data))
|
||||
}
|
||||
|
||||
return nil
|
||||
// Return the full file hash
|
||||
return hex.EncodeToString(fileHasher.Sum(nil)), nil
|
||||
}
|
||||
|
||||
// ChunkFile splits a file into content-defined chunks
|
||||
// ChunkFile splits a file into content-defined chunks by reading the entire file.
|
||||
// This is a convenience method that opens the file and passes it to ChunkReader.
|
||||
// For large files, consider using ChunkReaderStreaming with a file handle instead.
|
||||
// Returns an error if the file cannot be opened or if chunking fails.
|
||||
func (c *Chunker) ChunkFile(path string) ([]Chunk, error) {
|
||||
file, err := os.Open(path)
|
||||
if err != nil {
|
||||
|
||||
Reference in New Issue
Block a user