vaultik/internal/chunker/chunker.go
sneak 78af626759 Major refactoring: UUID-based storage, streaming architecture, and CLI improvements
This commit represents a significant architectural overhaul of vaultik:

Database Schema Changes:
- Switch files table to use UUID primary keys instead of path-based keys
- Add UUID primary keys to blobs table for immediate chunk association
- Update all foreign key relationships to use UUIDs
- Add comprehensive schema documentation in DATAMODEL.md
- Add SQLite busy timeout handling for concurrent operations (sketched below)
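
A minimal sketch of what the busy timeout handling amounts to, assuming a database/sql connection to the SQLite index; the helper name, package name, and 5-second-style millisecond value are illustrative, not vaultik's actual code:

package sketch // hypothetical package, for illustration only

import (
	"context"
	"database/sql"
	"fmt"
)

// setBusyTimeout tells SQLite to wait up to ms milliseconds for a lock held by
// a concurrent writer instead of failing immediately with SQLITE_BUSY.
func setBusyTimeout(ctx context.Context, db *sql.DB, ms int) error {
	// PRAGMA arguments cannot be bound as placeholders, so format the value in.
	if _, err := db.ExecContext(ctx, fmt.Sprintf("PRAGMA busy_timeout = %d", ms)); err != nil {
		return fmt.Errorf("setting busy timeout: %w", err)
	}
	return nil
}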

Streaming and Performance Improvements:
- Implement true streaming blob packing without intermediate storage (see the sketch after this list)
- Add streaming chunk processing to reduce memory usage
- Improve progress reporting with real-time metrics
- Add upload metrics tracking in new uploads table
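
The streaming blob packing mentioned above avoids materializing a whole blob on disk or in memory before upload. A generic sketch of the pattern, assuming an S3-style uploader that consumes an io.Reader; the Uploader interface and all names here are illustrative, not vaultik's actual blob packer:

package sketch // hypothetical package, for illustration only

import (
	"fmt"
	"io"
)

// Uploader stands in for an S3-style client that consumes a stream.
type Uploader interface {
	Upload(key string, body io.Reader) error
}

// packAndUpload streams chunk data straight into an upload without writing an
// intermediate blob file: a goroutine feeds one end of an io.Pipe while the
// uploader reads from the other.
func packAndUpload(up Uploader, key string, chunks <-chan []byte) error {
	pr, pw := io.Pipe()
	// Closing the read end on return unblocks the writer goroutine if the
	// upload fails partway through.
	defer pr.Close()

	go func() {
		for data := range chunks {
			if _, err := pw.Write(data); err != nil {
				pw.CloseWithError(err)
				return
			}
		}
		pw.Close() // signal EOF to the uploader
	}()

	if err := up.Upload(key, pr); err != nil {
		return fmt.Errorf("uploading blob %s: %w", key, err)
	}
	return nil
}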

CLI Refactoring:
- Restructure CLI to use subcommands: snapshot create/list/purge/verify
- Add store info command for S3 configuration display
- Add custom duration parser supporting days/weeks/months/years (see the sketch after this list)
- Remove old backup.go in favor of enhanced snapshot.go
- Add --cron flag for silent operation
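
Go's time.ParseDuration stops at hours, which is why retention strings like "30d" or "1y" need a custom parser. A hypothetical sketch of such a parser; the unit suffixes and the fixed 30-day month / 365-day year are illustrative, not necessarily vaultik's exact semantics:

package sketch // hypothetical package, for illustration only

import (
	"fmt"
	"strconv"
	"strings"
	"time"
)

// parseRetention handles suffixes like "7d", "4w", "6m", "1y" that
// time.ParseDuration does not accept, and falls back to it otherwise.
// Month and year lengths are fixed approximations for illustration.
func parseRetention(s string) (time.Duration, error) {
	units := map[string]time.Duration{
		"d": 24 * time.Hour,
		"w": 7 * 24 * time.Hour,
		"m": 30 * 24 * time.Hour,
		"y": 365 * 24 * time.Hour,
	}
	for suffix, unit := range units {
		if strings.HasSuffix(s, suffix) {
			n, err := strconv.Atoi(strings.TrimSuffix(s, suffix))
			if err != nil {
				return 0, fmt.Errorf("invalid duration %q: %w", s, err)
			}
			return time.Duration(n) * unit, nil
		}
	}
	return time.ParseDuration(s)
}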

Configuration Changes:
- Remove unused index_prefix configuration option
- Add support for snapshot pruning retention policies
- Improve configuration validation and error messages

Testing Improvements:
- Add comprehensive repository tests with edge cases
- Add cascade delete debugging tests
- Fix concurrent operation tests to use SQLite busy timeout
- Remove tolerance for SQLITE_BUSY errors in tests

Documentation:
- Add MIT LICENSE file
- Update README with new command structure
- Add comprehensive DATAMODEL.md explaining database schema
- Update DESIGN.md with UUID-based architecture

Other Changes:
- Add test-config.yml for testing
- Update Makefile with better test output formatting
- Fix various race conditions in concurrent operations
- Improve error handling throughout
2025-07-22 14:56:44 +02:00


package chunker

import (
	"crypto/sha256"
	"encoding/hex"
	"fmt"
	"io"
	"os"

	"github.com/jotfs/fastcdc-go"
)

// Chunk represents a single chunk of data produced by the content-defined chunking algorithm.
// Each chunk is identified by its SHA256 hash and contains the raw data along with
// its position and size information from the original file.
type Chunk struct {
	Hash   string // Content hash of the chunk
	Data   []byte // Chunk data
	Offset int64  // Offset in the original file
	Size   int64  // Size of the chunk
}

// Chunker provides content-defined chunking using the FastCDC algorithm.
// It splits data into variable-sized chunks based on content patterns, ensuring
// that identical data sequences produce identical chunks regardless of their
// position in the file. This enables efficient deduplication.
type Chunker struct {
	avgChunkSize int
	minChunkSize int
	maxChunkSize int
}

// NewChunker creates a new chunker with the specified average chunk size.
// The actual chunk sizes will vary between avgChunkSize/4 and avgChunkSize*4
// as recommended by the FastCDC algorithm. Typical values for avgChunkSize
// are 64KB (65536), 256KB (262144), or 1MB (1048576).
func NewChunker(avgChunkSize int64) *Chunker {
	// FastCDC recommends min = avg/4 and max = avg*4
	return &Chunker{
		avgChunkSize: int(avgChunkSize),
		minChunkSize: int(avgChunkSize / 4),
		maxChunkSize: int(avgChunkSize * 4),
	}
}

// ChunkReader splits the reader into content-defined chunks and returns all chunks at once.
// This method loads all chunk data into memory, so it should only be used for
// reasonably sized inputs. For large files or streams, use ChunkReaderStreaming instead.
// Returns an error if chunking fails or if reading from the input fails.
func (c *Chunker) ChunkReader(r io.Reader) ([]Chunk, error) {
	opts := fastcdc.Options{
		MinSize:     c.minChunkSize,
		AverageSize: c.avgChunkSize,
		MaxSize:     c.maxChunkSize,
	}
	chunker, err := fastcdc.NewChunker(r, opts)
	if err != nil {
		return nil, fmt.Errorf("creating chunker: %w", err)
	}

	var chunks []Chunk
	offset := int64(0)
	for {
		chunk, err := chunker.Next()
		if err == io.EOF {
			break
		}
		if err != nil {
			return nil, fmt.Errorf("reading chunk: %w", err)
		}

		// Calculate hash
		hash := sha256.Sum256(chunk.Data)

		// Make a copy of the data since FastCDC reuses the buffer
		chunkData := make([]byte, len(chunk.Data))
		copy(chunkData, chunk.Data)

		chunks = append(chunks, Chunk{
			Hash:   hex.EncodeToString(hash[:]),
			Data:   chunkData,
			Offset: offset,
			Size:   int64(len(chunk.Data)),
		})
		offset += int64(len(chunk.Data))
	}
	return chunks, nil
}

// ChunkCallback is a function called for each chunk as it's processed.
// The callback receives a Chunk containing the hash, data, offset, and size.
// If the callback returns an error, chunk processing stops and the error is propagated.
type ChunkCallback func(chunk Chunk) error

// ChunkReaderStreaming splits the reader into chunks and calls the callback for each chunk.
// This is the preferred method for processing large files or streams as it doesn't
// accumulate all chunks in memory. The callback is invoked for each chunk as it's
// produced, allowing for streaming processing and immediate storage or transmission.
// Returns the SHA256 hash of the entire file content and an error if chunking fails,
// reading fails, or if the callback returns an error.
func (c *Chunker) ChunkReaderStreaming(r io.Reader, callback ChunkCallback) (string, error) {
	// Create a tee reader to calculate full file hash while chunking
	fileHasher := sha256.New()
	teeReader := io.TeeReader(r, fileHasher)

	opts := fastcdc.Options{
		MinSize:     c.minChunkSize,
		AverageSize: c.avgChunkSize,
		MaxSize:     c.maxChunkSize,
	}
	chunker, err := fastcdc.NewChunker(teeReader, opts)
	if err != nil {
		return "", fmt.Errorf("creating chunker: %w", err)
	}

	offset := int64(0)
	for {
		chunk, err := chunker.Next()
		if err == io.EOF {
			break
		}
		if err != nil {
			return "", fmt.Errorf("reading chunk: %w", err)
		}

		// Calculate chunk hash
		hash := sha256.Sum256(chunk.Data)

		// Make a copy of the data since FastCDC reuses the buffer
		chunkData := make([]byte, len(chunk.Data))
		copy(chunkData, chunk.Data)

		if err := callback(Chunk{
			Hash:   hex.EncodeToString(hash[:]),
			Data:   chunkData,
			Offset: offset,
			Size:   int64(len(chunk.Data)),
		}); err != nil {
			return "", fmt.Errorf("callback error: %w", err)
		}
		offset += int64(len(chunk.Data))
	}

	// Return the full file hash
	return hex.EncodeToString(fileHasher.Sum(nil)), nil
}

// ChunkFile splits a file into content-defined chunks by reading the entire file.
// This is a convenience method that opens the file and passes it to ChunkReader.
// For large files, consider using ChunkReaderStreaming with a file handle instead.
// Returns an error if the file cannot be opened or if chunking fails.
func (c *Chunker) ChunkFile(path string) ([]Chunk, error) {
	file, err := os.Open(path)
	if err != nil {
		return nil, fmt.Errorf("opening file: %w", err)
	}
	defer func() {
		if err := file.Close(); err != nil && err.Error() != "invalid argument" {
			// Log error or handle as needed
			_ = err
		}
	}()
	return c.ChunkReader(file)
}
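
For context, a minimal usage sketch of the streaming API above, written as if it lived in the chunker package itself so that no module import path has to be assumed; the function name, file path handling, and per-chunk logic are illustrative only:

package chunker

import (
	"fmt"
	"os"
)

// demoStreamingChunking is an illustrative sketch, not part of vaultik: it
// streams a file through the chunker and handles each chunk as it is produced
// instead of accumulating them all in memory.
func demoStreamingChunking(path string) error {
	c := NewChunker(256 * 1024) // 256 KiB average chunk size

	f, err := os.Open(path)
	if err != nil {
		return fmt.Errorf("opening %s: %w", path, err)
	}
	defer f.Close()

	var count int
	fileHash, err := c.ChunkReaderStreaming(f, func(ch Chunk) error {
		// A real backup path would compress, encrypt, and pack the chunk into
		// a blob here; this sketch just counts chunks.
		count++
		return nil
	})
	if err != nil {
		return fmt.Errorf("chunking %s: %w", path, err)
	}

	fmt.Printf("%s: %d chunks, sha256 %s\n", path, count, fileHash)
	return nil
}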