vaultik/internal/chunker/chunker.go
sneak 86b533d6ee Refactor blob storage to use UUID primary keys and implement streaming chunking
- Changed blob table to use ID (UUID) as primary key instead of hash
- Blob records are now created at packing start, enabling immediate chunk associations
- Implemented streaming chunking to process large files without memory exhaustion
- Fixed blob manifest generation to include all referenced blobs
- Updated all foreign key references from blob_hash to blob_id
- Added progress reporting and improved error handling
- Enforced encryption requirement for all blob packing
- Updated tests to use test encryption keys
- Added Cyrillic transliteration to README
2025-07-22 07:43:39 +02:00

package chunker

import (
	"crypto/sha256"
	"encoding/hex"
	"fmt"
	"io"
	"os"

	"github.com/jotfs/fastcdc-go"
)

// Chunk represents a single chunk of data
type Chunk struct {
	Hash   string // Content hash of the chunk
	Data   []byte // Chunk data
	Offset int64  // Offset in the original file
	Size   int64  // Size of the chunk
}

// Chunker provides content-defined chunking using FastCDC
type Chunker struct {
	avgChunkSize int
	minChunkSize int
	maxChunkSize int
}

// NewChunker creates a new chunker with the specified average chunk size
func NewChunker(avgChunkSize int64) *Chunker {
	// FastCDC recommends min = avg/4 and max = avg*4
	return &Chunker{
		avgChunkSize: int(avgChunkSize),
		minChunkSize: int(avgChunkSize / 4),
		maxChunkSize: int(avgChunkSize * 4),
	}
}
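
// Example (illustrative, not part of the original file): with a 1 MiB
// average chunk size, the derived FastCDC bounds work out to 256 KiB
// minimum and 4 MiB maximum.
//
//	c := NewChunker(1 << 20)
//	// c.minChunkSize == 256<<10, c.avgChunkSize == 1<<20,
//	// c.maxChunkSize == 4<<20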

// ChunkReader splits the reader into content-defined chunks
func (c *Chunker) ChunkReader(r io.Reader) ([]Chunk, error) {
	opts := fastcdc.Options{
		MinSize:     c.minChunkSize,
		AverageSize: c.avgChunkSize,
		MaxSize:     c.maxChunkSize,
	}
	chunker, err := fastcdc.NewChunker(r, opts)
	if err != nil {
		return nil, fmt.Errorf("creating chunker: %w", err)
	}

	var chunks []Chunk
	offset := int64(0)
	for {
		chunk, err := chunker.Next()
		if err == io.EOF {
			break
		}
		if err != nil {
			return nil, fmt.Errorf("reading chunk: %w", err)
		}

		// Calculate hash
		hash := sha256.Sum256(chunk.Data)

		// Make a copy of the data since FastCDC reuses the buffer
		chunkData := make([]byte, len(chunk.Data))
		copy(chunkData, chunk.Data)

		chunks = append(chunks, Chunk{
			Hash:   hex.EncodeToString(hash[:]),
			Data:   chunkData,
			Offset: offset,
			Size:   int64(len(chunk.Data)),
		})
		offset += int64(len(chunk.Data))
	}
	return chunks, nil
}
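
// Usage sketch (illustrative, not part of the original file): ChunkReader
// buffers every chunk, so it is only appropriate for inputs that fit in
// memory. bytes.NewReader stands in for any io.Reader.
//
//	c := NewChunker(64 << 10)
//	chunks, err := c.ChunkReader(bytes.NewReader(data))
//	if err != nil {
//		return err
//	}
//	for _, ch := range chunks {
//		fmt.Println(ch.Hash, ch.Offset, ch.Size)
//	}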

// ChunkCallback is called for each chunk as it's processed
type ChunkCallback func(chunk Chunk) error

// ChunkReaderStreaming splits the reader into chunks and calls the callback for each
func (c *Chunker) ChunkReaderStreaming(r io.Reader, callback ChunkCallback) error {
	opts := fastcdc.Options{
		MinSize:     c.minChunkSize,
		AverageSize: c.avgChunkSize,
		MaxSize:     c.maxChunkSize,
	}
	chunker, err := fastcdc.NewChunker(r, opts)
	if err != nil {
		return fmt.Errorf("creating chunker: %w", err)
	}

	offset := int64(0)
	for {
		chunk, err := chunker.Next()
		if err == io.EOF {
			break
		}
		if err != nil {
			return fmt.Errorf("reading chunk: %w", err)
		}

		// Calculate hash
		hash := sha256.Sum256(chunk.Data)

		// Make a copy of the data since FastCDC reuses the buffer
		chunkData := make([]byte, len(chunk.Data))
		copy(chunkData, chunk.Data)

		if err := callback(Chunk{
			Hash:   hex.EncodeToString(hash[:]),
			Data:   chunkData,
			Offset: offset,
			Size:   int64(len(chunk.Data)),
		}); err != nil {
			return fmt.Errorf("callback error: %w", err)
		}
		offset += int64(len(chunk.Data))
	}
	return nil
}
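
// Usage sketch (illustrative, not part of the original file): streaming
// holds at most one chunk copy in memory at a time, which is what allows
// large files to be chunked without memory exhaustion. "store" is a
// hypothetical consumer, e.g. the blob packer.
//
//	err := c.ChunkReaderStreaming(file, func(ch Chunk) error {
//		return store(ch)
//	})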

// ChunkFile splits a file into content-defined chunks
func (c *Chunker) ChunkFile(path string) ([]Chunk, error) {
	file, err := os.Open(path)
	if err != nil {
		return nil, fmt.Errorf("opening file: %w", err)
	}
	// The file is only read, so a Close error carries no useful signal here.
	defer func() {
		_ = file.Close()
	}()

	return c.ChunkReader(file)
}
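
// reassemble is an illustrative helper (not part of the original file)
// demonstrating the round-trip property of the chunker: copying each
// chunk's Data back at its Offset reproduces the original byte stream.
func reassemble(chunks []Chunk) []byte {
	var total int64
	for _, ch := range chunks {
		total += ch.Size
	}
	out := make([]byte, total)
	for _, ch := range chunks {
		copy(out[ch.Offset:], ch.Data)
	}
	return out
}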