- Changed blob table to use ID (UUID) as primary key instead of hash
- Blob records are now created at packing start, enabling immediate chunk associations
- Implemented streaming chunking to process large files without memory exhaustion
- Fixed blob manifest generation to include all referenced blobs
- Updated all foreign key references from blob_hash to blob_id
- Added progress reporting and improved error handling
- Enforced encryption requirement for all blob packing
- Updated tests to use test encryption keys
- Added Cyrillic transliteration to README
147 lines
3.2 KiB
Go
147 lines
3.2 KiB
Go
package chunker
|
|
|
|
import (
|
|
"crypto/sha256"
|
|
"encoding/hex"
|
|
"fmt"
|
|
"io"
|
|
"os"
|
|
|
|
"github.com/jotfs/fastcdc-go"
|
|
)
|
|
|
|
// Chunk is one content-defined piece of an input stream, as produced by
// ChunkReader and ChunkReaderStreaming.
type Chunk struct {
	// Hash is the hex-encoded SHA-256 digest of Data.
	Hash string
	// Data holds the chunk's bytes. It is a private copy, so it stays valid
	// after the chunker moves on to the next chunk.
	Data []byte
	// Offset is the position of the chunk's first byte in the input,
	// i.e. the summed length of all chunks that preceded it.
	Offset int64
	// Size is the length of Data.
	Size int64
}
|
|
|
|
// Chunker splits input into content-defined chunks using FastCDC.
// Build one with NewChunker, which derives the minimum and maximum
// sizes from the requested average.
type Chunker struct {
	// avgChunkSize is handed to FastCDC as Options.AverageSize.
	avgChunkSize int
	// minChunkSize is handed to FastCDC as Options.MinSize.
	minChunkSize int
	// maxChunkSize is handed to FastCDC as Options.MaxSize.
	maxChunkSize int
}
|
|
|
|
// NewChunker creates a new chunker with the specified average chunk size
|
|
func NewChunker(avgChunkSize int64) *Chunker {
|
|
// FastCDC recommends min = avg/4 and max = avg*4
|
|
return &Chunker{
|
|
avgChunkSize: int(avgChunkSize),
|
|
minChunkSize: int(avgChunkSize / 4),
|
|
maxChunkSize: int(avgChunkSize * 4),
|
|
}
|
|
}
|
|
|
|
// ChunkReader splits the reader into content-defined chunks
|
|
func (c *Chunker) ChunkReader(r io.Reader) ([]Chunk, error) {
|
|
opts := fastcdc.Options{
|
|
MinSize: c.minChunkSize,
|
|
AverageSize: c.avgChunkSize,
|
|
MaxSize: c.maxChunkSize,
|
|
}
|
|
|
|
chunker, err := fastcdc.NewChunker(r, opts)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("creating chunker: %w", err)
|
|
}
|
|
|
|
var chunks []Chunk
|
|
offset := int64(0)
|
|
|
|
for {
|
|
chunk, err := chunker.Next()
|
|
if err == io.EOF {
|
|
break
|
|
}
|
|
if err != nil {
|
|
return nil, fmt.Errorf("reading chunk: %w", err)
|
|
}
|
|
|
|
// Calculate hash
|
|
hash := sha256.Sum256(chunk.Data)
|
|
|
|
// Make a copy of the data since FastCDC reuses the buffer
|
|
chunkData := make([]byte, len(chunk.Data))
|
|
copy(chunkData, chunk.Data)
|
|
|
|
chunks = append(chunks, Chunk{
|
|
Hash: hex.EncodeToString(hash[:]),
|
|
Data: chunkData,
|
|
Offset: offset,
|
|
Size: int64(len(chunk.Data)),
|
|
})
|
|
|
|
offset += int64(len(chunk.Data))
|
|
}
|
|
|
|
return chunks, nil
|
|
}
|
|
|
|
// ChunkCallback is called for each chunk as it's processed
|
|
type ChunkCallback func(chunk Chunk) error
|
|
|
|
// ChunkReaderStreaming splits the reader into chunks and calls the callback for each
|
|
func (c *Chunker) ChunkReaderStreaming(r io.Reader, callback ChunkCallback) error {
|
|
opts := fastcdc.Options{
|
|
MinSize: c.minChunkSize,
|
|
AverageSize: c.avgChunkSize,
|
|
MaxSize: c.maxChunkSize,
|
|
}
|
|
|
|
chunker, err := fastcdc.NewChunker(r, opts)
|
|
if err != nil {
|
|
return fmt.Errorf("creating chunker: %w", err)
|
|
}
|
|
|
|
offset := int64(0)
|
|
|
|
for {
|
|
chunk, err := chunker.Next()
|
|
if err == io.EOF {
|
|
break
|
|
}
|
|
if err != nil {
|
|
return fmt.Errorf("reading chunk: %w", err)
|
|
}
|
|
|
|
// Calculate hash
|
|
hash := sha256.Sum256(chunk.Data)
|
|
|
|
// Make a copy of the data since FastCDC reuses the buffer
|
|
chunkData := make([]byte, len(chunk.Data))
|
|
copy(chunkData, chunk.Data)
|
|
|
|
if err := callback(Chunk{
|
|
Hash: hex.EncodeToString(hash[:]),
|
|
Data: chunkData,
|
|
Offset: offset,
|
|
Size: int64(len(chunk.Data)),
|
|
}); err != nil {
|
|
return fmt.Errorf("callback error: %w", err)
|
|
}
|
|
|
|
offset += int64(len(chunk.Data))
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// ChunkFile splits a file into content-defined chunks
|
|
func (c *Chunker) ChunkFile(path string) ([]Chunk, error) {
|
|
file, err := os.Open(path)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("opening file: %w", err)
|
|
}
|
|
defer func() {
|
|
if err := file.Close(); err != nil && err.Error() != "invalid argument" {
|
|
// Log error or handle as needed
|
|
_ = err
|
|
}
|
|
}()
|
|
|
|
return c.ChunkReader(file)
|
|
}
|