Refactor blob storage to use UUID primary keys and implement streaming chunking
- Changed blob table to use ID (UUID) as primary key instead of hash
- Blob records are now created at packing start, enabling immediate chunk associations
- Implemented streaming chunking to process large files without memory exhaustion
- Fixed blob manifest generation to include all referenced blobs
- Updated all foreign key references from blob_hash to blob_id
- Added progress reporting and improved error handling
- Enforced encryption requirement for all blob packing
- Updated tests to use test encryption keys
- Added Cyrillic transliteration to README
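For illustration only (not part of the commit), a minimal sketch of how the create-blob-first flow described above can combine with the streaming chunker added below. BlobStore, CreateBlob, and AssociateChunk are hypothetical stand-ins for the real storage layer, and the import paths are assumed; only Chunker, NewChunker, Chunk, and ChunkReaderStreaming come from the new package in this diff:

package pack

import (
    "io"

    "github.com/google/uuid"

    "example.com/project/internal/chunker" // import path is illustrative
)

// BlobStore is a hypothetical stand-in for the real storage layer; the
// committed code's actual API may differ.
type BlobStore interface {
    CreateBlob(id uuid.UUID) error
    AssociateChunk(blobID uuid.UUID, chunkHash string, offset, size int64) error
}

// packFile creates the blob row first (UUID primary key), then streams chunks
// into it, so each chunk can reference blob_id the moment it is produced.
func packFile(store BlobStore, f io.Reader) error {
    blobID := uuid.New() // blob record exists before any chunk is written
    if err := store.CreateBlob(blobID); err != nil {
        return err
    }
    c := chunker.NewChunker(256 * 1024)
    return c.ChunkReaderStreaming(f, func(ch chunker.Chunk) error {
        // Chunk rows point at blob_id rather than blob_hash.
        return store.AssociateChunk(blobID, ch.Hash, ch.Offset, ch.Size)
    })
}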
internal/chunker/chunker.go (new file, 146 lines)
@@ -0,0 +1,146 @@
package chunker

import (
    "crypto/sha256"
    "encoding/hex"
    "fmt"
    "io"
    "os"

    "github.com/jotfs/fastcdc-go"
)

// Chunk represents a single chunk of data
type Chunk struct {
    Hash   string // Content hash of the chunk
    Data   []byte // Chunk data
    Offset int64  // Offset in the original file
    Size   int64  // Size of the chunk
}

// Chunker provides content-defined chunking using FastCDC
type Chunker struct {
    avgChunkSize int
    minChunkSize int
    maxChunkSize int
}

// NewChunker creates a new chunker with the specified average chunk size
func NewChunker(avgChunkSize int64) *Chunker {
    // FastCDC recommends min = avg/4 and max = avg*4
    return &Chunker{
        avgChunkSize: int(avgChunkSize),
        minChunkSize: int(avgChunkSize / 4),
        maxChunkSize: int(avgChunkSize * 4),
    }
}

// ChunkReader splits the reader into content-defined chunks
func (c *Chunker) ChunkReader(r io.Reader) ([]Chunk, error) {
    opts := fastcdc.Options{
        MinSize:     c.minChunkSize,
        AverageSize: c.avgChunkSize,
        MaxSize:     c.maxChunkSize,
    }

    chunker, err := fastcdc.NewChunker(r, opts)
    if err != nil {
        return nil, fmt.Errorf("creating chunker: %w", err)
    }

    var chunks []Chunk
    offset := int64(0)

    for {
        chunk, err := chunker.Next()
        if err == io.EOF {
            break
        }
        if err != nil {
            return nil, fmt.Errorf("reading chunk: %w", err)
        }

        // Calculate hash
        hash := sha256.Sum256(chunk.Data)

        // Make a copy of the data since FastCDC reuses the buffer
        chunkData := make([]byte, len(chunk.Data))
        copy(chunkData, chunk.Data)

        chunks = append(chunks, Chunk{
            Hash:   hex.EncodeToString(hash[:]),
            Data:   chunkData,
            Offset: offset,
            Size:   int64(len(chunk.Data)),
        })

        offset += int64(len(chunk.Data))
    }

    return chunks, nil
}

// ChunkCallback is called for each chunk as it's processed
type ChunkCallback func(chunk Chunk) error

// ChunkReaderStreaming splits the reader into chunks and calls the callback for each
func (c *Chunker) ChunkReaderStreaming(r io.Reader, callback ChunkCallback) error {
    opts := fastcdc.Options{
        MinSize:     c.minChunkSize,
        AverageSize: c.avgChunkSize,
        MaxSize:     c.maxChunkSize,
    }

    chunker, err := fastcdc.NewChunker(r, opts)
    if err != nil {
        return fmt.Errorf("creating chunker: %w", err)
    }

    offset := int64(0)

    for {
        chunk, err := chunker.Next()
        if err == io.EOF {
            break
        }
        if err != nil {
            return fmt.Errorf("reading chunk: %w", err)
        }

        // Calculate hash
        hash := sha256.Sum256(chunk.Data)

        // Make a copy of the data since FastCDC reuses the buffer
        chunkData := make([]byte, len(chunk.Data))
        copy(chunkData, chunk.Data)

        if err := callback(Chunk{
            Hash:   hex.EncodeToString(hash[:]),
            Data:   chunkData,
            Offset: offset,
            Size:   int64(len(chunk.Data)),
        }); err != nil {
            return fmt.Errorf("callback error: %w", err)
        }

        offset += int64(len(chunk.Data))
    }

    return nil
}

// ChunkFile splits a file into content-defined chunks
func (c *Chunker) ChunkFile(path string) ([]Chunk, error) {
    file, err := os.Open(path)
    if err != nil {
        return nil, fmt.Errorf("opening file: %w", err)
    }
    // The file is only read, so a failed Close cannot lose data; ignore it
    // explicitly rather than string-matching the error message.
    defer func() { _ = file.Close() }()

    return c.ChunkReader(file)
}
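
For illustration (again, not part of the commit), the memory tradeoff between the two entry points: ChunkReader and ChunkFile accumulate every chunk in a slice, while ChunkReaderStreaming hands each chunk to the callback and retains nothing. A minimal caller that summarizes a large file without buffering it, assuming the package above at an illustrative import path, might look like:

package main

import (
    "fmt"
    "os"

    "example.com/project/internal/chunker" // import path is illustrative
)

// summarizeFile counts chunks and total bytes for a large file while holding
// at most one chunk in memory at a time.
func summarizeFile(path string) (int, int64, error) {
    f, err := os.Open(path)
    if err != nil {
        return 0, 0, err
    }
    defer f.Close()

    var count int
    var total int64
    c := chunker.NewChunker(256 * 1024) // 256 KiB average chunk size
    err = c.ChunkReaderStreaming(f, func(ch chunker.Chunk) error {
        count++
        total += ch.Size
        return nil // ch.Data could be encrypted and uploaded here, then dropped
    })
    return count, total, err
}

func main() {
    n, total, err := summarizeFile("big.bin")
    if err != nil {
        fmt.Println("error:", err)
        return
    }
    fmt.Printf("%d chunks, %d bytes\n", n, total)
}
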
internal/chunker/chunker_isolated_test.go (new file, 77 lines)
@@ -0,0 +1,77 @@
package chunker

import (
    "bytes"
    "testing"
)

func TestChunkerExpectedChunkCount(t *testing.T) {
    tests := []struct {
        name         string
        fileSize     int
        avgChunkSize int64
        minExpected  int
        maxExpected  int
    }{
        {
            name:         "1MB file with 64KB average",
            fileSize:     1024 * 1024,
            avgChunkSize: 64 * 1024,
            minExpected:  8,  // At least half the expected count
            maxExpected:  32, // At most double the expected count
        },
        {
            name:         "10MB file with 256KB average",
            fileSize:     10 * 1024 * 1024,
            avgChunkSize: 256 * 1024,
            minExpected:  10, // FastCDC may produce larger chunks
            maxExpected:  80,
        },
        {
            name:         "512KB file with 64KB average",
            fileSize:     512 * 1024,
            avgChunkSize: 64 * 1024,
            minExpected:  4, // ~8 expected
            maxExpected:  16,
        },
    }

    for _, tt := range tests {
        t.Run(tt.name, func(t *testing.T) {
            chunker := NewChunker(tt.avgChunkSize)

            // Create data with some variation to trigger chunk boundaries
            data := make([]byte, tt.fileSize)
            for i := 0; i < len(data); i++ {
                // Use a pattern that should create boundaries
                data[i] = byte((i * 17) ^ (i >> 5))
            }

            chunks, err := chunker.ChunkReader(bytes.NewReader(data))
            if err != nil {
                t.Fatalf("chunking failed: %v", err)
            }

            t.Logf("Created %d chunks for %d bytes with %d average chunk size",
                len(chunks), tt.fileSize, tt.avgChunkSize)

            if len(chunks) < tt.minExpected {
                t.Errorf("too few chunks: got %d, expected at least %d",
                    len(chunks), tt.minExpected)
            }
            if len(chunks) > tt.maxExpected {
                t.Errorf("too many chunks: got %d, expected at most %d",
                    len(chunks), tt.maxExpected)
            }

            // Verify chunks reconstruct to original
            var reconstructed []byte
            for _, chunk := range chunks {
                reconstructed = append(reconstructed, chunk.Data...)
            }
            if !bytes.Equal(data, reconstructed) {
                t.Error("reconstructed data doesn't match original")
            }
        })
    }
}

internal/chunker/chunker_test.go (new file, 128 lines)
@@ -0,0 +1,128 @@
package chunker

import (
    "bytes"
    "crypto/rand"
    "testing"
)

func TestChunker(t *testing.T) {
    t.Run("small file produces single chunk", func(t *testing.T) {
        chunker := NewChunker(1024 * 1024)         // 1MB average
        data := bytes.Repeat([]byte("hello"), 100) // 500 bytes

        chunks, err := chunker.ChunkReader(bytes.NewReader(data))
        if err != nil {
            t.Fatalf("chunking failed: %v", err)
        }

        if len(chunks) != 1 {
            t.Errorf("expected 1 chunk, got %d", len(chunks))
        }

        if chunks[0].Size != int64(len(data)) {
            t.Errorf("expected chunk size %d, got %d", len(data), chunks[0].Size)
        }
    })

    t.Run("large file produces multiple chunks", func(t *testing.T) {
        chunker := NewChunker(256 * 1024) // 256KB average chunk size

        // Generate 2MB of random data
        data := make([]byte, 2*1024*1024)
        if _, err := rand.Read(data); err != nil {
            t.Fatalf("failed to generate random data: %v", err)
        }

        chunks, err := chunker.ChunkReader(bytes.NewReader(data))
        if err != nil {
            t.Fatalf("chunking failed: %v", err)
        }

        // Should produce multiple chunks - with FastCDC we expect around 8 chunks for 2MB with 256KB average
        if len(chunks) < 4 || len(chunks) > 16 {
            t.Errorf("expected 4-16 chunks, got %d", len(chunks))
        }

        // Verify chunks reconstruct original data
        var reconstructed []byte
        for _, chunk := range chunks {
            reconstructed = append(reconstructed, chunk.Data...)
        }

        if !bytes.Equal(data, reconstructed) {
            t.Error("reconstructed data doesn't match original")
        }

        // Verify offsets
        var expectedOffset int64
        for i, chunk := range chunks {
            if chunk.Offset != expectedOffset {
                t.Errorf("chunk %d: expected offset %d, got %d", i, expectedOffset, chunk.Offset)
            }
            expectedOffset += chunk.Size
        }
    })

    t.Run("deterministic chunking", func(t *testing.T) {
        chunker1 := NewChunker(256 * 1024)
        chunker2 := NewChunker(256 * 1024)

        // Use deterministic data
        data := bytes.Repeat([]byte("abcdefghijklmnopqrstuvwxyz"), 20000) // ~520KB

        chunks1, err := chunker1.ChunkReader(bytes.NewReader(data))
        if err != nil {
            t.Fatalf("chunking failed: %v", err)
        }

        chunks2, err := chunker2.ChunkReader(bytes.NewReader(data))
        if err != nil {
            t.Fatalf("chunking failed: %v", err)
        }

        // Should produce same chunks
        if len(chunks1) != len(chunks2) {
            t.Fatalf("different number of chunks: %d vs %d", len(chunks1), len(chunks2))
        }

        for i := range chunks1 {
            if chunks1[i].Hash != chunks2[i].Hash {
                t.Errorf("chunk %d: different hashes", i)
            }
            if chunks1[i].Size != chunks2[i].Size {
                t.Errorf("chunk %d: different sizes", i)
            }
        }
    })
}

func TestChunkBoundaries(t *testing.T) {
    chunker := NewChunker(256 * 1024) // 256KB average

    // FastCDC uses avg/4 for min and avg*4 for max
    avgSize := int64(256 * 1024)
    minSize := avgSize / 4
    maxSize := avgSize * 4

    // Test that minimum chunk size is respected
    data := make([]byte, minSize+1024)
    if _, err := rand.Read(data); err != nil {
        t.Fatalf("failed to generate random data: %v", err)
    }

    chunks, err := chunker.ChunkReader(bytes.NewReader(data))
    if err != nil {
        t.Fatalf("chunking failed: %v", err)
    }

    for i, chunk := range chunks {
        // Last chunk can be smaller than minimum
        if i < len(chunks)-1 && chunk.Size < minSize {
            t.Errorf("chunk %d size %d is below minimum %d", i, chunk.Size, minSize)
        }
        if chunk.Size > maxSize {
            t.Errorf("chunk %d size %d exceeds maximum %d", i, chunk.Size, maxSize)
        }
    }
}