Refactor blob storage to use UUID primary keys and implement streaming chunking
- Changed blob table to use ID (UUID) as primary key instead of hash
- Blob records are now created at packing start, enabling immediate chunk associations
- Implemented streaming chunking to process large files without memory exhaustion
- Fixed blob manifest generation to include all referenced blobs
- Updated all foreign key references from blob_hash to blob_id
- Added progress reporting and improved error handling
- Enforced encryption requirement for all blob packing
- Updated tests to use test encryption keys
- Added Cyrillic transliteration to README
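For illustration only (not part of the commit), a minimal sketch of how the create-blob-first flow described above can combine with the streaming chunker added below. BlobStore, CreateBlob, and AssociateChunk are hypothetical stand-ins for the real storage layer, and the import paths are assumed; only Chunker, NewChunker, Chunk, and ChunkReaderStreaming come from the new package in this diff:

package pack

import (
    "io"

    "github.com/google/uuid"

    "example.com/project/internal/chunker" // import path is illustrative
)

// BlobStore is a hypothetical stand-in for the real storage layer; the
// committed code's actual API may differ.
type BlobStore interface {
    CreateBlob(id uuid.UUID) error
    AssociateChunk(blobID uuid.UUID, chunkHash string, offset, size int64) error
}

// packFile creates the blob row first (UUID primary key), then streams chunks
// into it, so each chunk can reference blob_id the moment it is produced.
func packFile(store BlobStore, f io.Reader) error {
    blobID := uuid.New() // blob record exists before any chunk is written
    if err := store.CreateBlob(blobID); err != nil {
        return err
    }
    c := chunker.NewChunker(256 * 1024)
    return c.ChunkReaderStreaming(f, func(ch chunker.Chunk) error {
        // Chunk rows point at blob_id rather than blob_hash.
        return store.AssociateChunk(blobID, ch.Hash, ch.Offset, ch.Size)
    })
}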
internal/chunker/chunker.go (new file, 146 lines)
@@ -0,0 +1,146 @@
package chunker

import (
    "crypto/sha256"
    "encoding/hex"
    "fmt"
    "io"
    "os"

    "github.com/jotfs/fastcdc-go"
)

// Chunk represents a single chunk of data
type Chunk struct {
    Hash   string // Content hash of the chunk
    Data   []byte // Chunk data
    Offset int64  // Offset in the original file
    Size   int64  // Size of the chunk
}

// Chunker provides content-defined chunking using FastCDC
type Chunker struct {
    avgChunkSize int
    minChunkSize int
    maxChunkSize int
}

// NewChunker creates a new chunker with the specified average chunk size
func NewChunker(avgChunkSize int64) *Chunker {
    // FastCDC recommends min = avg/4 and max = avg*4
    return &Chunker{
        avgChunkSize: int(avgChunkSize),
        minChunkSize: int(avgChunkSize / 4),
        maxChunkSize: int(avgChunkSize * 4),
    }
}

// ChunkReader splits the reader into content-defined chunks
func (c *Chunker) ChunkReader(r io.Reader) ([]Chunk, error) {
    opts := fastcdc.Options{
        MinSize:     c.minChunkSize,
        AverageSize: c.avgChunkSize,
        MaxSize:     c.maxChunkSize,
    }

    chunker, err := fastcdc.NewChunker(r, opts)
    if err != nil {
        return nil, fmt.Errorf("creating chunker: %w", err)
    }

    var chunks []Chunk
    offset := int64(0)

    for {
        chunk, err := chunker.Next()
        if err == io.EOF {
            break
        }
        if err != nil {
            return nil, fmt.Errorf("reading chunk: %w", err)
        }

        // Calculate hash
        hash := sha256.Sum256(chunk.Data)

        // Make a copy of the data since FastCDC reuses the buffer
        chunkData := make([]byte, len(chunk.Data))
        copy(chunkData, chunk.Data)

        chunks = append(chunks, Chunk{
            Hash:   hex.EncodeToString(hash[:]),
            Data:   chunkData,
            Offset: offset,
            Size:   int64(len(chunk.Data)),
        })

        offset += int64(len(chunk.Data))
    }

    return chunks, nil
}

// ChunkCallback is called for each chunk as it's processed
type ChunkCallback func(chunk Chunk) error

// ChunkReaderStreaming splits the reader into chunks and calls the callback for each
func (c *Chunker) ChunkReaderStreaming(r io.Reader, callback ChunkCallback) error {
    opts := fastcdc.Options{
        MinSize:     c.minChunkSize,
        AverageSize: c.avgChunkSize,
        MaxSize:     c.maxChunkSize,
    }

    chunker, err := fastcdc.NewChunker(r, opts)
    if err != nil {
        return fmt.Errorf("creating chunker: %w", err)
    }

    offset := int64(0)

    for {
        chunk, err := chunker.Next()
        if err == io.EOF {
            break
        }
        if err != nil {
            return fmt.Errorf("reading chunk: %w", err)
        }

        // Calculate hash
        hash := sha256.Sum256(chunk.Data)

        // Make a copy of the data since FastCDC reuses the buffer
        chunkData := make([]byte, len(chunk.Data))
        copy(chunkData, chunk.Data)

        if err := callback(Chunk{
            Hash:   hex.EncodeToString(hash[:]),
            Data:   chunkData,
            Offset: offset,
            Size:   int64(len(chunk.Data)),
        }); err != nil {
            return fmt.Errorf("callback error: %w", err)
        }

        offset += int64(len(chunk.Data))
    }

    return nil
}

// ChunkFile splits a file into content-defined chunks
func (c *Chunker) ChunkFile(path string) ([]Chunk, error) {
    file, err := os.Open(path)
    if err != nil {
        return nil, fmt.Errorf("opening file: %w", err)
    }
    // The file is only read, so a failed Close cannot lose data; ignore it
    // explicitly rather than string-matching the error message.
    defer func() { _ = file.Close() }()

    return c.ChunkReader(file)
}
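
For illustration (again, not part of the commit), the memory tradeoff between the two entry points: ChunkReader and ChunkFile accumulate every chunk in a slice, while ChunkReaderStreaming hands each chunk to the callback and retains nothing. A minimal caller that summarizes a large file without buffering it, assuming the package above at an illustrative import path, might look like:

package main

import (
    "fmt"
    "os"

    "example.com/project/internal/chunker" // import path is illustrative
)

// summarizeFile counts chunks and total bytes for a large file while holding
// at most one chunk in memory at a time.
func summarizeFile(path string) (int, int64, error) {
    f, err := os.Open(path)
    if err != nil {
        return 0, 0, err
    }
    defer f.Close()

    var count int
    var total int64
    c := chunker.NewChunker(256 * 1024) // 256 KiB average chunk size
    err = c.ChunkReaderStreaming(f, func(ch chunker.Chunk) error {
        count++
        total += ch.Size
        return nil // ch.Data could be encrypted and uploaded here, then dropped
    })
    return count, total, err
}

func main() {
    n, total, err := summarizeFile("big.bin")
    if err != nil {
        fmt.Println("error:", err)
        return
    }
    fmt.Printf("%d chunks, %d bytes\n", n, total)
}
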
internal/chunker/chunker_isolated_test.go (new file, 77 lines)
@@ -0,0 +1,77 @@
package chunker

import (
    "bytes"
    "testing"
)

func TestChunkerExpectedChunkCount(t *testing.T) {
    tests := []struct {
        name         string
        fileSize     int
        avgChunkSize int64
        minExpected  int
        maxExpected  int
    }{
        {
            name:         "1MB file with 64KB average",
            fileSize:     1024 * 1024,
            avgChunkSize: 64 * 1024,
            minExpected:  8,  // At least half the expected count
            maxExpected:  32, // At most double the expected count
        },
        {
            name:         "10MB file with 256KB average",
            fileSize:     10 * 1024 * 1024,
            avgChunkSize: 256 * 1024,
            minExpected:  10, // FastCDC may produce larger chunks
            maxExpected:  80,
        },
        {
            name:         "512KB file with 64KB average",
            fileSize:     512 * 1024,
            avgChunkSize: 64 * 1024,
            minExpected:  4, // ~8 expected
            maxExpected:  16,
        },
    }

    for _, tt := range tests {
        t.Run(tt.name, func(t *testing.T) {
            chunker := NewChunker(tt.avgChunkSize)

            // Create data with some variation to trigger chunk boundaries
            data := make([]byte, tt.fileSize)
            for i := 0; i < len(data); i++ {
                // Use a pattern that should create boundaries
                data[i] = byte((i * 17) ^ (i >> 5))
            }

            chunks, err := chunker.ChunkReader(bytes.NewReader(data))
            if err != nil {
                t.Fatalf("chunking failed: %v", err)
            }

            t.Logf("Created %d chunks for %d bytes with %d average chunk size",
                len(chunks), tt.fileSize, tt.avgChunkSize)

            if len(chunks) < tt.minExpected {
                t.Errorf("too few chunks: got %d, expected at least %d",
                    len(chunks), tt.minExpected)
            }
            if len(chunks) > tt.maxExpected {
                t.Errorf("too many chunks: got %d, expected at most %d",
                    len(chunks), tt.maxExpected)
            }

            // Verify chunks reconstruct to original
            var reconstructed []byte
            for _, chunk := range chunks {
                reconstructed = append(reconstructed, chunk.Data...)
            }
            if !bytes.Equal(data, reconstructed) {
                t.Error("reconstructed data doesn't match original")
            }
        })
    }
}

internal/chunker/chunker_test.go (new file, 128 lines)
@@ -0,0 +1,128 @@
package chunker

import (
    "bytes"
    "crypto/rand"
    "testing"
)

func TestChunker(t *testing.T) {
    t.Run("small file produces single chunk", func(t *testing.T) {
        chunker := NewChunker(1024 * 1024)         // 1MB average
        data := bytes.Repeat([]byte("hello"), 100) // 500 bytes

        chunks, err := chunker.ChunkReader(bytes.NewReader(data))
        if err != nil {
            t.Fatalf("chunking failed: %v", err)
        }

        if len(chunks) != 1 {
            t.Errorf("expected 1 chunk, got %d", len(chunks))
        }

        if chunks[0].Size != int64(len(data)) {
            t.Errorf("expected chunk size %d, got %d", len(data), chunks[0].Size)
        }
    })

    t.Run("large file produces multiple chunks", func(t *testing.T) {
        chunker := NewChunker(256 * 1024) // 256KB average chunk size

        // Generate 2MB of random data
        data := make([]byte, 2*1024*1024)
        if _, err := rand.Read(data); err != nil {
            t.Fatalf("failed to generate random data: %v", err)
        }

        chunks, err := chunker.ChunkReader(bytes.NewReader(data))
        if err != nil {
            t.Fatalf("chunking failed: %v", err)
        }

        // Should produce multiple chunks - with FastCDC we expect around 8 chunks for 2MB with 256KB average
        if len(chunks) < 4 || len(chunks) > 16 {
            t.Errorf("expected 4-16 chunks, got %d", len(chunks))
        }

        // Verify chunks reconstruct original data
        var reconstructed []byte
        for _, chunk := range chunks {
            reconstructed = append(reconstructed, chunk.Data...)
        }

        if !bytes.Equal(data, reconstructed) {
            t.Error("reconstructed data doesn't match original")
        }

        // Verify offsets
        var expectedOffset int64
        for i, chunk := range chunks {
            if chunk.Offset != expectedOffset {
                t.Errorf("chunk %d: expected offset %d, got %d", i, expectedOffset, chunk.Offset)
            }
            expectedOffset += chunk.Size
        }
    })

    t.Run("deterministic chunking", func(t *testing.T) {
        chunker1 := NewChunker(256 * 1024)
        chunker2 := NewChunker(256 * 1024)

        // Use deterministic data
        data := bytes.Repeat([]byte("abcdefghijklmnopqrstuvwxyz"), 20000) // ~520KB

        chunks1, err := chunker1.ChunkReader(bytes.NewReader(data))
        if err != nil {
            t.Fatalf("chunking failed: %v", err)
        }

        chunks2, err := chunker2.ChunkReader(bytes.NewReader(data))
        if err != nil {
            t.Fatalf("chunking failed: %v", err)
        }

        // Should produce same chunks
        if len(chunks1) != len(chunks2) {
            t.Fatalf("different number of chunks: %d vs %d", len(chunks1), len(chunks2))
        }

        for i := range chunks1 {
            if chunks1[i].Hash != chunks2[i].Hash {
                t.Errorf("chunk %d: different hashes", i)
            }
            if chunks1[i].Size != chunks2[i].Size {
                t.Errorf("chunk %d: different sizes", i)
            }
        }
    })
}

func TestChunkBoundaries(t *testing.T) {
    chunker := NewChunker(256 * 1024) // 256KB average

    // FastCDC uses avg/4 for min and avg*4 for max
    avgSize := int64(256 * 1024)
    minSize := avgSize / 4
    maxSize := avgSize * 4

    // Test that minimum chunk size is respected
    data := make([]byte, minSize+1024)
    if _, err := rand.Read(data); err != nil {
        t.Fatalf("failed to generate random data: %v", err)
    }

    chunks, err := chunker.ChunkReader(bytes.NewReader(data))
    if err != nil {
        t.Fatalf("chunking failed: %v", err)
    }

    for i, chunk := range chunks {
        // Last chunk can be smaller than minimum
        if i < len(chunks)-1 && chunk.Size < minSize {
            t.Errorf("chunk %d size %d is below minimum %d", i, chunk.Size, minSize)
        }
        if chunk.Size > maxSize {
            t.Errorf("chunk %d size %d exceeds maximum %d", i, chunk.Size, maxSize)
        }
    }
}