Refactor blob storage to use UUID primary keys and implement streaming chunking
- Changed blob table to use ID (UUID) as primary key instead of hash
- Blob records are now created at packing start, enabling immediate chunk associations
- Implemented streaming chunking to process large files without memory exhaustion (sketched below)
- Fixed blob manifest generation to include all referenced blobs
- Updated all foreign key references from blob_hash to blob_id
- Added progress reporting and improved error handling
- Enforced encryption requirement for all blob packing
- Updated tests to use test encryption keys
- Added Cyrillic transliteration to README
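The streaming chunking change is exercised by the new test file below through NewChunker and ChunkReader. As a rough illustration of the idea, here is a minimal sketch of a streaming content-defined chunker with the same ChunkReader shape; the boundary rule (a toy running hash against a power-of-two mask) and the 4x-average hard ceiling are illustrative stand-ins, not the repository's actual FastCDC implementation:

    // Minimal sketch of a streaming content-defined chunker. NewChunker,
    // ChunkReader, and Chunk.Data mirror the interface used by the test
    // below; the boundary rule is a toy running hash, not real FastCDC.
    package main

    import (
    	"bufio"
    	"bytes"
    	"fmt"
    	"io"
    )

    type Chunk struct {
    	Data []byte
    }

    type Chunker struct {
    	avgSize int64 // target average chunk size; assumed a power of two here
    }

    func NewChunker(avgSize int64) *Chunker { return &Chunker{avgSize: avgSize} }

    // ChunkReader consumes r through a small buffered reader, so memory
    // use is bounded by the current chunk rather than the whole input.
    func (c *Chunker) ChunkReader(r io.Reader) ([]Chunk, error) {
    	br := bufio.NewReader(r)
    	mask := uint32(c.avgSize - 1) // boundary fires ~once per avgSize bytes
    	var (
    		chunks []Chunk
    		cur    bytes.Buffer
    		hash   uint32
    	)
    	for {
    		b, err := br.ReadByte()
    		if err == io.EOF {
    			break
    		}
    		if err != nil {
    			return nil, err
    		}
    		cur.WriteByte(b)
    		hash = hash*31 + uint32(b)
    		// Cut when the hash hits the mask, or at a 4x-average hard
    		// ceiling so no chunk can grow without bound.
    		if hash&mask == 0 || int64(cur.Len()) >= 4*c.avgSize {
    			chunks = append(chunks, Chunk{Data: append([]byte(nil), cur.Bytes()...)})
    			cur.Reset()
    			hash = 0
    		}
    	}
    	if cur.Len() > 0 {
    		chunks = append(chunks, Chunk{Data: append([]byte(nil), cur.Bytes()...)})
    	}
    	return chunks, nil
    }

    func main() {
    	// Same data pattern as the test: varied bytes that trigger boundaries.
    	data := make([]byte, 1024*1024)
    	for i := range data {
    		data[i] = byte((i * 17) ^ (i >> 5))
    	}
    	chunks, err := NewChunker(64 * 1024).ChunkReader(bytes.NewReader(data))
    	if err != nil {
    		panic(err)
    	}
    	fmt.Printf("%d bytes -> %d chunks\n", len(data), len(chunks))
    }

A production packer would typically hand each chunk to a callback or channel as it is cut rather than accumulating a slice, so memory stays bounded by the largest chunk; the slice return here simply mirrors the interface the test relies on.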
internal/chunker/chunker_isolated_test.go (new file, 77 lines)
@@ -0,0 +1,77 @@
package chunker

import (
	"bytes"
	"testing"
)

func TestChunkerExpectedChunkCount(t *testing.T) {
	tests := []struct {
		name         string
		fileSize     int
		avgChunkSize int64
		minExpected  int
		maxExpected  int
	}{
		{
			name:         "1MB file with 64KB average",
			fileSize:     1024 * 1024,
			avgChunkSize: 64 * 1024,
			minExpected:  8,  // At least half the expected count
			maxExpected:  32, // At most double the expected count
		},
		{
			name:         "10MB file with 256KB average",
			fileSize:     10 * 1024 * 1024,
			avgChunkSize: 256 * 1024,
			minExpected:  10, // FastCDC may produce larger chunks
			maxExpected:  80,
		},
		{
			name:         "512KB file with 64KB average",
			fileSize:     512 * 1024,
			avgChunkSize: 64 * 1024,
			minExpected:  4, // ~8 expected
			maxExpected:  16,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			chunker := NewChunker(tt.avgChunkSize)

			// Create data with some variation to trigger chunk boundaries
			data := make([]byte, tt.fileSize)
			for i := 0; i < len(data); i++ {
				// Use a pattern that should create boundaries
				data[i] = byte((i * 17) ^ (i >> 5))
			}

			chunks, err := chunker.ChunkReader(bytes.NewReader(data))
			if err != nil {
				t.Fatalf("chunking failed: %v", err)
			}

			t.Logf("Created %d chunks for %d bytes with %d average chunk size",
				len(chunks), tt.fileSize, tt.avgChunkSize)

			if len(chunks) < tt.minExpected {
				t.Errorf("too few chunks: got %d, expected at least %d",
					len(chunks), tt.minExpected)
			}
			if len(chunks) > tt.maxExpected {
				t.Errorf("too many chunks: got %d, expected at most %d",
					len(chunks), tt.maxExpected)
			}

			// Verify chunks reconstruct to original
			var reconstructed []byte
			for _, chunk := range chunks {
				reconstructed = append(reconstructed, chunk.Data...)
			}
			if !bytes.Equal(data, reconstructed) {
				t.Error("reconstructed data doesn't match original")
			}
		})
	}
}
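To run just this test with verbose output, the standard Go tooling invocation (assuming the module layout implied by the file path) is:

    go test ./internal/chunker -run TestChunkerExpectedChunkCount -v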