Fix foreign key constraints and improve snapshot tracking

- Add unified compression/encryption package in internal/blobgen
- Update DATAMODEL.md to reflect current schema implementation
- Refactor snapshot cleanup into well-named methods for clarity
- Add snapshot_id to uploads table to track new blobs per snapshot
- Fix blob count reporting for incremental backups
- Add DeleteOrphaned method to BlobChunkRepository
- Fix cleanup order to respect foreign key constraints
- Update tests to reflect schema changes
This commit is contained in:
2025-07-26 02:22:25 +02:00
parent 78af626759
commit d3afa65420
28 changed files with 994 additions and 534 deletions

View File

@@ -121,3 +121,32 @@ func (r *BlobChunkRepository) GetByChunkHashTx(ctx context.Context, tx *sql.Tx,
LogSQL("GetByChunkHashTx", "Found blob", chunkHash, "blob", bc.BlobID)
return &bc, nil
}
// DeleteOrphaned deletes blob_chunks entries where either the blob or chunk no longer exists.
//
// The cleanup is performed as a single DELETE statement so it is atomic:
// the original two-statement version could be interrupted between the two
// deletes and silently leave one class of orphans behind. A single statement
// also means one round trip to the database instead of two.
func (r *BlobChunkRepository) DeleteOrphaned(ctx context.Context) error {
// Remove any mapping row whose referenced blob OR referenced chunk is gone.
query := `
DELETE FROM blob_chunks
WHERE NOT EXISTS (
SELECT 1 FROM blobs
WHERE blobs.id = blob_chunks.blob_id
)
OR NOT EXISTS (
SELECT 1 FROM chunks
WHERE chunks.chunk_hash = blob_chunks.chunk_hash
)
`
if _, err := r.db.ExecWithLog(ctx, query); err != nil {
return fmt.Errorf("deleting orphaned blob_chunks: %w", err)
}
return nil
}

View File

@@ -30,7 +30,6 @@ func TestBlobChunkRepository(t *testing.T) {
for _, chunkHash := range chunks {
chunk := &Chunk{
ChunkHash: chunkHash,
SHA256: chunkHash + "-sha",
Size: 1024,
}
err = repos.Chunks.Create(ctx, nil, chunk)
@@ -159,7 +158,6 @@ func TestBlobChunkRepositoryMultipleBlobs(t *testing.T) {
for _, chunkHash := range chunkHashes {
chunk := &Chunk{
ChunkHash: chunkHash,
SHA256: chunkHash + "-sha",
Size: 1024,
}
err = repos.Chunks.Create(ctx, nil, chunk)

View File

@@ -43,7 +43,6 @@ func TestCascadeDeleteDebug(t *testing.T) {
for i := 0; i < 3; i++ {
chunk := &Chunk{
ChunkHash: fmt.Sprintf("cascade-chunk-%d", i),
SHA256: fmt.Sprintf("cascade-sha-%d", i),
Size: 1024,
}
err = repos.Chunks.Create(ctx, nil, chunk)

View File

@@ -13,6 +13,7 @@ func TestChunkFileRepository(t *testing.T) {
ctx := context.Background()
repo := NewChunkFileRepository(db)
fileRepo := NewFileRepository(db)
chunksRepo := NewChunkRepository(db)
// Create test files first
testTime := time.Now().Truncate(time.Second)
@@ -46,6 +47,16 @@ func TestChunkFileRepository(t *testing.T) {
t.Fatalf("failed to create file2: %v", err)
}
// Create chunk first
chunk := &Chunk{
ChunkHash: "chunk1",
Size: 1024,
}
err = chunksRepo.Create(ctx, nil, chunk)
if err != nil {
t.Fatalf("failed to create chunk: %v", err)
}
// Test Create
cf1 := &ChunkFile{
ChunkHash: "chunk1",
@@ -121,6 +132,7 @@ func TestChunkFileRepositoryComplexDeduplication(t *testing.T) {
ctx := context.Background()
repo := NewChunkFileRepository(db)
fileRepo := NewFileRepository(db)
chunksRepo := NewChunkRepository(db)
// Create test files
testTime := time.Now().Truncate(time.Second)
@@ -138,6 +150,19 @@ func TestChunkFileRepositoryComplexDeduplication(t *testing.T) {
t.Fatalf("failed to create file3: %v", err)
}
// Create chunks first
chunks := []string{"chunk1", "chunk2", "chunk3", "chunk4"}
for _, chunkHash := range chunks {
chunk := &Chunk{
ChunkHash: chunkHash,
Size: 1024,
}
err := chunksRepo.Create(ctx, nil, chunk)
if err != nil {
t.Fatalf("failed to create chunk %s: %v", chunkHash, err)
}
}
// Simulate a scenario where multiple files share chunks
// File1: chunk1, chunk2, chunk3
// File2: chunk2, chunk3, chunk4
@@ -183,11 +208,11 @@ func TestChunkFileRepositoryComplexDeduplication(t *testing.T) {
}
// Test file2 chunks
chunks, err := repo.GetByFileID(ctx, file2.ID)
file2Chunks, err := repo.GetByFileID(ctx, file2.ID)
if err != nil {
t.Fatalf("failed to get chunks for file2: %v", err)
}
if len(chunks) != 3 {
t.Errorf("expected 3 chunks for file2, got %d", len(chunks))
if len(file2Chunks) != 3 {
t.Errorf("expected 3 chunks for file2, got %d", len(file2Chunks))
}
}

View File

@@ -18,16 +18,16 @@ func NewChunkRepository(db *DB) *ChunkRepository {
func (r *ChunkRepository) Create(ctx context.Context, tx *sql.Tx, chunk *Chunk) error {
query := `
INSERT INTO chunks (chunk_hash, sha256, size)
VALUES (?, ?, ?)
INSERT INTO chunks (chunk_hash, size)
VALUES (?, ?)
ON CONFLICT(chunk_hash) DO NOTHING
`
var err error
if tx != nil {
_, err = tx.ExecContext(ctx, query, chunk.ChunkHash, chunk.SHA256, chunk.Size)
_, err = tx.ExecContext(ctx, query, chunk.ChunkHash, chunk.Size)
} else {
_, err = r.db.ExecWithLog(ctx, query, chunk.ChunkHash, chunk.SHA256, chunk.Size)
_, err = r.db.ExecWithLog(ctx, query, chunk.ChunkHash, chunk.Size)
}
if err != nil {
@@ -39,7 +39,7 @@ func (r *ChunkRepository) Create(ctx context.Context, tx *sql.Tx, chunk *Chunk)
func (r *ChunkRepository) GetByHash(ctx context.Context, hash string) (*Chunk, error) {
query := `
SELECT chunk_hash, sha256, size
SELECT chunk_hash, size
FROM chunks
WHERE chunk_hash = ?
`
@@ -48,7 +48,6 @@ func (r *ChunkRepository) GetByHash(ctx context.Context, hash string) (*Chunk, e
err := r.db.conn.QueryRowContext(ctx, query, hash).Scan(
&chunk.ChunkHash,
&chunk.SHA256,
&chunk.Size,
)
@@ -68,7 +67,7 @@ func (r *ChunkRepository) GetByHashes(ctx context.Context, hashes []string) ([]*
}
query := `
SELECT chunk_hash, sha256, size
SELECT chunk_hash, size
FROM chunks
WHERE chunk_hash IN (`
@@ -94,7 +93,6 @@ func (r *ChunkRepository) GetByHashes(ctx context.Context, hashes []string) ([]*
err := rows.Scan(
&chunk.ChunkHash,
&chunk.SHA256,
&chunk.Size,
)
if err != nil {
@@ -109,7 +107,7 @@ func (r *ChunkRepository) GetByHashes(ctx context.Context, hashes []string) ([]*
func (r *ChunkRepository) ListUnpacked(ctx context.Context, limit int) ([]*Chunk, error) {
query := `
SELECT c.chunk_hash, c.sha256, c.size
SELECT c.chunk_hash, c.size
FROM chunks c
LEFT JOIN blob_chunks bc ON c.chunk_hash = bc.chunk_hash
WHERE bc.chunk_hash IS NULL
@@ -129,7 +127,6 @@ func (r *ChunkRepository) ListUnpacked(ctx context.Context, limit int) ([]*Chunk
err := rows.Scan(
&chunk.ChunkHash,
&chunk.SHA256,
&chunk.Size,
)
if err != nil {

View File

@@ -7,7 +7,7 @@ import (
func (r *ChunkRepository) List(ctx context.Context) ([]*Chunk, error) {
query := `
SELECT chunk_hash, sha256, size
SELECT chunk_hash, size
FROM chunks
ORDER BY chunk_hash
`
@@ -24,7 +24,6 @@ func (r *ChunkRepository) List(ctx context.Context) ([]*Chunk, error) {
err := rows.Scan(
&chunk.ChunkHash,
&chunk.SHA256,
&chunk.Size,
)
if err != nil {

View File

@@ -15,7 +15,6 @@ func TestChunkRepository(t *testing.T) {
// Test Create
chunk := &Chunk{
ChunkHash: "chunkhash123",
SHA256: "sha256hash123",
Size: 4096,
}
@@ -35,9 +34,6 @@ func TestChunkRepository(t *testing.T) {
if retrieved.ChunkHash != chunk.ChunkHash {
t.Errorf("chunk hash mismatch: got %s, want %s", retrieved.ChunkHash, chunk.ChunkHash)
}
if retrieved.SHA256 != chunk.SHA256 {
t.Errorf("sha256 mismatch: got %s, want %s", retrieved.SHA256, chunk.SHA256)
}
if retrieved.Size != chunk.Size {
t.Errorf("size mismatch: got %d, want %d", retrieved.Size, chunk.Size)
}
@@ -51,7 +47,6 @@ func TestChunkRepository(t *testing.T) {
// Test GetByHashes
chunk2 := &Chunk{
ChunkHash: "chunkhash456",
SHA256: "sha256hash456",
Size: 8192,
}
err = repo.Create(ctx, nil, chunk2)

View File

@@ -75,8 +75,8 @@ func TestDatabaseConcurrentAccess(t *testing.T) {
for i := 0; i < 10; i++ {
go func(i int) {
_, err := db.ExecWithLog(ctx, "INSERT INTO chunks (chunk_hash, sha256, size) VALUES (?, ?, ?)",
fmt.Sprintf("hash%d", i), fmt.Sprintf("sha%d", i), i*1024)
_, err := db.ExecWithLog(ctx, "INSERT INTO chunks (chunk_hash, size) VALUES (?, ?)",
fmt.Sprintf("hash%d", i), i*1024)
results <- result{index: i, err: err}
}(i)
}

View File

@@ -32,6 +32,20 @@ func TestFileChunkRepository(t *testing.T) {
t.Fatalf("failed to create file: %v", err)
}
// Create chunks first
chunks := []string{"chunk1", "chunk2", "chunk3"}
chunkRepo := NewChunkRepository(db)
for _, chunkHash := range chunks {
chunk := &Chunk{
ChunkHash: chunkHash,
Size: 1024,
}
err = chunkRepo.Create(ctx, nil, chunk)
if err != nil {
t.Fatalf("failed to create chunk %s: %v", chunkHash, err)
}
}
// Test Create
fc1 := &FileChunk{
FileID: file.ID,
@@ -66,16 +80,16 @@ func TestFileChunkRepository(t *testing.T) {
}
// Test GetByFile
chunks, err := repo.GetByFile(ctx, "/test/file.txt")
fileChunks, err := repo.GetByFile(ctx, "/test/file.txt")
if err != nil {
t.Fatalf("failed to get file chunks: %v", err)
}
if len(chunks) != 3 {
t.Errorf("expected 3 chunks, got %d", len(chunks))
if len(fileChunks) != 3 {
t.Errorf("expected 3 chunks, got %d", len(fileChunks))
}
// Verify order
for i, chunk := range chunks {
for i, chunk := range fileChunks {
if chunk.Idx != i {
t.Errorf("wrong chunk order: expected idx %d, got %d", i, chunk.Idx)
}
@@ -93,12 +107,12 @@ func TestFileChunkRepository(t *testing.T) {
t.Fatalf("failed to delete file chunks: %v", err)
}
chunks, err = repo.GetByFileID(ctx, file.ID)
fileChunks, err = repo.GetByFileID(ctx, file.ID)
if err != nil {
t.Fatalf("failed to get deleted file chunks: %v", err)
}
if len(chunks) != 0 {
t.Errorf("expected 0 chunks after delete, got %d", len(chunks))
if len(fileChunks) != 0 {
t.Errorf("expected 0 chunks after delete, got %d", len(fileChunks))
}
}
@@ -133,6 +147,22 @@ func TestFileChunkRepositoryMultipleFiles(t *testing.T) {
files[i] = file
}
// Create all chunks first
chunkRepo := NewChunkRepository(db)
for i := range files {
for j := 0; j < 2; j++ {
chunkHash := fmt.Sprintf("file%d_chunk%d", i, j)
chunk := &Chunk{
ChunkHash: chunkHash,
Size: 1024,
}
err := chunkRepo.Create(ctx, nil, chunk)
if err != nil {
t.Fatalf("failed to create chunk %s: %v", chunkHash, err)
}
}
}
// Create chunks for multiple files
for i, file := range files {
for j := 0; j < 2; j++ {

View File

@@ -28,7 +28,6 @@ func (r *FileRepository) Create(ctx context.Context, tx *sql.Tx, file *File) err
INSERT INTO files (id, path, mtime, ctime, size, mode, uid, gid, link_target)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
ON CONFLICT(path) DO UPDATE SET
id = excluded.id,
mtime = excluded.mtime,
ctime = excluded.ctime,
size = excluded.size,

View File

@@ -37,11 +37,9 @@ type FileChunk struct {
// Chunk represents a data chunk in the deduplication system.
// Files are split into chunks which are content-addressed by their hash.
// The ChunkHash is used for deduplication, while SHA256 provides
// an additional verification hash.
// The ChunkHash is the SHA256 hash of the chunk content, used for deduplication.
type Chunk struct {
ChunkHash string
SHA256 string
Size int64
}

View File

@@ -34,7 +34,6 @@ func TestRepositoriesTransaction(t *testing.T) {
// Create chunks
chunk1 := &Chunk{
ChunkHash: "tx_chunk1",
SHA256: "tx_sha1",
Size: 512,
}
if err := repos.Chunks.Create(ctx, tx, chunk1); err != nil {
@@ -43,7 +42,6 @@ func TestRepositoriesTransaction(t *testing.T) {
chunk2 := &Chunk{
ChunkHash: "tx_chunk2",
SHA256: "tx_sha2",
Size: 512,
}
if err := repos.Chunks.Create(ctx, tx, chunk2); err != nil {
@@ -159,7 +157,6 @@ func TestRepositoriesTransactionRollback(t *testing.T) {
// Create a chunk
chunk := &Chunk{
ChunkHash: "rollback_chunk",
SHA256: "rollback_sha",
Size: 1024,
}
if err := repos.Chunks.Create(ctx, tx, chunk); err != nil {

View File

@@ -195,12 +195,10 @@ func TestOrphanedChunkCleanup(t *testing.T) {
// Create chunks
chunk1 := &Chunk{
ChunkHash: "orphaned-chunk",
SHA256: "orphaned-chunk-sha",
Size: 1024,
}
chunk2 := &Chunk{
ChunkHash: "referenced-chunk",
SHA256: "referenced-chunk-sha",
Size: 1024,
}
@@ -363,7 +361,6 @@ func TestFileChunkRepositoryWithUUIDs(t *testing.T) {
for i, chunkHash := range chunks {
chunk := &Chunk{
ChunkHash: chunkHash,
SHA256: fmt.Sprintf("sha-%s", chunkHash),
Size: 1024,
}
err = repos.Chunks.Create(ctx, nil, chunk)
@@ -447,7 +444,6 @@ func TestChunkFileRepositoryWithUUIDs(t *testing.T) {
// Create a chunk that appears in both files (deduplication)
chunk := &Chunk{
ChunkHash: "shared-chunk",
SHA256: "shared-chunk-sha",
Size: 1024,
}
err = repos.Chunks.Create(ctx, nil, chunk)
@@ -694,7 +690,6 @@ func TestCascadeDelete(t *testing.T) {
for i := 0; i < 3; i++ {
chunk := &Chunk{
ChunkHash: fmt.Sprintf("cascade-chunk-%d", i),
SHA256: fmt.Sprintf("cascade-sha-%d", i),
Size: 1024,
}
err = repos.Chunks.Create(ctx, nil, chunk)

View File

@@ -170,7 +170,6 @@ func TestDuplicateHandling(t *testing.T) {
t.Run("duplicate chunk hashes", func(t *testing.T) {
chunk := &Chunk{
ChunkHash: "duplicate-chunk",
SHA256: "duplicate-sha",
Size: 1024,
}
@@ -204,7 +203,6 @@ func TestDuplicateHandling(t *testing.T) {
chunk := &Chunk{
ChunkHash: "test-chunk-dup",
SHA256: "test-sha-dup",
Size: 1024,
}
err = repos.Chunks.Create(ctx, nil, chunk)

View File

@@ -24,13 +24,13 @@ CREATE TABLE IF NOT EXISTS file_chunks (
idx INTEGER NOT NULL,
chunk_hash TEXT NOT NULL,
PRIMARY KEY (file_id, idx),
FOREIGN KEY (file_id) REFERENCES files(id) ON DELETE CASCADE
FOREIGN KEY (file_id) REFERENCES files(id) ON DELETE CASCADE,
FOREIGN KEY (chunk_hash) REFERENCES chunks(chunk_hash)
);
-- Chunks table: stores unique content-defined chunks
CREATE TABLE IF NOT EXISTS chunks (
chunk_hash TEXT PRIMARY KEY,
sha256 TEXT NOT NULL,
size INTEGER NOT NULL
);
@@ -52,7 +52,8 @@ CREATE TABLE IF NOT EXISTS blob_chunks (
offset INTEGER NOT NULL,
length INTEGER NOT NULL,
PRIMARY KEY (blob_id, chunk_hash),
FOREIGN KEY (blob_id) REFERENCES blobs(id)
FOREIGN KEY (blob_id) REFERENCES blobs(id) ON DELETE CASCADE,
FOREIGN KEY (chunk_hash) REFERENCES chunks(chunk_hash)
);
-- Chunk files table: reverse mapping of chunks to files
@@ -62,6 +63,7 @@ CREATE TABLE IF NOT EXISTS chunk_files (
file_offset INTEGER NOT NULL,
length INTEGER NOT NULL,
PRIMARY KEY (chunk_hash, file_id),
FOREIGN KEY (chunk_hash) REFERENCES chunks(chunk_hash),
FOREIGN KEY (file_id) REFERENCES files(id) ON DELETE CASCADE
);
@@ -91,7 +93,7 @@ CREATE TABLE IF NOT EXISTS snapshot_files (
file_id TEXT NOT NULL,
PRIMARY KEY (snapshot_id, file_id),
FOREIGN KEY (snapshot_id) REFERENCES snapshots(id) ON DELETE CASCADE,
FOREIGN KEY (file_id) REFERENCES files(id) ON DELETE CASCADE
FOREIGN KEY (file_id) REFERENCES files(id)
);
-- Snapshot blobs table: maps snapshots to blobs
@@ -101,13 +103,16 @@ CREATE TABLE IF NOT EXISTS snapshot_blobs (
blob_hash TEXT NOT NULL,
PRIMARY KEY (snapshot_id, blob_id),
FOREIGN KEY (snapshot_id) REFERENCES snapshots(id) ON DELETE CASCADE,
FOREIGN KEY (blob_id) REFERENCES blobs(id) ON DELETE CASCADE
FOREIGN KEY (blob_id) REFERENCES blobs(id)
);
-- Uploads table: tracks blob upload metrics
CREATE TABLE IF NOT EXISTS uploads (
blob_hash TEXT PRIMARY KEY,
snapshot_id TEXT NOT NULL,
uploaded_at INTEGER NOT NULL,
size INTEGER NOT NULL,
duration_ms INTEGER NOT NULL
duration_ms INTEGER NOT NULL,
FOREIGN KEY (blob_hash) REFERENCES blobs(blob_hash),
FOREIGN KEY (snapshot_id) REFERENCES snapshots(id)
);

View File

@@ -11,6 +11,7 @@ import (
// Upload represents a blob upload record
type Upload struct {
BlobHash string
SnapshotID string
UploadedAt time.Time
Size int64
DurationMs int64
@@ -29,15 +30,15 @@ func NewUploadRepository(conn *sql.DB) *UploadRepository {
// Create inserts a new upload record
func (r *UploadRepository) Create(ctx context.Context, tx *sql.Tx, upload *Upload) error {
query := `
INSERT INTO uploads (blob_hash, uploaded_at, size, duration_ms)
VALUES (?, ?, ?, ?)
INSERT INTO uploads (blob_hash, snapshot_id, uploaded_at, size, duration_ms)
VALUES (?, ?, ?, ?, ?)
`
var err error
if tx != nil {
_, err = tx.ExecContext(ctx, query, upload.BlobHash, upload.UploadedAt, upload.Size, upload.DurationMs)
_, err = tx.ExecContext(ctx, query, upload.BlobHash, upload.SnapshotID, upload.UploadedAt, upload.Size, upload.DurationMs)
} else {
_, err = r.conn.ExecContext(ctx, query, upload.BlobHash, upload.UploadedAt, upload.Size, upload.DurationMs)
_, err = r.conn.ExecContext(ctx, query, upload.BlobHash, upload.SnapshotID, upload.UploadedAt, upload.Size, upload.DurationMs)
}
return err
@@ -133,3 +134,14 @@ type UploadStats struct {
MinDurationMs int64
MaxDurationMs int64
}
// GetCountBySnapshot returns the number of upload records associated with
// the given snapshot ID.
func (r *UploadRepository) GetCountBySnapshot(ctx context.Context, snapshotID string) (int64, error) {
const query = `SELECT COUNT(*) FROM uploads WHERE snapshot_id = ?`
var count int64
// COUNT(*) always yields exactly one row, so Scan either succeeds or
// surfaces the query error.
if err := r.conn.QueryRowContext(ctx, query, snapshotID).Scan(&count); err != nil {
return 0, err
}
return count, nil
}