refactor: stream blob hash verification instead of buffering in memory

FetchAndDecryptBlob now returns io.ReadCloser with a hashVerifyReader that computes the double-SHA-256 on-the-fly during reads. Hash is verified on Close() after the stream is fully consumed. This avoids loading entire blobs into memory, which could exceed available RAM. Addresses review feedback on PR #39.
fix: verify blob hash after download and decryption (closes #5)
2026-02-20 02:29:19 -08:00 · 2026-02-20 02:26:15 -08:00 · 2026-02-20 11:22:12 +01:00 · 2026-02-20 11:20:52 +01:00 · 2026-02-20 11:19:40 +01:00 · 2026-02-20 11:19:21 +01:00
7 changed files with 294 additions and 28 deletions
--- a/internal/blobgen/compress_test.go
+++ b/internal/blobgen/compress_test.go
@@ -0,0 +1,64 @@
+package blobgen
+
+import (
+	"bytes"
+	"crypto/rand"
+	"strings"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+// testRecipient is a static age recipient for tests.
+const testRecipient = "age1cplgrwj77ta54dnmydvvmzn64ltk83ankxl5sww04mrtmu62kv3s89gmvv"
+
+// TestCompressStreamNoDoubleClose is a regression test for issue #28.
+// It verifies that CompressStream does not panic or return an error due to
+// double-closing the underlying blobgen.Writer. Before the fix in PR #33,
+// the explicit Close() on the happy path combined with defer Close() would
+// cause a double close.
+func TestCompressStreamNoDoubleClose(t *testing.T) {
+	input := []byte("regression test data for issue #28 double-close fix")
+	var buf bytes.Buffer
+
+	written, hash, err := CompressStream(&buf, bytes.NewReader(input), 3, []string{testRecipient})
+	require.NoError(t, err, "CompressStream should not return an error")
+	assert.True(t, written > 0, "expected bytes written > 0")
+	assert.NotEmpty(t, hash, "expected non-empty hash")
+	assert.True(t, buf.Len() > 0, "expected non-empty output")
+}
+
+// TestCompressStreamLargeInput exercises CompressStream with a larger payload
+// to ensure no double-close issues surface under heavier I/O.
+func TestCompressStreamLargeInput(t *testing.T) {
+	data := make([]byte, 512*1024) // 512 KB
+	_, err := rand.Read(data)
+	require.NoError(t, err)
+
+	var buf bytes.Buffer
+	written, hash, err := CompressStream(&buf, bytes.NewReader(data), 3, []string{testRecipient})
+	require.NoError(t, err)
+	assert.True(t, written > 0)
+	assert.NotEmpty(t, hash)
+}
+
+// TestCompressStreamEmptyInput verifies CompressStream handles empty input
+// without double-close issues.
+func TestCompressStreamEmptyInput(t *testing.T) {
+	var buf bytes.Buffer
+	_, hash, err := CompressStream(&buf, strings.NewReader(""), 3, []string{testRecipient})
+	require.NoError(t, err)
+	assert.NotEmpty(t, hash)
+}
+
+// TestCompressDataNoDoubleClose mirrors the stream test for CompressData,
+// ensuring the explicit Close + error-path Close pattern is also safe.
+func TestCompressDataNoDoubleClose(t *testing.T) {
+	input := []byte("CompressData regression test for double-close")
+	result, err := CompressData(input, 3, []string{testRecipient})
+	require.NoError(t, err)
+	assert.True(t, result.CompressedSize > 0)
+	assert.True(t, result.UncompressedSize == int64(len(input)))
+	assert.NotEmpty(t, result.SHA256)
+}
--- a/internal/vaultik/blob_fetch_hash_test.go
+++ b/internal/vaultik/blob_fetch_hash_test.go
@@ -0,0 +1,100 @@
+package vaultik_test
+
+import (
+	"bytes"
+	"context"
+	"crypto/sha256"
+	"encoding/hex"
+	"io"
+	"strings"
+	"testing"
+
+	"filippo.io/age"
+	"git.eeqj.de/sneak/vaultik/internal/blobgen"
+	"git.eeqj.de/sneak/vaultik/internal/vaultik"
+)
+
+// TestFetchAndDecryptBlobVerifiesHash verifies that FetchAndDecryptBlob checks
+// the double-SHA-256 hash of the decrypted plaintext against the expected blob hash.
+func TestFetchAndDecryptBlobVerifiesHash(t *testing.T) {
+	identity, err := age.GenerateX25519Identity()
+	if err != nil {
+		t.Fatalf("generating identity: %v", err)
+	}
+
+	// Create test data and encrypt it using blobgen.Writer
+	plaintext := []byte("hello world test data for blob hash verification")
+	var encBuf bytes.Buffer
+	writer, err := blobgen.NewWriter(&encBuf, 1, []string{identity.Recipient().String()})
+	if err != nil {
+		t.Fatalf("creating blobgen writer: %v", err)
+	}
+	if _, err := writer.Write(plaintext); err != nil {
+		t.Fatalf("writing plaintext: %v", err)
+	}
+	if err := writer.Close(); err != nil {
+		t.Fatalf("closing writer: %v", err)
+	}
+	encryptedData := encBuf.Bytes()
+
+	// Compute correct double-SHA-256 hash of the plaintext (matches blobgen.Writer.Sum256)
+	firstHash := sha256.Sum256(plaintext)
+	secondHash := sha256.Sum256(firstHash[:])
+	correctHash := hex.EncodeToString(secondHash[:])
+
+	// Verify our hash matches what blobgen.Writer produces
+	writerHash := hex.EncodeToString(writer.Sum256())
+	if correctHash != writerHash {
+		t.Fatalf("hash computation mismatch: manual=%s, writer=%s", correctHash, writerHash)
+	}
+
+	// Set up mock storage with the blob at the correct path
+	mockStorage := NewMockStorer()
+	blobPath := "blobs/" + correctHash[:2] + "/" + correctHash[2:4] + "/" + correctHash
+	mockStorage.mu.Lock()
+	mockStorage.data[blobPath] = encryptedData
+	mockStorage.mu.Unlock()
+
+	tv := vaultik.NewForTesting(mockStorage)
+	ctx := context.Background()
+
+	t.Run("correct hash succeeds", func(t *testing.T) {
+		rc, err := tv.FetchAndDecryptBlob(ctx, correctHash, int64(len(encryptedData)), identity)
+		if err != nil {
+			t.Fatalf("expected success, got error: %v", err)
+		}
+		data, err := io.ReadAll(rc)
+		if err != nil {
+			t.Fatalf("reading stream: %v", err)
+		}
+		if err := rc.Close(); err != nil {
+			t.Fatalf("close (hash verification) failed: %v", err)
+		}
+		if !bytes.Equal(data, plaintext) {
+			t.Fatalf("decrypted data mismatch: got %q, want %q", data, plaintext)
+		}
+	})
+
+	t.Run("wrong hash fails", func(t *testing.T) {
+		// Use a fake hash that doesn't match the actual plaintext
+		fakeHash := strings.Repeat("ab", 32) // 64 hex chars
+		fakePath := "blobs/" + fakeHash[:2] + "/" + fakeHash[2:4] + "/" + fakeHash
+		mockStorage.mu.Lock()
+		mockStorage.data[fakePath] = encryptedData
+		mockStorage.mu.Unlock()
+
+		rc, err := tv.FetchAndDecryptBlob(ctx, fakeHash, int64(len(encryptedData)), identity)
+		if err != nil {
+			t.Fatalf("unexpected error opening stream: %v", err)
+		}
+		// Read all data — hash is verified on Close
+		_, _ = io.ReadAll(rc)
+		err = rc.Close()
+		if err == nil {
+			t.Fatal("expected error for mismatched hash, got nil")
+		}
+		if !strings.Contains(err.Error(), "hash mismatch") {
+			t.Fatalf("expected hash mismatch error, got: %v", err)
+		}
+	})
+}
--- a/internal/vaultik/blob_fetch_stub.go
+++ b/internal/vaultik/blob_fetch_stub.go
@@ -2,38 +2,82 @@ package vaultik

 import (
 	"context"
+	"crypto/sha256"
+	"encoding/hex"
 	"fmt"
+	"hash"
 	"io"

 	"filippo.io/age"
 	"git.eeqj.de/sneak/vaultik/internal/blobgen"
 )

-// FetchAndDecryptBlobResult holds the result of fetching and decrypting a blob.
-type FetchAndDecryptBlobResult struct {
-	Data []byte
+// hashVerifyReader wraps a reader and computes a double-SHA-256 hash of all
+// data read through it. The hash is verified against the expected blob hash
+// when Close is called. This allows streaming blob verification without
+// buffering the entire blob in memory.
+type hashVerifyReader struct {
+	reader   io.ReadCloser // underlying decrypted blob reader
+	fetcher  io.ReadCloser // raw fetched stream (closed on Close)
+	hasher   hash.Hash     // running SHA-256 of plaintext
+	blobHash string        // expected double-SHA-256 hex
+	done     bool          // EOF reached
 }

-// FetchAndDecryptBlob downloads a blob, decrypts it, and returns the plaintext data.
-func (v *Vaultik) FetchAndDecryptBlob(ctx context.Context, blobHash string, expectedSize int64, identity age.Identity) (*FetchAndDecryptBlobResult, error) {
+func (h *hashVerifyReader) Read(p []byte) (int, error) {
+	n, err := h.reader.Read(p)
+	if n > 0 {
+		h.hasher.Write(p[:n])
+	}
+	if err == io.EOF {
+		h.done = true
+	}
+	return n, err
+}
+
+// Close verifies the hash (if the stream was fully read) and closes underlying readers.
+func (h *hashVerifyReader) Close() error {
+	readerErr := h.reader.Close()
+	fetcherErr := h.fetcher.Close()
+
+	if h.done {
+		firstHash := h.hasher.Sum(nil)
+		secondHasher := sha256.New()
+		secondHasher.Write(firstHash)
+		actualHashHex := hex.EncodeToString(secondHasher.Sum(nil))
+		if actualHashHex != h.blobHash {
+			return fmt.Errorf("blob hash mismatch: expected %s, got %s", h.blobHash[:16], actualHashHex[:16])
+		}
+	}
+
+	if readerErr != nil {
+		return readerErr
+	}
+	return fetcherErr
+}
+
+// FetchAndDecryptBlob downloads a blob, decrypts and decompresses it, and
+// returns a streaming reader that computes the double-SHA-256 hash on the fly.
+// The hash is verified when the returned reader is closed, but only if the stream was read to EOF; closing early skips verification.
+// This avoids buffering the entire blob in memory.
+func (v *Vaultik) FetchAndDecryptBlob(ctx context.Context, blobHash string, expectedSize int64, identity age.Identity) (io.ReadCloser, error) {
 	rc, _, err := v.FetchBlob(ctx, blobHash, expectedSize)
 	if err != nil {
 		return nil, err
 	}
-	defer func() { _ = rc.Close() }()

 	reader, err := blobgen.NewReader(rc, identity)
 	if err != nil {
+		_ = rc.Close()
 		return nil, fmt.Errorf("creating blob reader: %w", err)
 	}
-	defer func() { _ = reader.Close() }()

-	data, err := io.ReadAll(reader)
-	if err != nil {
-		return nil, fmt.Errorf("reading blob data: %w", err)
-	}
-
-	return &FetchAndDecryptBlobResult{Data: data}, nil
+	return &hashVerifyReader{
+		reader:   reader,
+		fetcher:  rc,
+		hasher:   sha256.New(),
+		blobHash: blobHash,
+	}, nil
 }

 // FetchBlob downloads a blob and returns a reader for the encrypted data.
--- a/internal/vaultik/blobcache.go
+++ b/internal/vaultik/blobcache.go
@@ -7,9 +7,6 @@ import (
 	"sync"
 )

-// defaultMaxBlobCacheBytes is the default maximum size of the disk blob cache (10 GB).
-const defaultMaxBlobCacheBytes = 10 << 30 // 10 GiB
-
 // blobDiskCacheEntry tracks a cached blob on disk.
 type blobDiskCacheEntry struct {
 	key  string
--- a/internal/vaultik/restore.go
+++ b/internal/vaultik/restore.go
@@ -109,7 +109,7 @@ func (v *Vaultik) Restore(opts *RestoreOptions) error {

 	// Step 5: Restore files
 	result := &RestoreResult{}
-	blobCache, err := newBlobDiskCache(defaultMaxBlobCacheBytes)
+	blobCache, err := newBlobDiskCache(4 * v.Config.BlobSizeLimit.Int64())
 	if err != nil {
 		return fmt.Errorf("creating blob cache: %w", err)
 	}
@@ -122,6 +122,8 @@ func (v *Vaultik) Restore(opts *RestoreOptions) error {

 		if err := v.restoreFile(v.ctx, repos, file, opts.TargetDir, identity, chunkToBlobMap, blobCache, result); err != nil {
 			log.Error("Failed to restore file", "path", file.Path, "error", err)
+			result.FilesFailed++
+			result.FailedFiles = append(result.FailedFiles, file.Path.String())
 			// Continue with other files
 			continue
 		}
@@ -151,6 +153,13 @@ func (v *Vaultik) Restore(opts *RestoreOptions) error {
 		result.Duration.Round(time.Second),
 	)

+	if result.FilesFailed > 0 {
+		_, _ = fmt.Fprintf(v.Stdout, "\nWARNING: %d file(s) failed to restore:\n", result.FilesFailed)
+		for _, path := range result.FailedFiles {
+			_, _ = fmt.Fprintf(v.Stdout, "  - %s\n", path)
+		}
+	}
+
 	// Run verification if requested
 	if opts.Verify {
 		if err := v.verifyRestoredFiles(v.ctx, repos, files, opts.TargetDir, result); err != nil {
@@ -171,6 +180,10 @@ func (v *Vaultik) Restore(opts *RestoreOptions) error {
 		)
 	}

+	if result.FilesFailed > 0 {
+		return fmt.Errorf("%d file(s) failed to restore", result.FilesFailed)
+	}
+
 	return nil
 }

@@ -481,11 +494,23 @@ func (v *Vaultik) restoreRegularFile(

 // downloadBlob downloads and decrypts a blob
 func (v *Vaultik) downloadBlob(ctx context.Context, blobHash string, expectedSize int64, identity age.Identity) ([]byte, error) {
-	result, err := v.FetchAndDecryptBlob(ctx, blobHash, expectedSize, identity)
+	rc, err := v.FetchAndDecryptBlob(ctx, blobHash, expectedSize, identity)
 	if err != nil {
 		return nil, err
 	}
-	return result.Data, nil
+
+	data, err := io.ReadAll(rc)
+	if err != nil {
+		_ = rc.Close()
+		return nil, fmt.Errorf("reading blob data: %w", err)
+	}
+
+	// Close triggers hash verification
+	if err := rc.Close(); err != nil {
+		return nil, err
+	}
+
+	return data, nil
 }

 // verifyRestoredFiles verifies that all restored files match their expected chunk hashes
--- a/internal/vaultik/snapshot.go
+++ b/internal/vaultik/snapshot.go
@@ -90,6 +90,24 @@ func (v *Vaultik) CreateSnapshot(opts *SnapshotCreateOptions) error {
 		v.printfStdout("\nAll %d snapshots completed in %s\n", len(snapshotNames), time.Since(overallStartTime).Round(time.Second))
 	}

+	// Prune old snapshots and unreferenced blobs if --prune was specified
+	if opts.Prune {
+		log.Info("Pruning enabled - deleting old snapshots and unreferenced blobs")
+		v.printlnStdout("\nPruning old snapshots (keeping latest)...")
+
+		if err := v.PurgeSnapshots(true, "", true); err != nil {
+			return fmt.Errorf("prune: purging old snapshots: %w", err)
+		}
+
+		v.printlnStdout("Pruning unreferenced blobs...")
+
+		if err := v.PruneBlobs(&PruneOptions{Force: true}); err != nil {
+			return fmt.Errorf("prune: removing unreferenced blobs: %w", err)
+		}
+
+		log.Info("Pruning complete")
+	}
+
 	return nil
 }

@@ -306,11 +324,6 @@ func (v *Vaultik) createNamedSnapshot(opts *SnapshotCreateOptions, hostname, sna
 	}
 	v.printfStdout("Duration: %s\n", formatDuration(snapshotDuration))

-	if opts.Prune {
-		log.Info("Pruning enabled - will delete old snapshots after snapshot")
-		// TODO: Implement pruning
-	}
-
 	return nil
 }

@@ -1004,16 +1017,16 @@ func (v *Vaultik) deleteSnapshotFromLocalDB(snapshotID string) error {

 	// Delete related records first to avoid foreign key constraints
 	if err := v.Repositories.Snapshots.DeleteSnapshotFiles(v.ctx, snapshotID); err != nil {
-		log.Error("Failed to delete snapshot files", "snapshot_id", snapshotID, "error", err)
+		return fmt.Errorf("deleting snapshot files for %s: %w", snapshotID, err)
 	}
 	if err := v.Repositories.Snapshots.DeleteSnapshotBlobs(v.ctx, snapshotID); err != nil {
-		log.Error("Failed to delete snapshot blobs", "snapshot_id", snapshotID, "error", err)
+		return fmt.Errorf("deleting snapshot blobs for %s: %w", snapshotID, err)
 	}
 	if err := v.Repositories.Snapshots.DeleteSnapshotUploads(v.ctx, snapshotID); err != nil {
-		log.Error("Failed to delete snapshot uploads", "snapshot_id", snapshotID, "error", err)
+		return fmt.Errorf("deleting snapshot uploads for %s: %w", snapshotID, err)
 	}
 	if err := v.Repositories.Snapshots.Delete(v.ctx, snapshotID); err != nil {
-		log.Error("Failed to delete snapshot record", "snapshot_id", snapshotID, "error", err)
+		return fmt.Errorf("deleting snapshot record %s: %w", snapshotID, err)
 	}

 	return nil
--- a/internal/vaultik/snapshot_prune_test.go
+++ b/internal/vaultik/snapshot_prune_test.go
@@ -0,0 +1,23 @@
+package vaultik
+
+import (
+	"testing"
+)
+
+// TestSnapshotCreateOptions_PruneFlag verifies the Prune field exists on
+// SnapshotCreateOptions and can be set.
+func TestSnapshotCreateOptions_PruneFlag(t *testing.T) {
+	opts := &SnapshotCreateOptions{
+		Prune: true,
+	}
+	if !opts.Prune {
+		t.Error("Expected Prune to be true")
+	}
+
+	opts2 := &SnapshotCreateOptions{
+		Prune: false,
+	}
+	if opts2.Prune {
+		t.Error("Expected Prune to be false")
+	}
+}
Author	SHA1	Message	Date
clawbot	22efd90f8c	refactor: stream blob hash verification instead of buffering in memory FetchAndDecryptBlob now returns io.ReadCloser with a hashVerifyReader that computes the double-SHA-256 on-the-fly during reads. Hash is verified on Close() after the stream is fully consumed. This avoids loading entire blobs into memory, which could exceed available RAM. Addresses review feedback on PR #39.	2026-02-20 02:29:19 -08:00
user	2bdbf38be6	fix: verify blob hash after download and decryption (closes #5) Add double-SHA-256 hash verification of decrypted plaintext in FetchAndDecryptBlob. This ensures blob integrity during restore operations by comparing the computed hash against the expected blob hash before returning data to the caller. Includes test for both correct hash (passes) and mismatched hash (returns error).	2026-02-20 02:26:15 -08:00
Jeffrey Paul	d8a51804d2	Merge pull request 'feat: implement --prune flag on snapshot create (closes #4)' (#37) from feature/implement-prune-flag-on-snapshot-create into main Reviewed-on: #37	2026-02-20 11:22:12 +01:00
Jeffrey Paul	76f4421eb3	Merge branch 'main' into feature/implement-prune-flag-on-snapshot-create	2026-02-20 11:20:52 +01:00
Jeffrey Paul	53ac868c5d	Merge pull request 'fix: track and report file restore failures' (#22) from fix/restore-error-handling into main Reviewed-on: #22	2026-02-20 11:19:40 +01:00
Jeffrey Paul	8c4ea2b870	Merge branch 'main' into fix/restore-error-handling	2026-02-20 11:19:21 +01:00
Jeffrey Paul	597b560398	Merge pull request 'Return errors from deleteSnapshotFromLocalDB instead of swallowing them (closes #25)' (#30) from fix/issue-25 into main Reviewed-on: #30	2026-02-20 11:18:30 +01:00
Jeffrey Paul	1e2eced092	Merge branch 'main' into fix/issue-25	2026-02-20 11:18:06 +01:00
Jeffrey Paul	815b35c7ae	Merge pull request 'Disk-based blob cache with LRU eviction during restore (closes #29)' (#34) from fix/issue-29 into main Reviewed-on: #34	2026-02-20 11:16:15 +01:00
Jeffrey Paul	9c66674683	Merge branch 'main' into fix/issue-29	2026-02-20 11:15:59 +01:00
Jeffrey Paul	49de277648	Merge pull request 'Add CompressStream double-close regression test (closes #35)' (#36) from add-compressstream-regression-test into main Reviewed-on: #36	2026-02-20 11:12:51 +01:00
clawbot	ed5d777d05	fix: set disk cache max size to 4x configured blob size instead of hardcoded 10 GiB The disk blob cache now uses 4 * BlobSizeLimit from config instead of a hardcoded 10 GiB default. This ensures the cache scales with the configured blob size.	2026-02-20 02:11:54 -08:00
clawbot	76e047bbb2	feat: implement --prune flag on snapshot create (closes #4) The --prune flag on 'snapshot create' was accepted but silently did nothing (TODO stub). This connects it to actually: 1. Purge old snapshots (keeping only the latest) via PurgeSnapshots 2. Remove unreferenced blobs from storage via PruneBlobs The pruning runs after all snapshots complete successfully, not per-snapshot. Both operations use --force mode (no interactive confirmation) since --prune is an explicit opt-in flag. Moved the prune logic from createNamedSnapshot (per-snapshot) to CreateSnapshot (after all snapshots), which is the correct location.	2026-02-20 02:11:52 -08:00
clawbot	2e7356dd85	Add CompressStream double-close regression test (closes #35) Adds regression tests for issue #28 (fixed in PR #33) to prevent reintroduction of the double-close bug in CompressStream. Tests cover: - CompressStream with normal input - CompressStream with large (512KB) input - CompressStream with empty input - CompressData close correctness	2026-02-20 02:10:23 -08:00
Jeffrey Paul	70d4fe2aa0	Merge pull request 'Use v.Stdout/v.Stdin instead of os.Stdout for all user-facing output (closes #26)' (#31) from fix/issue-26 into main Reviewed-on: #31	2026-02-20 11:07:52 +01:00
clawbot	ddc23f8057	fix: return errors from deleteSnapshotFromLocalDB instead of swallowing them Previously, deleteSnapshotFromLocalDB logged errors but always returned nil, causing callers to believe deletion succeeded even when it failed. This could lead to data inconsistency where remote metadata is deleted while local records persist. Now returns the first error encountered, allowing callers to handle failures appropriately.	2026-02-19 23:55:27 -08:00
clawbot	cafb3d45b8	fix: track and report file restore failures Restore previously logged errors for individual files but returned success even if files failed. Now tracks failed files in RestoreResult, reports them in the summary output, and returns an error if any files failed to restore. Fixes #21	2026-02-19 23:52:22 -08:00