Add deterministic deduplication, rclone backend, and database purge command

- Implement deterministic blob hashing using double SHA256 of uncompressed
  plaintext data, enabling deduplication even after local DB is cleared
- Add Stat() check before blob upload to skip existing blobs in storage
- Add rclone storage backend for additional remote storage options
- Add 'vaultik database purge' command to erase local state DB
- Add 'vaultik remote check' command to verify remote connectivity
- Show configured snapshots in 'vaultik snapshot list' output
- Skip macOS resource fork files (._*) when listing remote snapshots
- Use multi-threaded zstd compression (CPUs - 2 threads)
- Add writer tests for double hashing behavior
This commit is contained in:
2026-01-28 15:50:17 -08:00
parent bdaaadf990
commit 470bf648c4
26 changed files with 2966 additions and 777 deletions

View File

@@ -5,30 +5,33 @@ import (
"fmt"
"hash"
"io"
"runtime"
"filippo.io/age"
"github.com/klauspost/compress/zstd"
)
// Writer wraps compression and encryption with SHA256 hashing
// Writer wraps compression and encryption with SHA256 hashing.
// Data flows: input -> tee(hasher, compressor -> encryptor -> destination)
// The hash is computed on the uncompressed input for deterministic content-addressing.
type Writer struct {
writer io.Writer // Final destination
teeWriter io.Writer // Tee to hasher and compressor
compressor *zstd.Encoder // Compression layer
encryptor io.WriteCloser // Encryption layer
hasher hash.Hash // SHA256 hasher
teeWriter io.Writer // Tees data to hasher
hasher hash.Hash // SHA256 hasher (on uncompressed input)
compressionLevel int
bytesWritten int64
}
// NewWriter creates a new Writer that compresses, encrypts, and hashes data
// NewWriter creates a new Writer that compresses, encrypts, and hashes data.
// The hash is computed on the uncompressed input for deterministic content-addressing.
func NewWriter(w io.Writer, compressionLevel int, recipients []string) (*Writer, error) {
// Validate compression level
if err := validateCompressionLevel(compressionLevel); err != nil {
return nil, err
}
// Create SHA256 hasher
// Create SHA256 hasher for the uncompressed input
hasher := sha256.New()
// Parse recipients
@@ -41,31 +44,36 @@ func NewWriter(w io.Writer, compressionLevel int, recipients []string) (*Writer,
ageRecipients = append(ageRecipients, r)
}
// Create encryption writer
// Create encryption writer that outputs to destination
encWriter, err := age.Encrypt(w, ageRecipients...)
if err != nil {
return nil, fmt.Errorf("creating encryption writer: %w", err)
}
// Calculate compression concurrency: CPUs - 2, minimum 1
concurrency := runtime.NumCPU() - 2
if concurrency < 1 {
concurrency = 1
}
// Create compression writer with encryption as destination
compressor, err := zstd.NewWriter(encWriter,
zstd.WithEncoderLevel(zstd.EncoderLevelFromZstd(compressionLevel)),
zstd.WithEncoderConcurrency(1), // Use single thread for streaming
zstd.WithEncoderConcurrency(concurrency),
)
if err != nil {
_ = encWriter.Close()
return nil, fmt.Errorf("creating compression writer: %w", err)
}
// Create tee writer that writes to both compressor and hasher
teeWriter := io.MultiWriter(compressor, hasher)
// Create tee writer: input goes to both hasher and compressor
teeWriter := io.MultiWriter(hasher, compressor)
return &Writer{
writer: w,
teeWriter: teeWriter,
compressor: compressor,
encryptor: encWriter,
hasher: hasher,
teeWriter: teeWriter,
compressionLevel: compressionLevel,
}, nil
}
@@ -92,9 +100,16 @@ func (w *Writer) Close() error {
return nil
}
// Sum256 returns the SHA256 hash of all data written
// Sum256 returns the double SHA256 hash of the uncompressed input data.
// Double hashing (SHA256(SHA256(data))) keeps the plain SHA256 of the
// plaintext out of blob filenames, so blob IDs cannot be matched directly
// against published SHA256 digests of known files. Note that the hash is
// unkeyed: anyone who holds the content itself can still compute
// SHA256(SHA256(data)) and check for a matching blob filename.
func (w *Writer) Sum256() []byte {
return w.hasher.Sum(nil)
// First hash: SHA256(plaintext)
firstHash := w.hasher.Sum(nil)
// Second hash: SHA256(firstHash) - this is the blob ID
secondHash := sha256.Sum256(firstHash)
return secondHash[:]
}
// BytesWritten returns the number of uncompressed bytes written

View File

@@ -0,0 +1,105 @@
package blobgen
import (
"bytes"
"crypto/rand"
"crypto/sha256"
"encoding/hex"
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// TestWriterHashIsDoubleHash checks that Writer.Sum256() yields
// SHA256(SHA256(plaintext)) computed over the bytes passed to Write,
// and that it is not the plain single SHA256 digest of that input.
func TestWriterHashIsDoubleHash(t *testing.T) {
	// Recipient key (generated with age-keygen) used to construct the writer.
	const recipient = "age1cplgrwj77ta54dnmydvvmzn64ltk83ankxl5sww04mrtmu62kv3s89gmvv"

	// 1 MiB of random bytes serves as the payload.
	payload := make([]byte, 1024*1024)
	_, err := rand.Read(payload)
	require.NoError(t, err)

	// sink receives whatever the writer emits to its destination.
	var sink bytes.Buffer
	w, err := NewWriter(&sink, 3, []string{recipient})
	require.NoError(t, err)

	// Push the whole payload through in one call and confirm nothing was dropped.
	written, err := w.Write(payload)
	require.NoError(t, err)
	assert.Equal(t, len(payload), written)

	// Close the writer before asking it for the digest.
	require.NoError(t, w.Close())
	got := hex.EncodeToString(w.Sum256())

	// Reference digests computed independently of the writer.
	inner := sha256.Sum256(payload)
	outer := sha256.Sum256(inner[:])
	single := hex.EncodeToString(inner[:])
	double := hex.EncodeToString(outer[:])

	t.Logf("Input size: %d bytes", len(payload))
	t.Logf("Single hash (SHA256(data)): %s", single)
	t.Logf("Double hash (SHA256(SHA256(data))): %s", double)
	t.Logf("Writer hash: %s", got)

	// The writer must report the double hash...
	assert.Equal(t, double, got,
		"Writer.Sum256() should return SHA256(SHA256(plaintext)) for security")

	// ...and must not report the single hash.
	assert.NotEqual(t, single, got,
		"Writer hash should not be single hash (would allow content confirmation attacks)")
}
// TestWriterDeterministicHash checks that two independent Writers fed the
// same input report the same Sum256 value, while the bytes they emit to
// their destinations differ from one another.
func TestWriterDeterministicHash(t *testing.T) {
	// Recipient key used to construct both writers.
	const recipient = "age1cplgrwj77ta54dnmydvvmzn64ltk83ankxl5sww04mrtmu62kv3s89gmvv"

	input := []byte("Hello, World! This is test data for deterministic hashing.")

	// Run the identical write/close sequence twice, each with its own buffer,
	// and record the hex digest reported by each writer.
	var outputs [2]bytes.Buffer
	var digests [2]string
	for i := range outputs {
		w, err := NewWriter(&outputs[i], 3, []string{recipient})
		require.NoError(t, err)

		_, err = w.Write(input)
		require.NoError(t, err)
		require.NoError(t, w.Close())

		digests[i] = hex.EncodeToString(w.Sum256())
	}

	// Same input must give the same hash.
	assert.Equal(t, digests[0], digests[1], "Same input should produce same hash")

	// The emitted (encrypted) bytes are expected to differ between runs.
	assert.NotEqual(t, outputs[0].Bytes(), outputs[1].Bytes(),
		"Encrypted outputs should differ due to non-deterministic encryption")

	t.Logf("Hash 1: %s", digests[0])
	t.Logf("Hash 2: %s", digests[1])
	t.Logf("Encrypted size 1: %d bytes", outputs[0].Len())
	t.Logf("Encrypted size 2: %d bytes", outputs[1].Len())
}