Add deterministic deduplication, rclone backend, and database purge command
- Implement deterministic blob hashing using double SHA256 of uncompressed plaintext data, enabling deduplication even after local DB is cleared
- Add Stat() check before blob upload to skip existing blobs in storage
- Add rclone storage backend for additional remote storage options
- Add 'vaultik database purge' command to erase local state DB
- Add 'vaultik remote check' command to verify remote connectivity
- Show configured snapshots in 'vaultik snapshot list' output
- Skip macOS resource fork files (._*) when listing remote snapshots
- Use multi-threaded zstd compression (CPUs - 2 threads, minimum 1)
- Add writer tests for double hashing behavior
This commit is contained in:
@@ -5,30 +5,33 @@ import (
|
||||
"fmt"
|
||||
"hash"
|
||||
"io"
|
||||
"runtime"
|
||||
|
||||
"filippo.io/age"
|
||||
"github.com/klauspost/compress/zstd"
|
||||
)
|
||||
|
||||
// Writer wraps compression and encryption with SHA256 hashing
|
||||
// Writer wraps compression and encryption with SHA256 hashing.
|
||||
// Data flows: input -> tee(hasher, compressor -> encryptor -> destination)
|
||||
// The hash is computed on the uncompressed input for deterministic content-addressing.
|
||||
type Writer struct {
|
||||
writer io.Writer // Final destination
|
||||
teeWriter io.Writer // Tee to hasher and compressor
|
||||
compressor *zstd.Encoder // Compression layer
|
||||
encryptor io.WriteCloser // Encryption layer
|
||||
hasher hash.Hash // SHA256 hasher
|
||||
teeWriter io.Writer // Tees data to hasher
|
||||
hasher hash.Hash // SHA256 hasher (on uncompressed input)
|
||||
compressionLevel int
|
||||
bytesWritten int64
|
||||
}
|
||||
|
||||
// NewWriter creates a new Writer that compresses, encrypts, and hashes data
|
||||
// NewWriter creates a new Writer that compresses, encrypts, and hashes data.
|
||||
// The hash is computed on the uncompressed input for deterministic content-addressing.
|
||||
func NewWriter(w io.Writer, compressionLevel int, recipients []string) (*Writer, error) {
|
||||
// Validate compression level
|
||||
if err := validateCompressionLevel(compressionLevel); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Create SHA256 hasher
|
||||
// Create SHA256 hasher for the uncompressed input
|
||||
hasher := sha256.New()
|
||||
|
||||
// Parse recipients
|
||||
@@ -41,31 +44,36 @@ func NewWriter(w io.Writer, compressionLevel int, recipients []string) (*Writer,
|
||||
ageRecipients = append(ageRecipients, r)
|
||||
}
|
||||
|
||||
// Create encryption writer
|
||||
// Create encryption writer that outputs to destination
|
||||
encWriter, err := age.Encrypt(w, ageRecipients...)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("creating encryption writer: %w", err)
|
||||
}
|
||||
|
||||
// Calculate compression concurrency: CPUs - 2, minimum 1
|
||||
concurrency := runtime.NumCPU() - 2
|
||||
if concurrency < 1 {
|
||||
concurrency = 1
|
||||
}
|
||||
|
||||
// Create compression writer with encryption as destination
|
||||
compressor, err := zstd.NewWriter(encWriter,
|
||||
zstd.WithEncoderLevel(zstd.EncoderLevelFromZstd(compressionLevel)),
|
||||
zstd.WithEncoderConcurrency(1), // Use single thread for streaming
|
||||
zstd.WithEncoderConcurrency(concurrency),
|
||||
)
|
||||
if err != nil {
|
||||
_ = encWriter.Close()
|
||||
return nil, fmt.Errorf("creating compression writer: %w", err)
|
||||
}
|
||||
|
||||
// Create tee writer that writes to both compressor and hasher
|
||||
teeWriter := io.MultiWriter(compressor, hasher)
|
||||
// Create tee writer: input goes to both hasher and compressor
|
||||
teeWriter := io.MultiWriter(hasher, compressor)
|
||||
|
||||
return &Writer{
|
||||
writer: w,
|
||||
teeWriter: teeWriter,
|
||||
compressor: compressor,
|
||||
encryptor: encWriter,
|
||||
hasher: hasher,
|
||||
teeWriter: teeWriter,
|
||||
compressionLevel: compressionLevel,
|
||||
}, nil
|
||||
}
|
||||
@@ -92,9 +100,16 @@ func (w *Writer) Close() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Sum256 returns the SHA256 hash of all data written
|
||||
// Sum256 returns the double SHA256 hash of the uncompressed input data.
|
||||
// Double hashing (SHA256(SHA256(data))) keeps the plain SHA256 of the plaintext
|
||||
// out of blob filenames, so a filename cannot be matched against an existing
|
||||
// SHA256 digest of known content. The hash is unkeyed, however: anyone who holds
|
||||
// the content can still compute the double hash and check for a matching blob.
|
||||
func (w *Writer) Sum256() []byte {
|
||||
return w.hasher.Sum(nil)
|
||||
// First hash: SHA256(plaintext)
|
||||
firstHash := w.hasher.Sum(nil)
|
||||
// Second hash: SHA256(firstHash) - this is the blob ID
|
||||
secondHash := sha256.Sum256(firstHash)
|
||||
return secondHash[:]
|
||||
}
|
||||
|
||||
// BytesWritten returns the number of uncompressed bytes written
|
||||
|
||||
Reference in New Issue
Block a user