Add deterministic deduplication, rclone backend, and database purge command

- Implement deterministic blob hashing using double SHA256 of uncompressed
  plaintext data, enabling deduplication even after local DB is cleared
- Add Stat() check before blob upload to skip existing blobs in storage
- Add rclone storage backend for additional remote storage options
- Add 'vaultik database purge' command to erase local state DB
- Add 'vaultik remote check' command to verify remote connectivity
- Show configured snapshots in 'vaultik snapshot list' output
- Skip macOS resource fork files (._*) when listing remote snapshots
- Use multi-threaded zstd compression (CPUs - 2 threads)
- Add writer tests for double hashing behavior
This commit is contained in:
2026-01-28 15:50:17 -08:00
parent bdaaadf990
commit 470bf648c4
26 changed files with 2966 additions and 777 deletions

View File

@@ -327,6 +327,10 @@ func (v *Vaultik) ListSnapshots(jsonOutput bool) error {
// Extract snapshot ID from paths like metadata/hostname-20240115-143052Z/
parts := strings.Split(object.Key, "/")
if len(parts) >= 2 && parts[0] == "metadata" && parts[1] != "" {
// Skip macOS resource fork files (._*) and other hidden files
if strings.HasPrefix(parts[1], ".") {
continue
}
remoteSnapshots[parts[1]] = true
}
}
@@ -425,6 +429,32 @@ func (v *Vaultik) ListSnapshots(jsonOutput bool) error {
// Table output
w := tabwriter.NewWriter(os.Stdout, 0, 0, 3, ' ', 0)
// Show configured snapshots from config file
if _, err := fmt.Fprintln(w, "CONFIGURED SNAPSHOTS:"); err != nil {
return err
}
if _, err := fmt.Fprintln(w, "NAME\tPATHS"); err != nil {
return err
}
if _, err := fmt.Fprintln(w, "────\t─────"); err != nil {
return err
}
for _, name := range v.Config.SnapshotNames() {
snap := v.Config.Snapshots[name]
paths := strings.Join(snap.Paths, ", ")
if _, err := fmt.Fprintf(w, "%s\t%s\n", name, paths); err != nil {
return err
}
}
if _, err := fmt.Fprintln(w); err != nil {
return err
}
// Show remote snapshots
if _, err := fmt.Fprintln(w, "REMOTE SNAPSHOTS:"); err != nil {
return err
}
if _, err := fmt.Fprintln(w, "SNAPSHOT ID\tTIMESTAMP\tCOMPRESSED SIZE"); err != nil {
return err
}
@@ -527,11 +557,15 @@ func (v *Vaultik) PurgeSnapshots(keepLatest bool, olderThan string, force bool)
fmt.Printf("\nDeleting %d snapshot(s) (--force specified)\n", len(toDelete))
}
// Delete snapshots
// Delete snapshots (both local and remote)
for _, snap := range toDelete {
log.Info("Deleting snapshot", "id", snap.ID)
if err := v.deleteSnapshot(snap.ID.String()); err != nil {
return fmt.Errorf("deleting snapshot %s: %w", snap.ID, err)
snapshotID := snap.ID.String()
log.Info("Deleting snapshot", "id", snapshotID)
if err := v.deleteSnapshotFromLocalDB(snapshotID); err != nil {
log.Error("Failed to delete from local database", "snapshot_id", snapshotID, "error", err)
}
if err := v.deleteSnapshotFromRemote(snapshotID); err != nil {
return fmt.Errorf("deleting snapshot %s from remote: %w", snapshotID, err)
}
}
@@ -722,49 +756,6 @@ func (v *Vaultik) downloadManifest(snapshotID string) (*snapshot.Manifest, error
return manifest, nil
}
func (v *Vaultik) deleteSnapshot(snapshotID string) error {
// First, delete from storage
// List all objects under metadata/{snapshotID}/
prefix := fmt.Sprintf("metadata/%s/", snapshotID)
objectCh := v.Storage.ListStream(v.ctx, prefix)
var objectsToDelete []string
for object := range objectCh {
if object.Err != nil {
return fmt.Errorf("listing objects: %w", object.Err)
}
objectsToDelete = append(objectsToDelete, object.Key)
}
// Delete all objects
for _, key := range objectsToDelete {
if err := v.Storage.Delete(v.ctx, key); err != nil {
return fmt.Errorf("removing %s: %w", key, err)
}
}
// Then, delete from local database (if we have a local database)
if v.Repositories != nil {
// Delete related records first to avoid foreign key constraints
if err := v.Repositories.Snapshots.DeleteSnapshotFiles(v.ctx, snapshotID); err != nil {
log.Error("Failed to delete snapshot files", "snapshot_id", snapshotID, "error", err)
}
if err := v.Repositories.Snapshots.DeleteSnapshotBlobs(v.ctx, snapshotID); err != nil {
log.Error("Failed to delete snapshot blobs", "snapshot_id", snapshotID, "error", err)
}
if err := v.Repositories.Snapshots.DeleteSnapshotUploads(v.ctx, snapshotID); err != nil {
log.Error("Failed to delete snapshot uploads", "snapshot_id", snapshotID, "error", err)
}
// Now delete the snapshot itself
if err := v.Repositories.Snapshots.Delete(v.ctx, snapshotID); err != nil {
return fmt.Errorf("deleting snapshot from database: %w", err)
}
}
return nil
}
func (v *Vaultik) syncWithRemote() error {
log.Info("Syncing with remote snapshots")
@@ -780,6 +771,10 @@ func (v *Vaultik) syncWithRemote() error {
// Extract snapshot ID from paths like metadata/hostname-20240115-143052Z/
parts := strings.Split(object.Key, "/")
if len(parts) >= 2 && parts[0] == "metadata" && parts[1] != "" {
// Skip macOS resource fork files (._*) and other hidden files
if strings.HasPrefix(parts[1], ".") {
continue
}
remoteSnapshots[parts[1]] = true
}
}
@@ -818,137 +813,47 @@ type RemoveOptions struct {
Force bool
DryRun bool
JSON bool
Remote bool // Also remove metadata from remote storage
All bool // Remove all snapshots (requires Force)
}
// RemoveResult contains the result of a snapshot removal
type RemoveResult struct {
SnapshotID string `json:"snapshot_id"`
BlobsDeleted int `json:"blobs_deleted"`
BytesFreed int64 `json:"bytes_freed"`
BlobsFailed int `json:"blobs_failed,omitempty"`
DryRun bool `json:"dry_run,omitempty"`
SnapshotID string `json:"snapshot_id,omitempty"`
SnapshotsRemoved []string `json:"snapshots_removed,omitempty"`
RemoteRemoved bool `json:"remote_removed,omitempty"`
DryRun bool `json:"dry_run,omitempty"`
}
// RemoveSnapshot removes a snapshot and any blobs that become orphaned
// RemoveSnapshot removes a snapshot from the local database and optionally from remote storage
// Note: This does NOT remove blobs. Use 'vaultik prune' to remove orphaned blobs.
func (v *Vaultik) RemoveSnapshot(snapshotID string, opts *RemoveOptions) (*RemoveResult, error) {
log.Info("Starting snapshot removal", "snapshot_id", snapshotID)
result := &RemoveResult{
SnapshotID: snapshotID,
}
// Step 1: List all snapshots in storage
log.Info("Listing remote snapshots")
objectCh := v.Storage.ListStream(v.ctx, "metadata/")
var allSnapshotIDs []string
targetExists := false
for object := range objectCh {
if object.Err != nil {
return nil, fmt.Errorf("listing remote snapshots: %w", object.Err)
}
// Extract snapshot ID from paths like metadata/hostname-20240115-143052Z/
parts := strings.Split(object.Key, "/")
if len(parts) >= 2 && parts[0] == "metadata" && parts[1] != "" {
if strings.HasSuffix(object.Key, "/") || strings.Contains(object.Key, "/manifest.json.zst") {
sid := parts[1]
// Only add unique snapshot IDs
found := false
for _, id := range allSnapshotIDs {
if id == sid {
found = true
break
}
}
if !found {
allSnapshotIDs = append(allSnapshotIDs, sid)
if sid == snapshotID {
targetExists = true
}
}
}
}
}
if !targetExists {
return nil, fmt.Errorf("snapshot not found: %s", snapshotID)
}
log.Info("Found snapshots", "total", len(allSnapshotIDs))
// Step 2: Download target snapshot's manifest
log.Info("Downloading target manifest", "snapshot_id", snapshotID)
targetManifest, err := v.downloadManifest(snapshotID)
if err != nil {
return nil, fmt.Errorf("downloading target manifest: %w", err)
}
// Build set of target blob hashes with sizes
targetBlobs := make(map[string]int64) // hash -> size
for _, blob := range targetManifest.Blobs {
targetBlobs[blob.Hash] = blob.CompressedSize
}
log.Info("Target snapshot has blobs", "count", len(targetBlobs))
// Step 3: Download manifests from all OTHER snapshots to build "in-use" set
inUseBlobs := make(map[string]bool)
otherCount := 0
for _, sid := range allSnapshotIDs {
if sid == snapshotID {
continue // Skip target snapshot
}
log.Debug("Processing manifest", "snapshot_id", sid)
manifest, err := v.downloadManifest(sid)
if err != nil {
log.Error("Failed to download manifest", "snapshot_id", sid, "error", err)
continue
}
for _, blob := range manifest.Blobs {
inUseBlobs[blob.Hash] = true
}
otherCount++
}
log.Info("Processed other manifests", "count", otherCount, "in_use_blobs", len(inUseBlobs))
// Step 4: Find orphaned blobs (in target but not in use by others)
var orphanedBlobs []string
var totalSize int64
for hash, size := range targetBlobs {
if !inUseBlobs[hash] {
orphanedBlobs = append(orphanedBlobs, hash)
totalSize += size
}
}
log.Info("Found orphaned blobs",
"count", len(orphanedBlobs),
"total_size", humanize.Bytes(uint64(totalSize)),
)
// Show summary (unless JSON mode)
if !opts.JSON {
_, _ = fmt.Fprintf(v.Stdout, "\nSnapshot: %s\n", snapshotID)
_, _ = fmt.Fprintf(v.Stdout, "Blobs in snapshot: %d\n", len(targetBlobs))
_, _ = fmt.Fprintf(v.Stdout, "Orphaned blobs to delete: %d (%s)\n", len(orphanedBlobs), humanize.Bytes(uint64(totalSize)))
}
if opts.DryRun {
result.DryRun = true
if !opts.JSON {
_, _ = fmt.Fprintf(v.Stdout, "Would remove snapshot: %s\n", snapshotID)
if opts.Remote {
_, _ = fmt.Fprintln(v.Stdout, "Would also remove from remote storage")
}
_, _ = fmt.Fprintln(v.Stdout, "[Dry run - no changes made]")
}
if opts.JSON {
return result, v.outputRemoveJSON(result)
}
_, _ = fmt.Fprintln(v.Stdout, "\n[Dry run - no changes made]")
return result, nil
}
// Confirm unless --force is used (skip in JSON mode - require --force)
if !opts.Force && !opts.JSON {
_, _ = fmt.Fprintf(v.Stdout, "\nDelete snapshot and %d orphaned blob(s)? [y/N] ", len(orphanedBlobs))
if opts.Remote {
_, _ = fmt.Fprintf(v.Stdout, "Remove snapshot '%s' from local database and remote storage? [y/N] ", snapshotID)
} else {
_, _ = fmt.Fprintf(v.Stdout, "Remove snapshot '%s' from local database? [y/N] ", snapshotID)
}
var confirm string
if _, err := fmt.Fscanln(v.Stdin, &confirm); err != nil {
_, _ = fmt.Fprintln(v.Stdout, "Cancelled")
@@ -960,36 +865,20 @@ func (v *Vaultik) RemoveSnapshot(snapshotID string, opts *RemoveOptions) (*Remov
}
}
// Step 5: Delete orphaned blobs
if len(orphanedBlobs) > 0 {
log.Info("Deleting orphaned blobs")
for i, hash := range orphanedBlobs {
blobPath := fmt.Sprintf("blobs/%s/%s/%s", hash[:2], hash[2:4], hash)
log.Info("Removing snapshot from local database", "snapshot_id", snapshotID)
if err := v.Storage.Delete(v.ctx, blobPath); err != nil {
log.Error("Failed to delete blob", "hash", hash, "error", err)
result.BlobsFailed++
continue
}
result.BlobsDeleted++
result.BytesFreed += targetBlobs[hash]
// Progress update every 100 blobs
if (i+1)%100 == 0 || i == len(orphanedBlobs)-1 {
log.Info("Deletion progress",
"deleted", i+1,
"total", len(orphanedBlobs),
"percent", fmt.Sprintf("%.1f%%", float64(i+1)/float64(len(orphanedBlobs))*100),
)
}
}
// Remove from local database
if err := v.deleteSnapshotFromLocalDB(snapshotID); err != nil {
return result, fmt.Errorf("removing from local database: %w", err)
}
// Step 6: Delete snapshot metadata
log.Info("Deleting snapshot metadata")
if err := v.deleteSnapshot(snapshotID); err != nil {
return result, fmt.Errorf("deleting snapshot metadata: %w", err)
// If --remote, also remove from remote storage
if opts.Remote {
log.Info("Removing snapshot metadata from remote storage", "snapshot_id", snapshotID)
if err := v.deleteSnapshotFromRemote(snapshotID); err != nil {
return result, fmt.Errorf("removing from remote storage: %w", err)
}
result.RemoteRemoved = true
}
// Output result
@@ -998,16 +887,165 @@ func (v *Vaultik) RemoveSnapshot(snapshotID string, opts *RemoveOptions) (*Remov
}
// Print summary
_, _ = fmt.Fprintf(v.Stdout, "\nRemoved snapshot %s\n", snapshotID)
_, _ = fmt.Fprintf(v.Stdout, " Blobs deleted: %d\n", result.BlobsDeleted)
_, _ = fmt.Fprintf(v.Stdout, " Storage freed: %s\n", humanize.Bytes(uint64(result.BytesFreed)))
if result.BlobsFailed > 0 {
_, _ = fmt.Fprintf(v.Stdout, " Blobs failed: %d\n", result.BlobsFailed)
_, _ = fmt.Fprintf(v.Stdout, "Removed snapshot '%s' from local database\n", snapshotID)
if opts.Remote {
_, _ = fmt.Fprintln(v.Stdout, "Removed snapshot metadata from remote storage")
_, _ = fmt.Fprintln(v.Stdout, "\nNote: Blobs were not removed. Run 'vaultik prune' to remove orphaned blobs.")
}
return result, nil
}
// RemoveAllSnapshots removes all snapshots from local database and optionally from remote
func (v *Vaultik) RemoveAllSnapshots(opts *RemoveOptions) (*RemoveResult, error) {
result := &RemoveResult{}
// List all snapshots
log.Info("Listing all snapshots")
objectCh := v.Storage.ListStream(v.ctx, "metadata/")
var snapshotIDs []string
for object := range objectCh {
if object.Err != nil {
return nil, fmt.Errorf("listing remote snapshots: %w", object.Err)
}
parts := strings.Split(object.Key, "/")
if len(parts) >= 2 && parts[0] == "metadata" && parts[1] != "" {
// Skip macOS resource fork files (._*) and other hidden files
if strings.HasPrefix(parts[1], ".") {
continue
}
if strings.HasSuffix(object.Key, "/") || strings.Contains(object.Key, "/manifest.json.zst") {
sid := parts[1]
found := false
for _, id := range snapshotIDs {
if id == sid {
found = true
break
}
}
if !found {
snapshotIDs = append(snapshotIDs, sid)
}
}
}
}
if len(snapshotIDs) == 0 {
if !opts.JSON {
_, _ = fmt.Fprintln(v.Stdout, "No snapshots found")
}
return result, nil
}
if opts.DryRun {
result.DryRun = true
result.SnapshotsRemoved = snapshotIDs
if !opts.JSON {
_, _ = fmt.Fprintf(v.Stdout, "Would remove %d snapshot(s):\n", len(snapshotIDs))
for _, id := range snapshotIDs {
_, _ = fmt.Fprintf(v.Stdout, " %s\n", id)
}
if opts.Remote {
_, _ = fmt.Fprintln(v.Stdout, "Would also remove from remote storage")
}
_, _ = fmt.Fprintln(v.Stdout, "[Dry run - no changes made]")
}
if opts.JSON {
return result, v.outputRemoveJSON(result)
}
return result, nil
}
// --all requires --force
if !opts.Force {
return nil, fmt.Errorf("--all requires --force")
}
log.Info("Removing all snapshots", "count", len(snapshotIDs))
for _, snapshotID := range snapshotIDs {
log.Info("Removing snapshot", "snapshot_id", snapshotID)
if err := v.deleteSnapshotFromLocalDB(snapshotID); err != nil {
log.Error("Failed to remove from local database", "snapshot_id", snapshotID, "error", err)
continue
}
if opts.Remote {
if err := v.deleteSnapshotFromRemote(snapshotID); err != nil {
log.Error("Failed to remove from remote", "snapshot_id", snapshotID, "error", err)
continue
}
}
result.SnapshotsRemoved = append(result.SnapshotsRemoved, snapshotID)
}
if opts.Remote {
result.RemoteRemoved = true
}
if opts.JSON {
return result, v.outputRemoveJSON(result)
}
_, _ = fmt.Fprintf(v.Stdout, "Removed %d snapshot(s)\n", len(result.SnapshotsRemoved))
if opts.Remote {
_, _ = fmt.Fprintln(v.Stdout, "Removed snapshot metadata from remote storage")
_, _ = fmt.Fprintln(v.Stdout, "\nNote: Blobs were not removed. Run 'vaultik prune' to remove orphaned blobs.")
}
return result, nil
}
// deleteSnapshotFromLocalDB removes a snapshot from the local database only
func (v *Vaultik) deleteSnapshotFromLocalDB(snapshotID string) error {
if v.Repositories == nil {
return nil // No local database
}
// Delete related records first to avoid foreign key constraints
if err := v.Repositories.Snapshots.DeleteSnapshotFiles(v.ctx, snapshotID); err != nil {
log.Error("Failed to delete snapshot files", "snapshot_id", snapshotID, "error", err)
}
if err := v.Repositories.Snapshots.DeleteSnapshotBlobs(v.ctx, snapshotID); err != nil {
log.Error("Failed to delete snapshot blobs", "snapshot_id", snapshotID, "error", err)
}
if err := v.Repositories.Snapshots.DeleteSnapshotUploads(v.ctx, snapshotID); err != nil {
log.Error("Failed to delete snapshot uploads", "snapshot_id", snapshotID, "error", err)
}
if err := v.Repositories.Snapshots.Delete(v.ctx, snapshotID); err != nil {
log.Error("Failed to delete snapshot record", "snapshot_id", snapshotID, "error", err)
}
return nil
}
// deleteSnapshotFromRemote removes snapshot metadata files from remote storage
func (v *Vaultik) deleteSnapshotFromRemote(snapshotID string) error {
prefix := fmt.Sprintf("metadata/%s/", snapshotID)
objectCh := v.Storage.ListStream(v.ctx, prefix)
var objectsToDelete []string
for object := range objectCh {
if object.Err != nil {
return fmt.Errorf("listing objects: %w", object.Err)
}
objectsToDelete = append(objectsToDelete, object.Key)
}
for _, key := range objectsToDelete {
if err := v.Storage.Delete(v.ctx, key); err != nil {
return fmt.Errorf("removing %s: %w", key, err)
}
log.Debug("Deleted remote object", "key", key)
}
return nil
}
// outputRemoveJSON outputs the removal result as JSON
func (v *Vaultik) outputRemoveJSON(result *RemoveResult) error {
encoder := json.NewEncoder(os.Stdout)
@@ -1027,7 +1065,7 @@ type PruneResult struct {
// and blobs from the local database. This ensures database consistency
// before starting a new backup or on-demand via the prune command.
func (v *Vaultik) PruneDatabase() (*PruneResult, error) {
log.Info("Pruning database: removing incomplete snapshots and orphaned data")
log.Info("Pruning local database: removing incomplete snapshots and orphaned data")
result := &PruneResult{}
@@ -1076,7 +1114,7 @@ func (v *Vaultik) PruneDatabase() (*PruneResult, error) {
result.ChunksDeleted = chunkCountBefore - chunkCountAfter
result.BlobsDeleted = blobCountBefore - blobCountAfter
log.Info("Prune complete",
log.Info("Local database prune complete",
"incomplete_snapshots", result.SnapshotsDeleted,
"orphaned_files", result.FilesDeleted,
"orphaned_chunks", result.ChunksDeleted,
@@ -1084,7 +1122,7 @@ func (v *Vaultik) PruneDatabase() (*PruneResult, error) {
)
// Print summary
_, _ = fmt.Fprintf(v.Stdout, "Prune complete:\n")
_, _ = fmt.Fprintf(v.Stdout, "Local database prune complete:\n")
_, _ = fmt.Fprintf(v.Stdout, " Incomplete snapshots removed: %d\n", result.SnapshotsDeleted)
_, _ = fmt.Fprintf(v.Stdout, " Orphaned files removed: %d\n", result.FilesDeleted)
_, _ = fmt.Fprintf(v.Stdout, " Orphaned chunks removed: %d\n", result.ChunksDeleted)