Add blob manifest for pruning without decryption

- Update bucket structure to include unencrypted blob manifest files
- Add <snapshot_id>.manifest.json.zst containing list of referenced blobs
- This enables pruning operations without requiring decryption keys
- Add snapshot management commands: list, rm, latest (stubs)
- Add --prune flag to backup command for automatic cleanup
- Update DESIGN.md to document manifest format and updated prune flow
This commit is contained in:
Jeffrey Paul 2025-07-20 11:03:53 +02:00
parent 8529ae9735
commit 9c072166fa
4 changed files with 127 additions and 6 deletions

View File

@ -58,12 +58,14 @@ Surprisingly, no existing tool meets these requirements, so I wrote `vaultik`.
## S3 Bucket Layout
S3 stores only three things:
S3 stores only four things:
1) Blobs: encrypted, compressed packs of file chunks.
2) Metadata: encrypted SQLite databases containing the current state of the
filesystem at the time of the snapshot.
3) Metadata hashes: encrypted hashes of the metadata SQLite databases.
4) Blob manifests: unencrypted compressed JSON files listing all blob hashes
referenced in the snapshot, enabling pruning without decryption.
```
s3://<bucket>/<prefix>/
@ -73,6 +75,7 @@ s3://<bucket>/<prefix>/
│ ├── <snapshot_id>.sqlite.age
│ ├── <snapshot_id>.sqlite.00.age
│ ├── <snapshot_id>.sqlite.01.age
│ ├── <snapshot_id>.manifest.json.zst
```
To retrieve a given file, you would:
@ -99,6 +102,23 @@ memory (<10GB).
* `<snapshot_id>`: UTC timestamp in ISO 8601 format, e.g. `2023-10-01T12:00:00Z`. These are lexicographically sortable.
* `blobs/<aa>/<bb>/...`: where `aa` and `bb` are the first and second bytes of the blob hash, each written as two hex digits.
### Blob Manifest Format
The `<snapshot_id>.manifest.json.zst` file is an unencrypted, compressed JSON file containing:
```json
{
"snapshot_id": "2023-10-01T12:00:00Z",
"blob_hashes": [
"aa1234567890abcdef...",
"bb2345678901bcdef0...",
...
]
}
```
This allows pruning operations to determine which blobs are referenced without requiring decryption keys.
---
## 3. Local SQLite Index Schema (source host)
@ -223,18 +243,20 @@ metadata/<snapshot_id>.sqlite.01.age
9. Compress, encrypt, split, and upload to S3
10. Encrypt the hash of the snapshot database to the backup age key
11. Upload the encrypted hash to S3 as `metadata/<snapshot_id>.hash.age`
12. Optionally prune remote blobs that are no longer referenced in the
12. Create blob manifest JSON listing all blob hashes referenced in snapshot
13. Compress manifest with zstd and upload as `metadata/<snapshot_id>.manifest.json.zst`
14. Optionally prune remote blobs that are no longer referenced in the
snapshot, based on local state db
### 5.2 Manual Prune
1. List all objects under `metadata/`
2. Determine the latest valid `snapshot_id` by timestamp
3. Download, decrypt, and reconstruct the latest snapshot SQLite database
4. Extract set of referenced blob hashes
3. Download and decompress the latest `<snapshot_id>.manifest.json.zst`
4. Extract set of referenced blob hashes from manifest (no decryption needed)
5. List all blob objects under `blobs/`
6. For each blob:
* If the hash is not in the latest snapshot:
* If the hash is not in the manifest:
* Issue `DeleteObject` to remove it
### 5.3 Verify
@ -261,11 +283,14 @@ Verify runs on a host that has no state, but access to the bucket.
## 6. CLI Commands
```
vaultik backup [--config <path>] [--cron] [--daemon]
vaultik backup [--config <path>] [--cron] [--daemon] [--prune]
vaultik restore --bucket <bucket> --prefix <prefix> --snapshot <id> --target <dir>
vaultik prune --bucket <bucket> --prefix <prefix> [--dry-run]
vaultik verify --bucket <bucket> --prefix <prefix> [--snapshot <id>] [--quick]
vaultik fetch --bucket <bucket> --prefix <prefix> --snapshot <id> --file <path> --target <path>
vaultik snapshot list --bucket <bucket> --prefix <prefix> [--limit <n>]
vaultik snapshot rm --bucket <bucket> --prefix <prefix> --snapshot <id>
vaultik snapshot latest --bucket <bucket> --prefix <prefix>
```
* `VAULTIK_PRIVATE_KEY` is required for `restore`, `prune`, `verify`, and

View File

@ -17,6 +17,7 @@ type BackupOptions struct {
ConfigPath string
Daemon bool
Cron bool
Prune bool
}
// NewBackupCommand creates the backup command
@ -52,6 +53,7 @@ a path using --config or by setting VAULTIK_CONFIG to a path.`,
cmd.Flags().StringVar(&opts.ConfigPath, "config", "", "Path to config file")
cmd.Flags().BoolVar(&opts.Daemon, "daemon", false, "Run in daemon mode with inotify monitoring")
cmd.Flags().BoolVar(&opts.Cron, "cron", false, "Run in cron mode (silent unless error)")
cmd.Flags().BoolVar(&opts.Prune, "prune", false, "Delete all previous snapshots and unreferenced blobs after backup")
return cmd
}
@ -71,6 +73,9 @@ func runBackup(ctx context.Context, opts *BackupOptions) error {
if opts.Cron {
fmt.Println("Running in cron mode")
}
if opts.Prune {
fmt.Println("Pruning enabled - will delete old snapshots after backup")
}
return nil
}),
},

View File

@ -22,6 +22,7 @@ on the source system.`,
NewPruneCommand(),
NewVerifyCommand(),
NewFetchCommand(),
SnapshotCmd(),
)
return cmd

90
internal/cli/snapshot.go Normal file
View File

@ -0,0 +1,90 @@
package cli
import (
"github.com/spf13/cobra"
)
// SnapshotCmd builds the "snapshot" parent command and attaches its
// list, rm, and latest subcommands.
func SnapshotCmd() *cobra.Command {
	root := &cobra.Command{
		Use:   "snapshot",
		Short: "Manage snapshots",
		Long:  "Commands for listing, removing, and querying snapshots",
	}

	// AddCommand is variadic; the subcommands are registered in the same
	// order as separate calls would register them.
	root.AddCommand(
		snapshotListCmd(),
		snapshotRmCmd(),
		snapshotLatestCmd(),
	)

	return root
}
// snapshotListCmd builds the "snapshot list" subcommand.
//
// Flags: --bucket (required), --prefix, and --limit (default 10). The flag
// values are bound to local variables for the eventual implementation; the
// command body is still a stub and panics with "unimplemented" when run.
func snapshotListCmd() *cobra.Command {
	var (
		bucket string
		prefix string
		limit  int
	)

	cmd := &cobra.Command{
		Use:   "list",
		Short: "List snapshots",
		Long:  "List all snapshots in the bucket, sorted by timestamp",
		RunE: func(cmd *cobra.Command, args []string) error {
			// Stub: not implemented yet.
			panic("unimplemented")
		},
	}

	cmd.Flags().StringVar(&bucket, "bucket", "", "S3 bucket name")
	cmd.Flags().StringVar(&prefix, "prefix", "", "S3 prefix")
	cmd.Flags().IntVar(&limit, "limit", 10, "Maximum number of snapshots to list")

	// MarkFlagRequired reports an error when the named flag has not been
	// defined. That would be a programming error in this function, so fail
	// loudly at construction time instead of silently dropping the error.
	if err := cmd.MarkFlagRequired("bucket"); err != nil {
		panic(err)
	}

	return cmd
}
// snapshotRmCmd builds the "snapshot rm" subcommand.
//
// Flags: --bucket (required), --prefix, and --snapshot (required). The flag
// values are bound to local variables for the eventual implementation; the
// command body is still a stub and panics with "unimplemented" when run.
func snapshotRmCmd() *cobra.Command {
	var (
		bucket   string
		prefix   string
		snapshot string
	)

	cmd := &cobra.Command{
		Use:   "rm",
		Short: "Remove a snapshot",
		Long:  "Remove a snapshot and optionally its associated blobs",
		RunE: func(cmd *cobra.Command, args []string) error {
			// Stub: not implemented yet.
			panic("unimplemented")
		},
	}

	cmd.Flags().StringVar(&bucket, "bucket", "", "S3 bucket name")
	cmd.Flags().StringVar(&prefix, "prefix", "", "S3 prefix")
	cmd.Flags().StringVar(&snapshot, "snapshot", "", "Snapshot ID to remove")

	// MarkFlagRequired reports an error when the named flag has not been
	// defined. That would be a programming error in this function, so fail
	// loudly at construction time instead of silently dropping the error.
	for _, name := range []string{"bucket", "snapshot"} {
		if err := cmd.MarkFlagRequired(name); err != nil {
			panic(err)
		}
	}

	return cmd
}
// snapshotLatestCmd builds the "snapshot latest" subcommand.
//
// Flags: --bucket (required) and --prefix. The flag values are bound to
// local variables for the eventual implementation; the command body is
// still a stub and panics with "unimplemented" when run.
func snapshotLatestCmd() *cobra.Command {
	var (
		bucket string
		prefix string
	)

	cmd := &cobra.Command{
		Use:   "latest",
		Short: "Get the latest snapshot ID",
		Long:  "Display the ID of the most recent snapshot",
		RunE: func(cmd *cobra.Command, args []string) error {
			// Stub: not implemented yet.
			panic("unimplemented")
		},
	}

	cmd.Flags().StringVar(&bucket, "bucket", "", "S3 bucket name")
	cmd.Flags().StringVar(&prefix, "prefix", "", "S3 prefix")

	// MarkFlagRequired reports an error when the named flag has not been
	// defined. That would be a programming error in this function, so fail
	// loudly at construction time instead of silently dropping the error.
	if err := cmd.MarkFlagRequired("bucket"); err != nil {
		panic(err)
	}

	return cmd
}