diff --git a/DATAMODEL.md b/docs/DATAMODEL.md similarity index 99% rename from DATAMODEL.md rename to docs/DATAMODEL.md index 2111570..71d4b08 100644 --- a/DATAMODEL.md +++ b/docs/DATAMODEL.md @@ -190,7 +190,7 @@ After a snapshot is completed: 3. Export to SQL dump using sqlite3 4. Compress with zstd and encrypt with age 5. Upload to S3 as `metadata/{snapshot-id}/db.zst.age` -6. Generate blob manifest and upload as `metadata/{snapshot-id}/manifest.json.zst.age` +6. Generate blob manifest and upload as `metadata/{snapshot-id}/manifest.json.zst` ### 4. Restore Process diff --git a/docs/REPOSTRUCTURE.md b/docs/REPOSTRUCTURE.md new file mode 100644 index 0000000..e6527f3 --- /dev/null +++ b/docs/REPOSTRUCTURE.md @@ -0,0 +1,143 @@ +# Vaultik S3 Repository Structure + +This document describes the structure and organization of data stored in the S3 bucket by Vaultik. + +## Overview + +Vaultik stores all backup data in an S3-compatible object store. The repository consists of two main components: +1. **Blobs** - The actual backup data (content-addressed, encrypted) +2. 
**Metadata** - Snapshot information and manifests (partially encrypted) + +## Directory Structure + +``` +{bucket}/{prefix}/ +├── blobs/ +│ └── {hash[0:2]}/ +│ └── {hash[2:4]}/ +│ └── {full-hash} +└── metadata/ + └── {snapshot-id}/ + ├── db.zst.age + └── manifest.json.zst +``` + +## Blobs Directory (`blobs/`) + +### Structure +- **Path format**: `blobs/{hash[0:2]}/{hash[2:4]}/{full-hash}` +- **Example**: `blobs/ca/fe/cafebabe1234567890abcdef1234567890abcdef1234567890abcdef12345678` +- **Sharding**: The two-level directory structure (using the first 4 characters of the hash) prevents any single directory from containing too many objects + +### Content +- **What it contains**: Packed collections of content-defined chunks from files +- **Format**: Zstandard compressed, then Age encrypted +- **Encryption**: Always encrypted with Age using the configured recipients +- **Naming**: Content-addressed using SHA256 hash of the encrypted blob + +### Why Encrypted +Blobs contain the actual file data from backups and must be encrypted for security. The content-addressing ensures deduplication while the encryption ensures privacy. + +## Metadata Directory (`metadata/`) + +Each snapshot has its own subdirectory named with the snapshot ID. 
+ +### Snapshot ID Format +- **Format**: `{hostname}-{YYYYMMDD}-{HHMMSSZ}` +- **Example**: `laptop-20240115-143052Z` +- **Components**: + - Hostname (may contain hyphens) + - Date in YYYYMMDD format + - Time in HHMMSSZ format (Z indicates UTC) + +### Files in Each Snapshot Directory + +#### `db.zst.age` - Encrypted Database Dump +- **What it contains**: Complete SQLite database dump for this snapshot +- **Format**: SQL dump → Zstandard compressed → Age encrypted +- **Encryption**: Encrypted with Age +- **Purpose**: Contains full file metadata, chunk mappings, and all relationships +- **Why encrypted**: Contains sensitive metadata like file paths, permissions, and ownership + +#### `manifest.json.zst` - Unencrypted Blob Manifest +- **What it contains**: JSON list of all blob hashes referenced by this snapshot +- **Format**: JSON → Zstandard compressed (NOT encrypted) +- **Encryption**: NOT encrypted +- **Purpose**: Enables pruning operations without requiring decryption keys +- **Structure**: +```json +{ + "snapshot_id": "laptop-20240115-143052Z", + "timestamp": "2024-01-15T14:30:52Z", + "blob_count": 42, + "blobs": [ + "cafebabe1234567890abcdef1234567890abcdef1234567890abcdef12345678", + "deadbeef1234567890abcdef1234567890abcdef1234567890abcdef12345678", + ... + ] +} +``` + +### Why Manifest is Unencrypted +The manifest must be readable without the private key to enable: +1. **Pruning operations** - Identifying unreferenced blobs for deletion +2. **Storage analysis** - Understanding space usage without decryption +3. **Verification** - Checking blob existence without decryption +4. **Cross-snapshot deduplication analysis** - Finding shared blobs between snapshots + +The manifest only contains blob hashes, not file names or any other sensitive information. 
+ +## Security Considerations + +### What's Encrypted +- **All file content** (in blobs) +- **All file metadata** (paths, permissions, timestamps, ownership in db.zst.age) +- **File-to-chunk mappings** (in db.zst.age) + +### What's Not Encrypted +- **Blob hashes** (in manifest.json.zst) +- **Snapshot IDs** (directory names) +- **Blob count per snapshot** (in manifest.json.zst) + +### Privacy Implications +From the unencrypted data, an observer can determine: +- When backups were taken (from snapshot IDs) +- Which hostname created backups (from snapshot IDs) +- How many blobs each snapshot references +- Which blobs are shared between snapshots (deduplication patterns) +- The size of each encrypted blob + +An observer cannot determine: +- File names or paths +- File contents +- File permissions or ownership +- Directory structure +- Which chunks belong to which files + +## Consistency Guarantees + +1. **Blobs are immutable** - Once written, a blob is never modified +2. **Blobs are written before metadata** - A snapshot's metadata is only written after all its blobs are successfully uploaded +3. **Metadata is written atomically** - Both db.zst.age and manifest.json.zst are written as complete files +4. **Snapshots are marked complete in local DB only after metadata upload** - Ensures consistency between local and remote state + +## Pruning Safety + +The prune operation is safe because: +1. It only deletes blobs not referenced in any manifest +2. Manifests are unencrypted and can be read without keys +3. The operation compares the latest local DB snapshot with the latest S3 snapshot to ensure consistency +4. Pruning will fail if these don't match, preventing accidental deletion of needed blobs + +## Restoration Requirements + +To restore from a backup, you need: +1. **The Age private key** - To decrypt blobs and database +2. **The snapshot metadata** - Both files from the snapshot's metadata directory +3. 
**All referenced blobs** - As listed in the manifest + +The restoration process: +1. Download and decrypt the database dump to understand file structure +2. Download and decrypt the required blobs +3. Reconstruct files from their chunks +4. Restore file metadata (permissions, timestamps, etc.) \ No newline at end of file diff --git a/internal/backup/snapshot.go b/internal/backup/snapshot.go index 388bb3f..ca892f4 100644 --- a/internal/backup/snapshot.go +++ b/internal/backup/snapshot.go @@ -56,6 +56,7 @@ import ( "git.eeqj.de/sneak/vaultik/internal/log" "git.eeqj.de/sneak/vaultik/internal/s3" "github.com/dustin/go-humanize" + "github.com/klauspost/compress/zstd" "go.uber.org/fx" ) @@ -277,8 +278,8 @@ func (sm *SnapshotManager) ExportSnapshotMetadata(ctx context.Context, dbPath st "duration", dbUploadDuration, "speed", humanize.SI(dbUploadSpeed, "bps")) - // Upload blob manifest (compressed and encrypted) - manifestKey := fmt.Sprintf("metadata/%s/manifest.json.zst.age", snapshotID) + // Upload blob manifest (compressed only, not encrypted) + manifestKey := fmt.Sprintf("metadata/%s/manifest.json.zst", snapshotID) log.Debug("Uploading blob manifest to S3", "key", manifestKey, "size", len(blobManifest)) manifestUploadStart := time.Now() if err := sm.s3Client.PutObject(ctx, manifestKey, bytes.NewReader(blobManifest)); err != nil { @@ -566,25 +567,33 @@ func (sm *SnapshotManager) generateBlobManifest(ctx context.Context, dbPath stri } log.Debug("JSON manifest created", "size", len(jsonData)) - // Compress and encrypt with blobgen - log.Debug("Compressing and encrypting manifest") + // Compress only (no encryption) - manifests must be readable without private keys for pruning + log.Debug("Compressing manifest") - result, err := blobgen.CompressData(jsonData, sm.config.CompressionLevel, sm.config.AgeRecipients) + var compressedBuf bytes.Buffer + writer, err := zstd.NewWriter(&compressedBuf, zstd.WithEncoderLevel(zstd.EncoderLevelFromZstd(sm.config.CompressionLevel))) if 
err != nil { - return nil, fmt.Errorf("compressing manifest: %w", err) + return nil, fmt.Errorf("creating zstd writer: %w", err) } - log.Debug("Manifest compressed and encrypted", + if _, err := writer.Write(jsonData); err != nil { + _ = writer.Close() + return nil, fmt.Errorf("writing compressed data: %w", err) + } + if err := writer.Close(); err != nil { + return nil, fmt.Errorf("closing zstd writer: %w", err) + } + + log.Debug("Manifest compressed", "original_size", len(jsonData), - "compressed_size", result.CompressedSize, - "hash", result.SHA256) + "compressed_size", compressedBuf.Len()) log.Info("Generated blob manifest", "snapshot_id", snapshotID, "blob_count", len(blobs), "json_size", len(jsonData), - "compressed_size", result.CompressedSize) + "compressed_size", compressedBuf.Len()) - return result.Data, nil + return compressedBuf.Bytes(), nil } // compressData compresses data using zstd diff --git a/internal/cli/prune.go b/internal/cli/prune.go index 467eb26..ef82069 100644 --- a/internal/cli/prune.go +++ b/internal/cli/prune.go @@ -2,8 +2,9 @@ package cli import ( "context" + "encoding/json" "fmt" - "os" + "strings" "git.eeqj.de/sneak/vaultik/internal/backup" "git.eeqj.de/sneak/vaultik/internal/config" @@ -11,6 +12,8 @@ import ( "git.eeqj.de/sneak/vaultik/internal/globals" "git.eeqj.de/sneak/vaultik/internal/log" "git.eeqj.de/sneak/vaultik/internal/s3" + "github.com/dustin/go-humanize" + "github.com/klauspost/compress/zstd" "github.com/spf13/cobra" "go.uber.org/fx" ) @@ -40,20 +43,14 @@ func NewPruneCommand() *cobra.Command { Long: `Delete blobs that are no longer referenced by any snapshot. This command will: -1. Download all snapshot metadata from S3 -2. Build a list of all referenced blobs -3. List all blobs in S3 -4. Delete any blobs not referenced by any snapshot +1. Download the manifest from the last successful snapshot +2. List all blobs in S3 +3. 
Delete any blobs not referenced in the manifest Config is located at /etc/vaultik/config.yml by default, but can be overridden by specifying a path using --config or by setting VAULTIK_CONFIG to a path.`, Args: cobra.NoArgs, RunE: func(cmd *cobra.Command, args []string) error { - // Check for private key - if os.Getenv("VAULTIK_PRIVATE_KEY") == "" { - return fmt.Errorf("VAULTIK_PRIVATE_KEY environment variable must be set") - } - // Use unified config resolution configPath, err := ResolveConfigPath() if err != nil { @@ -129,19 +126,188 @@ func (app *PruneApp) runPrune(ctx context.Context, opts *PruneOptions) error { "dry_run", opts.DryRun, ) - // TODO: Implement the actual prune logic - // 1. Download all snapshot metadata - // 2. Build set of referenced blobs - // 3. List all blobs in S3 - // 4. Delete unreferenced blobs - - fmt.Printf("Pruning bucket %s with prefix %s\n", app.Config.S3.Bucket, app.Config.S3.Prefix) - if opts.DryRun { - fmt.Println("Running in dry-run mode") + // Step 1: Get the latest complete snapshot from the database + log.Info("Getting latest snapshot from database") + snapshots, err := app.Repositories.Snapshots.ListRecent(ctx, 1) + if err != nil { + return fmt.Errorf("listing snapshots: %w", err) } - // For now, just show we're using the config properly - log.Info("Prune operation completed successfully") + if len(snapshots) == 0 { + return fmt.Errorf("no snapshots found in database") + } + latestSnapshot := snapshots[0] + if latestSnapshot.CompletedAt == nil { + return fmt.Errorf("latest snapshot %s is incomplete", latestSnapshot.ID) + } + + log.Info("Found latest snapshot", + "id", latestSnapshot.ID, + "completed_at", latestSnapshot.CompletedAt.Format("2006-01-02 15:04:05")) + + // Step 2: Find and download the manifest from the last successful snapshot in S3 + log.Info("Finding last successful snapshot in S3") + metadataPrefix := "metadata/" + + // List all snapshots in S3 + var s3Snapshots []string + objectCh := 
app.S3Client.ListObjectsStream(ctx, metadataPrefix, false) + for obj := range objectCh { + if obj.Err != nil { + return fmt.Errorf("listing metadata objects: %w", obj.Err) + } + // Extract snapshot ID (the second path segment) from keys like "metadata/hostname-20240115-143052Z/manifest.json.zst"; NOTE(review): this listing passes false as the last argument while the blobs/ listing below passes true, so if that flag means recursive, confirm that manifest keys inside each snapshot directory are actually returned here + parts := strings.Split(obj.Key, "/") + if len(parts) >= 2 && strings.HasSuffix(obj.Key, "/manifest.json.zst") { + s3Snapshots = append(s3Snapshots, parts[1]) + } + } + + if len(s3Snapshots) == 0 { + return fmt.Errorf("no snapshot manifests found in S3") + } + + // Find the most recent snapshot (they're named with timestamps); NOTE(review): IDs start with the hostname, so this string comparison orders by hostname first and by timestamp only among snapshots that share a hostname + var lastS3Snapshot string + for _, snap := range s3Snapshots { + if lastS3Snapshot == "" || snap > lastS3Snapshot { + lastS3Snapshot = snap + } + } + + log.Info("Found last S3 snapshot", "id", lastS3Snapshot) + + // Step 3: Verify the last S3 snapshot matches the latest DB snapshot; on mismatch return an error before anything is listed or deleted + if lastS3Snapshot != latestSnapshot.ID { + return fmt.Errorf("latest snapshot in database (%s) does not match last successful snapshot in S3 (%s)", + latestSnapshot.ID, lastS3Snapshot) + } + + // Step 4: Download and parse the manifest + log.Info("Downloading manifest", "snapshot_id", lastS3Snapshot) + manifest, err := app.downloadManifest(ctx, lastS3Snapshot) + if err != nil { + return fmt.Errorf("downloading manifest: %w", err) + } + + log.Info("Manifest loaded", "blob_count", len(manifest.Blobs)) + + // Step 5: Build set of referenced blobs; NOTE(review): only this single manifest is consulted, so a blob referenced solely by an older snapshot's manifest is classified as unreferenced below, confirm that older snapshots are not expected to remain restorable after a prune + referencedBlobs := make(map[string]bool) + for _, blobHash := range manifest.Blobs { + referencedBlobs[blobHash] = true + } + + // Step 6: List all blobs in S3 + log.Info("Listing all blobs in S3") + blobPrefix := "blobs/" + var totalBlobs int + var unreferencedBlobs []s3.ObjectInfo + var unreferencedSize int64 + + objectCh = app.S3Client.ListObjectsStream(ctx, blobPrefix, true) + for obj := range objectCh { + if obj.Err != nil { + return fmt.Errorf("listing blobs: %w", obj.Err) + } + + totalBlobs++ + + // Extract blob hash (only keys with exactly 4 slash-separated parts are examined; any other key is counted in totalBlobs but never deleted) from path like 
"blobs/ca/fe/cafebabe..." + parts := strings.Split(obj.Key, "/") + if len(parts) == 4 { + blobHash := parts[3] + if !referencedBlobs[blobHash] { + unreferencedBlobs = append(unreferencedBlobs, obj) + unreferencedSize += obj.Size + } + } + } + + log.Info("Blob scan complete", + "total_blobs", totalBlobs, + "referenced_blobs", len(referencedBlobs), + "unreferenced_blobs", len(unreferencedBlobs), + "unreferenced_size", humanize.Bytes(uint64(unreferencedSize))) + + // Step 7: Delete or report unreferenced blobs; dry-run only prints and logs what would be removed. NOTE(review): in the delete branch a failed RemoveObject is logged and skipped, and the function still logs success and returns nil, confirm that partial failure should not surface as an error + if opts.DryRun { + fmt.Printf("\nDry run mode - would delete %d unreferenced blobs\n", len(unreferencedBlobs)) + fmt.Printf("Total size of blobs to delete: %s\n", humanize.Bytes(uint64(unreferencedSize))) + + if len(unreferencedBlobs) > 0 { + log.Debug("Unreferenced blobs found", "count", len(unreferencedBlobs)) + for _, obj := range unreferencedBlobs { + log.Debug("Would delete blob", "key", obj.Key, "size", humanize.Bytes(uint64(obj.Size))) + } + } + } else { + if len(unreferencedBlobs) == 0 { + fmt.Println("No unreferenced blobs to delete") + return nil + } + + fmt.Printf("\nDeleting %d unreferenced blobs (%s)...\n", + len(unreferencedBlobs), humanize.Bytes(uint64(unreferencedSize))) + + deletedCount := 0 + deletedSize := int64(0) + + for _, obj := range unreferencedBlobs { + if err := app.S3Client.RemoveObject(ctx, obj.Key); err != nil { + log.Error("Failed to delete blob", "key", obj.Key, "error", err) + continue + } + deletedCount++ + deletedSize += obj.Size + + // Show progress every 100 successfully deleted blobs (failed deletions do not advance the counter) + if deletedCount%100 == 0 { + fmt.Printf(" Deleted %d/%d blobs (%s)...\n", + deletedCount, len(unreferencedBlobs), + humanize.Bytes(uint64(deletedSize))) + } + } + + fmt.Printf("\nDeleted %d blobs (%s)\n", deletedCount, humanize.Bytes(uint64(deletedSize))) + } + + log.Info("Prune operation completed successfully") return nil -} \ No newline at end of file +} + +// BlobManifest represents the structure of a snapshot's blob manifest as decoded from JSON; within this file only the Blobs list is read by runPrune +type BlobManifest struct { + 
SnapshotID string `json:"snapshot_id"` + Timestamp string `json:"timestamp"` + BlobCount int `json:"blob_count"` + Blobs []string `json:"blobs"` +} + +// downloadManifest fetches metadata/{snapshotID}/manifest.json.zst from S3, zstd-decompresses it as a stream and decodes the JSON into a BlobManifest; no decryption step is performed, and any failure is returned wrapped with the stage that failed +func (app *PruneApp) downloadManifest(ctx context.Context, snapshotID string) (*BlobManifest, error) { + manifestPath := fmt.Sprintf("metadata/%s/manifest.json.zst", snapshotID) + + // Download the compressed manifest; the object reader is closed on return and its Close error is deliberately ignored. NOTE(review): runPrune wraps this error with the same "downloading manifest" prefix, so the phrase appears twice in the final message + reader, err := app.S3Client.GetObject(ctx, manifestPath) + if err != nil { + return nil, fmt.Errorf("downloading manifest: %w", err) + } + defer func() { _ = reader.Close() }() + + // Decompress using zstd, reading directly from the S3 object stream; the decoder is released when the function returns + zr, err := zstd.NewReader(reader) + if err != nil { + return nil, fmt.Errorf("creating zstd reader: %w", err) + } + defer zr.Close() + + // Decode JSON manifest straight from the decompressed stream; BlobCount is not cross-checked against len(Blobs) here + var manifest BlobManifest + if err := json.NewDecoder(zr).Decode(&manifest); err != nil { + return nil, fmt.Errorf("decoding manifest: %w", err) + } + + return &manifest, nil +}