This commit represents a significant architectural overhaul of vaultik:

Database Schema Changes:
- Switch files table to use UUID primary keys instead of path-based keys
- Add UUID primary keys to blobs table for immediate chunk association
- Update all foreign key relationships to use UUIDs
- Add comprehensive schema documentation in DATAMODEL.md
- Add SQLite busy timeout handling for concurrent operations (see the sketch after this message)

Streaming and Performance Improvements:
- Implement true streaming blob packing without intermediate storage
- Add streaming chunk processing to reduce memory usage
- Improve progress reporting with real-time metrics
- Add upload metrics tracking in new uploads table

CLI Refactoring:
- Restructure CLI to use subcommands: snapshot create/list/purge/verify
- Add store info command for S3 configuration display
- Add custom duration parser supporting days/weeks/months/years
- Remove old backup.go in favor of enhanced snapshot.go
- Add --cron flag for silent operation

Configuration Changes:
- Remove unused index_prefix configuration option
- Add support for snapshot pruning retention policies
- Improve configuration validation and error messages

Testing Improvements:
- Add comprehensive repository tests with edge cases
- Add cascade delete debugging tests
- Fix concurrent operation tests to use SQLite busy timeout
- Remove tolerance for SQLITE_BUSY errors in tests

Documentation:
- Add MIT LICENSE file
- Update README with new command structure
- Add comprehensive DATAMODEL.md explaining database schema
- Update DESIGN.md with UUID-based architecture

Other Changes:
- Add test-config.yml for testing
- Update Makefile with better test output formatting
- Fix various race conditions in concurrent operations
- Improve error handling throughout
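As a rough illustration of the busy-timeout handling referenced above (a minimal sketch, not the actual vaultik code; the mattn/go-sqlite3 driver, the DSN option, and the 5-second value are assumptions), the timeout can be set when the database is opened so that concurrent writers wait for the lock instead of failing immediately with SQLITE_BUSY:

package main

import (
	"database/sql"
	"log"

	_ "github.com/mattn/go-sqlite3" // assumed driver; DSN options differ between SQLite drivers
)

// openWithBusyTimeout opens a SQLite database with a 5s busy timeout so that
// writers contending for the database lock block briefly instead of failing
// immediately with SQLITE_BUSY. The _busy_timeout DSN option is specific to
// the mattn/go-sqlite3 driver; the path and timeout value are illustrative.
func openWithBusyTimeout(path string) (*sql.DB, error) {
	return sql.Open("sqlite3", path+"?_busy_timeout=5000")
}

func main() {
	db, err := openWithBusyTimeout("test.db")
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	// With the timeout in place, a second writer waits up to 5s for the lock
	// before an SQLITE_BUSY error is surfaced to the caller.
	if _, err := db.Exec("CREATE TABLE IF NOT EXISTS t (id INTEGER PRIMARY KEY)"); err != nil {
		log.Fatal(err)
	}
}

Setting the timeout at open time keeps the retry behaviour inside SQLite itself, so repository code does not need its own SQLITE_BUSY retry loops.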
534 lines · 13 KiB · Go
package backup

import (
	"context"
	"crypto/sha256"
	"database/sql"
	"fmt"
	"io"
	"io/fs"
	"os"
	"path/filepath"
	"testing"
	"testing/fstest"
	"time"

	"git.eeqj.de/sneak/vaultik/internal/database"
)

// MockS3Client is a mock implementation of S3 operations for testing
type MockS3Client struct {
	storage map[string][]byte
}

// NewMockS3Client creates an empty in-memory mock S3 client.
func NewMockS3Client() *MockS3Client {
	return &MockS3Client{
		storage: make(map[string][]byte),
	}
}

// PutBlob stores the blob data in memory, keyed by its hash.
func (m *MockS3Client) PutBlob(ctx context.Context, hash string, data []byte) error {
	m.storage[hash] = data
	return nil
}

// GetBlob returns the stored blob for the given hash, or an error if it is missing.
func (m *MockS3Client) GetBlob(ctx context.Context, hash string) ([]byte, error) {
	data, ok := m.storage[hash]
	if !ok {
		return nil, fmt.Errorf("blob not found: %s", hash)
	}
	return data, nil
}

// BlobExists reports whether a blob with the given hash has been stored.
func (m *MockS3Client) BlobExists(ctx context.Context, hash string) (bool, error) {
	_, ok := m.storage[hash]
	return ok, nil
}

// CreateBucket is a no-op for the in-memory mock.
func (m *MockS3Client) CreateBucket(ctx context.Context, bucket string) error {
	return nil
}

func TestBackupWithInMemoryFS(t *testing.T) {
	// Create a temporary directory for the database
	tempDir := t.TempDir()
	dbPath := filepath.Join(tempDir, "test.db")

	// Create test filesystem
	testFS := fstest.MapFS{
		"file1.txt": &fstest.MapFile{
			Data:    []byte("Hello, World!"),
			Mode:    0644,
			ModTime: time.Now(),
		},
		"dir1/file2.txt": &fstest.MapFile{
			Data:    []byte("This is a test file with some content."),
			Mode:    0755,
			ModTime: time.Now(),
		},
		"dir1/subdir/file3.txt": &fstest.MapFile{
			Data:    []byte("Another file in a subdirectory."),
			Mode:    0600,
			ModTime: time.Now(),
		},
		"largefile.bin": &fstest.MapFile{
			Data:    generateLargeFileContent(10 * 1024 * 1024), // 10MB file with varied content
			Mode:    0644,
			ModTime: time.Now(),
		},
	}

	// Initialize the database
	ctx := context.Background()
	db, err := database.New(ctx, dbPath)
	if err != nil {
		t.Fatalf("Failed to create database: %v", err)
	}
	defer func() {
		if err := db.Close(); err != nil {
			t.Logf("Failed to close database: %v", err)
		}
	}()

	repos := database.NewRepositories(db)

	// Create mock S3 client
	s3Client := NewMockS3Client()

	// Run backup
	backupEngine := &BackupEngine{
		repos:    repos,
		s3Client: s3Client,
	}

	snapshotID, err := backupEngine.Backup(ctx, testFS, ".")
	if err != nil {
		t.Fatalf("Backup failed: %v", err)
	}

	// Verify snapshot was created
	snapshot, err := repos.Snapshots.GetByID(ctx, snapshotID)
	if err != nil {
		t.Fatalf("Failed to get snapshot: %v", err)
	}

	if snapshot == nil {
		t.Fatal("Snapshot not found")
	}

	if snapshot.FileCount == 0 {
		t.Error("Expected snapshot to have files")
	}

	// Verify files in database
	files, err := repos.Files.ListByPrefix(ctx, "")
	if err != nil {
		t.Fatalf("Failed to list files: %v", err)
	}

	expectedFiles := map[string]bool{
		"file1.txt":             true,
		"dir1/file2.txt":        true,
		"dir1/subdir/file3.txt": true,
		"largefile.bin":         true,
	}

	if len(files) != len(expectedFiles) {
		t.Errorf("Expected %d files, got %d", len(expectedFiles), len(files))
	}

	for _, file := range files {
		if !expectedFiles[file.Path] {
			t.Errorf("Unexpected file in database: %s", file.Path)
		}
		delete(expectedFiles, file.Path)

		// Verify file metadata
		fsFile := testFS[file.Path]
		if fsFile == nil {
			t.Errorf("File %s not found in test filesystem", file.Path)
			continue
		}

		if file.Size != int64(len(fsFile.Data)) {
			t.Errorf("File %s: expected size %d, got %d", file.Path, len(fsFile.Data), file.Size)
		}

		if file.Mode != uint32(fsFile.Mode) {
			t.Errorf("File %s: expected mode %o, got %o", file.Path, fsFile.Mode, file.Mode)
		}
	}

	if len(expectedFiles) > 0 {
		t.Errorf("Files not found in database: %v", expectedFiles)
	}

	// Verify chunks
	chunks, err := repos.Chunks.List(ctx)
	if err != nil {
		t.Fatalf("Failed to list chunks: %v", err)
	}

	if len(chunks) == 0 {
		t.Error("No chunks found in database")
	}

	// The large file should create 10 chunks (10MB / 1MB chunk size)
	// Plus the small files
	minExpectedChunks := 10 + 3
	if len(chunks) < minExpectedChunks {
		t.Errorf("Expected at least %d chunks, got %d", minExpectedChunks, len(chunks))
	}

	// Verify at least one blob was created and uploaded
	// We can't list blobs directly, but we can check via snapshot blobs
	blobHashes, err := repos.Snapshots.GetBlobHashes(ctx, snapshotID)
	if err != nil {
		t.Fatalf("Failed to get blob hashes: %v", err)
	}
	if len(blobHashes) == 0 {
		t.Error("Expected at least one blob to be created")
	}

	for _, blobHash := range blobHashes {
		// Check blob exists in mock S3
		exists, err := s3Client.BlobExists(ctx, blobHash)
		if err != nil {
			t.Errorf("Failed to check blob %s: %v", blobHash, err)
		}
		if !exists {
			t.Errorf("Blob %s not found in S3", blobHash)
		}
	}
}

func TestBackupDeduplication(t *testing.T) {
	// Create a temporary directory for the database
	tempDir := t.TempDir()
	dbPath := filepath.Join(tempDir, "test.db")

	// Create test filesystem with duplicate content
	testFS := fstest.MapFS{
		"file1.txt": &fstest.MapFile{
			Data:    []byte("Duplicate content"),
			Mode:    0644,
			ModTime: time.Now(),
		},
		"file2.txt": &fstest.MapFile{
			Data:    []byte("Duplicate content"),
			Mode:    0644,
			ModTime: time.Now(),
		},
		"file3.txt": &fstest.MapFile{
			Data:    []byte("Unique content"),
			Mode:    0644,
			ModTime: time.Now(),
		},
	}

	// Initialize the database
	ctx := context.Background()
	db, err := database.New(ctx, dbPath)
	if err != nil {
		t.Fatalf("Failed to create database: %v", err)
	}
	defer func() {
		if err := db.Close(); err != nil {
			t.Logf("Failed to close database: %v", err)
		}
	}()

	repos := database.NewRepositories(db)

	// Create mock S3 client
	s3Client := NewMockS3Client()

	// Run backup
	backupEngine := &BackupEngine{
		repos:    repos,
		s3Client: s3Client,
	}

	_, err = backupEngine.Backup(ctx, testFS, ".")
	if err != nil {
		t.Fatalf("Backup failed: %v", err)
	}

	// Verify deduplication
	chunks, err := repos.Chunks.List(ctx)
	if err != nil {
		t.Fatalf("Failed to list chunks: %v", err)
	}

	// Should have only 2 unique chunks (duplicate content + unique content)
	if len(chunks) != 2 {
		t.Errorf("Expected 2 unique chunks, got %d", len(chunks))
	}

	// Verify chunk references
	for _, chunk := range chunks {
		files, err := repos.ChunkFiles.GetByChunkHash(ctx, chunk.ChunkHash)
		if err != nil {
			t.Errorf("Failed to get files for chunk %s: %v", chunk.ChunkHash, err)
		}

		// The duplicate content chunk should be referenced by 2 files
		if chunk.Size == int64(len("Duplicate content")) && len(files) != 2 {
			t.Errorf("Expected duplicate chunk to be referenced by 2 files, got %d", len(files))
		}
	}
}

// BackupEngine performs backup operations
type BackupEngine struct {
	repos    *database.Repositories
	s3Client interface {
		PutBlob(ctx context.Context, hash string, data []byte) error
		BlobExists(ctx context.Context, hash string) (bool, error)
	}
}

// Backup performs a backup of the given filesystem
func (b *BackupEngine) Backup(ctx context.Context, fsys fs.FS, root string) (string, error) {
	// Create a new snapshot
	hostname, _ := os.Hostname()
	snapshotID := time.Now().Format(time.RFC3339)
	snapshot := &database.Snapshot{
		ID:             snapshotID,
		Hostname:       hostname,
		VaultikVersion: "test",
		StartedAt:      time.Now(),
		CompletedAt:    nil,
	}

	// Create initial snapshot record
	err := b.repos.WithTx(ctx, func(ctx context.Context, tx *sql.Tx) error {
		return b.repos.Snapshots.Create(ctx, tx, snapshot)
	})
	if err != nil {
		return "", err
	}

	// Track counters
	var fileCount, chunkCount, blobCount, totalSize, blobSize int64

	// Track which chunks we've seen to handle deduplication
	processedChunks := make(map[string]bool)

	// Scan the filesystem and process files
	err = fs.WalkDir(fsys, root, func(path string, d fs.DirEntry, err error) error {
		if err != nil {
			return err
		}

		// Skip directories
		if d.IsDir() {
			return nil
		}

		// Get file info
		info, err := d.Info()
		if err != nil {
			return err
		}

		// Handle symlinks
		if info.Mode()&fs.ModeSymlink != 0 {
			// For testing, we'll skip symlinks since fstest doesn't support them well
			return nil
		}

		// Create file record in a short transaction
		file := &database.File{
			Path:  path,
			Size:  info.Size(),
			Mode:  uint32(info.Mode()),
			MTime: info.ModTime(),
			CTime: info.ModTime(), // Use mtime as ctime for test
			UID:   1000,           // Default UID for test
			GID:   1000,           // Default GID for test
		}
		err = b.repos.WithTx(ctx, func(ctx context.Context, tx *sql.Tx) error {
			return b.repos.Files.Create(ctx, tx, file)
		})
		if err != nil {
			return err
		}

		fileCount++
		totalSize += info.Size()

		// Read and process file in chunks
		f, err := fsys.Open(path)
		if err != nil {
			return err
		}
		defer func() {
			if err := f.Close(); err != nil {
				// Log but don't fail since we're already in an error path potentially
				fmt.Fprintf(os.Stderr, "Failed to close file: %v\n", err)
			}
		}()

		// Process file in chunks
		chunkIndex := 0
		buffer := make([]byte, defaultChunkSize)

		for {
			n, err := f.Read(buffer)
			if err != nil && err != io.EOF {
				return err
			}
			if n == 0 {
				break
			}

			chunkData := buffer[:n]
			chunkHash := calculateHash(chunkData)

			// Check if chunk already exists (outside of transaction)
			existingChunk, _ := b.repos.Chunks.GetByHash(ctx, chunkHash)
			if existingChunk == nil {
				// Create new chunk in a short transaction
				err = b.repos.WithTx(ctx, func(ctx context.Context, tx *sql.Tx) error {
					chunk := &database.Chunk{
						ChunkHash: chunkHash,
						SHA256:    chunkHash,
						Size:      int64(n),
					}
					return b.repos.Chunks.Create(ctx, tx, chunk)
				})
				if err != nil {
					return err
				}
				processedChunks[chunkHash] = true
			}

			// Create file-chunk mapping in a short transaction
			err = b.repos.WithTx(ctx, func(ctx context.Context, tx *sql.Tx) error {
				fileChunk := &database.FileChunk{
					FileID:    file.ID,
					Idx:       chunkIndex,
					ChunkHash: chunkHash,
				}
				return b.repos.FileChunks.Create(ctx, tx, fileChunk)
			})
			if err != nil {
				return err
			}

			// Create chunk-file mapping in a short transaction
			err = b.repos.WithTx(ctx, func(ctx context.Context, tx *sql.Tx) error {
				chunkFile := &database.ChunkFile{
					ChunkHash:  chunkHash,
					FileID:     file.ID,
					FileOffset: int64(chunkIndex * defaultChunkSize),
					Length:     int64(n),
				}
				return b.repos.ChunkFiles.Create(ctx, tx, chunkFile)
			})
			if err != nil {
				return err
			}

			chunkIndex++
		}

		return nil
	})

	if err != nil {
		return "", err
	}

	// After all files are processed, create blobs for new chunks
	for chunkHash := range processedChunks {
		// Get chunk data (outside of transaction)
		chunk, err := b.repos.Chunks.GetByHash(ctx, chunkHash)
		if err != nil {
			return "", err
		}

		chunkCount++

		// In a real system, blobs would contain multiple chunks and be encrypted
		// For testing, we'll create a blob with a "blob-" prefix to differentiate
		blobHash := "blob-" + chunkHash

		// For the test, we'll create dummy data since we don't have the original
		dummyData := []byte(chunkHash)

		// Upload to S3 as a blob
		if err := b.s3Client.PutBlob(ctx, blobHash, dummyData); err != nil {
			return "", err
		}

		// Create blob entry in a short transaction
		err = b.repos.WithTx(ctx, func(ctx context.Context, tx *sql.Tx) error {
			blob := &database.Blob{
				ID:        "test-blob-" + blobHash[:8],
				Hash:      blobHash,
				CreatedTS: time.Now(),
			}
			return b.repos.Blobs.Create(ctx, tx, blob)
		})
		if err != nil {
			return "", err
		}

		blobCount++
		blobSize += chunk.Size

		// Create blob-chunk mapping in a short transaction
		err = b.repos.WithTx(ctx, func(ctx context.Context, tx *sql.Tx) error {
			blobChunk := &database.BlobChunk{
				BlobID:    "test-blob-" + blobHash[:8],
				ChunkHash: chunkHash,
				Offset:    0,
				Length:    chunk.Size,
			}
			return b.repos.BlobChunks.Create(ctx, tx, blobChunk)
		})
		if err != nil {
			return "", err
		}

		// Add blob to snapshot in a short transaction
		err = b.repos.WithTx(ctx, func(ctx context.Context, tx *sql.Tx) error {
			return b.repos.Snapshots.AddBlob(ctx, tx, snapshotID, "test-blob-"+blobHash[:8], blobHash)
		})
		if err != nil {
			return "", err
		}
	}

	// Update snapshot with final counts
	err = b.repos.WithTx(ctx, func(ctx context.Context, tx *sql.Tx) error {
		return b.repos.Snapshots.UpdateCounts(ctx, tx, snapshotID, fileCount, chunkCount, blobCount, totalSize, blobSize)
	})

	if err != nil {
		return "", err
	}

	return snapshotID, nil
}

// calculateHash returns the hex-encoded SHA-256 digest of data.
func calculateHash(data []byte) string {
	h := sha256.New()
	h.Write(data)
	return fmt.Sprintf("%x", h.Sum(nil))
}

// generateLargeFileContent produces size bytes of test data whose byte pattern
// shifts at every chunk boundary so the resulting chunks do not deduplicate.
func generateLargeFileContent(size int) []byte {
	data := make([]byte, size)
	// Fill with pattern that changes every chunk to avoid deduplication
	for i := 0; i < size; i++ {
		chunkNum := i / defaultChunkSize
		data[i] = byte((i + chunkNum) % 256)
	}
	return data
}

const defaultChunkSize = 1024 * 1024 // 1MB chunks