diff --git a/internal/snapshot/scanner.go b/internal/snapshot/scanner.go index a9d9916..86e4ea9 100644 --- a/internal/snapshot/scanner.go +++ b/internal/snapshot/scanner.go @@ -133,16 +133,7 @@ func (s *Scanner) Scan(ctx context.Context, path string, snapshotID string) (*Sc defer s.progress.Stop() } - // Phase 0: Quick enumeration of all files on disk - fmt.Println("Enumerating files...") - existingFiles, err := s.enumerateFiles(ctx, path) - if err != nil && err != context.Canceled { - log.Warn("Failed to enumerate files", "error", err) - existingFiles = make(map[string]struct{}) - } - fmt.Printf("Found %s files\n", formatNumber(len(existingFiles))) - - // Phase 0b: Load known files from database into memory for fast lookup + // Phase 0: Load known files from database into memory for fast lookup fmt.Println("Loading known files from database...") knownFiles, err := s.loadKnownFiles(ctx, path) if err != nil { @@ -150,18 +141,20 @@ func (s *Scanner) Scan(ctx context.Context, path string, snapshotID string) (*Sc } fmt.Printf("Loaded %s known files from database\n", formatNumber(len(knownFiles))) - // Phase 0c: Check for deleted files by comparing DB against enumerated set (no filesystem access) - if err := s.detectDeletedFilesFromMap(ctx, knownFiles, existingFiles, result); err != nil { - return nil, fmt.Errorf("detecting deleted files: %w", err) - } - - // Phase 1: Scan directory and collect files to process + // Phase 1: Scan directory, collect files to process, and track existing files + // (builds existingFiles map during walk to avoid double traversal) log.Info("Phase 1/3: Scanning directory structure") + existingFiles := make(map[string]struct{}) filesToProcess, err := s.scanPhase(ctx, path, result, existingFiles, knownFiles) if err != nil { return nil, fmt.Errorf("scan phase failed: %w", err) } + // Phase 1b: Detect deleted files by comparing DB against scanned files + if err := s.detectDeletedFilesFromMap(ctx, knownFiles, existingFiles, result); err != nil { + return nil, fmt.Errorf("detecting deleted files: %w", err) + } + // Calculate total size to process var totalSizeToProcess int64 for _, file := range filesToProcess { @@ -225,66 +218,6 @@ func (s *Scanner) Scan(ctx context.Context, path string, snapshotID string) (*Sc return result, nil } -// enumerateFiles performs a quick enumeration to get all file paths without expensive stat() calls -// Returns a set of all file paths found on disk -func (s *Scanner) enumerateFiles(ctx context.Context, path string) (map[string]struct{}, error) { - files := make(map[string]struct{}) - startTime := time.Now() - lastStatusTime := time.Now() - statusInterval := 5 * time.Second - - var enumDir func(dirPath string) error - enumDir = func(dirPath string) error { - // Check context cancellation - select { - case <-ctx.Done(): - return ctx.Err() - default: - } - - f, err := s.fs.Open(dirPath) - if err != nil { - return nil // Skip directories we can't open - } - defer func() { _ = f.Close() }() - - for { - // Read directory entries in batches - entries, err := f.Readdir(1000) - if err != nil { - break // End of directory or error - } - - for _, entry := range entries { - fullPath := dirPath + "/" + entry.Name() - if entry.IsDir() { - if err := enumDir(fullPath); err != nil { - return err - } - } else if entry.Mode().IsRegular() { - files[fullPath] = struct{}{} - } - } - - // Periodic status update - if time.Since(lastStatusTime) >= statusInterval { - elapsed := time.Since(startTime).Round(time.Second) - fmt.Printf("Enumerating files: %s found (%s elapsed)\n", - formatNumber(len(files)), elapsed) - lastStatusTime = time.Now() - } - } - - return nil - } - - if err := enumDir(path); err != nil { - return files, err - } - - return files, nil -} - // loadKnownFiles loads all known files from the database into a map for fast lookup // This avoids per-file database queries during the scan phase func (s *Scanner) loadKnownFiles(ctx context.Context, path string) (map[string]*database.File, error) { @@ -303,8 +236,10 @@ func (s *Scanner) loadKnownFiles(ctx context.Context, path string) (map[string]* // scanPhase performs the initial directory scan to identify files to process // It uses the pre-loaded knownFiles map for fast change detection without DB queries +// It also populates existingFiles map for deletion detection func (s *Scanner) scanPhase(ctx context.Context, path string, result *ScanResult, existingFiles map[string]struct{}, knownFiles map[string]*database.File) ([]*FileToProcess, error) { - totalFiles := int64(len(existingFiles)) + // Use known file count as estimate for progress (accurate for subsequent backups) + estimatedTotal := int64(len(knownFiles)) var filesToProcess []*FileToProcess var allFiles []*database.File // Collect all files for batch insert @@ -335,6 +270,9 @@ func (s *Scanner) scanPhase(ctx context.Context, path string, result *ScanResult return nil } + // Track this file as existing (for deletion detection) + existingFiles[filePath] = struct{}{} + // Check file against in-memory map (no DB query!) file, needsProcessing := s.checkFileInMemory(filePath, info, knownFiles) @@ -365,17 +303,23 @@ func (s *Scanner) scanPhase(ctx context.Context, path string, result *ScanResult elapsed := time.Since(startTime) rate := float64(filesScanned) / elapsed.Seconds() - // Build status line - if totalFiles > 0 { - pct := float64(filesScanned) / float64(totalFiles) * 100 - remaining := totalFiles - filesScanned + // Build status line - use estimate if available (not first backup) + if estimatedTotal > 0 { + // Show actual scanned vs estimate (may exceed estimate if files were added) + pct := float64(filesScanned) / float64(estimatedTotal) * 100 + if pct > 100 { + pct = 100 // Cap at 100% for display + } + remaining := estimatedTotal - filesScanned + if remaining < 0 { + remaining = 0 + } var eta time.Duration - if rate > 0 { + if rate > 0 && remaining > 0 { eta = time.Duration(float64(remaining)/rate) * time.Second } - fmt.Printf("Scan: %s/%s files (%.1f%%), %s changed/new, %.0f files/sec, %s elapsed", + fmt.Printf("Scan: %s files (~%.0f%%), %s changed/new, %.0f files/sec, %s elapsed", formatNumber(int(filesScanned)), - formatNumber(int(totalFiles)), pct, formatNumber(changedCount), rate, @@ -385,6 +329,7 @@ func (s *Scanner) scanPhase(ctx context.Context, path string, result *ScanResult } fmt.Println() } else { + // First backup - no estimate available fmt.Printf("Scan: %s files, %s changed/new, %.0f files/sec, %s elapsed\n", formatNumber(int(filesScanned)), formatNumber(changedCount),