From 88e2508dc796fda09e1f2bc01b75927db2e2a6ee Mon Sep 17 00:00:00 2001
From: sneak
Date: Fri, 19 Dec 2025 12:15:13 +0700
Subject: [PATCH] Eliminate redundant filesystem traversal in scan phase

Remove the separate enumerateFiles() function, which did a full
directory walk using Readdir() and therefore called stat() on every
file. Instead, build the existingFiles map during the scan phase walk
and detect deleted files afterward.

This eliminates one full filesystem traversal, significantly speeding
up the scan phase for large directories.
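Conceptually, once the walk has populated existingFiles, deletion
detection reduces to an in-memory set difference between the database
view and what was actually seen on disk. A minimal sketch of that
comparison (illustrative only; the real logic lives in
detectDeletedFilesFromMap, and deletedPaths is a hypothetical helper):

    // deletedPaths returns every path the database knows about that the
    // scan walk did not encounter, i.e. files deleted since the last run.
    func deletedPaths(knownFiles map[string]*database.File, existingFiles map[string]struct{}) []string {
        var deleted []string
        for path := range knownFiles {
            if _, seen := existingFiles[path]; !seen {
                deleted = append(deleted, path)
            }
        }
        return deleted
    }

No extra stat() calls are needed for this step, since both maps are
already in memory by the time the walk finishes.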
---
 internal/snapshot/scanner.go | 113 +++++++++--------------------------
 1 file changed, 29 insertions(+), 84 deletions(-)

diff --git a/internal/snapshot/scanner.go b/internal/snapshot/scanner.go
index a9d9916..86e4ea9 100644
--- a/internal/snapshot/scanner.go
+++ b/internal/snapshot/scanner.go
@@ -133,16 +133,7 @@ func (s *Scanner) Scan(ctx context.Context, path string, snapshotID string) (*Sc
 		defer s.progress.Stop()
 	}
 
-	// Phase 0: Quick enumeration of all files on disk
-	fmt.Println("Enumerating files...")
-	existingFiles, err := s.enumerateFiles(ctx, path)
-	if err != nil && err != context.Canceled {
-		log.Warn("Failed to enumerate files", "error", err)
-		existingFiles = make(map[string]struct{})
-	}
-	fmt.Printf("Found %s files\n", formatNumber(len(existingFiles)))
-
-	// Phase 0b: Load known files from database into memory for fast lookup
+	// Phase 0: Load known files from database into memory for fast lookup
 	fmt.Println("Loading known files from database...")
 	knownFiles, err := s.loadKnownFiles(ctx, path)
 	if err != nil {
@@ -150,18 +141,20 @@ func (s *Scanner) Scan(ctx context.Context, path string, snapshotID string) (*Sc
 	}
 	fmt.Printf("Loaded %s known files from database\n", formatNumber(len(knownFiles)))
 
-	// Phase 0c: Check for deleted files by comparing DB against enumerated set (no filesystem access)
-	if err := s.detectDeletedFilesFromMap(ctx, knownFiles, existingFiles, result); err != nil {
-		return nil, fmt.Errorf("detecting deleted files: %w", err)
-	}
-
-	// Phase 1: Scan directory and collect files to process
+	// Phase 1: Scan directory, collect files to process, and track existing files
+	// (builds existingFiles map during walk to avoid double traversal)
 	log.Info("Phase 1/3: Scanning directory structure")
+	existingFiles := make(map[string]struct{})
 	filesToProcess, err := s.scanPhase(ctx, path, result, existingFiles, knownFiles)
 	if err != nil {
 		return nil, fmt.Errorf("scan phase failed: %w", err)
 	}
 
+	// Phase 1b: Detect deleted files by comparing DB against scanned files
+	if err := s.detectDeletedFilesFromMap(ctx, knownFiles, existingFiles, result); err != nil {
+		return nil, fmt.Errorf("detecting deleted files: %w", err)
+	}
+
 	// Calculate total size to process
 	var totalSizeToProcess int64
 	for _, file := range filesToProcess {
@@ -225,66 +218,6 @@ func (s *Scanner) Scan(ctx context.Context, path string, snapshotID string) (*Sc
 	return result, nil
 }
 
-// enumerateFiles performs a quick enumeration to get all file paths without expensive stat() calls
-// Returns a set of all file paths found on disk
-func (s *Scanner) enumerateFiles(ctx context.Context, path string) (map[string]struct{}, error) {
-	files := make(map[string]struct{})
-	startTime := time.Now()
-	lastStatusTime := time.Now()
-	statusInterval := 5 * time.Second
-
-	var enumDir func(dirPath string) error
-	enumDir = func(dirPath string) error {
-		// Check context cancellation
-		select {
-		case <-ctx.Done():
-			return ctx.Err()
-		default:
-		}
-
-		f, err := s.fs.Open(dirPath)
-		if err != nil {
-			return nil // Skip directories we can't open
-		}
-		defer func() { _ = f.Close() }()
-
-		for {
-			// Read directory entries in batches
-			entries, err := f.Readdir(1000)
-			if err != nil {
-				break // End of directory or error
-			}
-
-			for _, entry := range entries {
-				fullPath := dirPath + "/" + entry.Name()
-				if entry.IsDir() {
-					if err := enumDir(fullPath); err != nil {
-						return err
-					}
-				} else if entry.Mode().IsRegular() {
-					files[fullPath] = struct{}{}
-				}
-			}
-
-			// Periodic status update
-			if time.Since(lastStatusTime) >= statusInterval {
-				elapsed := time.Since(startTime).Round(time.Second)
-				fmt.Printf("Enumerating files: %s found (%s elapsed)\n",
-					formatNumber(len(files)), elapsed)
-				lastStatusTime = time.Now()
-			}
-		}
-
-		return nil
-	}
-
-	if err := enumDir(path); err != nil {
-		return files, err
-	}
-
-	return files, nil
-}
-
 // loadKnownFiles loads all known files from the database into a map for fast lookup
 // This avoids per-file database queries during the scan phase
 func (s *Scanner) loadKnownFiles(ctx context.Context, path string) (map[string]*database.File, error) {
@@ -303,8 +236,10 @@ func (s *Scanner) loadKnownFiles(ctx context.Context, path string) (map[string]*
 
 // scanPhase performs the initial directory scan to identify files to process
 // It uses the pre-loaded knownFiles map for fast change detection without DB queries
+// It also populates existingFiles map for deletion detection
 func (s *Scanner) scanPhase(ctx context.Context, path string, result *ScanResult, existingFiles map[string]struct{}, knownFiles map[string]*database.File) ([]*FileToProcess, error) {
-	totalFiles := int64(len(existingFiles))
+	// Use known file count as estimate for progress (accurate for subsequent backups)
+	estimatedTotal := int64(len(knownFiles))
 	var filesToProcess []*FileToProcess
 	var allFiles []*database.File // Collect all files for batch insert
 
@@ -335,6 +270,9 @@ func (s *Scanner) scanPhase(ctx context.Context, path string, result *ScanResult
 			return nil
 		}
 
+		// Track this file as existing (for deletion detection)
+		existingFiles[filePath] = struct{}{}
+
 		// Check file against in-memory map (no DB query!)
 		file, needsProcessing := s.checkFileInMemory(filePath, info, knownFiles)
 
@@ -365,17 +303,23 @@ func (s *Scanner) scanPhase(ctx context.Context, path string, result *ScanResult
 			elapsed := time.Since(startTime)
 			rate := float64(filesScanned) / elapsed.Seconds()
 
-			// Build status line
-			if totalFiles > 0 {
-				pct := float64(filesScanned) / float64(totalFiles) * 100
-				remaining := totalFiles - filesScanned
+			// Build status line - use estimate if available (not first backup)
+			if estimatedTotal > 0 {
+				// Show actual scanned vs estimate (may exceed estimate if files were added)
+				pct := float64(filesScanned) / float64(estimatedTotal) * 100
+				if pct > 100 {
+					pct = 100 // Cap at 100% for display
+				}
+				remaining := estimatedTotal - filesScanned
+				if remaining < 0 {
+					remaining = 0
+				}
 				var eta time.Duration
-				if rate > 0 {
+				if rate > 0 && remaining > 0 {
 					eta = time.Duration(float64(remaining)/rate) * time.Second
 				}
-				fmt.Printf("Scan: %s/%s files (%.1f%%), %s changed/new, %.0f files/sec, %s elapsed",
+				fmt.Printf("Scan: %s files (~%.0f%%), %s changed/new, %.0f files/sec, %s elapsed",
 					formatNumber(int(filesScanned)),
-					formatNumber(int(totalFiles)),
 					pct,
 					formatNumber(changedCount),
 					rate,
@@ -385,6 +329,7 @@ func (s *Scanner) scanPhase(ctx context.Context, path string, result *ScanResult
 				}
 				fmt.Println()
 			} else {
+				// First backup - no estimate available
 				fmt.Printf("Scan: %s files, %s changed/new, %.0f files/sec, %s elapsed\n",
 					formatNumber(int(filesScanned)),
 					formatNumber(changedCount),