Eliminate redundant filesystem traversal in scan phase

Remove the separate enumerateFiles() function, which performed a full
directory walk via Readdir() and therefore stat()ed every file.
Instead, build the existingFiles map during the scan-phase walk and
detect deleted files afterward.

This eliminates one full filesystem traversal, significantly speeding
up the scan phase for large directories.
Jeffrey Paul 2025-12-19 12:15:13 +07:00
parent c3725e745e
commit 88e2508dc7
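
For orientation before the hunks: the commit folds enumeration into the walk the scanner already performs, so a single traversal yields both the work list and the set used for deletion detection. A minimal sketch of that pattern, using filepath.WalkDir and hypothetical names rather than the project's Scanner and fs abstraction:

import (
    "io/fs"
    "path/filepath"
)

// scanOnce walks root a single time and returns the set of regular files seen.
// In the real scanner this bookkeeping happens inside scanPhase's walk callback,
// alongside change detection, so no second traversal is needed.
func scanOnce(root string) (map[string]struct{}, error) {
    existing := make(map[string]struct{})
    err := filepath.WalkDir(root, func(p string, d fs.DirEntry, walkErr error) error {
        if walkErr != nil {
            return nil // skip unreadable entries rather than aborting the walk
        }
        if d.Type().IsRegular() {
            existing[p] = struct{}{} // remember for later deletion detection
        }
        return nil
    })
    return existing, err
}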


@@ -133,16 +133,7 @@ func (s *Scanner) Scan(ctx context.Context, path string, snapshotID string) (*Sc
defer s.progress.Stop()
}
// Phase 0: Quick enumeration of all files on disk
fmt.Println("Enumerating files...")
existingFiles, err := s.enumerateFiles(ctx, path)
if err != nil && err != context.Canceled {
log.Warn("Failed to enumerate files", "error", err)
existingFiles = make(map[string]struct{})
}
fmt.Printf("Found %s files\n", formatNumber(len(existingFiles)))
// Phase 0b: Load known files from database into memory for fast lookup
// Phase 0: Load known files from database into memory for fast lookup
fmt.Println("Loading known files from database...")
knownFiles, err := s.loadKnownFiles(ctx, path)
if err != nil {
@@ -150,18 +141,20 @@ func (s *Scanner) Scan(ctx context.Context, path string, snapshotID string) (*Sc
}
fmt.Printf("Loaded %s known files from database\n", formatNumber(len(knownFiles)))
// Phase 0c: Check for deleted files by comparing DB against enumerated set (no filesystem access)
if err := s.detectDeletedFilesFromMap(ctx, knownFiles, existingFiles, result); err != nil {
return nil, fmt.Errorf("detecting deleted files: %w", err)
}
// Phase 1: Scan directory and collect files to process
// Phase 1: Scan directory, collect files to process, and track existing files
// (builds existingFiles map during walk to avoid double traversal)
log.Info("Phase 1/3: Scanning directory structure")
existingFiles := make(map[string]struct{})
filesToProcess, err := s.scanPhase(ctx, path, result, existingFiles, knownFiles)
if err != nil {
return nil, fmt.Errorf("scan phase failed: %w", err)
}
// Phase 1b: Detect deleted files by comparing DB against scanned files
if err := s.detectDeletedFilesFromMap(ctx, knownFiles, existingFiles, result); err != nil {
return nil, fmt.Errorf("detecting deleted files: %w", err)
}
// Calculate total size to process
var totalSizeToProcess int64
for _, file := range filesToProcess {
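
The detectDeletedFilesFromMap helper called above is not shown in this diff; conceptually it is a pure in-memory comparison with no filesystem access. A rough sketch (hypothetical name and signature; the real method records results on the ScanResult):

// Any path the database knows about that was not seen during the walk
// must have been deleted since the last backup.
func deletedPaths(known map[string]*database.File, existing map[string]struct{}) []string {
    var deleted []string
    for path := range known {
        if _, ok := existing[path]; !ok {
            deleted = append(deleted, path)
        }
    }
    return deleted
}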
@@ -225,66 +218,6 @@ func (s *Scanner) Scan(ctx context.Context, path string, snapshotID string) (*Sc
return result, nil
}
// enumerateFiles performs a quick enumeration to get all file paths without expensive stat() calls
// Returns a set of all file paths found on disk
func (s *Scanner) enumerateFiles(ctx context.Context, path string) (map[string]struct{}, error) {
files := make(map[string]struct{})
startTime := time.Now()
lastStatusTime := time.Now()
statusInterval := 5 * time.Second
var enumDir func(dirPath string) error
enumDir = func(dirPath string) error {
// Check context cancellation
select {
case <-ctx.Done():
return ctx.Err()
default:
}
f, err := s.fs.Open(dirPath)
if err != nil {
return nil // Skip directories we can't open
}
defer func() { _ = f.Close() }()
for {
// Read directory entries in batches
entries, err := f.Readdir(1000)
if err != nil {
break // End of directory or error
}
for _, entry := range entries {
fullPath := dirPath + "/" + entry.Name()
if entry.IsDir() {
if err := enumDir(fullPath); err != nil {
return err
}
} else if entry.Mode().IsRegular() {
files[fullPath] = struct{}{}
}
}
// Periodic status update
if time.Since(lastStatusTime) >= statusInterval {
elapsed := time.Since(startTime).Round(time.Second)
fmt.Printf("Enumerating files: %s found (%s elapsed)\n",
formatNumber(len(files)), elapsed)
lastStatusTime = time.Now()
}
}
return nil
}
if err := enumDir(path); err != nil {
return files, err
}
return files, nil
}
// loadKnownFiles loads all known files from the database into a map for fast lookup
// This avoids per-file database queries during the scan phase
func (s *Scanner) loadKnownFiles(ctx context.Context, path string) (map[string]*database.File, error) {
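
As background for the stat() cost mentioned in the commit message: os.File.Readdir returns FileInfo values, which means an lstat per entry, whereas the newer ReadDir returns lightweight DirEntry values. The commit sidesteps the question entirely by dropping the extra pass, but the difference is roughly this (plain os calls, outside the scanner's fs abstraction; illustrative only):

import "os"

// listRegular enumerates a directory with ReadDir, which yields names and
// type bits without stat()ing each entry the way Readdir's FileInfo does.
func listRegular(dirPath string) ([]string, error) {
    f, err := os.Open(dirPath)
    if err != nil {
        return nil, err
    }
    defer f.Close()

    entries, err := f.ReadDir(-1) // -1: read all remaining entries
    if err != nil {
        return nil, err
    }
    names := make([]string, 0, len(entries))
    for _, e := range entries {
        if e.Type().IsRegular() {
            names = append(names, e.Name())
        }
    }
    return names, nil
}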
@@ -303,8 +236,10 @@ func (s *Scanner) loadKnownFiles(ctx context.Context, path string) (map[string]*
// scanPhase performs the initial directory scan to identify files to process
// It uses the pre-loaded knownFiles map for fast change detection without DB queries
// It also populates existingFiles map for deletion detection
func (s *Scanner) scanPhase(ctx context.Context, path string, result *ScanResult, existingFiles map[string]struct{}, knownFiles map[string]*database.File) ([]*FileToProcess, error) {
totalFiles := int64(len(existingFiles))
// Use known file count as estimate for progress (accurate for subsequent backups)
estimatedTotal := int64(len(knownFiles))
var filesToProcess []*FileToProcess
var allFiles []*database.File // Collect all files for batch insert
@@ -335,6 +270,9 @@ func (s *Scanner) scanPhase(ctx context.Context, path string, result *ScanResult
return nil
}
// Track this file as existing (for deletion detection)
existingFiles[filePath] = struct{}{}
// Check file against in-memory map (no DB query!)
file, needsProcessing := s.checkFileInMemory(filePath, info, knownFiles)
@@ -365,17 +303,23 @@ func (s *Scanner) scanPhase(ctx context.Context, path string, result *ScanResult
elapsed := time.Since(startTime)
rate := float64(filesScanned) / elapsed.Seconds()
// Build status line
if totalFiles > 0 {
pct := float64(filesScanned) / float64(totalFiles) * 100
remaining := totalFiles - filesScanned
// Build status line - use estimate if available (not first backup)
if estimatedTotal > 0 {
// Show actual scanned vs estimate (may exceed estimate if files were added)
pct := float64(filesScanned) / float64(estimatedTotal) * 100
if pct > 100 {
pct = 100 // Cap at 100% for display
}
remaining := estimatedTotal - filesScanned
if remaining < 0 {
remaining = 0
}
var eta time.Duration
if rate > 0 {
if rate > 0 && remaining > 0 {
eta = time.Duration(float64(remaining)/rate) * time.Second
}
fmt.Printf("Scan: %s/%s files (%.1f%%), %s changed/new, %.0f files/sec, %s elapsed",
fmt.Printf("Scan: %s files (~%.0f%%), %s changed/new, %.0f files/sec, %s elapsed",
formatNumber(int(filesScanned)),
formatNumber(int(totalFiles)),
pct,
formatNumber(changedCount),
rate,
@@ -385,6 +329,7 @@ func (s *Scanner) scanPhase(ctx context.Context, path string, result *ScanResult
}
fmt.Println()
} else {
// First backup - no estimate available
fmt.Printf("Scan: %s files, %s changed/new, %.0f files/sec, %s elapsed\n",
formatNumber(int(filesScanned)),
formatNumber(changedCount),