Eliminate redundant filesystem traversal in scan phase
Remove the separate enumerateFiles() function, which performed a full directory walk using Readdir() and therefore stat()ed every file. Instead, build the existingFiles map during the scan-phase walk and detect deleted files afterward. This eliminates one full filesystem traversal and significantly speeds up the scan phase for large directories.
parent c3725e745e
commit 88e2508dc7
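
The change amounts to folding the old enumeration pass into the existing scan walk. Below is a minimal sketch of the resulting flow, written against the standard library rather than this repo's Scanner/fs abstractions; all names in it are illustrative, not the project's actual API.

// Sketch only: one walk populates the existence set while scanning;
// deletion detection is then a pure map comparison, with no second traversal.
package main

import (
	"fmt"
	"io/fs"
	"path/filepath"
)

func main() {
	root := "/data" // hypothetical backup root

	// Stand-in for the paths loaded from the database (loadKnownFiles in the real code).
	knownFiles := map[string]struct{}{
		"/data/a.txt": {},
		"/data/b.txt": {},
	}

	// Single walk: record every regular file seen on disk while doing the scan work.
	existingFiles := make(map[string]struct{})
	_ = filepath.WalkDir(root, func(path string, d fs.DirEntry, err error) error {
		if err != nil {
			return nil // skip entries we cannot read
		}
		if d.Type().IsRegular() {
			existingFiles[path] = struct{}{}
			// ... change detection against knownFiles would happen here ...
		}
		return nil
	})

	// Afterwards: anything the database knows about but the walk did not see is deleted.
	for path := range knownFiles {
		if _, ok := existingFiles[path]; !ok {
			fmt.Println("deleted:", path)
		}
	}
}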
@@ -133,16 +133,7 @@ func (s *Scanner) Scan(ctx context.Context, path string, snapshotID string) (*Sc
 		defer s.progress.Stop()
 	}
 
-	// Phase 0: Quick enumeration of all files on disk
-	fmt.Println("Enumerating files...")
-	existingFiles, err := s.enumerateFiles(ctx, path)
-	if err != nil && err != context.Canceled {
-		log.Warn("Failed to enumerate files", "error", err)
-		existingFiles = make(map[string]struct{})
-	}
-	fmt.Printf("Found %s files\n", formatNumber(len(existingFiles)))
-
-	// Phase 0b: Load known files from database into memory for fast lookup
+	// Phase 0: Load known files from database into memory for fast lookup
 	fmt.Println("Loading known files from database...")
 	knownFiles, err := s.loadKnownFiles(ctx, path)
 	if err != nil {
@@ -150,18 +141,20 @@ func (s *Scanner) Scan(ctx context.Context, path string, snapshotID string) (*Sc
 	}
 	fmt.Printf("Loaded %s known files from database\n", formatNumber(len(knownFiles)))
 
-	// Phase 0c: Check for deleted files by comparing DB against enumerated set (no filesystem access)
-	if err := s.detectDeletedFilesFromMap(ctx, knownFiles, existingFiles, result); err != nil {
-		return nil, fmt.Errorf("detecting deleted files: %w", err)
-	}
-
-	// Phase 1: Scan directory and collect files to process
+	// Phase 1: Scan directory, collect files to process, and track existing files
+	// (builds existingFiles map during walk to avoid double traversal)
 	log.Info("Phase 1/3: Scanning directory structure")
+	existingFiles := make(map[string]struct{})
 	filesToProcess, err := s.scanPhase(ctx, path, result, existingFiles, knownFiles)
 	if err != nil {
 		return nil, fmt.Errorf("scan phase failed: %w", err)
 	}
 
+	// Phase 1b: Detect deleted files by comparing DB against scanned files
+	if err := s.detectDeletedFilesFromMap(ctx, knownFiles, existingFiles, result); err != nil {
+		return nil, fmt.Errorf("detecting deleted files: %w", err)
+	}
+
 	// Calculate total size to process
 	var totalSizeToProcess int64
 	for _, file := range filesToProcess {
@@ -225,66 +218,6 @@ func (s *Scanner) Scan(ctx context.Context, path string, snapshotID string) (*Sc
 	return result, nil
 }
 
-// enumerateFiles performs a quick enumeration to get all file paths without expensive stat() calls
-// Returns a set of all file paths found on disk
-func (s *Scanner) enumerateFiles(ctx context.Context, path string) (map[string]struct{}, error) {
-	files := make(map[string]struct{})
-	startTime := time.Now()
-	lastStatusTime := time.Now()
-	statusInterval := 5 * time.Second
-
-	var enumDir func(dirPath string) error
-	enumDir = func(dirPath string) error {
-		// Check context cancellation
-		select {
-		case <-ctx.Done():
-			return ctx.Err()
-		default:
-		}
-
-		f, err := s.fs.Open(dirPath)
-		if err != nil {
-			return nil // Skip directories we can't open
-		}
-		defer func() { _ = f.Close() }()
-
-		for {
-			// Read directory entries in batches
-			entries, err := f.Readdir(1000)
-			if err != nil {
-				break // End of directory or error
-			}
-
-			for _, entry := range entries {
-				fullPath := dirPath + "/" + entry.Name()
-				if entry.IsDir() {
-					if err := enumDir(fullPath); err != nil {
-						return err
-					}
-				} else if entry.Mode().IsRegular() {
-					files[fullPath] = struct{}{}
-				}
-			}
-
-			// Periodic status update
-			if time.Since(lastStatusTime) >= statusInterval {
-				elapsed := time.Since(startTime).Round(time.Second)
-				fmt.Printf("Enumerating files: %s found (%s elapsed)\n",
-					formatNumber(len(files)), elapsed)
-				lastStatusTime = time.Now()
-			}
-		}
-
-		return nil
-	}
-
-	if err := enumDir(path); err != nil {
-		return files, err
-	}
-
-	return files, nil
-}
-
 // loadKnownFiles loads all known files from the database into a map for fast lookup
 // This avoids per-file database queries during the scan phase
 func (s *Scanner) loadKnownFiles(ctx context.Context, path string) (map[string]*database.File, error) {
@@ -303,8 +236,10 @@ func (s *Scanner) loadKnownFiles(ctx context.Context, path string) (map[string]*
 
 // scanPhase performs the initial directory scan to identify files to process
 // It uses the pre-loaded knownFiles map for fast change detection without DB queries
+// It also populates existingFiles map for deletion detection
 func (s *Scanner) scanPhase(ctx context.Context, path string, result *ScanResult, existingFiles map[string]struct{}, knownFiles map[string]*database.File) ([]*FileToProcess, error) {
-	totalFiles := int64(len(existingFiles))
+	// Use known file count as estimate for progress (accurate for subsequent backups)
+	estimatedTotal := int64(len(knownFiles))
 
 	var filesToProcess []*FileToProcess
 	var allFiles []*database.File // Collect all files for batch insert
@@ -335,6 +270,9 @@ func (s *Scanner) scanPhase(ctx context.Context, path string, result *ScanResult
 			return nil
 		}
 
+		// Track this file as existing (for deletion detection)
+		existingFiles[filePath] = struct{}{}
+
 		// Check file against in-memory map (no DB query!)
 		file, needsProcessing := s.checkFileInMemory(filePath, info, knownFiles)
 
@@ -365,17 +303,23 @@ func (s *Scanner) scanPhase(ctx context.Context, path string, result *ScanResult
 			elapsed := time.Since(startTime)
 			rate := float64(filesScanned) / elapsed.Seconds()
 
-			// Build status line
-			if totalFiles > 0 {
-				pct := float64(filesScanned) / float64(totalFiles) * 100
-				remaining := totalFiles - filesScanned
+			// Build status line - use estimate if available (not first backup)
+			if estimatedTotal > 0 {
+				// Show actual scanned vs estimate (may exceed estimate if files were added)
+				pct := float64(filesScanned) / float64(estimatedTotal) * 100
+				if pct > 100 {
+					pct = 100 // Cap at 100% for display
+				}
+				remaining := estimatedTotal - filesScanned
+				if remaining < 0 {
+					remaining = 0
+				}
 				var eta time.Duration
-				if rate > 0 {
+				if rate > 0 && remaining > 0 {
 					eta = time.Duration(float64(remaining)/rate) * time.Second
 				}
-				fmt.Printf("Scan: %s/%s files (%.1f%%), %s changed/new, %.0f files/sec, %s elapsed",
+				fmt.Printf("Scan: %s files (~%.0f%%), %s changed/new, %.0f files/sec, %s elapsed",
 					formatNumber(int(filesScanned)),
-					formatNumber(int(totalFiles)),
 					pct,
 					formatNumber(changedCount),
 					rate,
@@ -385,6 +329,7 @@ func (s *Scanner) scanPhase(ctx context.Context, path string, result *ScanResult
 				}
 				fmt.Println()
 			} else {
+				// First backup - no estimate available
 				fmt.Printf("Scan: %s files, %s changed/new, %.0f files/sec, %s elapsed\n",
 					formatNumber(int(filesScanned)),
 					formatNumber(changedCount),