refactor: break up scanner.go methods into smaller helpers

2026-03-17 02:38:55 -07:00
parent e5b94bd917
commit 75c25280c3
1 changed files with 238 additions and 155 deletions
--- a/internal/snapshot/scanner.go
+++ b/internal/snapshot/scanner.go
@@ -180,18 +180,10 @@ func (s *Scanner) Scan(ctx context.Context, path string, snapshotID string) (*Sc
 	}
 	// Phase 0: Load known files and chunks from database into memory for fast lookup
-	fmt.Println("Loading known files from database...")
+	knownFiles, err := s.loadDatabaseState(ctx, path)
 	knownFiles, err := s.loadKnownFiles(ctx, path)
 	if err != nil {
-		return nil, fmt.Errorf("loading known files: %w", err)
+		return nil, err
 	}
 	fmt.Printf("Loaded %s known files from database\n", formatNumber(len(knownFiles)))
 	fmt.Println("Loading known chunks from database...")
 	if err := s.loadKnownChunks(ctx); err != nil {
 		return nil, fmt.Errorf("loading known chunks: %w", err)
 	}
 	fmt.Printf("Loaded %s known chunks from database\n", formatNumber(len(s.knownChunks)))
 	// Phase 1: Scan directory, collect files to process, and track existing files
 	// (builds existingFiles map during walk to avoid double traversal)
@@ -216,36 +208,8 @@ func (s *Scanner) Scan(ctx context.Context, path string, snapshotID string) (*Sc
 		}
 	}
-	// Calculate total size to process
+	// Summarize scan phase results and update progress
-	var totalSizeToProcess int64
+	s.summarizeScanPhase(result, filesToProcess)
 	for _, file := range filesToProcess {
 		totalSizeToProcess += file.FileInfo.Size()
 	}
 	// Update progress with total size and file count
 	if s.progress != nil {
 		s.progress.SetTotalSize(totalSizeToProcess)
 		s.progress.GetStats().TotalFiles.Store(int64(len(filesToProcess)))
 	}
 	log.Info("Phase 1 complete",
 		"total_files", len(filesToProcess),
 		"total_size", humanize.Bytes(uint64(totalSizeToProcess)),
 		"files_skipped", result.FilesSkipped,
 		"bytes_skipped", humanize.Bytes(uint64(result.BytesSkipped)))
 	// Print scan summary
 	fmt.Printf("Scan complete: %s examined (%s), %s to process (%s)",
 		formatNumber(result.FilesScanned),
 		humanize.Bytes(uint64(totalSizeToProcess+result.BytesSkipped)),
 		formatNumber(len(filesToProcess)),
 		humanize.Bytes(uint64(totalSizeToProcess)))
 	if result.FilesDeleted > 0 {
 		fmt.Printf(", %s deleted (%s)",
 			formatNumber(result.FilesDeleted),
 			humanize.Bytes(uint64(result.BytesDeleted)))
 	}
 	fmt.Println()
 	// Phase 2: Process files and create chunks
 	if len(filesToProcess) > 0 {
@@ -259,7 +223,66 @@ func (s *Scanner) Scan(ctx context.Context, path string, snapshotID string) (*Sc
 		log.Info("Phase 2/3: Skipping (no files need processing, metadata-only snapshot)")
 	}
-	// Get final stats from packer
+	// Finalize result with blob statistics
 	s.finalizeScanResult(ctx, result)
 	return result, nil
 }
 // loadDatabaseState loads known files and chunks from the database into memory for fast lookup
 // This avoids per-file and per-chunk database queries during the scan and process phases
 func (s *Scanner) loadDatabaseState(ctx context.Context, path string) (map[string]*database.File, error) {
 	fmt.Println("Loading known files from database...")
 	knownFiles, err := s.loadKnownFiles(ctx, path)
 	if err != nil {
 		return nil, fmt.Errorf("loading known files: %w", err)
 	}
 	fmt.Printf("Loaded %s known files from database\n", formatNumber(len(knownFiles)))
 	fmt.Println("Loading known chunks from database...")
 	if err := s.loadKnownChunks(ctx); err != nil {
 		return nil, fmt.Errorf("loading known chunks: %w", err)
 	}
 	fmt.Printf("Loaded %s known chunks from database\n", formatNumber(len(s.knownChunks)))
 	return knownFiles, nil
 }
 // summarizeScanPhase calculates total size to process, updates progress tracking,
 // and prints the scan phase summary with file counts and sizes
 func (s *Scanner) summarizeScanPhase(result *ScanResult, filesToProcess []*FileToProcess) {
 	var totalSizeToProcess int64
 	for _, file := range filesToProcess {
 		totalSizeToProcess += file.FileInfo.Size()
 	}
 	if s.progress != nil {
 		s.progress.SetTotalSize(totalSizeToProcess)
 		s.progress.GetStats().TotalFiles.Store(int64(len(filesToProcess)))
 	}
 	log.Info("Phase 1 complete",
 		"total_files", len(filesToProcess),
 		"total_size", humanize.Bytes(uint64(totalSizeToProcess)),
 		"files_skipped", result.FilesSkipped,
 		"bytes_skipped", humanize.Bytes(uint64(result.BytesSkipped)))
 	fmt.Printf("Scan complete: %s examined (%s), %s to process (%s)",
 		formatNumber(result.FilesScanned),
 		humanize.Bytes(uint64(totalSizeToProcess+result.BytesSkipped)),
 		formatNumber(len(filesToProcess)),
 		humanize.Bytes(uint64(totalSizeToProcess)))
 	if result.FilesDeleted > 0 {
 		fmt.Printf(", %s deleted (%s)",
 			formatNumber(result.FilesDeleted),
 			humanize.Bytes(uint64(result.BytesDeleted)))
 	}
 	fmt.Println()
 }
 // finalizeScanResult populates final blob statistics in the scan result
 // by querying the packer and database for blob/upload counts
 func (s *Scanner) finalizeScanResult(ctx context.Context, result *ScanResult) {
 	blobs := s.packer.GetFinishedBlobs()
 	result.BlobsCreated += len(blobs)
@@ -276,7 +299,6 @@ func (s *Scanner) Scan(ctx context.Context, path string, snapshotID string) (*Sc
 	}
 	result.EndTime = time.Now().UTC()
 	return result, nil
 }
 // loadKnownFiles loads all known files from the database into a map for fast lookup
@@ -424,12 +446,38 @@ func (s *Scanner) flushCompletedPendingFiles(ctx context.Context) error {
 	flushStart := time.Now()
 	log.Debug("flushCompletedPendingFiles: starting")
 	// Partition pending files into those ready to flush and those still waiting
 	canFlush, stillPendingCount := s.partitionPendingByChunkStatus()
 	if len(canFlush) == 0 {
 		log.Debug("flushCompletedPendingFiles: nothing to flush")
 		return nil
 	}
 	log.Debug("Flushing completed files after blob finalize",
 		"files_to_flush", len(canFlush),
 		"files_still_pending", stillPendingCount)
 	// Collect all data for batch operations
 	allFiles, allFileIDs, allFileChunks, allChunkFiles := s.collectBatchFlushData(canFlush)
 	// Execute the batch flush in a single transaction
 	log.Debug("flushCompletedPendingFiles: starting transaction")
 	txStart := time.Now()
 	err := s.executeBatchFileFlush(ctx, allFiles, allFileIDs, allFileChunks, allChunkFiles)
 	log.Debug("flushCompletedPendingFiles: transaction done", "duration", time.Since(txStart))
 	log.Debug("flushCompletedPendingFiles: total duration", "duration", time.Since(flushStart))
 	return err
 }
 // partitionPendingByChunkStatus separates pending files into those whose chunks
 // are all committed to DB (ready to flush) and those still waiting on pending chunks.
 // Updates s.pendingFiles to contain only the still-pending files.
 func (s *Scanner) partitionPendingByChunkStatus() (canFlush []pendingFileData, stillPendingCount int) {
 	log.Debug("flushCompletedPendingFiles: acquiring pendingFilesMu lock")
 	s.pendingFilesMu.Lock()
 	log.Debug("flushCompletedPendingFiles: acquired lock", "pending_files", len(s.pendingFiles))
 	// Separate files into complete (can flush) and incomplete (keep pending)
 	var canFlush []pendingFileData
 	var stillPending []pendingFileData
 	log.Debug("flushCompletedPendingFiles: checking which files can flush")
@@ -454,18 +502,15 @@ func (s *Scanner) flushCompletedPendingFiles(ctx context.Context) error {
 	s.pendingFilesMu.Unlock()
 	log.Debug("flushCompletedPendingFiles: released lock")
-	if len(canFlush) == 0 {
+	return canFlush, len(stillPending)
-		log.Debug("flushCompletedPendingFiles: nothing to flush")
+}
 		return nil
 	}
-	log.Debug("Flushing completed files after blob finalize",
+// collectBatchFlushData aggregates file records, IDs, file-chunk mappings, and chunk-file
-		"files_to_flush", len(canFlush),
+// mappings from the given pending file data for efficient batch database operations
-		"files_still_pending", len(stillPending))
+func (s *Scanner) collectBatchFlushData(canFlush []pendingFileData) ([]*database.File, []types.FileID, []database.FileChunk, []database.ChunkFile) {
 	// Collect all data for batch operations
 	log.Debug("flushCompletedPendingFiles: collecting data for batch ops")
 	collectStart := time.Now()
 	var allFileChunks []database.FileChunk
 	var allChunkFiles []database.ChunkFile
 	var allFileIDs []types.FileID
@@ -477,16 +522,20 @@ func (s *Scanner) flushCompletedPendingFiles(ctx context.Context) error {
 		allFileIDs = append(allFileIDs, data.file.ID)
 		allFiles = append(allFiles, data.file)
 	}
 	log.Debug("flushCompletedPendingFiles: collected data",
 		"duration", time.Since(collectStart),
 		"file_chunks", len(allFileChunks),
 		"chunk_files", len(allChunkFiles),
 		"files", len(allFiles))
-	// Flush the complete files using batch operations
+	return allFiles, allFileIDs, allFileChunks, allChunkFiles
-	log.Debug("flushCompletedPendingFiles: starting transaction")
+}
-	txStart := time.Now()
+
-	err := s.repos.WithTx(ctx, func(txCtx context.Context, tx *sql.Tx) error {
+// executeBatchFileFlush writes all collected file data to the database in a single transaction,
 // including deleting old mappings, creating file records, and adding snapshot associations
 func (s *Scanner) executeBatchFileFlush(ctx context.Context, allFiles []*database.File, allFileIDs []types.FileID, allFileChunks []database.FileChunk, allChunkFiles []database.ChunkFile) error {
 	return s.repos.WithTx(ctx, func(txCtx context.Context, tx *sql.Tx) error {
 		log.Debug("flushCompletedPendingFiles: inside transaction")
 		// Batch delete old file_chunks and chunk_files
@@ -539,9 +588,6 @@ func (s *Scanner) flushCompletedPendingFiles(ctx context.Context) error {
 		log.Debug("flushCompletedPendingFiles: transaction complete")
 		return nil
 	})
 	log.Debug("flushCompletedPendingFiles: transaction done", "duration", time.Since(txStart))
 	log.Debug("flushCompletedPendingFiles: total duration", "duration", time.Since(flushStart))
 	return err
 }
 // ScanPhaseResult contains the results of the scan phase
@@ -623,62 +669,11 @@ func (s *Scanner) scanPhase(ctx context.Context, path string, result *ScanResult
 		mu.Unlock()
 		// Update result stats
-		if needsProcessing {
+		s.updateScanEntryStats(result, needsProcessing, info)
 			result.BytesScanned += info.Size()
 			if s.progress != nil {
 				s.progress.GetStats().BytesScanned.Add(info.Size())
 			}
 		} else {
 			result.FilesSkipped++
 			result.BytesSkipped += info.Size()
 			if s.progress != nil {
 				s.progress.GetStats().FilesSkipped.Add(1)
 				s.progress.GetStats().BytesSkipped.Add(info.Size())
 			}
 		}
 		result.FilesScanned++
 		if s.progress != nil {
 			s.progress.GetStats().FilesScanned.Add(1)
 		}
 		// Output periodic status
 		if time.Since(lastStatusTime) >= statusInterval {
-			elapsed := time.Since(startTime)
+			printScanProgressLine(filesScanned, changedCount, estimatedTotal, startTime)
 			rate := float64(filesScanned) / elapsed.Seconds()
 			// Build status line - use estimate if available (not first backup)
 			if estimatedTotal > 0 {
 				// Show actual scanned vs estimate (may exceed estimate if files were added)
 				pct := float64(filesScanned) / float64(estimatedTotal) * 100
 				if pct > 100 {
 					pct = 100 // Cap at 100% for display
 				}
 				remaining := estimatedTotal - filesScanned
 				if remaining < 0 {
 					remaining = 0
 				}
 				var eta time.Duration
 				if rate > 0 && remaining > 0 {
 					eta = time.Duration(float64(remaining)/rate) * time.Second
 				}
 				fmt.Printf("Scan: %s files (~%.0f%%), %s changed/new, %.0f files/sec, %s elapsed",
 					formatNumber(int(filesScanned)),
 					pct,
 					formatNumber(changedCount),
 					rate,
 					elapsed.Round(time.Second))
 				if eta > 0 {
 					fmt.Printf(", ETA %s", eta.Round(time.Second))
 				}
 				fmt.Println()
 			} else {
 				// First backup - no estimate available
 				fmt.Printf("Scan: %s files, %s changed/new, %.0f files/sec, %s elapsed\n",
 					formatNumber(int(filesScanned)),
 					formatNumber(changedCount),
 					rate,
 					elapsed.Round(time.Second))
 			}
 			lastStatusTime = time.Now()
 		}
@@ -695,6 +690,68 @@ func (s *Scanner) scanPhase(ctx context.Context, path string, result *ScanResult
 	}, nil
 }
 // updateScanEntryStats updates the scan result and progress reporter statistics
 // for a single scanned file entry based on whether it needs processing
 func (s *Scanner) updateScanEntryStats(result *ScanResult, needsProcessing bool, info os.FileInfo) {
 	if needsProcessing {
 		result.BytesScanned += info.Size()
 		if s.progress != nil {
 			s.progress.GetStats().BytesScanned.Add(info.Size())
 		}
 	} else {
 		result.FilesSkipped++
 		result.BytesSkipped += info.Size()
 		if s.progress != nil {
 			s.progress.GetStats().FilesSkipped.Add(1)
 			s.progress.GetStats().BytesSkipped.Add(info.Size())
 		}
 	}
 	result.FilesScanned++
 	if s.progress != nil {
 		s.progress.GetStats().FilesScanned.Add(1)
 	}
 }
 // printScanProgressLine prints a periodic progress line during the scan phase,
 // showing files scanned, percentage complete (if estimate available), and ETA
 func printScanProgressLine(filesScanned int64, changedCount int, estimatedTotal int64, startTime time.Time) {
 	elapsed := time.Since(startTime)
 	rate := float64(filesScanned) / elapsed.Seconds()
 	if estimatedTotal > 0 {
 		// Show actual scanned vs estimate (may exceed estimate if files were added)
 		pct := float64(filesScanned) / float64(estimatedTotal) * 100
 		if pct > 100 {
 			pct = 100 // Cap at 100% for display
 		}
 		remaining := estimatedTotal - filesScanned
 		if remaining < 0 {
 			remaining = 0
 		}
 		var eta time.Duration
 		if rate > 0 && remaining > 0 {
 			eta = time.Duration(float64(remaining)/rate) * time.Second
 		}
 		fmt.Printf("Scan: %s files (~%.0f%%), %s changed/new, %.0f files/sec, %s elapsed",
 			formatNumber(int(filesScanned)),
 			pct,
 			formatNumber(changedCount),
 			rate,
 			elapsed.Round(time.Second))
 		if eta > 0 {
 			fmt.Printf(", ETA %s", eta.Round(time.Second))
 		}
 		fmt.Println()
 	} else {
 		// First backup - no estimate available
 		fmt.Printf("Scan: %s files, %s changed/new, %.0f files/sec, %s elapsed\n",
 			formatNumber(int(filesScanned)),
 			formatNumber(changedCount),
 			rate,
 			elapsed.Round(time.Second))
 	}
 }
 // checkFileInMemory checks if a file needs processing using the in-memory map
 // No database access is performed - this is purely CPU/memory work
 func (s *Scanner) checkFileInMemory(path string, info os.FileInfo, knownFiles map[string]*database.File) (*database.File, bool) {
@@ -830,22 +887,13 @@ func (s *Scanner) processPhase(ctx context.Context, filesToProcess []*FileToProc
 			s.progress.GetStats().CurrentFile.Store(fileToProcess.Path)
 		}
-		// Process file in streaming fashion
+		// Process file with error handling for deleted files and skip-errors mode
-		if err := s.processFileStreaming(ctx, fileToProcess, result); err != nil {
+		skipped, err := s.processFileWithErrorHandling(ctx, fileToProcess, result)
-			// Handle files that were deleted between scan and process phases
+		if err != nil {
-			if errors.Is(err, os.ErrNotExist) {
+			return err
-				log.Warn("File was deleted during backup, skipping", "path", fileToProcess.Path)
+		}
-				result.FilesSkipped++
+		if skipped {
-				continue
+			continue
 			}
 			// Skip file read errors if --skip-errors is enabled
 			if s.skipErrors {
 				log.Error("ERROR: Failed to process file (skipping due to --skip-errors)", "path", fileToProcess.Path, "error", err)
 				fmt.Printf("ERROR: Failed to process %s: %v (skipping)\n", fileToProcess.Path, err)
 				result.FilesSkipped++
 				continue
 			}
 			return fmt.Errorf("processing file %s: %w", fileToProcess.Path, err)
 		}
 		// Update files processed counter
@@ -858,36 +906,71 @@ func (s *Scanner) processPhase(ctx context.Context, filesToProcess []*FileToProc
 		// Output periodic status
 		if time.Since(lastStatusTime) >= statusInterval {
-			elapsed := time.Since(startTime)
+			printProcessingProgress(filesProcessed, totalFiles, bytesProcessed, totalBytes, startTime)
 			pct := float64(bytesProcessed) / float64(totalBytes) * 100
 			byteRate := float64(bytesProcessed) / elapsed.Seconds()
 			fileRate := float64(filesProcessed) / elapsed.Seconds()
 			// Calculate ETA based on bytes (more accurate than files)
 			remainingBytes := totalBytes - bytesProcessed
 			var eta time.Duration
 			if byteRate > 0 {
 				eta = time.Duration(float64(remainingBytes)/byteRate) * time.Second
 			}
 			// Format: Progress [5.7k/610k] 6.7 GB/44 GB (15.4%), 106MB/sec, 500 files/sec, running for 1m30s, ETA: 5m49s
 			fmt.Printf("Progress [%s/%s] %s/%s (%.1f%%), %s/sec, %.0f files/sec, running for %s",
 				formatCompact(filesProcessed),
 				formatCompact(totalFiles),
 				humanize.Bytes(uint64(bytesProcessed)),
 				humanize.Bytes(uint64(totalBytes)),
 				pct,
 				humanize.Bytes(uint64(byteRate)),
 				fileRate,
 				elapsed.Round(time.Second))
 			if eta > 0 {
 				fmt.Printf(", ETA: %s", eta.Round(time.Second))
 			}
 			fmt.Println()
 			lastStatusTime = time.Now()
 		}
 	}
 	// Finalize: flush packer, pending files, and handle local blobs
 	return s.finalizeProcessPhase(ctx, result)
 }
 // processFileWithErrorHandling wraps processFileStreaming with error recovery for
 // deleted files and skip-errors mode. Returns (skipped, error).
 func (s *Scanner) processFileWithErrorHandling(ctx context.Context, fileToProcess *FileToProcess, result *ScanResult) (bool, error) {
 	if err := s.processFileStreaming(ctx, fileToProcess, result); err != nil {
 		// Handle files that were deleted between scan and process phases
 		if errors.Is(err, os.ErrNotExist) {
 			log.Warn("File was deleted during backup, skipping", "path", fileToProcess.Path)
 			result.FilesSkipped++
 			return true, nil
 		}
 		// Skip file read errors if --skip-errors is enabled
 		if s.skipErrors {
 			log.Error("ERROR: Failed to process file (skipping due to --skip-errors)", "path", fileToProcess.Path, "error", err)
 			fmt.Printf("ERROR: Failed to process %s: %v (skipping)\n", fileToProcess.Path, err)
 			result.FilesSkipped++
 			return true, nil
 		}
 		return false, fmt.Errorf("processing file %s: %w", fileToProcess.Path, err)
 	}
 	return false, nil
 }
 // printProcessingProgress prints a periodic progress line during the process phase,
 // showing files processed, bytes transferred, throughput, and ETA
 func printProcessingProgress(filesProcessed, totalFiles int, bytesProcessed, totalBytes int64, startTime time.Time) {
 	elapsed := time.Since(startTime)
 	pct := float64(bytesProcessed) / float64(totalBytes) * 100
 	byteRate := float64(bytesProcessed) / elapsed.Seconds()
 	fileRate := float64(filesProcessed) / elapsed.Seconds()
 	// Calculate ETA based on bytes (more accurate than files)
 	remainingBytes := totalBytes - bytesProcessed
 	var eta time.Duration
 	if byteRate > 0 {
 		eta = time.Duration(float64(remainingBytes)/byteRate) * time.Second
 	}
 	// Format: Progress [5.7k/610k] 6.7 GB/44 GB (15.4%), 106MB/sec, 500 files/sec, running for 1m30s, ETA: 5m49s
 	fmt.Printf("Progress [%s/%s] %s/%s (%.1f%%), %s/sec, %.0f files/sec, running for %s",
 		formatCompact(filesProcessed),
 		formatCompact(totalFiles),
 		humanize.Bytes(uint64(bytesProcessed)),
 		humanize.Bytes(uint64(totalBytes)),
 		pct,
 		humanize.Bytes(uint64(byteRate)),
 		fileRate,
 		elapsed.Round(time.Second))
 	if eta > 0 {
 		fmt.Printf(", ETA: %s", eta.Round(time.Second))
 	}
 	fmt.Println()
 }
 // finalizeProcessPhase flushes the packer, writes remaining pending files to the database,
 // and handles local blob storage when no remote storage is configured
 func (s *Scanner) finalizeProcessPhase(ctx context.Context, result *ScanResult) error {
 	// Final packer flush first - this commits remaining chunks to DB
 	// and handleBlobReady will flush files whose chunks are now committed
 	s.packerMu.Lock()