Optimize scan phase: in-memory change detection and batched DB writes

Performance improvements:
- Load all known files from DB into memory at startup
- Check file changes against in-memory map (no per-file DB queries)
- Batch database writes in groups of 1000 files per transaction
- Scan phase now only counts and stores regular files, not directories

This should improve scan speed from ~600 files/sec to potentially
10,000+ files/sec by eliminating per-file database round trips.
This commit is contained in:
2025-12-19 12:08:47 +07:00
parent badc0c07e0
commit c3725e745e
3 changed files with 187 additions and 258 deletions

View File

@@ -99,26 +99,25 @@ func TestScannerSimpleDirectory(t *testing.T) {
t.Fatalf("scan failed: %v", err)
}
// Verify results
// We now scan 6 files + 3 directories (source, subdir, subdir2) = 9 entries
if result.FilesScanned != 9 {
t.Errorf("expected 9 entries scanned, got %d", result.FilesScanned)
// Verify results - we only scan regular files, not directories
if result.FilesScanned != 6 {
t.Errorf("expected 6 files scanned, got %d", result.FilesScanned)
}
// Directories have their own sizes, so the total will be more than just file content
// Total bytes should be the sum of all file contents
if result.BytesScanned < 97 { // At minimum we have 97 bytes of file content
t.Errorf("expected at least 97 bytes scanned, got %d", result.BytesScanned)
}
// Verify files in database
// Verify files in database - only regular files are stored
files, err := repos.Files.ListByPrefix(ctx, "/source")
if err != nil {
t.Fatalf("failed to list files: %v", err)
}
// We should have 6 files + 3 directories = 9 entries
if len(files) != 9 {
t.Errorf("expected 9 entries in database, got %d", len(files))
// We should have 6 files (directories are not stored)
if len(files) != 6 {
t.Errorf("expected 6 files in database, got %d", len(files))
}
// Verify specific file
@@ -235,9 +234,9 @@ func TestScannerLargeFile(t *testing.T) {
t.Fatalf("scan failed: %v", err)
}
// We scan 1 file + 1 directory = 2 entries
if result.FilesScanned != 2 {
t.Errorf("expected 2 entries scanned, got %d", result.FilesScanned)
// We scan only regular files, not directories
if result.FilesScanned != 1 {
t.Errorf("expected 1 file scanned, got %d", result.FilesScanned)
}
// The file size should be at least 1MB