Add testable CLI with dependency injection and new scanner/checker packages
Major changes: - Refactor CLI to accept injected I/O streams and filesystem (afero.Fs) for testing without touching the real filesystem - Add RunOptions struct and RunWithOptions() for configurable CLI execution - Add internal/scanner package with two-phase manifest generation: - Phase 1 (Enumeration): walk directories, collect metadata - Phase 2 (Scan): read contents, compute hashes, write manifest - Add internal/checker package for manifest verification with progress reporting and channel-based result streaming - Add mfer/builder.go for incremental manifest construction - Add --no-extra-files flag to check command to detect files not in manifest - Add timing summaries showing file count, size, elapsed time, and throughput - Add comprehensive tests using afero.MemMapFs (no real filesystem access) - Add contrib/usage.sh integration test script - Fix banner ASCII art alignment (consistent spacing) - Fix verbosity levels so summaries display at default log level - Update internal/log to support configurable output writers
This commit is contained in:
373
internal/scanner/scanner.go
Normal file
373
internal/scanner/scanner.go
Normal file
@@ -0,0 +1,373 @@
|
||||
package scanner
|
||||
|
||||
import (
|
||||
"context"
|
||||
"io"
|
||||
"io/fs"
|
||||
"path"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/spf13/afero"
|
||||
"sneak.berlin/go/mfer/mfer"
|
||||
)
|
||||
|
||||
// Phase 1: Enumeration
|
||||
// ---------------------
|
||||
// Walking directories and calling stat() on files to collect metadata.
|
||||
// Builds the list of files to be scanned. Relatively fast (metadata only).
|
||||
|
||||
// EnumerateStatus contains progress information for the enumeration phase.
|
||||
type EnumerateStatus struct {
|
||||
FilesFound int64
|
||||
BytesFound int64
|
||||
}
|
||||
|
||||
// Phase 2: Scan (ToManifest)
|
||||
// --------------------------
|
||||
// Reading file contents and computing hashes for manifest generation.
|
||||
// This is the expensive phase that reads all file data.
|
||||
|
||||
// ScanStatus contains progress information for the scan phase.
|
||||
type ScanStatus struct {
|
||||
TotalFiles int64
|
||||
ScannedFiles int64
|
||||
TotalBytes int64
|
||||
ScannedBytes int64
|
||||
BytesPerSec float64
|
||||
}
|
||||
|
||||
// Options configures scanner behavior.
|
||||
type Options struct {
|
||||
IgnoreDotfiles bool
|
||||
FollowSymLinks bool
|
||||
Fs afero.Fs // Filesystem to use, defaults to OsFs
|
||||
}
|
||||
|
||||
// FileEntry represents a file that has been enumerated.
|
||||
type FileEntry struct {
|
||||
Path string // Relative path (used in manifest)
|
||||
AbsPath string // Absolute path (used for reading file content)
|
||||
Size int64
|
||||
Mtime time.Time
|
||||
Ctime time.Time
|
||||
}
|
||||
|
||||
// Scanner accumulates files and generates manifests from them.
|
||||
type Scanner struct {
|
||||
mu sync.RWMutex
|
||||
files []*FileEntry
|
||||
options *Options
|
||||
fs afero.Fs
|
||||
}
|
||||
|
||||
// New creates a new Scanner with default options.
|
||||
func New() *Scanner {
|
||||
return NewWithOptions(nil)
|
||||
}
|
||||
|
||||
// NewWithOptions creates a new Scanner with the given options.
|
||||
func NewWithOptions(opts *Options) *Scanner {
|
||||
if opts == nil {
|
||||
opts = &Options{}
|
||||
}
|
||||
fs := opts.Fs
|
||||
if fs == nil {
|
||||
fs = afero.NewOsFs()
|
||||
}
|
||||
return &Scanner{
|
||||
files: make([]*FileEntry, 0),
|
||||
options: opts,
|
||||
fs: fs,
|
||||
}
|
||||
}
|
||||
|
||||
// EnumerateFile adds a single file to the scanner, calling stat() to get metadata.
|
||||
func (s *Scanner) EnumerateFile(filePath string) error {
|
||||
abs, err := filepath.Abs(filePath)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
info, err := s.fs.Stat(abs)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
// For single files, use the filename as the relative path
|
||||
basePath := filepath.Dir(abs)
|
||||
return s.enumerateFileWithInfo(filepath.Base(abs), basePath, info, nil)
|
||||
}
|
||||
|
||||
// EnumeratePath walks a directory path and adds all files to the scanner.
|
||||
// If progress is non-nil, status updates are sent as files are discovered.
|
||||
// The progress channel is closed when the method returns.
|
||||
func (s *Scanner) EnumeratePath(inputPath string, progress chan<- EnumerateStatus) error {
|
||||
if progress != nil {
|
||||
defer close(progress)
|
||||
}
|
||||
abs, err := filepath.Abs(inputPath)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
afs := afero.NewReadOnlyFs(afero.NewBasePathFs(s.fs, abs))
|
||||
return s.enumerateFS(afs, abs, progress)
|
||||
}
|
||||
|
||||
// EnumeratePaths walks multiple directory paths and adds all files to the scanner.
|
||||
// If progress is non-nil, status updates are sent as files are discovered.
|
||||
// The progress channel is closed when the method returns.
|
||||
func (s *Scanner) EnumeratePaths(progress chan<- EnumerateStatus, inputPaths ...string) error {
|
||||
if progress != nil {
|
||||
defer close(progress)
|
||||
}
|
||||
for _, p := range inputPaths {
|
||||
abs, err := filepath.Abs(p)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
afs := afero.NewReadOnlyFs(afero.NewBasePathFs(s.fs, abs))
|
||||
if err := s.enumerateFS(afs, abs, progress); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// EnumerateFS walks an afero filesystem and adds all files to the scanner.
|
||||
// If progress is non-nil, status updates are sent as files are discovered.
|
||||
// The progress channel is closed when the method returns.
|
||||
// basePath is used to compute absolute paths for file reading.
|
||||
func (s *Scanner) EnumerateFS(afs afero.Fs, basePath string, progress chan<- EnumerateStatus) error {
|
||||
if progress != nil {
|
||||
defer close(progress)
|
||||
}
|
||||
return s.enumerateFS(afs, basePath, progress)
|
||||
}
|
||||
|
||||
// enumerateFS is the internal implementation that doesn't close the progress channel.
|
||||
func (s *Scanner) enumerateFS(afs afero.Fs, basePath string, progress chan<- EnumerateStatus) error {
|
||||
return afero.Walk(afs, "/", func(p string, info fs.FileInfo, err error) error {
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if s.options.IgnoreDotfiles && pathIsHidden(p) {
|
||||
if info.IsDir() {
|
||||
return filepath.SkipDir
|
||||
}
|
||||
return nil
|
||||
}
|
||||
return s.enumerateFileWithInfo(p, basePath, info, progress)
|
||||
})
|
||||
}
|
||||
|
||||
// enumerateFileWithInfo adds a file with pre-existing fs.FileInfo.
|
||||
func (s *Scanner) enumerateFileWithInfo(filePath string, basePath string, info fs.FileInfo, progress chan<- EnumerateStatus) error {
|
||||
if info.IsDir() {
|
||||
// Manifests contain only files, directories are implied
|
||||
return nil
|
||||
}
|
||||
|
||||
// Clean the path - remove leading slash if present
|
||||
cleanPath := filePath
|
||||
if len(cleanPath) > 0 && cleanPath[0] == '/' {
|
||||
cleanPath = cleanPath[1:]
|
||||
}
|
||||
|
||||
// Compute absolute path for file reading
|
||||
absPath := filepath.Join(basePath, cleanPath)
|
||||
|
||||
entry := &FileEntry{
|
||||
Path: cleanPath,
|
||||
AbsPath: absPath,
|
||||
Size: info.Size(),
|
||||
Mtime: info.ModTime(),
|
||||
// Note: Ctime not available from fs.FileInfo on all platforms
|
||||
// Will need platform-specific code to extract it
|
||||
}
|
||||
|
||||
s.mu.Lock()
|
||||
s.files = append(s.files, entry)
|
||||
filesFound := int64(len(s.files))
|
||||
var bytesFound int64
|
||||
for _, f := range s.files {
|
||||
bytesFound += f.Size
|
||||
}
|
||||
s.mu.Unlock()
|
||||
|
||||
sendEnumerateStatus(progress, EnumerateStatus{
|
||||
FilesFound: filesFound,
|
||||
BytesFound: bytesFound,
|
||||
})
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Files returns a copy of all files added to the scanner.
|
||||
func (s *Scanner) Files() []*FileEntry {
|
||||
s.mu.RLock()
|
||||
defer s.mu.RUnlock()
|
||||
out := make([]*FileEntry, len(s.files))
|
||||
copy(out, s.files)
|
||||
return out
|
||||
}
|
||||
|
||||
// FileCount returns the number of files in the scanner.
|
||||
func (s *Scanner) FileCount() int64 {
|
||||
s.mu.RLock()
|
||||
defer s.mu.RUnlock()
|
||||
return int64(len(s.files))
|
||||
}
|
||||
|
||||
// TotalBytes returns the total size of all files in the scanner.
|
||||
func (s *Scanner) TotalBytes() int64 {
|
||||
s.mu.RLock()
|
||||
defer s.mu.RUnlock()
|
||||
var total int64
|
||||
for _, f := range s.files {
|
||||
total += f.Size
|
||||
}
|
||||
return total
|
||||
}
|
||||
|
||||
// ToManifest reads all file contents, computes hashes, and generates a manifest.
|
||||
// If progress is non-nil, status updates are sent approximately once per second.
|
||||
// The progress channel is closed when the method returns.
|
||||
// The manifest is written to the provided io.Writer.
|
||||
func (s *Scanner) ToManifest(ctx context.Context, w io.Writer, progress chan<- ScanStatus) error {
|
||||
if progress != nil {
|
||||
defer close(progress)
|
||||
}
|
||||
|
||||
s.mu.RLock()
|
||||
files := make([]*FileEntry, len(s.files))
|
||||
copy(files, s.files)
|
||||
totalFiles := int64(len(files))
|
||||
var totalBytes int64
|
||||
for _, f := range files {
|
||||
totalBytes += f.Size
|
||||
}
|
||||
s.mu.RUnlock()
|
||||
|
||||
builder := mfer.NewBuilder()
|
||||
|
||||
var scannedFiles int64
|
||||
var scannedBytes int64
|
||||
lastProgressTime := time.Now()
|
||||
startTime := time.Now()
|
||||
|
||||
for _, entry := range files {
|
||||
// Check for cancellation
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
default:
|
||||
}
|
||||
|
||||
// Open file
|
||||
f, err := s.fs.Open(entry.AbsPath)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Add to manifest with progress callback
|
||||
bytesRead, err := builder.AddFile(
|
||||
entry.Path,
|
||||
entry.Size,
|
||||
entry.Mtime,
|
||||
f,
|
||||
func(fileBytes int64) {
|
||||
// Send progress at most once per second
|
||||
now := time.Now()
|
||||
if progress != nil && now.Sub(lastProgressTime) >= time.Second {
|
||||
elapsed := now.Sub(startTime).Seconds()
|
||||
currentBytes := scannedBytes + fileBytes
|
||||
var rate float64
|
||||
if elapsed > 0 {
|
||||
rate = float64(currentBytes) / elapsed
|
||||
}
|
||||
sendScanStatus(progress, ScanStatus{
|
||||
TotalFiles: totalFiles,
|
||||
ScannedFiles: scannedFiles,
|
||||
TotalBytes: totalBytes,
|
||||
ScannedBytes: currentBytes,
|
||||
BytesPerSec: rate,
|
||||
})
|
||||
lastProgressTime = now
|
||||
}
|
||||
},
|
||||
)
|
||||
f.Close()
|
||||
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
scannedFiles++
|
||||
scannedBytes += bytesRead
|
||||
}
|
||||
|
||||
// Send final progress
|
||||
if progress != nil {
|
||||
elapsed := time.Since(startTime).Seconds()
|
||||
var rate float64
|
||||
if elapsed > 0 {
|
||||
rate = float64(scannedBytes) / elapsed
|
||||
}
|
||||
sendScanStatus(progress, ScanStatus{
|
||||
TotalFiles: totalFiles,
|
||||
ScannedFiles: scannedFiles,
|
||||
TotalBytes: totalBytes,
|
||||
ScannedBytes: scannedBytes,
|
||||
BytesPerSec: rate,
|
||||
})
|
||||
}
|
||||
|
||||
// Build and write manifest
|
||||
return builder.Build(w)
|
||||
}
|
||||
|
||||
// pathIsHidden returns true if the path or any of its parent directories
|
||||
// start with a dot (hidden files/directories).
|
||||
func pathIsHidden(p string) bool {
|
||||
tp := path.Clean(p)
|
||||
if strings.HasPrefix(tp, ".") {
|
||||
return true
|
||||
}
|
||||
for {
|
||||
d, f := path.Split(tp)
|
||||
if strings.HasPrefix(f, ".") {
|
||||
return true
|
||||
}
|
||||
if d == "" {
|
||||
return false
|
||||
}
|
||||
tp = d[0 : len(d)-1] // trim trailing slash from dir
|
||||
}
|
||||
}
|
||||
|
||||
// sendEnumerateStatus sends a status update without blocking.
|
||||
// If the channel is full, the update is dropped.
|
||||
func sendEnumerateStatus(ch chan<- EnumerateStatus, status EnumerateStatus) {
|
||||
if ch == nil {
|
||||
return
|
||||
}
|
||||
select {
|
||||
case ch <- status:
|
||||
default:
|
||||
// Channel full, drop this update
|
||||
}
|
||||
}
|
||||
|
||||
// sendScanStatus sends a status update without blocking.
|
||||
// If the channel is full, the update is dropped.
|
||||
func sendScanStatus(ch chan<- ScanStatus, status ScanStatus) {
|
||||
if ch == nil {
|
||||
return
|
||||
}
|
||||
select {
|
||||
case ch <- status:
|
||||
default:
|
||||
// Channel full, drop this update
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user