diff --git a/internal/checker/checker.go b/internal/checker/checker.go deleted file mode 100644 index 3790c14..0000000 --- a/internal/checker/checker.go +++ /dev/null @@ -1,281 +0,0 @@ -package checker - -import ( - "bytes" - "context" - "crypto/sha256" - "errors" - "io" - "os" - "path/filepath" - - "github.com/multiformats/go-multihash" - "github.com/spf13/afero" - "sneak.berlin/go/mfer/mfer" -) - -// Result represents the outcome of checking a single file. -type Result struct { - Path string // Relative path from manifest - Status Status // Verification result status - Message string // Human-readable description of the result -} - -// Status represents the verification status of a file. -type Status int - -const ( - StatusOK Status = iota // File matches manifest (size and hash verified) - StatusMissing // File not found on disk - StatusSizeMismatch // File size differs from manifest - StatusHashMismatch // File hash differs from manifest - StatusExtra // File exists on disk but not in manifest - StatusError // Error occurred during verification -) - -func (s Status) String() string { - switch s { - case StatusOK: - return "OK" - case StatusMissing: - return "MISSING" - case StatusSizeMismatch: - return "SIZE_MISMATCH" - case StatusHashMismatch: - return "HASH_MISMATCH" - case StatusExtra: - return "EXTRA" - case StatusError: - return "ERROR" - default: - return "UNKNOWN" - } -} - -// CheckStatus contains progress information for the check operation. -type CheckStatus struct { - TotalFiles int64 // Total number of files in manifest - CheckedFiles int64 // Number of files checked so far - TotalBytes int64 // Total bytes to verify (sum of all file sizes) - CheckedBytes int64 // Bytes verified so far - BytesPerSec float64 // Current throughput rate - Failures int64 // Number of verification failures encountered -} - -// Checker verifies files against a manifest. -type Checker struct { - basePath string - files []*mfer.MFFilePath - fs afero.Fs - // manifestPaths is a set of paths in the manifest for quick lookup - manifestPaths map[string]struct{} -} - -// NewChecker creates a new Checker for the given manifest, base path, and filesystem. -// The basePath is the directory relative to which manifest paths are resolved. -// If fs is nil, the real filesystem (OsFs) is used. -func NewChecker(manifestPath string, basePath string, fs afero.Fs) (*Checker, error) { - if fs == nil { - fs = afero.NewOsFs() - } - - m, err := mfer.NewManifestFromFile(fs, manifestPath) - if err != nil { - return nil, err - } - - abs, err := filepath.Abs(basePath) - if err != nil { - return nil, err - } - - files := m.Files() - manifestPaths := make(map[string]struct{}, len(files)) - for _, f := range files { - manifestPaths[f.Path] = struct{}{} - } - - return &Checker{ - basePath: abs, - files: files, - fs: fs, - manifestPaths: manifestPaths, - }, nil -} - -// FileCount returns the number of files in the manifest. -func (c *Checker) FileCount() int64 { - return int64(len(c.files)) -} - -// TotalBytes returns the total size of all files in the manifest. -func (c *Checker) TotalBytes() int64 { - var total int64 - for _, f := range c.files { - total += f.Size - } - return total -} - -// Check verifies all files against the manifest. -// Results are sent to the results channel as files are checked. -// Progress updates are sent to the progress channel approximately once per second. -// Both channels are closed when the method returns. -func (c *Checker) Check(ctx context.Context, results chan<- Result, progress chan<- CheckStatus) error { - if results != nil { - defer close(results) - } - if progress != nil { - defer close(progress) - } - - totalFiles := int64(len(c.files)) - totalBytes := c.TotalBytes() - - var checkedFiles int64 - var checkedBytes int64 - var failures int64 - - for _, entry := range c.files { - select { - case <-ctx.Done(): - return ctx.Err() - default: - } - - result := c.checkFile(entry, &checkedBytes) - if result.Status != StatusOK { - failures++ - } - checkedFiles++ - - if results != nil { - results <- result - } - - // Send progress (simplified - every file for now) - if progress != nil { - sendCheckStatus(progress, CheckStatus{ - TotalFiles: totalFiles, - CheckedFiles: checkedFiles, - TotalBytes: totalBytes, - CheckedBytes: checkedBytes, - Failures: failures, - }) - } - } - - return nil -} - -func (c *Checker) checkFile(entry *mfer.MFFilePath, checkedBytes *int64) Result { - absPath := filepath.Join(c.basePath, entry.Path) - - // Check if file exists - info, err := c.fs.Stat(absPath) - if err != nil { - if errors.Is(err, afero.ErrFileNotFound) || errors.Is(err, errors.New("file does not exist")) { - return Result{Path: entry.Path, Status: StatusMissing, Message: "file not found"} - } - // Check for "file does not exist" style errors - exists, _ := afero.Exists(c.fs, absPath) - if !exists { - return Result{Path: entry.Path, Status: StatusMissing, Message: "file not found"} - } - return Result{Path: entry.Path, Status: StatusError, Message: err.Error()} - } - - // Check size - if info.Size() != entry.Size { - *checkedBytes += info.Size() - return Result{ - Path: entry.Path, - Status: StatusSizeMismatch, - Message: "size mismatch", - } - } - - // Open and hash file - f, err := c.fs.Open(absPath) - if err != nil { - return Result{Path: entry.Path, Status: StatusError, Message: err.Error()} - } - defer f.Close() - - h := sha256.New() - n, err := io.Copy(h, f) - if err != nil { - return Result{Path: entry.Path, Status: StatusError, Message: err.Error()} - } - *checkedBytes += n - - // Encode as multihash and compare - computed, err := multihash.Encode(h.Sum(nil), multihash.SHA2_256) - if err != nil { - return Result{Path: entry.Path, Status: StatusError, Message: err.Error()} - } - - // Check against all hashes in manifest (at least one must match) - for _, hash := range entry.Hashes { - if bytes.Equal(computed, hash.MultiHash) { - return Result{Path: entry.Path, Status: StatusOK} - } - } - - return Result{Path: entry.Path, Status: StatusHashMismatch, Message: "hash mismatch"} -} - -// FindExtraFiles walks the filesystem and reports files not in the manifest. -// Results are sent to the results channel. The channel is closed when done. -func (c *Checker) FindExtraFiles(ctx context.Context, results chan<- Result) error { - if results != nil { - defer close(results) - } - - return afero.Walk(c.fs, c.basePath, func(path string, info os.FileInfo, err error) error { - if err != nil { - return err - } - - select { - case <-ctx.Done(): - return ctx.Err() - default: - } - - // Skip directories - if info.IsDir() { - return nil - } - - // Get relative path - relPath, err := filepath.Rel(c.basePath, path) - if err != nil { - return err - } - - // Check if path is in manifest - if _, exists := c.manifestPaths[relPath]; !exists { - if results != nil { - results <- Result{ - Path: relPath, - Status: StatusExtra, - Message: "not in manifest", - } - } - } - - return nil - }) -} - -// sendCheckStatus sends a status update without blocking. -func sendCheckStatus(ch chan<- CheckStatus, status CheckStatus) { - if ch == nil { - return - } - select { - case ch <- status: - default: - } -} diff --git a/internal/scanner/scanner.go b/internal/scanner/scanner.go deleted file mode 100644 index 252e16a..0000000 --- a/internal/scanner/scanner.go +++ /dev/null @@ -1,373 +0,0 @@ -package scanner - -import ( - "context" - "io" - "io/fs" - "path" - "path/filepath" - "strings" - "sync" - "time" - - "github.com/spf13/afero" - "sneak.berlin/go/mfer/mfer" -) - -// Phase 1: Enumeration -// --------------------- -// Walking directories and calling stat() on files to collect metadata. -// Builds the list of files to be scanned. Relatively fast (metadata only). - -// EnumerateStatus contains progress information for the enumeration phase. -type EnumerateStatus struct { - FilesFound int64 // Number of files discovered so far - BytesFound int64 // Total size of discovered files (from stat) -} - -// Phase 2: Scan (ToManifest) -// -------------------------- -// Reading file contents and computing hashes for manifest generation. -// This is the expensive phase that reads all file data. - -// ScanStatus contains progress information for the scan phase. -type ScanStatus struct { - TotalFiles int64 // Total number of files to scan - ScannedFiles int64 // Number of files scanned so far - TotalBytes int64 // Total bytes to read (sum of all file sizes) - ScannedBytes int64 // Bytes read so far - BytesPerSec float64 // Current throughput rate -} - -// Options configures scanner behavior. -type Options struct { - IgnoreDotfiles bool // Skip files and directories starting with a dot - FollowSymLinks bool // Resolve symlinks instead of skipping them - Fs afero.Fs // Filesystem to use, defaults to OsFs if nil -} - -// FileEntry represents a file that has been enumerated. -type FileEntry struct { - Path string // Relative path (used in manifest) - AbsPath string // Absolute path (used for reading file content) - Size int64 // File size in bytes - Mtime time.Time // Last modification time - Ctime time.Time // Creation time (platform-dependent) -} - -// Scanner accumulates files and generates manifests from them. -type Scanner struct { - mu sync.RWMutex - files []*FileEntry - options *Options - fs afero.Fs -} - -// New creates a new Scanner with default options. -func New() *Scanner { - return NewWithOptions(nil) -} - -// NewWithOptions creates a new Scanner with the given options. -func NewWithOptions(opts *Options) *Scanner { - if opts == nil { - opts = &Options{} - } - fs := opts.Fs - if fs == nil { - fs = afero.NewOsFs() - } - return &Scanner{ - files: make([]*FileEntry, 0), - options: opts, - fs: fs, - } -} - -// EnumerateFile adds a single file to the scanner, calling stat() to get metadata. -func (s *Scanner) EnumerateFile(filePath string) error { - abs, err := filepath.Abs(filePath) - if err != nil { - return err - } - info, err := s.fs.Stat(abs) - if err != nil { - return err - } - // For single files, use the filename as the relative path - basePath := filepath.Dir(abs) - return s.enumerateFileWithInfo(filepath.Base(abs), basePath, info, nil) -} - -// EnumeratePath walks a directory path and adds all files to the scanner. -// If progress is non-nil, status updates are sent as files are discovered. -// The progress channel is closed when the method returns. -func (s *Scanner) EnumeratePath(inputPath string, progress chan<- EnumerateStatus) error { - if progress != nil { - defer close(progress) - } - abs, err := filepath.Abs(inputPath) - if err != nil { - return err - } - afs := afero.NewReadOnlyFs(afero.NewBasePathFs(s.fs, abs)) - return s.enumerateFS(afs, abs, progress) -} - -// EnumeratePaths walks multiple directory paths and adds all files to the scanner. -// If progress is non-nil, status updates are sent as files are discovered. -// The progress channel is closed when the method returns. -func (s *Scanner) EnumeratePaths(progress chan<- EnumerateStatus, inputPaths ...string) error { - if progress != nil { - defer close(progress) - } - for _, p := range inputPaths { - abs, err := filepath.Abs(p) - if err != nil { - return err - } - afs := afero.NewReadOnlyFs(afero.NewBasePathFs(s.fs, abs)) - if err := s.enumerateFS(afs, abs, progress); err != nil { - return err - } - } - return nil -} - -// EnumerateFS walks an afero filesystem and adds all files to the scanner. -// If progress is non-nil, status updates are sent as files are discovered. -// The progress channel is closed when the method returns. -// basePath is used to compute absolute paths for file reading. -func (s *Scanner) EnumerateFS(afs afero.Fs, basePath string, progress chan<- EnumerateStatus) error { - if progress != nil { - defer close(progress) - } - return s.enumerateFS(afs, basePath, progress) -} - -// enumerateFS is the internal implementation that doesn't close the progress channel. -func (s *Scanner) enumerateFS(afs afero.Fs, basePath string, progress chan<- EnumerateStatus) error { - return afero.Walk(afs, "/", func(p string, info fs.FileInfo, err error) error { - if err != nil { - return err - } - if s.options.IgnoreDotfiles && pathIsHidden(p) { - if info.IsDir() { - return filepath.SkipDir - } - return nil - } - return s.enumerateFileWithInfo(p, basePath, info, progress) - }) -} - -// enumerateFileWithInfo adds a file with pre-existing fs.FileInfo. -func (s *Scanner) enumerateFileWithInfo(filePath string, basePath string, info fs.FileInfo, progress chan<- EnumerateStatus) error { - if info.IsDir() { - // Manifests contain only files, directories are implied - return nil - } - - // Clean the path - remove leading slash if present - cleanPath := filePath - if len(cleanPath) > 0 && cleanPath[0] == '/' { - cleanPath = cleanPath[1:] - } - - // Compute absolute path for file reading - absPath := filepath.Join(basePath, cleanPath) - - entry := &FileEntry{ - Path: cleanPath, - AbsPath: absPath, - Size: info.Size(), - Mtime: info.ModTime(), - // Note: Ctime not available from fs.FileInfo on all platforms - // Will need platform-specific code to extract it - } - - s.mu.Lock() - s.files = append(s.files, entry) - filesFound := int64(len(s.files)) - var bytesFound int64 - for _, f := range s.files { - bytesFound += f.Size - } - s.mu.Unlock() - - sendEnumerateStatus(progress, EnumerateStatus{ - FilesFound: filesFound, - BytesFound: bytesFound, - }) - - return nil -} - -// Files returns a copy of all files added to the scanner. -func (s *Scanner) Files() []*FileEntry { - s.mu.RLock() - defer s.mu.RUnlock() - out := make([]*FileEntry, len(s.files)) - copy(out, s.files) - return out -} - -// FileCount returns the number of files in the scanner. -func (s *Scanner) FileCount() int64 { - s.mu.RLock() - defer s.mu.RUnlock() - return int64(len(s.files)) -} - -// TotalBytes returns the total size of all files in the scanner. -func (s *Scanner) TotalBytes() int64 { - s.mu.RLock() - defer s.mu.RUnlock() - var total int64 - for _, f := range s.files { - total += f.Size - } - return total -} - -// ToManifest reads all file contents, computes hashes, and generates a manifest. -// If progress is non-nil, status updates are sent approximately once per second. -// The progress channel is closed when the method returns. -// The manifest is written to the provided io.Writer. -func (s *Scanner) ToManifest(ctx context.Context, w io.Writer, progress chan<- ScanStatus) error { - if progress != nil { - defer close(progress) - } - - s.mu.RLock() - files := make([]*FileEntry, len(s.files)) - copy(files, s.files) - totalFiles := int64(len(files)) - var totalBytes int64 - for _, f := range files { - totalBytes += f.Size - } - s.mu.RUnlock() - - builder := mfer.NewBuilder() - - var scannedFiles int64 - var scannedBytes int64 - lastProgressTime := time.Now() - startTime := time.Now() - - for _, entry := range files { - // Check for cancellation - select { - case <-ctx.Done(): - return ctx.Err() - default: - } - - // Open file - f, err := s.fs.Open(entry.AbsPath) - if err != nil { - return err - } - - // Add to manifest with progress callback - bytesRead, err := builder.AddFile( - entry.Path, - entry.Size, - entry.Mtime, - f, - func(fileBytes int64) { - // Send progress at most once per second - now := time.Now() - if progress != nil && now.Sub(lastProgressTime) >= time.Second { - elapsed := now.Sub(startTime).Seconds() - currentBytes := scannedBytes + fileBytes - var rate float64 - if elapsed > 0 { - rate = float64(currentBytes) / elapsed - } - sendScanStatus(progress, ScanStatus{ - TotalFiles: totalFiles, - ScannedFiles: scannedFiles, - TotalBytes: totalBytes, - ScannedBytes: currentBytes, - BytesPerSec: rate, - }) - lastProgressTime = now - } - }, - ) - f.Close() - - if err != nil { - return err - } - - scannedFiles++ - scannedBytes += bytesRead - } - - // Send final progress - if progress != nil { - elapsed := time.Since(startTime).Seconds() - var rate float64 - if elapsed > 0 { - rate = float64(scannedBytes) / elapsed - } - sendScanStatus(progress, ScanStatus{ - TotalFiles: totalFiles, - ScannedFiles: scannedFiles, - TotalBytes: totalBytes, - ScannedBytes: scannedBytes, - BytesPerSec: rate, - }) - } - - // Build and write manifest - return builder.Build(w) -} - -// pathIsHidden returns true if the path or any of its parent directories -// start with a dot (hidden files/directories). -func pathIsHidden(p string) bool { - tp := path.Clean(p) - if strings.HasPrefix(tp, ".") { - return true - } - for { - d, f := path.Split(tp) - if strings.HasPrefix(f, ".") { - return true - } - if d == "" { - return false - } - tp = d[0 : len(d)-1] // trim trailing slash from dir - } -} - -// sendEnumerateStatus sends a status update without blocking. -// If the channel is full, the update is dropped. -func sendEnumerateStatus(ch chan<- EnumerateStatus, status EnumerateStatus) { - if ch == nil { - return - } - select { - case ch <- status: - default: - // Channel full, drop this update - } -} - -// sendScanStatus sends a status update without blocking. -// If the channel is full, the update is dropped. -func sendScanStatus(ch chan<- ScanStatus, status ScanStatus) { - if ch == nil { - return - } - select { - case ch <- status: - default: - // Channel full, drop this update - } -}