diff --git a/.drone.yml b/.drone.yml deleted file mode 100644 index f8b7eaa..0000000 --- a/.drone.yml +++ /dev/null @@ -1,23 +0,0 @@ -kind: pipeline -name: test-docker-build - -steps: -- name: test-docker-build - image: plugins/docker - network_mode: bridge - settings: - repo: sneak/mfer - build_args_from_env: [ DRONE_COMMIT_SHA ] - dry_run: true - custom_dns: [ 116.202.204.30 ] - tags: - - ${DRONE_COMMIT_SHA:0:7} - - ${DRONE_BRANCH} - - latest -- name: notify - image: plugins/slack - settings: - webhook: - from_secret: SLACK_WEBHOOK_URL - when: - event: pull_request diff --git a/.gitea/workflows/check.yml b/.gitea/workflows/check.yml new file mode 100644 index 0000000..eedd413 --- /dev/null +++ b/.gitea/workflows/check.yml @@ -0,0 +1,9 @@ +name: check +on: [push] +jobs: + check: + runs-on: ubuntu-latest + steps: + # actions/checkout v4.2.2, 2026-03-16 + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 + - run: docker build . diff --git a/.gitignore b/.gitignore index adc169b..d0d14cf 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,11 @@ *.tmp *.dockerimage /vendor +vendor.tzst +modcache.tzst + +# Generated manifest files +.index.mf + +# Stale files +.drone.yml diff --git a/.index.mf b/.index.mf deleted file mode 100644 index 3f1d0f1..0000000 Binary files a/.index.mf and /dev/null differ diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..a71dee0 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,30 @@ +# Agent Instructions + +Read `REPO_POLICIES.md` before making any changes. It is the authoritative +source for coding standards, formatting, linting, and workflow rules. + +## Workflow + +- When fixing a bug, write a failing test FIRST. Only after the test fails, + write the code to fix the bug. Then ensure the test passes. Leave the test in + place and commit it with the bugfix. Don't run shell commands to test bugfixes + or reproduce bugs. Write tests! + +- After each change, run `make fmt`, then `make test`, then `make lint`. 
Fix any + failures before committing. + +- After each change, commit only the files you've changed. Push after committing. + +## Attribution + +- Never mention Claude, Anthropic, or any AI/LLM tooling in commit messages. Do + not use attribution. + +## Repository-Specific Notes + +- This is a Go library + CLI tool for generating `.mf` manifest files. +- The proto definition is in `mfer/mf.proto`; generated `.pb.go` files are + committed (required for `go get` compatibility). +- The format specification is in `FORMAT.md`. +- See the TODO section in `README.md` for the 1.0 implementation plan + and open design questions. diff --git a/Dockerfile b/Dockerfile index 79673cf..e6dd403 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,37 +1,38 @@ -################################################################################ -#2345678911234567892123456789312345678941234567895123456789612345678971234567898 -################################################################################ -FROM sneak/builder:2022-12-08 AS builder -ENV DEBIAN_FRONTEND noninteractive -WORKDIR /build -COPY ./Makefile ./.golangci.yml ./go.mod ./go.sum /build/ -COPY ./vendor.tzst /build/vendor.tzst -COPY ./modcache.tzst /build/modcache.tzst -COPY ./internal ./internal -COPY ./bin/gitrev.sh ./bin/gitrev.sh -COPY ./mfer ./mfer -COPY ./cmd ./cmd -ARG GITREV unknown -ARG DRONE_COMMIT_SHA unknown +# Lint stage — fast feedback on formatting and lint issues +# golangci/golangci-lint:v2.0.2 (2026-03-14) +FROM golangci/golangci-lint@sha256:d55581f7797e7a0877a7c3aaa399b01bdc57d2874d6412601a046cc4062cb62e AS lint + +WORKDIR /src +COPY go.mod go.sum ./ +RUN go mod download + +COPY . . 
+ +# Touch .pb.go so make does not try to regenerate via protoc (file is committed) +RUN touch mfer/mf.pb.go + +RUN make fmt-check +RUN make lint + +# Build stage — tests and compilation +# golang:1.23 (2026-03-14) +FROM golang@sha256:60deed95d3888cc5e4d9ff8a10c54e5edc008c6ae3fba6187be6fb592e19e8c0 AS builder + +# Force BuildKit to run the lint stage by creating a stage dependency +COPY --from=lint /src/go.sum /dev/null + +WORKDIR /src +COPY go.mod go.sum ./ +RUN go mod download + +COPY . . + +# Touch .pb.go so make does not try to regenerate via protoc (file is committed) +RUN touch mfer/mf.pb.go + +RUN make test +RUN cd cmd/mfer && go build -tags urfave_cli_no_docs -o /mfer . -RUN mkdir -p "$(go env GOMODCACHE)" && cd "$(go env GOMODCACHE)" && \ - zstdmt -d --stdout /build/modcache.tzst | tar xf - && \ - rm /build/modcache.tzst && cd /build -RUN \ - cd mfer && go generate . && cd .. && \ - GOPACKAGESDEBUG=true golangci-lint run ./... && \ - mkdir vendor && cd vendor && \ - zstdmt -d --stdout /build/vendor.tzst | tar xf - && rm /build/vendor.tzst && \ - cd .. && \ - make mfer.cmd -RUN rm -rf /build/vendor && go mod vendor && tar -c . 
| zstdmt -19 > /src.tzst -################################################################################ -#2345678911234567892123456789312345678941234567895123456789612345678971234567898 -################################################################################ -## final image -################################################################################ FROM scratch -# we put all the source into the final image for posterity, it's small -COPY --from=builder /src.tzst /src.tzst -COPY --from=builder /build/mfer.cmd /mfer +COPY --from=builder /mfer /mfer ENTRYPOINT ["/mfer"] diff --git a/FORMAT.md b/FORMAT.md new file mode 100644 index 0000000..ec661cb --- /dev/null +++ b/FORMAT.md @@ -0,0 +1,143 @@ +# .mf File Format Specification + +Version 1.0 + +## Overview + +An `.mf` file is a binary manifest that describes a directory tree of files, +including their paths, sizes, and cryptographic checksums. It supports +optional GPG signatures for integrity verification and optional timestamps +for metadata preservation. + +## File Structure + +An `.mf` file consists of two parts, concatenated: + +1. **Magic bytes** (8 bytes): the ASCII string `ZNAVSRFG` +2. **Outer message**: a Protocol Buffers serialized `MFFileOuter` message + +There is no length prefix or version byte between the magic and the protobuf +message. The protobuf message extends to the end of the file. + +See [`mfer/mf.proto`](mfer/mf.proto) for exact field numbers and types. 
+ +## Outer Message (`MFFileOuter`) + +The outer message contains: + +| Field | Number | Type | Description | +| ----------------- | ------ | ---------------- | ------------------------------------------------------------------------ | +| `version` | 101 | enum | Must be `VERSION_ONE` (1) | +| `compressionType` | 102 | enum | Compression of `innerMessage`; must be `COMPRESSION_ZSTD` (1) | +| `size` | 103 | int64 | Uncompressed size of `innerMessage` (corruption detection) | +| `sha256` | 104 | bytes | SHA-256 hash of the **compressed** `innerMessage` (corruption detection) | +| `uuid` | 105 | bytes | Random v4 UUID; must match the inner message UUID | +| `innerMessage` | 199 | bytes | Zstd-compressed serialized `MFFile` message | +| `signature` | 201 | bytes (optional) | GPG signature (ASCII-armored or binary) | +| `signer` | 202 | bytes (optional) | Full GPG key ID of the signer | +| `signingPubKey` | 203 | bytes (optional) | Full GPG signing public key | + +### SHA-256 Hash + +The `sha256` field (104) covers the **compressed** `innerMessage` bytes. +This allows verifying data integrity before decompression. + +## Compression + +The `innerMessage` field is compressed with [Zstandard (zstd)](https://facebook.github.io/zstd/). +Implementations must enforce a decompression size limit to prevent +decompression bombs. The reference implementation limits decompressed size to +256 MB. 
+ +## Inner Message (`MFFile`) + +After decompressing `innerMessage`, the result is a serialized `MFFile` +(referred to as the manifest): + +| Field | Number | Type | Description | +| ----------- | ------ | --------------------- | ------------------------------------- | +| `version` | 100 | enum | Must be `VERSION_ONE` (1) | +| `files` | 101 | repeated `MFFilePath` | List of files in the manifest | +| `uuid` | 102 | bytes | Random v4 UUID; must match outer UUID | +| `createdAt` | 201 | Timestamp (optional) | When the manifest was created | + +## File Entries (`MFFilePath`) + +Each file entry contains: + +| Field | Number | Type | Description | +| ---------- | ------ | ------------------------- | ----------------------------------- | +| `path` | 1 | string | Relative file path (see Path Rules) | +| `size` | 2 | int64 | File size in bytes | +| `hashes` | 3 | repeated `MFFileChecksum` | At least one hash required | +| `mimeType` | 301 | string (optional) | MIME type | +| `mtime` | 302 | Timestamp (optional) | Modification time | +| `ctime` | 303 | Timestamp (optional) | Change time (inode metadata change) | + +Field 304 (`atime`) has been removed from the specification. Access time is +volatile and non-deterministic; it is not useful for integrity verification. + +## Path Rules + +All `path` values must satisfy these invariants: + +- **UTF-8**: paths must be valid UTF-8 +- **Forward slashes**: use `/` as the path separator (never `\`) +- **Relative only**: no leading `/` +- **No parent traversal**: no `..` path segments +- **No empty segments**: no `//` sequences +- **No trailing slash**: paths refer to files, not directories + +Implementations must validate these invariants when reading and writing +manifests. Paths that violate these rules must be rejected. + +## Hash Format (`MFFileChecksum`) + +Each checksum is a single `bytes multiHash` field containing a +[multihash](https://multiformats.io/multihash/)-encoded value. 
Multihash is +self-describing: the encoded bytes include a varint algorithm identifier +followed by a varint digest length followed by the digest itself. + +The 1.0 implementation writes SHA-256 multihashes (`0x12` algorithm code). +Implementations must be able to verify SHA-256 multihashes at minimum. + +## Signature Scheme + +Signing is optional. When present, the signature covers a canonical string +constructed as: + +``` +ZNAVSRFG-<UUID>-<SHA256> +``` + +Where: + +- `ZNAVSRFG` is the magic bytes string (literal ASCII) +- `<UUID>` is the hex-encoded UUID from the outer message +- `<SHA256>` is the hex-encoded SHA-256 hash from the outer message (covering compressed data) + +Components are separated by hyphens. The signature is produced by GPG over +this canonical string and stored in the `signature` field of the outer +message. + +## Deterministic Serialization + +By default, manifests are generated deterministically: + +- File entries are sorted by `path` in **lexicographic byte order** +- `createdAt` is omitted unless explicitly requested +- `atime` is never included (field removed from schema) + +This ensures that two independent runs over the same directory tree produce +byte-identical `.mf` files (assuming file contents and metadata have not +changed). + +## MIME Type + +The recommended MIME type for `.mf` files is `application/octet-stream`. +The `.mf` file extension is the canonical identifier. 
+ +## Reference + +- Proto definition: [`mfer/mf.proto`](mfer/mf.proto) +- Reference implementation: [git.eeqj.de/sneak/mfer](https://git.eeqj.de/sneak/mfer) diff --git a/Makefile b/Makefile index e27258f..cab3f5b 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ export PATH := $(PATH):$(GOPATH)/bin PROTOC_GEN_GO := $(GOPATH)/bin/protoc-gen-go SOURCEFILES := mfer/*.go mfer/*.proto internal/*/*.go cmd/*/*.go go.mod go.sum ARCH := $(shell uname -m) -GITREV_BUILD := $(shell bash $(PWD)/bin/gitrev.sh) +GITREV_BUILD := $(shell bash $(PWD)/bin/gitrev.sh 2>/dev/null || echo unknown) APPNAME := mfer VERSION := 0.1.0 export DOCKER_IMAGE_CACHE_DIR := $(HOME)/Library/Caches/Docker/$(APPNAME)-$(ARCH) @@ -13,7 +13,7 @@ GOLDFLAGS += -X main.Version=$(VERSION) GOLDFLAGS += -X main.Gitrev=$(GITREV_BUILD) GOFLAGS := -ldflags "$(GOLDFLAGS)" -.PHONY: docker default run ci test fixme +.PHONY: docker default run ci test check lint fmt fmt-check hooks fixme default: fmt test @@ -32,8 +32,17 @@ $(PROTOC_GEN_GO): fixme: @grep -nir fixme . | grep -v Makefile +check: test lint fmt-check + +fmt-check: mfer/mf.pb.go + sh -c 'test -z "$$(gofmt -l .)"' + +hooks: + echo '#!/bin/sh\nmake check' > .git/hooks/pre-commit + chmod +x .git/hooks/pre-commit + devprereqs: - which golangci-lint || go install -v github.com/golangci/golangci-lint/cmd/golangci-lint@latest + which golangci-lint || go install -v github.com/golangci/golangci-lint/cmd/golangci-lint@v2.0.2 mfer/mf.pb.go: mfer/mf.proto cd mfer && go generate . diff --git a/README.md b/README.md index 6895d5b..259a9b0 100644 --- a/README.md +++ b/README.md @@ -23,38 +23,10 @@ itch in 2022 and is currently a one-person effort, though the goal is for this to emerge as a de-facto standard and be incorporated into other software. A compatible javascript library is planned. 
-# Phases - -Manifest generation happens in two distinct phases: - -## Phase 1: Enumeration - -Walking directories and calling `stat()` on files to collect metadata (path, size, mtime, ctime). This builds the list of files to be scanned. Relatively fast as it only reads filesystem metadata, not file contents. - -**Progress:** `EnumerateStatus` with `FilesFound` and `BytesFound` - -## Phase 2: Scan (ToManifest) - -Reading file contents and computing cryptographic hashes for manifest generation. This is the expensive phase that reads all file data from disk. - -**Progress:** `ScanStatus` with `TotalFiles`, `ScannedFiles`, `TotalBytes`, `ScannedBytes`, `BytesPerSec` - -# Code Conventions - -- **Logging:** Never use `fmt.Printf` or write to stdout/stderr directly in normal code. Use the `internal/log` package for all output (`log.Info`, `log.Infof`, `log.Debug`, `log.Debugf`, `log.Progressf`, `log.ProgressDone`). -- **Filesystem abstraction:** Use `github.com/spf13/afero` for filesystem operations to enable testing and flexibility. -- **CLI framework:** Use `github.com/urfave/cli/v2` for command-line interface. -- **Serialization:** Use Protocol Buffers for manifest file format. -- **Internal packages:** Non-exported implementation details go in `internal/` subdirectories. -- **Concurrency:** Use `sync.RWMutex` for protecting shared state; prefer channels for progress reporting. -- **Progress channels:** Use buffered channels (size 1) with non-blocking sends to avoid blocking the main operation if the consumer is slow. -- **Context support:** Long-running operations should accept `context.Context` for cancellation. -- **NO_COLOR:** Respect the `NO_COLOR` environment variable for disabling colored output. -- **Options pattern:** Use `NewWithOptions(opts *Options)` constructor pattern for configurable types. 
- # Build Status -[![Build Status](https://drone.datavi.be/api/badges/sneak/mfer/status.svg)](https://drone.datavi.be/sneak/mfer) +CI runs via `docker build .` which executes `make check` (formatting, +linting, tests). The `main` branch must always be green. # Participation @@ -71,6 +43,9 @@ requests](https://git.eeqj.de/sneak/mfer/pulls) and pass CI to be merged. Any changes submitted to this project must also be [WTFPL-licensed](https://wtfpl.net) to be considered. +See [`REPO_POLICIES.md`](REPO_POLICIES.md) for detailed coding standards, +tooling requirements, and workflow conventions. + # Problem Statement Given a plain URL, there is no standard way to safely and programmatically @@ -148,14 +123,9 @@ The manifest file would do several important things: - metadata size should not be used as an excuse to sacrifice utility (such as providing checksums over each chunk of a large file) -# Limitations - -- **Manifest size:** Manifests must fit entirely in system memory during reading and writing. - # Open Questions - Should the manifest file include checksums of individual file chunks, or just for the whole assembled file? - - If so, should the chunksize be fixed or dynamic? - Should the manifest signature format be GnuPG signatures, or those from @@ -239,6 +209,227 @@ regardless of filesystem format. Please email [`sneak@sneak.berlin`](mailto:sneak@sneak.berlin) with your desired username for an account on this Gitea instance. +# TODO: Remaining Work for 1.0 + +## Design Questions (Owner Decision Required) + +These require @sneak's input before implementation. Answers should be added +inline below each question. + +### Format Design + +**1. Should `MFFileChecksum` be simplified?** Currently it's a separate +message wrapping a single `bytes multiHash` field. Since multihash +already self-describes the algorithm, `repeated bytes hashes` directly on +`MFFilePath` would be simpler and reduce per-file protobuf overhead. Is +the extra message layer intentional (e.g. 
planning to add per-hash +metadata like `verified_at`)? + +> _answer:_ + +**2. Should file permissions/mode be stored?** The format stores +mtime/ctime but not Unix file permissions. For archival use this may not +matter, but for software distribution or filesystem restoration it's a +gap. Should we reserve a field now (e.g. `optional uint32 mode = 305`) +even if we don't populate it yet? + +> _answer:_ + +**3. Should `atime` be removed from the schema?** Access time is +volatile, non-deterministic, and often disabled (`noatime`). Including it +means two manifests of the same directory at different times will differ, +which conflicts with the determinism goal. Remove it, or document it as +"never set by default"? + +> _answer:_ + +**4. What are the path normalization rules?** The proto has `string path` +with no specification about: always forward-slash? Must be relative? No +`..` components allowed? UTF-8 NFC vs NFD normalization (macOS vs +Linux)? Max path length? This is a security issue (path traversal) and a +cross-platform compatibility issue. What rules should the spec mandate? + +> _answer:_ + +**5. Should we add a version byte after the magic?** Currently +`ZNAVSRFG` is followed immediately by protobuf. Adding a version byte +(`ZNAVSRFG\x01`) would allow future framing changes without requiring +protobuf parsing to detect the version. `MFFileOuter.Version` serves +this purpose but requires successful deserialization to read. Worth the +extra byte? + +> _answer:_ + +**6. Should we add a length-prefix after the magic?** Protobuf is not +self-delimiting. If we ever want to concatenate manifests or append data +after the protobuf, the current framing is insufficient. Add a varint or +fixed-width length-prefix? + +> _answer:_ + +### Signature Design + +**7. What does the outer SHA-256 hash cover — compressed or uncompressed +data?** The code currently hashes compressed data (good for verifying +before decompression), but this should be explicitly documented. 
Which is +the intended behavior? + +> _answer:_ + +**8. Should `signatureString()` sign raw bytes instead of a hex-encoded +string?** Currently the canonical string is `MAGIC-UUID-MULTIHASH` with +hex encoding, which adds a transformation layer. Signing the raw `sha256` +bytes (or compressed `innerMessage` directly) would be simpler. Keep the +string format or switch to raw bytes? + +> _answer:_ + +**9. Should we support detached signature files (`.mf.sig`)?** Embedded +signatures are better for single-file distribution. Detached `.mf.sig` +files follow the familiar `SHASUMS`/`SHASUMS.asc` pattern and are +simpler for HTTP serving. Support both modes? + +> _answer:_ + +**10. GPG vs pure-Go crypto for signatures?** Shelling out to `gpg` is +fragile (may not be installed, version-dependent output). +`github.com/ProtonMail/go-crypto` provides pure-Go OpenPGP, or we could +use Ed25519/signify (simpler, no key management). Which direction? + +> _answer:_ + +### Implementation Design + +**11. Should manifests be deterministic by default?** This means: sort +file entries by path, omit `createdAt` timestamp (or make it opt-in), no +`atime`. Should determinism be the default, with a +`--include-timestamps` flag to opt in? + +> _answer:_ + +**12. Should we consolidate or keep both scanner/checker +implementations?** There are two parallel implementations: +`mfer/scanner.go` + `mfer/checker.go` (typed with `FileSize`, +`RelFilePath`) and `internal/scanner/` + `internal/checker/` (raw +`int64`, `string`). The `mfer/` versions are superior. Delete the +`internal/` versions? + +> _answer:_ + +**13. Should the `manifest` type be exported?** Currently unexported with +exported constructors (`NewManifestFromReader`, `NewManifestFromFile`). +Consumers can't declare `var m *mfer.manifest`. Export the type, or +define an interface? + +> _answer:_ + +**14. 
What should the Go module path be for 1.0?** Currently +`sneak.berlin/go/mfer` in `go.mod` but `git.eeqj.de/sneak/mfer/mfer` in +the proto `go_package` option. Which is canonical? + +> _answer:_ + +## Implementation Tasks + +### Repo Infrastructure + +- [ ] Add `.golangci.yml` (fetch from + `https://git.eeqj.de/sneak/prompts/raw/branch/main/.golangci.yml`) +- [ ] Add `.editorconfig` +- [ ] Add `.gitea/workflows/check.yml` that runs `docker build .` + +### Format & Correctness + +- [ ] Resolve proto `go_package` path inconsistency + (`git.eeqj.de/sneak/mfer/mfer` vs `sneak.berlin/go/mfer`) +- [ ] Specify path invariants — add proto comments requiring UTF-8, + forward-slash, relative paths, no `..`, no leading `/`; validate + in `Builder.AddFile` and `Builder.AddFileWithHash` (pending design + question answer) +- [ ] Remove or deprecate `atime` from proto (pending design question + answer) +- [ ] Reserve `optional uint32 mode = 305` in `MFFilePath` for future + file permissions (pending design question answer) +- [ ] Add version byte after magic — `ZNAVSRFG\x01` for format version + 1 (pending design question answer) +- [ ] Write format specification document — separate from README: + magic, outer structure, compression, inner structure, path + invariants, signature scheme, canonical serialization + +### Library + +- [ ] Delete `internal/scanner/` and `internal/checker/` — consolidate + on `mfer/` package versions; update CLI code (pending design + question answer) +- [ ] Add deterministic file ordering — sort entries by path + (lexicographic, byte-order) in `Builder.Build()`; add test + asserting byte-identical output from two runs +- [ ] Add decompression size limit — `io.LimitReader` in + `deserializeInner()` with `m.pbOuter.Size` as bound +- [ ] Fix `errors.Is` dead code in checker — replace with + `os.IsNotExist(err)` or `errors.Is(err, fs.ErrNotExist)` +- [ ] Fix `AddFile` to verify size — check `totalRead == size` after + reading, return error on mismatch +- [ 
] Export the `manifest` type or define a public interface (pending + design question answer) — currently consumers cannot hold a reference + to a loaded manifest in their own type declarations +- [ ] Replace GPG subprocess calls with pure-Go crypto (pending design + question answer) — current implementation shells out to `gpg` which + may not be installed +- [ ] Add timeout to any remaining subprocess calls + +### CLI + +- [ ] Fix flag naming — all CLI flags should use kebab-case as primary + (`--include-dotfiles`, `--follow-symlinks`) +- [ ] Fix URL construction in fetch — use `BaseURL.JoinPath()` or + `url.JoinPath()` instead of string concatenation +- [ ] Add progress rate-limiting to Checker — throttle to once per + second, matching Scanner +- [ ] Add `--deterministic` flag or make it default — omit `createdAt`, + sort files (pending design question answer) +- [ ] Wire `--version` flag properly (currently only a `version` + subcommand exists; top-level `--version` shows urfave/cli generic + output) +- [ ] Add retry logic to `fetch` — currently no retries on transient + HTTP errors; needs exponential backoff +- [ ] `fetch` command uses bare `http.Get` with no timeout — needs + `http.Client` with configurable timeout + +### Testing & Robustness + +- [ ] Add fuzzing tests for `NewManifestFromReader` — protobuf + deserialization of untrusted input needs fuzz coverage +- [ ] Add integration test for `freshen` CLI command — current tests + only verify setup, not the actual freshen operation end-to-end +- [ ] Add test for `fetch` CLI command end-to-end (currently only + `downloadFile` is tested) + +### Documentation + +- [ ] Promote `FORMAT.md` as primary spec reference; README should link + to it more prominently +- [ ] Audit and update all error messages for consistency and + helpfulness +- [ ] Document the signature scheme more thoroughly (canonical string + format, verification steps) + +### Release + +- [ ] Finalize Go module path +- [ ] Update version constant 
in `mfer/constants.go` +- [ ] Add `--version` output matching SemVer +- [ ] Tag `v1.0.0` + +# See Also + +## Prior Art: Metalink + +- [Metalink - Mozilla Wiki](https://wiki.mozilla.org/Metalink) +- [Metalink - Wikipedia](https://en.wikipedia.org/wiki/Metalink) +- [RFC 5854 - The Metalink Download Description Format](https://datatracker.ietf.org/doc/html/rfc5854) +- [RFC 6249 - Metalink/HTTP: Mirrors and Hashes](https://www.rfc-editor.org/rfc/rfc6249.html) + ## Links - Repo: [https://git.eeqj.de/sneak/mfer](https://git.eeqj.de/sneak/mfer) diff --git a/TODO.md b/TODO.md deleted file mode 100644 index 66e1c30..0000000 --- a/TODO.md +++ /dev/null @@ -1,19 +0,0 @@ -# TODO - -## Critical - -- [ ] Fix broken error comparison in `internal/checker/checker.go:195` - `errors.Is(err, errors.New("file does not exist"))` always returns false because `errors.New()` creates a new instance each call -- [ ] Fix unchecked `hash.Write()` errors in `mfer/builder.go:52`, `mfer/serialize.go:56`, `internal/cli/freshen.go:340` -- [ ] Fix URL path traversal risk in `internal/cli/fetch.go:116` - path isn't URL-escaped, should use `url.JoinPath()` or proper encoding - -## Important - -- [ ] Fix goroutine leak in signal handler `internal/cli/gen.go:98-106` - goroutine runs until channel closed, leaks if program exits normally -- [ ] Fix timestamp precision in `mfer/serialize.go:16-22` - use `t.Nanosecond()` instead of manual calculation -- [ ] Add context cancellation check to filesystem walk in `internal/cli/freshen.go` - Ctrl-C doesn't work during scan phase - -## Code Quality - -- [ ] Consolidate duplicate `pathIsHidden` implementations in `internal/scanner/scanner.go:385-402` and `internal/cli/freshen.go:378-397` -- [ ] Make `TotalBytes()` in `internal/scanner/scanner.go:250-259` track total incrementally instead of recalculating on every call -- [ ] Add input validation to `AddFileWithHash()` in `mfer/builder.go:107-120` - validate path, size, and hash inputs diff --git 
a/internal/checker/checker.go b/internal/checker/checker.go deleted file mode 100644 index 3790c14..0000000 --- a/internal/checker/checker.go +++ /dev/null @@ -1,281 +0,0 @@ -package checker - -import ( - "bytes" - "context" - "crypto/sha256" - "errors" - "io" - "os" - "path/filepath" - - "github.com/multiformats/go-multihash" - "github.com/spf13/afero" - "sneak.berlin/go/mfer/mfer" -) - -// Result represents the outcome of checking a single file. -type Result struct { - Path string // Relative path from manifest - Status Status // Verification result status - Message string // Human-readable description of the result -} - -// Status represents the verification status of a file. -type Status int - -const ( - StatusOK Status = iota // File matches manifest (size and hash verified) - StatusMissing // File not found on disk - StatusSizeMismatch // File size differs from manifest - StatusHashMismatch // File hash differs from manifest - StatusExtra // File exists on disk but not in manifest - StatusError // Error occurred during verification -) - -func (s Status) String() string { - switch s { - case StatusOK: - return "OK" - case StatusMissing: - return "MISSING" - case StatusSizeMismatch: - return "SIZE_MISMATCH" - case StatusHashMismatch: - return "HASH_MISMATCH" - case StatusExtra: - return "EXTRA" - case StatusError: - return "ERROR" - default: - return "UNKNOWN" - } -} - -// CheckStatus contains progress information for the check operation. -type CheckStatus struct { - TotalFiles int64 // Total number of files in manifest - CheckedFiles int64 // Number of files checked so far - TotalBytes int64 // Total bytes to verify (sum of all file sizes) - CheckedBytes int64 // Bytes verified so far - BytesPerSec float64 // Current throughput rate - Failures int64 // Number of verification failures encountered -} - -// Checker verifies files against a manifest. 
-type Checker struct { - basePath string - files []*mfer.MFFilePath - fs afero.Fs - // manifestPaths is a set of paths in the manifest for quick lookup - manifestPaths map[string]struct{} -} - -// NewChecker creates a new Checker for the given manifest, base path, and filesystem. -// The basePath is the directory relative to which manifest paths are resolved. -// If fs is nil, the real filesystem (OsFs) is used. -func NewChecker(manifestPath string, basePath string, fs afero.Fs) (*Checker, error) { - if fs == nil { - fs = afero.NewOsFs() - } - - m, err := mfer.NewManifestFromFile(fs, manifestPath) - if err != nil { - return nil, err - } - - abs, err := filepath.Abs(basePath) - if err != nil { - return nil, err - } - - files := m.Files() - manifestPaths := make(map[string]struct{}, len(files)) - for _, f := range files { - manifestPaths[f.Path] = struct{}{} - } - - return &Checker{ - basePath: abs, - files: files, - fs: fs, - manifestPaths: manifestPaths, - }, nil -} - -// FileCount returns the number of files in the manifest. -func (c *Checker) FileCount() int64 { - return int64(len(c.files)) -} - -// TotalBytes returns the total size of all files in the manifest. -func (c *Checker) TotalBytes() int64 { - var total int64 - for _, f := range c.files { - total += f.Size - } - return total -} - -// Check verifies all files against the manifest. -// Results are sent to the results channel as files are checked. -// Progress updates are sent to the progress channel approximately once per second. -// Both channels are closed when the method returns. 
-func (c *Checker) Check(ctx context.Context, results chan<- Result, progress chan<- CheckStatus) error { - if results != nil { - defer close(results) - } - if progress != nil { - defer close(progress) - } - - totalFiles := int64(len(c.files)) - totalBytes := c.TotalBytes() - - var checkedFiles int64 - var checkedBytes int64 - var failures int64 - - for _, entry := range c.files { - select { - case <-ctx.Done(): - return ctx.Err() - default: - } - - result := c.checkFile(entry, &checkedBytes) - if result.Status != StatusOK { - failures++ - } - checkedFiles++ - - if results != nil { - results <- result - } - - // Send progress (simplified - every file for now) - if progress != nil { - sendCheckStatus(progress, CheckStatus{ - TotalFiles: totalFiles, - CheckedFiles: checkedFiles, - TotalBytes: totalBytes, - CheckedBytes: checkedBytes, - Failures: failures, - }) - } - } - - return nil -} - -func (c *Checker) checkFile(entry *mfer.MFFilePath, checkedBytes *int64) Result { - absPath := filepath.Join(c.basePath, entry.Path) - - // Check if file exists - info, err := c.fs.Stat(absPath) - if err != nil { - if errors.Is(err, afero.ErrFileNotFound) || errors.Is(err, errors.New("file does not exist")) { - return Result{Path: entry.Path, Status: StatusMissing, Message: "file not found"} - } - // Check for "file does not exist" style errors - exists, _ := afero.Exists(c.fs, absPath) - if !exists { - return Result{Path: entry.Path, Status: StatusMissing, Message: "file not found"} - } - return Result{Path: entry.Path, Status: StatusError, Message: err.Error()} - } - - // Check size - if info.Size() != entry.Size { - *checkedBytes += info.Size() - return Result{ - Path: entry.Path, - Status: StatusSizeMismatch, - Message: "size mismatch", - } - } - - // Open and hash file - f, err := c.fs.Open(absPath) - if err != nil { - return Result{Path: entry.Path, Status: StatusError, Message: err.Error()} - } - defer f.Close() - - h := sha256.New() - n, err := io.Copy(h, f) - if err != nil 
{ - return Result{Path: entry.Path, Status: StatusError, Message: err.Error()} - } - *checkedBytes += n - - // Encode as multihash and compare - computed, err := multihash.Encode(h.Sum(nil), multihash.SHA2_256) - if err != nil { - return Result{Path: entry.Path, Status: StatusError, Message: err.Error()} - } - - // Check against all hashes in manifest (at least one must match) - for _, hash := range entry.Hashes { - if bytes.Equal(computed, hash.MultiHash) { - return Result{Path: entry.Path, Status: StatusOK} - } - } - - return Result{Path: entry.Path, Status: StatusHashMismatch, Message: "hash mismatch"} -} - -// FindExtraFiles walks the filesystem and reports files not in the manifest. -// Results are sent to the results channel. The channel is closed when done. -func (c *Checker) FindExtraFiles(ctx context.Context, results chan<- Result) error { - if results != nil { - defer close(results) - } - - return afero.Walk(c.fs, c.basePath, func(path string, info os.FileInfo, err error) error { - if err != nil { - return err - } - - select { - case <-ctx.Done(): - return ctx.Err() - default: - } - - // Skip directories - if info.IsDir() { - return nil - } - - // Get relative path - relPath, err := filepath.Rel(c.basePath, path) - if err != nil { - return err - } - - // Check if path is in manifest - if _, exists := c.manifestPaths[relPath]; !exists { - if results != nil { - results <- Result{ - Path: relPath, - Status: StatusExtra, - Message: "not in manifest", - } - } - } - - return nil - }) -} - -// sendCheckStatus sends a status update without blocking. 
-func sendCheckStatus(ch chan<- CheckStatus, status CheckStatus) { - if ch == nil { - return - } - select { - case ch <- status: - default: - } -} diff --git a/internal/cli/check.go b/internal/cli/check.go index a2d0cdf..7327474 100644 --- a/internal/cli/check.go +++ b/internal/cli/check.go @@ -3,6 +3,7 @@ package cli import ( "encoding/hex" "fmt" + "io" "path/filepath" "strings" "time" @@ -34,29 +35,32 @@ func findManifest(fs afero.Fs, dir string) (string, error) { func (mfa *CLIApp) checkManifestOperation(ctx *cli.Context) error { log.Debug("checkManifestOperation()") - var manifestPath string - var err error + manifestPath, err := mfa.resolveManifestArg(ctx) + if err != nil { + return fmt.Errorf("check: %w", err) + } - if ctx.Args().Len() > 0 { - arg := ctx.Args().Get(0) - // Check if arg is a directory or a file - info, statErr := mfa.Fs.Stat(arg) - if statErr == nil && info.IsDir() { - // It's a directory, look for manifest inside - manifestPath, err = findManifest(mfa.Fs, arg) - if err != nil { - return err - } - } else { - // Treat as a file path - manifestPath = arg + // URL manifests need to be downloaded to a temp file for the checker + if isHTTPURL(manifestPath) { + rc, fetchErr := mfa.openManifestReader(manifestPath) + if fetchErr != nil { + return fmt.Errorf("check: %w", fetchErr) } - } else { - // No argument, look in current directory - manifestPath, err = findManifest(mfa.Fs, ".") - if err != nil { - return err + tmpFile, tmpErr := afero.TempFile(mfa.Fs, "", "mfer-manifest-*.mf") + if tmpErr != nil { + _ = rc.Close() + return fmt.Errorf("check: failed to create temp file: %w", tmpErr) } + tmpPath := tmpFile.Name() + _, cpErr := io.Copy(tmpFile, rc) + _ = rc.Close() + _ = tmpFile.Close() + if cpErr != nil { + _ = mfa.Fs.Remove(tmpPath) + return fmt.Errorf("check: failed to download manifest: %w", cpErr) + } + defer func() { _ = mfa.Fs.Remove(tmpPath) }() + manifestPath = tmpPath } basePath := ctx.String("base") diff --git a/internal/cli/export.go 
b/internal/cli/export.go new file mode 100644 index 0000000..aca8ebf --- /dev/null +++ b/internal/cli/export.go @@ -0,0 +1,72 @@ +package cli + +import ( + "encoding/hex" + "encoding/json" + "fmt" + "time" + + "github.com/urfave/cli/v2" + "sneak.berlin/go/mfer/mfer" +) + +// ExportEntry represents a single file entry in the exported JSON output. +type ExportEntry struct { + Path string `json:"path"` + Size int64 `json:"size"` + Hashes []string `json:"hashes"` + Mtime *string `json:"mtime,omitempty"` + Ctime *string `json:"ctime,omitempty"` +} + +func (mfa *CLIApp) exportManifestOperation(ctx *cli.Context) error { + pathOrURL, err := mfa.resolveManifestArg(ctx) + if err != nil { + return fmt.Errorf("export: %w", err) + } + + rc, err := mfa.openManifestReader(pathOrURL) + if err != nil { + return fmt.Errorf("export: %w", err) + } + defer func() { _ = rc.Close() }() + + manifest, err := mfer.NewManifestFromReader(rc) + if err != nil { + return fmt.Errorf("export: failed to parse manifest: %w", err) + } + + files := manifest.Files() + entries := make([]ExportEntry, 0, len(files)) + + for _, f := range files { + entry := ExportEntry{ + Path: f.Path, + Size: f.Size, + Hashes: make([]string, 0, len(f.Hashes)), + } + + for _, h := range f.Hashes { + entry.Hashes = append(entry.Hashes, hex.EncodeToString(h.MultiHash)) + } + + if f.Mtime != nil { + t := time.Unix(f.Mtime.Seconds, int64(f.Mtime.Nanos)).UTC().Format(time.RFC3339Nano) + entry.Mtime = &t + } + if f.Ctime != nil { + t := time.Unix(f.Ctime.Seconds, int64(f.Ctime.Nanos)).UTC().Format(time.RFC3339Nano) + entry.Ctime = &t + } + + entries = append(entries, entry) + } + + enc := json.NewEncoder(mfa.Stdout) + enc.SetIndent("", " ") + if err := enc.Encode(entries); err != nil { + return fmt.Errorf("export: failed to encode JSON: %w", err) + } + + return nil +} diff --git a/internal/cli/export_test.go b/internal/cli/export_test.go new file mode 100644 index 0000000..6d1f87d --- /dev/null +++ b/internal/cli/export_test.go 
@@ -0,0 +1,137 @@ +package cli + +import ( + "bytes" + "context" + "encoding/json" + "net/http" + "net/http/httptest" + "testing" + + "github.com/spf13/afero" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "sneak.berlin/go/mfer/mfer" +) + +// buildTestManifest creates a manifest from in-memory files and returns its bytes. +func buildTestManifest(t *testing.T, files map[string][]byte) []byte { + t.Helper() + sourceFs := afero.NewMemMapFs() + for path, content := range files { + require.NoError(t, sourceFs.MkdirAll("/", 0o755)) + require.NoError(t, afero.WriteFile(sourceFs, "/"+path, content, 0o644)) + } + + opts := &mfer.ScannerOptions{Fs: sourceFs} + s := mfer.NewScannerWithOptions(opts) + require.NoError(t, s.EnumerateFS(sourceFs, "/", nil)) + + var buf bytes.Buffer + require.NoError(t, s.ToManifest(context.Background(), &buf, nil)) + return buf.Bytes() +} + +func TestExportManifestOperation(t *testing.T) { + testFiles := map[string][]byte{ + "hello.txt": []byte("Hello, World!"), + "sub/file.txt": []byte("nested content"), + } + manifestData := buildTestManifest(t, testFiles) + + // Write manifest to memfs + fs := afero.NewMemMapFs() + require.NoError(t, afero.WriteFile(fs, "/test.mf", manifestData, 0o644)) + + var stdout, stderr bytes.Buffer + exitCode := RunWithOptions(&RunOptions{ + Appname: "mfer", + Args: []string{"mfer", "export", "/test.mf"}, + Stdin: &bytes.Buffer{}, + Stdout: &stdout, + Stderr: &stderr, + Fs: fs, + }) + + require.Equal(t, 0, exitCode, "stderr: %s", stderr.String()) + + var entries []ExportEntry + require.NoError(t, json.Unmarshal(stdout.Bytes(), &entries)) + assert.Len(t, entries, 2) + + // Verify entries have expected fields + pathSet := make(map[string]bool) + for _, e := range entries { + pathSet[e.Path] = true + assert.NotEmpty(t, e.Hashes, "entry %s should have hashes", e.Path) + assert.Greater(t, e.Size, int64(0), "entry %s should have positive size", e.Path) + } + assert.True(t, 
pathSet["hello.txt"]) + assert.True(t, pathSet["sub/file.txt"]) +} + +func TestExportFromHTTPURL(t *testing.T) { + testFiles := map[string][]byte{ + "a.txt": []byte("aaa"), + } + manifestData := buildTestManifest(t, testFiles) + + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/octet-stream") + _, _ = w.Write(manifestData) + })) + defer server.Close() + + var stdout, stderr bytes.Buffer + exitCode := RunWithOptions(&RunOptions{ + Appname: "mfer", + Args: []string{"mfer", "export", server.URL + "/index.mf"}, + Stdin: &bytes.Buffer{}, + Stdout: &stdout, + Stderr: &stderr, + Fs: afero.NewMemMapFs(), + }) + + require.Equal(t, 0, exitCode, "stderr: %s", stderr.String()) + + var entries []ExportEntry + require.NoError(t, json.Unmarshal(stdout.Bytes(), &entries)) + assert.Len(t, entries, 1) + assert.Equal(t, "a.txt", entries[0].Path) +} + +func TestListFromHTTPURL(t *testing.T) { + testFiles := map[string][]byte{ + "one.txt": []byte("1"), + "two.txt": []byte("22"), + } + manifestData := buildTestManifest(t, testFiles) + + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + _, _ = w.Write(manifestData) + })) + defer server.Close() + + var stdout, stderr bytes.Buffer + exitCode := RunWithOptions(&RunOptions{ + Appname: "mfer", + Args: []string{"mfer", "list", server.URL + "/index.mf"}, + Stdin: &bytes.Buffer{}, + Stdout: &stdout, + Stderr: &stderr, + Fs: afero.NewMemMapFs(), + }) + + require.Equal(t, 0, exitCode, "stderr: %s", stderr.String()) + output := stdout.String() + assert.Contains(t, output, "one.txt") + assert.Contains(t, output, "two.txt") +} + +func TestIsHTTPURL(t *testing.T) { + assert.True(t, isHTTPURL("http://example.com/manifest.mf")) + assert.True(t, isHTTPURL("https://example.com/manifest.mf")) + assert.False(t, isHTTPURL("/local/path.mf")) + assert.False(t, isHTTPURL("relative/path.mf")) + assert.False(t, 
isHTTPURL("ftp://example.com/file")) +} diff --git a/internal/cli/fetch.go b/internal/cli/fetch.go index fda1881..677b65a 100644 --- a/internal/cli/fetch.go +++ b/internal/cli/fetch.go @@ -67,7 +67,7 @@ func (mfa *CLIApp) fetchManifestOperation(ctx *cli.Context) error { // Compute base URL (directory containing manifest) baseURL, err := url.Parse(manifestURL) if err != nil { - return err + return fmt.Errorf("fetch: invalid manifest URL: %w", err) } baseURL.Path = path.Dir(baseURL.Path) if !strings.HasSuffix(baseURL.Path, "/") { @@ -113,7 +113,7 @@ func (mfa *CLIApp) fetchManifestOperation(ctx *cli.Context) error { return fmt.Errorf("invalid path in manifest: %w", err) } - fileURL := baseURL.String() + f.Path + fileURL := baseURL.String() + encodeFilePath(f.Path) log.Infof("fetching %s", f.Path) if err := downloadFile(fileURL, localPath, f, progress); err != nil { @@ -139,6 +139,15 @@ func (mfa *CLIApp) fetchManifestOperation(ctx *cli.Context) error { return nil } +// encodeFilePath URL-encodes each segment of a file path while preserving slashes. +func encodeFilePath(p string) string { + segments := strings.Split(p, "/") + for i, seg := range segments { + segments[i] = url.PathEscape(seg) + } + return strings.Join(segments, "/") +} + // sanitizePath validates and sanitizes a file path from the manifest. // It prevents path traversal attacks and rejects unsafe paths. func sanitizePath(p string) (string, error) { @@ -258,7 +267,7 @@ func downloadFile(fileURL, localPath string, entry *mfer.MFFilePath, progress ch dir := filepath.Dir(localPath) if dir != "" && dir != "." 
{ if err := os.MkdirAll(dir, 0o755); err != nil { - return err + return fmt.Errorf("failed to create directory %s: %w", dir, err) } } @@ -278,9 +287,9 @@ func downloadFile(fileURL, localPath string, entry *mfer.MFFilePath, progress ch } // Fetch file - resp, err := http.Get(fileURL) + resp, err := http.Get(fileURL) //nolint:gosec // URL constructed from manifest base if err != nil { - return err + return fmt.Errorf("HTTP request failed: %w", err) } defer func() { _ = resp.Body.Close() }() @@ -298,7 +307,7 @@ func downloadFile(fileURL, localPath string, entry *mfer.MFFilePath, progress ch // Create temp file out, err := os.Create(tmpPath) if err != nil { - return err + return fmt.Errorf("failed to create temp file: %w", err) } // Set up hash computation diff --git a/internal/cli/fetch_test.go b/internal/cli/fetch_test.go index 5809534..43414a7 100644 --- a/internal/cli/fetch_test.go +++ b/internal/cli/fetch_test.go @@ -16,6 +16,29 @@ import ( "sneak.berlin/go/mfer/mfer" ) +func TestEncodeFilePath(t *testing.T) { + tests := []struct { + input string + expected string + }{ + {"file.txt", "file.txt"}, + {"dir/file.txt", "dir/file.txt"}, + {"my file.txt", "my%20file.txt"}, + {"dir/my file.txt", "dir/my%20file.txt"}, + {"file#1.txt", "file%231.txt"}, + {"file?v=1.txt", "file%3Fv=1.txt"}, + {"path/to/file with spaces.txt", "path/to/file%20with%20spaces.txt"}, + {"100%done.txt", "100%25done.txt"}, + } + + for _, tt := range tests { + t.Run(tt.input, func(t *testing.T) { + result := encodeFilePath(tt.input) + assert.Equal(t, tt.expected, result) + }) + } +} + func TestSanitizePath(t *testing.T) { // Valid paths that should be accepted validTests := []struct { diff --git a/internal/cli/freshen.go b/internal/cli/freshen.go index 61f7a86..a078ee5 100644 --- a/internal/cli/freshen.go +++ b/internal/cli/freshen.go @@ -41,8 +41,8 @@ func (mfa *CLIApp) freshenManifestOperation(ctx *cli.Context) error { basePath := ctx.String("base") showProgress := ctx.Bool("progress") - 
includeDotfiles := ctx.Bool("IncludeDotfiles") - followSymlinks := ctx.Bool("FollowSymLinks") + includeDotfiles := ctx.Bool("include-dotfiles") + followSymlinks := ctx.Bool("follow-symlinks") // Find manifest file var manifestPath string @@ -54,7 +54,7 @@ func (mfa *CLIApp) freshenManifestOperation(ctx *cli.Context) error { if statErr == nil && info.IsDir() { manifestPath, err = findManifest(mfa.Fs, arg) if err != nil { - return err + return fmt.Errorf("freshen: %w", err) } } else { manifestPath = arg @@ -62,7 +62,7 @@ func (mfa *CLIApp) freshenManifestOperation(ctx *cli.Context) error { } else { manifestPath, err = findManifest(mfa.Fs, ".") if err != nil { - return err + return fmt.Errorf("freshen: %w", err) } } @@ -93,7 +93,7 @@ func (mfa *CLIApp) freshenManifestOperation(ctx *cli.Context) error { absBase, err := filepath.Abs(basePath) if err != nil { - return err + return fmt.Errorf("freshen: invalid base path: %w", err) } err = afero.Walk(mfa.Fs, absBase, func(path string, info fs.FileInfo, walkErr error) error { @@ -104,7 +104,7 @@ func (mfa *CLIApp) freshenManifestOperation(ctx *cli.Context) error { // Get relative path relPath, err := filepath.Rel(absBase, path) if err != nil { - return err + return fmt.Errorf("freshen: failed to compute relative path for %s: %w", path, err) } // Skip the manifest file itself @@ -226,6 +226,9 @@ func (mfa *CLIApp) freshenManifestOperation(ctx *cli.Context) error { var hashedBytes int64 builder := mfer.NewBuilder() + if ctx.Bool("include-timestamps") { + builder.SetIncludeTimestamps(true) + } // Set up signing options if sign-key is provided if signKey := ctx.String("sign-key"); signKey != "" { diff --git a/internal/cli/gen.go b/internal/cli/gen.go index 6908c0f..f424a6f 100644 --- a/internal/cli/gen.go +++ b/internal/cli/gen.go @@ -20,9 +20,16 @@ func (mfa *CLIApp) generateManifestOperation(ctx *cli.Context) error { log.Debug("generateManifestOperation()") opts := &mfer.ScannerOptions{ - IncludeDotfiles: 
ctx.Bool("IncludeDotfiles"), - FollowSymLinks: ctx.Bool("FollowSymLinks"), - Fs: mfa.Fs, + IncludeDotfiles: ctx.Bool("include-dotfiles"), + FollowSymLinks: ctx.Bool("follow-symlinks"), + IncludeTimestamps: ctx.Bool("include-timestamps"), + Fs: mfa.Fs, + } + + // Set seed for deterministic UUID if provided + if seed := ctx.String("seed"); seed != "" { + opts.Seed = seed + log.Infof("using deterministic seed for manifest UUID") } // Set up signing options if sign-key is provided @@ -59,7 +66,7 @@ func (mfa *CLIApp) generateManifestOperation(ctx *cli.Context) error { if args.Len() == 0 { // Default to current directory if err := s.EnumeratePath(".", enumProgress); err != nil { - return err + return fmt.Errorf("generate: failed to enumerate current directory: %w", err) } } else { // Collect and validate all paths first @@ -68,7 +75,7 @@ func (mfa *CLIApp) generateManifestOperation(ctx *cli.Context) error { inputPath := args.Get(i) ap, err := filepath.Abs(inputPath) if err != nil { - return err + return fmt.Errorf("generate: invalid path %q: %w", inputPath, err) } // Validate path exists before adding to list if exists, _ := afero.Exists(mfa.Fs, ap); !exists { @@ -78,7 +85,7 @@ func (mfa *CLIApp) generateManifestOperation(ctx *cli.Context) error { paths = append(paths, ap) } if err := s.EnumeratePaths(enumProgress, paths...); err != nil { - return err + return fmt.Errorf("generate: failed to enumerate paths: %w", err) } } enumWg.Wait() diff --git a/internal/cli/list.go b/internal/cli/list.go index b89c236..66031d7 100644 --- a/internal/cli/list.go +++ b/internal/cli/list.go @@ -16,32 +16,20 @@ func (mfa *CLIApp) listManifestOperation(ctx *cli.Context) error { longFormat := ctx.Bool("long") print0 := ctx.Bool("print0") - // Find manifest file - var manifestPath string - var err error - - if ctx.Args().Len() > 0 { - arg := ctx.Args().Get(0) - info, statErr := mfa.Fs.Stat(arg) - if statErr == nil && info.IsDir() { - manifestPath, err = findManifest(mfa.Fs, arg) - if err != 
nil { - return err - } - } else { - manifestPath = arg - } - } else { - manifestPath, err = findManifest(mfa.Fs, ".") - if err != nil { - return err - } + pathOrURL, err := mfa.resolveManifestArg(ctx) + if err != nil { + return fmt.Errorf("list: %w", err) } - // Load manifest - manifest, err := mfer.NewManifestFromFile(mfa.Fs, manifestPath) + rc, err := mfa.openManifestReader(pathOrURL) if err != nil { - return fmt.Errorf("failed to load manifest: %w", err) + return fmt.Errorf("list: %w", err) + } + defer func() { _ = rc.Close() }() + + manifest, err := mfer.NewManifestFromReader(rc) + if err != nil { + return fmt.Errorf("list: failed to parse manifest: %w", err) } files := manifest.Files() diff --git a/internal/cli/manifest_loader.go b/internal/cli/manifest_loader.go new file mode 100644 index 0000000..333ac38 --- /dev/null +++ b/internal/cli/manifest_loader.go @@ -0,0 +1,56 @@ +package cli + +import ( + "fmt" + "io" + "net/http" + "strings" + "time" + + "github.com/urfave/cli/v2" +) + +// isHTTPURL returns true if the string starts with http:// or https://. +func isHTTPURL(s string) bool { + return strings.HasPrefix(s, "http://") || strings.HasPrefix(s, "https://") +} + +// openManifestReader opens a manifest from a path or URL and returns a ReadCloser. +// The caller must close the returned reader. 
+func (mfa *CLIApp) openManifestReader(pathOrURL string) (io.ReadCloser, error) { + if isHTTPURL(pathOrURL) { + client := &http.Client{Timeout: 30 * time.Second} + resp, err := client.Get(pathOrURL) //nolint:gosec // user-provided URL is intentional + if err != nil { + return nil, fmt.Errorf("failed to fetch %s: %w", pathOrURL, err) + } + if resp.StatusCode != http.StatusOK { + _ = resp.Body.Close() + return nil, fmt.Errorf("failed to fetch %s: HTTP %d", pathOrURL, resp.StatusCode) + } + return resp.Body, nil + } + f, err := mfa.Fs.Open(pathOrURL) + if err != nil { + return nil, err + } + return f, nil +} + +// resolveManifestArg resolves the manifest path from CLI arguments. +// HTTP(S) URLs are returned as-is. Directories are searched for index.mf/.index.mf. +// If no argument is given, the current directory is searched. +func (mfa *CLIApp) resolveManifestArg(ctx *cli.Context) (string, error) { + if ctx.Args().Len() > 0 { + arg := ctx.Args().Get(0) + if isHTTPURL(arg) { + return arg, nil + } + info, statErr := mfa.Fs.Stat(arg) + if statErr == nil && info.IsDir() { + return findManifest(mfa.Fs, arg) + } + return arg, nil + } + return findManifest(mfa.Fs, ".") +} diff --git a/internal/cli/mfer.go b/internal/cli/mfer.go index e99dd7e..0ef7dbf 100644 --- a/internal/cli/mfer.go +++ b/internal/cli/mfer.go @@ -123,14 +123,15 @@ func (mfa *CLIApp) run(args []string) { }, Flags: append(commonFlags(), &cli.BoolFlag{ - Name: "FollowSymLinks", - Aliases: []string{"follow-symlinks"}, + Name: "follow-symlinks", + Aliases: []string{"L"}, Usage: "Resolve encountered symlinks", }, &cli.BoolFlag{ - Name: "IncludeDotfiles", - Aliases: []string{"include-dotfiles"}, - Usage: "Include dot (hidden) files (excluded by default)", + Name: "include-dotfiles", + Aliases: []string{"IncludeDotfiles"}, + + Usage: "Include dot (hidden) files (excluded by default)", }, &cli.StringFlag{ Name: "output", @@ -154,6 +155,15 @@ func (mfa *CLIApp) run(args []string) { Usage: "GPG key ID to sign the 
manifest with", EnvVars: []string{"MFER_SIGN_KEY"}, }, + &cli.StringFlag{ + Name: "seed", + Usage: "Seed value for deterministic manifest UUID", + EnvVars: []string{"MFER_SEED"}, + }, + &cli.BoolFlag{ + Name: "include-timestamps", + Usage: "Include createdAt timestamp in manifest (omitted by default for determinism)", + }, ), }, { @@ -206,14 +216,15 @@ func (mfa *CLIApp) run(args []string) { Usage: "Base directory for resolving relative paths", }, &cli.BoolFlag{ - Name: "FollowSymLinks", - Aliases: []string{"follow-symlinks"}, + Name: "follow-symlinks", + Aliases: []string{"L"}, Usage: "Resolve encountered symlinks", }, &cli.BoolFlag{ - Name: "IncludeDotfiles", - Aliases: []string{"include-dotfiles"}, - Usage: "Include dot (hidden) files (excluded by default)", + Name: "include-dotfiles", + Aliases: []string{"IncludeDotfiles"}, + + Usage: "Include dot (hidden) files (excluded by default)", }, &cli.BoolFlag{ Name: "progress", @@ -226,8 +237,20 @@ func (mfa *CLIApp) run(args []string) { Usage: "GPG key ID to sign the manifest with", EnvVars: []string{"MFER_SIGN_KEY"}, }, + &cli.BoolFlag{ + Name: "include-timestamps", + Usage: "Include createdAt timestamp in manifest (omitted by default for determinism)", + }, ), }, + { + Name: "export", + Usage: "Export manifest contents as JSON", + ArgsUsage: "[manifest file or URL]", + Action: func(c *cli.Context) error { + return mfa.exportManifestOperation(c) + }, + }, { Name: "version", Usage: "Show version", @@ -269,7 +292,7 @@ func (mfa *CLIApp) run(args []string) { }, } - mfa.app.HideVersion = true + mfa.app.HideVersion = false err := mfa.app.Run(args) if err != nil { mfa.exitCode = 1 diff --git a/internal/scanner/scanner.go b/internal/scanner/scanner.go deleted file mode 100644 index 252e16a..0000000 --- a/internal/scanner/scanner.go +++ /dev/null @@ -1,373 +0,0 @@ -package scanner - -import ( - "context" - "io" - "io/fs" - "path" - "path/filepath" - "strings" - "sync" - "time" - - "github.com/spf13/afero" - 
"sneak.berlin/go/mfer/mfer" -) - -// Phase 1: Enumeration -// --------------------- -// Walking directories and calling stat() on files to collect metadata. -// Builds the list of files to be scanned. Relatively fast (metadata only). - -// EnumerateStatus contains progress information for the enumeration phase. -type EnumerateStatus struct { - FilesFound int64 // Number of files discovered so far - BytesFound int64 // Total size of discovered files (from stat) -} - -// Phase 2: Scan (ToManifest) -// -------------------------- -// Reading file contents and computing hashes for manifest generation. -// This is the expensive phase that reads all file data. - -// ScanStatus contains progress information for the scan phase. -type ScanStatus struct { - TotalFiles int64 // Total number of files to scan - ScannedFiles int64 // Number of files scanned so far - TotalBytes int64 // Total bytes to read (sum of all file sizes) - ScannedBytes int64 // Bytes read so far - BytesPerSec float64 // Current throughput rate -} - -// Options configures scanner behavior. -type Options struct { - IgnoreDotfiles bool // Skip files and directories starting with a dot - FollowSymLinks bool // Resolve symlinks instead of skipping them - Fs afero.Fs // Filesystem to use, defaults to OsFs if nil -} - -// FileEntry represents a file that has been enumerated. -type FileEntry struct { - Path string // Relative path (used in manifest) - AbsPath string // Absolute path (used for reading file content) - Size int64 // File size in bytes - Mtime time.Time // Last modification time - Ctime time.Time // Creation time (platform-dependent) -} - -// Scanner accumulates files and generates manifests from them. -type Scanner struct { - mu sync.RWMutex - files []*FileEntry - options *Options - fs afero.Fs -} - -// New creates a new Scanner with default options. -func New() *Scanner { - return NewWithOptions(nil) -} - -// NewWithOptions creates a new Scanner with the given options. 
-func NewWithOptions(opts *Options) *Scanner { - if opts == nil { - opts = &Options{} - } - fs := opts.Fs - if fs == nil { - fs = afero.NewOsFs() - } - return &Scanner{ - files: make([]*FileEntry, 0), - options: opts, - fs: fs, - } -} - -// EnumerateFile adds a single file to the scanner, calling stat() to get metadata. -func (s *Scanner) EnumerateFile(filePath string) error { - abs, err := filepath.Abs(filePath) - if err != nil { - return err - } - info, err := s.fs.Stat(abs) - if err != nil { - return err - } - // For single files, use the filename as the relative path - basePath := filepath.Dir(abs) - return s.enumerateFileWithInfo(filepath.Base(abs), basePath, info, nil) -} - -// EnumeratePath walks a directory path and adds all files to the scanner. -// If progress is non-nil, status updates are sent as files are discovered. -// The progress channel is closed when the method returns. -func (s *Scanner) EnumeratePath(inputPath string, progress chan<- EnumerateStatus) error { - if progress != nil { - defer close(progress) - } - abs, err := filepath.Abs(inputPath) - if err != nil { - return err - } - afs := afero.NewReadOnlyFs(afero.NewBasePathFs(s.fs, abs)) - return s.enumerateFS(afs, abs, progress) -} - -// EnumeratePaths walks multiple directory paths and adds all files to the scanner. -// If progress is non-nil, status updates are sent as files are discovered. -// The progress channel is closed when the method returns. -func (s *Scanner) EnumeratePaths(progress chan<- EnumerateStatus, inputPaths ...string) error { - if progress != nil { - defer close(progress) - } - for _, p := range inputPaths { - abs, err := filepath.Abs(p) - if err != nil { - return err - } - afs := afero.NewReadOnlyFs(afero.NewBasePathFs(s.fs, abs)) - if err := s.enumerateFS(afs, abs, progress); err != nil { - return err - } - } - return nil -} - -// EnumerateFS walks an afero filesystem and adds all files to the scanner. 
-// If progress is non-nil, status updates are sent as files are discovered. -// The progress channel is closed when the method returns. -// basePath is used to compute absolute paths for file reading. -func (s *Scanner) EnumerateFS(afs afero.Fs, basePath string, progress chan<- EnumerateStatus) error { - if progress != nil { - defer close(progress) - } - return s.enumerateFS(afs, basePath, progress) -} - -// enumerateFS is the internal implementation that doesn't close the progress channel. -func (s *Scanner) enumerateFS(afs afero.Fs, basePath string, progress chan<- EnumerateStatus) error { - return afero.Walk(afs, "/", func(p string, info fs.FileInfo, err error) error { - if err != nil { - return err - } - if s.options.IgnoreDotfiles && pathIsHidden(p) { - if info.IsDir() { - return filepath.SkipDir - } - return nil - } - return s.enumerateFileWithInfo(p, basePath, info, progress) - }) -} - -// enumerateFileWithInfo adds a file with pre-existing fs.FileInfo. -func (s *Scanner) enumerateFileWithInfo(filePath string, basePath string, info fs.FileInfo, progress chan<- EnumerateStatus) error { - if info.IsDir() { - // Manifests contain only files, directories are implied - return nil - } - - // Clean the path - remove leading slash if present - cleanPath := filePath - if len(cleanPath) > 0 && cleanPath[0] == '/' { - cleanPath = cleanPath[1:] - } - - // Compute absolute path for file reading - absPath := filepath.Join(basePath, cleanPath) - - entry := &FileEntry{ - Path: cleanPath, - AbsPath: absPath, - Size: info.Size(), - Mtime: info.ModTime(), - // Note: Ctime not available from fs.FileInfo on all platforms - // Will need platform-specific code to extract it - } - - s.mu.Lock() - s.files = append(s.files, entry) - filesFound := int64(len(s.files)) - var bytesFound int64 - for _, f := range s.files { - bytesFound += f.Size - } - s.mu.Unlock() - - sendEnumerateStatus(progress, EnumerateStatus{ - FilesFound: filesFound, - BytesFound: bytesFound, - }) - - return nil 
-} - -// Files returns a copy of all files added to the scanner. -func (s *Scanner) Files() []*FileEntry { - s.mu.RLock() - defer s.mu.RUnlock() - out := make([]*FileEntry, len(s.files)) - copy(out, s.files) - return out -} - -// FileCount returns the number of files in the scanner. -func (s *Scanner) FileCount() int64 { - s.mu.RLock() - defer s.mu.RUnlock() - return int64(len(s.files)) -} - -// TotalBytes returns the total size of all files in the scanner. -func (s *Scanner) TotalBytes() int64 { - s.mu.RLock() - defer s.mu.RUnlock() - var total int64 - for _, f := range s.files { - total += f.Size - } - return total -} - -// ToManifest reads all file contents, computes hashes, and generates a manifest. -// If progress is non-nil, status updates are sent approximately once per second. -// The progress channel is closed when the method returns. -// The manifest is written to the provided io.Writer. -func (s *Scanner) ToManifest(ctx context.Context, w io.Writer, progress chan<- ScanStatus) error { - if progress != nil { - defer close(progress) - } - - s.mu.RLock() - files := make([]*FileEntry, len(s.files)) - copy(files, s.files) - totalFiles := int64(len(files)) - var totalBytes int64 - for _, f := range files { - totalBytes += f.Size - } - s.mu.RUnlock() - - builder := mfer.NewBuilder() - - var scannedFiles int64 - var scannedBytes int64 - lastProgressTime := time.Now() - startTime := time.Now() - - for _, entry := range files { - // Check for cancellation - select { - case <-ctx.Done(): - return ctx.Err() - default: - } - - // Open file - f, err := s.fs.Open(entry.AbsPath) - if err != nil { - return err - } - - // Add to manifest with progress callback - bytesRead, err := builder.AddFile( - entry.Path, - entry.Size, - entry.Mtime, - f, - func(fileBytes int64) { - // Send progress at most once per second - now := time.Now() - if progress != nil && now.Sub(lastProgressTime) >= time.Second { - elapsed := now.Sub(startTime).Seconds() - currentBytes := scannedBytes + 
fileBytes - var rate float64 - if elapsed > 0 { - rate = float64(currentBytes) / elapsed - } - sendScanStatus(progress, ScanStatus{ - TotalFiles: totalFiles, - ScannedFiles: scannedFiles, - TotalBytes: totalBytes, - ScannedBytes: currentBytes, - BytesPerSec: rate, - }) - lastProgressTime = now - } - }, - ) - f.Close() - - if err != nil { - return err - } - - scannedFiles++ - scannedBytes += bytesRead - } - - // Send final progress - if progress != nil { - elapsed := time.Since(startTime).Seconds() - var rate float64 - if elapsed > 0 { - rate = float64(scannedBytes) / elapsed - } - sendScanStatus(progress, ScanStatus{ - TotalFiles: totalFiles, - ScannedFiles: scannedFiles, - TotalBytes: totalBytes, - ScannedBytes: scannedBytes, - BytesPerSec: rate, - }) - } - - // Build and write manifest - return builder.Build(w) -} - -// pathIsHidden returns true if the path or any of its parent directories -// start with a dot (hidden files/directories). -func pathIsHidden(p string) bool { - tp := path.Clean(p) - if strings.HasPrefix(tp, ".") { - return true - } - for { - d, f := path.Split(tp) - if strings.HasPrefix(f, ".") { - return true - } - if d == "" { - return false - } - tp = d[0 : len(d)-1] // trim trailing slash from dir - } -} - -// sendEnumerateStatus sends a status update without blocking. -// If the channel is full, the update is dropped. -func sendEnumerateStatus(ch chan<- EnumerateStatus, status EnumerateStatus) { - if ch == nil { - return - } - select { - case ch <- status: - default: - // Channel full, drop this update - } -} - -// sendScanStatus sends a status update without blocking. -// If the channel is full, the update is dropped. 
-func sendScanStatus(ch chan<- ScanStatus, status ScanStatus) { - if ch == nil { - return - } - select { - case ch <- status: - default: - // Channel full, drop this update - } -} diff --git a/mfer/builder.go b/mfer/builder.go index 22d4d4a..0b2262f 100644 --- a/mfer/builder.go +++ b/mfer/builder.go @@ -3,13 +3,48 @@ package mfer import ( "crypto/sha256" "errors" + "fmt" "io" + "sort" + "strings" "sync" "time" + "unicode/utf8" "github.com/multiformats/go-multihash" ) +// ValidatePath checks that a file path conforms to manifest path invariants: +// - Must be valid UTF-8 +// - Must use forward slashes only (no backslashes) +// - Must be relative (no leading /) +// - Must not contain ".." segments +// - Must not contain empty segments (no "//") +// - Must not be empty +func ValidatePath(p string) error { + if p == "" { + return errors.New("path cannot be empty") + } + if !utf8.ValidString(p) { + return fmt.Errorf("path %q is not valid UTF-8", p) + } + if strings.ContainsRune(p, '\\') { + return fmt.Errorf("path %q contains backslash; use forward slashes only", p) + } + if strings.HasPrefix(p, "/") { + return fmt.Errorf("path %q is absolute; must be relative", p) + } + for _, seg := range strings.Split(p, "/") { + if seg == "" { + return fmt.Errorf("path %q contains empty segment", p) + } + if seg == ".." { + return fmt.Errorf("path %q contains '..' segment", p) + } + } + return nil +} + // RelFilePath represents a relative file path within a manifest. type RelFilePath string @@ -50,10 +85,20 @@ type FileHashProgress struct { // Builder constructs a manifest by adding files one at a time. type Builder struct { - mu sync.Mutex - files []*MFFilePath - createdAt time.Time - signingOptions *SigningOptions + mu sync.Mutex + files []*MFFilePath + createdAt time.Time + includeTimestamps bool + signingOptions *SigningOptions + fixedUUID []byte // if set, use this UUID instead of generating one +} + +// SetSeed derives a deterministic UUID from the given seed string. 
+// The seed is hashed once with SHA-256 and the first 16 bytes are used +// as a fixed UUID for the manifest. +func (b *Builder) SetSeed(seed string) { + hash := sha256.Sum256([]byte(seed)) + b.fixedUUID = hash[:16] } // NewBuilder creates a new Builder. @@ -74,6 +119,10 @@ func (b *Builder) AddFile( reader io.Reader, progress chan<- FileHashProgress, ) (FileSize, error) { + if err := ValidatePath(string(path)); err != nil { + return 0, err + } + // Create hash writer h := sha256.New() @@ -96,6 +145,11 @@ func (b *Builder) AddFile( } } + // Verify actual bytes read matches declared size + if totalRead != size { + return totalRead, fmt.Errorf("size mismatch for %q: declared %d bytes but read %d bytes", path, size, totalRead) + } + // Encode hash as multihash (SHA2-256) mh, err := multihash.Encode(h.Sum(nil), multihash.SHA2_256) if err != nil { @@ -141,8 +195,8 @@ func (b *Builder) FileCount() int { // This is useful when the hash is already known (e.g., from an existing manifest). // Returns an error if path is empty, size is negative, or hash is nil/empty. func (b *Builder) AddFileWithHash(path RelFilePath, size FileSize, mtime ModTime, hash Multihash) error { - if path == "" { - return errors.New("path cannot be empty") + if err := ValidatePath(string(path)); err != nil { + return fmt.Errorf("add file: %w", err) } if size < 0 { return errors.New("size cannot be negative") @@ -166,6 +220,14 @@ func (b *Builder) AddFileWithHash(path RelFilePath, size FileSize, mtime ModTime return nil } +// SetIncludeTimestamps controls whether the manifest includes a createdAt timestamp. +// By default timestamps are omitted for deterministic output. +func (b *Builder) SetIncludeTimestamps(include bool) { + b.mu.Lock() + defer b.mu.Unlock() + b.includeTimestamps = include +} + // SetSigningOptions sets the GPG signing options for the manifest. // If opts is non-nil, the manifest will be signed when Build() is called. 
func (b *Builder) SetSigningOptions(opts *SigningOptions) { @@ -179,30 +241,41 @@ func (b *Builder) Build(w io.Writer) error { b.mu.Lock() defer b.mu.Unlock() + // Sort files by path for deterministic output + sort.Slice(b.files, func(i, j int) bool { + return b.files[i].Path < b.files[j].Path + }) + // Create inner manifest inner := &MFFile{ - Version: MFFile_VERSION_ONE, - CreatedAt: newTimestampFromTime(b.createdAt), - Files: b.files, + Version: MFFile_VERSION_ONE, + Files: b.files, + } + if b.includeTimestamps { + inner.CreatedAt = newTimestampFromTime(b.createdAt) } // Create a temporary manifest to use existing serialization m := &manifest{ pbInner: inner, signingOptions: b.signingOptions, + fixedUUID: b.fixedUUID, } // Generate outer wrapper if err := m.generateOuter(); err != nil { - return err + return fmt.Errorf("build: generate outer: %w", err) } // Generate final output if err := m.generate(); err != nil { - return err + return fmt.Errorf("build: generate: %w", err) } // Write to output _, err := w.Write(m.output.Bytes()) - return err + if err != nil { + return fmt.Errorf("build: write output: %w", err) + } + return nil } diff --git a/mfer/builder_test.go b/mfer/builder_test.go index a92994f..577106d 100644 --- a/mfer/builder_test.go +++ b/mfer/builder_test.go @@ -92,6 +92,230 @@ func TestBuilderBuild(t *testing.T) { assert.True(t, strings.HasPrefix(buf.String(), MAGIC)) } +func TestNewTimestampFromTimeExtremeDate(t *testing.T) { + // Regression test: newTimestampFromTime used UnixNano() which panics + // for dates outside ~1678-2262. Now uses Nanosecond() which is safe. 
+ tests := []struct { + name string + time time.Time + }{ + {"zero time", time.Time{}}, + {"year 1000", time.Date(1000, 1, 1, 0, 0, 0, 0, time.UTC)}, + {"year 3000", time.Date(3000, 1, 1, 0, 0, 0, 123456789, time.UTC)}, + {"unix epoch", time.Unix(0, 0)}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Should not panic + ts := newTimestampFromTime(tt.time) + assert.Equal(t, tt.time.Unix(), ts.Seconds) + assert.Equal(t, int32(tt.time.Nanosecond()), ts.Nanos) + }) + } +} + +func TestBuilderDeterministicOutput(t *testing.T) { + buildManifest := func() []byte { + b := NewBuilder() + // Use a fixed createdAt and UUID so output is reproducible + b.createdAt = time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC) + b.fixedUUID = make([]byte, 16) // all zeros + + mtime := ModTime(time.Date(2025, 6, 1, 0, 0, 0, 0, time.UTC)) + + // Add files in reverse order to test sorting + files := []struct { + path string + content string + }{ + {"c/file.txt", "content c"}, + {"a/file.txt", "content a"}, + {"b/file.txt", "content b"}, + } + for _, f := range files { + r := bytes.NewReader([]byte(f.content)) + _, err := b.AddFile(RelFilePath(f.path), FileSize(len(f.content)), mtime, r, nil) + require.NoError(t, err) + } + + var buf bytes.Buffer + err := b.Build(&buf) + require.NoError(t, err) + return buf.Bytes() + } + + out1 := buildManifest() + out2 := buildManifest() + assert.Equal(t, out1, out2, "two builds with same input should produce byte-identical output") +} + +func TestSetSeedDeterministic(t *testing.T) { + b1 := NewBuilder() + b1.SetSeed("test-seed-value") + b2 := NewBuilder() + b2.SetSeed("test-seed-value") + assert.Equal(t, b1.fixedUUID, b2.fixedUUID, "same seed should produce same UUID") + assert.Len(t, b1.fixedUUID, 16, "UUID should be 16 bytes") + + b3 := NewBuilder() + b3.SetSeed("different-seed") + assert.NotEqual(t, b1.fixedUUID, b3.fixedUUID, "different seeds should produce different UUIDs") +} + +func TestValidatePath(t *testing.T) { + valid := 
[]string{ + "file.txt", + "dir/file.txt", + "a/b/c/d.txt", + "file with spaces.txt", + "日本語.txt", + } + for _, p := range valid { + t.Run("valid:"+p, func(t *testing.T) { + assert.NoError(t, ValidatePath(p)) + }) + } + + invalid := []struct { + path string + desc string + }{ + {"", "empty"}, + {"/absolute", "absolute path"}, + {"has\\backslash", "backslash"}, + {"has/../traversal", "dot-dot segment"}, + {"has//double", "empty segment"}, + {"..", "just dot-dot"}, + {string([]byte{0xff, 0xfe}), "invalid UTF-8"}, + } + for _, tt := range invalid { + t.Run("invalid:"+tt.desc, func(t *testing.T) { + assert.Error(t, ValidatePath(tt.path)) + }) + } +} + +func TestBuilderAddFileSizeMismatch(t *testing.T) { + b := NewBuilder() + content := []byte("short") + reader := bytes.NewReader(content) + + // Declare wrong size + _, err := b.AddFile("test.txt", FileSize(100), ModTime(time.Now()), reader, nil) + assert.Error(t, err) + assert.Contains(t, err.Error(), "size mismatch") +} + +func TestBuilderAddFileInvalidPath(t *testing.T) { + b := NewBuilder() + content := []byte("data") + reader := bytes.NewReader(content) + + _, err := b.AddFile("", FileSize(len(content)), ModTime(time.Now()), reader, nil) + assert.Error(t, err) + + reader.Reset(content) + _, err = b.AddFile("/absolute", FileSize(len(content)), ModTime(time.Now()), reader, nil) + assert.Error(t, err) +} + +func TestBuilderAddFileWithProgress(t *testing.T) { + b := NewBuilder() + content := bytes.Repeat([]byte("x"), 1000) + reader := bytes.NewReader(content) + progress := make(chan FileHashProgress, 100) + + bytesRead, err := b.AddFile("test.txt", FileSize(len(content)), ModTime(time.Now()), reader, progress) + close(progress) + require.NoError(t, err) + assert.Equal(t, FileSize(1000), bytesRead) + + var updates []FileHashProgress + for p := range progress { + updates = append(updates, p) + } + assert.NotEmpty(t, updates) + // Last update should show all bytes + assert.Equal(t, FileSize(1000), 
updates[len(updates)-1].BytesRead) +} + +func TestBuilderBuildRoundTrip(t *testing.T) { + // Build a manifest, deserialize it, verify all fields survive round-trip + b := NewBuilder() + now := time.Date(2025, 6, 15, 12, 0, 0, 0, time.UTC) + + files := []struct { + path string + content []byte + }{ + {"alpha.txt", []byte("alpha content")}, + {"beta/gamma.txt", []byte("gamma content")}, + {"beta/delta.txt", []byte("delta content")}, + } + + for _, f := range files { + reader := bytes.NewReader(f.content) + _, err := b.AddFile(RelFilePath(f.path), FileSize(len(f.content)), ModTime(now), reader, nil) + require.NoError(t, err) + } + + var buf bytes.Buffer + require.NoError(t, b.Build(&buf)) + + m, err := NewManifestFromReader(&buf) + require.NoError(t, err) + + mfiles := m.Files() + require.Len(t, mfiles, 3) + + // Verify sorted order + assert.Equal(t, "alpha.txt", mfiles[0].Path) + assert.Equal(t, "beta/delta.txt", mfiles[1].Path) + assert.Equal(t, "beta/gamma.txt", mfiles[2].Path) + + // Verify sizes + assert.Equal(t, int64(len("alpha content")), mfiles[0].Size) + + // Verify hashes are present + for _, f := range mfiles { + require.NotEmpty(t, f.Hashes, "file %s should have hashes", f.Path) + assert.NotEmpty(t, f.Hashes[0].MultiHash) + } +} + +func TestNewManifestFromReaderInvalidMagic(t *testing.T) { + _, err := NewManifestFromReader(bytes.NewReader([]byte("NOT_VALID"))) + assert.Error(t, err) + assert.Contains(t, err.Error(), "invalid file format") +} + +func TestNewManifestFromReaderEmpty(t *testing.T) { + _, err := NewManifestFromReader(bytes.NewReader([]byte{})) + assert.Error(t, err) +} + +func TestNewManifestFromReaderTruncated(t *testing.T) { + // Just the magic with nothing after + _, err := NewManifestFromReader(bytes.NewReader([]byte(MAGIC))) + assert.Error(t, err) +} + +func TestManifestString(t *testing.T) { + b := NewBuilder() + content := []byte("test") + reader := bytes.NewReader(content) + _, err := b.AddFile("test.txt", FileSize(len(content)), 
ModTime(time.Now()), reader, nil) + require.NoError(t, err) + + var buf bytes.Buffer + require.NoError(t, b.Build(&buf)) + + m, err := NewManifestFromReader(&buf) + require.NoError(t, err) + assert.Contains(t, m.String(), "count=1") +} + func TestBuilderBuildEmpty(t *testing.T) { b := NewBuilder() @@ -102,3 +326,62 @@ func TestBuilderBuildEmpty(t *testing.T) { // Should still produce valid manifest with 0 files assert.True(t, strings.HasPrefix(buf.String(), MAGIC)) } + +func TestBuilderOmitsCreatedAtByDefault(t *testing.T) { + b := NewBuilder() + content := []byte("hello") + _, err := b.AddFile("test.txt", FileSize(len(content)), ModTime(time.Now()), bytes.NewReader(content), nil) + require.NoError(t, err) + + var buf bytes.Buffer + require.NoError(t, b.Build(&buf)) + + m, err := NewManifestFromReader(&buf) + require.NoError(t, err) + assert.Nil(t, m.pbInner.CreatedAt, "createdAt should be nil by default for deterministic output") +} + +func TestBuilderIncludesCreatedAtWhenRequested(t *testing.T) { + b := NewBuilder() + b.SetIncludeTimestamps(true) + content := []byte("hello") + _, err := b.AddFile("test.txt", FileSize(len(content)), ModTime(time.Now()), bytes.NewReader(content), nil) + require.NoError(t, err) + + var buf bytes.Buffer + require.NoError(t, b.Build(&buf)) + + m, err := NewManifestFromReader(&buf) + require.NoError(t, err) + assert.NotNil(t, m.pbInner.CreatedAt, "createdAt should be set when IncludeTimestamps is true") +} + +func TestBuilderDeterministicFileOrder(t *testing.T) { + // Two builds with same files in different order should produce same file ordering. + // Note: UUIDs differ per build, so we compare parsed file lists, not raw bytes. 
+ buildAndParse := func(order []string) []*MFFilePath { + b := NewBuilder() + for _, name := range order { + content := []byte("content of " + name) + _, err := b.AddFile(RelFilePath(name), FileSize(len(content)), ModTime(time.Unix(1000, 0)), bytes.NewReader(content), nil) + require.NoError(t, err) + } + var buf bytes.Buffer + require.NoError(t, b.Build(&buf)) + m, err := NewManifestFromReader(&buf) + require.NoError(t, err) + return m.Files() + } + + files1 := buildAndParse([]string{"b.txt", "a.txt"}) + files2 := buildAndParse([]string{"a.txt", "b.txt"}) + + require.Len(t, files1, 2) + require.Len(t, files2, 2) + for i := range files1 { + assert.Equal(t, files1[i].Path, files2[i].Path) + assert.Equal(t, files1[i].Size, files2[i].Size) + } + assert.Equal(t, "a.txt", files1[0].Path) + assert.Equal(t, "b.txt", files1[1].Path) +} diff --git a/mfer/checker.go b/mfer/checker.go index 2e47d3a..35f233c 100644 --- a/mfer/checker.go +++ b/mfer/checker.go @@ -70,6 +70,8 @@ type Checker struct { fs afero.Fs // manifestPaths is a set of paths in the manifest for quick lookup manifestPaths map[RelFilePath]struct{} + // manifestRelPath is the relative path of the manifest file from basePath (for exclusion) + manifestRelPath RelFilePath // signature info from the manifest signature []byte signer []byte @@ -100,14 +102,25 @@ func NewChecker(manifestPath string, basePath string, fs afero.Fs) (*Checker, er manifestPaths[RelFilePath(f.Path)] = struct{}{} } + // Compute manifest's relative path from basePath for exclusion in FindExtraFiles + absManifest, err := filepath.Abs(manifestPath) + if err != nil { + return nil, err + } + manifestRel, err := filepath.Rel(abs, absManifest) + if err != nil { + manifestRel = "" + } + return &Checker{ - basePath: AbsFilePath(abs), - files: files, - fs: fs, - manifestPaths: manifestPaths, - signature: m.pbOuter.Signature, - signer: m.pbOuter.Signer, - signingPubKey: m.pbOuter.SigningPubKey, + basePath: AbsFilePath(abs), + files: files, + fs: fs, + 
manifestPaths: manifestPaths, + manifestRelPath: RelFilePath(manifestRel), + signature: m.pbOuter.Signature, + signer: m.pbOuter.Signer, + signingPubKey: m.pbOuter.SigningPubKey, }, nil } @@ -170,6 +183,7 @@ func (c *Checker) Check(ctx context.Context, results chan<- Result, progress cha var failures FileCount startTime := time.Now() + lastProgressTime := time.Now() for _, entry := range c.files { select { @@ -188,29 +202,34 @@ func (c *Checker) Check(ctx context.Context, results chan<- Result, progress cha results <- result } - // Send progress with rate and ETA calculation + // Send progress at most once per second (rate-limited) if progress != nil { - elapsed := time.Since(startTime) - var bytesPerSec float64 - var eta time.Duration + now := time.Now() + isLast := checkedFiles == totalFiles + if isLast || now.Sub(lastProgressTime) >= time.Second { + elapsed := time.Since(startTime) + var bytesPerSec float64 + var eta time.Duration - if elapsed > 0 && checkedBytes > 0 { - bytesPerSec = float64(checkedBytes) / elapsed.Seconds() - remainingBytes := totalBytes - checkedBytes - if bytesPerSec > 0 { - eta = time.Duration(float64(remainingBytes)/bytesPerSec) * time.Second + if elapsed > 0 && checkedBytes > 0 { + bytesPerSec = float64(checkedBytes) / elapsed.Seconds() + remainingBytes := totalBytes - checkedBytes + if bytesPerSec > 0 { + eta = time.Duration(float64(remainingBytes)/bytesPerSec) * time.Second + } } - } - sendCheckStatus(progress, CheckStatus{ - TotalFiles: totalFiles, - CheckedFiles: checkedFiles, - TotalBytes: totalBytes, - CheckedBytes: checkedBytes, - BytesPerSec: bytesPerSec, - ETA: eta, - Failures: failures, - }) + sendCheckStatus(progress, CheckStatus{ + TotalFiles: totalFiles, + CheckedFiles: checkedFiles, + TotalBytes: totalBytes, + CheckedBytes: checkedBytes, + BytesPerSec: bytesPerSec, + ETA: eta, + Failures: failures, + }) + lastProgressTime = now + } } } @@ -224,12 +243,7 @@ func (c *Checker) checkFile(entry *MFFilePath, checkedBytes 
*FileSize) Result { // Check if file exists info, err := c.fs.Stat(absPath) if err != nil { - if errors.Is(err, afero.ErrFileNotFound) || errors.Is(err, errors.New("file does not exist")) { - return Result{Path: relPath, Status: StatusMissing, Message: "file not found"} - } - // Check for "file does not exist" style errors - exists, _ := afero.Exists(c.fs, absPath) - if !exists { + if errors.Is(err, os.ErrNotExist) || errors.Is(err, afero.ErrFileNotFound) { return Result{Path: relPath, Status: StatusMissing, Message: "file not found"} } return Result{Path: relPath, Status: StatusError, Message: err.Error()} @@ -277,12 +291,14 @@ func (c *Checker) checkFile(entry *MFFilePath, checkedBytes *FileSize) Result { // FindExtraFiles walks the filesystem and reports files not in the manifest. // Results are sent to the results channel. The channel is closed when done. +// Hidden files/directories (starting with .) are skipped, as they are excluded +// from manifests by default. The manifest file itself is also skipped. 
func (c *Checker) FindExtraFiles(ctx context.Context, results chan<- Result) error { if results != nil { defer close(results) } - return afero.Walk(c.fs, string(c.basePath), func(path string, info os.FileInfo, err error) error { + return afero.Walk(c.fs, string(c.basePath), func(walkPath string, info os.FileInfo, err error) error { if err != nil { return err } @@ -293,18 +309,32 @@ func (c *Checker) FindExtraFiles(ctx context.Context, results chan<- Result) err default: } + // Get relative path + rel, err := filepath.Rel(string(c.basePath), walkPath) + if err != nil { + return err + } + + // Skip hidden files and directories (dotfiles) + if IsHiddenPath(filepath.ToSlash(rel)) { + if info.IsDir() { + return filepath.SkipDir + } + return nil + } + // Skip directories if info.IsDir() { return nil } - // Get relative path - rel, err := filepath.Rel(string(c.basePath), path) - if err != nil { - return err - } relPath := RelFilePath(rel) + // Skip the manifest file itself + if relPath == c.manifestRelPath { + return nil + } + // Check if path is in manifest if _, exists := c.manifestPaths[relPath]; !exists { if results != nil { diff --git a/mfer/checker_test.go b/mfer/checker_test.go index 2313bb8..3709d48 100644 --- a/mfer/checker_test.go +++ b/mfer/checker_test.go @@ -3,6 +3,7 @@ package mfer import ( "bytes" "context" + "fmt" "testing" "time" @@ -305,6 +306,44 @@ func TestFindExtraFiles(t *testing.T) { assert.Equal(t, "not in manifest", extras[0].Message) } +func TestFindExtraFilesSkipsManifestAndDotfiles(t *testing.T) { + fs := afero.NewMemMapFs() + manifestFiles := map[string][]byte{ + "file1.txt": []byte("in manifest"), + } + createTestManifest(t, fs, "/data/.index.mf", manifestFiles) + createFilesOnDisk(t, fs, "/data", map[string][]byte{ + "file1.txt": []byte("in manifest"), + }) + // Create dotfile and manifest that should be skipped + require.NoError(t, afero.WriteFile(fs, "/data/.hidden", []byte("hidden"), 0o644)) + require.NoError(t, afero.WriteFile(fs, 
"/data/.config/settings", []byte("cfg"), 0o644)) + // Create a real extra file + require.NoError(t, fs.MkdirAll("/data", 0o755)) + require.NoError(t, afero.WriteFile(fs, "/data/extra.txt", []byte("extra"), 0o644)) + + chk, err := NewChecker("/data/.index.mf", "/data", fs) + require.NoError(t, err) + + results := make(chan Result, 10) + err = chk.FindExtraFiles(context.Background(), results) + require.NoError(t, err) + + var extras []Result + for r := range results { + extras = append(extras, r) + } + + // Should only report extra.txt, not .hidden, .config/settings, or .index.mf + for _, e := range extras { + t.Logf("extra: %s", e.Path) + } + assert.Len(t, extras, 1) + if len(extras) > 0 { + assert.Equal(t, RelFilePath("extra.txt"), extras[0].Path) + } +} + func TestFindExtraFilesContextCancellation(t *testing.T) { fs := afero.NewMemMapFs() files := map[string][]byte{"file.txt": []byte("data")} @@ -381,6 +420,94 @@ func TestCheckSubdirectories(t *testing.T) { assert.Equal(t, 3, okCount) } +func TestCheckMissingFileDetectedWithoutFallback(t *testing.T) { + // Regression test: errors.Is(err, errors.New("...")) never matches because + // errors.New creates a new value each time. The fix uses os.ErrNotExist instead. 
+ fs := afero.NewMemMapFs() + files := map[string][]byte{ + "exists.txt": []byte("here"), + "missing.txt": []byte("not on disk"), + } + createTestManifest(t, fs, "/manifest.mf", files) + // Only create one file on disk + createFilesOnDisk(t, fs, "/data", map[string][]byte{ + "exists.txt": []byte("here"), + }) + + chk, err := NewChecker("/manifest.mf", "/data", fs) + require.NoError(t, err) + + results := make(chan Result, 10) + err = chk.Check(context.Background(), results, nil) + require.NoError(t, err) + + statusCounts := map[Status]int{} + for r := range results { + statusCounts[r.Status]++ + if r.Status == StatusMissing { + assert.Equal(t, RelFilePath("missing.txt"), r.Path) + } + } + assert.Equal(t, 1, statusCounts[StatusOK], "one file should be OK") + assert.Equal(t, 1, statusCounts[StatusMissing], "one file should be MISSING") + assert.Equal(t, 0, statusCounts[StatusError], "no files should be ERROR") +} + +func TestFindExtraFilesSkipsDotfiles(t *testing.T) { + // Regression test for #16: FindExtraFiles should not report dotfiles + // or the manifest file itself as extra files. 
+ fs := afero.NewMemMapFs() + files := map[string][]byte{ + "file1.txt": []byte("in manifest"), + } + createTestManifest(t, fs, "/data/.index.mf", files) + createFilesOnDisk(t, fs, "/data", files) + + // Add dotfiles and manifest file on disk + require.NoError(t, afero.WriteFile(fs, "/data/.hidden", []byte("dotfile"), 0o644)) + require.NoError(t, fs.MkdirAll("/data/.git", 0o755)) + require.NoError(t, afero.WriteFile(fs, "/data/.git/config", []byte("git config"), 0o644)) + + chk, err := NewChecker("/data/.index.mf", "/data", fs) + require.NoError(t, err) + + results := make(chan Result, 10) + err = chk.FindExtraFiles(context.Background(), results) + require.NoError(t, err) + + var extras []Result + for r := range results { + extras = append(extras, r) + } + + // Should report NO extra files — dotfiles and manifest should be skipped + assert.Empty(t, extras, "FindExtraFiles should not report dotfiles or manifest file as extra; got: %v", extras) +} + +func TestFindExtraFilesSkipsManifestFile(t *testing.T) { + // The manifest file itself should never be reported as extra + fs := afero.NewMemMapFs() + files := map[string][]byte{ + "file1.txt": []byte("content"), + } + createTestManifest(t, fs, "/data/index.mf", files) + createFilesOnDisk(t, fs, "/data", files) + + chk, err := NewChecker("/data/index.mf", "/data", fs) + require.NoError(t, err) + + results := make(chan Result, 10) + err = chk.FindExtraFiles(context.Background(), results) + require.NoError(t, err) + + var extras []Result + for r := range results { + extras = append(extras, r) + } + + assert.Empty(t, extras, "manifest file should not be reported as extra; got: %v", extras) +} + func TestCheckEmptyManifest(t *testing.T) { fs := afero.NewMemMapFs() // Create manifest with no files @@ -402,3 +529,40 @@ func TestCheckEmptyManifest(t *testing.T) { } assert.Equal(t, 0, count) } + +func TestCheckProgressRateLimited(t *testing.T) { + // Create many small files - progress should be rate-limited, not one per file. 
+ // With rate-limiting to once per second, we should get far fewer progress + // updates than files (plus one final update). + fs := afero.NewMemMapFs() + files := make(map[string][]byte, 100) + for i := 0; i < 100; i++ { + name := fmt.Sprintf("file%03d.txt", i) + files[name] = []byte("content") + } + createTestManifest(t, fs, "/manifest.mf", files) + createFilesOnDisk(t, fs, "/data", files) + + chk, err := NewChecker("/manifest.mf", "/data", fs) + require.NoError(t, err) + + results := make(chan Result, 200) + progress := make(chan CheckStatus, 200) + err = chk.Check(context.Background(), results, progress) + require.NoError(t, err) + + // Drain results + for range results { + } + + // Count progress updates + var progressCount int + for range progress { + progressCount++ + } + + // Should be far fewer than 100 (rate-limited to once per second) + // At minimum we get the final update + assert.GreaterOrEqual(t, progressCount, 1, "should get at least the final progress update") + assert.Less(t, progressCount, 100, "progress should be rate-limited, not one per file") +} diff --git a/mfer/constants.go b/mfer/constants.go index f38c74a..4640637 100644 --- a/mfer/constants.go +++ b/mfer/constants.go @@ -3,4 +3,9 @@ package mfer const ( Version = "0.1.0" ReleaseDate = "2025-12-17" + + // MaxDecompressedSize is the maximum allowed size of decompressed manifest + // data (256 MB). This prevents decompression bombs from consuming excessive + // memory. 
+ MaxDecompressedSize int64 = 256 * 1024 * 1024 ) diff --git a/mfer/deserialize.go b/mfer/deserialize.go index 76a8655..f8de802 100644 --- a/mfer/deserialize.go +++ b/mfer/deserialize.go @@ -44,7 +44,7 @@ func (m *manifest) deserializeInner() error { // Verify hash of compressed data before decompression h := sha256.New() if _, err := h.Write(m.pbOuter.InnerMessage); err != nil { - return err + return fmt.Errorf("deserialize: hash write: %w", err) } sha256Hash := h.Sum(nil) if !bytes.Equal(sha256Hash, m.pbOuter.Sha256) { @@ -72,13 +72,23 @@ func (m *manifest) deserializeInner() error { zr, err := zstd.NewReader(bb) if err != nil { - return err + return fmt.Errorf("deserialize: zstd reader: %w", err) } defer zr.Close() - dat, err := io.ReadAll(zr) + // Limit decompressed size to prevent decompression bombs. + // Use declared size + 1 byte to detect overflow, capped at MaxDecompressedSize. + maxSize := MaxDecompressedSize + if m.pbOuter.Size > 0 && m.pbOuter.Size < int64(maxSize) { + maxSize = int64(m.pbOuter.Size) + 1 + } + limitedReader := io.LimitReader(zr, maxSize) + dat, err := io.ReadAll(limitedReader) if err != nil { - return err + return fmt.Errorf("deserialize: decompress: %w", err) + } + if int64(len(dat)) >= MaxDecompressedSize { + return fmt.Errorf("decompressed data exceeds maximum allowed size of %d bytes", MaxDecompressedSize) } isize := len(dat) @@ -90,7 +100,7 @@ func (m *manifest) deserializeInner() error { // Deserialize inner message m.pbInner = new(MFFile) if err := proto.Unmarshal(dat, m.pbInner); err != nil { - return err + return fmt.Errorf("deserialize: unmarshal inner: %w", err) } // Validate inner UUID diff --git a/mfer/gpg.go b/mfer/gpg.go index c587b2e..2ae607b 100644 --- a/mfer/gpg.go +++ b/mfer/gpg.go @@ -20,7 +20,7 @@ type SigningOptions struct { // gpgSign creates a detached signature of the data using the specified key. // Returns the armored detached signature. 
func gpgSign(data []byte, keyID GPGKeyID) ([]byte, error) { - cmd := exec.Command("gpg", + cmd := exec.Command("gpg", "--batch", "--no-tty", "--detach-sign", "--armor", "--local-user", string(keyID), @@ -42,7 +42,7 @@ func gpgSign(data []byte, keyID GPGKeyID) ([]byte, error) { // gpgExportPublicKey exports the public key for the specified key ID. // Returns the armored public key. func gpgExportPublicKey(keyID GPGKeyID) ([]byte, error) { - cmd := exec.Command("gpg", + cmd := exec.Command("gpg", "--batch", "--no-tty", "--export", "--armor", string(keyID), @@ -65,7 +65,7 @@ func gpgExportPublicKey(keyID GPGKeyID) ([]byte, error) { // gpgGetKeyFingerprint gets the full fingerprint for a key ID. func gpgGetKeyFingerprint(keyID GPGKeyID) ([]byte, error) { - cmd := exec.Command("gpg", + cmd := exec.Command("gpg", "--batch", "--no-tty", "--with-colons", "--fingerprint", string(keyID), @@ -100,7 +100,7 @@ func gpgExtractPubKeyFingerprint(pubKey []byte) (string, error) { if err != nil { return "", fmt.Errorf("failed to create temp dir: %w", err) } - defer os.RemoveAll(tmpDir) + defer func() { _ = os.RemoveAll(tmpDir) }() // Set restrictive permissions if err := os.Chmod(tmpDir, 0o700); err != nil { @@ -114,7 +114,7 @@ func gpgExtractPubKeyFingerprint(pubKey []byte) (string, error) { } // Import the public key into the temporary keyring - importCmd := exec.Command("gpg", + importCmd := exec.Command("gpg", "--batch", "--no-tty", "--homedir", tmpDir, "--import", pubKeyFile, @@ -126,7 +126,7 @@ func gpgExtractPubKeyFingerprint(pubKey []byte) (string, error) { } // List keys to get fingerprint - listCmd := exec.Command("gpg", + listCmd := exec.Command("gpg", "--batch", "--no-tty", "--homedir", tmpDir, "--with-colons", "--fingerprint", @@ -158,7 +158,7 @@ func gpgVerify(data, signature, pubKey []byte) error { if err != nil { return fmt.Errorf("failed to create temp dir: %w", err) } - defer os.RemoveAll(tmpDir) + defer func() { _ = os.RemoveAll(tmpDir) }() // Set restrictive 
permissions if err := os.Chmod(tmpDir, 0o700); err != nil { @@ -184,7 +184,7 @@ func gpgVerify(data, signature, pubKey []byte) error { } // Import the public key into the temporary keyring - importCmd := exec.Command("gpg", + importCmd := exec.Command("gpg", "--batch", "--no-tty", "--homedir", tmpDir, "--import", pubKeyFile, @@ -196,7 +196,7 @@ func gpgVerify(data, signature, pubKey []byte) error { } // Verify the signature - verifyCmd := exec.Command("gpg", + verifyCmd := exec.Command("gpg", "--batch", "--no-tty", "--homedir", tmpDir, "--verify", sigFile, diff --git a/mfer/gpg_test.go b/mfer/gpg_test.go index 97f72e2..badc8ca 100644 --- a/mfer/gpg_test.go +++ b/mfer/gpg_test.go @@ -34,15 +34,15 @@ func testGPGEnv(t *testing.T) (GPGKeyID, func()) { // Save original GNUPGHOME and set new one origGPGHome := os.Getenv("GNUPGHOME") - os.Setenv("GNUPGHOME", gpgHome) + require.NoError(t, os.Setenv("GNUPGHOME", gpgHome)) cleanup := func() { if origGPGHome == "" { - os.Unsetenv("GNUPGHOME") + _ = os.Unsetenv("GNUPGHOME") } else { - os.Setenv("GNUPGHOME", origGPGHome) + _ = os.Setenv("GNUPGHOME", origGPGHome) } - os.RemoveAll(gpgHome) + _ = os.RemoveAll(gpgHome) } // Generate a test key with no passphrase diff --git a/mfer/manifest.go b/mfer/manifest.go index 203c79c..bea4fa1 100644 --- a/mfer/manifest.go +++ b/mfer/manifest.go @@ -17,6 +17,7 @@ type manifest struct { pbOuter *MFFileOuter output *bytes.Buffer signingOptions *SigningOptions + fixedUUID []byte // if set, use this UUID instead of generating one } func (m *manifest) String() string { diff --git a/mfer/mf.pb.go b/mfer/mf.pb.go index 7c02e2d..41f53f9 100644 --- a/mfer/mf.pb.go +++ b/mfer/mf.pb.go @@ -1,7 +1,7 @@ // Code generated by protoc-gen-go. DO NOT EDIT. 
// versions: // protoc-gen-go v1.36.11 -// protoc v6.33.0 +// protoc v6.33.4 // source: mf.proto package mfer @@ -329,6 +329,9 @@ func (x *MFFileOuter) GetSigningPubKey() []byte { type MFFilePath struct { state protoimpl.MessageState `protogen:"open.v1"` // required attributes: + // Path invariants: must be valid UTF-8, use forward slashes only, + // be relative (no leading /), contain no ".." segments, and no + // empty segments (no "//"). Path string `protobuf:"bytes,1,opt,name=path,proto3" json:"path,omitempty"` Size int64 `protobuf:"varint,2,opt,name=size,proto3" json:"size,omitempty"` // gotta have at least one: @@ -337,7 +340,6 @@ type MFFilePath struct { MimeType *string `protobuf:"bytes,301,opt,name=mimeType,proto3,oneof" json:"mimeType,omitempty"` Mtime *Timestamp `protobuf:"bytes,302,opt,name=mtime,proto3,oneof" json:"mtime,omitempty"` Ctime *Timestamp `protobuf:"bytes,303,opt,name=ctime,proto3,oneof" json:"ctime,omitempty"` - Atime *Timestamp `protobuf:"bytes,304,opt,name=atime,proto3,oneof" json:"atime,omitempty"` unknownFields protoimpl.UnknownFields sizeCache protoimpl.SizeCache } @@ -414,13 +416,6 @@ func (x *MFFilePath) GetCtime() *Timestamp { return nil } -func (x *MFFilePath) GetAtime() *Timestamp { - if x != nil { - return x.Atime - } - return nil -} - type MFFileChecksum struct { state protoimpl.MessageState `protogen:"open.v1"` // 1.0 golang implementation must write a multihash here @@ -566,7 +561,7 @@ const file_mf_proto_rawDesc = "" + "\n" + "_signatureB\t\n" + "\a_signerB\x10\n" + - "\x0e_signingPubKey\"\xa2\x02\n" + + "\x0e_signingPubKey\"\xf0\x01\n" + "\n" + "MFFilePath\x12\x12\n" + "\x04path\x18\x01 \x01(\tR\x04path\x12\x12\n" + @@ -576,13 +571,10 @@ const file_mf_proto_rawDesc = "" + "\x05mtime\x18\xae\x02 \x01(\v2\n" + ".TimestampH\x01R\x05mtime\x88\x01\x01\x12&\n" + "\x05ctime\x18\xaf\x02 \x01(\v2\n" + - ".TimestampH\x02R\x05ctime\x88\x01\x01\x12&\n" + - "\x05atime\x18\xb0\x02 \x01(\v2\n" + - 
".TimestampH\x03R\x05atime\x88\x01\x01B\v\n" + + ".TimestampH\x02R\x05ctime\x88\x01\x01B\v\n" + "\t_mimeTypeB\b\n" + "\x06_mtimeB\b\n" + - "\x06_ctimeB\b\n" + - "\x06_atime\".\n" + + "\x06_ctime\".\n" + "\x0eMFFileChecksum\x12\x1c\n" + "\tmultiHash\x18\x01 \x01(\fR\tmultiHash\"\xd6\x01\n" + "\x06MFFile\x12)\n" + @@ -627,15 +619,14 @@ var file_mf_proto_depIdxs = []int32{ 6, // 2: MFFilePath.hashes:type_name -> MFFileChecksum 3, // 3: MFFilePath.mtime:type_name -> Timestamp 3, // 4: MFFilePath.ctime:type_name -> Timestamp - 3, // 5: MFFilePath.atime:type_name -> Timestamp - 2, // 6: MFFile.version:type_name -> MFFile.Version - 5, // 7: MFFile.files:type_name -> MFFilePath - 3, // 8: MFFile.createdAt:type_name -> Timestamp - 9, // [9:9] is the sub-list for method output_type - 9, // [9:9] is the sub-list for method input_type - 9, // [9:9] is the sub-list for extension type_name - 9, // [9:9] is the sub-list for extension extendee - 0, // [0:9] is the sub-list for field type_name + 2, // 5: MFFile.version:type_name -> MFFile.Version + 5, // 6: MFFile.files:type_name -> MFFilePath + 3, // 7: MFFile.createdAt:type_name -> Timestamp + 8, // [8:8] is the sub-list for method output_type + 8, // [8:8] is the sub-list for method input_type + 8, // [8:8] is the sub-list for extension type_name + 8, // [8:8] is the sub-list for extension extendee + 0, // [0:8] is the sub-list for field type_name } func init() { file_mf_proto_init() } diff --git a/mfer/mf.proto b/mfer/mf.proto index d8a5bac..951946f 100644 --- a/mfer/mf.proto +++ b/mfer/mf.proto @@ -46,6 +46,9 @@ message MFFileOuter { message MFFilePath { // required attributes: + // Path invariants: must be valid UTF-8, use forward slashes only, + // be relative (no leading /), contain no ".." segments, and no + // empty segments (no "//"). 
string path = 1; int64 size = 2; @@ -56,7 +59,6 @@ message MFFilePath { optional string mimeType = 301; optional Timestamp mtime = 302; optional Timestamp ctime = 303; - optional Timestamp atime = 304; } message MFFileChecksum { diff --git a/mfer/scanner.go b/mfer/scanner.go index df0df11..abf845d 100644 --- a/mfer/scanner.go +++ b/mfer/scanner.go @@ -43,10 +43,12 @@ type ScanStatus struct { // ScannerOptions configures scanner behavior. type ScannerOptions struct { - IncludeDotfiles bool // Include files and directories starting with a dot (default: exclude) - FollowSymLinks bool // Resolve symlinks instead of skipping them - Fs afero.Fs // Filesystem to use, defaults to OsFs if nil - SigningOptions *SigningOptions // GPG signing options (nil = no signing) + IncludeDotfiles bool // Include files and directories starting with a dot (default: exclude) + FollowSymLinks bool // Resolve symlinks instead of skipping them + IncludeTimestamps bool // Include createdAt timestamp in manifest (default: omit for determinism) + Fs afero.Fs // Filesystem to use, defaults to OsFs if nil + SigningOptions *SigningOptions // GPG signing options (nil = no signing) + Seed string // If set, derive a deterministic UUID from this seed } // FileEntry represents a file that has been enumerated. @@ -273,9 +275,15 @@ func (s *Scanner) ToManifest(ctx context.Context, w io.Writer, progress chan<- S s.mu.RUnlock() builder := NewBuilder() + if s.options.IncludeTimestamps { + builder.SetIncludeTimestamps(true) + } if s.options.SigningOptions != nil { builder.SetSigningOptions(s.options.SigningOptions) } + if s.options.Seed != "" { + builder.SetSeed(s.options.Seed) + } var scannedFiles FileCount var scannedBytes FileSize @@ -385,6 +393,9 @@ func (s *Scanner) ToManifest(ctx context.Context, w io.Writer, progress chan<- S // The path should use forward slashes. func IsHiddenPath(p string) bool { tp := path.Clean(p) + if tp == "." 
|| tp == "/" { + return false + } if strings.HasPrefix(tp, ".") { return true } diff --git a/mfer/scanner_test.go b/mfer/scanner_test.go index b6e6296..8db6357 100644 --- a/mfer/scanner_test.go +++ b/mfer/scanner_test.go @@ -352,6 +352,10 @@ func TestIsHiddenPath(t *testing.T) { {"/absolute/.hidden", true}, {"./relative", false}, // path.Clean removes leading ./ {"a/b/c/.d/e", true}, + {".", false}, // current directory is not hidden (#14) + {"/", false}, // root is not hidden + {"./", false}, // current directory with trailing slash + {"./file.txt", false}, // file in current directory } for _, tt := range tests { diff --git a/mfer/serialize.go b/mfer/serialize.go index 0a2898c..b60c1c0 100644 --- a/mfer/serialize.go +++ b/mfer/serialize.go @@ -16,11 +16,10 @@ import ( const MAGIC string = "ZNAVSRFG" func newTimestampFromTime(t time.Time) *Timestamp { - out := &Timestamp{ + return &Timestamp{ Seconds: t.Unix(), - Nanos: int32(t.UnixNano() - (t.Unix() * 1000000000)), + Nanos: int32(t.Nanosecond()), } - return out } func (m *manifest) generate() error { @@ -35,12 +34,12 @@ func (m *manifest) generate() error { } dat, err := proto.MarshalOptions{Deterministic: true}.Marshal(m.pbOuter) if err != nil { - return err + return fmt.Errorf("serialize: marshal outer: %w", err) } m.output = bytes.NewBuffer([]byte(MAGIC)) _, err = m.output.Write(dat) if err != nil { - return err + return fmt.Errorf("serialize: write output: %w", err) } return nil } @@ -50,24 +49,29 @@ func (m *manifest) generateOuter() error { return errors.New("internal error") } - // Generate UUID and set on inner message - manifestUUID := uuid.New() + // Use fixed UUID if provided, otherwise generate a new one + var manifestUUID uuid.UUID + if len(m.fixedUUID) == 16 { + copy(manifestUUID[:], m.fixedUUID) + } else { + manifestUUID = uuid.New() + } m.pbInner.Uuid = manifestUUID[:] innerData, err := proto.MarshalOptions{Deterministic: true}.Marshal(m.pbInner) if err != nil { - return err + return 
fmt.Errorf("serialize: marshal inner: %w", err) } // Compress the inner data idc := new(bytes.Buffer) zw, err := zstd.NewWriter(idc, zstd.WithEncoderLevel(zstd.SpeedBestCompression)) if err != nil { - return err + return fmt.Errorf("serialize: create compressor: %w", err) } _, err = zw.Write(innerData) if err != nil { - return err + return fmt.Errorf("serialize: compress: %w", err) } _ = zw.Close() @@ -76,7 +80,7 @@ func (m *manifest) generateOuter() error { // Hash the compressed data for integrity verification before decompression h := sha256.New() if _, err := h.Write(compressedData); err != nil { - return err + return fmt.Errorf("serialize: hash write: %w", err) } sha256Hash := h.Sum(nil) diff --git a/mfer/url.go b/mfer/url.go index fb1da96..274687e 100644 --- a/mfer/url.go +++ b/mfer/url.go @@ -27,8 +27,12 @@ func (b BaseURL) JoinPath(path RelFilePath) (FileURL, error) { base.Path += "/" } - // Parse and encode the relative path - ref, err := url.Parse(url.PathEscape(string(path))) + // Encode each path segment individually to preserve slashes + segments := strings.Split(string(path), "/") + for i, seg := range segments { + segments[i] = url.PathEscape(seg) + } + ref, err := url.Parse(strings.Join(segments, "/")) if err != nil { return "", err } diff --git a/mfer/url_test.go b/mfer/url_test.go new file mode 100644 index 0000000..dd36a4a --- /dev/null +++ b/mfer/url_test.go @@ -0,0 +1,44 @@ +package mfer + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestBaseURLJoinPath(t *testing.T) { + tests := []struct { + base BaseURL + path RelFilePath + expected string + }{ + {"https://example.com/dir/", "file.txt", "https://example.com/dir/file.txt"}, + {"https://example.com/dir", "file.txt", "https://example.com/dir/file.txt"}, + {"https://example.com/", "sub/file.txt", "https://example.com/sub/file.txt"}, + {"https://example.com/dir/", "file with spaces.txt", 
"https://example.com/dir/file%20with%20spaces.txt"}, + } + + for _, tt := range tests { + t.Run(string(tt.base)+"+"+string(tt.path), func(t *testing.T) { + result, err := tt.base.JoinPath(tt.path) + require.NoError(t, err) + assert.Equal(t, tt.expected, string(result)) + }) + } +} + +func TestBaseURLString(t *testing.T) { + b := BaseURL("https://example.com/") + assert.Equal(t, "https://example.com/", b.String()) +} + +func TestFileURLString(t *testing.T) { + f := FileURL("https://example.com/file.txt") + assert.Equal(t, "https://example.com/file.txt", f.String()) +} + +func TestManifestURLString(t *testing.T) { + m := ManifestURL("https://example.com/index.mf") + assert.Equal(t, "https://example.com/index.mf", m.String()) +} diff --git a/modcache.tzst b/modcache.tzst deleted file mode 100644 index 0be2836..0000000 Binary files a/modcache.tzst and /dev/null differ diff --git a/vendor.tzst b/vendor.tzst deleted file mode 100644 index 9f63bca..0000000 Binary files a/vendor.tzst and /dev/null differ