diff --git a/Dockerfile b/Dockerfile
index 79673cf..9a816e2 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,37 +1,19 @@
-################################################################################
-#2345678911234567892123456789312345678941234567895123456789612345678971234567898
-################################################################################
-FROM sneak/builder:2022-12-08 AS builder
-ENV DEBIAN_FRONTEND noninteractive
-WORKDIR /build
-COPY ./Makefile ./.golangci.yml ./go.mod ./go.sum /build/
-COPY ./vendor.tzst /build/vendor.tzst
-COPY ./modcache.tzst /build/modcache.tzst
-COPY ./internal ./internal
-COPY ./bin/gitrev.sh ./bin/gitrev.sh
-COPY ./mfer ./mfer
-COPY ./cmd ./cmd
-ARG GITREV unknown
-ARG DRONE_COMMIT_SHA unknown
+# golangci/golangci-lint:v2.0.2 (2026-03-14)
+FROM golangci/golangci-lint@sha256:d55581f7797e7a0877a7c3aaa399b01bdc57d2874d6412601a046cc4062cb62e AS lint-bin
+
+# golang:1.23 (2026-03-14)
+FROM golang@sha256:60deed95d3888cc5e4d9ff8a10c54e5edc008c6ae3fba6187be6fb592e19e8c0 AS builder
+COPY --from=lint-bin /usr/bin/golangci-lint /usr/local/bin/golangci-lint
+WORKDIR /src
+COPY go.mod go.sum ./
+RUN go mod download
+COPY . .
+# Touch .pb.go so make does not try to regenerate via protoc (file is committed)
+RUN touch mfer/mf.pb.go
+RUN make check
+# Build statically (CGO_ENABLED=0): the scratch final stage has no libc or dynamic loader
+RUN CGO_ENABLED=0 go build -tags urfave_cli_no_docs -o /mfer ./cmd/mfer
 
-RUN mkdir -p "$(go env GOMODCACHE)" && cd "$(go env GOMODCACHE)" && \
-  zstdmt -d --stdout /build/modcache.tzst | tar xf - && \
-  rm /build/modcache.tzst && cd /build
-RUN \
-  cd mfer && go generate . && cd .. && \
-  GOPACKAGESDEBUG=true golangci-lint run ./... && \
-  mkdir vendor && cd vendor && \
-  zstdmt -d --stdout /build/vendor.tzst | tar xf - && rm /build/vendor.tzst && \
-  cd .. && \
-  make mfer.cmd
-RUN rm -rf /build/vendor && go mod vendor && tar -c . | zstdmt -19 > /src.tzst
-################################################################################
-#2345678911234567892123456789312345678941234567895123456789612345678971234567898
-################################################################################
-## final image
-################################################################################
 FROM scratch
-# we put all the source into the final image for posterity, it's small
-COPY --from=builder /src.tzst /src.tzst
-COPY --from=builder /build/mfer.cmd /mfer
+COPY --from=builder /mfer /mfer
 ENTRYPOINT ["/mfer"]
diff --git a/Makefile b/Makefile
index e27258f..89ad942 100644
--- a/Makefile
+++ b/Makefile
@@ -5,7 +5,7 @@ export PATH := $(PATH):$(GOPATH)/bin
 PROTOC_GEN_GO := $(GOPATH)/bin/protoc-gen-go
 SOURCEFILES := mfer/*.go mfer/*.proto internal/*/*.go cmd/*/*.go go.mod go.sum
 ARCH := $(shell uname -m)
-GITREV_BUILD := $(shell bash $(PWD)/bin/gitrev.sh)
+GITREV_BUILD := $(shell bash $(PWD)/bin/gitrev.sh 2>/dev/null || echo unknown)
 APPNAME := mfer
 VERSION := 0.1.0
 export DOCKER_IMAGE_CACHE_DIR := $(HOME)/Library/Caches/Docker/$(APPNAME)-$(ARCH)
@@ -13,7 +13,7 @@ GOLDFLAGS += -X main.Version=$(VERSION)
 GOLDFLAGS += -X main.Gitrev=$(GITREV_BUILD)
 GOFLAGS := -ldflags "$(GOLDFLAGS)"
 
-.PHONY: docker default run ci test fixme
+.PHONY: docker default run ci test check lint fmt fmt-check hooks fixme
 
 default: fmt test
 
@@ -32,8 +32,21 @@ $(PROTOC_GEN_GO):
 fixme:
 	@grep -nir fixme . | grep -v Makefile
 
+# check: run the tests and the formatting check (the Docker build runs this target)
+check: test fmt-check
+
+# fmt-check: fail if gofmt lists any file whose formatting differs
+fmt-check: mfer/mf.pb.go
+	sh -c 'test -z "$$(gofmt -l .)"'
+
+# hooks: install a git pre-commit hook that runs `make check`
+# printf is used because echo's handling of "\n" differs between shells
+hooks:
+	printf '#!/bin/sh\nmake check\n' > .git/hooks/pre-commit
+	chmod +x .git/hooks/pre-commit
+
 devprereqs:
-	which golangci-lint || go install -v github.com/golangci/golangci-lint/cmd/golangci-lint@latest
+	which golangci-lint || go install -v github.com/golangci/golangci-lint/v2/cmd/golangci-lint@v2.0.2
 
 mfer/mf.pb.go: mfer/mf.proto
 	cd mfer && go generate .
diff --git a/README.md b/README.md index 37c5cc2..40963c6 100644 --- a/README.md +++ b/README.md @@ -3,84 +3,25 @@ [mfer](https://git.eeqj.de/sneak/mfer) is a reference implementation library and thin wrapper command-line utility written in [Go](https://golang.org) and first published in 2022 under the [WTFPL](https://wtfpl.net) (public -domain) license. It specifies and generates `.mf` manifest files over a +domain) license. It specifies and generates `.mf` manifest files over a directory tree of files to encapsulate metadata about them (such as cryptographic checksums or signatures over same) to aid in archiving, -downloading, and streaming, or mirroring. The manifest files' data is +downloading, and streaming, or mirroring. The manifest files' data is serialized with Google's [protobuf serialization -format](https://developers.google.com/protocol-buffers). The structure of -these files can be found in the [format specification](FORMAT.md) and the -[protobuf schema](mfer/mf.proto), both included in the [project +format](https://developers.google.com/protocol-buffers). The structure of +these files can be found [in the format +specification](https://git.eeqj.de/sneak/mfer/src/branch/main/mfer/mf.proto) +which is included in the [project repository](https://git.eeqj.de/sneak/mfer). The current version is pre-1.0 and while the repo was published in 2022, -there has not yet been any versioned release. [SemVer](https://semver.org) +there has not yet been any versioned release. [SemVer](https://semver.org) will be used for releases. This project was started by [@sneak](https://sneak.berlin) to scratch an itch in 2022 and is currently a one-person effort, though the goal is for this to emerge as a de-facto standard and be incorporated into other -software. A compatible javascript library is planned. 
- -# Phases - -Manifest generation happens in two distinct phases: - -## Phase 1: Enumeration - -Walking directories and calling `stat()` on files to collect metadata (path, size, mtime, ctime). This builds the list of files to be scanned. Relatively fast as it only reads filesystem metadata, not file contents. - -**Progress:** `EnumerateStatus` with `FilesFound` and `BytesFound` - -## Phase 2: Scan (ToManifest) - -Reading file contents and computing cryptographic hashes for manifest generation. This is the expensive phase that reads all file data from disk. - -**Progress:** `ScanStatus` with `TotalFiles`, `ScannedFiles`, `TotalBytes`, `ScannedBytes`, `BytesPerSec` - -# Code Conventions - -- **Logging:** Never use `fmt.Printf` or write to stdout/stderr directly in normal code. Use the `internal/log` package for all output (`log.Info`, `log.Infof`, `log.Debug`, `log.Debugf`, `log.Progressf`, `log.ProgressDone`). -- **Filesystem abstraction:** Use `github.com/spf13/afero` for filesystem operations to enable testing and flexibility. -- **CLI framework:** Use `github.com/urfave/cli/v2` for command-line interface. -- **Serialization:** Use Protocol Buffers for manifest file format. -- **Internal packages:** Non-exported implementation details go in `internal/` subdirectories. -- **Concurrency:** Use `sync.RWMutex` for protecting shared state; prefer channels for progress reporting. -- **Progress channels:** Use buffered channels (size 1) with non-blocking sends to avoid blocking the main operation if the consumer is slow. -- **Context support:** Long-running operations should accept `context.Context` for cancellation. -- **NO_COLOR:** Respect the `NO_COLOR` environment variable for disabling colored output. -- **Options pattern:** Use `NewWithOptions(opts *Options)` constructor pattern for configurable types. 
- -# Building - -## Prerequisites - -- Go 1.21 or later -- `protoc` (Protocol Buffers compiler) — only needed if modifying `.proto` files -- `golangci-lint` — for linting (`go install github.com/golangci/golangci-lint/cmd/golangci-lint@latest`) -- `gofumpt` — for formatting (`go install mvdan.cc/gofumpt@latest`) - -## Build - -```sh -# Build the binary -make bin/mfer - -# Run tests -make test - -# Format code -make fmt - -# Lint -make lint -``` - -## Install from source - -```sh -go install sneak.berlin/go/mfer/cmd/mfer@latest -``` +software. A compatible javascript library is planned. # Build Status @@ -89,18 +30,19 @@ go install sneak.berlin/go/mfer/cmd/mfer@latest # Participation The community is as yet nonexistent so there are no defined policies or -norms yet. Primary development happens on a privately-run Gitea instance at +norms yet. Primary development happens on a privately-run Gitea instance at [https://git.eeqj.de/sneak/mfer](https://git.eeqj.de/sneak/mfer) and issues are [tracked there](https://git.eeqj.de/sneak/mfer/issues). Changes must always be formatted with a standard `go fmt`, syntactically valid, and must pass the linting defined in the repository (presently only -the `golangci-lint` defaults), which can be run with a `make lint`. The +the `golangci-lint` defaults), which can be run with a `make lint`. The `main` branch is protected and all changes must be made via [pull requests](https://git.eeqj.de/sneak/mfer/pulls) and pass CI to be merged. Any changes submitted to this project must also be [WTFPL-licensed](https://wtfpl.net) to be considered. 
+ # Problem Statement Given a plain URL, there is no standard way to safely and programmatically @@ -178,10 +120,6 @@ The manifest file would do several important things: - metadata size should not be used as an excuse to sacrifice utility (such as providing checksums over each chunk of a large file) -# Limitations - -- **Manifest size:** Manifests must fit entirely in system memory during reading and writing. - # Open Questions - Should the manifest file include checksums of individual file chunks, or just for the whole assembled file? @@ -269,15 +207,24 @@ regardless of filesystem format. Please email [`sneak@sneak.berlin`](mailto:sneak@sneak.berlin) with your desired username for an account on this Gitea instance. +# See Also + +## Prior Art: Metalink + +* [Metalink - Mozilla Wiki](https://wiki.mozilla.org/Metalink) +* [Metalink - Wikipedia](https://en.wikipedia.org/wiki/Metalink) +* [RFC 5854 - The Metalink Download Description Format](https://datatracker.ietf.org/doc/html/rfc5854) +* [RFC 6249 - Metalink/HTTP: Mirrors and Hashes](https://www.rfc-editor.org/rfc/rfc6249.html) + ## Links -- Repo: [https://git.eeqj.de/sneak/mfer](https://git.eeqj.de/sneak/mfer) -- Issues: [https://git.eeqj.de/sneak/mfer/issues](https://git.eeqj.de/sneak/mfer/issues) +* Repo: [https://git.eeqj.de/sneak/mfer](https://git.eeqj.de/sneak/mfer) +* Issues: [https://git.eeqj.de/sneak/mfer/issues](https://git.eeqj.de/sneak/mfer/issues) # Authors -- [@sneak <sneak@sneak.berlin>](mailto:sneak@sneak.berlin) +* [@sneak <sneak@sneak.berlin>](mailto:sneak@sneak.berlin) # License -- [WTFPL](https://wtfpl.net) +* [WTFPL](https://wtfpl.net) diff --git a/TODO.md b/TODO.md index b03d1b7..6c4cd3e 100644 --- a/TODO.md +++ b/TODO.md @@ -9,76 +9,76 @@ **1. Should `MFFileChecksum` be simplified?** Currently it's a separate message wrapping a single `bytes multiHash` field. 
Since multihash already self-describes the algorithm, `repeated bytes hashes` directly on `MFFilePath` would be simpler and reduce per-file protobuf overhead. Is the extra message layer intentional (e.g. planning to add per-hash metadata like `verified_at`)? -> *answer:* Leave as-is for now. +> *answer:* **2. Should file permissions/mode be stored?** The format stores mtime/ctime but not Unix file permissions. For archival use (ExFAT, filesystem-independent checksums) this may not matter, but for software distribution or filesystem restoration it's a gap. Should we reserve a field now (e.g. `optional uint32 mode = 305`) even if we don't populate it yet? -> *answer:* No, not right now. +> *answer:* **3. Should `atime` be removed from the schema?** Access time is volatile, non-deterministic, and often disabled (`noatime`). Including it means two manifests of the same directory at different times will differ, which conflicts with the determinism goal. Remove it, or document it as "never set by default"? -> *answer:* REMOVED — done. Field 304 has been removed from the proto schema. +> *answer:* **4. What are the path normalization rules?** The proto has `string path` with no specification about: always forward-slash? Must be relative? No `..` components allowed? UTF-8 NFC vs NFD normalization (macOS vs Linux)? Max path length? This is a security issue (path traversal) and a cross-platform compatibility issue. What rules should the spec mandate? -> *answer:* Implemented — UTF-8, forward-slash only, relative paths only, no `..` segments. Documented in FORMAT.md. +> *answer:* **5. Should we add a version byte after the magic?** Currently `ZNAVSRFG` is followed immediately by protobuf. Adding a version byte (`ZNAVSRFG\x01`) would allow future framing changes without requiring protobuf parsing to detect the version. `MFFileOuter.Version` serves this purpose but requires successful deserialization to read. Worth the extra byte? 
-> *answer:* No — protobuf handles versioning via the `MFFileOuter.Version` field. +> *answer:* **6. Should we add a length-prefix after the magic?** Protobuf is not self-delimiting. If we ever want to concatenate manifests or append data after the protobuf, the current framing is insufficient. Add a varint or fixed-width length-prefix? -> *answer:* Not needed now. +> *answer:* ### Signature Design **7. What does the outer SHA-256 hash cover — compressed or uncompressed data?** The review notes it currently hashes compressed data (good for verifying before decompression), but this should be explicitly documented. Which is the intended behavior? -> *answer:* Hash covers compressed data. Documented in FORMAT.md. +> *answer:* **8. Should `signatureString()` sign raw bytes instead of a hex-encoded string?** Currently the canonical string is `MAGIC-UUID-MULTIHASH` with hex encoding, which adds a transformation layer. Signing the raw `sha256` bytes (or compressed `innerMessage` directly) would be simpler. Keep the string format or switch to raw bytes? -> *answer:* Keep string format as-is (established). +> *answer:* **9. Should we support detached signature files (`.mf.sig`)?** Embedded signatures are better for single-file distribution. Detached `.mf.sig` files follow the familiar `SHASUMS`/`SHASUMS.asc` pattern and are simpler for HTTP serving. Support both modes? -> *answer:* Not for 1.0. +> *answer:* **10. GPG vs pure-Go crypto for signatures?** Shelling out to `gpg` is fragile (may not be installed, version-dependent output). `github.com/ProtonMail/go-crypto` provides pure-Go OpenPGP, or we could go Ed25519/signify (simpler, no key management). Which direction? -> *answer:* Keep GPG shelling for now (established). +> *answer:* ### Implementation Design **11. Should manifests be deterministic by default?** This means: sort file entries by path, omit `createdAt` timestamp (or make it opt-in), no `atime`. 
Should determinism be the default, with a `--include-timestamps` flag to opt in? -> *answer:* YES — implemented, default behavior. +> *answer:* **12. Should we consolidate or keep both scanner/checker implementations?** There are two parallel implementations: `mfer/scanner.go` + `mfer/checker.go` (typed with `FileSize`, `RelFilePath`) and `internal/scanner/` + `internal/checker/` (raw `int64`, `string`). The `mfer/` versions are superior. Delete the `internal/` versions? -> *answer:* Consolidated — done (PR#27). +> *answer:* **13. Should the `manifest` type be exported?** Currently unexported with exported constructors (`New`, `NewFromPaths`, etc.). Consumers can't declare `var m *mfer.manifest`. Export the type, or define an interface? -> *answer:* Keep unexported. +> *answer:* **14. What should the Go module path be for 1.0?** Currently mixed between `sneak.berlin/go/mfer` and `git.eeqj.de/sneak/mfer`. Which is canonical? -> *answer:* `sneak.berlin/go/mfer` +> *answer:* --- @@ -86,19 +86,19 @@ Currently mixed between `sneak.berlin/go/mfer` and `git.eeqj.de/sneak/mfer`. 
Whi ### Phase 1: Foundation (format correctness) -- [x] Delete `internal/scanner/` and `internal/checker/` — consolidate on `mfer/` package versions; update CLI code -- [x] Add deterministic file ordering — sort entries by path (lexicographic, byte-order) in `Builder.Build()`; add test asserting byte-identical output from two runs -- [x] Add decompression size limit — `io.LimitReader` in `deserializeInner()` with `m.pbOuter.Size` as bound +- [ ] Delete `internal/scanner/` and `internal/checker/` — consolidate on `mfer/` package versions; update CLI code +- [ ] Add deterministic file ordering — sort entries by path (lexicographic, byte-order) in `Builder.Build()`; add test asserting byte-identical output from two runs +- [ ] Add decompression size limit — `io.LimitReader` in `deserializeInner()` with `m.pbOuter.Size` as bound - [ ] Fix `errors.Is` dead code in checker — replace with `os.IsNotExist(err)` or `errors.Is(err, fs.ErrNotExist)` - [ ] Fix `AddFile` to verify size — check `totalRead == size` after reading, return error on mismatch -- [x] Specify path invariants — add proto comments (UTF-8, forward-slash, relative, no `..`, no leading `/`); validate in `Builder.AddFile` and `Builder.AddFileWithHash` +- [ ] Specify path invariants — add proto comments (UTF-8, forward-slash, relative, no `..`, no leading `/`); validate in `Builder.AddFile` and `Builder.AddFileWithHash` ### Phase 2: CLI polish - [ ] Fix flag naming — all CLI flags use kebab-case as primary (`--include-dotfiles`, `--follow-symlinks`) - [ ] Fix URL construction in fetch — use `BaseURL.JoinPath()` or `url.JoinPath()` instead of string concatenation - [ ] Add progress rate-limiting to Checker — throttle to once per second, matching Scanner -- [x] Add `--deterministic` flag (or make it default) — omit `createdAt`, sort files +- [ ] Add `--deterministic` flag (or make it default) — omit `createdAt`, sort files ### Phase 3: Robustness @@ -109,10 +109,10 @@ Currently mixed between 
`sneak.berlin/go/mfer` and `git.eeqj.de/sneak/mfer`. Whi ### Phase 4: Format finalization -- [x] Remove or deprecate `atime` from proto (pending design question answer) +- [ ] Remove or deprecate `atime` from proto (pending design question answer) - [ ] Reserve `optional uint32 mode = 305` in `MFFilePath` for future file permissions - [ ] Add version byte after magic — `ZNAVSRFG\x01` for format version 1 -- [x] Write format specification document — separate from README: magic, outer structure, compression, inner structure, path invariants, signature scheme, canonical serialization +- [ ] Write format specification document — separate from README: magic, outer structure, compression, inner structure, path invariants, signature scheme, canonical serialization ### Phase 5: Release prep