From c8e7971445862c4b638ae10ab71ee278907c9a0a Mon Sep 17 00:00:00 2001
From: sneak
Date: Tue, 9 Jun 2026 17:35:35 -0400
Subject: [PATCH] Add --ml and --exif flags to backup-metadata

--ml fetches face detections and CLIP embeddings from the
/files/data/fetch endpoint (type 'mldata'). Each blob is gzipped JSON
encrypted with the file's key; we decrypt it with decryptBlob, gunzip
it, and include the parsed JSON as 'mlData' in the per-file output.
Blobs are fetched in batches of 200 file IDs.

--exif downloads each file, runs sharp().metadata() to extract image
properties (format, dimensions, color space, orientation), then parses
the raw EXIF buffer with exif-reader for structured tags (lens, ISO,
shutter, aperture, GPS altitude, etc.). It also captures the IPTC and
ICC profile data as base64 and the XMP packet as text. The result is
included as 'imageMetadata' in the per-file output.

Without either flag, behavior is unchanged (fast metadata-only dump).

Adds exif-reader 2.0.3 as a runtime dependency. 3 new tests (ML data
decrypted, ML data absent when flag not set, EXIF extraction). 119 total
tests, all green.
--- bin/quak.ts | 18 ++- package.json | 1 + src/metadata-backup.ts | 183 ++++++++++++++++++++++++++++--- test/cli/metadata-backup.test.ts | 169 ++++++++++++++++++++++++++++ yarn.lock | 9 +- 5 files changed, 357 insertions(+), 23 deletions(-) diff --git a/bin/quak.ts b/bin/quak.ts index f5fcee1..4ac077a 100644 --- a/bin/quak.ts +++ b/bin/quak.ts @@ -337,13 +337,25 @@ program program .command("backup-metadata") .description( - "Dump all decrypted account metadata (no file content) to a directory", + "Dump all decrypted account metadata to a directory of JSON files", ) .argument("", "Output directory") - .action(async (dir: string) => { + .option( + "--ml", + "Include ML data (face detections, CLIP embeddings) from the Ente server", + ) + .option( + "--exif", + "Download each file and extract full EXIF/IPTC/XMP metadata (slow)", + ) + .action(async (dir: string, opts: { ml?: boolean; exif?: boolean }) => { await init(); const client = requireSession(); - await runMetadataBackup(client, dir, (msg) => stderr.write(msg + "\n")); + await runMetadataBackup(client, dir, { + mlData: opts.ml, + exif: opts.exif, + onProgress: (msg) => stderr.write(msg + "\n"), + }); }); program diff --git a/package.json b/package.json index 836f04d..7d2b90d 100644 --- a/package.json +++ b/package.json @@ -41,6 +41,7 @@ "dependencies": { "commander": "14.0.3", "env-paths": "4.0.0", + "exif-reader": "2.0.3", "fast-srp-hap": "2.0.4", "libsodium-wrappers-sumo": "0.8.4", "sharp": "0.34.5" diff --git a/src/metadata-backup.ts b/src/metadata-backup.ts index 2b622ca..26034cd 100644 --- a/src/metadata-backup.ts +++ b/src/metadata-backup.ts @@ -1,18 +1,127 @@ -import { mkdirSync, writeFileSync } from "node:fs"; +import { gunzipSync } from "node:zlib"; +import { mkdirSync, mkdtempSync, rmSync, writeFileSync } from "node:fs"; import { join } from "node:path"; +import { tmpdir } from "node:os"; +import sharp from "sharp"; +import exifReader from "exif-reader"; import type { Client } from "./client.js"; 
+import { decryptBlob, fromBase64 } from "./crypto/index.js"; +import type { EnteFile } from "./model/types.js"; export type ProgressCallback = (message: string) => void; +export interface MetadataBackupOptions { + mlData?: boolean; + exif?: boolean; + onProgress?: ProgressCallback; +} + const sanitizePath = (name: string): string => name.replace(/[/\\:*?"<>|]/g, "_").replace(/^\.+/, "_"); +interface RawRemoteFileData { + fileID: number; + encryptedData: string; + decryptionHeader: string; + updatedAt?: number; +} + +const fetchMLDataForFiles = async ( + client: Client, + fileIDs: number[], + fileKeys: Map, +): Promise>> => { + const api = client.getApiClient(); + const result = new Map>(); + const batchSize = 200; + + for (let i = 0; i < fileIDs.length; i += batchSize) { + const batch = fileIDs.slice(i, i + batchSize); + const { data } = await api.postJSON<{ data: RawRemoteFileData[] }>( + "/files/data/fetch", + { type: "mldata", fileIDs: batch }, + ); + + for (const entry of data ?? []) { + const key = fileKeys.get(entry.fileID); + if (!key) continue; + try { + const decrypted = decryptBlob( + fromBase64(entry.encryptedData), + fromBase64(entry.decryptionHeader), + key, + ); + const jsonStr = gunzipSync(Buffer.from(decrypted)).toString( + "utf-8", + ); + result.set(entry.fileID, JSON.parse(jsonStr)); + } catch { + // Corrupted ML data for this file; skip it + } + } + } + return result; +}; + +const extractExif = async ( + client: Client, + file: EnteFile, +): Promise | undefined> => { + const tmpDir = mkdtempSync(join(tmpdir(), "quak-exif-")); + try { + const origPath = join(tmpDir, "original"); + await client.downloadFile(file, origPath); + const meta = await sharp(origPath).metadata(); + + const result: Record = { + format: meta.format, + width: meta.width, + height: meta.height, + space: meta.space, + channels: meta.channels, + depth: meta.depth, + density: meta.density, + chromaSubSampling: meta.chromaSubSampling, + isProgressive: meta.isProgressive, + 
hasProfile: meta.hasProfile, + hasAlpha: meta.hasAlpha, + orientation: meta.orientation, + }; + + if (meta.exif) { + try { + result.exif = exifReader(meta.exif); + } catch { + // Malformed EXIF; store the raw bytes as base64 instead + result.exifRaw = meta.exif.toString("base64"); + } + } + if (meta.iptc) result.iptcRaw = meta.iptc.toString("base64"); + if (meta.xmp) { + try { + result.xmp = Buffer.from(meta.xmp).toString("utf-8"); + } catch { + result.xmpRaw = meta.xmp.toString("base64"); + } + } + if (meta.icc) result.iccRaw = meta.icc.toString("base64"); + + return result; + } catch { + return undefined; + } finally { + rmSync(tmpDir, { recursive: true, force: true }); + } +}; + export const runMetadataBackup = async ( client: Client, outDir: string, - onProgress?: ProgressCallback, + opts?: MetadataBackupOptions, ): Promise => { - const log = onProgress ?? (() => {}); + const log = opts?.onProgress ?? (() => {}); + const wantML = opts?.mlData ?? false; + const wantExif = opts?.exif ?? 
false; mkdirSync(outDir, { recursive: true }); mkdirSync(join(outDir, "collections"), { recursive: true }); @@ -26,6 +135,10 @@ export const runMetadataBackup = async ( log("Fetching collections..."); const collections = await client.listCollections(); + const allFiles: { file: EnteFile; colDirName: string }[] = []; + const fileKeys = new Map(); + const seenFileIDs = new Set(); + for (const col of collections) { const dirName = `${col.id}-${sanitizePath(col.name || "unnamed")}`; const colDir = join(outDir, "collections", dirName); @@ -55,23 +168,57 @@ export const runMetadataBackup = async ( log(`[${col.name}] ${files.length} file(s)`); for (const file of files) { - const fileMeta: Record = { - id: file.id, - collectionID: file.collectionID, - ownerID: file.ownerID, - metadata: file.metadata, - updationTime: file.updationTime, - }; - if (file.magicMetadata) fileMeta.magicMetadata = file.magicMetadata; - if (file.pubMagicMetadata) - fileMeta.pubMagicMetadata = file.pubMagicMetadata; - - writeFileSync( - join(colDir, `${file.id}.json`), - JSON.stringify(fileMeta, null, 2), - ); + allFiles.push({ file, colDirName: dirName }); + if (!seenFileIDs.has(file.id)) { + fileKeys.set(file.id, file.key); + seenFileIDs.add(file.id); + } } } + // Fetch ML data in bulk if requested + let mlDataMap = new Map>(); + if (wantML) { + log("Fetching ML data (face detections, CLIP embeddings)..."); + mlDataMap = await fetchMLDataForFiles( + client, + [...fileKeys.keys()], + fileKeys, + ); + log(`Got ML data for ${mlDataMap.size} file(s)`); + } + + // Write per-file JSON (with optional ML data and EXIF) + const writtenFileIDs = new Set(); + for (const { file, colDirName } of allFiles) { + const colDir = join(outDir, "collections", colDirName); + + const fileMeta: Record = { + id: file.id, + collectionID: file.collectionID, + ownerID: file.ownerID, + metadata: file.metadata, + updationTime: file.updationTime, + }; + if (file.magicMetadata) fileMeta.magicMetadata = file.magicMetadata; + if 
(file.pubMagicMetadata) + fileMeta.pubMagicMetadata = file.pubMagicMetadata; + + const ml = mlDataMap.get(file.id); + if (ml) fileMeta.mlData = ml; + + if (wantExif && !writtenFileIDs.has(file.id)) { + log(`[${file.metadata.title}] Extracting EXIF...`); + const exifData = await extractExif(client, file); + if (exifData) fileMeta.imageMetadata = exifData; + } + writtenFileIDs.add(file.id); + + writeFileSync( + join(colDir, `${file.id}.json`), + JSON.stringify(fileMeta, null, 2), + ); + } + log("Metadata backup complete."); }; diff --git a/test/cli/metadata-backup.test.ts b/test/cli/metadata-backup.test.ts index 0d86b3e..c6aae7a 100644 --- a/test/cli/metadata-backup.test.ts +++ b/test/cli/metadata-backup.test.ts @@ -22,6 +22,7 @@ * the output tree is correct and complete. */ +import { gzipSync } from "node:zlib"; import { existsSync, mkdtempSync, @@ -39,7 +40,9 @@ import { toBase64, deriveKEK, deriveLoginSubkey, + encryptBlob, } from "../../src/crypto/index.js"; +import sharp from "sharp"; import { Client } from "../../src/client.js"; import { runMetadataBackup } from "../../src/metadata-backup.js"; import type { KeyAttributes } from "../../src/auth/types.js"; @@ -56,6 +59,12 @@ interface MetaMockState { encryptedToken: string; collections: Record[]; filesByCollection: Record[]>; + // For ML data and EXIF tests + encryptedMLData: Record< + number, + { encryptedData: string; decryptionHeader: string } + >; + fileCiphertexts: Record; } let mock: MetaMockState; @@ -245,6 +254,58 @@ const buildMetaMock = async (): Promise => { updationTime: 1710000000000000, }; + // Encrypt ML data for file 100 (gzipped JSON, encrypted with file key) + const mlPayload = JSON.stringify({ + face: { + version: 1, + client: "test", + width: 3000, + height: 2000, + faces: [ + { + faceID: "face-abc", + detection: { + box: { x: 0.1, y: 0.2, width: 0.3, height: 0.4 }, + landmarks: [ + { x: 0.15, y: 0.25 }, + { x: 0.25, y: 0.25 }, + ], + }, + score: 0.98, + blur: 12.5, + embedding: [0.1, 0.2, 
0.3], + }, + ], + }, + clip: { + version: 1, + client: "test", + embedding: [0.5, 0.6, 0.7], + }, + }); + const gzipped = gzipSync(Buffer.from(mlPayload)); + const { header: mlHeader, ciphertext: mlCiphertext } = encryptBlob( + new Uint8Array(gzipped), + fk1, + ); + + // Generate a real JPEG for EXIF extraction tests + const tinyJpeg = await sharp({ + create: { width: 100, height: 80, channels: 3, background: "red" }, + }) + .jpeg({ quality: 80 }) + .toBuffer(); + const filePush1 = + sodium.crypto_secretstream_xchacha20poly1305_init_push(fk1); + const encFileBody1 = sodium.crypto_secretstream_xchacha20poly1305_push( + filePush1.state, + new Uint8Array(tinyJpeg), + null, + sodium.crypto_secretstream_xchacha20poly1305_TAG_FINAL, + ); + // Patch rawFile1's file.decryptionHeader to match the push header + rawFile1.file.decryptionHeader = toBase64(filePush1.header); + return { verifier, srpAttributes: { @@ -259,6 +320,13 @@ const buildMetaMock = async (): Promise => { encryptedToken: toBase64(encToken), collections: [rawColl1, rawColl2], filesByCollection: { 10: [rawFile1], 20: [rawFile2] }, + encryptedMLData: { + 100: { + encryptedData: toBase64(mlCiphertext), + decryptionHeader: toBase64(mlHeader), + }, + }, + fileCiphertexts: { 100: encFileBody1 }, }; }; @@ -316,6 +384,29 @@ const buildMetaFetch = (m: MetaMockState) => { hasMore: false, }); } + if (path === "/files/data/fetch") { + const body = JSON.parse(init?.body as string); + const data = (body.fileIDs as number[]) + .filter((id: number) => m.encryptedMLData[id]) + .map((id: number) => ({ + fileID: id, + ...m.encryptedMLData[id], + updatedAt: 1700000000000000, + })); + return json({ data }); + } + if ( + url.includes("files.ente.io") || + path.startsWith("/files/download/") + ) { + const parsed = new URL(url); + const fileID = Number( + parsed.searchParams.get("fileID") ?? 
path.split("/").pop(), + ); + const ct = m.fileCiphertexts[fileID]; + if (ct) return new Response(ct, { status: 200 }); + return new Response("not found", { status: 404 }); + } return new Response("not found", { status: 404 }); }) as typeof globalThis.fetch; }; @@ -465,4 +556,82 @@ describe("quak backup-metadata", () => { ); expect(account.email).toBe(TEST_EMAIL); }); + + it("fetches and decrypts ML data when --ml is set", async () => { + const outDir = join(testDir, "ml-data"); + const client = await Client.login({ + email: TEST_EMAIL, + password: TEST_PASSWORD, + apiOptions: { fetch: buildMetaFetch(mock) }, + }); + + await runMetadataBackup(client, outDir, { mlData: true }); + + const collDirs = readdirSync(join(outDir, "collections")); + const vacDir = collDirs.find((d) => d.includes("Vacation"))!; + const fileMeta = JSON.parse( + readFileSync( + join(outDir, "collections", vacDir, "100.json"), + "utf-8", + ), + ); + + // ML data should be present and decrypted + expect(fileMeta.mlData).toBeDefined(); + expect(fileMeta.mlData.face).toBeDefined(); + expect(fileMeta.mlData.face.faces.length).toBe(1); + expect(fileMeta.mlData.face.faces[0].faceID).toBe("face-abc"); + expect(fileMeta.mlData.face.faces[0].score).toBeCloseTo(0.98); + expect(fileMeta.mlData.face.faces[0].detection.box.x).toBeCloseTo(0.1); + expect(fileMeta.mlData.clip).toBeDefined(); + expect(fileMeta.mlData.clip.embedding).toEqual([0.5, 0.6, 0.7]); + }); + + it("does not include ML data when --ml is not set", async () => { + const outDir = join(testDir, "no-ml"); + const client = await Client.login({ + email: TEST_EMAIL, + password: TEST_PASSWORD, + apiOptions: { fetch: buildMetaFetch(mock) }, + }); + + await runMetadataBackup(client, outDir); + + const collDirs = readdirSync(join(outDir, "collections")); + const vacDir = collDirs.find((d) => d.includes("Vacation"))!; + const fileMeta = JSON.parse( + readFileSync( + join(outDir, "collections", vacDir, "100.json"), + "utf-8", + ), + ); + 
expect(fileMeta.mlData).toBeUndefined(); + }); + + it("extracts EXIF from downloaded files when --exif is set", async () => { + const outDir = join(testDir, "exif-data"); + const client = await Client.login({ + email: TEST_EMAIL, + password: TEST_PASSWORD, + apiOptions: { fetch: buildMetaFetch(mock) }, + }); + + await runMetadataBackup(client, outDir, { exif: true }); + + const collDirs = readdirSync(join(outDir, "collections")); + const vacDir = collDirs.find((d) => d.includes("Vacation"))!; + const fileMeta = JSON.parse( + readFileSync( + join(outDir, "collections", vacDir, "100.json"), + "utf-8", + ), + ); + + // imageMetadata from sharp should be present + expect(fileMeta.imageMetadata).toBeDefined(); + expect(fileMeta.imageMetadata.format).toBe("jpeg"); + expect(fileMeta.imageMetadata.width).toBe(100); + expect(fileMeta.imageMetadata.height).toBe(80); + expect(fileMeta.imageMetadata.channels).toBe(3); + }); }); diff --git a/yarn.lock b/yarn.lock index f16f660..cb84981 100644 --- a/yarn.lock +++ b/yarn.lock @@ -557,7 +557,7 @@ dependencies: undici-types "~6.21.0" -"@types/sharp@^0.32.0": +"@types/sharp@0.32.0": version "0.32.0" resolved "https://registry.yarnpkg.com/@types/sharp/-/sharp-0.32.0.tgz#fc3ac6df6b456319bae807c3d24efdc6631cdd6f" integrity sha512-OOi3kL+FZDnPhVzsfD37J88FNeZh6gQsGcLc95NbeURRGvmSjeXiDcyWzF2o3yh/gQAUn2uhh/e+CPCa5nwAxw== @@ -1026,6 +1026,11 @@ esutils@^2.0.2: resolved "https://registry.yarnpkg.com/esutils/-/esutils-2.0.3.tgz#74d2eb4de0b8da1293711910d50775b9b710ef64" integrity sha512-kVscqXk4OCp68SZ0dkgEKVi6/8ij300KBWTJq32P/dYeWTSwK41WyTxalN1eRmA5Z9UU/LX9D7FWSmV9SAYx6g== +exif-reader@^2.0.3: + version "2.0.3" + resolved "https://registry.yarnpkg.com/exif-reader/-/exif-reader-2.0.3.tgz#259997735080bc6bb959c37b32c60f004ec4391d" + integrity sha512-zFbQvguwT9JkqyYhR7pjE1Yn8SagwaGLNRU0Oh14xFa1paSf5Gzxn4gxgk0XhnudI0UIqU+HgnBX93+nva592A== + expect-type@^1.1.0: version "1.3.0" resolved 
"https://registry.yarnpkg.com/expect-type/-/expect-type-1.3.0.tgz#0d58ed361877a31bbc4dd6cf71bbfef7faf6bd68" @@ -1451,7 +1456,7 @@ semver@^7.6.0, semver@^7.7.3: resolved "https://registry.yarnpkg.com/semver/-/semver-7.8.0.tgz#ed0661039fcbcda2ce71f01fa6adbefaa77040df" integrity sha512-AcM7dV/5ul4EekoQ29Agm5vri8JNqRyj39o0qpX6vDF2GZrtutZl5RwgD1XnZjiTAfncsJhMI48QQH3sN87YNA== -sharp@*, sharp@^0.34.5: +sharp@*, sharp@0.34.5: version "0.34.5" resolved "https://registry.yarnpkg.com/sharp/-/sharp-0.34.5.tgz#b6f148e4b8c61f1797bde11a9d1cfebbae2c57b0" integrity sha512-Ou9I5Ft9WNcCbXrU9cMgPBcCK8LiwLqcbywW3t4oDV37n1pzpuNLsYiAV8eODnjbtQlSDwZ2cUEeQz4E54Hltg==