Merge: backup-metadata --ml and --exif flags

--ml: fetches face detections (bounding boxes, landmarks, embeddings)
and CLIP search embeddings from the /files/data/fetch endpoint. Each
payload is gzipped JSON encrypted with the file key; quak decrypts and
decompresses it, then embeds it in the per-file JSON output.

--exif: downloads each original file, extracts full image metadata
via sharp (format, dimensions, color space, orientation) and parses
raw EXIF tags via exif-reader (lens, ISO, shutter, aperture, GPS
altitude, software, etc.). Also captures IPTC, XMP, and ICC data.

3 new tests. 119 total, all green.
This commit is contained in:
2026-06-09 17:35:44 -04:00
5 changed files with 357 additions and 23 deletions

View File

@@ -337,13 +337,25 @@ program
program program
.command("backup-metadata") .command("backup-metadata")
.description( .description(
"Dump all decrypted account metadata (no file content) to a directory", "Dump all decrypted account metadata to a directory of JSON files",
) )
.argument("<dir>", "Output directory") .argument("<dir>", "Output directory")
.action(async (dir: string) => { .option(
"--ml",
"Include ML data (face detections, CLIP embeddings) from the Ente server",
)
.option(
"--exif",
"Download each file and extract full EXIF/IPTC/XMP metadata (slow)",
)
.action(async (dir: string, opts: { ml?: boolean; exif?: boolean }) => {
await init(); await init();
const client = requireSession(); const client = requireSession();
await runMetadataBackup(client, dir, (msg) => stderr.write(msg + "\n")); await runMetadataBackup(client, dir, {
mlData: opts.ml,
exif: opts.exif,
onProgress: (msg) => stderr.write(msg + "\n"),
});
}); });
program program

View File

@@ -41,6 +41,7 @@
"dependencies": { "dependencies": {
"commander": "14.0.3", "commander": "14.0.3",
"env-paths": "4.0.0", "env-paths": "4.0.0",
"exif-reader": "2.0.3",
"fast-srp-hap": "2.0.4", "fast-srp-hap": "2.0.4",
"libsodium-wrappers-sumo": "0.8.4", "libsodium-wrappers-sumo": "0.8.4",
"sharp": "0.34.5" "sharp": "0.34.5"

View File

@@ -1,18 +1,127 @@
import { mkdirSync, writeFileSync } from "node:fs"; import { gunzipSync } from "node:zlib";
import { mkdirSync, mkdtempSync, rmSync, writeFileSync } from "node:fs";
import { join } from "node:path"; import { join } from "node:path";
import { tmpdir } from "node:os";
import sharp from "sharp";
import exifReader from "exif-reader";
import type { Client } from "./client.js"; import type { Client } from "./client.js";
import { decryptBlob, fromBase64 } from "./crypto/index.js";
import type { EnteFile } from "./model/types.js";
export type ProgressCallback = (message: string) => void; export type ProgressCallback = (message: string) => void;
/** Optional switches accepted by `runMetadataBackup`. */
export interface MetadataBackupOptions {
/**
 * When true, ML data is fetched in bulk and attached to each file's JSON
 * under `mlData`. Treated as false when omitted.
 */
mlData?: boolean;
/**
 * When true, each file is downloaded and its image metadata is attached to
 * the file's JSON under `imageMetadata`. Treated as false when omitted.
 */
exif?: boolean;
/** Receives human-readable progress messages; a no-op is used when omitted. */
onProgress?: ProgressCallback;
}
const sanitizePath = (name: string): string => const sanitizePath = (name: string): string =>
name.replace(/[/\\:*?"<>|]/g, "_").replace(/^\.+/, "_"); name.replace(/[/\\:*?"<>|]/g, "_").replace(/^\.+/, "_");
/** One entry of the `data` array returned by `POST /files/data/fetch`. */
interface RawRemoteFileData {
/** ID of the file this payload belongs to; used to look up its file key. */
fileID: number;
/** Base64-encoded ciphertext of the payload. */
encryptedData: string;
/** Base64-encoded header passed to `decryptBlob` together with the file key. */
decryptionHeader: string;
/** Not read by the code in this module. NOTE(review): presumably a server timestamp — confirm units. */
updatedAt?: number;
}
/**
 * Retrieve and decode the remote ML payloads for the given files.
 *
 * File IDs are sent to `/files/data/fetch` in chunks of 200. Every returned
 * entry whose file key is known is base64-decoded, decrypted with that key,
 * gunzipped and parsed as JSON.
 *
 * @param client - Session whose API client performs the requests.
 * @param fileIDs - IDs of the files to request ML data for.
 * @param fileKeys - Decryption key per file ID; entries without a key are ignored.
 * @returns Map from file ID to its parsed ML payload. Files with no payload,
 *   no key, or an undecodable payload are simply absent from the map.
 */
const fetchMLDataForFiles = async (
  client: Client,
  fileIDs: number[],
  fileKeys: Map<number, Uint8Array>,
): Promise<Map<number, Record<string, unknown>>> => {
  const chunkSize = 200;
  const apiClient = client.getApiClient();
  const decoded = new Map<number, Record<string, unknown>>();

  let offset = 0;
  while (offset < fileIDs.length) {
    const chunk = fileIDs.slice(offset, offset + chunkSize);
    const response = await apiClient.postJSON<{ data: RawRemoteFileData[] }>(
      "/files/data/fetch",
      { type: "mldata", fileIDs: chunk },
    );

    for (const remote of response.data ?? []) {
      const fileKey = fileKeys.get(remote.fileID);
      if (fileKey) {
        try {
          const ciphertext = fromBase64(remote.encryptedData);
          const header = fromBase64(remote.decryptionHeader);
          const plaintext = decryptBlob(ciphertext, header, fileKey);
          const json = gunzipSync(Buffer.from(plaintext)).toString("utf-8");
          decoded.set(remote.fileID, JSON.parse(json));
        } catch {
          // Best-effort: an undecodable payload only drops this one file.
        }
      }
    }

    offset += chunkSize;
  }

  return decoded;
};
/**
 * Download a file's original and collect the image metadata embedded in it.
 *
 * The original is written into a fresh temporary directory, inspected with
 * sharp's `metadata()`, and the directory is removed again before returning,
 * whether or not extraction succeeded.
 *
 * @param client - Session used to download the original via `downloadFile`.
 * @param file - The file whose original should be inspected.
 * @returns The basic image properties reported by sharp plus, when present:
 *   `exif` (parsed tags) or `exifRaw` (base64, if parsing fails), `iptcRaw`
 *   (base64), `xmp` (UTF-8 text) and `iccRaw` (base64). Returns `undefined`
 *   when the download or sharp itself fails, so one bad file does not abort
 *   the whole backup.
 */
const extractExif = async (
  client: Client,
  file: EnteFile,
): Promise<Record<string, unknown> | undefined> => {
  const tmpDir = mkdtempSync(join(tmpdir(), "quak-exif-"));
  try {
    const origPath = join(tmpDir, "original");
    await client.downloadFile(file, origPath);
    const meta = await sharp(origPath).metadata();

    const result: Record<string, unknown> = {
      format: meta.format,
      width: meta.width,
      height: meta.height,
      space: meta.space,
      channels: meta.channels,
      depth: meta.depth,
      density: meta.density,
      // sharp names this property `chromaSubsampling` (lowercase "s"); the
      // former `chromaSubSampling` read was always undefined and therefore
      // never reached the JSON output.
      chromaSubsampling: meta.chromaSubsampling,
      isProgressive: meta.isProgressive,
      hasProfile: meta.hasProfile,
      hasAlpha: meta.hasAlpha,
      orientation: meta.orientation,
    };

    if (meta.exif) {
      try {
        result.exif = exifReader(meta.exif);
      } catch {
        // Malformed EXIF; store the raw bytes as base64 instead
        result.exifRaw = meta.exif.toString("base64");
      }
    }
    if (meta.iptc) result.iptcRaw = meta.iptc.toString("base64");
    // Buffer#toString("utf-8") never throws (invalid bytes decode to U+FFFD),
    // so no fallback branch is needed here.
    if (meta.xmp) result.xmp = Buffer.from(meta.xmp).toString("utf-8");
    if (meta.icc) result.iccRaw = meta.icc.toString("base64");

    return result;
  } catch {
    // Best-effort: download or decode failure just omits image metadata.
    return undefined;
  } finally {
    rmSync(tmpDir, { recursive: true, force: true });
  }
};
export const runMetadataBackup = async ( export const runMetadataBackup = async (
client: Client, client: Client,
outDir: string, outDir: string,
onProgress?: ProgressCallback, opts?: MetadataBackupOptions,
): Promise<void> => { ): Promise<void> => {
const log = onProgress ?? (() => {}); const log = opts?.onProgress ?? (() => {});
const wantML = opts?.mlData ?? false;
const wantExif = opts?.exif ?? false;
mkdirSync(outDir, { recursive: true }); mkdirSync(outDir, { recursive: true });
mkdirSync(join(outDir, "collections"), { recursive: true }); mkdirSync(join(outDir, "collections"), { recursive: true });
@@ -26,6 +135,10 @@ export const runMetadataBackup = async (
log("Fetching collections..."); log("Fetching collections...");
const collections = await client.listCollections(); const collections = await client.listCollections();
const allFiles: { file: EnteFile; colDirName: string }[] = [];
const fileKeys = new Map<number, Uint8Array>();
const seenFileIDs = new Set<number>();
for (const col of collections) { for (const col of collections) {
const dirName = `${col.id}-${sanitizePath(col.name || "unnamed")}`; const dirName = `${col.id}-${sanitizePath(col.name || "unnamed")}`;
const colDir = join(outDir, "collections", dirName); const colDir = join(outDir, "collections", dirName);
@@ -55,6 +168,31 @@ export const runMetadataBackup = async (
log(`[${col.name}] ${files.length} file(s)`); log(`[${col.name}] ${files.length} file(s)`);
for (const file of files) { for (const file of files) {
allFiles.push({ file, colDirName: dirName });
if (!seenFileIDs.has(file.id)) {
fileKeys.set(file.id, file.key);
seenFileIDs.add(file.id);
}
}
}
// Fetch ML data in bulk if requested
let mlDataMap = new Map<number, Record<string, unknown>>();
if (wantML) {
log("Fetching ML data (face detections, CLIP embeddings)...");
mlDataMap = await fetchMLDataForFiles(
client,
[...fileKeys.keys()],
fileKeys,
);
log(`Got ML data for ${mlDataMap.size} file(s)`);
}
// Write per-file JSON (with optional ML data and EXIF)
const writtenFileIDs = new Set<number>();
for (const { file, colDirName } of allFiles) {
const colDir = join(outDir, "collections", colDirName);
const fileMeta: Record<string, unknown> = { const fileMeta: Record<string, unknown> = {
id: file.id, id: file.id,
collectionID: file.collectionID, collectionID: file.collectionID,
@@ -66,12 +204,21 @@ export const runMetadataBackup = async (
if (file.pubMagicMetadata) if (file.pubMagicMetadata)
fileMeta.pubMagicMetadata = file.pubMagicMetadata; fileMeta.pubMagicMetadata = file.pubMagicMetadata;
const ml = mlDataMap.get(file.id);
if (ml) fileMeta.mlData = ml;
if (wantExif && !writtenFileIDs.has(file.id)) {
log(`[${file.metadata.title}] Extracting EXIF...`);
const exifData = await extractExif(client, file);
if (exifData) fileMeta.imageMetadata = exifData;
}
writtenFileIDs.add(file.id);
writeFileSync( writeFileSync(
join(colDir, `${file.id}.json`), join(colDir, `${file.id}.json`),
JSON.stringify(fileMeta, null, 2), JSON.stringify(fileMeta, null, 2),
); );
} }
}
log("Metadata backup complete."); log("Metadata backup complete.");
}; };

View File

@@ -22,6 +22,7 @@
* the output tree is correct and complete. * the output tree is correct and complete.
*/ */
import { gzipSync } from "node:zlib";
import { import {
existsSync, existsSync,
mkdtempSync, mkdtempSync,
@@ -39,7 +40,9 @@ import {
toBase64, toBase64,
deriveKEK, deriveKEK,
deriveLoginSubkey, deriveLoginSubkey,
encryptBlob,
} from "../../src/crypto/index.js"; } from "../../src/crypto/index.js";
import sharp from "sharp";
import { Client } from "../../src/client.js"; import { Client } from "../../src/client.js";
import { runMetadataBackup } from "../../src/metadata-backup.js"; import { runMetadataBackup } from "../../src/metadata-backup.js";
import type { KeyAttributes } from "../../src/auth/types.js"; import type { KeyAttributes } from "../../src/auth/types.js";
@@ -56,6 +59,12 @@ interface MetaMockState {
encryptedToken: string; encryptedToken: string;
collections: Record<string, unknown>[]; collections: Record<string, unknown>[];
filesByCollection: Record<number, Record<string, unknown>[]>; filesByCollection: Record<number, Record<string, unknown>[]>;
// For ML data and EXIF tests
encryptedMLData: Record<
number,
{ encryptedData: string; decryptionHeader: string }
>;
fileCiphertexts: Record<number, Uint8Array>;
} }
let mock: MetaMockState; let mock: MetaMockState;
@@ -245,6 +254,58 @@ const buildMetaMock = async (): Promise<MetaMockState> => {
updationTime: 1710000000000000, updationTime: 1710000000000000,
}; };
// Encrypt ML data for file 100 (gzipped JSON, encrypted with file key)
const mlPayload = JSON.stringify({
face: {
version: 1,
client: "test",
width: 3000,
height: 2000,
faces: [
{
faceID: "face-abc",
detection: {
box: { x: 0.1, y: 0.2, width: 0.3, height: 0.4 },
landmarks: [
{ x: 0.15, y: 0.25 },
{ x: 0.25, y: 0.25 },
],
},
score: 0.98,
blur: 12.5,
embedding: [0.1, 0.2, 0.3],
},
],
},
clip: {
version: 1,
client: "test",
embedding: [0.5, 0.6, 0.7],
},
});
const gzipped = gzipSync(Buffer.from(mlPayload));
const { header: mlHeader, ciphertext: mlCiphertext } = encryptBlob(
new Uint8Array(gzipped),
fk1,
);
// Generate a real JPEG for EXIF extraction tests
const tinyJpeg = await sharp({
create: { width: 100, height: 80, channels: 3, background: "red" },
})
.jpeg({ quality: 80 })
.toBuffer();
const filePush1 =
sodium.crypto_secretstream_xchacha20poly1305_init_push(fk1);
const encFileBody1 = sodium.crypto_secretstream_xchacha20poly1305_push(
filePush1.state,
new Uint8Array(tinyJpeg),
null,
sodium.crypto_secretstream_xchacha20poly1305_TAG_FINAL,
);
// Patch rawFile1's file.decryptionHeader to match the push header
rawFile1.file.decryptionHeader = toBase64(filePush1.header);
return { return {
verifier, verifier,
srpAttributes: { srpAttributes: {
@@ -259,6 +320,13 @@ const buildMetaMock = async (): Promise<MetaMockState> => {
encryptedToken: toBase64(encToken), encryptedToken: toBase64(encToken),
collections: [rawColl1, rawColl2], collections: [rawColl1, rawColl2],
filesByCollection: { 10: [rawFile1], 20: [rawFile2] }, filesByCollection: { 10: [rawFile1], 20: [rawFile2] },
encryptedMLData: {
100: {
encryptedData: toBase64(mlCiphertext),
decryptionHeader: toBase64(mlHeader),
},
},
fileCiphertexts: { 100: encFileBody1 },
}; };
}; };
@@ -316,6 +384,29 @@ const buildMetaFetch = (m: MetaMockState) => {
hasMore: false, hasMore: false,
}); });
} }
if (path === "/files/data/fetch") {
const body = JSON.parse(init?.body as string);
const data = (body.fileIDs as number[])
.filter((id: number) => m.encryptedMLData[id])
.map((id: number) => ({
fileID: id,
...m.encryptedMLData[id],
updatedAt: 1700000000000000,
}));
return json({ data });
}
if (
url.includes("files.ente.io") ||
path.startsWith("/files/download/")
) {
const parsed = new URL(url);
const fileID = Number(
parsed.searchParams.get("fileID") ?? path.split("/").pop(),
);
const ct = m.fileCiphertexts[fileID];
if (ct) return new Response(ct, { status: 200 });
return new Response("not found", { status: 404 });
}
return new Response("not found", { status: 404 }); return new Response("not found", { status: 404 });
}) as typeof globalThis.fetch; }) as typeof globalThis.fetch;
}; };
@@ -465,4 +556,82 @@ describe("quak backup-metadata", () => {
); );
expect(account.email).toBe(TEST_EMAIL); expect(account.email).toBe(TEST_EMAIL);
}); });
it("fetches and decrypts ML data when --ml is set", async () => {
  // Log in against the mock server, then run the backup with ML enabled.
  const session = await Client.login({
    email: TEST_EMAIL,
    password: TEST_PASSWORD,
    apiOptions: { fetch: buildMetaFetch(mock) },
  });
  const target = join(testDir, "ml-data");
  await runMetadataBackup(session, target, { mlData: true });

  // Locate file 100's JSON inside the "Vacation" collection directory.
  const collectionsRoot = join(target, "collections");
  const vacation = readdirSync(collectionsRoot).find((name) =>
    name.includes("Vacation"),
  )!;
  const record = JSON.parse(
    readFileSync(join(collectionsRoot, vacation, "100.json"), "utf-8"),
  );

  // The decrypted, gunzipped ML payload must round-trip intact.
  expect(record.mlData).toBeDefined();
  expect(record.mlData.face).toBeDefined();
  expect(record.mlData.face.faces.length).toBe(1);
  const firstFace = record.mlData.face.faces[0];
  expect(firstFace.faceID).toBe("face-abc");
  expect(firstFace.score).toBeCloseTo(0.98);
  expect(firstFace.detection.box.x).toBeCloseTo(0.1);
  expect(record.mlData.clip).toBeDefined();
  expect(record.mlData.clip.embedding).toEqual([0.5, 0.6, 0.7]);
});
it("does not include ML data when --ml is not set", async () => {
  // Run the backup with no options at all: ML fetching must stay off.
  const session = await Client.login({
    email: TEST_EMAIL,
    password: TEST_PASSWORD,
    apiOptions: { fetch: buildMetaFetch(mock) },
  });
  const target = join(testDir, "no-ml");
  await runMetadataBackup(session, target);

  const collectionsRoot = join(target, "collections");
  const vacation = readdirSync(collectionsRoot).find((name) =>
    name.includes("Vacation"),
  )!;
  const record = JSON.parse(
    readFileSync(join(collectionsRoot, vacation, "100.json"), "utf-8"),
  );

  expect(record.mlData).toBeUndefined();
});
it("extracts EXIF from downloaded files when --exif is set", async () => {
  // Run the backup with EXIF extraction enabled against the mock server.
  const session = await Client.login({
    email: TEST_EMAIL,
    password: TEST_PASSWORD,
    apiOptions: { fetch: buildMetaFetch(mock) },
  });
  const target = join(testDir, "exif-data");
  await runMetadataBackup(session, target, { exif: true });

  const collectionsRoot = join(target, "collections");
  const vacation = readdirSync(collectionsRoot).find((name) =>
    name.includes("Vacation"),
  )!;
  const record = JSON.parse(
    readFileSync(join(collectionsRoot, vacation, "100.json"), "utf-8"),
  );

  // sharp's view of the mock's 100x80 three-channel JPEG must be recorded.
  expect(record.imageMetadata).toBeDefined();
  const image = record.imageMetadata;
  expect(image.format).toBe("jpeg");
  expect(image.width).toBe(100);
  expect(image.height).toBe(80);
  expect(image.channels).toBe(3);
});
}); });

View File

@@ -557,7 +557,7 @@
dependencies: dependencies:
undici-types "~6.21.0" undici-types "~6.21.0"
"@types/sharp@^0.32.0": "@types/sharp@0.32.0":
version "0.32.0" version "0.32.0"
resolved "https://registry.yarnpkg.com/@types/sharp/-/sharp-0.32.0.tgz#fc3ac6df6b456319bae807c3d24efdc6631cdd6f" resolved "https://registry.yarnpkg.com/@types/sharp/-/sharp-0.32.0.tgz#fc3ac6df6b456319bae807c3d24efdc6631cdd6f"
integrity sha512-OOi3kL+FZDnPhVzsfD37J88FNeZh6gQsGcLc95NbeURRGvmSjeXiDcyWzF2o3yh/gQAUn2uhh/e+CPCa5nwAxw== integrity sha512-OOi3kL+FZDnPhVzsfD37J88FNeZh6gQsGcLc95NbeURRGvmSjeXiDcyWzF2o3yh/gQAUn2uhh/e+CPCa5nwAxw==
@@ -1026,6 +1026,11 @@ esutils@^2.0.2:
resolved "https://registry.yarnpkg.com/esutils/-/esutils-2.0.3.tgz#74d2eb4de0b8da1293711910d50775b9b710ef64" resolved "https://registry.yarnpkg.com/esutils/-/esutils-2.0.3.tgz#74d2eb4de0b8da1293711910d50775b9b710ef64"
integrity sha512-kVscqXk4OCp68SZ0dkgEKVi6/8ij300KBWTJq32P/dYeWTSwK41WyTxalN1eRmA5Z9UU/LX9D7FWSmV9SAYx6g== integrity sha512-kVscqXk4OCp68SZ0dkgEKVi6/8ij300KBWTJq32P/dYeWTSwK41WyTxalN1eRmA5Z9UU/LX9D7FWSmV9SAYx6g==
exif-reader@^2.0.3:
version "2.0.3"
resolved "https://registry.yarnpkg.com/exif-reader/-/exif-reader-2.0.3.tgz#259997735080bc6bb959c37b32c60f004ec4391d"
integrity sha512-zFbQvguwT9JkqyYhR7pjE1Yn8SagwaGLNRU0Oh14xFa1paSf5Gzxn4gxgk0XhnudI0UIqU+HgnBX93+nva592A==
expect-type@^1.1.0: expect-type@^1.1.0:
version "1.3.0" version "1.3.0"
resolved "https://registry.yarnpkg.com/expect-type/-/expect-type-1.3.0.tgz#0d58ed361877a31bbc4dd6cf71bbfef7faf6bd68" resolved "https://registry.yarnpkg.com/expect-type/-/expect-type-1.3.0.tgz#0d58ed361877a31bbc4dd6cf71bbfef7faf6bd68"
@@ -1451,7 +1456,7 @@ semver@^7.6.0, semver@^7.7.3:
resolved "https://registry.yarnpkg.com/semver/-/semver-7.8.0.tgz#ed0661039fcbcda2ce71f01fa6adbefaa77040df" resolved "https://registry.yarnpkg.com/semver/-/semver-7.8.0.tgz#ed0661039fcbcda2ce71f01fa6adbefaa77040df"
integrity sha512-AcM7dV/5ul4EekoQ29Agm5vri8JNqRyj39o0qpX6vDF2GZrtutZl5RwgD1XnZjiTAfncsJhMI48QQH3sN87YNA== integrity sha512-AcM7dV/5ul4EekoQ29Agm5vri8JNqRyj39o0qpX6vDF2GZrtutZl5RwgD1XnZjiTAfncsJhMI48QQH3sN87YNA==
sharp@*, sharp@^0.34.5: sharp@*, sharp@0.34.5:
version "0.34.5" version "0.34.5"
resolved "https://registry.yarnpkg.com/sharp/-/sharp-0.34.5.tgz#b6f148e4b8c61f1797bde11a9d1cfebbae2c57b0" resolved "https://registry.yarnpkg.com/sharp/-/sharp-0.34.5.tgz#b6f148e4b8c61f1797bde11a9d1cfebbae2c57b0"
integrity sha512-Ou9I5Ft9WNcCbXrU9cMgPBcCK8LiwLqcbywW3t4oDV37n1pzpuNLsYiAV8eODnjbtQlSDwZ2cUEeQz4E54Hltg== integrity sha512-Ou9I5Ft9WNcCbXrU9cMgPBcCK8LiwLqcbywW3t4oDV37n1pzpuNLsYiAV8eODnjbtQlSDwZ2cUEeQz4E54Hltg==