Add --ml and --exif flags to backup-metadata

--ml fetches face detections and CLIP embeddings from the /files/data/fetch
endpoint (type 'mldata'). Each blob is gzipped JSON encrypted with the
file's key; we decrypt it with decryptBlob, gunzip it, and include the
parsed JSON as 'mlData' in the per-file output. File IDs are fetched in
batches of 200.

--exif downloads each file, runs sharp().metadata() to extract image
properties (format, dimensions, color space, orientation), then parses
the raw EXIF buffer with exif-reader for structured tags (lens, ISO,
shutter, aperture, GPS altitude, etc.). Also captures raw IPTC, XMP,
and ICC profile data. Included as 'imageMetadata' in the per-file output.

Without either flag, behavior is unchanged (fast metadata-only dump).

Adds exif-reader 2.0.3 as a runtime dependency.
3 new tests (ML data decrypted, ML data absent when flag not set, EXIF
extraction). 119 total tests, all green.
This commit is contained in:
2026-06-09 17:35:35 -04:00
parent 73bfec5a9e
commit c8e7971445
5 changed files with 357 additions and 23 deletions

View File

@@ -22,6 +22,7 @@
* the output tree is correct and complete.
*/
import { gzipSync } from "node:zlib";
import {
existsSync,
mkdtempSync,
@@ -39,7 +40,9 @@ import {
toBase64,
deriveKEK,
deriveLoginSubkey,
encryptBlob,
} from "../../src/crypto/index.js";
import sharp from "sharp";
import { Client } from "../../src/client.js";
import { runMetadataBackup } from "../../src/metadata-backup.js";
import type { KeyAttributes } from "../../src/auth/types.js";
@@ -56,6 +59,12 @@ interface MetaMockState {
encryptedToken: string;
collections: Record<string, unknown>[];
filesByCollection: Record<number, Record<string, unknown>[]>;
// For ML data and EXIF tests
encryptedMLData: Record<
number,
{ encryptedData: string; decryptionHeader: string }
>;
fileCiphertexts: Record<number, Uint8Array>;
}
let mock: MetaMockState;
@@ -245,6 +254,58 @@ const buildMetaMock = async (): Promise<MetaMockState> => {
updationTime: 1710000000000000,
};
// Encrypt ML data for file 100 (gzipped JSON, encrypted with file key)
const mlPayload = JSON.stringify({
face: {
version: 1,
client: "test",
width: 3000,
height: 2000,
faces: [
{
faceID: "face-abc",
detection: {
box: { x: 0.1, y: 0.2, width: 0.3, height: 0.4 },
landmarks: [
{ x: 0.15, y: 0.25 },
{ x: 0.25, y: 0.25 },
],
},
score: 0.98,
blur: 12.5,
embedding: [0.1, 0.2, 0.3],
},
],
},
clip: {
version: 1,
client: "test",
embedding: [0.5, 0.6, 0.7],
},
});
const gzipped = gzipSync(Buffer.from(mlPayload));
const { header: mlHeader, ciphertext: mlCiphertext } = encryptBlob(
new Uint8Array(gzipped),
fk1,
);
// Generate a real JPEG for EXIF extraction tests
const tinyJpeg = await sharp({
create: { width: 100, height: 80, channels: 3, background: "red" },
})
.jpeg({ quality: 80 })
.toBuffer();
const filePush1 =
sodium.crypto_secretstream_xchacha20poly1305_init_push(fk1);
const encFileBody1 = sodium.crypto_secretstream_xchacha20poly1305_push(
filePush1.state,
new Uint8Array(tinyJpeg),
null,
sodium.crypto_secretstream_xchacha20poly1305_TAG_FINAL,
);
// Patch rawFile1's file.decryptionHeader to match the push header
rawFile1.file.decryptionHeader = toBase64(filePush1.header);
return {
verifier,
srpAttributes: {
@@ -259,6 +320,13 @@ const buildMetaMock = async (): Promise<MetaMockState> => {
encryptedToken: toBase64(encToken),
collections: [rawColl1, rawColl2],
filesByCollection: { 10: [rawFile1], 20: [rawFile2] },
encryptedMLData: {
100: {
encryptedData: toBase64(mlCiphertext),
decryptionHeader: toBase64(mlHeader),
},
},
fileCiphertexts: { 100: encFileBody1 },
};
};
@@ -316,6 +384,29 @@ const buildMetaFetch = (m: MetaMockState) => {
hasMore: false,
});
}
if (path === "/files/data/fetch") {
const body = JSON.parse(init?.body as string);
const data = (body.fileIDs as number[])
.filter((id: number) => m.encryptedMLData[id])
.map((id: number) => ({
fileID: id,
...m.encryptedMLData[id],
updatedAt: 1700000000000000,
}));
return json({ data });
}
if (
url.includes("files.ente.io") ||
path.startsWith("/files/download/")
) {
const parsed = new URL(url);
const fileID = Number(
parsed.searchParams.get("fileID") ?? path.split("/").pop(),
);
const ct = m.fileCiphertexts[fileID];
if (ct) return new Response(ct, { status: 200 });
return new Response("not found", { status: 404 });
}
return new Response("not found", { status: 404 });
}) as typeof globalThis.fetch;
};
@@ -465,4 +556,82 @@ describe("quak backup-metadata", () => {
);
expect(account.email).toBe(TEST_EMAIL);
});
it("fetches and decrypts ML data when --ml is set", async () => {
  // Log in against the mocked API, then run the backup with ML data enabled.
  const outDir = join(testDir, "ml-data");
  const client = await Client.login({
    email: TEST_EMAIL,
    password: TEST_PASSWORD,
    apiOptions: { fetch: buildMetaFetch(mock) },
  });
  await runMetadataBackup(client, outDir, { mlData: true });

  // Locate the collection directory by name. Guard explicitly rather than
  // using a non-null assertion: a missing directory would otherwise surface
  // as an opaque TypeError thrown by join() on an undefined path segment.
  const collDirs = readdirSync(join(outDir, "collections"));
  const vacDir = collDirs.find((d) => d.includes("Vacation"));
  if (vacDir === undefined) {
    throw new Error(
      `no "Vacation" collection directory in backup output; found: [${collDirs.join(", ")}]`,
    );
  }
  const fileMeta = JSON.parse(
    readFileSync(join(outDir, "collections", vacDir, "100.json"), "utf-8"),
  );

  // ML data should be present and decrypted: the face block with its single
  // detection, and the CLIP embedding.
  expect(fileMeta.mlData).toBeDefined();
  expect(fileMeta.mlData.face).toBeDefined();
  expect(fileMeta.mlData.face.faces.length).toBe(1);
  expect(fileMeta.mlData.face.faces[0].faceID).toBe("face-abc");
  expect(fileMeta.mlData.face.faces[0].score).toBeCloseTo(0.98);
  expect(fileMeta.mlData.face.faces[0].detection.box.x).toBeCloseTo(0.1);
  expect(fileMeta.mlData.clip).toBeDefined();
  expect(fileMeta.mlData.clip.embedding).toEqual([0.5, 0.6, 0.7]);
});
it("does not include ML data when --ml is not set", async () => {
  // Log in against the mocked API and run the backup with no options, i.e.
  // without requesting ML data.
  const outDir = join(testDir, "no-ml");
  const client = await Client.login({
    email: TEST_EMAIL,
    password: TEST_PASSWORD,
    apiOptions: { fetch: buildMetaFetch(mock) },
  });
  await runMetadataBackup(client, outDir);

  // Locate the collection directory by name. Guard explicitly rather than
  // using a non-null assertion: a missing directory would otherwise surface
  // as an opaque TypeError thrown by join() on an undefined path segment.
  const collDirs = readdirSync(join(outDir, "collections"));
  const vacDir = collDirs.find((d) => d.includes("Vacation"));
  if (vacDir === undefined) {
    throw new Error(
      `no "Vacation" collection directory in backup output; found: [${collDirs.join(", ")}]`,
    );
  }
  const fileMeta = JSON.parse(
    readFileSync(join(outDir, "collections", vacDir, "100.json"), "utf-8"),
  );

  // Without the flag the per-file JSON must not carry an mlData key.
  expect(fileMeta.mlData).toBeUndefined();
});
it("extracts EXIF from downloaded files when --exif is set", async () => {
  // Log in against the mocked API, then run the backup with EXIF extraction
  // enabled.
  const outDir = join(testDir, "exif-data");
  const client = await Client.login({
    email: TEST_EMAIL,
    password: TEST_PASSWORD,
    apiOptions: { fetch: buildMetaFetch(mock) },
  });
  await runMetadataBackup(client, outDir, { exif: true });

  // Locate the collection directory by name. Guard explicitly rather than
  // using a non-null assertion: a missing directory would otherwise surface
  // as an opaque TypeError thrown by join() on an undefined path segment.
  const collDirs = readdirSync(join(outDir, "collections"));
  const vacDir = collDirs.find((d) => d.includes("Vacation"));
  if (vacDir === undefined) {
    throw new Error(
      `no "Vacation" collection directory in backup output; found: [${collDirs.join(", ")}]`,
    );
  }
  const fileMeta = JSON.parse(
    readFileSync(join(outDir, "collections", vacDir, "100.json"), "utf-8"),
  );

  // imageMetadata from sharp should be present and describe a 100x80,
  // 3-channel JPEG.
  expect(fileMeta.imageMetadata).toBeDefined();
  expect(fileMeta.imageMetadata.format).toBe("jpeg");
  expect(fileMeta.imageMetadata.width).toBe(100);
  expect(fileMeta.imageMetadata.height).toBe(80);
  expect(fileMeta.imageMetadata.channels).toBe(3);
});
});