Add --ml and --exif flags to backup-metadata
--ml fetches face detections and CLIP embeddings from the /files/data/fetch endpoint (type 'mldata'), in batches of 200 file IDs. Each blob is gzipped JSON encrypted with the file's key; we decrypt it with decryptBlob, gunzip it, and include the parsed JSON as 'mlData' in the per-file output. --exif downloads each file, runs sharp().metadata() to extract image properties (format, dimensions, color space, orientation), then parses the raw EXIF buffer with exif-reader for structured tags (lens, ISO, shutter, aperture, GPS altitude, etc.). It also captures raw IPTC, XMP, and ICC profile data; all of this is included as 'imageMetadata' in the per-file output. Without either flag, behavior is unchanged (fast metadata-only dump). Adds exif-reader 2.0.3 as a runtime dependency. Adds 3 new tests (ML data decrypted, ML data absent when flag not set, EXIF extraction); 119 total tests, all green.
This commit is contained in:
18
bin/quak.ts
18
bin/quak.ts
@@ -337,13 +337,25 @@ program
|
||||
program
|
||||
.command("backup-metadata")
|
||||
.description(
|
||||
"Dump all decrypted account metadata (no file content) to a directory",
|
||||
"Dump all decrypted account metadata to a directory of JSON files",
|
||||
)
|
||||
.argument("<dir>", "Output directory")
|
||||
.action(async (dir: string) => {
|
||||
.option(
|
||||
"--ml",
|
||||
"Include ML data (face detections, CLIP embeddings) from the Ente server",
|
||||
)
|
||||
.option(
|
||||
"--exif",
|
||||
"Download each file and extract full EXIF/IPTC/XMP metadata (slow)",
|
||||
)
|
||||
.action(async (dir: string, opts: { ml?: boolean; exif?: boolean }) => {
|
||||
await init();
|
||||
const client = requireSession();
|
||||
await runMetadataBackup(client, dir, (msg) => stderr.write(msg + "\n"));
|
||||
await runMetadataBackup(client, dir, {
|
||||
mlData: opts.ml,
|
||||
exif: opts.exif,
|
||||
onProgress: (msg) => stderr.write(msg + "\n"),
|
||||
});
|
||||
});
|
||||
|
||||
program
|
||||
|
||||
@@ -41,6 +41,7 @@
|
||||
"dependencies": {
|
||||
"commander": "14.0.3",
|
||||
"env-paths": "4.0.0",
|
||||
"exif-reader": "2.0.3",
|
||||
"fast-srp-hap": "2.0.4",
|
||||
"libsodium-wrappers-sumo": "0.8.4",
|
||||
"sharp": "0.34.5"
|
||||
|
||||
@@ -1,18 +1,127 @@
|
||||
import { mkdirSync, writeFileSync } from "node:fs";
|
||||
import { gunzipSync } from "node:zlib";
|
||||
import { mkdirSync, mkdtempSync, rmSync, writeFileSync } from "node:fs";
|
||||
import { join } from "node:path";
|
||||
import { tmpdir } from "node:os";
|
||||
import sharp from "sharp";
|
||||
import exifReader from "exif-reader";
|
||||
import type { Client } from "./client.js";
|
||||
import { decryptBlob, fromBase64 } from "./crypto/index.js";
|
||||
import type { EnteFile } from "./model/types.js";
|
||||
|
||||
/**
 * Callback that receives one human-readable progress message at a time.
 * The callback's return value is ignored.
 */
export type ProgressCallback = (message: string) => void;
|
||||
|
||||
/**
 * Optional switches accepted by `runMetadataBackup`.
 *
 * Every member is optional; the backup treats an omitted `mlData` or `exif`
 * as `false` and an omitted `onProgress` as a no-op.
 */
export interface MetadataBackupOptions {
  /** When true, ML data is fetched from the server and added to each file's output as `mlData`. */
  mlData?: boolean;

  /** When true, each file is downloaded and its image metadata is added to the output as `imageMetadata`. */
  exif?: boolean;

  /** Receives progress messages while the backup runs. */
  onProgress?: ProgressCallback;
}
|
||||
|
||||
/**
 * Turns an arbitrary name into something usable as a single path segment.
 *
 * Each of the characters `/ \ : * ? " < > |` becomes `_`, and a run of
 * leading dots is collapsed into one `_`.
 */
const sanitizePath = (name: string): string => {
  const withoutReservedChars = name.replace(/[/\\:*?"<>|]/g, "_");
  const withoutLeadingDots = withoutReservedChars.replace(/^\.+/, "_");
  return withoutLeadingDots;
};
|
||||
|
||||
/**
 * One entry of the `data` array returned by the `/files/data/fetch` endpoint,
 * as consumed by `fetchMLDataForFiles`.
 */
interface RawRemoteFileData {
  /** ID of the file this entry belongs to; used to look up the file's key. */
  fileID: number;

  /** Encrypted payload, base64-encoded. */
  encryptedData: string;

  /** Decryption header for `encryptedData`, base64-encoded. */
  decryptionHeader: string;

  /**
   * Last-update marker supplied by the server.
   * NOTE(review): units are not established here and `fetchMLDataForFiles`
   * does not read this field — confirm against the API before relying on it.
   */
  updatedAt?: number;
}
|
||||
|
||||
/**
 * Fetches, decrypts and parses the server-side ML data for a set of files.
 *
 * File IDs are sent to `/files/data/fetch` (type `"mldata"`) in chunks of
 * 200. Each returned entry is decrypted with the matching key from
 * `fileKeys`, gunzipped, and parsed as JSON.
 *
 * @param client   Client whose API client issues the POST requests.
 * @param fileIDs  IDs of the files to request ML data for.
 * @param fileKeys Decryption key per file ID; entries whose ID has no key are skipped.
 * @returns Map from file ID to the parsed ML payload. Files with no entry in
 *          the response, no key, or a payload that fails to decrypt, gunzip
 *          or parse are absent from the map.
 */
const fetchMLDataForFiles = async (
  client: Client,
  fileIDs: number[],
  fileKeys: Map<number, Uint8Array>,
): Promise<Map<number, Record<string, unknown>>> => {
  const batchSize = 200;
  const apiClient = client.getApiClient();
  const mlByFileID = new Map<number, Record<string, unknown>>();

  let offset = 0;
  while (offset < fileIDs.length) {
    const chunk = fileIDs.slice(offset, offset + batchSize);
    offset += batchSize;

    const response = await apiClient.postJSON<{ data: RawRemoteFileData[] }>(
      "/files/data/fetch",
      { type: "mldata", fileIDs: chunk },
    );

    // A response without `data` is treated as "nothing for this chunk".
    const entries = response.data ?? [];
    for (const { fileID, encryptedData, decryptionHeader } of entries) {
      const fileKey = fileKeys.get(fileID);
      if (!fileKey) continue;

      try {
        const plaintext = decryptBlob(
          fromBase64(encryptedData),
          fromBase64(decryptionHeader),
          fileKey,
        );
        const inflated = gunzipSync(Buffer.from(plaintext));
        mlByFileID.set(fileID, JSON.parse(inflated.toString("utf-8")));
      } catch {
        // Corrupted ML data for this file; skip it
      }
    }
  }

  return mlByFileID;
};
|
||||
|
||||
/**
 * Downloads a file into a temporary directory and collects its image
 * metadata.
 *
 * The basic image properties come from `sharp(...).metadata()`. When an EXIF
 * buffer is present it is parsed with `exifReader` and stored as `exif`; if
 * parsing throws, the raw bytes are stored base64-encoded as `exifRaw`
 * instead. IPTC and ICC buffers are stored base64-encoded (`iptcRaw`,
 * `iccRaw`), and XMP is stored as a UTF-8 string (`xmp`).
 *
 * @param client Client used to download the file.
 * @param file   File to download and inspect.
 * @returns The collected metadata, or `undefined` if the download or the
 *          metadata extraction throws. The temporary directory is removed in
 *          either case.
 */
const extractExif = async (
  client: Client,
  file: EnteFile,
): Promise<Record<string, unknown> | undefined> => {
  const scratchDir = mkdtempSync(join(tmpdir(), "quak-exif-"));
  const asBase64 = (bytes: Buffer): string => bytes.toString("base64");

  try {
    const localCopy = join(scratchDir, "original");
    await client.downloadFile(file, localCopy);
    const info = await sharp(localCopy).metadata();

    // Key order is kept stable because the result is serialized to JSON.
    const collected: Record<string, unknown> = {
      format: info.format,
      width: info.width,
      height: info.height,
      space: info.space,
      channels: info.channels,
      depth: info.depth,
      density: info.density,
      chromaSubSampling: info.chromaSubSampling,
      isProgressive: info.isProgressive,
      hasProfile: info.hasProfile,
      hasAlpha: info.hasAlpha,
      orientation: info.orientation,
    };

    if (info.exif) {
      try {
        collected.exif = exifReader(info.exif);
      } catch {
        // Malformed EXIF; store the raw bytes as base64 instead
        collected.exifRaw = asBase64(info.exif);
      }
    }

    if (info.iptc) {
      collected.iptcRaw = asBase64(info.iptc);
    }

    if (info.xmp) {
      try {
        collected.xmp = Buffer.from(info.xmp).toString("utf-8");
      } catch {
        collected.xmpRaw = asBase64(info.xmp);
      }
    }

    if (info.icc) {
      collected.iccRaw = asBase64(info.icc);
    }

    return collected;
  } catch {
    // Best effort: any failure to download or read the file yields no metadata.
    return undefined;
  } finally {
    rmSync(scratchDir, { recursive: true, force: true });
  }
};
|
||||
|
||||
export const runMetadataBackup = async (
|
||||
client: Client,
|
||||
outDir: string,
|
||||
onProgress?: ProgressCallback,
|
||||
opts?: MetadataBackupOptions,
|
||||
): Promise<void> => {
|
||||
const log = onProgress ?? (() => {});
|
||||
const log = opts?.onProgress ?? (() => {});
|
||||
const wantML = opts?.mlData ?? false;
|
||||
const wantExif = opts?.exif ?? false;
|
||||
|
||||
mkdirSync(outDir, { recursive: true });
|
||||
mkdirSync(join(outDir, "collections"), { recursive: true });
|
||||
@@ -26,6 +135,10 @@ export const runMetadataBackup = async (
|
||||
log("Fetching collections...");
|
||||
const collections = await client.listCollections();
|
||||
|
||||
const allFiles: { file: EnteFile; colDirName: string }[] = [];
|
||||
const fileKeys = new Map<number, Uint8Array>();
|
||||
const seenFileIDs = new Set<number>();
|
||||
|
||||
for (const col of collections) {
|
||||
const dirName = `${col.id}-${sanitizePath(col.name || "unnamed")}`;
|
||||
const colDir = join(outDir, "collections", dirName);
|
||||
@@ -55,6 +168,31 @@ export const runMetadataBackup = async (
|
||||
log(`[${col.name}] ${files.length} file(s)`);
|
||||
|
||||
for (const file of files) {
|
||||
allFiles.push({ file, colDirName: dirName });
|
||||
if (!seenFileIDs.has(file.id)) {
|
||||
fileKeys.set(file.id, file.key);
|
||||
seenFileIDs.add(file.id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Fetch ML data in bulk if requested
|
||||
let mlDataMap = new Map<number, Record<string, unknown>>();
|
||||
if (wantML) {
|
||||
log("Fetching ML data (face detections, CLIP embeddings)...");
|
||||
mlDataMap = await fetchMLDataForFiles(
|
||||
client,
|
||||
[...fileKeys.keys()],
|
||||
fileKeys,
|
||||
);
|
||||
log(`Got ML data for ${mlDataMap.size} file(s)`);
|
||||
}
|
||||
|
||||
// Write per-file JSON (with optional ML data and EXIF)
|
||||
const writtenFileIDs = new Set<number>();
|
||||
for (const { file, colDirName } of allFiles) {
|
||||
const colDir = join(outDir, "collections", colDirName);
|
||||
|
||||
const fileMeta: Record<string, unknown> = {
|
||||
id: file.id,
|
||||
collectionID: file.collectionID,
|
||||
@@ -66,12 +204,21 @@ export const runMetadataBackup = async (
|
||||
if (file.pubMagicMetadata)
|
||||
fileMeta.pubMagicMetadata = file.pubMagicMetadata;
|
||||
|
||||
const ml = mlDataMap.get(file.id);
|
||||
if (ml) fileMeta.mlData = ml;
|
||||
|
||||
if (wantExif && !writtenFileIDs.has(file.id)) {
|
||||
log(`[${file.metadata.title}] Extracting EXIF...`);
|
||||
const exifData = await extractExif(client, file);
|
||||
if (exifData) fileMeta.imageMetadata = exifData;
|
||||
}
|
||||
writtenFileIDs.add(file.id);
|
||||
|
||||
writeFileSync(
|
||||
join(colDir, `${file.id}.json`),
|
||||
JSON.stringify(fileMeta, null, 2),
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
log("Metadata backup complete.");
|
||||
};
|
||||
|
||||
@@ -22,6 +22,7 @@
|
||||
* the output tree is correct and complete.
|
||||
*/
|
||||
|
||||
import { gzipSync } from "node:zlib";
|
||||
import {
|
||||
existsSync,
|
||||
mkdtempSync,
|
||||
@@ -39,7 +40,9 @@ import {
|
||||
toBase64,
|
||||
deriveKEK,
|
||||
deriveLoginSubkey,
|
||||
encryptBlob,
|
||||
} from "../../src/crypto/index.js";
|
||||
import sharp from "sharp";
|
||||
import { Client } from "../../src/client.js";
|
||||
import { runMetadataBackup } from "../../src/metadata-backup.js";
|
||||
import type { KeyAttributes } from "../../src/auth/types.js";
|
||||
@@ -56,6 +59,12 @@ interface MetaMockState {
|
||||
encryptedToken: string;
|
||||
collections: Record<string, unknown>[];
|
||||
filesByCollection: Record<number, Record<string, unknown>[]>;
|
||||
// For ML data and EXIF tests
|
||||
encryptedMLData: Record<
|
||||
number,
|
||||
{ encryptedData: string; decryptionHeader: string }
|
||||
>;
|
||||
fileCiphertexts: Record<number, Uint8Array>;
|
||||
}
|
||||
|
||||
let mock: MetaMockState;
|
||||
@@ -245,6 +254,58 @@ const buildMetaMock = async (): Promise<MetaMockState> => {
|
||||
updationTime: 1710000000000000,
|
||||
};
|
||||
|
||||
// Encrypt ML data for file 100 (gzipped JSON, encrypted with file key)
|
||||
const mlPayload = JSON.stringify({
|
||||
face: {
|
||||
version: 1,
|
||||
client: "test",
|
||||
width: 3000,
|
||||
height: 2000,
|
||||
faces: [
|
||||
{
|
||||
faceID: "face-abc",
|
||||
detection: {
|
||||
box: { x: 0.1, y: 0.2, width: 0.3, height: 0.4 },
|
||||
landmarks: [
|
||||
{ x: 0.15, y: 0.25 },
|
||||
{ x: 0.25, y: 0.25 },
|
||||
],
|
||||
},
|
||||
score: 0.98,
|
||||
blur: 12.5,
|
||||
embedding: [0.1, 0.2, 0.3],
|
||||
},
|
||||
],
|
||||
},
|
||||
clip: {
|
||||
version: 1,
|
||||
client: "test",
|
||||
embedding: [0.5, 0.6, 0.7],
|
||||
},
|
||||
});
|
||||
const gzipped = gzipSync(Buffer.from(mlPayload));
|
||||
const { header: mlHeader, ciphertext: mlCiphertext } = encryptBlob(
|
||||
new Uint8Array(gzipped),
|
||||
fk1,
|
||||
);
|
||||
|
||||
// Generate a real JPEG for EXIF extraction tests
|
||||
const tinyJpeg = await sharp({
|
||||
create: { width: 100, height: 80, channels: 3, background: "red" },
|
||||
})
|
||||
.jpeg({ quality: 80 })
|
||||
.toBuffer();
|
||||
const filePush1 =
|
||||
sodium.crypto_secretstream_xchacha20poly1305_init_push(fk1);
|
||||
const encFileBody1 = sodium.crypto_secretstream_xchacha20poly1305_push(
|
||||
filePush1.state,
|
||||
new Uint8Array(tinyJpeg),
|
||||
null,
|
||||
sodium.crypto_secretstream_xchacha20poly1305_TAG_FINAL,
|
||||
);
|
||||
// Patch rawFile1's file.decryptionHeader to match the push header
|
||||
rawFile1.file.decryptionHeader = toBase64(filePush1.header);
|
||||
|
||||
return {
|
||||
verifier,
|
||||
srpAttributes: {
|
||||
@@ -259,6 +320,13 @@ const buildMetaMock = async (): Promise<MetaMockState> => {
|
||||
encryptedToken: toBase64(encToken),
|
||||
collections: [rawColl1, rawColl2],
|
||||
filesByCollection: { 10: [rawFile1], 20: [rawFile2] },
|
||||
encryptedMLData: {
|
||||
100: {
|
||||
encryptedData: toBase64(mlCiphertext),
|
||||
decryptionHeader: toBase64(mlHeader),
|
||||
},
|
||||
},
|
||||
fileCiphertexts: { 100: encFileBody1 },
|
||||
};
|
||||
};
|
||||
|
||||
@@ -316,6 +384,29 @@ const buildMetaFetch = (m: MetaMockState) => {
|
||||
hasMore: false,
|
||||
});
|
||||
}
|
||||
if (path === "/files/data/fetch") {
|
||||
const body = JSON.parse(init?.body as string);
|
||||
const data = (body.fileIDs as number[])
|
||||
.filter((id: number) => m.encryptedMLData[id])
|
||||
.map((id: number) => ({
|
||||
fileID: id,
|
||||
...m.encryptedMLData[id],
|
||||
updatedAt: 1700000000000000,
|
||||
}));
|
||||
return json({ data });
|
||||
}
|
||||
if (
|
||||
url.includes("files.ente.io") ||
|
||||
path.startsWith("/files/download/")
|
||||
) {
|
||||
const parsed = new URL(url);
|
||||
const fileID = Number(
|
||||
parsed.searchParams.get("fileID") ?? path.split("/").pop(),
|
||||
);
|
||||
const ct = m.fileCiphertexts[fileID];
|
||||
if (ct) return new Response(ct, { status: 200 });
|
||||
return new Response("not found", { status: 404 });
|
||||
}
|
||||
return new Response("not found", { status: 404 });
|
||||
}) as typeof globalThis.fetch;
|
||||
};
|
||||
@@ -465,4 +556,82 @@ describe("quak backup-metadata", () => {
|
||||
);
|
||||
expect(account.email).toBe(TEST_EMAIL);
|
||||
});
|
||||
|
||||
it("fetches and decrypts ML data when --ml is set", async () => {
|
||||
const outDir = join(testDir, "ml-data");
|
||||
const client = await Client.login({
|
||||
email: TEST_EMAIL,
|
||||
password: TEST_PASSWORD,
|
||||
apiOptions: { fetch: buildMetaFetch(mock) },
|
||||
});
|
||||
|
||||
await runMetadataBackup(client, outDir, { mlData: true });
|
||||
|
||||
const collDirs = readdirSync(join(outDir, "collections"));
|
||||
const vacDir = collDirs.find((d) => d.includes("Vacation"))!;
|
||||
const fileMeta = JSON.parse(
|
||||
readFileSync(
|
||||
join(outDir, "collections", vacDir, "100.json"),
|
||||
"utf-8",
|
||||
),
|
||||
);
|
||||
|
||||
// ML data should be present and decrypted
|
||||
expect(fileMeta.mlData).toBeDefined();
|
||||
expect(fileMeta.mlData.face).toBeDefined();
|
||||
expect(fileMeta.mlData.face.faces.length).toBe(1);
|
||||
expect(fileMeta.mlData.face.faces[0].faceID).toBe("face-abc");
|
||||
expect(fileMeta.mlData.face.faces[0].score).toBeCloseTo(0.98);
|
||||
expect(fileMeta.mlData.face.faces[0].detection.box.x).toBeCloseTo(0.1);
|
||||
expect(fileMeta.mlData.clip).toBeDefined();
|
||||
expect(fileMeta.mlData.clip.embedding).toEqual([0.5, 0.6, 0.7]);
|
||||
});
|
||||
|
||||
it("does not include ML data when --ml is not set", async () => {
|
||||
const outDir = join(testDir, "no-ml");
|
||||
const client = await Client.login({
|
||||
email: TEST_EMAIL,
|
||||
password: TEST_PASSWORD,
|
||||
apiOptions: { fetch: buildMetaFetch(mock) },
|
||||
});
|
||||
|
||||
await runMetadataBackup(client, outDir);
|
||||
|
||||
const collDirs = readdirSync(join(outDir, "collections"));
|
||||
const vacDir = collDirs.find((d) => d.includes("Vacation"))!;
|
||||
const fileMeta = JSON.parse(
|
||||
readFileSync(
|
||||
join(outDir, "collections", vacDir, "100.json"),
|
||||
"utf-8",
|
||||
),
|
||||
);
|
||||
expect(fileMeta.mlData).toBeUndefined();
|
||||
});
|
||||
|
||||
it("extracts EXIF from downloaded files when --exif is set", async () => {
|
||||
const outDir = join(testDir, "exif-data");
|
||||
const client = await Client.login({
|
||||
email: TEST_EMAIL,
|
||||
password: TEST_PASSWORD,
|
||||
apiOptions: { fetch: buildMetaFetch(mock) },
|
||||
});
|
||||
|
||||
await runMetadataBackup(client, outDir, { exif: true });
|
||||
|
||||
const collDirs = readdirSync(join(outDir, "collections"));
|
||||
const vacDir = collDirs.find((d) => d.includes("Vacation"))!;
|
||||
const fileMeta = JSON.parse(
|
||||
readFileSync(
|
||||
join(outDir, "collections", vacDir, "100.json"),
|
||||
"utf-8",
|
||||
),
|
||||
);
|
||||
|
||||
// imageMetadata from sharp should be present
|
||||
expect(fileMeta.imageMetadata).toBeDefined();
|
||||
expect(fileMeta.imageMetadata.format).toBe("jpeg");
|
||||
expect(fileMeta.imageMetadata.width).toBe(100);
|
||||
expect(fileMeta.imageMetadata.height).toBe(80);
|
||||
expect(fileMeta.imageMetadata.channels).toBe(3);
|
||||
});
|
||||
});
|
||||
|
||||
@@ -557,7 +557,7 @@
|
||||
dependencies:
|
||||
undici-types "~6.21.0"
|
||||
|
||||
"@types/sharp@^0.32.0":
|
||||
"@types/sharp@0.32.0":
|
||||
version "0.32.0"
|
||||
resolved "https://registry.yarnpkg.com/@types/sharp/-/sharp-0.32.0.tgz#fc3ac6df6b456319bae807c3d24efdc6631cdd6f"
|
||||
integrity sha512-OOi3kL+FZDnPhVzsfD37J88FNeZh6gQsGcLc95NbeURRGvmSjeXiDcyWzF2o3yh/gQAUn2uhh/e+CPCa5nwAxw==
|
||||
@@ -1026,6 +1026,11 @@ esutils@^2.0.2:
|
||||
resolved "https://registry.yarnpkg.com/esutils/-/esutils-2.0.3.tgz#74d2eb4de0b8da1293711910d50775b9b710ef64"
|
||||
integrity sha512-kVscqXk4OCp68SZ0dkgEKVi6/8ij300KBWTJq32P/dYeWTSwK41WyTxalN1eRmA5Z9UU/LX9D7FWSmV9SAYx6g==
|
||||
|
||||
exif-reader@^2.0.3:
|
||||
version "2.0.3"
|
||||
resolved "https://registry.yarnpkg.com/exif-reader/-/exif-reader-2.0.3.tgz#259997735080bc6bb959c37b32c60f004ec4391d"
|
||||
integrity sha512-zFbQvguwT9JkqyYhR7pjE1Yn8SagwaGLNRU0Oh14xFa1paSf5Gzxn4gxgk0XhnudI0UIqU+HgnBX93+nva592A==
|
||||
|
||||
expect-type@^1.1.0:
|
||||
version "1.3.0"
|
||||
resolved "https://registry.yarnpkg.com/expect-type/-/expect-type-1.3.0.tgz#0d58ed361877a31bbc4dd6cf71bbfef7faf6bd68"
|
||||
@@ -1451,7 +1456,7 @@ semver@^7.6.0, semver@^7.7.3:
|
||||
resolved "https://registry.yarnpkg.com/semver/-/semver-7.8.0.tgz#ed0661039fcbcda2ce71f01fa6adbefaa77040df"
|
||||
integrity sha512-AcM7dV/5ul4EekoQ29Agm5vri8JNqRyj39o0qpX6vDF2GZrtutZl5RwgD1XnZjiTAfncsJhMI48QQH3sN87YNA==
|
||||
|
||||
sharp@*, sharp@^0.34.5:
|
||||
sharp@*, sharp@0.34.5:
|
||||
version "0.34.5"
|
||||
resolved "https://registry.yarnpkg.com/sharp/-/sharp-0.34.5.tgz#b6f148e4b8c61f1797bde11a9d1cfebbae2c57b0"
|
||||
integrity sha512-Ou9I5Ft9WNcCbXrU9cMgPBcCK8LiwLqcbywW3t4oDV37n1pzpuNLsYiAV8eODnjbtQlSDwZ2cUEeQz4E54Hltg==
|
||||
|
||||
Reference in New Issue
Block a user