Files
quak/src/metadata-backup.ts
sneak 21a1a78f07 ML data included by default, --exif is the opt-in, --all aliases --exif
ML data (face detections, CLIP embeddings) is now fetched by default
in backup-metadata. Use --no-ml to skip it. EXIF extraction (which
requires downloading every file) remains opt-in via --exif. --all is
an alias for --exif.
2026-06-09 17:38:15 -04:00

225 lines
7.3 KiB
TypeScript

import { gunzipSync } from "node:zlib";
import { mkdirSync, mkdtempSync, rmSync, writeFileSync } from "node:fs";
import { join } from "node:path";
import { tmpdir } from "node:os";
import sharp from "sharp";
import exifReader from "exif-reader";
import type { Client } from "./client.js";
import { decryptBlob, fromBase64 } from "./crypto/index.js";
import type { EnteFile } from "./model/types.js";
/** Callback that receives a human-readable progress message. */
export type ProgressCallback = (message: string) => void;
/** Options accepted by `runMetadataBackup`. */
export interface MetadataBackupOptions {
/** Fetch per-file ML data and embed it in each file's JSON. Treated as `true` when omitted. */
mlData?: boolean;
/** Download each file and extract its image metadata. Treated as `false` when omitted. */
exif?: boolean;
/** Receives progress messages; when omitted, messages are discarded. */
onProgress?: ProgressCallback;
}
/**
 * Turn an arbitrary display name into a string that is safe to use as a
 * single path component.
 *
 * - ASCII control characters (U+0000–U+001F) and each of
 *   `/ \ : * ? " < > |` are replaced with `_`.
 * - A leading run of dots is collapsed into a single `_`, so the result never
 *   starts with a dot (and therefore is never `.` or `..`).
 *
 * @param name The raw name to sanitize.
 * @returns The sanitized name; an empty input yields an empty string.
 */
const sanitizePath = (name: string): string =>
    name
        // Control characters are included because a NUL byte makes Node's fs
        // functions reject the whole path, and the rest (newlines, tabs, ...)
        // produce directory names that are awkward or invalid on some systems.
        .replace(/[\u0000-\u001f/\\:*?"<>|]/g, "_")
        .replace(/^\.+/, "_");
/** One entry of the `data` array returned by the `/files/data/fetch` request. */
interface RawRemoteFileData {
/** ID of the file this entry belongs to; used to look up the file's key. */
fileID: number;
/** Base64-encoded encrypted payload. */
encryptedData: string;
/** Base64-encoded header handed to `decryptBlob` together with the payload. */
decryptionHeader: string;
/** Not read anywhere in this file; presumably a server-side modification time — TODO confirm. */
updatedAt?: number;
}
/**
 * Fetch and decode the "mldata" payload for a set of files.
 *
 * The IDs are sent to `/files/data/fetch` in sequential requests of at most
 * 200 IDs each. Every returned entry is decrypted with the matching file key,
 * gunzipped and parsed as JSON.
 *
 * This is best-effort per file: an entry is silently left out when no key is
 * known for its file ID or when decrypting / decompressing / parsing fails.
 * A failing request, however, rejects the returned promise.
 *
 * @param client Client used to obtain the API client.
 * @param fileIDs IDs of the files to request ML data for.
 * @param fileKeys Decryption key for each file, keyed by file ID.
 * @returns Map from file ID to the parsed ML data of that file.
 */
const fetchMLDataForFiles = async (
    client: Client,
    fileIDs: number[],
    fileKeys: Map<number, Uint8Array>,
): Promise<Map<number, Record<string, unknown>>> => {
    const CHUNK_SIZE = 200;
    const api = client.getApiClient();
    const decoded = new Map<number, Record<string, unknown>>();

    // Decrypt -> gunzip -> JSON. Throws if any stage fails.
    const decode = (
        raw: RawRemoteFileData,
        fileKey: Uint8Array,
    ): Record<string, unknown> => {
        const plain = decryptBlob(
            fromBase64(raw.encryptedData),
            fromBase64(raw.decryptionHeader),
            fileKey,
        );
        const json = gunzipSync(Buffer.from(plain)).toString("utf-8");
        return JSON.parse(json);
    };

    let offset = 0;
    while (offset < fileIDs.length) {
        const chunk = fileIDs.slice(offset, offset + CHUNK_SIZE);
        offset += CHUNK_SIZE;

        const response = await api.postJSON<{ data: RawRemoteFileData[] }>(
            "/files/data/fetch",
            { type: "mldata", fileIDs: chunk },
        );

        for (const raw of response.data ?? []) {
            const fileKey = fileKeys.get(raw.fileID);
            if (fileKey) {
                try {
                    decoded.set(raw.fileID, decode(raw, fileKey));
                } catch {
                    // Corrupted ML data for this file; skip it
                }
            }
        }
    }

    return decoded;
};
/**
 * Download a file's original into a temporary directory and collect the image
 * metadata that `sharp` reports for it.
 *
 * The result always carries the basic properties (format, dimensions, colour
 * space, ...). Embedded blocks are added when present:
 *
 * - `exif`: parsed by `exif-reader`, or `exifRaw` (base64) if parsing fails.
 * - `iptcRaw`: base64 of the IPTC block.
 * - `xmp`: the XMP packet as text, or `xmpRaw` (base64) if it is not valid
 *   UTF-8.
 * - `iccRaw`: base64 of the ICC profile.
 *
 * The temporary directory is removed before returning, whatever the outcome.
 *
 * @param client Client used to download the file.
 * @param file The file to inspect.
 * @returns The collected metadata, or `undefined` if the download or the
 * metadata read failed (best-effort: the error is deliberately not
 * propagated).
 */
const extractExif = async (
    client: Client,
    file: EnteFile,
): Promise<Record<string, unknown> | undefined> => {
    const tmpDir = mkdtempSync(join(tmpdir(), "quak-exif-"));
    try {
        const origPath = join(tmpDir, "original");
        await client.downloadFile(file, origPath);
        const meta = await sharp(origPath).metadata();
        const result: Record<string, unknown> = {
            format: meta.format,
            width: meta.width,
            height: meta.height,
            space: meta.space,
            channels: meta.channels,
            depth: meta.depth,
            density: meta.density,
            chromaSubSampling: meta.chromaSubSampling,
            isProgressive: meta.isProgressive,
            hasProfile: meta.hasProfile,
            hasAlpha: meta.hasAlpha,
            orientation: meta.orientation,
        };
        if (meta.exif) {
            try {
                result.exif = exifReader(meta.exif);
            } catch {
                // Malformed EXIF; store the raw bytes as base64 instead
                result.exifRaw = meta.exif.toString("base64");
            }
        }
        if (meta.iptc) result.iptcRaw = meta.iptc.toString("base64");
        if (meta.xmp) {
            try {
                // Buffer#toString("utf-8") never throws on invalid input (it
                // substitutes U+FFFD), which would make the fallback below
                // unreachable and store a lossy string. A fatal decoder
                // throws instead. `ignoreBOM: true` keeps a leading BOM in
                // the output, matching what Buffer#toString produces.
                result.xmp = new TextDecoder("utf-8", {
                    fatal: true,
                    ignoreBOM: true,
                }).decode(meta.xmp);
            } catch {
                // Not valid UTF-8; keep the exact bytes as base64 instead
                result.xmpRaw = meta.xmp.toString("base64");
            }
        }
        if (meta.icc) result.iccRaw = meta.icc.toString("base64");
        return result;
    } catch {
        // Best-effort: undownloadable or unreadable originals simply yield
        // no image metadata.
        return undefined;
    } finally {
        rmSync(tmpDir, { recursive: true, force: true });
    }
};
/**
 * Write a JSON-only backup of the account's metadata to `outDir`.
 *
 * Layout produced:
 *
 * - `account.json`: the email and user ID reported by `client.whoami()`.
 * - `collections/<id>-<sanitized name>/_collection.json`: one per collection.
 * - `collections/<id>-<sanitized name>/<fileID>.json`: one per file in that
 *   collection. A file that belongs to several collections gets one JSON in
 *   each of them.
 *
 * Per-file JSON optionally carries `mlData` (on unless `opts.mlData` is
 * `false`) and `imageMetadata` (only when `opts.exif` is `true`, since it
 * requires downloading every file).
 *
 * @param client Client used for all remote calls.
 * @param outDir Directory to write into; created if missing.
 * @param opts Optional switches and progress callback.
 * @returns A promise that resolves once everything has been written. It
 * rejects if listing collections/files, fetching ML data, or a filesystem
 * write fails.
 */
export const runMetadataBackup = async (
    client: Client,
    outDir: string,
    opts?: MetadataBackupOptions,
): Promise<void> => {
    const log = opts?.onProgress ?? (() => {});
    const wantML = opts?.mlData ?? true;
    const wantExif = opts?.exif ?? false;

    mkdirSync(outDir, { recursive: true });
    mkdirSync(join(outDir, "collections"), { recursive: true });

    const { email, userID } = client.whoami();
    writeFileSync(
        join(outDir, "account.json"),
        JSON.stringify({ email, userID }, null, 2),
    );

    log("Fetching collections...");
    const collections = await client.listCollections();

    const allFiles: { file: EnteFile; colDirName: string }[] = [];
    // First key seen for each distinct file ID.
    const fileKeys = new Map<number, Uint8Array>();
    // IDs of files that were listed more than once (i.e. appear in several
    // collections); only these need their EXIF result remembered.
    const repeatedFileIDs = new Set<number>();

    for (const col of collections) {
        const dirName = `${col.id}-${sanitizePath(col.name || "unnamed")}`;
        const colDir = join(outDir, "collections", dirName);
        mkdirSync(colDir, { recursive: true });

        const collectionMeta: Record<string, unknown> = {
            id: col.id,
            name: col.name,
            type: col.type,
            ownerID: col.ownerID,
            isShared: col.isShared,
            updationTime: col.updationTime,
        };
        if (col.magicMetadata) collectionMeta.magicMetadata = col.magicMetadata;
        if (col.pubMagicMetadata)
            collectionMeta.pubMagicMetadata = col.pubMagicMetadata;
        if (col.sharedMagicMetadata)
            collectionMeta.sharedMagicMetadata = col.sharedMagicMetadata;
        writeFileSync(
            join(colDir, "_collection.json"),
            JSON.stringify(collectionMeta, null, 2),
        );

        log(`[${col.name}] Fetching files...`);
        const files = await client.listFiles(col.id, col.key);
        log(`[${col.name}] ${files.length} file(s)`);

        for (const file of files) {
            allFiles.push({ file, colDirName: dirName });
            if (fileKeys.has(file.id)) {
                repeatedFileIDs.add(file.id);
            } else {
                fileKeys.set(file.id, file.key);
            }
        }
    }

    // Fetch ML data in bulk if requested
    let mlDataMap = new Map<number, Record<string, unknown>>();
    if (wantML) {
        log("Fetching ML data (face detections, CLIP embeddings)...");
        mlDataMap = await fetchMLDataForFiles(
            client,
            [...fileKeys.keys()],
            fileKeys,
        );
        log(`Got ML data for ${mlDataMap.size} file(s)`);
    }

    // Write per-file JSON (with optional ML data and EXIF)
    //
    // Each file is downloaded for EXIF extraction at most once. The outcome
    // (including a failed extraction, stored as undefined) is remembered for
    // files that appear in several collections, so that every copy of the
    // per-file JSON carries the same imageMetadata rather than only the
    // first one written.
    const exifByFileID = new Map<
        number,
        Record<string, unknown> | undefined
    >();
    for (const { file, colDirName } of allFiles) {
        const colDir = join(outDir, "collections", colDirName);
        const fileMeta: Record<string, unknown> = {
            id: file.id,
            collectionID: file.collectionID,
            ownerID: file.ownerID,
            metadata: file.metadata,
            updationTime: file.updationTime,
        };
        if (file.magicMetadata) fileMeta.magicMetadata = file.magicMetadata;
        if (file.pubMagicMetadata)
            fileMeta.pubMagicMetadata = file.pubMagicMetadata;

        const ml = mlDataMap.get(file.id);
        if (ml) fileMeta.mlData = ml;

        if (wantExif) {
            let exifData: Record<string, unknown> | undefined;
            if (exifByFileID.has(file.id)) {
                exifData = exifByFileID.get(file.id);
            } else {
                log(`[${file.metadata.title}] Extracting EXIF...`);
                exifData = await extractExif(client, file);
                // Only retain results that will be needed again, to keep
                // memory use independent of the total number of files.
                if (repeatedFileIDs.has(file.id)) {
                    exifByFileID.set(file.id, exifData);
                }
            }
            if (exifData) fileMeta.imageMetadata = exifData;
        }

        writeFileSync(
            join(colDir, `${file.id}.json`),
            JSON.stringify(fileMeta, null, 2),
        );
    }

    log("Metadata backup complete.");
};