Add --ml and --exif flags to backup-metadata
--ml fetches face detections and CLIP embeddings from the /files/data/fetch endpoint (type 'mldata'), requesting file IDs in batches of 200. Each blob is encrypted with the file's key and gzipped; we decrypt it with decryptBlob, gunzip it, and include the parsed JSON as 'mlData' in the per-file output.

--exif downloads each file and runs sharp().metadata() to extract image properties (format, dimensions, color space, orientation), then parses the raw EXIF buffer with exif-reader for structured tags (lens, ISO, shutter, aperture, GPS altitude, etc.). It also captures the raw IPTC, XMP, and ICC profile data. The result is included as 'imageMetadata' in the per-file output.

Without either flag, behavior is unchanged (fast metadata-only dump).

Adds exif-reader 2.0.3 as a runtime dependency. Adds 3 new tests (ML data decrypted, ML data absent when flag not set, EXIF extraction); 119 tests total, all green.
This commit is contained in:
@@ -1,18 +1,127 @@
|
||||
import { mkdirSync, writeFileSync } from "node:fs";
|
||||
import { gunzipSync } from "node:zlib";
|
||||
import { mkdirSync, mkdtempSync, rmSync, writeFileSync } from "node:fs";
|
||||
import { join } from "node:path";
|
||||
import { tmpdir } from "node:os";
|
||||
import sharp from "sharp";
|
||||
import exifReader from "exif-reader";
|
||||
import type { Client } from "./client.js";
|
||||
import { decryptBlob, fromBase64 } from "./crypto/index.js";
|
||||
import type { EnteFile } from "./model/types.js";
|
||||
|
||||
/** Receives one human-readable progress message at a time. */
export type ProgressCallback = (message: string) => void;

/** Optional switches for a metadata backup run. */
export interface MetadataBackupOptions {
  /** Also fetch ML data for every file; treated as `false` when omitted. */
  mlData?: boolean;
  /** Also extract image metadata from each file; treated as `false` when omitted. */
  exif?: boolean;
  /** Progress sink; when omitted, progress messages are discarded. */
  onProgress?: ProgressCallback;
}
|
||||
|
||||
/**
 * Turn an arbitrary name into something usable as a single path component.
 *
 * Each of the characters `/ \ : * ? " < > |` is replaced by `_`, and a
 * leading run of dots is collapsed into one `_`.
 */
const sanitizePath = (name: string): string => {
  const withoutReserved = name.replace(/[/\\:*?"<>|]/g, "_");
  return withoutReserved.replace(/^\.+/, "_");
};
|
||||
|
||||
/** One entry of the `data` array returned by `/files/data/fetch`. */
interface RawRemoteFileData {
  /** ID of the file this payload belongs to; used to look up its key. */
  fileID: number;
  /** Base64-encoded ciphertext. */
  encryptedData: string;
  /** Base64-encoded header handed to `decryptBlob` alongside the ciphertext. */
  decryptionHeader: string;
  /** Optional; not read by the code in this module. */
  updatedAt?: number;
}
|
||||
|
||||
/**
 * Fetch, decrypt and decode the "mldata" payload for the given files.
 *
 * File IDs are posted to `/files/data/fetch` in batches of 200. Every entry
 * that comes back is base64-decoded, decrypted with the key registered for its
 * file ID, gunzipped and parsed as JSON. Entries with no key in `fileKeys`, and
 * entries that fail any of those decoding steps, are left out of the result.
 *
 * @param client - Provides the API client used for the requests.
 * @param fileIDs - IDs of the files to request ML data for.
 * @param fileKeys - Decryption key for each file ID.
 * @returns Map from file ID to the parsed payload.
 */
const fetchMLDataForFiles = async (
  client: Client,
  fileIDs: number[],
  fileKeys: Map<number, Uint8Array>,
): Promise<Map<number, Record<string, unknown>>> => {
  const idsPerRequest = 200;
  const http = client.getApiClient();
  const decoded = new Map<number, Record<string, unknown>>();

  let offset = 0;
  while (offset < fileIDs.length) {
    const chunk = fileIDs.slice(offset, offset + idsPerRequest);
    offset += idsPerRequest;

    const response = await http.postJSON<{ data: RawRemoteFileData[] }>(
      "/files/data/fetch",
      { type: "mldata", fileIDs: chunk },
    );

    for (const remote of response.data ?? []) {
      const fileKey = fileKeys.get(remote.fileID);
      if (fileKey) {
        try {
          const ciphertext = fromBase64(remote.encryptedData);
          const header = fromBase64(remote.decryptionHeader);
          const compressed = decryptBlob(ciphertext, header, fileKey);
          const json = gunzipSync(Buffer.from(compressed)).toString("utf-8");
          decoded.set(remote.fileID, JSON.parse(json));
        } catch {
          // Best effort: an undecodable payload only costs this one file.
        }
      }
    }
  }

  return decoded;
};
|
||||
|
||||
/**
 * Download a file and collect the image metadata embedded in it.
 *
 * The file is downloaded into a fresh temporary directory, inspected with
 * `sharp(...).metadata()`, and the directory is removed again whether or not
 * the extraction succeeded.
 *
 * The result always carries the basic image properties reported by sharp.
 * Depending on what the file contains it may also carry:
 * - `exif`: the EXIF block parsed by exif-reader, or `exifRaw` (base64) when
 *   parsing fails;
 * - `iptcRaw` / `iccRaw`: the raw IPTC and ICC blocks as base64;
 * - `xmp`: the XMP packet as text, or `xmpRaw` (base64) when the packet is
 *   not valid UTF-8.
 *
 * @param client - Used to download the file.
 * @param file - The file to inspect.
 * @returns The collected metadata, or `undefined` when the download or the
 *   metadata read fails.
 */
const extractExif = async (
  client: Client,
  file: EnteFile,
): Promise<Record<string, unknown> | undefined> => {
  const tmpDir = mkdtempSync(join(tmpdir(), "quak-exif-"));
  try {
    const origPath = join(tmpDir, "original");
    await client.downloadFile(file, origPath);
    const meta = await sharp(origPath).metadata();

    const result: Record<string, unknown> = {
      format: meta.format,
      width: meta.width,
      height: meta.height,
      space: meta.space,
      channels: meta.channels,
      depth: meta.depth,
      density: meta.density,
      chromaSubSampling: meta.chromaSubSampling,
      isProgressive: meta.isProgressive,
      hasProfile: meta.hasProfile,
      hasAlpha: meta.hasAlpha,
      orientation: meta.orientation,
    };

    if (meta.exif) {
      try {
        result.exif = exifReader(meta.exif);
      } catch {
        // Malformed EXIF; store the raw bytes as base64 instead
        result.exifRaw = meta.exif.toString("base64");
      }
    }
    if (meta.iptc) result.iptcRaw = meta.iptc.toString("base64");
    if (meta.xmp) {
      try {
        // Buffer#toString("utf-8") never throws: it silently substitutes
        // U+FFFD for invalid bytes, which made the base64 fallback below
        // unreachable and the stored text lossy. A fatal decoder throws on
        // invalid input instead; ignoreBOM keeps a leading BOM in the output,
        // so valid packets decode exactly as they did before.
        const strictUtf8 = new TextDecoder("utf-8", {
          fatal: true,
          ignoreBOM: true,
        });
        result.xmp = strictUtf8.decode(meta.xmp);
      } catch {
        // Not valid UTF-8; keep the bytes lossless as base64 instead
        result.xmpRaw = meta.xmp.toString("base64");
      }
    }
    if (meta.icc) result.iccRaw = meta.icc.toString("base64");

    return result;
  } catch {
    // Best effort: a failed download or unreadable image yields no metadata
    // for this file rather than aborting the caller.
    return undefined;
  } finally {
    rmSync(tmpDir, { recursive: true, force: true });
  }
};
|
||||
|
||||
export const runMetadataBackup = async (
|
||||
client: Client,
|
||||
outDir: string,
|
||||
onProgress?: ProgressCallback,
|
||||
opts?: MetadataBackupOptions,
|
||||
): Promise<void> => {
|
||||
const log = onProgress ?? (() => {});
|
||||
const log = opts?.onProgress ?? (() => {});
|
||||
const wantML = opts?.mlData ?? false;
|
||||
const wantExif = opts?.exif ?? false;
|
||||
|
||||
mkdirSync(outDir, { recursive: true });
|
||||
mkdirSync(join(outDir, "collections"), { recursive: true });
|
||||
@@ -26,6 +135,10 @@ export const runMetadataBackup = async (
|
||||
log("Fetching collections...");
|
||||
const collections = await client.listCollections();
|
||||
|
||||
const allFiles: { file: EnteFile; colDirName: string }[] = [];
|
||||
const fileKeys = new Map<number, Uint8Array>();
|
||||
const seenFileIDs = new Set<number>();
|
||||
|
||||
for (const col of collections) {
|
||||
const dirName = `${col.id}-${sanitizePath(col.name || "unnamed")}`;
|
||||
const colDir = join(outDir, "collections", dirName);
|
||||
@@ -55,23 +168,57 @@ export const runMetadataBackup = async (
|
||||
log(`[${col.name}] ${files.length} file(s)`);
|
||||
|
||||
for (const file of files) {
|
||||
const fileMeta: Record<string, unknown> = {
|
||||
id: file.id,
|
||||
collectionID: file.collectionID,
|
||||
ownerID: file.ownerID,
|
||||
metadata: file.metadata,
|
||||
updationTime: file.updationTime,
|
||||
};
|
||||
if (file.magicMetadata) fileMeta.magicMetadata = file.magicMetadata;
|
||||
if (file.pubMagicMetadata)
|
||||
fileMeta.pubMagicMetadata = file.pubMagicMetadata;
|
||||
|
||||
writeFileSync(
|
||||
join(colDir, `${file.id}.json`),
|
||||
JSON.stringify(fileMeta, null, 2),
|
||||
);
|
||||
allFiles.push({ file, colDirName: dirName });
|
||||
if (!seenFileIDs.has(file.id)) {
|
||||
fileKeys.set(file.id, file.key);
|
||||
seenFileIDs.add(file.id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Fetch ML data in bulk if requested
|
||||
let mlDataMap = new Map<number, Record<string, unknown>>();
|
||||
if (wantML) {
|
||||
log("Fetching ML data (face detections, CLIP embeddings)...");
|
||||
mlDataMap = await fetchMLDataForFiles(
|
||||
client,
|
||||
[...fileKeys.keys()],
|
||||
fileKeys,
|
||||
);
|
||||
log(`Got ML data for ${mlDataMap.size} file(s)`);
|
||||
}
|
||||
|
||||
// Write per-file JSON (with optional ML data and EXIF)
|
||||
const writtenFileIDs = new Set<number>();
|
||||
for (const { file, colDirName } of allFiles) {
|
||||
const colDir = join(outDir, "collections", colDirName);
|
||||
|
||||
const fileMeta: Record<string, unknown> = {
|
||||
id: file.id,
|
||||
collectionID: file.collectionID,
|
||||
ownerID: file.ownerID,
|
||||
metadata: file.metadata,
|
||||
updationTime: file.updationTime,
|
||||
};
|
||||
if (file.magicMetadata) fileMeta.magicMetadata = file.magicMetadata;
|
||||
if (file.pubMagicMetadata)
|
||||
fileMeta.pubMagicMetadata = file.pubMagicMetadata;
|
||||
|
||||
const ml = mlDataMap.get(file.id);
|
||||
if (ml) fileMeta.mlData = ml;
|
||||
|
||||
if (wantExif && !writtenFileIDs.has(file.id)) {
|
||||
log(`[${file.metadata.title}] Extracting EXIF...`);
|
||||
const exifData = await extractExif(client, file);
|
||||
if (exifData) fileMeta.imageMetadata = exifData;
|
||||
}
|
||||
writtenFileIDs.add(file.id);
|
||||
|
||||
writeFileSync(
|
||||
join(colDir, `${file.id}.json`),
|
||||
JSON.stringify(fileMeta, null, 2),
|
||||
);
|
||||
}
|
||||
|
||||
log("Metadata backup complete.");
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user