refactor: vendor phishing blocklist, delta-only in-memory updates
All checks were successful
check / check (push) Successful in 25s

Vendor the MetaMask eth-phishing-detect config.json (231k domains) into
src/data/phishing-domains.json as the baseline blocklist shipped with
the extension.

On 24h refresh, only the delta (new domains not in the vendored snapshot)
is kept in memory. Domain checks hit the in-memory delta first (fresh
scam sites), then binary-search the vendored sorted array.

If the serialized delta fits within 256 KiB it is persisted to
chrome.storage.local (or browser.storage.local) so it survives
service-worker restarts without re-fetching. Larger deltas are kept in
memory only.

Removes the previous approach of downloading and holding the full
blocklist in memory as a Set.
This commit is contained in:
clawbot 2026-03-01 07:33:10 -08:00
parent b8d81a4c8a
commit 0d06df6cbe
3 changed files with 231715 additions and 80 deletions

231430
src/data/phishing-domains.json Normal file

File diff suppressed because it is too large Load Diff

View File

@ -1,37 +1,55 @@
// Domain-based phishing detection using MetaMask's eth-phishing-detect blocklist.
// Fetches the blocklist at runtime, caches it in memory, and checks hostnames.
//
// The blocklist source:
// https://github.com/MetaMask/eth-phishing-detect (src/config.json)
// Architecture:
// 1. A vendored copy of the blocklist ships with the extension
// (src/data/phishing-domains.json — sorted blacklist for binary search).
// 2. Every 24h we fetch the latest list from MetaMask's repo and compute
// the delta (new domains not in the vendored snapshot).
// 3. Only the delta is kept in memory / persisted to chrome.storage.local.
// 4. Domain checks hit the delta first (fresh scam sites), then the
// vendored baseline via binary search.
//
// The config uses { blacklist: [...], whitelist: [...], fuzzylist: [...] }.
// We check exact hostname and parent-domain matches against the blacklist,
// with whitelist overrides.
// Source: https://github.com/MetaMask/eth-phishing-detect (src/config.json)
const vendoredConfig = require("../data/phishing-domains.json");
const BLOCKLIST_URL =
"https://raw.githubusercontent.com/MetaMask/eth-phishing-detect/main/src/config.json";
const CACHE_TTL_MS = 24 * 60 * 60 * 1000; // 24 hours
const DELTA_STORAGE_KEY = "phishing_domain_delta";
const DELTA_MAX_BYTES = 256 * 1024; // 256 KiB
let blacklistSet = new Set();
let whitelistSet = new Set();
// Vendored baseline — sorted arrays for binary search (no extra Set needed).
// NOTE(review): binarySearch compares entries with `===` and `<`, so this
// array must already be lowercase and sorted in the order JavaScript's `<`
// gives for strings. Nothing in this module verifies that — confirm it
// whenever phishing-domains.json is regenerated.
const vendoredBlacklist = vendoredConfig.blacklist; // pre-sorted lowercase
// The vendored whitelist is lowercased into a Set and queried with has().
const vendoredWhitelist = new Set(
(vendoredConfig.whitelist || []).map((d) => d.toLowerCase()),
);
// Delta state — only domains added upstream since the vendored snapshot.
let deltaBlacklistSet = new Set();
let deltaWhitelistSet = new Set();
// Millisecond timestamp: set from Date.now() after a fetch or loadConfig,
// or restored from the persisted delta. 0 means "never fetched".
let lastFetchTime = 0;
// Reset to null in updatePhishingList's finally block and in _reset;
// presumably holds the in-flight fetch promise (assignment not visible here).
let fetchPromise = null;
// True once loadPersistedDelta has completed or loadConfig has been called.
let persistedDeltaLoaded = false;
/**
* Load a pre-parsed config into the in-memory sets.
* Used for testing and for loading from cache.
* Binary search on a sorted string array.
*
* @param {{ blacklist?: string[], whitelist?: string[] }} config
* @param {string[]} sorted - Sorted array of lowercase strings.
* @param {string} target - Lowercase string to find.
* @returns {boolean}
*/
function loadConfig(config) {
blacklistSet = new Set(
(config.blacklist || []).map((d) => d.toLowerCase()),
);
whitelistSet = new Set(
(config.whitelist || []).map((d) => d.toLowerCase()),
);
lastFetchTime = Date.now();
function binarySearch(sorted, target) {
  // Closed interval [low, high] of indices that may still hold the target.
  let low = 0;
  let high = sorted.length - 1;

  while (low <= high) {
    // Unsigned shift floors the midpoint of the current interval.
    const middle = (low + high) >>> 1;
    const candidate = sorted[middle];

    if (candidate === target) {
      return true;
    }

    if (candidate < target) {
      // Target can only be to the right of the probe.
      low = middle + 1;
    } else {
      // Target can only be to the left of the probe.
      high = middle - 1;
    }
  }

  // Interval emptied without a match.
  return false;
}
/**
@ -45,7 +63,6 @@ function hostnameVariants(hostname) {
const h = hostname.toLowerCase();
const variants = [h];
const parts = h.split(".");
// Parent domains: a.b.c.d -> b.c.d, c.d
for (let i = 1; i < parts.length - 1; i++) {
variants.push(parts.slice(i).join("."));
}
@ -54,8 +71,8 @@ function hostnameVariants(hostname) {
/**
* Check if a hostname is on the phishing blocklist.
* Checks exact hostname and all parent domains.
* Whitelisted domains are never flagged.
* Checks delta (fresh additions) first, then vendored baseline.
* Whitelisted domains (vendored + delta) are never flagged.
*
* @param {string} hostname - The hostname to check.
* @returns {boolean}
@ -63,25 +80,108 @@ function hostnameVariants(hostname) {
function isPhishingDomain(hostname) {
// Falsy input (empty string, null, undefined) is never flagged.
if (!hostname) return false;
// Candidate names to test: hostnameVariants lowercases the hostname and
// adds its parent domains.
const variants = hostnameVariants(hostname);
// Whitelist takes priority
// Whitelist takes priority (both vendored and delta)
for (const v of variants) {
// NOTE(review): the two checks below look like the pre-change and
// post-change sides of a diff rendered together — confirm which is current.
if (whitelistSet.has(v)) return false;
if (vendoredWhitelist.has(v) || deltaWhitelistSet.has(v)) return false;
}
// Check delta first — fresh scam sites hit here
for (const v of variants) {
// NOTE(review): same old/new pairing as in the whitelist loop above.
if (blacklistSet.has(v)) return true;
if (deltaBlacklistSet.has(v)) return true;
}
// Check vendored baseline via binary search
for (const v of variants) {
if (binarySearch(vendoredBlacklist, v)) return true;
}
// No variant matched any blacklist.
return false;
}
/**
* Fetch the latest blocklist from the MetaMask repo.
* De-duplicates concurrent fetches. Results are cached for CACHE_TTL_MS.
* Get the storage API if available (chrome.storage.local / browser.storage.local).
*
* @returns {object|null}
*/
function getStorageApi() {
  // The `browser` namespace is consulted before `chrome`; the typeof guards
  // keep an undeclared global from throwing a ReferenceError.
  const hasBrowserStorage = typeof browser !== "undefined" && browser.storage;
  if (hasBrowserStorage) {
    return browser.storage.local;
  }

  const hasChromeStorage = typeof chrome !== "undefined" && chrome.storage;
  // Neither namespace present (e.g. unit tests): report "no storage".
  return hasChromeStorage ? chrome.storage.local : null;
}
/**
 * Load the persisted delta from extension storage (chrome.storage.local or
 * browser.storage.local) into the in-memory delta state.
 * Called on first update to restore the delta across restarts.
 *
 * The stored record is validated before use: both lists must be arrays, and
 * the stored fetch time is only adopted when it is a finite, positive
 * timestamp that is not in the future. A malformed record is ignored so the
 * next update simply re-fetches.
 *
 * @returns {Promise<void>}
 */
async function loadPersistedDelta() {
  const storage = getStorageApi();
  if (storage) {
    try {
      const result = await storage.get(DELTA_STORAGE_KEY);
      const data = result[DELTA_STORAGE_KEY];
      // Array.isArray guards against a corrupted record: `new Set("a.com")`
      // would otherwise build a set of single characters.
      if (
        data &&
        Array.isArray(data.blacklist) &&
        Array.isArray(data.whitelist)
      ) {
        deltaBlacklistSet = new Set(data.blacklist);
        deltaWhitelistSet = new Set(data.whitelist);
        const { fetchTime } = data;
        // A future or non-numeric timestamp would make the TTL check skip
        // refreshes indefinitely, so only accept plausible values.
        if (
          Number.isFinite(fetchTime) &&
          fetchTime > 0 &&
          fetchTime <= Date.now()
        ) {
          lastFetchTime = fetchTime;
        }
      }
    } catch {
      // Storage unavailable or corrupted — start fresh.
    }
  }
  // Mark as attempted even without a storage API so callers do not retry
  // the (no-op) load on every update.
  persistedDeltaLoaded = true;
}
/**
 * Persist the current delta to extension storage (chrome.storage.local or
 * browser.storage.local) if its JSON form fits in DELTA_MAX_BYTES (256 KiB).
 *
 * The limit is measured in UTF-8 bytes, not UTF-16 code units, so entries
 * containing non-ASCII characters are counted at their encoded size.
 * A delta that is too large stays in memory only, and a failed write is
 * treated as non-fatal.
 *
 * @returns {Promise<void>}
 */
async function persistDelta() {
  const storage = getStorageApi();
  if (!storage) return;

  const data = {
    blacklist: Array.from(deltaBlacklistSet),
    whitelist: Array.from(deltaWhitelistSet),
    fetchTime: lastFetchTime,
  };
  const serialized = JSON.stringify(data);
  // String.length counts UTF-16 code units; encode to get the real byte
  // size. Fall back to the character count where TextEncoder is missing.
  const byteLength =
    typeof TextEncoder !== "undefined"
      ? new TextEncoder().encode(serialized).length
      : serialized.length;
  if (byteLength > DELTA_MAX_BYTES) {
    // Delta too large to persist — keep in memory only.
    return;
  }

  try {
    await storage.set({ [DELTA_STORAGE_KEY]: data });
  } catch {
    // Storage write failed — non-fatal.
  }
}
/**
* Fetch the latest blocklist, compute delta against vendored baseline,
* and update in-memory state. De-duplicates concurrent fetches.
*
* @returns {Promise<void>}
*/
async function updatePhishingList() {
// Load persisted delta on first call
if (!persistedDeltaLoaded) {
await loadPersistedDelta();
}
// Skip if recently fetched
if (Date.now() - lastFetchTime < CACHE_TTL_MS && blacklistSet.size > 0) {
if (Date.now() - lastFetchTime < CACHE_TTL_MS) {
return;
}
@ -93,9 +193,32 @@ async function updatePhishingList() {
const resp = await fetch(BLOCKLIST_URL);
if (!resp.ok) throw new Error("HTTP " + resp.status);
const config = await resp.json();
loadConfig(config);
// Compute blacklist delta: remote items not in vendored baseline
const newDeltaBl = new Set();
for (const domain of config.blacklist || []) {
const d = domain.toLowerCase();
if (!binarySearch(vendoredBlacklist, d)) {
newDeltaBl.add(d);
}
}
// Compute whitelist delta: remote items not in vendored whitelist
const newDeltaWl = new Set();
for (const domain of config.whitelist || []) {
const d = domain.toLowerCase();
if (!vendoredWhitelist.has(d)) {
newDeltaWl.add(d);
}
}
deltaBlacklistSet = newDeltaBl;
deltaWhitelistSet = newDeltaWl;
lastFetchTime = Date.now();
await persistDelta();
} catch {
// Silently fail — we'll retry next time.
// Fetch failed — keep existing delta, retry next time.
} finally {
fetchPromise = null;
}
@ -105,22 +228,51 @@ async function updatePhishingList() {
}
/**
* Return the current blocklist size (for diagnostics).
* Load a pre-parsed config directly into state (vendored + delta combined).
* Used for testing.
*
* @param {{ blacklist?: string[], whitelist?: string[] }} config
*/
function loadConfig(config) {
  // Lowercase an optional domain list and collect it into a Set.
  const toLowerSet = (domains) =>
    new Set((domains || []).map((domain) => domain.toLowerCase()));

  // Test hook: the supplied config becomes the whole delta, replacing
  // whatever delta was held before. The vendored baseline is untouched.
  deltaBlacklistSet = toLowerSet(config.blacklist);
  deltaWhitelistSet = toLowerSet(config.whitelist);

  // Stamp the load time and mark the persisted delta as already handled.
  lastFetchTime = Date.now();
  persistedDeltaLoaded = true;
}
/**
* Return total blocklist size (vendored + delta, for diagnostics).
*
* @returns {number}
*/
function getBlocklistSize() {
// NOTE(review): two return statements appear here and the second is
// unreachable as written. They look like the pre-change and post-change
// sides of a diff rendered together — confirm which one is current.
return blacklistSet.size;
// Vendored entry count plus the number of delta entries.
return vendoredBlacklist.length + deltaBlacklistSet.size;
}
/**
 * Report how many blacklist entries the in-memory delta currently holds
 * (for diagnostics). Vendored baseline entries are not counted.
 *
 * @returns {number}
 */
function getDeltaSize() {
  const { size } = deltaBlacklistSet;
  return size;
}
/**
* Reset internal state (for testing).
*/
function _reset() {
// NOTE(review): the blacklistSet/whitelistSet resets look like the
// pre-change side of a diff rendered next to the new lines — confirm
// whether they are still part of the current file.
blacklistSet = new Set();
whitelistSet = new Set();
// Clear the delta only; the vendored baseline is a const and is untouched.
deltaBlacklistSet = new Set();
deltaWhitelistSet = new Set();
// Zero the timestamp so the TTL check in updatePhishingList does not skip.
lastFetchTime = 0;
fetchPromise = null;
// Make the next updatePhishingList call run loadPersistedDelta again.
persistedDeltaLoaded = false;
}
module.exports = {
@ -128,6 +280,8 @@ module.exports = {
updatePhishingList,
loadConfig,
getBlocklistSize,
getDeltaSize,
hostnameVariants,
binarySearch,
_reset,
};

View File

@ -2,11 +2,14 @@ const {
isPhishingDomain,
loadConfig,
getBlocklistSize,
getDeltaSize,
hostnameVariants,
binarySearch,
_reset,
} = require("../src/shared/phishingDomains");
// Reset state before each test to avoid cross-test contamination.
// The vendored baseline is loaded automatically via require().
// _reset() clears only the delta state, not the vendored baseline.
beforeEach(() => {
_reset();
});
@ -39,8 +42,54 @@ describe("phishingDomains", () => {
});
});
describe("loadConfig + isPhishingDomain", () => {
test("detects exact blacklisted domain", () => {
describe("binarySearch", () => {
  // Fixture is in ascending order, which binarySearch relies on.
  const sorted = ["alpha.com", "beta.com", "gamma.com", "zeta.com"];

  test("finds existing elements", () => {
    // First, interior and last positions.
    for (const present of ["alpha.com", "gamma.com", "zeta.com"]) {
      expect(binarySearch(sorted, present)).toBe(true);
    }
  });

  test("returns false for missing elements", () => {
    // Before the first entry, between entries, and after the last entry.
    for (const absent of ["aaa.com", "delta.com", "zzz.com"]) {
      expect(binarySearch(sorted, absent)).toBe(false);
    }
  });

  test("handles empty array", () => {
    const found = binarySearch([], "anything");
    expect(found).toBe(false);
  });

  test("handles single-element array", () => {
    const single = ["only.com"];
    expect(binarySearch(single, "only.com")).toBe(true);
    expect(binarySearch(single, "other.com")).toBe(false);
  });
});
describe("vendored baseline detection", () => {
  // No loadConfig call in this group: only the vendored
  // phishing-domains.json baseline is consulted, with an empty delta.
  test("getBlocklistSize reflects vendored list (no delta)", () => {
    // The vendored list holds 231k+ domains, so the total must clear 200k
    // while the delta contributes nothing.
    const total = getBlocklistSize();
    expect(total).toBeGreaterThan(200000);
    const delta = getDeltaSize();
    expect(delta).toBe(0);
  });

  test("returns false for clean domains against vendored list", () => {
    for (const clean of ["google.com", "github.com"]) {
      expect(isPhishingDomain(clean)).toBe(false);
    }
  });

  test("returns false for empty/null hostname", () => {
    for (const missing of ["", null]) {
      expect(isPhishingDomain(missing)).toBe(false);
    }
  });
});
describe("delta (loadConfig) + isPhishingDomain", () => {
test("detects domains loaded into delta via loadConfig", () => {
loadConfig({
blacklist: ["evil-phishing.com", "scam-swap.xyz"],
whitelist: [],
@ -49,16 +98,7 @@ describe("phishingDomains", () => {
expect(isPhishingDomain("scam-swap.xyz")).toBe(true);
});
test("returns false for clean domains", () => {
loadConfig({
blacklist: ["evil-phishing.com"],
whitelist: [],
});
expect(isPhishingDomain("etherscan.io")).toBe(false);
expect(isPhishingDomain("uniswap.org")).toBe(false);
});
test("detects subdomain of blacklisted domain", () => {
test("detects subdomain of delta-blacklisted domain", () => {
loadConfig({
blacklist: ["evil-phishing.com"],
whitelist: [],
@ -67,7 +107,7 @@ describe("phishingDomains", () => {
expect(isPhishingDomain("sub.app.evil-phishing.com")).toBe(true);
});
test("whitelist overrides blacklist", () => {
test("delta whitelist overrides delta blacklist", () => {
loadConfig({
blacklist: ["metamask.io"],
whitelist: ["metamask.io"],
@ -75,7 +115,7 @@ describe("phishingDomains", () => {
expect(isPhishingDomain("metamask.io")).toBe(false);
});
test("whitelist on parent domain overrides blacklist", () => {
test("delta whitelist on parent domain overrides blacklist", () => {
loadConfig({
blacklist: ["sub.legit.com"],
whitelist: ["legit.com"],
@ -83,7 +123,7 @@ describe("phishingDomains", () => {
expect(isPhishingDomain("sub.legit.com")).toBe(false);
});
test("case-insensitive matching", () => {
test("case-insensitive matching in delta", () => {
loadConfig({
blacklist: ["Evil-Phishing.COM"],
whitelist: [],
@ -92,30 +132,15 @@ describe("phishingDomains", () => {
expect(isPhishingDomain("EVIL-PHISHING.COM")).toBe(true);
});
test("returns false for empty/null hostname", () => {
loadConfig({
blacklist: ["evil.com"],
whitelist: [],
});
expect(isPhishingDomain("")).toBe(false);
expect(isPhishingDomain(null)).toBe(false);
});
test("getBlocklistSize reflects loaded config", () => {
test("getDeltaSize reflects loaded delta", () => {
loadConfig({
blacklist: ["a.com", "b.com", "c.com"],
whitelist: ["d.com"],
});
expect(getBlocklistSize()).toBe(3);
expect(getDeltaSize()).toBe(3);
});
test("handles config with no blacklist/whitelist keys", () => {
loadConfig({});
expect(isPhishingDomain("anything.com")).toBe(false);
expect(getBlocklistSize()).toBe(0);
});
test("re-loading config replaces previous data", () => {
test("re-loading config replaces previous delta", () => {
loadConfig({
blacklist: ["old-scam.com"],
whitelist: [],
@ -129,10 +154,15 @@ describe("phishingDomains", () => {
expect(isPhishingDomain("old-scam.com")).toBe(false);
expect(isPhishingDomain("new-scam.com")).toBe(true);
});
test("handles config with no blacklist/whitelist keys", () => {
loadConfig({});
expect(getDeltaSize()).toBe(0);
});
});
describe("real-world MetaMask blocklist patterns", () => {
test("detects known phishing domains from MetaMask list", () => {
describe("real-world MetaMask blocklist patterns (via delta)", () => {
test("detects known phishing domains loaded as delta", () => {
loadConfig({
blacklist: [
"uniswap-trade.web.app",
@ -146,21 +176,42 @@ describe("phishingDomains", () => {
expect(isPhishingDomain("blast-pools.pages.dev")).toBe(true);
});
test("does not flag legitimate domains whitelisted by MetaMask", () => {
test("delta whitelist overrides vendored blacklist entries", () => {
// If a domain is in the vendored blacklist but a fresh whitelist
// update adds it, the whitelist should win.
loadConfig({
blacklist: ["opensea.pro"],
whitelist: [
"opensea.io",
"metamask.io",
"etherscan.io",
"opensea.pro",
],
blacklist: [],
whitelist: ["opensea.io", "metamask.io", "etherscan.io"],
});
expect(isPhishingDomain("opensea.io")).toBe(false);
expect(isPhishingDomain("metamask.io")).toBe(false);
expect(isPhishingDomain("etherscan.io")).toBe(false);
// opensea.pro is both blacklisted and whitelisted — whitelist wins
expect(isPhishingDomain("opensea.pro")).toBe(false);
});
});
describe("delta + vendored interaction", () => {
  test("delta blacklist entries are found even with empty vendored match", () => {
    // A timestamped name that is (almost certainly) absent from the
    // vendored list, so only the delta can make it match.
    const uniqueDomain = `test-unique-domain-not-in-vendored-${Date.now()}.example.com`;

    // Clean before it is loaded into the delta...
    expect(isPhishingDomain(uniqueDomain)).toBe(false);

    loadConfig({ blacklist: [uniqueDomain], whitelist: [] });

    // ...and flagged afterwards.
    expect(isPhishingDomain(uniqueDomain)).toBe(true);
  });

  test("getBlocklistSize includes both vendored and delta", () => {
    // Size measured with an empty delta serves as the reference point.
    const baseSize = getBlocklistSize();
    const added = ["new-a.com", "new-b.com"];

    loadConfig({ blacklist: added, whitelist: [] });

    expect(getBlocklistSize()).toBe(baseSize + 2);
  });
});
});