migrate article embeddings to support multi-model architecture and enhance data integrity

This commit is contained in:
ImBenji 2026-04-19 00:28:15 +01:00
parent a10c5eb39f
commit b4b2fe2ac7
4 changed files with 686 additions and 104 deletions

View file

@ -1,14 +1,37 @@
const { extractFromHtml } = require('@extractus/article-extractor');
const sharp = require('sharp');
const db = require('./db');
const config = require('./config');
const { generateAndStoreEmbedding } = require('./embeddings');
const { fetchWithPolicy } = require('./http');
const { getSharedBrowserSession } = require('./sources/browserCrawler');
const { extractFromHtml } = require("@extractus/article-extractor");
const sharp = require("sharp");
const db = require("./db");
const config = require("./config");
const { generateAndStoreEmbedding } = require("./embeddings");
const { fetchWithPolicy } = require("./http");
const { getSharedBrowserSession } = require("./sources/browserCrawler");
const { validateExtractedArticle } = require("./contentValidation");
const {
getEffectivePolicy,
recordPlainSuccess,
recordPlainFailure,
recordBrowserSuccess,
recordBrowserFailure,
} = require("./domainPolicy");
// upper bound on how much of a plain-fetch response body is kept (string
// length); fetchPlainHtml slices the body to this many characters
const MAX_PLAIN_HTML_LENGTH = 1_500_000;
// timeout handed to fetchWithPolicy for plain fetches — presumably
// milliseconds, confirm against ./http
const PLAIN_FETCH_TIMEOUT = 12000;
// timeout handed to the shared browser session and to fetchRenderedHtml —
// presumably milliseconds, confirm against ./sources/browserCrawler
const BROWSER_FETCH_TIMEOUT = 20000;
// retry windows for failures that look transient (validation rejected the
// page, fetch timed out). genuinely terminal failures (404, dead url) get
// a hard cap on attempt count instead
const VALIDATION_RETRY_AFTER_MS = 24 * 60 * 60 * 1000;
const TRANSIENT_RETRY_AFTER_MS = 6 * 60 * 60 * 1000;
// once an article reaches this many recorded attempts it is marked failed
// instead of being re-queued
const MAX_TERMINAL_ATTEMPTS = 3;
const updateArticleAssets = db.prepare(`
UPDATE articles
SET content = ?, image = ?, content_status = 'ready', content_error = NULL, content_attempted_at = ?
SET content = ?, image = ?, content_status = 'ready', content_error = NULL,
content_attempted_at = ?, content_attempt_count = content_attempt_count + 1,
content_retry_after = NULL
WHERE id = ?
`);
const updateArticleTitleDescription = db.prepare(`
@ -18,19 +41,25 @@ const updateArticleTitleDescription = db.prepare(`
`);
const markContentSkipped = db.prepare(`
UPDATE articles
SET content_status = 'skipped', content_error = ?, content_attempted_at = ?
SET content_status = 'skipped', content_error = ?, content_attempted_at = ?,
content_attempt_count = content_attempt_count + 1, content_retry_after = NULL
WHERE id = ?
`);
const markContentFailed = db.prepare(`
UPDATE articles
SET content_status = 'failed', content_error = ?, content_attempted_at = ?
SET content_status = 'failed', content_error = ?, content_attempted_at = ?,
content_attempt_count = content_attempt_count + 1, content_retry_after = NULL
WHERE id = ?
`);
const markContentPending = db.prepare(`
UPDATE articles
SET content_status = NULL, content_error = NULL, content_attempted_at = ?
SET content_status = 'pending', content_error = ?, content_attempted_at = ?,
content_attempt_count = content_attempt_count + 1, content_retry_after = ?
WHERE id = ?
`);
// round-robin pull of articles needing content. respects content_retry_after so
// a freshly-rejected article doesnt get retried in the next loop iteration
const selectRoundRobinArticlesMissingContent = db.prepare(`
SELECT id, url, title, description
FROM (
@ -39,21 +68,18 @@ const selectRoundRobinArticlesMissingContent = db.prepare(`
FROM articles
WHERE (content IS NULL OR TRIM(content) = '')
AND (content_status IS NULL OR content_status = 'pending')
AND (content_retry_after IS NULL OR content_retry_after <= datetime('now'))
)
WHERE rn <= ?
ORDER BY rn, source
`);
const loggedBlockedDomains = new Set();
let contentBackfillRunning = false;
// reads the number of content-fetch attempts recorded so far for one article
// (bound parameter: article id); the column is exposed as `attempts`
const selectAttemptCount = db.prepare(`
SELECT content_attempt_count AS attempts FROM articles WHERE id = ?
`);
function getHostname(url) {
try {
return new URL(url).hostname.toLowerCase();
} catch {
return '';
}
}
let contentBackfillRunning = false;
function getErrorStatus(error) {
@ -61,38 +87,28 @@ function getErrorStatus(error) {
return error.status;
}
const match = String(error && error.message || '').match(/\b(401|403|404|408|429|5\d\d)\b/);
const match = String((error && error.message) || "").match(/\b(401|403|404|408|429|5\d\d)\b/);
return match ? Number(match[1]) : null;
}
function getErrorMessage(error, fallback) {
const message = String(error && error.message || fallback || '').trim();
const message = String((error && error.message) || fallback || "").trim();
return message ? message.slice(0, 500) : null;
}
function markArticleStatus(statement, id, message) {
const attemptedAt = new Date().toISOString();
const parameterCount = statement.source.split('?').length - 1;
if (parameterCount === 3) {
statement.run(message, attemptedAt, id);
return;
/**
 * Current instant as an ISO-8601 UTC string ("YYYY-MM-DDTHH:MM:SS.sssZ").
 *
 * @returns {string} timestamp for "right now"
 */
function nowIso() {
  const currentInstant = new Date(Date.now());
  return currentInstant.toISOString();
}
if (parameterCount === 2) {
statement.run(attemptedAt, id);
return;
/**
 * Timestamp `ms` milliseconds from now, in UTC, formatted the way SQLite's
 * datetime('now') formats timestamps: "YYYY-MM-DD HH:MM:SS".
 *
 * The value is stored in `content_retry_after`, which the pending-content
 * queries compare as text against datetime('now'). A raw toISOString() value
 * ("YYYY-MM-DDTHH:MM:SS.sssZ") never compares as due on its own calendar day,
 * because 'T' sorts after ' ' — every retry would be pushed to the next UTC
 * midnight. Emitting the same fixed-width layout keeps the comparison exact.
 *
 * NOTE(review): any JS code that parses this column with `new Date(...)`
 * should treat the value as UTC — none is visible in this file, confirm.
 *
 * @param {number} ms - offset from now in milliseconds (may be negative)
 * @returns {string} UTC timestamp, second precision, "YYYY-MM-DD HH:MM:SS"
 */
function futureIso(ms) {
  const iso = new Date(Date.now() + ms).toISOString();
  // drop ".sssZ" and swap the "T" separator for the space SQLite uses
  return iso.slice(0, 19).replace("T", " ");
}
throw new Error(`Unexpected content status statement parameter count: ${parameterCount}`);
}
async function fetchCompressedImage(url) {
const response = await fetchWithPolicy(url, {
retries: 1,
headers: {
Accept: 'image/*',
},
headers: { Accept: "image/*" },
});
if (!response.ok) {
@ -101,45 +117,226 @@ async function fetchCompressedImage(url) {
throw error;
}
const contentType = String(response.headers.get('content-type') || '').toLowerCase();
if (!contentType.startsWith('image/')) {
throw new Error(`image request returned ${contentType || 'unknown content-type'}`);
const contentType = String(response.headers.get("content-type") || "").toLowerCase();
if (!contentType.startsWith("image/")) {
throw new Error(`image request returned ${contentType || "unknown content-type"}`);
}
const input = Buffer.from(await response.arrayBuffer());
if (input.length === 0) {
throw new Error('image request returned an empty body');
throw new Error("image request returned an empty body");
}
const output = await sharp(input)
.rotate()
.resize({ width: 320, height: 320, fit: 'inside', withoutEnlargement: true })
.resize({ width: 320, height: 320, fit: "inside", withoutEnlargement: true })
.webp({ quality: 25 })
.toBuffer();
return output.toString('base64');
return output.toString("base64");
}
async function fetchAndStoreContent(id, url, storedTitle, storedDescription) {
try {
// plain http fetch — no js execution. fast, low memory, but fails on
// js-rendered sites and gets blocked by cloudflare more often
/**
 * Fetches a page over plain HTTP (via fetchWithPolicy) and returns its body.
 *
 * @param {string} url - page to fetch
 * @returns {Promise<{html: string, finalUrl: string}>} body truncated to
 *   MAX_PLAIN_HTML_LENGTH characters, plus the response url (or `url` when
 *   the response does not report one)
 * @throws {Error} when the response is not ok (error carries `.status`) or
 *   when a content-type is present and mentions neither "html" nor "xml"
 */
async function fetchPlainHtml(url) {
  const requestOptions = { timeout: PLAIN_FETCH_TIMEOUT, retries: 1 };
  const response = await fetchWithPolicy(url, requestOptions);

  if (!response.ok) {
    throw Object.assign(new Error(`plain fetch returned ${response.status}`), {
      status: response.status,
    });
  }

  const contentType = String(response.headers.get("content-type") || "").toLowerCase();
  // a missing content-type is tolerated; only an explicit non-markup type is rejected
  const looksLikeMarkup = contentType.includes("html") || contentType.includes("xml");
  if (contentType && !looksLikeMarkup) {
    throw new Error(`plain fetch returned non-html content-type: ${contentType}`);
  }

  const body = await response.text();
  const html = body.slice(0, MAX_PLAIN_HTML_LENGTH);
  const finalUrl = response.url || url;
  return { html, finalUrl };
}
async function fetchBrowserHtml(url) {
const maxConcurrentPages = Number(config.browser?.maxConcurrentPages) || 25;
const browserSession = await getSharedBrowserSession({ requestTimeout: 20000, maxConcurrentPages });
const html = await browserSession.fetchRenderedHtml(url, { timeout: 20000 });
const article = await extractFromHtml(html, url);
if (!article) {
markArticleStatus(markContentSkipped, id, 'extractor returned no article');
const session = await getSharedBrowserSession({
requestTimeout: BROWSER_FETCH_TIMEOUT,
maxConcurrentPages,
});
const html = await session.fetchRenderedHtml(url, { timeout: BROWSER_FETCH_TIMEOUT });
return { html, finalUrl: url };
}
/**
 * Reduces extracted article markup to plain text: tags become spaces and
 * whitespace runs collapse to a single space.
 *
 * @param {*} value - extracted content; anything that is not a string yields null
 * @returns {string|null} the plain text, or null when nothing is left
 */
function stripHtmlContent(value) {
  if (typeof value === "string") {
    const withoutTags = value.replace(/<[^>]+>/g, " ");
    const collapsed = withoutTags.replace(/\s+/g, " ").trim();
    return collapsed === "" ? null : collapsed;
  }
  return null;
}
// runs fetch → extract → validate. returns { ok, article, html, finalUrl, reason }
// where article has been post-processed (content stripped of html). on failure,
// reason explains what tripped — used both for logging and for the per-domain
// policy update
/**
 * Runs one fetch → extract → validate pass with the supplied fetcher.
 *
 * @param {string} url - article url
 * @param {(url: string) => Promise<{html: string, finalUrl: string}>} fetcher
 *   - page loader to use for this attempt
 * @returns {Promise<object>} on success `{ ok: true, article, html, finalUrl }`
 *   with `article.content` stripped of html; on failure `{ ok: false, reason, ... }`
 *   where fetch errors also carry `error` and validation failures carry
 *   `retryable`, `html` and `finalUrl`
 */
async function attemptFetch(url, fetcher) {
  const page = {};
  try {
    const result = await fetcher(url);
    page.html = result.html;
    page.finalUrl = result.finalUrl;
  } catch (error) {
    return { ok: false, reason: `fetch-error:${error.message || "unknown"}`, error };
  }

  const { html, finalUrl } = page;
  if (!html) {
    return { ok: false, reason: "empty-html" };
  }

  let rawArticle;
  try {
    rawArticle = await extractFromHtml(html, finalUrl || url);
  } catch (error) {
    return { ok: false, reason: `extractor-error:${error.message || "unknown"}` };
  }

  // a falsy extractor result is passed through untouched so the validator can report it
  const article = rawArticle
    ? { ...rawArticle, content: stripHtmlContent(rawArticle.content) }
    : rawArticle;

  const verdict = validateExtractedArticle({ article, html, finalUrl });
  if (verdict.ok) {
    return { ok: true, article, html, finalUrl };
  }
  return { ok: false, reason: verdict.reason, retryable: verdict.retryable, html, finalUrl };
}
/**
 * Number of content-fetch attempts already recorded for an article.
 *
 * @param {*} id - article id
 * @returns {number} stored attempt count, or 0 when the row or value is missing
 */
function getAttemptCount(id) {
  const record = selectAttemptCount.get(id);
  if (!record) {
    return 0;
  }
  return record.attempts || 0;
}
async function fetchAndStoreContent(id, url, storedTitle, storedDescription) {
const policy = getEffectivePolicy(url);
// domains we know are blocked — skip the fetch entirely until ttl expires.
// the row stays pending so it'll get picked up after the policy resets
if (policy.policy === "blocked") {
markContentPending.run(
`domain blocked by policy`,
nowIso(),
futureIso(TRANSIENT_RETRY_AFTER_MS),
id
);
return;
}
const content = typeof article.content === 'string'
? article.content.replace(/<[^>]+>/g, ' ').replace(/\s+/g, ' ').trim() || null
: null;
const tryPlainFirst = policy.policy === "auto" || policy.policy === "plain_only";
let plainResult = null;
let browserResult = null;
// if stored title looks like a raw URL, try to replace with scraped title
if (tryPlainFirst) {
plainResult = await attemptFetch(url, fetchPlainHtml);
if (plainResult.ok) {
recordPlainSuccess(url);
await commitArticle(id, url, plainResult, storedTitle, storedDescription);
return;
}
recordPlainFailure(url);
// hard 4xx (other than 408/429) on plain — domain might serve the same to
// browser, but try anyway since it's cheap once the policy hasnt flipped yet.
// 408/429/5xx defer for retry
const status = plainResult.error && getErrorStatus(plainResult.error);
if (status === 408 || status === 429 || (status && status >= 500)) {
markContentPending.run(
`plain ${status}`,
nowIso(),
futureIso(TRANSIENT_RETRY_AFTER_MS),
id
);
return;
}
}
// policy.policy === "plain_only" means we just tried plain and failed —
// dont escalate to browser, the operator (or earlier domain memory) said no
if (policy.policy === "plain_only") {
recordValidationFailure(id, plainResult);
return;
}
browserResult = await attemptFetch(url, fetchBrowserHtml);
if (browserResult.ok) {
recordBrowserSuccess(url);
await commitArticle(id, url, browserResult, storedTitle, storedDescription);
return;
}
recordBrowserFailure(url);
const browserStatus = browserResult.error && getErrorStatus(browserResult.error);
if (browserStatus === 408 || browserStatus === 429 || (browserStatus && browserStatus >= 500)) {
markContentPending.run(
`browser ${browserStatus}`,
nowIso(),
futureIso(TRANSIENT_RETRY_AFTER_MS),
id
);
return;
}
// both paths exhausted (or browser-only path failed). decide between
// pending-with-retry and terminal failed based on attempt count and
// whether the validator thought it was retryable
recordValidationFailure(id, browserResult);
}
/**
 * Persists the outcome of a failed attempt: the article is either marked
 * failed (terminal) or re-queued as pending with a retry time.
 *
 * @param {*} id - article id
 * @param {{reason?: string, retryable?: boolean}|null|undefined} result -
 *   failed attempt result; a missing `retryable` counts as retryable
 */
function recordValidationFailure(id, result) {
  const reason = result?.reason || "unknown";
  const explicitlyTerminal = result?.retryable === false;
  const priorAttempts = getAttemptCount(id);
  // this attempt is counted on top of the ones already stored
  const attemptsExhausted = priorAttempts + 1 >= MAX_TERMINAL_ATTEMPTS;

  if (explicitlyTerminal || attemptsExhausted) {
    markContentFailed.run(reason, nowIso(), id);
  } else {
    markContentPending.run(reason, nowIso(), futureIso(VALIDATION_RETRY_AFTER_MS), id);
  }
}
async function commitArticle(id, url, result, storedTitle, storedDescription) {
const { article, finalUrl } = result;
const content = article.content || null;
// if stored title looks like a raw url, replace with extracted one
const titleLooksLikeUrl = storedTitle && /^https?:\/\//i.test(storedTitle.trim());
if (titleLooksLikeUrl) {
const scrapedTitle = typeof article.title === 'string' ? article.title.trim() : null;
const scrapedDescription = typeof article.description === 'string' ? article.description.trim() : null;
const scrapedTitle = typeof article.title === "string" ? article.title.trim() : null;
const scrapedDescription = typeof article.description === "string" ? article.description.trim() : null;
if (scrapedTitle) {
updateArticleTitleDescription.run(scrapedTitle, scrapedDescription || storedDescription || null, id);
}
@ -154,36 +351,20 @@ async function fetchAndStoreContent(id, url, storedTitle, storedDescription) {
if (status === 401 || status === 403 || status === 404 || status === 429) {
console.warn(`image fetch skipped for ${url}: upstream returned ${status}`);
} else {
console.error(`image fetch failed for ${url}:`, error);
console.error(`image fetch failed for ${url}:`, error.message || error);
}
}
}
if (!content && !image) {
markArticleStatus(markContentSkipped, id, 'article had no extractable content or image');
return;
}
updateArticleAssets.run(content, image, nowIso(), id);
updateArticleAssets.run(content, image, new Date().toISOString(), id);
try {
await generateAndStoreEmbedding(id);
} catch (error) {
const status = getErrorStatus(error);
if (status === 401 || status === 403 || status === 404) {
console.warn(`content fetch skipped for ${url}: upstream returned ${status}`);
markArticleStatus(markContentSkipped, id, `upstream returned ${status}`);
return;
console.error(`embedding failed for article ${id}:`, error.message || error);
}
}
if (status === 408 || status === 429 || (status && status >= 500)) {
console.warn(`content fetch deferred for ${url}: upstream returned ${status}`);
markArticleStatus(markContentPending, id, null);
return;
}
markArticleStatus(markContentFailed, id, getErrorMessage(error, 'content fetch failed'));
console.error(`content fetch failed for ${url}:`, error);
}
}
async function backfillMissingContent(perSource = 50, concurrency = 5) {
if (contentBackfillRunning) {
@ -204,15 +385,18 @@ async function backfillMissingContent(perSource = 50, concurrency = 5) {
}
}
/**
 * Reports whether at least one article is still eligible for a content
 * fetch: content is NULL or blank, status is NULL or 'pending', and
 * content_retry_after is either unset or not later than datetime('now').
 *
 * @returns {boolean} true when such a row exists
 */
function hasPendingContent() {
  const probe = db.prepare(`
SELECT 1 FROM articles
WHERE (content IS NULL OR TRIM(content) = '')
AND (content_status IS NULL OR content_status = 'pending')
AND (content_retry_after IS NULL OR content_retry_after <= datetime('now'))
LIMIT 1
`);
  const firstMatch = probe.get();
  return Boolean(firstMatch);
}
module.exports = {
fetchAndStoreContent,
backfillMissingContent,

199
src/contentValidation.js Normal file
View file

@ -0,0 +1,199 @@
// validates whether an extracted article is real content vs a soft-error page
// (cookie wall, cloudflare challenge, paywall, "enable javascript", etc).
//
// the rules are deliberately conservative. we'd rather let a few junk pages
// through (caught downstream when re-checked) than reject 5% of real articles.
// fingerprints are anchored to the title or the first BODY_SNIFF_LENGTH (800)
// chars of body so an article that merely *mentions* cloudflare doesn't get
// falsely rejected.
// extracted content shorter than this many characters (after trimming) is
// rejected as "content-too-short"
const MIN_CONTENT_LENGTH = 400;
// content with fewer sentence terminators (. ! ?) than this is rejected as
// "too-few-sentences"
const MIN_SENTENCE_TERMINATORS = 3;
// how many leading characters of the normalized content are searched for
// BODY_PREFIX_BLOCKLIST phrases
const BODY_SNIFF_LENGTH = 800;
// titles that are expected to appear only on error/challenge pages.
// matching is case-insensitive (the title is trimmed and lowercased first) and
// anchored at the start: a title is blocked when it equals an entry, or starts
// with an entry followed by a space or a "|". it is never a substring match,
// so a real article titled "404 reasons your startup failed" is not rejected.
// entries must be lowercase.
// NOTE(review): the "entry + space" rule also matches real headlines that
// begin with one of these phrases (e.g. "access denied to ...") — confirm this
// is acceptable
const TITLE_BLOCKLIST = [
"just a moment",
"just a moment...",
"attention required! | cloudflare",
"attention required!",
"access denied",
"access to this page has been denied",
"you have been blocked",
"are you a robot",
"are you a robot?",
"verify you are human",
"please verify you are a human",
"page not found",
"404 not found",
"404 page not found",
"403 forbidden",
"503 service unavailable",
"this page isn't available",
"this page isnt available",
"site temporarily unavailable",
"request unsuccessful",
];
// substrings to look for in the raw html head/early body that indicate a
// cloudflare/akamai/imperva interstitial. these are infrastructure markers
// the real site never serves.
// hasChallengeMarker searches only the first 50000 characters of the html,
// lowercased, so every entry here must be lowercase
const CHALLENGE_MARKERS = [
"cf-chl-bypass",
"__cf_chl_",
"cf_chl_opt",
"/cdn-cgi/challenge-platform",
"_incapsula_resource",
"incap_ses_",
"x-iinfo",
"akamai-bm-telemetry",
"ak_bmsc",
"distil_r_captcha",
];
// phrases near the start of the extracted body text that mean we got a stub.
// the search is a substring match limited to the first BODY_SNIFF_LENGTH (800)
// characters of the trimmed, lowercased content, so articles that discuss
// these topics later in the body are not flagged. entries must be lowercase
const BODY_PREFIX_BLOCKLIST = [
"you need to enable javascript",
"please enable javascript",
"javascript is required",
"please enable cookies",
"cookies must be enabled",
"your browser will redirect",
"checking your browser before",
"this site requires javascript",
"please make sure your browser supports",
];
// final-url path fragments that indicate the request was redirected to a
// generic error/login page. a lowercased pathname matches when it equals a
// hint, starts with "<hint>/", or ends with the hint. we only check the
// pathname so query strings dont throw it off
const ERROR_PATH_HINTS = [
"/404",
"/403",
"/error",
"/errors",
"/blocked",
"/captcha",
"/challenge",
"/access-denied",
"/account/login",
"/sign-in",
"/signin",
"/subscribe",
"/subscription",
];
/**
 * Canonical form used for blocklist comparisons: stringified, trimmed and
 * lowercased. Falsy input (null, undefined, "", 0) becomes "".
 *
 * @param {*} value - text to normalize
 * @returns {string} normalized text
 */
function normalizeForMatch(value) {
  const text = String(value ? value : "");
  const trimmed = text.trim();
  return trimmed.toLowerCase();
}
/**
 * Counts sentence endings in a text: a ".", "!" or "?" that is followed by
 * whitespace or sits at the very end. Decimals such as 3.14 are not counted.
 *
 * @param {*} text - text to scan; falsy input counts as empty
 * @returns {number} number of sentence terminators found
 */
function countSentenceTerminators(text) {
  const subject = String(text || "");
  const terminators = [...subject.matchAll(/[.!?](?:\s|$)/g)];
  return terminators.length;
}
/**
 * Tells whether the final url landed on a generic error/login style path.
 * Only the lowercased pathname is inspected; it matches when it equals a hint
 * from ERROR_PATH_HINTS, starts with "<hint>/", or ends with the hint.
 *
 * @param {string} finalUrl - url after redirects
 * @returns {boolean} true on a match; false for empty or unparseable urls
 */
function hasErrorPath(finalUrl) {
  if (!finalUrl) {
    return false;
  }
  try {
    const { pathname } = new URL(finalUrl);
    const path = pathname.toLowerCase();
    for (const hint of ERROR_PATH_HINTS) {
      const isExact = path === hint;
      const isUnderHint = path.startsWith(`${hint}/`);
      const endsWithHint = path.endsWith(hint);
      if (isExact || isUnderHint || endsWithHint) {
        return true;
      }
    }
    return false;
  } catch {
    return false;
  }
}
/**
 * Looks for a bot-challenge marker from CHALLENGE_MARKERS near the top of the
 * raw html.
 *
 * @param {*} html - raw page html
 * @returns {string|null} the first marker (in list order) that occurs, or
 *   null when none does or the html is empty
 */
function hasChallengeMarker(html) {
  if (!html) {
    return null;
  }
  // only the first 50000 characters are searched instead of a full 1.5mb
  // document — the search window is deliberately capped
  const searchWindow = String(html).slice(0, 50000).toLowerCase();
  const found = CHALLENGE_MARKERS.find((marker) => searchWindow.includes(marker));
  return found ?? null;
}
/**
 * Checks a title against TITLE_BLOCKLIST. After normalization, a title is
 * blocked when it equals an entry or starts with an entry followed by a
 * space or "|".
 *
 * @param {*} title - extracted article title
 * @returns {string|null} the first matching blocklist entry, or null
 */
function titleIsBlocked(title) {
  const candidate = normalizeForMatch(title);
  if (candidate === "") {
    return null;
  }
  const matchesEntry = (entry) =>
    candidate === entry ||
    candidate.startsWith(`${entry} `) ||
    candidate.startsWith(`${entry}|`);
  const hit = TITLE_BLOCKLIST.find(matchesEntry);
  return hit ?? null;
}
/**
 * Checks the opening of the extracted body for a stub-page phrase from
 * BODY_PREFIX_BLOCKLIST. Only the first BODY_SNIFF_LENGTH characters of the
 * normalized content are searched.
 *
 * @param {*} content - extracted article text
 * @returns {string|null} the first phrase (in list order) found, or null
 */
function bodyPrefixIsBlocked(content) {
  const opening = normalizeForMatch(content).slice(0, BODY_SNIFF_LENGTH);
  if (opening === "") {
    return null;
  }
  const hit = BODY_PREFIX_BLOCKLIST.find((phrase) => opening.includes(phrase));
  return hit ?? null;
}
/**
 * Decides whether an extraction result looks like a real article rather than
 * a soft-error page. Checks run in a fixed order and the first failing one
 * determines the reported reason.
 *
 * @param {object} input
 * @param {object|null} input.article - extractor output (`title`, `content`)
 * @param {string} input.html - raw page html
 * @param {string} input.finalUrl - url after redirects
 * @returns {{ok: true}|{ok: false, reason: string, retryable: boolean}}
 *   only a missing article is reported as non-retryable
 */
function validateExtractedArticle({ article, html, finalUrl }) {
  const reject = (reason, retryable) => ({ ok: false, reason, retryable });

  if (!article) {
    return reject("extractor-returned-null", false);
  }

  const content = typeof article.content === "string" ? article.content.trim() : "";
  const title = typeof article.title === "string" ? article.title.trim() : "";

  // cheapest signal first: the title
  const blockedTitle = titleIsBlocked(title);
  if (blockedTitle) {
    return reject(`title-blocklist:${blockedTitle}`, true);
  }

  if (hasErrorPath(finalUrl)) {
    return reject(`error-path:${finalUrl}`, true);
  }

  const challenge = hasChallengeMarker(html);
  if (challenge) {
    return reject(`challenge-marker:${challenge}`, true);
  }

  if (!content) {
    return reject("no-content-extracted", true);
  }
  if (content.length < MIN_CONTENT_LENGTH) {
    return reject(`content-too-short:${content.length}`, true);
  }

  const blockedPrefix = bodyPrefixIsBlocked(content);
  if (blockedPrefix) {
    return reject(`body-prefix-blocklist:${blockedPrefix}`, true);
  }

  if (countSentenceTerminators(content) < MIN_SENTENCE_TERMINATORS) {
    return reject("too-few-sentences", true);
  }

  return { ok: true };
}
module.exports = {
validateExtractedArticle,
MIN_CONTENT_LENGTH,
};

View file

@ -261,11 +261,29 @@ db.exec(`
);
`);
// per-domain fetch policy — caches whether plain http or browser is needed
// so we dont waste a round trip on every article from a known js-only site.
// expires_at lets us re-probe domains that may have recovered.
// one row per domain (primary key); policy defaults to 'auto', all counters
// default to 0, and expires_at is nullable.
// the statement is idempotent (IF NOT EXISTS), so it is safe to run on every start
db.exec(`
CREATE TABLE IF NOT EXISTS domain_fetch_policy (
domain TEXT PRIMARY KEY,
policy TEXT NOT NULL DEFAULT 'auto',
consecutive_plain_failures INTEGER NOT NULL DEFAULT 0,
consecutive_browser_failures INTEGER NOT NULL DEFAULT 0,
plain_success_count INTEGER NOT NULL DEFAULT 0,
browser_success_count INTEGER NOT NULL DEFAULT 0,
expires_at TEXT,
updated_at TEXT NOT NULL DEFAULT (datetime('now'))
);
`);
for (const statement of [
'ALTER TABLE articles ADD COLUMN image TEXT',
'ALTER TABLE articles ADD COLUMN content_status TEXT',
'ALTER TABLE articles ADD COLUMN content_error TEXT',
'ALTER TABLE articles ADD COLUMN content_attempted_at TEXT',
'ALTER TABLE articles ADD COLUMN content_attempt_count INTEGER NOT NULL DEFAULT 0',
'ALTER TABLE articles ADD COLUMN content_retry_after TEXT',
'ALTER TABLE articles ADD COLUMN is_index_page INTEGER NOT NULL DEFAULT 0'
]) {
try {

181
src/domainPolicy.js Normal file
View file

@ -0,0 +1,181 @@
const db = require("./db");
// thresholds — kept in code rather than config because tuning these without
// understanding the consequences is a recipe for either a thundering herd
// against blocked domains or wasted plain-fetch attempts forever

// consecutive plain-fetch failures after which a domain is switched to "browser_only"
const PLAIN_FAILURE_THRESHOLD = 5;
// consecutive browser-fetch failures after which a domain is switched to "blocked"
const BROWSER_FAILURE_THRESHOLD = 5;
// how long a "browser_only" decision stays in force before it expires (7 days, in ms)
const BROWSER_ONLY_TTL_MS = 7 * 24 * 60 * 60 * 1000;
// how long a "blocked" decision stays in force before it expires (24 hours, in ms)
const BLOCKED_TTL_MS = 24 * 60 * 60 * 1000;
// loads the stored policy row for one domain (bound parameter: domain)
const selectPolicy = db.prepare(`
SELECT domain, policy, consecutive_plain_failures, consecutive_browser_failures,
plain_success_count, browser_success_count, expires_at, updated_at
FROM domain_fetch_policy
WHERE domain = ?
`);
// inserts a policy row or, when the domain already has one, overwrites every
// column with the supplied values. updated_at is always set to datetime('now').
// bound parameters, in order: domain, policy, consecutive_plain_failures,
// consecutive_browser_failures, plain_success_count, browser_success_count,
// expires_at
const upsertPolicy = db.prepare(`
INSERT INTO domain_fetch_policy (
domain, policy, consecutive_plain_failures, consecutive_browser_failures,
plain_success_count, browser_success_count, expires_at, updated_at
) VALUES (?, ?, ?, ?, ?, ?, ?, datetime('now'))
ON CONFLICT(domain) DO UPDATE SET
policy = excluded.policy,
consecutive_plain_failures = excluded.consecutive_plain_failures,
consecutive_browser_failures = excluded.consecutive_browser_failures,
plain_success_count = excluded.plain_success_count,
browser_success_count = excluded.browser_success_count,
expires_at = excluded.expires_at,
updated_at = datetime('now')
`);
/**
 * Extracts the lowercased hostname that keys the policy table.
 *
 * @param {string} url - absolute url
 * @returns {string} hostname, or "" when the url cannot be parsed
 */
function getDomain(url) {
  let hostname = "";
  try {
    const parsed = new URL(url);
    hostname = parsed.hostname.toLowerCase();
  } catch {
    // unparseable url — fall through with the empty hostname
  }
  return hostname;
}
/**
 * Loads the stored policy row for a domain.
 *
 * @param {string} domain - hostname key; an empty value skips the lookup
 * @returns {object|null} the row, or null when there is none
 */
function loadRow(domain) {
  return domain ? selectPolicy.get(domain) || null : null;
}
/**
 * Tells whether a policy row's expiry time has been reached.
 *
 * @param {{expires_at?: string|null}|null|undefined} row - policy row
 * @returns {boolean} true when expires_at is set and not later than now;
 *   false for a missing row, a missing expiry, or an unparseable timestamp
 */
function isExpired(row) {
  const expiresAt = row?.expires_at;
  if (!expiresAt) {
    return false;
  }
  const deadline = new Date(expiresAt).getTime();
  return deadline <= Date.now();
}
/**
 * Resolves the policy that applies to a url's domain right now.
 *
 * A domain without a row is "auto". A row whose expiry has passed is also
 * reported as "auto" (with `wasExpired` and the `previous` policy) so the
 * domain gets re-probed; the stored row is left untouched by this read.
 *
 * @param {string} url - article url
 * @returns {{domain: string, policy: string, wasExpired?: boolean, previous?: string}}
 */
function getEffectivePolicy(url) {
  const domain = getDomain(url);
  const row = loadRow(domain);

  if (row && !isExpired(row)) {
    return { domain, policy: row.policy };
  }
  if (row) {
    return { domain, policy: "auto", wasExpired: true, previous: row.policy };
  }
  return { domain, policy: "auto" };
}
/**
 * Merges `updates` over the stored row for a domain (or over defaults when
 * there is none) and upserts the result.
 *
 * @param {string} domain - hostname key
 * @param {object} updates - columns to change; nullish values keep the
 *   current value, except `expires_at`, where an explicit null clears the
 *   expiry and only `undefined` keeps it
 */
function writeRow(domain, updates) {
  const current = loadRow(domain) || {
    policy: "auto",
    consecutive_plain_failures: 0,
    consecutive_browser_failures: 0,
    plain_success_count: 0,
    browser_success_count: 0,
    expires_at: null,
  };
  const resolve = (column) => updates[column] ?? current[column];
  const expiresAt = updates.expires_at !== undefined ? updates.expires_at : current.expires_at;

  upsertPolicy.run(
    domain,
    resolve("policy"),
    resolve("consecutive_plain_failures"),
    resolve("consecutive_browser_failures"),
    resolve("plain_success_count"),
    resolve("browser_success_count"),
    expiresAt
  );
}
/**
 * Records a successful plain-http fetch for the url's domain: resets the
 * consecutive plain-failure counter and bumps the plain success count.
 *
 * The policy is reset to "auto" (and the expiry cleared), with one exception:
 * an unexpired "plain_only" row keeps its policy and expiry. Nothing in this
 * module ever writes "plain_only", so such a row is presumably set
 * deliberately from outside; rewriting it to "auto" on the first success
 * would silently drop that setting.
 *
 * @param {string} url - url that was fetched; unparseable urls are ignored
 */
function recordPlainSuccess(url) {
  const domain = getDomain(url);
  if (!domain) return;

  const existing = loadRow(domain);
  const updates = {
    consecutive_plain_failures: 0,
    plain_success_count: (existing?.plain_success_count || 0) + 1,
  };

  const pinnedPlainOnly = existing?.policy === "plain_only" && !isExpired(existing);
  if (!pinnedPlainOnly) {
    updates.policy = "auto";
    updates.expires_at = null;
  }

  writeRow(domain, updates);
}
/**
 * Records a failed plain-http fetch for the url's domain and, once
 * PLAIN_FAILURE_THRESHOLD consecutive failures are reached, switches the
 * domain to "browser_only" for BROWSER_ONLY_TTL_MS.
 *
 * An unexpired "plain_only" row is never escalated: the counter still
 * advances, but the policy and expiry are left alone. Nothing in this module
 * writes "plain_only", so it is presumably set deliberately from outside, and
 * flipping it to "browser_only" would do the exact opposite of that setting.
 *
 * @param {string} url - url that was fetched; unparseable urls are ignored
 */
function recordPlainFailure(url) {
  const domain = getDomain(url);
  if (!domain) return;

  const existing = loadRow(domain);
  const failures = (existing?.consecutive_plain_failures || 0) + 1;
  const pinnedPlainOnly = existing?.policy === "plain_only" && !isExpired(existing);

  if (failures >= PLAIN_FAILURE_THRESHOLD && !pinnedPlainOnly) {
    writeRow(domain, {
      policy: "browser_only",
      consecutive_plain_failures: failures,
      expires_at: new Date(Date.now() + BROWSER_ONLY_TTL_MS).toISOString(),
    });
    return;
  }

  writeRow(domain, {
    consecutive_plain_failures: failures,
  });
}
/**
 * Records a successful browser fetch for the url's domain: clears the
 * consecutive browser-failure counter and bumps the browser success count.
 *
 * The plain-failure counter and the policy are deliberately left as they
 * are — a browser success says nothing about plain fetch working again, so
 * the stored policy stays in place until its ttl expires and plain is re-probed.
 *
 * @param {string} url - url that was fetched; unparseable urls are ignored
 */
function recordBrowserSuccess(url) {
  const domain = getDomain(url);
  if (!domain) {
    return;
  }
  const row = loadRow(domain);
  const previousSuccesses = row?.browser_success_count || 0;
  writeRow(domain, {
    consecutive_browser_failures: 0,
    browser_success_count: previousSuccesses + 1,
  });
}
/**
 * Records a failed browser fetch for the url's domain and, once
 * BROWSER_FAILURE_THRESHOLD consecutive failures are reached, marks the
 * domain "blocked" for BLOCKED_TTL_MS.
 *
 * @param {string} url - url that was fetched; unparseable urls are ignored
 */
function recordBrowserFailure(url) {
  const domain = getDomain(url);
  if (!domain) {
    return;
  }
  const row = loadRow(domain);
  const failures = (row?.consecutive_browser_failures || 0) + 1;

  const updates = { consecutive_browser_failures: failures };
  if (failures >= BROWSER_FAILURE_THRESHOLD) {
    updates.policy = "blocked";
    updates.expires_at = new Date(Date.now() + BLOCKED_TTL_MS).toISOString();
  }
  writeRow(domain, updates);
}
module.exports = {
getEffectivePolicy,
recordPlainSuccess,
recordPlainFailure,
recordBrowserSuccess,
recordBrowserFailure,
};