diff --git a/src/content.js b/src/content.js index 794254a..6f5b5bf 100644 --- a/src/content.js +++ b/src/content.js @@ -1,14 +1,37 @@ -const { extractFromHtml } = require('@extractus/article-extractor'); -const sharp = require('sharp'); -const db = require('./db'); -const config = require('./config'); -const { generateAndStoreEmbedding } = require('./embeddings'); -const { fetchWithPolicy } = require('./http'); -const { getSharedBrowserSession } = require('./sources/browserCrawler'); +const { extractFromHtml } = require("@extractus/article-extractor"); +const sharp = require("sharp"); +const db = require("./db"); +const config = require("./config"); +const { generateAndStoreEmbedding } = require("./embeddings"); +const { fetchWithPolicy } = require("./http"); +const { getSharedBrowserSession } = require("./sources/browserCrawler"); +const { validateExtractedArticle } = require("./contentValidation"); +const { + getEffectivePolicy, + recordPlainSuccess, + recordPlainFailure, + recordBrowserSuccess, + recordBrowserFailure, +} = require("./domainPolicy"); + + +const MAX_PLAIN_HTML_LENGTH = 1_500_000; +const PLAIN_FETCH_TIMEOUT = 12000; +const BROWSER_FETCH_TIMEOUT = 20000; + +// retry windows for failures that look transient (validation rejected the +// page, fetch timed out). genuinely terminal failures (404, dead url) get +// a hard cap on attempt count instead +const VALIDATION_RETRY_AFTER_MS = 24 * 60 * 60 * 1000; +const TRANSIENT_RETRY_AFTER_MS = 6 * 60 * 60 * 1000; +const MAX_TERMINAL_ATTEMPTS = 3; + const updateArticleAssets = db.prepare(` UPDATE articles - SET content = ?, image = ?, content_status = 'ready', content_error = NULL, content_attempted_at = ? + SET content = ?, image = ?, content_status = 'ready', content_error = NULL, + content_attempted_at = ?, content_attempt_count = content_attempt_count + 1, + content_retry_after = NULL WHERE id = ? `); const updateArticleTitleDescription = db.prepare(` @@ -18,19 +41,25 @@ const updateArticleTitleDescription = db.prepare(` `); const markContentSkipped = db.prepare(` UPDATE articles - SET content_status = 'skipped', content_error = ?, content_attempted_at = ? + SET content_status = 'skipped', content_error = ?, content_attempted_at = ?, + content_attempt_count = content_attempt_count + 1, content_retry_after = NULL WHERE id = ? `); const markContentFailed = db.prepare(` UPDATE articles - SET content_status = 'failed', content_error = ?, content_attempted_at = ? + SET content_status = 'failed', content_error = ?, content_attempted_at = ?, + content_attempt_count = content_attempt_count + 1, content_retry_after = NULL WHERE id = ? `); const markContentPending = db.prepare(` UPDATE articles - SET content_status = NULL, content_error = NULL, content_attempted_at = ? + SET content_status = 'pending', content_error = ?, content_attempted_at = ?, + content_attempt_count = content_attempt_count + 1, content_retry_after = ? WHERE id = ? `); + +// round-robin pull of articles needing content. respects content_retry_after so +// a freshly-rejected article doesnt get retried in the next loop iteration const selectRoundRobinArticlesMissingContent = db.prepare(` SELECT id, url, title, description FROM ( @@ -39,21 +68,18 @@ const selectRoundRobinArticlesMissingContent = db.prepare(` FROM articles WHERE (content IS NULL OR TRIM(content) = '') AND (content_status IS NULL OR content_status = 'pending') + AND (content_retry_after IS NULL OR content_retry_after <= datetime('now')) ) WHERE rn <= ? ORDER BY rn, source `); -const loggedBlockedDomains = new Set(); -let contentBackfillRunning = false; +const selectAttemptCount = db.prepare(` + SELECT content_attempt_count AS attempts FROM articles WHERE id = ? +`); -function getHostname(url) { - try { - return new URL(url).hostname.toLowerCase(); - } catch { - return ''; - } -} + +let contentBackfillRunning = false; function getErrorStatus(error) { @@ -61,38 +87,28 @@ function getErrorStatus(error) { return error.status; } - const match = String(error && error.message || '').match(/\b(401|403|404|408|429|5\d\d)\b/); + const match = String((error && error.message) || "").match(/\b(401|403|404|408|429|5\d\d)\b/); return match ? Number(match[1]) : null; } function getErrorMessage(error, fallback) { - const message = String(error && error.message || fallback || '').trim(); + const message = String((error && error.message) || fallback || "").trim(); return message ? message.slice(0, 500) : null; } -function markArticleStatus(statement, id, message) { - const attemptedAt = new Date().toISOString(); - const parameterCount = statement.source.split('?').length - 1; - - if (parameterCount === 3) { - statement.run(message, attemptedAt, id); - return; - } - - if (parameterCount === 2) { - statement.run(attemptedAt, id); - return; - } - - throw new Error(`Unexpected content status statement parameter count: ${parameterCount}`); +function nowIso() { + return new Date().toISOString(); } +function futureIso(ms) { + return new Date(Date.now() + ms).toISOString(); +} + + async function fetchCompressedImage(url) { const response = await fetchWithPolicy(url, { retries: 1, - headers: { - Accept: 'image/*', - }, + headers: { Accept: "image/*" }, }); if (!response.ok) { @@ -101,90 +117,255 @@ async function fetchCompressedImage(url) { throw error; } - const contentType = String(response.headers.get('content-type') || '').toLowerCase(); - if (!contentType.startsWith('image/')) { - throw new Error(`image request returned ${contentType || 'unknown content-type'}`); + const contentType = String(response.headers.get("content-type") || "").toLowerCase(); + if (!contentType.startsWith("image/")) { + throw new Error(`image request returned ${contentType || "unknown content-type"}`); } const input = Buffer.from(await response.arrayBuffer()); if (input.length === 0) { - throw new Error('image request returned an empty body'); + throw new Error("image request returned an empty body"); } const output = await sharp(input) .rotate() - .resize({ width: 320, height: 320, fit: 'inside', withoutEnlargement: true }) + .resize({ width: 320, height: 320, fit: "inside", withoutEnlargement: true }) .webp({ quality: 25 }) .toBuffer(); - return output.toString('base64'); + return output.toString("base64"); } + +// plain http fetch — no js execution. fast, low memory, but fails on +// js-rendered sites and gets blocked by cloudflare more often +async function fetchPlainHtml(url) { + const response = await fetchWithPolicy(url, { + timeout: PLAIN_FETCH_TIMEOUT, + retries: 1, + }); + + if (!response.ok) { + const error = new Error(`plain fetch returned ${response.status}`); + error.status = response.status; + throw error; + } + + const contentType = String(response.headers.get("content-type") || "").toLowerCase(); + if (contentType && !contentType.includes("html") && !contentType.includes("xml")) { + throw new Error(`plain fetch returned non-html content-type: ${contentType}`); + } + + const text = await response.text(); + return { + html: text.slice(0, MAX_PLAIN_HTML_LENGTH), + finalUrl: response.url || url, + }; +} + + +async function fetchBrowserHtml(url) { + const maxConcurrentPages = Number(config.browser?.maxConcurrentPages) || 25; + const session = await getSharedBrowserSession({ + requestTimeout: BROWSER_FETCH_TIMEOUT, + maxConcurrentPages, + }); + + const html = await session.fetchRenderedHtml(url, { timeout: BROWSER_FETCH_TIMEOUT }); + return { html, finalUrl: url }; +} + + +function stripHtmlContent(value) { + if (typeof value !== "string") return null; + const stripped = value.replace(/<[^>]+>/g, " ").replace(/\s+/g, " ").trim(); + return stripped || null; +} + + +// runs fetch → extract → validate. returns { ok, article, html, finalUrl, reason } +// where article has been post-processed (content stripped of html). on failure, +// reason explains what tripped — used both for logging and for the per-domain +// policy update +async function attemptFetch(url, fetcher) { + let html; + let finalUrl; + try { + const result = await fetcher(url); + html = result.html; + finalUrl = result.finalUrl; + } catch (error) { + return { ok: false, reason: `fetch-error:${error.message || "unknown"}`, error }; + } + + if (!html) { + return { ok: false, reason: "empty-html" }; + } + + let extracted; + try { + extracted = await extractFromHtml(html, finalUrl || url); + } catch (error) { + return { ok: false, reason: `extractor-error:${error.message || "unknown"}` }; + } + + if (extracted) { + extracted = { + ...extracted, + content: stripHtmlContent(extracted.content), + }; + } + + const validation = validateExtractedArticle({ article: extracted, html, finalUrl }); + if (!validation.ok) { + return { ok: false, reason: validation.reason, retryable: validation.retryable, html, finalUrl }; + } + + return { ok: true, article: extracted, html, finalUrl }; +} + + +function getAttemptCount(id) { + const row = selectAttemptCount.get(id); + return row ? row.attempts || 0 : 0; +} + + async function fetchAndStoreContent(id, url, storedTitle, storedDescription) { + const policy = getEffectivePolicy(url); + + // domains we know are blocked — skip the fetch entirely until ttl expires. + // the row stays pending so it'll get picked up after the policy resets + if (policy.policy === "blocked") { + markContentPending.run( + `domain blocked by policy`, + nowIso(), + futureIso(TRANSIENT_RETRY_AFTER_MS), + id + ); + return; + } + + const tryPlainFirst = policy.policy === "auto" || policy.policy === "plain_only"; + let plainResult = null; + let browserResult = null; + + + if (tryPlainFirst) { + plainResult = await attemptFetch(url, fetchPlainHtml); + + if (plainResult.ok) { + recordPlainSuccess(url); + await commitArticle(id, url, plainResult, storedTitle, storedDescription); + return; + } + + recordPlainFailure(url); + + // hard 4xx (other than 408/429) on plain — domain might serve the same to + // browser, but try anyway since it's cheap once the policy hasnt flipped yet. + // 408/429/5xx defer for retry + const status = plainResult.error && getErrorStatus(plainResult.error); + if (status === 408 || status === 429 || (status && status >= 500)) { + markContentPending.run( + `plain ${status}`, + nowIso(), + futureIso(TRANSIENT_RETRY_AFTER_MS), + id + ); + return; + } + } + + // policy.policy === "plain_only" means we just tried plain and failed — + // dont escalate to browser, the operator (or earlier domain memory) said no + if (policy.policy === "plain_only") { + recordValidationFailure(id, plainResult); + return; + } + + + browserResult = await attemptFetch(url, fetchBrowserHtml); + + if (browserResult.ok) { + recordBrowserSuccess(url); + await commitArticle(id, url, browserResult, storedTitle, storedDescription); + return; + } + + recordBrowserFailure(url); + + const browserStatus = browserResult.error && getErrorStatus(browserResult.error); + if (browserStatus === 408 || browserStatus === 429 || (browserStatus && browserStatus >= 500)) { + markContentPending.run( + `browser ${browserStatus}`, + nowIso(), + futureIso(TRANSIENT_RETRY_AFTER_MS), + id + ); + return; + } + + // both paths exhausted (or browser-only path failed). decide between + // pending-with-retry and terminal failed based on attempt count and + // whether the validator thought it was retryable + recordValidationFailure(id, browserResult); +} + + +function recordValidationFailure(id, result) { + const reason = result?.reason || "unknown"; + const retryable = result?.retryable !== false; + const attempts = getAttemptCount(id); + + // hard fetch errors with no retryable signal — terminal after a few tries + if (!retryable || attempts + 1 >= MAX_TERMINAL_ATTEMPTS) { + markContentFailed.run(reason, nowIso(), id); + return; + } + + markContentPending.run(reason, nowIso(), futureIso(VALIDATION_RETRY_AFTER_MS), id); +} + + +async function commitArticle(id, url, result, storedTitle, storedDescription) { + const { article, finalUrl } = result; + const content = article.content || null; + + // if stored title looks like a raw url, replace with extracted one + const titleLooksLikeUrl = storedTitle && /^https?:\/\//i.test(storedTitle.trim()); + if (titleLooksLikeUrl) { + const scrapedTitle = typeof article.title === "string" ? article.title.trim() : null; + const scrapedDescription = typeof article.description === "string" ? article.description.trim() : null; + if (scrapedTitle) { + updateArticleTitleDescription.run(scrapedTitle, scrapedDescription || storedDescription || null, id); + } + } + + let image = null; + if (article.image) { + try { + image = await fetchCompressedImage(article.image); + } catch (error) { + const status = getErrorStatus(error); + if (status === 401 || status === 403 || status === 404 || status === 429) { + console.warn(`image fetch skipped for ${url}: upstream returned ${status}`); + } else { + console.error(`image fetch failed for ${url}:`, error.message || error); + } + } + } + + updateArticleAssets.run(content, image, nowIso(), id); + try { - const maxConcurrentPages = Number(config.browser?.maxConcurrentPages) || 25; - const browserSession = await getSharedBrowserSession({ requestTimeout: 20000, maxConcurrentPages }); - const html = await browserSession.fetchRenderedHtml(url, { timeout: 20000 }); - const article = await extractFromHtml(html, url); - if (!article) { - markArticleStatus(markContentSkipped, id, 'extractor returned no article'); - return; - } - - const content = typeof article.content === 'string' - ? article.content.replace(/<[^>]+>/g, ' ').replace(/\s+/g, ' ').trim() || null - : null; - - // if stored title looks like a raw URL, try to replace with scraped title - const titleLooksLikeUrl = storedTitle && /^https?:\/\//i.test(storedTitle.trim()); - if (titleLooksLikeUrl) { - const scrapedTitle = typeof article.title === 'string' ? article.title.trim() : null; - const scrapedDescription = typeof article.description === 'string' ? article.description.trim() : null; - if (scrapedTitle) { - updateArticleTitleDescription.run(scrapedTitle, scrapedDescription || storedDescription || null, id); - } - } - - let image = null; - if (article.image) { - try { - image = await fetchCompressedImage(article.image); - } catch (error) { - const status = getErrorStatus(error); - if (status === 401 || status === 403 || status === 404 || status === 429) { - console.warn(`image fetch skipped for ${url}: upstream returned ${status}`); - } else { - console.error(`image fetch failed for ${url}:`, error); - } - } - } - - if (!content && !image) { - markArticleStatus(markContentSkipped, id, 'article had no extractable content or image'); - return; - } - - updateArticleAssets.run(content, image, new Date().toISOString(), id); await generateAndStoreEmbedding(id); } catch (error) { - const status = getErrorStatus(error); - if (status === 401 || status === 403 || status === 404) { - console.warn(`content fetch skipped for ${url}: upstream returned ${status}`); - markArticleStatus(markContentSkipped, id, `upstream returned ${status}`); - return; - } - - if (status === 408 || status === 429 || (status && status >= 500)) { - console.warn(`content fetch deferred for ${url}: upstream returned ${status}`); - markArticleStatus(markContentPending, id, null); - return; - } - - markArticleStatus(markContentFailed, id, getErrorMessage(error, 'content fetch failed')); - console.error(`content fetch failed for ${url}:`, error); + console.error(`embedding failed for article ${id}:`, error.message || error); } } + async function backfillMissingContent(perSource = 50, concurrency = 5) { if (contentBackfillRunning) { return; @@ -204,15 +385,18 @@ async function backfillMissingContent(perSource = 50, concurrency = 5) { } } + function hasPendingContent() { return Boolean(db.prepare(` SELECT 1 FROM articles WHERE (content IS NULL OR TRIM(content) = '') AND (content_status IS NULL OR content_status = 'pending') + AND (content_retry_after IS NULL OR content_retry_after <= datetime('now')) LIMIT 1 `).get()); } + module.exports = { fetchAndStoreContent, backfillMissingContent, diff --git a/src/contentValidation.js b/src/contentValidation.js new file mode 100644 index 0000000..31bd9a6 --- /dev/null +++ b/src/contentValidation.js @@ -0,0 +1,199 @@ +// validates whether an extracted article is real content vs a soft-error page +// (cookie wall, cloudflare challenge, paywall, "enable javascript", etc). +// +// the rules are deliberately conservative. we'd rather let a few junk pages +// through (caught downstream when re-checked) than reject 5% of real articles. +// fingerprints are anchored to title or the first ~500 chars of body so an +// article that *mentions* cloudflare doesnt get falsely rejected. + +const MIN_CONTENT_LENGTH = 400; +const MIN_SENTENCE_TERMINATORS = 3; +const BODY_SNIFF_LENGTH = 800; + + +// titles that ONLY appear on error/challenge pages — never on real articles. +// match is case-insensitive, exact-or-prefix only (not substring) to avoid +// false positives like a real article titled "404 reasons your startup failed" +const TITLE_BLOCKLIST = [ + "just a moment", + "just a moment...", + "attention required! | cloudflare", + "attention required!", + "access denied", + "access to this page has been denied", + "you have been blocked", + "are you a robot", + "are you a robot?", + "verify you are human", + "please verify you are a human", + "page not found", + "404 not found", + "404 page not found", + "403 forbidden", + "503 service unavailable", + "this page isn't available", + "this page isn’t available", + "site temporarily unavailable", + "request unsuccessful", +]; + + +// substrings to look for in the raw html head/early body that indicate a +// cloudflare/akamai/imperva interstitial. these are infrastructure markers +// the real site never serves +const CHALLENGE_MARKERS = [ + "cf-chl-bypass", + "__cf_chl_", + "cf_chl_opt", + "/cdn-cgi/challenge-platform", + "_incapsula_resource", + "incap_ses_", + "x-iinfo", + "akamai-bm-telemetry", + "ak_bmsc", + "distil_r_captcha", +]; + + +// phrases at the very start of extracted body text that mean we got a stub. +// anchored to first ~500 chars so we dont false-flag articles that discuss +// these topics later in the body +const BODY_PREFIX_BLOCKLIST = [ + "you need to enable javascript", + "please enable javascript", + "javascript is required", + "please enable cookies", + "cookies must be enabled", + "your browser will redirect", + "checking your browser before", + "this site requires javascript", + "please make sure your browser supports", +]; + + +// final-url path suffixes that indicate the request was redirected to a +// generic error/login page. we only check the pathname so query strings dont +// throw it off +const ERROR_PATH_HINTS = [ + "/404", + "/403", + "/error", + "/errors", + "/blocked", + "/captcha", + "/challenge", + "/access-denied", + "/account/login", + "/sign-in", + "/signin", + "/subscribe", + "/subscription", +]; + + +function normalizeForMatch(value) { + return String(value || "").trim().toLowerCase(); +} + +function countSentenceTerminators(text) { + // matches . ! ? followed by whitespace or end — avoids counting decimals like 3.14 + const matches = String(text || "").match(/[.!?](?:\s|$)/g); + return matches ? matches.length : 0; +} + +function hasErrorPath(finalUrl) { + if (!finalUrl) return false; + try { + const path = new URL(finalUrl).pathname.toLowerCase(); + return ERROR_PATH_HINTS.some((hint) => path === hint || path.startsWith(`${hint}/`) || path.endsWith(hint)); + } catch { + return false; + } +} + +function hasChallengeMarker(html) { + if (!html) return null; + // cap the search window — challenge markers are always in head or top of body, + // dont need to scan a full 1.5mb document + const haystack = String(html).slice(0, 50000).toLowerCase(); + for (const marker of CHALLENGE_MARKERS) { + if (haystack.includes(marker)) { + return marker; + } + } + return null; +} + +function titleIsBlocked(title) { + const normalized = normalizeForMatch(title); + if (!normalized) return null; + + for (const entry of TITLE_BLOCKLIST) { + if (normalized === entry || normalized.startsWith(`${entry} `) || normalized.startsWith(`${entry}|`)) { + return entry; + } + } + return null; +} + +function bodyPrefixIsBlocked(content) { + const sniff = normalizeForMatch(content).slice(0, BODY_SNIFF_LENGTH); + if (!sniff) return null; + + for (const phrase of BODY_PREFIX_BLOCKLIST) { + if (sniff.includes(phrase)) { + return phrase; + } + } + return null; +} + + +function validateExtractedArticle({ article, html, finalUrl }) { + if (!article) { + return { ok: false, reason: "extractor-returned-null", retryable: false }; + } + + const content = typeof article.content === "string" ? article.content.trim() : ""; + const title = typeof article.title === "string" ? article.title.trim() : ""; + + // title-level checks first since they're the cheapest signal + const blockedTitle = titleIsBlocked(title); + if (blockedTitle) { + return { ok: false, reason: `title-blocklist:${blockedTitle}`, retryable: true }; + } + + if (hasErrorPath(finalUrl)) { + return { ok: false, reason: `error-path:${finalUrl}`, retryable: true }; + } + + const challenge = hasChallengeMarker(html); + if (challenge) { + return { ok: false, reason: `challenge-marker:${challenge}`, retryable: true }; + } + + if (!content) { + return { ok: false, reason: "no-content-extracted", retryable: true }; + } + + if (content.length < MIN_CONTENT_LENGTH) { + return { ok: false, reason: `content-too-short:${content.length}`, retryable: true }; + } + + const blockedPrefix = bodyPrefixIsBlocked(content); + if (blockedPrefix) { + return { ok: false, reason: `body-prefix-blocklist:${blockedPrefix}`, retryable: true }; + } + + if (countSentenceTerminators(content) < MIN_SENTENCE_TERMINATORS) { + return { ok: false, reason: "too-few-sentences", retryable: true }; + } + + return { ok: true }; +} + + +module.exports = { + validateExtractedArticle, + MIN_CONTENT_LENGTH, +}; diff --git a/src/db.js b/src/db.js index c70ead0..b8896b5 100644 --- a/src/db.js +++ b/src/db.js @@ -261,11 +261,29 @@ db.exec(` ); `); +// per-domain fetch policy — caches whether plain http or browser is needed +// so we dont waste a round trip on every article from a known js-only site. +// expires_at lets us re-probe domains that may have recovered +db.exec(` + CREATE TABLE IF NOT EXISTS domain_fetch_policy ( + domain TEXT PRIMARY KEY, + policy TEXT NOT NULL DEFAULT 'auto', + consecutive_plain_failures INTEGER NOT NULL DEFAULT 0, + consecutive_browser_failures INTEGER NOT NULL DEFAULT 0, + plain_success_count INTEGER NOT NULL DEFAULT 0, + browser_success_count INTEGER NOT NULL DEFAULT 0, + expires_at TEXT, + updated_at TEXT NOT NULL DEFAULT (datetime('now')) + ); +`); + for (const statement of [ 'ALTER TABLE articles ADD COLUMN image TEXT', 'ALTER TABLE articles ADD COLUMN content_status TEXT', 'ALTER TABLE articles ADD COLUMN content_error TEXT', 'ALTER TABLE articles ADD COLUMN content_attempted_at TEXT', + 'ALTER TABLE articles ADD COLUMN content_attempt_count INTEGER NOT NULL DEFAULT 0', + 'ALTER TABLE articles ADD COLUMN content_retry_after TEXT', 'ALTER TABLE articles ADD COLUMN is_index_page INTEGER NOT NULL DEFAULT 0' ]) { try { diff --git a/src/domainPolicy.js b/src/domainPolicy.js new file mode 100644 index 0000000..736c4db --- /dev/null +++ b/src/domainPolicy.js @@ -0,0 +1,181 @@ +const db = require("./db"); + + +// thresholds — kept in code rather than config because tuning these without +// understanding the consequences is a recipe for either a thundering herd +// against blocked domains or wasted plain-fetch attempts forever +const PLAIN_FAILURE_THRESHOLD = 5; +const BROWSER_FAILURE_THRESHOLD = 5; +const BROWSER_ONLY_TTL_MS = 7 * 24 * 60 * 60 * 1000; +const BLOCKED_TTL_MS = 24 * 60 * 60 * 1000; + + +const selectPolicy = db.prepare(` + SELECT domain, policy, consecutive_plain_failures, consecutive_browser_failures, + plain_success_count, browser_success_count, expires_at, updated_at + FROM domain_fetch_policy + WHERE domain = ? +`); + +const upsertPolicy = db.prepare(` + INSERT INTO domain_fetch_policy ( + domain, policy, consecutive_plain_failures, consecutive_browser_failures, + plain_success_count, browser_success_count, expires_at, updated_at + ) VALUES (?, ?, ?, ?, ?, ?, ?, datetime('now')) + ON CONFLICT(domain) DO UPDATE SET + policy = excluded.policy, + consecutive_plain_failures = excluded.consecutive_plain_failures, + consecutive_browser_failures = excluded.consecutive_browser_failures, + plain_success_count = excluded.plain_success_count, + browser_success_count = excluded.browser_success_count, + expires_at = excluded.expires_at, + updated_at = datetime('now') +`); + + +function getDomain(url) { + try { + return new URL(url).hostname.toLowerCase(); + } catch { + return ""; + } +} + +function loadRow(domain) { + if (!domain) return null; + return selectPolicy.get(domain) || null; +} + +function isExpired(row) { + if (!row || !row.expires_at) return false; + return new Date(row.expires_at).getTime() <= Date.now(); +} + + +// returns the effective policy for a domain right now. expired entries +// silently revert to "auto" so we re-probe — we dont mutate the row here +// since reads happen on every fetch and writes are expensive +function getEffectivePolicy(url) { + const domain = getDomain(url); + const row = loadRow(domain); + + if (!row) { + return { domain, policy: "auto" }; + } + + if (isExpired(row)) { + return { domain, policy: "auto", wasExpired: true, previous: row.policy }; + } + + return { domain, policy: row.policy }; +} + + +function writeRow(domain, updates) { + const existing = loadRow(domain) || { + policy: "auto", + consecutive_plain_failures: 0, + consecutive_browser_failures: 0, + plain_success_count: 0, + browser_success_count: 0, + expires_at: null, + }; + + const merged = { + policy: updates.policy ?? existing.policy, + consecutive_plain_failures: updates.consecutive_plain_failures ?? existing.consecutive_plain_failures, + consecutive_browser_failures: updates.consecutive_browser_failures ?? existing.consecutive_browser_failures, + plain_success_count: updates.plain_success_count ?? existing.plain_success_count, + browser_success_count: updates.browser_success_count ?? existing.browser_success_count, + expires_at: updates.expires_at !== undefined ? updates.expires_at : existing.expires_at, + }; + + upsertPolicy.run( + domain, + merged.policy, + merged.consecutive_plain_failures, + merged.consecutive_browser_failures, + merged.plain_success_count, + merged.browser_success_count, + merged.expires_at + ); +} + + +function recordPlainSuccess(url) { + const domain = getDomain(url); + if (!domain) return; + const existing = loadRow(domain); + + writeRow(domain, { + policy: "auto", + consecutive_plain_failures: 0, + plain_success_count: (existing?.plain_success_count || 0) + 1, + expires_at: null, + }); +} + +function recordPlainFailure(url) { + const domain = getDomain(url); + if (!domain) return; + const existing = loadRow(domain); + + const failures = (existing?.consecutive_plain_failures || 0) + 1; + + if (failures >= PLAIN_FAILURE_THRESHOLD) { + writeRow(domain, { + policy: "browser_only", + consecutive_plain_failures: failures, + expires_at: new Date(Date.now() + BROWSER_ONLY_TTL_MS).toISOString(), + }); + return; + } + + writeRow(domain, { + consecutive_plain_failures: failures, + }); +} + +function recordBrowserSuccess(url) { + const domain = getDomain(url); + if (!domain) return; + const existing = loadRow(domain); + + // a browser success doesnt reset the plain-failure counter — plain fetch + // is still broken for this domain, we just confirmed the browser path works. + // policy stays browser_only until the ttl expires and we re-probe plain + writeRow(domain, { + consecutive_browser_failures: 0, + browser_success_count: (existing?.browser_success_count || 0) + 1, + }); +} + +function recordBrowserFailure(url) { + const domain = getDomain(url); + if (!domain) return; + const existing = loadRow(domain); + + const failures = (existing?.consecutive_browser_failures || 0) + 1; + + if (failures >= BROWSER_FAILURE_THRESHOLD) { + writeRow(domain, { + policy: "blocked", + consecutive_browser_failures: failures, + expires_at: new Date(Date.now() + BLOCKED_TTL_MS).toISOString(), + }); + return; + } + + writeRow(domain, { + consecutive_browser_failures: failures, + }); +} + + +module.exports = { + getEffectivePolicy, + recordPlainSuccess, + recordPlainFailure, + recordBrowserSuccess, + recordBrowserFailure, +};