migrate article embeddings to support multi-model architecture and enhance data integrity

2026-04-19 00:28:15 +01:00
parent a10c5eb39f
commit b4b2fe2ac7
4 changed files with 686 additions and 104 deletions
@@ -1,14 +1,37 @@
-const { extractFromHtml } = require('@extractus/article-extractor');
-const sharp = require('sharp');
-const db = require('./db');
-const config = require('./config');
-const { generateAndStoreEmbedding } = require('./embeddings');
-const { fetchWithPolicy } = require('./http');
-const { getSharedBrowserSession } = require('./sources/browserCrawler');
+const { extractFromHtml } = require("@extractus/article-extractor");
+const sharp = require("sharp");
+const db = require("./db");
+const config = require("./config");
+const { generateAndStoreEmbedding } = require("./embeddings");
+const { fetchWithPolicy } = require("./http");
+const { getSharedBrowserSession } = require("./sources/browserCrawler");
+const { validateExtractedArticle } = require("./contentValidation");
+const {
+  getEffectivePolicy,
+  recordPlainSuccess,
+  recordPlainFailure,
+  recordBrowserSuccess,
+  recordBrowserFailure,
+} = require("./domainPolicy");
+
+
+const MAX_PLAIN_HTML_LENGTH = 1_500_000;
+const PLAIN_FETCH_TIMEOUT = 12000;
+const BROWSER_FETCH_TIMEOUT = 20000;
+
+// retry windows for failures that look transient (validation rejected the
+// page, fetch timed out). genuinely terminal failures (404, dead url) get
+// a hard cap on attempt count instead
+const VALIDATION_RETRY_AFTER_MS = 24 * 60 * 60 * 1000;
+const TRANSIENT_RETRY_AFTER_MS = 6 * 60 * 60 * 1000;
+const MAX_TERMINAL_ATTEMPTS = 3;
+

 const updateArticleAssets = db.prepare(`
  UPDATE articles
-  SET content = ?, image = ?, content_status = 'ready', content_error = NULL, content_attempted_at = ?
+  SET content = ?, image = ?, content_status = 'ready', content_error = NULL,
+      content_attempted_at = ?, content_attempt_count = content_attempt_count + 1,
+      content_retry_after = NULL
  WHERE id = ?
 `);
 const updateArticleTitleDescription = db.prepare(`
@@ -18,19 +41,25 @@ const updateArticleTitleDescription = db.prepare(`
 `);
 const markContentSkipped = db.prepare(`
  UPDATE articles
-  SET content_status = 'skipped', content_error = ?, content_attempted_at = ?
+  SET content_status = 'skipped', content_error = ?, content_attempted_at = ?,
+      content_attempt_count = content_attempt_count + 1, content_retry_after = NULL
  WHERE id = ?
 `);
 const markContentFailed = db.prepare(`
  UPDATE articles
-  SET content_status = 'failed', content_error = ?, content_attempted_at = ?
+  SET content_status = 'failed', content_error = ?, content_attempted_at = ?,
+      content_attempt_count = content_attempt_count + 1, content_retry_after = NULL
  WHERE id = ?
 `);
 const markContentPending = db.prepare(`
  UPDATE articles
-  SET content_status = NULL, content_error = NULL, content_attempted_at = ?
+  SET content_status = 'pending', content_error = ?, content_attempted_at = ?,
+      content_attempt_count = content_attempt_count + 1, content_retry_after = ?
  WHERE id = ?
 `);
+
+// round-robin pull of articles needing content. respects content_retry_after so
+// a freshly-rejected article doesnt get retried in the next loop iteration
 const selectRoundRobinArticlesMissingContent = db.prepare(`
  SELECT id, url, title, description
  FROM (
@@ -39,21 +68,18 @@ const selectRoundRobinArticlesMissingContent = db.prepare(`
    FROM articles
    WHERE (content IS NULL OR TRIM(content) = '')
      AND (content_status IS NULL OR content_status = 'pending')
+      AND (content_retry_after IS NULL OR content_retry_after <= datetime('now'))
  )
  WHERE rn <= ?
  ORDER BY rn, source
 `);

-const loggedBlockedDomains = new Set();
-let contentBackfillRunning = false;
+const selectAttemptCount = db.prepare(`
+  SELECT content_attempt_count AS attempts FROM articles WHERE id = ?
+`);

-function getHostname(url) {
-  try {
-    return new URL(url).hostname.toLowerCase();
-  } catch {
-    return '';
-  }
-}
+
+let contentBackfillRunning = false;


 function getErrorStatus(error) {
@@ -61,38 +87,28 @@ function getErrorStatus(error) {
    return error.status;
  }

-  const match = String(error && error.message || '').match(/\b(401|403|404|408|429|5\d\d)\b/);
+  const match = String((error && error.message) || "").match(/\b(401|403|404|408|429|5\d\d)\b/);
  return match ? Number(match[1]) : null;
 }

 function getErrorMessage(error, fallback) {
-  const message = String(error && error.message || fallback || '').trim();
+  const message = String((error && error.message) || fallback || "").trim();
  return message ? message.slice(0, 500) : null;
 }

-function markArticleStatus(statement, id, message) {
-  const attemptedAt = new Date().toISOString();
-  const parameterCount = statement.source.split('?').length - 1;
-
-  if (parameterCount === 3) {
-    statement.run(message, attemptedAt, id);
-    return;
-  }
-
-  if (parameterCount === 2) {
-    statement.run(attemptedAt, id);
-    return;
-  }
-
-  throw new Error(`Unexpected content status statement parameter count: ${parameterCount}`);
+// current instant as an ISO-8601 UTC string (Date#toISOString layout)
+function nowIso() {
+  const current = new Date();
+  return current.toISOString();
 }

+// utc timestamp `ms` milliseconds from now, laid out the way sqlite's
+// datetime('now') prints it: "YYYY-MM-DD HH:MM:SS".
+//
+// the value is stored in content_retry_after and the retry queries compare it
+// as text against datetime('now'). a raw toISOString() value
+// ("YYYY-MM-DDTHH:MM:SS.sssZ") sorts after every same-day datetime('now')
+// string because "T" > " ", so a retry window would only elapse at the next
+// utc midnight. using the sqlite layout keeps the text comparison chronological
+function futureIso(ms) {
+  const iso = new Date(Date.now() + ms).toISOString();
+  // keep "YYYY-MM-DDTHH:MM:SS", drop ".sssZ", swap the separator for a space
+  return iso.slice(0, 19).replace("T", " ");
+}
+
+
 async function fetchCompressedImage(url) {
  const response = await fetchWithPolicy(url, {
    retries: 1,
-    headers: {
-      Accept: 'image/*',
-    },
+    headers: { Accept: "image/*" },
  });

  if (!response.ok) {
@@ -101,90 +117,255 @@ async function fetchCompressedImage(url) {
    throw error;
  }

-  const contentType = String(response.headers.get('content-type') || '').toLowerCase();
-  if (!contentType.startsWith('image/')) {
-    throw new Error(`image request returned ${contentType || 'unknown content-type'}`);
+  const contentType = String(response.headers.get("content-type") || "").toLowerCase();
+  if (!contentType.startsWith("image/")) {
+    throw new Error(`image request returned ${contentType || "unknown content-type"}`);
  }

  const input = Buffer.from(await response.arrayBuffer());
  if (input.length === 0) {
-    throw new Error('image request returned an empty body');
+    throw new Error("image request returned an empty body");
  }

  const output = await sharp(input)
    .rotate()
-    .resize({ width: 320, height: 320, fit: 'inside', withoutEnlargement: true })
+    .resize({ width: 320, height: 320, fit: "inside", withoutEnlargement: true })
    .webp({ quality: 25 })
    .toBuffer();

-  return output.toString('base64');
+  return output.toString("base64");
 }

+
+// plain http fetch — no js execution. cheap and light on memory, but it comes
+// back empty on js-rendered sites and cloudflare blocks it more often.
+// resolves to { html, finalUrl } with html capped at MAX_PLAIN_HTML_LENGTH
+// chars. throws (with .status set) when response.ok is false, and throws when
+// a content-type is present that mentions neither html nor xml
+async function fetchPlainHtml(url) {
+  const fetchOptions = { timeout: PLAIN_FETCH_TIMEOUT, retries: 1 };
+  const response = await fetchWithPolicy(url, fetchOptions);
+
+  if (!response.ok) {
+    const failure = new Error(`plain fetch returned ${response.status}`);
+    failure.status = response.status;
+    throw failure;
+  }
+
+  const headerValue = response.headers.get("content-type") || "";
+  const contentType = String(headerValue).toLowerCase();
+  const looksLikeMarkup = contentType.includes("html") || contentType.includes("xml");
+  if (contentType && !looksLikeMarkup) {
+    throw new Error(`plain fetch returned non-html content-type: ${contentType}`);
+  }
+
+  const body = await response.text();
+  const html = body.slice(0, MAX_PLAIN_HTML_LENGTH);
+  const finalUrl = response.url || url;
+  return { html, finalUrl };
+}
+
+
+// renders the page through the shared browser session and resolves to
+// { html, finalUrl }. finalUrl is always the requested url — nothing here
+// reads back where the browser actually ended up
+async function fetchBrowserHtml(url) {
+  const configuredLimit = Number(config.browser?.maxConcurrentPages);
+  const sessionOptions = {
+    requestTimeout: BROWSER_FETCH_TIMEOUT,
+    // falls back to 25 when the setting is missing, zero or not a number
+    maxConcurrentPages: configuredLimit || 25,
+  };
+
+  const session = await getSharedBrowserSession(sessionOptions);
+  const html = await session.fetchRenderedHtml(url, { timeout: BROWSER_FETCH_TIMEOUT });
+
+  return { html, finalUrl: url };
+}
+
+
+// flattens extractor html to plain text: every tag becomes a space, then
+// whitespace runs collapse to a single space. non-strings and results that
+// end up empty come back as null
+function stripHtmlContent(value) {
+  if (typeof value !== "string") {
+    return null;
+  }
+
+  const withoutTags = value.replace(/<[^>]+>/g, " ");
+  const collapsed = withoutTags.replace(/\s+/g, " ").trim();
+  return collapsed === "" ? null : collapsed;
+}
+
+
+// fetch → extract → validate for a single url, using whichever fetcher is
+// passed in. resolves to { ok, article, html, finalUrl, reason }. exceptions
+// from the fetcher or the extractor are converted into { ok: false, reason }
+// instead of propagating. on success article.content has already been
+// stripped to plain text. reason says what tripped — used both for logging
+// and for the per-domain policy update
+async function attemptFetch(url, fetcher) {
+  let page;
+  try {
+    // destructure inside the try so a malformed fetcher result is reported
+    // as a fetch error as well
+    const { html: fetchedHtml, finalUrl: fetchedUrl } = await fetcher(url);
+    page = { html: fetchedHtml, finalUrl: fetchedUrl };
+  } catch (error) {
+    return { ok: false, reason: `fetch-error:${error.message || "unknown"}`, error };
+  }
+
+  const { html, finalUrl } = page;
+  if (!html) {
+    return { ok: false, reason: "empty-html" };
+  }
+
+  let article;
+  try {
+    article = await extractFromHtml(html, finalUrl || url);
+  } catch (error) {
+    return { ok: false, reason: `extractor-error:${error.message || "unknown"}` };
+  }
+
+  if (article) {
+    article = { ...article, content: stripHtmlContent(article.content) };
+  }
+
+  const verdict = validateExtractedArticle({ article, html, finalUrl });
+  if (verdict.ok) {
+    return { ok: true, article, html, finalUrl };
+  }
+
+  const { reason, retryable } = verdict;
+  return { ok: false, reason, retryable, html, finalUrl };
+}
+
+
+// content_attempt_count stored for the article, or 0 when the row is missing
+// or the stored value is falsy
+function getAttemptCount(id) {
+  const row = selectAttemptCount.get(id);
+  if (!row) {
+    return 0;
+  }
+  return row.attempts || 0;
+}
+
+
 async function fetchAndStoreContent(id, url, storedTitle, storedDescription) {
+  const policy = getEffectivePolicy(url);
+
+  // domains we know are blocked — skip the fetch entirely until ttl expires.
+  // the row stays pending so it'll get picked up after the policy resets
+  if (policy.policy === "blocked") {
+    markContentPending.run(
+      `domain blocked by policy`,
+      nowIso(),
+      futureIso(TRANSIENT_RETRY_AFTER_MS),
+      id
+    );
+    return;
+  }
+
+  const tryPlainFirst = policy.policy === "auto" || policy.policy === "plain_only";
+  let plainResult = null;
+  let browserResult = null;
+
+
+  if (tryPlainFirst) {
+    plainResult = await attemptFetch(url, fetchPlainHtml);
+
+    if (plainResult.ok) {
+      recordPlainSuccess(url);
+      await commitArticle(id, url, plainResult, storedTitle, storedDescription);
+      return;
+    }
+
+    recordPlainFailure(url);
+
+    // hard 4xx (other than 408/429) on plain — domain might serve the same to
+    // browser, but try anyway since it's cheap once the policy hasnt flipped yet.
+    // 408/429/5xx defer for retry
+    const status = plainResult.error && getErrorStatus(plainResult.error);
+    if (status === 408 || status === 429 || (status && status >= 500)) {
+      markContentPending.run(
+        `plain ${status}`,
+        nowIso(),
+        futureIso(TRANSIENT_RETRY_AFTER_MS),
+        id
+      );
+      return;
+    }
+  }
+
+  // policy.policy === "plain_only" means we just tried plain and failed —
+  // dont escalate to browser, the operator (or earlier domain memory) said no
+  if (policy.policy === "plain_only") {
+    recordValidationFailure(id, plainResult);
+    return;
+  }
+
+
+  browserResult = await attemptFetch(url, fetchBrowserHtml);
+
+  if (browserResult.ok) {
+    recordBrowserSuccess(url);
+    await commitArticle(id, url, browserResult, storedTitle, storedDescription);
+    return;
+  }
+
+  recordBrowserFailure(url);
+
+  const browserStatus = browserResult.error && getErrorStatus(browserResult.error);
+  if (browserStatus === 408 || browserStatus === 429 || (browserStatus && browserStatus >= 500)) {
+    markContentPending.run(
+      `browser ${browserStatus}`,
+      nowIso(),
+      futureIso(TRANSIENT_RETRY_AFTER_MS),
+      id
+    );
+    return;
+  }
+
+  // both paths exhausted (or browser-only path failed). decide between
+  // pending-with-retry and terminal failed based on attempt count and
+  // whether the validator thought it was retryable
+  recordValidationFailure(id, browserResult);
+}
+
+
+// records a non-transient failed attempt for an article. the row goes to
+// 'failed' when the result is explicitly non-retryable or this attempt reaches
+// MAX_TERMINAL_ATTEMPTS; otherwise it stays 'pending' with a retry scheduled
+// VALIDATION_RETRY_AFTER_MS out. result is an attemptFetch outcome (may be
+// null/undefined, in which case the reason is "unknown")
+function recordValidationFailure(id, result) {
+  // reason can embed an upstream error message of any size ("fetch-error:…"),
+  // so cap what lands in content_error at 500 chars like getErrorMessage does
+  const reason = String(result?.reason || "unknown").slice(0, 500);
+  // a missing retryable flag (plain fetch errors) counts as retryable
+  const retryable = result?.retryable !== false;
+  // count stored before this attempt; the statements below add one to it
+  const attempts = getAttemptCount(id);
+
+  // hard fetch errors with no retryable signal — terminal after a few tries
+  if (!retryable || attempts + 1 >= MAX_TERMINAL_ATTEMPTS) {
+    markContentFailed.run(reason, nowIso(), id);
+    return;
+  }
+
+  markContentPending.run(reason, nowIso(), futureIso(VALIDATION_RETRY_AFTER_MS), id);
+}
+
+
+async function commitArticle(id, url, result, storedTitle, storedDescription) {
+  const { article, finalUrl } = result;
+  const content = article.content || null;
+
+  // if stored title looks like a raw url, replace with extracted one
+  const titleLooksLikeUrl = storedTitle && /^https?:\/\//i.test(storedTitle.trim());
+  if (titleLooksLikeUrl) {
+    const scrapedTitle = typeof article.title === "string" ? article.title.trim() : null;
+    const scrapedDescription = typeof article.description === "string" ? article.description.trim() : null;
+    if (scrapedTitle) {
+      updateArticleTitleDescription.run(scrapedTitle, scrapedDescription || storedDescription || null, id);
+    }
+  }
+
+  let image = null;
+  if (article.image) {
+    try {
+      image = await fetchCompressedImage(article.image);
+    } catch (error) {
+      const status = getErrorStatus(error);
+      if (status === 401 || status === 403 || status === 404 || status === 429) {
+        console.warn(`image fetch skipped for ${url}: upstream returned ${status}`);
+      } else {
+        console.error(`image fetch failed for ${url}:`, error.message || error);
+      }
+    }
+  }
+
+  updateArticleAssets.run(content, image, nowIso(), id);
+
  try {
-    const maxConcurrentPages = Number(config.browser?.maxConcurrentPages) || 25;
-    const browserSession = await getSharedBrowserSession({ requestTimeout: 20000, maxConcurrentPages });
-    const html = await browserSession.fetchRenderedHtml(url, { timeout: 20000 });
-    const article = await extractFromHtml(html, url);
-    if (!article) {
-      markArticleStatus(markContentSkipped, id, 'extractor returned no article');
-      return;
-    }
-
-    const content = typeof article.content === 'string'
-      ? article.content.replace(/<[^>]+>/g, ' ').replace(/\s+/g, ' ').trim() || null
-      : null;
-
-    // if stored title looks like a raw URL, try to replace with scraped title
-    const titleLooksLikeUrl = storedTitle && /^https?:\/\//i.test(storedTitle.trim());
-    if (titleLooksLikeUrl) {
-      const scrapedTitle = typeof article.title === 'string' ? article.title.trim() : null;
-      const scrapedDescription = typeof article.description === 'string' ? article.description.trim() : null;
-      if (scrapedTitle) {
-        updateArticleTitleDescription.run(scrapedTitle, scrapedDescription || storedDescription || null, id);
-      }
-    }
-
-    let image = null;
-    if (article.image) {
-      try {
-        image = await fetchCompressedImage(article.image);
-      } catch (error) {
-        const status = getErrorStatus(error);
-        if (status === 401 || status === 403 || status === 404 || status === 429) {
-          console.warn(`image fetch skipped for ${url}: upstream returned ${status}`);
-        } else {
-          console.error(`image fetch failed for ${url}:`, error);
-        }
-      }
-    }
-
-    if (!content && !image) {
-      markArticleStatus(markContentSkipped, id, 'article had no extractable content or image');
-      return;
-    }
-
-    updateArticleAssets.run(content, image, new Date().toISOString(), id);
    await generateAndStoreEmbedding(id);
  } catch (error) {
-    const status = getErrorStatus(error);
-    if (status === 401 || status === 403 || status === 404) {
-      console.warn(`content fetch skipped for ${url}: upstream returned ${status}`);
-      markArticleStatus(markContentSkipped, id, `upstream returned ${status}`);
-      return;
-    }
-
-    if (status === 408 || status === 429 || (status && status >= 500)) {
-      console.warn(`content fetch deferred for ${url}: upstream returned ${status}`);
-      markArticleStatus(markContentPending, id, null);
-      return;
-    }
-
-    markArticleStatus(markContentFailed, id, getErrorMessage(error, 'content fetch failed'));
-    console.error(`content fetch failed for ${url}:`, error);
+    console.error(`embedding failed for article ${id}:`, error.message || error);
  }
 }

+
 async function backfillMissingContent(perSource = 50, concurrency = 5) {
  if (contentBackfillRunning) {
    return;
@@ -204,15 +385,18 @@ async function backfillMissingContent(perSource = 50, concurrency = 5) {
  }
 }

+
+// true when at least one article row has NULL/blank content, a content_status
+// of NULL or 'pending', and a content_retry_after that is NULL or
+// <= datetime('now'). the statement is prepared again on every call
 function hasPendingContent() {
   return Boolean(db.prepare(`
     SELECT 1 FROM articles
     WHERE (content IS NULL OR TRIM(content) = '')
       AND (content_status IS NULL OR content_status = 'pending')
+      AND (content_retry_after IS NULL OR content_retry_after <= datetime('now'))
     LIMIT 1
   `).get());
 }

+
 module.exports = {
  fetchAndStoreContent,
  backfillMissingContent,
@@ -0,0 +1,199 @@
+// validates whether an extracted article is real content vs a soft-error page
+// (cookie wall, cloudflare challenge, paywall, "enable javascript", etc).
+//
+// the rules are deliberately conservative. we'd rather let a few junk pages
+// through (caught downstream when re-checked) than reject 5% of real articles.
+// fingerprints are anchored to the title, the first ~800 chars of extracted
+// body (BODY_SNIFF_LENGTH) or the top of the raw html, so an article that
+// merely *mentions* cloudflare further down doesnt get falsely rejected.
+
+const MIN_CONTENT_LENGTH = 400;
+const MIN_SENTENCE_TERMINATORS = 3;
+const BODY_SNIFF_LENGTH = 800;
+
+
+// titles that ONLY appear on error/challenge pages — never on real articles.
+// match is case-insensitive, exact-or-prefix only (not substring) to avoid
+// false positives like a real article titled "404 reasons your startup failed"
+const TITLE_BLOCKLIST = [
+  "just a moment",
+  "just a moment...",
+  "attention required! | cloudflare",
+  "attention required!",
+  "access denied",
+  "access to this page has been denied",
+  "you have been blocked",
+  "are you a robot",
+  "are you a robot?",
+  "verify you are human",
+  "please verify you are a human",
+  "page not found",
+  "404 not found",
+  "404 page not found",
+  "403 forbidden",
+  "503 service unavailable",
+  "this page isn't available",
+  "this page isn’t available",
+  "site temporarily unavailable",
+  "request unsuccessful",
+];
+
+
+// substrings to look for in the raw html head/early body that indicate a
+// cloudflare/akamai/imperva interstitial. these are infrastructure markers
+// the real site never serves
+const CHALLENGE_MARKERS = [
+  "cf-chl-bypass",
+  "__cf_chl_",
+  "cf_chl_opt",
+  "/cdn-cgi/challenge-platform",
+  "_incapsula_resource",
+  "incap_ses_",
+  "x-iinfo",
+  "akamai-bm-telemetry",
+  "ak_bmsc",
+  "distil_r_captcha",
+];
+
+
+// phrases near the start of the extracted body text that mean we got a stub.
+// matched anywhere within the first ~800 chars (BODY_SNIFF_LENGTH) so we dont
+// false-flag articles that discuss these topics later in the body
+const BODY_PREFIX_BLOCKLIST = [
+  "you need to enable javascript",
+  "please enable javascript",
+  "javascript is required",
+  "please enable cookies",
+  "cookies must be enabled",
+  "your browser will redirect",
+  "checking your browser before",
+  "this site requires javascript",
+  "please make sure your browser supports",
+];
+
+
+// final-url path suffixes that indicate the request was redirected to a
+// generic error/login page. we only check the pathname so query strings dont
+// throw it off
+const ERROR_PATH_HINTS = [
+  "/404",
+  "/403",
+  "/error",
+  "/errors",
+  "/blocked",
+  "/captcha",
+  "/challenge",
+  "/access-denied",
+  "/account/login",
+  "/sign-in",
+  "/signin",
+  "/subscribe",
+  "/subscription",
+];
+
+
+// coerces any value to a trimmed, lower-cased string; falsy input becomes ""
+function normalizeForMatch(value) {
+  const text = String(value || "");
+  return text.trim().toLowerCase();
+}
+
+// a sentence end is either
+//   - . ! ? optionally followed by closing quotes/brackets, then whitespace or
+//     end of text (the whitespace requirement keeps decimals like 3.14 out), or
+//   - a full-width cjk terminator (。！？), which is not followed by a space
+const SENTENCE_END_PATTERN = /[.!?]["'”’)\]]*(?:\s|$)|[。！？]/g;
+
+// counts sentence endings in text; non-string / falsy input counts as 0.
+// the plain ascii-followed-by-space rule alone scored quoted sentences
+// (`"No." he said`) and all cjk prose as zero, which made real articles fail
+// the too-few-sentences check
+function countSentenceTerminators(text) {
+  const matches = String(text || "").match(SENTENCE_END_PATTERN);
+  return matches ? matches.length : 0;
+}
+
+// true when the pathname of finalUrl equals, ends with, or starts with
+// "<hint>/" for any ERROR_PATH_HINTS entry. missing or unparseable urls
+// count as "no error path"
+function hasErrorPath(finalUrl) {
+  if (!finalUrl) {
+    return false;
+  }
+
+  let pathname;
+  try {
+    pathname = new URL(finalUrl).pathname.toLowerCase();
+  } catch {
+    return false;
+  }
+
+  for (const hint of ERROR_PATH_HINTS) {
+    if (pathname === hint || pathname.endsWith(hint) || pathname.startsWith(`${hint}/`)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// returns the first CHALLENGE_MARKERS entry found in the html, or null when
+// none is present (or html is empty)
+function hasChallengeMarker(html) {
+  if (!html) {
+    return null;
+  }
+
+  // only the first 50000 chars are searched — challenge markers sit in the
+  // head or top of body, no need to lowercase a full 1.5mb document
+  const head = String(html).slice(0, 50000).toLowerCase();
+  const hit = CHALLENGE_MARKERS.find((marker) => head.includes(marker));
+  return hit ?? null;
+}
+
+// returns the TITLE_BLOCKLIST entry the title matches, or null. a match is
+// the whole normalized title, or the entry followed directly by a space or
+// a "|" — never an arbitrary substring
+function titleIsBlocked(title) {
+  const candidate = normalizeForMatch(title);
+  if (candidate === "") {
+    return null;
+  }
+
+  const matched = TITLE_BLOCKLIST.find((entry) => {
+    if (candidate === entry) {
+      return true;
+    }
+    return [" ", "|"].some((separator) => candidate.startsWith(`${entry}${separator}`));
+  });
+
+  return matched ?? null;
+}
+
+// returns the first BODY_PREFIX_BLOCKLIST phrase found within the opening
+// BODY_SNIFF_LENGTH chars of the normalized content, or null
+function bodyPrefixIsBlocked(content) {
+  const opening = normalizeForMatch(content).slice(0, BODY_SNIFF_LENGTH);
+  if (opening === "") {
+    return null;
+  }
+
+  const matched = BODY_PREFIX_BLOCKLIST.find((phrase) => opening.includes(phrase));
+  return matched ?? null;
+}
+
+
+// decides whether an extracted article is real content. returns { ok: true }
+// or { ok: false, reason, retryable }. checks run in a fixed order and the
+// first one that trips wins: null article, blocked title, error-looking final
+// url, challenge marker in the html, empty content, content shorter than
+// MIN_CONTENT_LENGTH, blocked body opening, fewer than
+// MIN_SENTENCE_TERMINATORS sentence endings. only the null-article case is
+// reported as non-retryable
+function validateExtractedArticle({ article, html, finalUrl }) {
+  const reject = (reason, retryable = true) => ({ ok: false, reason, retryable });
+
+  if (!article) {
+    return reject("extractor-returned-null", false);
+  }
+
+  const body = typeof article.content === "string" ? article.content.trim() : "";
+  const headline = typeof article.title === "string" ? article.title.trim() : "";
+
+  // title-level checks first since they're the cheapest signal
+  const blockedTitle = titleIsBlocked(headline);
+  if (blockedTitle) {
+    return reject(`title-blocklist:${blockedTitle}`);
+  }
+
+  if (hasErrorPath(finalUrl)) {
+    return reject(`error-path:${finalUrl}`);
+  }
+
+  const marker = hasChallengeMarker(html);
+  if (marker) {
+    return reject(`challenge-marker:${marker}`);
+  }
+
+  if (!body) {
+    return reject("no-content-extracted");
+  }
+
+  if (body.length < MIN_CONTENT_LENGTH) {
+    return reject(`content-too-short:${body.length}`);
+  }
+
+  const blockedPrefix = bodyPrefixIsBlocked(body);
+  if (blockedPrefix) {
+    return reject(`body-prefix-blocklist:${blockedPrefix}`);
+  }
+
+  if (countSentenceTerminators(body) < MIN_SENTENCE_TERMINATORS) {
+    return reject("too-few-sentences");
+  }
+
+  return { ok: true };
+}
+
+
+module.exports = {
+  validateExtractedArticle,
+  MIN_CONTENT_LENGTH,
+};
@@ -261,11 +261,29 @@ db.exec(`
  );
 `);

+// per-domain fetch policy — caches whether plain http or browser is needed
+// so we dont waste a round trip on every article from a known js-only site.
+// expires_at lets us re-probe domains that may have recovered
+db.exec(`
+  CREATE TABLE IF NOT EXISTS domain_fetch_policy (
+    domain TEXT PRIMARY KEY,
+    policy TEXT NOT NULL DEFAULT 'auto',
+    consecutive_plain_failures INTEGER NOT NULL DEFAULT 0,
+    consecutive_browser_failures INTEGER NOT NULL DEFAULT 0,
+    plain_success_count INTEGER NOT NULL DEFAULT 0,
+    browser_success_count INTEGER NOT NULL DEFAULT 0,
+    expires_at TEXT,
+    updated_at TEXT NOT NULL DEFAULT (datetime('now'))
+  );
+`);
+
 for (const statement of [
  'ALTER TABLE articles ADD COLUMN image TEXT',
  'ALTER TABLE articles ADD COLUMN content_status TEXT',
  'ALTER TABLE articles ADD COLUMN content_error TEXT',
  'ALTER TABLE articles ADD COLUMN content_attempted_at TEXT',
+  'ALTER TABLE articles ADD COLUMN content_attempt_count INTEGER NOT NULL DEFAULT 0',
+  'ALTER TABLE articles ADD COLUMN content_retry_after TEXT',
  'ALTER TABLE articles ADD COLUMN is_index_page INTEGER NOT NULL DEFAULT 0'
 ]) {
  try {
@@ -0,0 +1,181 @@
+const db = require("./db");
+
+
+// thresholds — kept in code rather than config because tuning these without
+// understanding the consequences is a recipe for either a thundering herd
+// against blocked domains or wasted plain-fetch attempts forever
+const PLAIN_FAILURE_THRESHOLD = 5;
+const BROWSER_FAILURE_THRESHOLD = 5;
+const BROWSER_ONLY_TTL_MS = 7 * 24 * 60 * 60 * 1000;
+const BLOCKED_TTL_MS = 24 * 60 * 60 * 1000;
+
+
+const selectPolicy = db.prepare(`
+  SELECT domain, policy, consecutive_plain_failures, consecutive_browser_failures,
+         plain_success_count, browser_success_count, expires_at, updated_at
+  FROM domain_fetch_policy
+  WHERE domain = ?
+`);
+
+const upsertPolicy = db.prepare(`
+  INSERT INTO domain_fetch_policy (
+    domain, policy, consecutive_plain_failures, consecutive_browser_failures,
+    plain_success_count, browser_success_count, expires_at, updated_at
+  ) VALUES (?, ?, ?, ?, ?, ?, ?, datetime('now'))
+  ON CONFLICT(domain) DO UPDATE SET
+    policy = excluded.policy,
+    consecutive_plain_failures = excluded.consecutive_plain_failures,
+    consecutive_browser_failures = excluded.consecutive_browser_failures,
+    plain_success_count = excluded.plain_success_count,
+    browser_success_count = excluded.browser_success_count,
+    expires_at = excluded.expires_at,
+    updated_at = datetime('now')
+`);
+
+
+// lower-cased hostname of url, or "" when the url cannot be parsed
+function getDomain(url) {
+  let parsed;
+  try {
+    parsed = new URL(url);
+  } catch {
+    return "";
+  }
+  return parsed.hostname.toLowerCase();
+}
+
+// policy row stored for the domain, or null when the domain is empty or no
+// row exists
+function loadRow(domain) {
+  if (domain) {
+    const row = selectPolicy.get(domain);
+    return row || null;
+  }
+  return null;
+}
+
+// true once the row's expires_at lies at or before the current time. rows
+// without an expires_at never expire, and neither does an expires_at that
+// Date cannot parse (the comparison against NaN is false)
+function isExpired(row) {
+  const expiresAt = row?.expires_at;
+  if (!expiresAt) {
+    return false;
+  }
+
+  const expiryMs = new Date(expiresAt).getTime();
+  return Date.now() >= expiryMs;
+}
+
+
+// effective policy for the url's domain right now, as { domain, policy }.
+// no row → "auto". an expired row also reads as "auto" (with wasExpired and
+// the previous policy attached) so the domain gets re-probed; the stored row
+// is left untouched here because this runs on every fetch and writes are
+// expensive
+function getEffectivePolicy(url) {
+  const domain = getDomain(url);
+  const row = loadRow(domain);
+
+  if (row && isExpired(row)) {
+    return { domain, policy: "auto", wasExpired: true, previous: row.policy };
+  }
+
+  return { domain, policy: row ? row.policy : "auto" };
+}
+
+
+// merges updates over the stored row (or over an all-defaults "auto" row when
+// none exists) and upserts the result. null/undefined fields in updates keep
+// the stored value — except expires_at, where only undefined means "keep" so
+// callers can pass null to clear the expiry
+function writeRow(domain, updates) {
+  const blankRow = {
+    policy: "auto",
+    consecutive_plain_failures: 0,
+    consecutive_browser_failures: 0,
+    plain_success_count: 0,
+    browser_success_count: 0,
+    expires_at: null,
+  };
+  const current = loadRow(domain) || blankRow;
+
+  const pick = (field) => updates[field] ?? current[field];
+  const expiresAt = updates.expires_at === undefined ? current.expires_at : updates.expires_at;
+
+  upsertPolicy.run(
+    domain,
+    pick("policy"),
+    pick("consecutive_plain_failures"),
+    pick("consecutive_browser_failures"),
+    pick("plain_success_count"),
+    pick("browser_success_count"),
+    expiresAt
+  );
+}
+
+
+// records a successful plain fetch for the url's domain: clears the
+// consecutive plain-failure streak and bumps plain_success_count. the policy
+// goes back to "auto" with no expiry — unless the domain is pinned to an
+// unexpired "plain_only". nothing in this module ever writes "plain_only", so
+// such a row was set from outside and a success must not erase it (the
+// policy and its expiry are written back unchanged). does nothing when the
+// url has no parseable hostname
+function recordPlainSuccess(url) {
+  const domain = getDomain(url);
+  if (!domain) return;
+  const existing = loadRow(domain);
+  const pinnedPlainOnly = existing?.policy === "plain_only" && !isExpired(existing);
+
+  writeRow(domain, {
+    policy: pinnedPlainOnly ? "plain_only" : "auto",
+    consecutive_plain_failures: 0,
+    plain_success_count: (existing?.plain_success_count || 0) + 1,
+    expires_at: pinnedPlainOnly ? existing.expires_at : null,
+  });
+}
+
+// records a failed plain fetch for the url's domain. the consecutive failure
+// streak always grows by one; once it reaches PLAIN_FAILURE_THRESHOLD the
+// domain flips to "browser_only" for BROWSER_ONLY_TTL_MS. a domain pinned to
+// an unexpired "plain_only" is never flipped — that policy means "dont
+// escalate to browser", and flipping it would route every later fetch through
+// the browser. does nothing when the url has no parseable hostname
+function recordPlainFailure(url) {
+  const domain = getDomain(url);
+  if (!domain) return;
+  const existing = loadRow(domain);
+
+  const failures = (existing?.consecutive_plain_failures || 0) + 1;
+  const pinnedPlainOnly = existing?.policy === "plain_only" && !isExpired(existing);
+
+  if (failures >= PLAIN_FAILURE_THRESHOLD && !pinnedPlainOnly) {
+    writeRow(domain, {
+      policy: "browser_only",
+      consecutive_plain_failures: failures,
+      expires_at: new Date(Date.now() + BROWSER_ONLY_TTL_MS).toISOString(),
+    });
+    return;
+  }
+
+  writeRow(domain, {
+    consecutive_plain_failures: failures,
+  });
+}
+
+// records a successful browser fetch for the url's domain: clears the
+// consecutive browser-failure streak and bumps browser_success_count.
+// the plain-failure streak, policy and expiry are deliberately left alone —
+// a working browser path says nothing about plain fetch, so a browser_only
+// domain stays that way until its ttl expires and plain gets re-probed
+function recordBrowserSuccess(url) {
+  const domain = getDomain(url);
+  if (domain) {
+    const previousSuccesses = loadRow(domain)?.browser_success_count || 0;
+    writeRow(domain, {
+      consecutive_browser_failures: 0,
+      browser_success_count: previousSuccesses + 1,
+    });
+  }
+}
+
+// records a failed browser fetch for the url's domain. the consecutive
+// failure streak grows by one; once it reaches BROWSER_FAILURE_THRESHOLD the
+// domain is marked "blocked" for BLOCKED_TTL_MS. does nothing when the url
+// has no parseable hostname
+function recordBrowserFailure(url) {
+  const domain = getDomain(url);
+  if (!domain) {
+    return;
+  }
+
+  const failures = (loadRow(domain)?.consecutive_browser_failures || 0) + 1;
+  const updates = { consecutive_browser_failures: failures };
+
+  if (failures >= BROWSER_FAILURE_THRESHOLD) {
+    updates.policy = "blocked";
+    updates.expires_at = new Date(Date.now() + BLOCKED_TTL_MS).toISOString();
+  }
+
+  writeRow(domain, updates);
+}
+
+
+module.exports = {
+  getEffectivePolicy,
+  recordPlainSuccess,
+  recordPlainFailure,
+  recordBrowserSuccess,
+  recordBrowserFailure,
+};