migrate article embeddings to support multi-model architecture and enhance data integrity
This commit is contained in:
parent
a10c5eb39f
commit
b4b2fe2ac7
4 changed files with 686 additions and 104 deletions
392
src/content.js
392
src/content.js
|
|
@ -1,14 +1,37 @@
|
|||
const { extractFromHtml } = require('@extractus/article-extractor');
|
||||
const sharp = require('sharp');
|
||||
const db = require('./db');
|
||||
const config = require('./config');
|
||||
const { generateAndStoreEmbedding } = require('./embeddings');
|
||||
const { fetchWithPolicy } = require('./http');
|
||||
const { getSharedBrowserSession } = require('./sources/browserCrawler');
|
||||
const { extractFromHtml } = require("@extractus/article-extractor");
|
||||
const sharp = require("sharp");
|
||||
const db = require("./db");
|
||||
const config = require("./config");
|
||||
const { generateAndStoreEmbedding } = require("./embeddings");
|
||||
const { fetchWithPolicy } = require("./http");
|
||||
const { getSharedBrowserSession } = require("./sources/browserCrawler");
|
||||
const { validateExtractedArticle } = require("./contentValidation");
|
||||
const {
|
||||
getEffectivePolicy,
|
||||
recordPlainSuccess,
|
||||
recordPlainFailure,
|
||||
recordBrowserSuccess,
|
||||
recordBrowserFailure,
|
||||
} = require("./domainPolicy");
|
||||
|
||||
|
||||
const MAX_PLAIN_HTML_LENGTH = 1_500_000;
|
||||
const PLAIN_FETCH_TIMEOUT = 12000;
|
||||
const BROWSER_FETCH_TIMEOUT = 20000;
|
||||
|
||||
// retry windows for failures that look transient (validation rejected the
|
||||
// page, fetch timed out). genuinely terminal failures (404, dead url) get
|
||||
// a hard cap on attempt count instead
|
||||
const VALIDATION_RETRY_AFTER_MS = 24 * 60 * 60 * 1000;
|
||||
const TRANSIENT_RETRY_AFTER_MS = 6 * 60 * 60 * 1000;
|
||||
const MAX_TERMINAL_ATTEMPTS = 3;
|
||||
|
||||
|
||||
const updateArticleAssets = db.prepare(`
|
||||
UPDATE articles
|
||||
SET content = ?, image = ?, content_status = 'ready', content_error = NULL, content_attempted_at = ?
|
||||
SET content = ?, image = ?, content_status = 'ready', content_error = NULL,
|
||||
content_attempted_at = ?, content_attempt_count = content_attempt_count + 1,
|
||||
content_retry_after = NULL
|
||||
WHERE id = ?
|
||||
`);
|
||||
const updateArticleTitleDescription = db.prepare(`
|
||||
|
|
@ -18,19 +41,25 @@ const updateArticleTitleDescription = db.prepare(`
|
|||
`);
|
||||
const markContentSkipped = db.prepare(`
|
||||
UPDATE articles
|
||||
SET content_status = 'skipped', content_error = ?, content_attempted_at = ?
|
||||
SET content_status = 'skipped', content_error = ?, content_attempted_at = ?,
|
||||
content_attempt_count = content_attempt_count + 1, content_retry_after = NULL
|
||||
WHERE id = ?
|
||||
`);
|
||||
const markContentFailed = db.prepare(`
|
||||
UPDATE articles
|
||||
SET content_status = 'failed', content_error = ?, content_attempted_at = ?
|
||||
SET content_status = 'failed', content_error = ?, content_attempted_at = ?,
|
||||
content_attempt_count = content_attempt_count + 1, content_retry_after = NULL
|
||||
WHERE id = ?
|
||||
`);
|
||||
const markContentPending = db.prepare(`
|
||||
UPDATE articles
|
||||
SET content_status = NULL, content_error = NULL, content_attempted_at = ?
|
||||
SET content_status = 'pending', content_error = ?, content_attempted_at = ?,
|
||||
content_attempt_count = content_attempt_count + 1, content_retry_after = ?
|
||||
WHERE id = ?
|
||||
`);
|
||||
|
||||
// round-robin pull of articles needing content. respects content_retry_after so
|
||||
// a freshly-rejected article doesnt get retried in the next loop iteration
|
||||
const selectRoundRobinArticlesMissingContent = db.prepare(`
|
||||
SELECT id, url, title, description
|
||||
FROM (
|
||||
|
|
@ -39,21 +68,18 @@ const selectRoundRobinArticlesMissingContent = db.prepare(`
|
|||
FROM articles
|
||||
WHERE (content IS NULL OR TRIM(content) = '')
|
||||
AND (content_status IS NULL OR content_status = 'pending')
|
||||
AND (content_retry_after IS NULL OR content_retry_after <= datetime('now'))
|
||||
)
|
||||
WHERE rn <= ?
|
||||
ORDER BY rn, source
|
||||
`);
|
||||
|
||||
const loggedBlockedDomains = new Set();
|
||||
let contentBackfillRunning = false;
|
||||
const selectAttemptCount = db.prepare(`
|
||||
SELECT content_attempt_count AS attempts FROM articles WHERE id = ?
|
||||
`);
|
||||
|
||||
function getHostname(url) {
|
||||
try {
|
||||
return new URL(url).hostname.toLowerCase();
|
||||
} catch {
|
||||
return '';
|
||||
}
|
||||
}
|
||||
|
||||
let contentBackfillRunning = false;
|
||||
|
||||
|
||||
function getErrorStatus(error) {
|
||||
|
|
@ -61,38 +87,28 @@ function getErrorStatus(error) {
|
|||
return error.status;
|
||||
}
|
||||
|
||||
const match = String(error && error.message || '').match(/\b(401|403|404|408|429|5\d\d)\b/);
|
||||
const match = String((error && error.message) || "").match(/\b(401|403|404|408|429|5\d\d)\b/);
|
||||
return match ? Number(match[1]) : null;
|
||||
}
|
||||
|
||||
function getErrorMessage(error, fallback) {
|
||||
const message = String(error && error.message || fallback || '').trim();
|
||||
const message = String((error && error.message) || fallback || "").trim();
|
||||
return message ? message.slice(0, 500) : null;
|
||||
}
|
||||
|
||||
function markArticleStatus(statement, id, message) {
|
||||
const attemptedAt = new Date().toISOString();
|
||||
const parameterCount = statement.source.split('?').length - 1;
|
||||
|
||||
if (parameterCount === 3) {
|
||||
statement.run(message, attemptedAt, id);
|
||||
return;
|
||||
}
|
||||
|
||||
if (parameterCount === 2) {
|
||||
statement.run(attemptedAt, id);
|
||||
return;
|
||||
}
|
||||
|
||||
throw new Error(`Unexpected content status statement parameter count: ${parameterCount}`);
|
||||
// current wall-clock time as an ISO-8601 utc string (e.g. 2024-05-01T12:00:00.000Z).
// used for the content_attempted_at column
function nowIso() {
  const now = new Date(Date.now());
  return now.toISOString();
}
|
||||
|
||||
// timestamp `ms` milliseconds from now (utc), formatted as
// "YYYY-MM-DD HH:MM:SS" — the same shape sqlite's datetime('now') produces.
//
// the value is stored in content_retry_after and compared as TEXT against
// datetime('now') in the pending-content queries. a "...T...Z" string sorts
// after any same-day "YYYY-MM-DD HH:MM:SS" string ('T' > ' '), so the retry
// window would not open until the following utc day. emitting the sqlite
// shape keeps the lexicographic comparison chronologically correct.
//
// throws RangeError if `ms` does not produce a valid date (same as toISOString)
function futureIso(ms) {
  const target = new Date(Date.now() + ms);
  // drop the millisecond/zone suffix and swap the 'T' separator for a space
  return target.toISOString().slice(0, 19).replace("T", " ");
}
|
||||
|
||||
|
||||
async function fetchCompressedImage(url) {
|
||||
const response = await fetchWithPolicy(url, {
|
||||
retries: 1,
|
||||
headers: {
|
||||
Accept: 'image/*',
|
||||
},
|
||||
headers: { Accept: "image/*" },
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
|
|
@ -101,90 +117,255 @@ async function fetchCompressedImage(url) {
|
|||
throw error;
|
||||
}
|
||||
|
||||
const contentType = String(response.headers.get('content-type') || '').toLowerCase();
|
||||
if (!contentType.startsWith('image/')) {
|
||||
throw new Error(`image request returned ${contentType || 'unknown content-type'}`);
|
||||
const contentType = String(response.headers.get("content-type") || "").toLowerCase();
|
||||
if (!contentType.startsWith("image/")) {
|
||||
throw new Error(`image request returned ${contentType || "unknown content-type"}`);
|
||||
}
|
||||
|
||||
const input = Buffer.from(await response.arrayBuffer());
|
||||
if (input.length === 0) {
|
||||
throw new Error('image request returned an empty body');
|
||||
throw new Error("image request returned an empty body");
|
||||
}
|
||||
|
||||
const output = await sharp(input)
|
||||
.rotate()
|
||||
.resize({ width: 320, height: 320, fit: 'inside', withoutEnlargement: true })
|
||||
.resize({ width: 320, height: 320, fit: "inside", withoutEnlargement: true })
|
||||
.webp({ quality: 25 })
|
||||
.toBuffer();
|
||||
|
||||
return output.toString('base64');
|
||||
return output.toString("base64");
|
||||
}
|
||||
|
||||
|
||||
// plain http fetch — no js execution. fast, low memory, but fails on
|
||||
// js-rendered sites and gets blocked by cloudflare more often
|
||||
// fetches a page over plain http (no js execution) via fetchWithPolicy.
// resolves to { html, finalUrl } where html is capped at MAX_PLAIN_HTML_LENGTH
// characters and finalUrl is response.url when present, else the requested url.
// throws on a non-ok response (error.status carries the http status) and on a
// declared content-type that mentions neither "html" nor "xml"
async function fetchPlainHtml(url) {
  const options = { timeout: PLAIN_FETCH_TIMEOUT, retries: 1 };
  const response = await fetchWithPolicy(url, options);

  if (!response.ok) {
    throw Object.assign(new Error(`plain fetch returned ${response.status}`), {
      status: response.status,
    });
  }

  // a missing content-type header is tolerated; only an explicit non-markup type is rejected
  const contentType = String(response.headers.get("content-type") || "").toLowerCase();
  const looksLikeMarkup = contentType.includes("html") || contentType.includes("xml");
  if (contentType && !looksLikeMarkup) {
    throw new Error(`plain fetch returned non-html content-type: ${contentType}`);
  }

  const body = await response.text();
  const finalUrl = response.url || url;
  return { html: body.slice(0, MAX_PLAIN_HTML_LENGTH), finalUrl };
}
|
||||
|
||||
|
||||
// fetches a page through the shared browser session and resolves to
// { html, finalUrl }. finalUrl is always the requested url here — this path
// does not report where the browser actually ended up
async function fetchBrowserHtml(url) {
  // falls back to 25 when the config value is missing, zero or not numeric
  const configuredPages = Number(config.browser?.maxConcurrentPages);
  const sessionOptions = {
    requestTimeout: BROWSER_FETCH_TIMEOUT,
    maxConcurrentPages: configuredPages || 25,
  };

  const browser = await getSharedBrowserSession(sessionOptions);
  const rendered = await browser.fetchRenderedHtml(url, { timeout: BROWSER_FETCH_TIMEOUT });

  return { html: rendered, finalUrl: url };
}
|
||||
|
||||
|
||||
// turns an html fragment into plain text: every tag becomes a space, runs of
// whitespace collapse to a single space, and the ends are trimmed.
// returns null for non-string input or when nothing but markup/whitespace remains
function stripHtmlContent(value) {
  if (typeof value !== "string") {
    return null;
  }

  const withoutTags = value.replace(/<[^>]+>/g, " ");
  const collapsed = withoutTags.replace(/\s+/g, " ").trim();

  return collapsed === "" ? null : collapsed;
}
|
||||
|
||||
|
||||
// runs fetch → extract → validate. returns { ok, article, html, finalUrl, reason }
|
||||
// where article has been post-processed (content stripped of html). on failure,
|
||||
// reason explains what tripped — used both for logging and for the per-domain
|
||||
// policy update
|
||||
// one fetch → extract → validate pass using the given fetcher.
//
// resolves (never rejects for fetch/extract failures) to one of:
//   { ok: true, article, html, finalUrl }
//   { ok: false, reason, error }                      fetcher threw
//   { ok: false, reason }                             empty html / extractor threw
//   { ok: false, reason, retryable, html, finalUrl }  validator rejected the page
// article.content has already been stripped to plain text before validation
async function attemptFetch(url, fetcher) {
  let page;
  try {
    const result = await fetcher(url);
    page = { html: result.html, finalUrl: result.finalUrl };
  } catch (error) {
    return { ok: false, reason: `fetch-error:${error.message || "unknown"}`, error };
  }

  const { html, finalUrl } = page;
  if (!html) {
    return { ok: false, reason: "empty-html" };
  }

  let article;
  try {
    article = await extractFromHtml(html, finalUrl || url);
  } catch (error) {
    return { ok: false, reason: `extractor-error:${error.message || "unknown"}` };
  }

  // a falsy extractor result is handed to the validator untouched so it can reject it
  const cleaned = article ? { ...article, content: stripHtmlContent(article.content) } : article;

  const verdict = validateExtractedArticle({ article: cleaned, html, finalUrl });
  if (verdict.ok) {
    return { ok: true, article: cleaned, html, finalUrl };
  }

  return { ok: false, reason: verdict.reason, retryable: verdict.retryable, html, finalUrl };
}
|
||||
|
||||
|
||||
// number of content attempts recorded for an article so far.
// 0 when the row does not exist or the stored count is null/zero
function getAttemptCount(id) {
  const attempts = selectAttemptCount.get(id)?.attempts;
  return attempts || 0;
}
|
||||
|
||||
|
||||
// defers a row without touching content_attempt_count or content_attempted_at.
// used when no fetch was made at all, so policy-level skips dont eat into the
// MAX_TERMINAL_ATTEMPTS budget that recordValidationFailure checks
const deferContentWithoutAttempt = db.prepare(`
  UPDATE articles
  SET content_status = 'pending', content_error = ?, content_retry_after = ?
  WHERE id = ?
`);

// statuses worth deferring rather than failing: request timeout, rate limit, any 5xx
function isTransientFetchStatus(status) {
  return status === 408 || status === 429 || Boolean(status && status >= 500);
}

// fetches, validates and stores content for one article, choosing between the
// plain and browser fetchers according to the per-domain policy.
//
//   id                 article row id
//   url                article url
//   storedTitle        title currently stored for the row (passed to commitArticle)
//   storedDescription  description currently stored (passed to commitArticle)
//
// always resolves to undefined; the outcome is written to the articles row
// (ready / pending-with-retry / failed) and to the domain policy counters.
// errors thrown by commitArticle or the db layer propagate to the caller
async function fetchAndStoreContent(id, url, storedTitle, storedDescription) {
  const { policy } = getEffectivePolicy(url);

  // domains we know are blocked — skip the fetch entirely until ttl expires.
  // the row stays pending so it'll get picked up after the policy resets, and
  // since nothing was attempted the attempt counter is left alone
  if (policy === "blocked") {
    deferContentWithoutAttempt.run(
      `domain blocked by policy`,
      futureIso(TRANSIENT_RETRY_AFTER_MS),
      id
    );
    return;
  }

  if (policy === "auto" || policy === "plain_only") {
    const plainResult = await attemptFetch(url, fetchPlainHtml);

    if (plainResult.ok) {
      recordPlainSuccess(url);
      await commitArticle(id, url, plainResult, storedTitle, storedDescription);
      return;
    }

    recordPlainFailure(url);

    // 408/429/5xx on plain: defer the row and retry later. any other failure
    // (hard 4xx, validation rejection) falls through and may escalate to browser
    const plainStatus = plainResult.error && getErrorStatus(plainResult.error);
    if (isTransientFetchStatus(plainStatus)) {
      markContentPending.run(
        `plain ${plainStatus}`,
        nowIso(),
        futureIso(TRANSIENT_RETRY_AFTER_MS),
        id
      );
      return;
    }

    // plain_only means we just tried plain and failed — dont escalate to
    // browser, the operator (or earlier domain memory) said no
    if (policy === "plain_only") {
      recordValidationFailure(id, plainResult);
      return;
    }
  }

  const browserResult = await attemptFetch(url, fetchBrowserHtml);

  if (browserResult.ok) {
    recordBrowserSuccess(url);
    await commitArticle(id, url, browserResult, storedTitle, storedDescription);
    return;
  }

  recordBrowserFailure(url);

  const browserStatus = browserResult.error && getErrorStatus(browserResult.error);
  if (isTransientFetchStatus(browserStatus)) {
    markContentPending.run(
      `browser ${browserStatus}`,
      nowIso(),
      futureIso(TRANSIENT_RETRY_AFTER_MS),
      id
    );
    return;
  }

  // both paths exhausted (or browser-only path failed). decide between
  // pending-with-retry and terminal failed based on attempt count and
  // whether the validator thought it was retryable
  recordValidationFailure(id, browserResult);
}
|
||||
|
||||
|
||||
// records a non-transient failed attempt for an article.
//
//   id      article row id
//   result  the failed attemptFetch result; reads .reason and .retryable
//
// the row is marked failed when the result is explicitly non-retryable
// (retryable === false) or when this attempt reaches MAX_TERMINAL_ATTEMPTS;
// otherwise it goes back to pending with a VALIDATION_RETRY_AFTER_MS retry window
function recordValidationFailure(id, result) {
  // reasons can embed arbitrarily long upstream error messages; cap what is
  // stored in content_error at 500 chars, the same bound getErrorMessage applies
  const reason = String(result?.reason || "unknown").slice(0, 500);
  const retryable = result?.retryable !== false;

  // getAttemptCount is the count before this attempt is written, hence the +1
  const attemptsIncludingThis = getAttemptCount(id) + 1;

  if (!retryable || attemptsIncludingThis >= MAX_TERMINAL_ATTEMPTS) {
    markContentFailed.run(reason, nowIso(), id);
    return;
  }

  markContentPending.run(reason, nowIso(), futureIso(VALIDATION_RETRY_AFTER_MS), id);
}
|
||||
|
||||
|
||||
async function commitArticle(id, url, result, storedTitle, storedDescription) {
|
||||
const { article, finalUrl } = result;
|
||||
const content = article.content || null;
|
||||
|
||||
// if stored title looks like a raw url, replace with extracted one
|
||||
const titleLooksLikeUrl = storedTitle && /^https?:\/\//i.test(storedTitle.trim());
|
||||
if (titleLooksLikeUrl) {
|
||||
const scrapedTitle = typeof article.title === "string" ? article.title.trim() : null;
|
||||
const scrapedDescription = typeof article.description === "string" ? article.description.trim() : null;
|
||||
if (scrapedTitle) {
|
||||
updateArticleTitleDescription.run(scrapedTitle, scrapedDescription || storedDescription || null, id);
|
||||
}
|
||||
}
|
||||
|
||||
let image = null;
|
||||
if (article.image) {
|
||||
try {
|
||||
image = await fetchCompressedImage(article.image);
|
||||
} catch (error) {
|
||||
const status = getErrorStatus(error);
|
||||
if (status === 401 || status === 403 || status === 404 || status === 429) {
|
||||
console.warn(`image fetch skipped for ${url}: upstream returned ${status}`);
|
||||
} else {
|
||||
console.error(`image fetch failed for ${url}:`, error.message || error);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
updateArticleAssets.run(content, image, nowIso(), id);
|
||||
|
||||
try {
|
||||
const maxConcurrentPages = Number(config.browser?.maxConcurrentPages) || 25;
|
||||
const browserSession = await getSharedBrowserSession({ requestTimeout: 20000, maxConcurrentPages });
|
||||
const html = await browserSession.fetchRenderedHtml(url, { timeout: 20000 });
|
||||
const article = await extractFromHtml(html, url);
|
||||
if (!article) {
|
||||
markArticleStatus(markContentSkipped, id, 'extractor returned no article');
|
||||
return;
|
||||
}
|
||||
|
||||
const content = typeof article.content === 'string'
|
||||
? article.content.replace(/<[^>]+>/g, ' ').replace(/\s+/g, ' ').trim() || null
|
||||
: null;
|
||||
|
||||
// if stored title looks like a raw URL, try to replace with scraped title
|
||||
const titleLooksLikeUrl = storedTitle && /^https?:\/\//i.test(storedTitle.trim());
|
||||
if (titleLooksLikeUrl) {
|
||||
const scrapedTitle = typeof article.title === 'string' ? article.title.trim() : null;
|
||||
const scrapedDescription = typeof article.description === 'string' ? article.description.trim() : null;
|
||||
if (scrapedTitle) {
|
||||
updateArticleTitleDescription.run(scrapedTitle, scrapedDescription || storedDescription || null, id);
|
||||
}
|
||||
}
|
||||
|
||||
let image = null;
|
||||
if (article.image) {
|
||||
try {
|
||||
image = await fetchCompressedImage(article.image);
|
||||
} catch (error) {
|
||||
const status = getErrorStatus(error);
|
||||
if (status === 401 || status === 403 || status === 404 || status === 429) {
|
||||
console.warn(`image fetch skipped for ${url}: upstream returned ${status}`);
|
||||
} else {
|
||||
console.error(`image fetch failed for ${url}:`, error);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!content && !image) {
|
||||
markArticleStatus(markContentSkipped, id, 'article had no extractable content or image');
|
||||
return;
|
||||
}
|
||||
|
||||
updateArticleAssets.run(content, image, new Date().toISOString(), id);
|
||||
await generateAndStoreEmbedding(id);
|
||||
} catch (error) {
|
||||
const status = getErrorStatus(error);
|
||||
if (status === 401 || status === 403 || status === 404) {
|
||||
console.warn(`content fetch skipped for ${url}: upstream returned ${status}`);
|
||||
markArticleStatus(markContentSkipped, id, `upstream returned ${status}`);
|
||||
return;
|
||||
}
|
||||
|
||||
if (status === 408 || status === 429 || (status && status >= 500)) {
|
||||
console.warn(`content fetch deferred for ${url}: upstream returned ${status}`);
|
||||
markArticleStatus(markContentPending, id, null);
|
||||
return;
|
||||
}
|
||||
|
||||
markArticleStatus(markContentFailed, id, getErrorMessage(error, 'content fetch failed'));
|
||||
console.error(`content fetch failed for ${url}:`, error);
|
||||
console.error(`embedding failed for article ${id}:`, error.message || error);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
async function backfillMissingContent(perSource = 50, concurrency = 5) {
|
||||
if (contentBackfillRunning) {
|
||||
return;
|
||||
|
|
@ -204,15 +385,18 @@ async function backfillMissingContent(perSource = 50, concurrency = 5) {
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
// true when at least one article still lacks content, is in the null/pending
// state, and has no retry window or one that has already elapsed
function hasPendingContent() {
  const probe = db.prepare(`
    SELECT 1 FROM articles
    WHERE (content IS NULL OR TRIM(content) = '')
      AND (content_status IS NULL OR content_status = 'pending')
      AND (content_retry_after IS NULL OR content_retry_after <= datetime('now'))
    LIMIT 1
  `);
  const row = probe.get();
  return Boolean(row);
}
|
||||
|
||||
|
||||
module.exports = {
|
||||
fetchAndStoreContent,
|
||||
backfillMissingContent,
|
||||
|
|
|
|||
199
src/contentValidation.js
Normal file
199
src/contentValidation.js
Normal file
|
|
@ -0,0 +1,199 @@
|
|||
// validates whether an extracted article is real content vs a soft-error page
|
||||
// (cookie wall, cloudflare challenge, paywall, "enable javascript", etc).
|
||||
//
|
||||
// the rules are deliberately conservative. we'd rather let a few junk pages
|
||||
// through (caught downstream when re-checked) than reject 5% of real articles.
|
||||
// fingerprints are anchored to title or the first ~800 chars of body so an
|
||||
// article that *mentions* cloudflare doesnt get falsely rejected.
|
||||
|
||||
const MIN_CONTENT_LENGTH = 400;
|
||||
const MIN_SENTENCE_TERMINATORS = 3;
|
||||
const BODY_SNIFF_LENGTH = 800;
|
||||
|
||||
|
||||
// titles that ONLY appear on error/challenge pages — never on real articles.
|
||||
// match is case-insensitive, exact-or-prefix only (not substring) to avoid
|
||||
// false positives like a real article titled "404 reasons your startup failed"
|
||||
// lower-case titles that only show up on error/challenge pages. entries are
// compared against the normalized (trimmed, lower-cased) article title, so every
// entry here must itself be lower-case.
// the curly-apostrophe variant is written as a \u escape so it survives any
// re-encoding of this file
const TITLE_BLOCKLIST = [
  "just a moment",
  "just a moment...",
  "attention required! | cloudflare",
  "attention required!",
  "access denied",
  "access to this page has been denied",
  "you have been blocked",
  "are you a robot",
  "are you a robot?",
  "verify you are human",
  "please verify you are a human",
  "page not found",
  "404 not found",
  "404 page not found",
  "403 forbidden",
  "503 service unavailable",
  "this page isn't available",
  "this page isn\u2019t available",
  "site temporarily unavailable",
  "request unsuccessful",
];
|
||||
|
||||
|
||||
// substrings to look for in the raw html head/early body that indicate a
|
||||
// cloudflare/akamai/imperva interstitial. these are infrastructure markers
|
||||
// the real site never serves
|
||||
const CHALLENGE_MARKERS = [
|
||||
"cf-chl-bypass",
|
||||
"__cf_chl_",
|
||||
"cf_chl_opt",
|
||||
"/cdn-cgi/challenge-platform",
|
||||
"_incapsula_resource",
|
||||
"incap_ses_",
|
||||
"x-iinfo",
|
||||
"akamai-bm-telemetry",
|
||||
"ak_bmsc",
|
||||
"distil_r_captcha",
|
||||
];
|
||||
|
||||
|
||||
// phrases at the very start of extracted body text that mean we got a stub.
|
||||
// anchored to first ~800 chars so we dont false-flag articles that discuss
|
||||
// these topics later in the body
|
||||
const BODY_PREFIX_BLOCKLIST = [
|
||||
"you need to enable javascript",
|
||||
"please enable javascript",
|
||||
"javascript is required",
|
||||
"please enable cookies",
|
||||
"cookies must be enabled",
|
||||
"your browser will redirect",
|
||||
"checking your browser before",
|
||||
"this site requires javascript",
|
||||
"please make sure your browser supports",
|
||||
];
|
||||
|
||||
|
||||
// final-url path suffixes that indicate the request was redirected to a
|
||||
// generic error/login page. we only check the pathname so query strings dont
|
||||
// throw it off
|
||||
const ERROR_PATH_HINTS = [
|
||||
"/404",
|
||||
"/403",
|
||||
"/error",
|
||||
"/errors",
|
||||
"/blocked",
|
||||
"/captcha",
|
||||
"/challenge",
|
||||
"/access-denied",
|
||||
"/account/login",
|
||||
"/sign-in",
|
||||
"/signin",
|
||||
"/subscribe",
|
||||
"/subscription",
|
||||
];
|
||||
|
||||
|
||||
// canonical form used for blocklist comparisons: string-coerced, trimmed and
// lower-cased. any falsy input (null, undefined, "", 0) normalizes to ""
function normalizeForMatch(value) {
  const text = value ? String(value) : "";
  const trimmed = text.trim();
  return trimmed.toLowerCase();
}
|
||||
|
||||
// counts sentence-ending punctuation: a '.', '!' or '?' that is followed by
// whitespace or sits at the very end of the text. requiring the trailing
// whitespace/end keeps decimals such as 3.14 from being counted
function countSentenceTerminators(text) {
  const source = String(text || "");
  const hits = source.match(/[.!?](?:\s|$)/g) || [];
  return hits.length;
}
|
||||
|
||||
// true when the final url's pathname looks like a generic error/login/paywall
// landing page, i.e. it equals an ERROR_PATH_HINTS entry, starts with
// "<hint>/" or ends with the hint. only the pathname is inspected, so query
// strings and fragments are ignored. returns false for empty or unparseable urls.
//
// trailing slashes are dropped before matching so "/news/404/" is treated the
// same as "/news/404" (previously only the slash-less form matched the
// ends-with rule)
function hasErrorPath(finalUrl) {
  if (!finalUrl) return false;

  let path;
  try {
    path = new URL(finalUrl).pathname.toLowerCase();
  } catch {
    return false;
  }

  while (path.endsWith("/")) {
    path = path.slice(0, -1);
  }
  // the root path is now empty and cannot match any hint
  if (!path) return false;

  return ERROR_PATH_HINTS.some(
    (hint) => path === hint || path.startsWith(`${hint}/`) || path.endsWith(hint)
  );
}
|
||||
|
||||
// looks for a bot-challenge infrastructure marker in the raw html.
// returns the first CHALLENGE_MARKERS entry found (in list order), or null
// when html is empty or no marker is present
function hasChallengeMarker(html) {
  if (!html) {
    return null;
  }

  // only the first 50000 chars are scanned, lower-cased so the match is
  // case-insensitive
  const head = String(html).slice(0, 50000).toLowerCase();
  const found = CHALLENGE_MARKERS.find((marker) => head.includes(marker));

  return found || null;
}
|
||||
|
||||
// checks a title against TITLE_BLOCKLIST. a title is blocked when, after
// normalization, it equals an entry or begins with the entry followed by a
// space or a '|'. returns the matching entry, or null when the title is empty
// or nothing matches
function titleIsBlocked(title) {
  const candidate = normalizeForMatch(title);
  if (!candidate) {
    return null;
  }

  const matchesEntry = (entry) =>
    candidate === entry ||
    candidate.startsWith(`${entry} `) ||
    candidate.startsWith(`${entry}|`);

  return TITLE_BLOCKLIST.find(matchesEntry) || null;
}
|
||||
|
||||
// checks whether the opening of the extracted body contains a stub phrase.
// only the first BODY_SNIFF_LENGTH chars of the normalized content are
// inspected. returns the first BODY_PREFIX_BLOCKLIST phrase found (in list
// order), or null when the content is empty or clean
function bodyPrefixIsBlocked(content) {
  const opening = normalizeForMatch(content).slice(0, BODY_SNIFF_LENGTH);
  if (!opening) {
    return null;
  }

  const hit = BODY_PREFIX_BLOCKLIST.find((phrase) => opening.includes(phrase));
  return hit || null;
}
|
||||
|
||||
|
||||
// decides whether an extracted article is real content.
//
//   article   extractor output (content expected to be plain text); may be null
//   html      raw html the article was extracted from
//   finalUrl  url the fetch ended up at
//
// returns { ok: true } or { ok: false, reason, retryable }. a missing article
// is the only non-retryable rejection. the checks run in a fixed order and the
// first one that trips supplies the reason
function validateExtractedArticle({ article, html, finalUrl }) {
  if (!article) {
    return { ok: false, reason: "extractor-returned-null", retryable: false };
  }

  const body = typeof article.content === "string" ? article.content.trim() : "";
  const heading = typeof article.title === "string" ? article.title.trim() : "";

  // each check returns a reason string when it rejects, null when it passes.
  // order matters: cheap title/url/marker signals first, body heuristics after
  const checks = [
    () => {
      const entry = titleIsBlocked(heading);
      return entry ? `title-blocklist:${entry}` : null;
    },
    () => (hasErrorPath(finalUrl) ? `error-path:${finalUrl}` : null),
    () => {
      const marker = hasChallengeMarker(html);
      return marker ? `challenge-marker:${marker}` : null;
    },
    () => (body ? null : "no-content-extracted"),
    () => (body.length < MIN_CONTENT_LENGTH ? `content-too-short:${body.length}` : null),
    () => {
      const phrase = bodyPrefixIsBlocked(body);
      return phrase ? `body-prefix-blocklist:${phrase}` : null;
    },
    () => (countSentenceTerminators(body) < MIN_SENTENCE_TERMINATORS ? "too-few-sentences" : null),
  ];

  for (const check of checks) {
    const reason = check();
    if (reason) {
      return { ok: false, reason, retryable: true };
    }
  }

  return { ok: true };
}
|
||||
|
||||
|
||||
module.exports = {
|
||||
validateExtractedArticle,
|
||||
MIN_CONTENT_LENGTH,
|
||||
};
|
||||
18
src/db.js
18
src/db.js
|
|
@ -261,11 +261,29 @@ db.exec(`
|
|||
);
|
||||
`);
|
||||
|
||||
// per-domain fetch policy — caches whether plain http or browser is needed
|
||||
// so we dont waste a round trip on every article from a known js-only site.
|
||||
// expires_at lets us re-probe domains that may have recovered
|
||||
db.exec(`
|
||||
CREATE TABLE IF NOT EXISTS domain_fetch_policy (
|
||||
domain TEXT PRIMARY KEY,
|
||||
policy TEXT NOT NULL DEFAULT 'auto',
|
||||
consecutive_plain_failures INTEGER NOT NULL DEFAULT 0,
|
||||
consecutive_browser_failures INTEGER NOT NULL DEFAULT 0,
|
||||
plain_success_count INTEGER NOT NULL DEFAULT 0,
|
||||
browser_success_count INTEGER NOT NULL DEFAULT 0,
|
||||
expires_at TEXT,
|
||||
updated_at TEXT NOT NULL DEFAULT (datetime('now'))
|
||||
);
|
||||
`);
|
||||
|
||||
for (const statement of [
|
||||
'ALTER TABLE articles ADD COLUMN image TEXT',
|
||||
'ALTER TABLE articles ADD COLUMN content_status TEXT',
|
||||
'ALTER TABLE articles ADD COLUMN content_error TEXT',
|
||||
'ALTER TABLE articles ADD COLUMN content_attempted_at TEXT',
|
||||
'ALTER TABLE articles ADD COLUMN content_attempt_count INTEGER NOT NULL DEFAULT 0',
|
||||
'ALTER TABLE articles ADD COLUMN content_retry_after TEXT',
|
||||
'ALTER TABLE articles ADD COLUMN is_index_page INTEGER NOT NULL DEFAULT 0'
|
||||
]) {
|
||||
try {
|
||||
|
|
|
|||
181
src/domainPolicy.js
Normal file
181
src/domainPolicy.js
Normal file
|
|
@ -0,0 +1,181 @@
|
|||
const db = require("./db");
|
||||
|
||||
|
||||
// thresholds — kept in code rather than config because tuning these without
|
||||
// understanding the consequences is a recipe for either a thundering herd
|
||||
// against blocked domains or wasted plain-fetch attempts forever
|
||||
const PLAIN_FAILURE_THRESHOLD = 5;
|
||||
const BROWSER_FAILURE_THRESHOLD = 5;
|
||||
const BROWSER_ONLY_TTL_MS = 7 * 24 * 60 * 60 * 1000;
|
||||
const BLOCKED_TTL_MS = 24 * 60 * 60 * 1000;
|
||||
|
||||
|
||||
const selectPolicy = db.prepare(`
|
||||
SELECT domain, policy, consecutive_plain_failures, consecutive_browser_failures,
|
||||
plain_success_count, browser_success_count, expires_at, updated_at
|
||||
FROM domain_fetch_policy
|
||||
WHERE domain = ?
|
||||
`);
|
||||
|
||||
const upsertPolicy = db.prepare(`
|
||||
INSERT INTO domain_fetch_policy (
|
||||
domain, policy, consecutive_plain_failures, consecutive_browser_failures,
|
||||
plain_success_count, browser_success_count, expires_at, updated_at
|
||||
) VALUES (?, ?, ?, ?, ?, ?, ?, datetime('now'))
|
||||
ON CONFLICT(domain) DO UPDATE SET
|
||||
policy = excluded.policy,
|
||||
consecutive_plain_failures = excluded.consecutive_plain_failures,
|
||||
consecutive_browser_failures = excluded.consecutive_browser_failures,
|
||||
plain_success_count = excluded.plain_success_count,
|
||||
browser_success_count = excluded.browser_success_count,
|
||||
expires_at = excluded.expires_at,
|
||||
updated_at = datetime('now')
|
||||
`);
|
||||
|
||||
|
||||
// lower-cased hostname of a url, used as the key into domain_fetch_policy.
// returns "" when the url cannot be parsed
function getDomain(url) {
  let parsed;
  try {
    parsed = new URL(url);
  } catch {
    return "";
  }
  return parsed.hostname.toLowerCase();
}
|
||||
|
||||
// reads the stored policy row for a domain.
// returns null for an empty domain or when no row exists
function loadRow(domain) {
  return domain ? selectPolicy.get(domain) || null : null;
}
|
||||
|
||||
// tells whether a stored policy row has passed its expires_at.
//
// returns false when there is no row or no expiry set (the entry never
// expires). an expires_at that cannot be parsed as a date is treated as
// expired: comparing NaN would otherwise always be false, leaving e.g. a
// "blocked" entry with a corrupted timestamp in force forever
function isExpired(row) {
  if (!row || !row.expires_at) return false;

  const expiresAtMs = new Date(row.expires_at).getTime();
  if (Number.isNaN(expiresAtMs)) return true;

  return expiresAtMs <= Date.now();
}
|
||||
|
||||
|
||||
// returns the effective policy for a domain right now. expired entries
|
||||
// silently revert to "auto" so we re-probe — we dont mutate the row here
|
||||
// since reads happen on every fetch and writes are expensive
|
||||
// resolves the policy that applies to a url's domain right now.
//
// returns { domain, policy } — "auto" when the domain has no row. when the
// stored row has expired the result is { domain, policy: "auto",
// wasExpired: true, previous } where previous is the stored policy.
// read-only: the stored row is never modified here
function getEffectivePolicy(url) {
  const domain = getDomain(url);
  const stored = loadRow(domain);

  if (stored && isExpired(stored)) {
    return { domain, policy: "auto", wasExpired: true, previous: stored.policy };
  }

  return { domain, policy: stored ? stored.policy : "auto" };
}
|
||||
|
||||
|
||||
// upserts the policy row for a domain, merging `updates` over the stored row
// (or over all-zero "auto" defaults when the domain has no row yet).
// a field left out of `updates`, or set to null/undefined, keeps its current
// value — except expires_at, where an explicit null clears the expiry and only
// undefined means "keep"
function writeRow(domain, updates) {
  const current = loadRow(domain) || {
    policy: "auto",
    consecutive_plain_failures: 0,
    consecutive_browser_failures: 0,
    plain_success_count: 0,
    browser_success_count: 0,
    expires_at: null,
  };

  const pick = (field) => updates[field] ?? current[field];
  const expiresAt = updates.expires_at === undefined ? current.expires_at : updates.expires_at;

  upsertPolicy.run(
    domain,
    pick("policy"),
    pick("consecutive_plain_failures"),
    pick("consecutive_browser_failures"),
    pick("plain_success_count"),
    pick("browser_success_count"),
    expiresAt
  );
}
|
||||
|
||||
|
||||
// records a successful plain-http fetch for the url's domain: clears the
// plain failure streak and bumps plain_success_count. no-op when the url has
// no parseable domain.
//
// the policy is reset to "auto" with no expiry, except when the domain is
// pinned to an unexpired "plain_only": that pin (and its expiry) is left in
// place, since a plain success is exactly what plain_only expects and should
// not re-enable browser escalation
function recordPlainSuccess(url) {
  const domain = getDomain(url);
  if (!domain) return;
  const existing = loadRow(domain);

  const updates = {
    consecutive_plain_failures: 0,
    plain_success_count: (existing?.plain_success_count || 0) + 1,
  };

  const pinnedPlainOnly = existing?.policy === "plain_only" && !isExpired(existing);
  if (!pinnedPlainOnly) {
    updates.policy = "auto";
    updates.expires_at = null;
  }

  writeRow(domain, updates);
}
|
||||
|
||||
// records a failed plain-http fetch for the url's domain by extending the
// consecutive plain failure streak. no-op when the url has no parseable domain.
//
// once the streak reaches PLAIN_FAILURE_THRESHOLD the domain is switched to
// "browser_only" for BROWSER_ONLY_TTL_MS. a domain pinned to an unexpired
// "plain_only" is never switched — only its streak is updated — because that
// policy explicitly forbids escalating to the browser
function recordPlainFailure(url) {
  const domain = getDomain(url);
  if (!domain) return;
  const existing = loadRow(domain);

  const failures = (existing?.consecutive_plain_failures || 0) + 1;
  const pinnedPlainOnly = existing?.policy === "plain_only" && !isExpired(existing);

  if (!pinnedPlainOnly && failures >= PLAIN_FAILURE_THRESHOLD) {
    writeRow(domain, {
      policy: "browser_only",
      consecutive_plain_failures: failures,
      expires_at: new Date(Date.now() + BROWSER_ONLY_TTL_MS).toISOString(),
    });
    return;
  }

  writeRow(domain, {
    consecutive_plain_failures: failures,
  });
}
|
||||
|
||||
// records a successful browser fetch for the url's domain: clears the browser
// failure streak and bumps browser_success_count. no-op when the url has no
// parseable domain.
// policy, expiry and the plain failure streak are deliberately left as stored —
// a browser success says nothing about whether plain fetch works again
function recordBrowserSuccess(url) {
  const domain = getDomain(url);
  if (!domain) {
    return;
  }

  const previous = loadRow(domain);
  const successes = (previous?.browser_success_count || 0) + 1;

  writeRow(domain, {
    consecutive_browser_failures: 0,
    browser_success_count: successes,
  });
}
|
||||
|
||||
// records a failed browser fetch for the url's domain by extending the
// consecutive browser failure streak. once the streak reaches
// BROWSER_FAILURE_THRESHOLD the domain is marked "blocked" for BLOCKED_TTL_MS.
// no-op when the url has no parseable domain
function recordBrowserFailure(url) {
  const domain = getDomain(url);
  if (!domain) {
    return;
  }

  const previous = loadRow(domain);
  const streak = (previous?.consecutive_browser_failures || 0) + 1;

  const updates = { consecutive_browser_failures: streak };
  if (streak >= BROWSER_FAILURE_THRESHOLD) {
    updates.policy = "blocked";
    updates.expires_at = new Date(Date.now() + BLOCKED_TTL_MS).toISOString();
  }

  writeRow(domain, updates);
}
|
||||
|
||||
|
||||
module.exports = {
|
||||
getEffectivePolicy,
|
||||
recordPlainSuccess,
|
||||
recordPlainFailure,
|
||||
recordBrowserSuccess,
|
||||
recordBrowserFailure,
|
||||
};
|
||||
Loading…
Add table
Reference in a new issue