migrate article embeddings to support multi-model architecture and enhance data integrity
This commit is contained in:
parent
a10c5eb39f
commit
b4b2fe2ac7
4 changed files with 686 additions and 104 deletions
344
src/content.js
344
src/content.js
|
|
@ -1,14 +1,37 @@
|
||||||
const { extractFromHtml } = require('@extractus/article-extractor');
|
const { extractFromHtml } = require("@extractus/article-extractor");
|
||||||
const sharp = require('sharp');
|
const sharp = require("sharp");
|
||||||
const db = require('./db');
|
const db = require("./db");
|
||||||
const config = require('./config');
|
const config = require("./config");
|
||||||
const { generateAndStoreEmbedding } = require('./embeddings');
|
const { generateAndStoreEmbedding } = require("./embeddings");
|
||||||
const { fetchWithPolicy } = require('./http');
|
const { fetchWithPolicy } = require("./http");
|
||||||
const { getSharedBrowserSession } = require('./sources/browserCrawler');
|
const { getSharedBrowserSession } = require("./sources/browserCrawler");
|
||||||
|
const { validateExtractedArticle } = require("./contentValidation");
|
||||||
|
const {
|
||||||
|
getEffectivePolicy,
|
||||||
|
recordPlainSuccess,
|
||||||
|
recordPlainFailure,
|
||||||
|
recordBrowserSuccess,
|
||||||
|
recordBrowserFailure,
|
||||||
|
} = require("./domainPolicy");
|
||||||
|
|
||||||
|
|
||||||
|
const MAX_PLAIN_HTML_LENGTH = 1_500_000;
|
||||||
|
const PLAIN_FETCH_TIMEOUT = 12000;
|
||||||
|
const BROWSER_FETCH_TIMEOUT = 20000;
|
||||||
|
|
||||||
|
// retry windows for failures that look transient (validation rejected the
|
||||||
|
// page, fetch timed out). genuinely terminal failures (404, dead url) get
|
||||||
|
// a hard cap on attempt count instead
|
||||||
|
const VALIDATION_RETRY_AFTER_MS = 24 * 60 * 60 * 1000;
|
||||||
|
const TRANSIENT_RETRY_AFTER_MS = 6 * 60 * 60 * 1000;
|
||||||
|
const MAX_TERMINAL_ATTEMPTS = 3;
|
||||||
|
|
||||||
|
|
||||||
const updateArticleAssets = db.prepare(`
|
const updateArticleAssets = db.prepare(`
|
||||||
UPDATE articles
|
UPDATE articles
|
||||||
SET content = ?, image = ?, content_status = 'ready', content_error = NULL, content_attempted_at = ?
|
SET content = ?, image = ?, content_status = 'ready', content_error = NULL,
|
||||||
|
content_attempted_at = ?, content_attempt_count = content_attempt_count + 1,
|
||||||
|
content_retry_after = NULL
|
||||||
WHERE id = ?
|
WHERE id = ?
|
||||||
`);
|
`);
|
||||||
const updateArticleTitleDescription = db.prepare(`
|
const updateArticleTitleDescription = db.prepare(`
|
||||||
|
|
@ -18,19 +41,25 @@ const updateArticleTitleDescription = db.prepare(`
|
||||||
`);
|
`);
|
||||||
const markContentSkipped = db.prepare(`
|
const markContentSkipped = db.prepare(`
|
||||||
UPDATE articles
|
UPDATE articles
|
||||||
SET content_status = 'skipped', content_error = ?, content_attempted_at = ?
|
SET content_status = 'skipped', content_error = ?, content_attempted_at = ?,
|
||||||
|
content_attempt_count = content_attempt_count + 1, content_retry_after = NULL
|
||||||
WHERE id = ?
|
WHERE id = ?
|
||||||
`);
|
`);
|
||||||
const markContentFailed = db.prepare(`
|
const markContentFailed = db.prepare(`
|
||||||
UPDATE articles
|
UPDATE articles
|
||||||
SET content_status = 'failed', content_error = ?, content_attempted_at = ?
|
SET content_status = 'failed', content_error = ?, content_attempted_at = ?,
|
||||||
|
content_attempt_count = content_attempt_count + 1, content_retry_after = NULL
|
||||||
WHERE id = ?
|
WHERE id = ?
|
||||||
`);
|
`);
|
||||||
const markContentPending = db.prepare(`
|
const markContentPending = db.prepare(`
|
||||||
UPDATE articles
|
UPDATE articles
|
||||||
SET content_status = NULL, content_error = NULL, content_attempted_at = ?
|
SET content_status = 'pending', content_error = ?, content_attempted_at = ?,
|
||||||
|
content_attempt_count = content_attempt_count + 1, content_retry_after = ?
|
||||||
WHERE id = ?
|
WHERE id = ?
|
||||||
`);
|
`);
|
||||||
|
|
||||||
|
// round-robin pull of articles needing content. respects content_retry_after so
|
||||||
|
// a freshly-rejected article doesnt get retried in the next loop iteration
|
||||||
const selectRoundRobinArticlesMissingContent = db.prepare(`
|
const selectRoundRobinArticlesMissingContent = db.prepare(`
|
||||||
SELECT id, url, title, description
|
SELECT id, url, title, description
|
||||||
FROM (
|
FROM (
|
||||||
|
|
@ -39,21 +68,18 @@ const selectRoundRobinArticlesMissingContent = db.prepare(`
|
||||||
FROM articles
|
FROM articles
|
||||||
WHERE (content IS NULL OR TRIM(content) = '')
|
WHERE (content IS NULL OR TRIM(content) = '')
|
||||||
AND (content_status IS NULL OR content_status = 'pending')
|
AND (content_status IS NULL OR content_status = 'pending')
|
||||||
|
AND (content_retry_after IS NULL OR content_retry_after <= datetime('now'))
|
||||||
)
|
)
|
||||||
WHERE rn <= ?
|
WHERE rn <= ?
|
||||||
ORDER BY rn, source
|
ORDER BY rn, source
|
||||||
`);
|
`);
|
||||||
|
|
||||||
const loggedBlockedDomains = new Set();
|
const selectAttemptCount = db.prepare(`
|
||||||
let contentBackfillRunning = false;
|
SELECT content_attempt_count AS attempts FROM articles WHERE id = ?
|
||||||
|
`);
|
||||||
|
|
||||||
function getHostname(url) {
|
|
||||||
try {
|
let contentBackfillRunning = false;
|
||||||
return new URL(url).hostname.toLowerCase();
|
|
||||||
} catch {
|
|
||||||
return '';
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
function getErrorStatus(error) {
|
function getErrorStatus(error) {
|
||||||
|
|
@ -61,38 +87,28 @@ function getErrorStatus(error) {
|
||||||
return error.status;
|
return error.status;
|
||||||
}
|
}
|
||||||
|
|
||||||
const match = String(error && error.message || '').match(/\b(401|403|404|408|429|5\d\d)\b/);
|
const match = String((error && error.message) || "").match(/\b(401|403|404|408|429|5\d\d)\b/);
|
||||||
return match ? Number(match[1]) : null;
|
return match ? Number(match[1]) : null;
|
||||||
}
|
}
|
||||||
|
|
||||||
function getErrorMessage(error, fallback) {
|
function getErrorMessage(error, fallback) {
|
||||||
const message = String(error && error.message || fallback || '').trim();
|
const message = String((error && error.message) || fallback || "").trim();
|
||||||
return message ? message.slice(0, 500) : null;
|
return message ? message.slice(0, 500) : null;
|
||||||
}
|
}
|
||||||
|
|
||||||
function markArticleStatus(statement, id, message) {
|
/**
 * Current wall-clock time as an ISO-8601 UTC string
 * (the format produced by Date.prototype.toISOString).
 *
 * @returns {string} timestamp for "now"
 */
function nowIso() {
  const stamp = new Date();
  return stamp.toISOString();
}
|
||||||
|
|
||||||
|
/**
 * Timestamp `ms` milliseconds from now, formatted for the
 * `content_retry_after` column.
 *
 * The value is compared in SQL with `content_retry_after <= datetime('now')`.
 * SQLite's datetime('now') yields "YYYY-MM-DD HH:MM:SS" (UTC), so the stored
 * text must use the same layout for the lexicographic comparison to be a
 * chronological one. A raw toISOString() value ("...T...sssZ") sorts after
 * any same-day datetime('now') because 'T' > ' ', which would hold retries
 * back until a later UTC midnight.
 *
 * @param {number} ms - offset from the current time, in milliseconds
 * @returns {string} UTC timestamp as "YYYY-MM-DD HH:MM:SS" (second precision)
 */
function futureIso(ms) {
  const iso = new Date(Date.now() + ms).toISOString();
  // drop the ".sssZ" suffix and swap the 'T' separator for a space
  return iso.slice(0, 19).replace("T", " ");
}
|
||||||
|
|
||||||
|
|
||||||
async function fetchCompressedImage(url) {
|
async function fetchCompressedImage(url) {
|
||||||
const response = await fetchWithPolicy(url, {
|
const response = await fetchWithPolicy(url, {
|
||||||
retries: 1,
|
retries: 1,
|
||||||
headers: {
|
headers: { Accept: "image/*" },
|
||||||
Accept: 'image/*',
|
|
||||||
},
|
|
||||||
});
|
});
|
||||||
|
|
||||||
if (!response.ok) {
|
if (!response.ok) {
|
||||||
|
|
@ -101,45 +117,226 @@ async function fetchCompressedImage(url) {
|
||||||
throw error;
|
throw error;
|
||||||
}
|
}
|
||||||
|
|
||||||
const contentType = String(response.headers.get('content-type') || '').toLowerCase();
|
const contentType = String(response.headers.get("content-type") || "").toLowerCase();
|
||||||
if (!contentType.startsWith('image/')) {
|
if (!contentType.startsWith("image/")) {
|
||||||
throw new Error(`image request returned ${contentType || 'unknown content-type'}`);
|
throw new Error(`image request returned ${contentType || "unknown content-type"}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
const input = Buffer.from(await response.arrayBuffer());
|
const input = Buffer.from(await response.arrayBuffer());
|
||||||
if (input.length === 0) {
|
if (input.length === 0) {
|
||||||
throw new Error('image request returned an empty body');
|
throw new Error("image request returned an empty body");
|
||||||
}
|
}
|
||||||
|
|
||||||
const output = await sharp(input)
|
const output = await sharp(input)
|
||||||
.rotate()
|
.rotate()
|
||||||
.resize({ width: 320, height: 320, fit: 'inside', withoutEnlargement: true })
|
.resize({ width: 320, height: 320, fit: "inside", withoutEnlargement: true })
|
||||||
.webp({ quality: 25 })
|
.webp({ quality: 25 })
|
||||||
.toBuffer();
|
.toBuffer();
|
||||||
|
|
||||||
return output.toString('base64');
|
return output.toString("base64");
|
||||||
}
|
}
|
||||||
|
|
||||||
async function fetchAndStoreContent(id, url, storedTitle, storedDescription) {
|
|
||||||
try {
|
// plain http fetch — no js execution. fast, low memory, but fails on
|
||||||
|
// js-rendered sites and gets blocked by cloudflare more often
|
||||||
|
/**
 * Fetches a page over plain HTTP (no browser) via fetchWithPolicy.
 *
 * @param {string} url - page to request
 * @returns {Promise<{html: string, finalUrl: string}>} body text capped at
 *   MAX_PLAIN_HTML_LENGTH characters, plus response.url (or the requested url
 *   when the response does not report one)
 * @throws {Error} on a non-ok response (error.status carries the status code)
 *   or when a content-type is present and mentions neither "html" nor "xml"
 */
async function fetchPlainHtml(url) {
  const requestOptions = { timeout: PLAIN_FETCH_TIMEOUT, retries: 1 };
  const response = await fetchWithPolicy(url, requestOptions);

  if (!response.ok) {
    throw Object.assign(new Error(`plain fetch returned ${response.status}`), {
      status: response.status,
    });
  }

  const contentType = String(response.headers.get("content-type") || "").toLowerCase();
  const looksLikeMarkup = contentType.includes("html") || contentType.includes("xml");
  // a missing content-type is tolerated; only an explicit non-markup type is rejected
  if (contentType && !looksLikeMarkup) {
    throw new Error(`plain fetch returned non-html content-type: ${contentType}`);
  }

  const body = await response.text();
  const html = body.slice(0, MAX_PLAIN_HTML_LENGTH);
  const finalUrl = response.url || url;
  return { html, finalUrl };
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Fetches a page through the shared browser session and returns its
 * rendered HTML.
 *
 * @param {string} url - page to render
 * @returns {Promise<{html: *, finalUrl: string}>} whatever fetchRenderedHtml
 *   resolves to, with finalUrl always set to the requested url
 */
async function fetchBrowserHtml(url) {
  // falls back to 25 when the config value is missing, zero or not numeric
  const pageLimit = Number(config.browser?.maxConcurrentPages) || 25;
  const sessionOptions = {
    requestTimeout: BROWSER_FETCH_TIMEOUT,
    maxConcurrentPages: pageLimit,
  };

  const session = await getSharedBrowserSession(sessionOptions);
  const rendered = await session.fetchRenderedHtml(url, { timeout: BROWSER_FETCH_TIMEOUT });

  return { html: rendered, finalUrl: url };
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Removes tag-like `<...>` runs from a string and collapses whitespace.
 *
 * @param {*} value - extracted article content
 * @returns {string|null} flattened text, or null when value is not a string
 *   or nothing is left after stripping
 */
function stripHtmlContent(value) {
  if (typeof value !== "string") {
    return null;
  }

  const withoutTags = value.replace(/<[^>]+>/g, " ");
  const collapsed = withoutTags.replace(/\s+/g, " ");
  const text = collapsed.trim();

  return text === "" ? null : text;
}
|
||||||
|
|
||||||
|
|
||||||
|
// runs fetch → extract → validate. returns { ok, article, html, finalUrl, reason }
|
||||||
|
// where article has been post-processed (content stripped of html). on failure,
|
||||||
|
// reason explains what tripped — used both for logging and for the per-domain
|
||||||
|
// policy update
|
||||||
|
/**
 * Runs one fetch -> extract -> validate pass using the supplied fetcher.
 *
 * @param {string} url - article url
 * @param {(url: string) => Promise<{html: *, finalUrl: *}>} fetcher - page loader
 * @returns {Promise<object>} on success `{ ok: true, article, html, finalUrl }`
 *   where article.content has been passed through stripHtmlContent; on failure
 *   `{ ok: false, reason, ... }` — a fetch failure also carries `error`, a
 *   validation failure carries `retryable`, `html` and `finalUrl`
 */
async function attemptFetch(url, fetcher) {
  let html;
  let finalUrl;

  try {
    const page = await fetcher(url);
    html = page.html;
    finalUrl = page.finalUrl;
  } catch (error) {
    const detail = error.message || "unknown";
    return { ok: false, reason: `fetch-error:${detail}`, error };
  }

  if (!html) {
    return { ok: false, reason: "empty-html" };
  }

  let article;
  try {
    article = await extractFromHtml(html, finalUrl || url);
  } catch (error) {
    const detail = error.message || "unknown";
    return { ok: false, reason: `extractor-error:${detail}` };
  }

  // a null extraction is passed through untouched so the validator can report it
  if (article) {
    article = { ...article, content: stripHtmlContent(article.content) };
  }

  const verdict = validateExtractedArticle({ article, html, finalUrl });

  return verdict.ok
    ? { ok: true, article, html, finalUrl }
    : { ok: false, reason: verdict.reason, retryable: verdict.retryable, html, finalUrl };
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Reads content_attempt_count for an article via selectAttemptCount.
 *
 * @param {*} id - article id
 * @returns {number} stored attempt count, or 0 when the row is missing or the
 *   stored value is falsy
 */
function getAttemptCount(id) {
  const record = selectAttemptCount.get(id);
  return record?.attempts || 0;
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Fetches, validates and stores the content for one article, choosing between
 * the plain and browser fetchers according to the domain's effective policy.
 *
 * Flow: a "blocked" policy defers the row without fetching; "auto" and
 * "plain_only" try the plain fetcher first; anything except "plain_only" may
 * then fall through to the browser fetcher. 408/429/5xx fetch errors defer the
 * row for TRANSIENT_RETRY_AFTER_MS; every other failure is handed to
 * recordValidationFailure.
 *
 * @param {*} id - article id
 * @param {string} url - article url
 * @param {string} storedTitle - title currently stored for the article
 * @param {string} storedDescription - description currently stored
 * @returns {Promise<void>}
 */
async function fetchAndStoreContent(id, url, storedTitle, storedDescription) {
  const { policy: mode } = getEffectivePolicy(url);

  // 408, 429 and 5xx are treated as "try again later"
  const isTransientStatus = (code) => code === 408 || code === 429 || Boolean(code && code >= 500);
  const failureStatus = (outcome) => outcome.error && getErrorStatus(outcome.error);
  const deferForRetry = (note) => {
    markContentPending.run(note, nowIso(), futureIso(TRANSIENT_RETRY_AFTER_MS), id);
  };

  // known-blocked domain: no fetch at all, the row stays pending until the
  // retry window passes
  if (mode === "blocked") {
    deferForRetry(`domain blocked by policy`);
    return;
  }

  if (mode === "auto" || mode === "plain_only") {
    const plainResult = await attemptFetch(url, fetchPlainHtml);

    if (plainResult.ok) {
      recordPlainSuccess(url);
      await commitArticle(id, url, plainResult, storedTitle, storedDescription);
      return;
    }

    recordPlainFailure(url);

    const plainStatus = failureStatus(plainResult);
    if (isTransientStatus(plainStatus)) {
      deferForRetry(`plain ${plainStatus}`);
      return;
    }

    // plain_only never escalates to the browser — settle the row on the plain result
    if (mode === "plain_only") {
      recordValidationFailure(id, plainResult);
      return;
    }
  }

  const browserResult = await attemptFetch(url, fetchBrowserHtml);

  if (browserResult.ok) {
    recordBrowserSuccess(url);
    await commitArticle(id, url, browserResult, storedTitle, storedDescription);
    return;
  }

  recordBrowserFailure(url);

  const browserStatus = failureStatus(browserResult);
  if (isTransientStatus(browserStatus)) {
    deferForRetry(`browser ${browserStatus}`);
    return;
  }

  // every available path failed: pending-with-retry vs terminal is decided
  // from the attempt count and the validator's retryable flag
  recordValidationFailure(id, browserResult);
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Persists the outcome of a failed fetch attempt for an article.
 *
 * The row is marked failed when the result explicitly says
 * `retryable: false`, or when this attempt reaches MAX_TERMINAL_ATTEMPTS;
 * otherwise it is set back to pending with a retry time
 * VALIDATION_RETRY_AFTER_MS in the future. A result without a `retryable`
 * field counts as retryable.
 *
 * @param {*} id - article id
 * @param {{reason?: string, retryable?: boolean}|null|undefined} result
 */
function recordValidationFailure(id, result) {
  const reason = result?.reason || "unknown";
  const explicitlyTerminal = result?.retryable === false;
  // the stored count excludes the attempt being recorded now, hence the +1
  const outOfAttempts = getAttemptCount(id) + 1 >= MAX_TERMINAL_ATTEMPTS;

  if (explicitlyTerminal || outOfAttempts) {
    markContentFailed.run(reason, nowIso(), id);
  } else {
    markContentPending.run(reason, nowIso(), futureIso(VALIDATION_RETRY_AFTER_MS), id);
  }
}
|
||||||
|
|
||||||
|
|
||||||
|
async function commitArticle(id, url, result, storedTitle, storedDescription) {
|
||||||
|
const { article, finalUrl } = result;
|
||||||
|
const content = article.content || null;
|
||||||
|
|
||||||
|
// if stored title looks like a raw url, replace with extracted one
|
||||||
const titleLooksLikeUrl = storedTitle && /^https?:\/\//i.test(storedTitle.trim());
|
const titleLooksLikeUrl = storedTitle && /^https?:\/\//i.test(storedTitle.trim());
|
||||||
if (titleLooksLikeUrl) {
|
if (titleLooksLikeUrl) {
|
||||||
const scrapedTitle = typeof article.title === 'string' ? article.title.trim() : null;
|
const scrapedTitle = typeof article.title === "string" ? article.title.trim() : null;
|
||||||
const scrapedDescription = typeof article.description === 'string' ? article.description.trim() : null;
|
const scrapedDescription = typeof article.description === "string" ? article.description.trim() : null;
|
||||||
if (scrapedTitle) {
|
if (scrapedTitle) {
|
||||||
updateArticleTitleDescription.run(scrapedTitle, scrapedDescription || storedDescription || null, id);
|
updateArticleTitleDescription.run(scrapedTitle, scrapedDescription || storedDescription || null, id);
|
||||||
}
|
}
|
||||||
|
|
@ -154,37 +351,21 @@ async function fetchAndStoreContent(id, url, storedTitle, storedDescription) {
|
||||||
if (status === 401 || status === 403 || status === 404 || status === 429) {
|
if (status === 401 || status === 403 || status === 404 || status === 429) {
|
||||||
console.warn(`image fetch skipped for ${url}: upstream returned ${status}`);
|
console.warn(`image fetch skipped for ${url}: upstream returned ${status}`);
|
||||||
} else {
|
} else {
|
||||||
console.error(`image fetch failed for ${url}:`, error);
|
console.error(`image fetch failed for ${url}:`, error.message || error);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!content && !image) {
|
updateArticleAssets.run(content, image, nowIso(), id);
|
||||||
markArticleStatus(markContentSkipped, id, 'article had no extractable content or image');
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
updateArticleAssets.run(content, image, new Date().toISOString(), id);
|
try {
|
||||||
await generateAndStoreEmbedding(id);
|
await generateAndStoreEmbedding(id);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
const status = getErrorStatus(error);
|
console.error(`embedding failed for article ${id}:`, error.message || error);
|
||||||
if (status === 401 || status === 403 || status === 404) {
|
|
||||||
console.warn(`content fetch skipped for ${url}: upstream returned ${status}`);
|
|
||||||
markArticleStatus(markContentSkipped, id, `upstream returned ${status}`);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (status === 408 || status === 429 || (status && status >= 500)) {
|
|
||||||
console.warn(`content fetch deferred for ${url}: upstream returned ${status}`);
|
|
||||||
markArticleStatus(markContentPending, id, null);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
markArticleStatus(markContentFailed, id, getErrorMessage(error, 'content fetch failed'));
|
|
||||||
console.error(`content fetch failed for ${url}:`, error);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
async function backfillMissingContent(perSource = 50, concurrency = 5) {
|
async function backfillMissingContent(perSource = 50, concurrency = 5) {
|
||||||
if (contentBackfillRunning) {
|
if (contentBackfillRunning) {
|
||||||
return;
|
return;
|
||||||
|
|
@ -204,15 +385,18 @@ async function backfillMissingContent(perSource = 50, concurrency = 5) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
function hasPendingContent() {
|
/**
 * Reports whether at least one article still needs content: empty or NULL
 * content, status NULL or 'pending', and a retry time that is unset or not
 * after datetime('now').
 *
 * @returns {boolean} true when such a row exists
 */
function hasPendingContent() {
  const probe = db.prepare(`
    SELECT 1 FROM articles
    WHERE (content IS NULL OR TRIM(content) = '')
      AND (content_status IS NULL OR content_status = 'pending')
      AND (content_retry_after IS NULL OR content_retry_after <= datetime('now'))
    LIMIT 1
  `);
  const hit = probe.get();
  return Boolean(hit);
}
|
||||||
|
|
||||||
|
|
||||||
module.exports = {
|
module.exports = {
|
||||||
fetchAndStoreContent,
|
fetchAndStoreContent,
|
||||||
backfillMissingContent,
|
backfillMissingContent,
|
||||||
|
|
|
||||||
199
src/contentValidation.js
Normal file
199
src/contentValidation.js
Normal file
|
|
@ -0,0 +1,199 @@
|
||||||
|
// validates whether an extracted article is real content vs a soft-error page
|
||||||
|
// (cookie wall, cloudflare challenge, paywall, "enable javascript", etc).
|
||||||
|
//
|
||||||
|
// the rules are deliberately conservative. we'd rather let a few junk pages
|
||||||
|
// through (caught downstream when re-checked) than reject 5% of real articles.
|
||||||
|
// fingerprints are anchored to title or the first ~800 chars of body so an
|
||||||
|
// article that *mentions* cloudflare doesnt get falsely rejected.
|
||||||
|
|
||||||
|
const MIN_CONTENT_LENGTH = 400;
|
||||||
|
const MIN_SENTENCE_TERMINATORS = 3;
|
||||||
|
const BODY_SNIFF_LENGTH = 800;
|
||||||
|
|
||||||
|
|
||||||
|
// titles that ONLY appear on error/challenge pages — never on real articles.
|
||||||
|
// match is case-insensitive, exact-or-prefix only (not substring) to avoid
|
||||||
|
// false positives like a real article titled "404 reasons your startup failed"
|
||||||
|
const TITLE_BLOCKLIST = [
|
||||||
|
"just a moment",
|
||||||
|
"just a moment...",
|
||||||
|
"attention required! | cloudflare",
|
||||||
|
"attention required!",
|
||||||
|
"access denied",
|
||||||
|
"access to this page has been denied",
|
||||||
|
"you have been blocked",
|
||||||
|
"are you a robot",
|
||||||
|
"are you a robot?",
|
||||||
|
"verify you are human",
|
||||||
|
"please verify you are a human",
|
||||||
|
"page not found",
|
||||||
|
"404 not found",
|
||||||
|
"404 page not found",
|
||||||
|
"403 forbidden",
|
||||||
|
"503 service unavailable",
|
||||||
|
"this page isn't available",
|
||||||
|
"this page isn’t available",
|
||||||
|
"site temporarily unavailable",
|
||||||
|
"request unsuccessful",
|
||||||
|
];
|
||||||
|
|
||||||
|
|
||||||
|
// substrings to look for in the raw html head/early body that indicate a
|
||||||
|
// cloudflare/akamai/imperva interstitial. these are infrastructure markers
|
||||||
|
// the real site never serves
|
||||||
|
const CHALLENGE_MARKERS = [
|
||||||
|
"cf-chl-bypass",
|
||||||
|
"__cf_chl_",
|
||||||
|
"cf_chl_opt",
|
||||||
|
"/cdn-cgi/challenge-platform",
|
||||||
|
"_incapsula_resource",
|
||||||
|
"incap_ses_",
|
||||||
|
"x-iinfo",
|
||||||
|
"akamai-bm-telemetry",
|
||||||
|
"ak_bmsc",
|
||||||
|
"distil_r_captcha",
|
||||||
|
];
|
||||||
|
|
||||||
|
|
||||||
|
// phrases at the very start of extracted body text that mean we got a stub.
|
||||||
|
// anchored to first ~800 chars so we dont false-flag articles that discuss
|
||||||
|
// these topics later in the body
|
||||||
|
const BODY_PREFIX_BLOCKLIST = [
|
||||||
|
"you need to enable javascript",
|
||||||
|
"please enable javascript",
|
||||||
|
"javascript is required",
|
||||||
|
"please enable cookies",
|
||||||
|
"cookies must be enabled",
|
||||||
|
"your browser will redirect",
|
||||||
|
"checking your browser before",
|
||||||
|
"this site requires javascript",
|
||||||
|
"please make sure your browser supports",
|
||||||
|
];
|
||||||
|
|
||||||
|
|
||||||
|
// final-url path suffixes that indicate the request was redirected to a
|
||||||
|
// generic error/login page. we only check the pathname so query strings dont
|
||||||
|
// throw it off
|
||||||
|
const ERROR_PATH_HINTS = [
|
||||||
|
"/404",
|
||||||
|
"/403",
|
||||||
|
"/error",
|
||||||
|
"/errors",
|
||||||
|
"/blocked",
|
||||||
|
"/captcha",
|
||||||
|
"/challenge",
|
||||||
|
"/access-denied",
|
||||||
|
"/account/login",
|
||||||
|
"/sign-in",
|
||||||
|
"/signin",
|
||||||
|
"/subscribe",
|
||||||
|
"/subscription",
|
||||||
|
];
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Prepares a value for case-insensitive matching.
 *
 * @param {*} value - anything; falsy values are treated as an empty string
 * @returns {string} the stringified value, trimmed and lower-cased
 */
function normalizeForMatch(value) {
  const text = value ? String(value) : "";
  const trimmed = text.trim();
  return trimmed.toLowerCase();
}
|
||||||
|
|
||||||
|
/**
 * Counts sentence endings in a piece of text.
 *
 * A terminator is `.`, `!` or `?` followed by whitespace or the end of the
 * string, so a decimal such as 3.14 is not counted.
 *
 * @param {*} text - text to scan; falsy values count as empty
 * @returns {number} number of terminators found
 */
function countSentenceTerminators(text) {
  const source = String(text || "");
  const hits = [...source.matchAll(/[.!?](?:\s|$)/g)];
  return hits.length;
}
|
||||||
|
|
||||||
|
/**
 * Checks whether a url's pathname matches one of ERROR_PATH_HINTS.
 *
 * A hint matches when the lower-cased pathname equals it, begins with the
 * hint followed by "/", or ends with it. Only the pathname is inspected.
 *
 * @param {*} finalUrl - url the request ended up on
 * @returns {boolean} true on a match; false for a missing or unparsable url
 */
function hasErrorPath(finalUrl) {
  if (!finalUrl) {
    return false;
  }

  try {
    const { pathname } = new URL(finalUrl);
    const path = pathname.toLowerCase();

    for (const hint of ERROR_PATH_HINTS) {
      if (path === hint || path.startsWith(`${hint}/`) || path.endsWith(hint)) {
        return true;
      }
    }
    return false;
  } catch {
    // an unparsable url gives no signal either way
    return false;
  }
}
|
||||||
|
|
||||||
|
/**
 * Looks for a CHALLENGE_MARKERS entry near the top of a raw html document.
 *
 * Only the first 50000 characters are searched, lower-cased.
 *
 * @param {*} html - raw page html
 * @returns {string|null} the first marker (in list order) that occurs, or
 *   null when html is falsy or nothing matches
 */
function hasChallengeMarker(html) {
  if (!html) {
    return null;
  }

  const head = String(html).slice(0, 50000).toLowerCase();
  const found = CHALLENGE_MARKERS.find((marker) => head.includes(marker));

  return found ?? null;
}
|
||||||
|
|
||||||
|
/**
 * Checks a title against TITLE_BLOCKLIST.
 *
 * An entry matches when the normalized title equals it, or starts with the
 * entry followed by a space or a "|". Substring hits elsewhere in the title
 * do not count.
 *
 * @param {*} title - extracted article title
 * @returns {string|null} the matching blocklist entry, or null
 */
function titleIsBlocked(title) {
  const candidate = normalizeForMatch(title);
  if (!candidate) {
    return null;
  }

  const matchesEntry = (entry) =>
    candidate === entry ||
    candidate.startsWith(`${entry} `) ||
    candidate.startsWith(`${entry}|`);

  return TITLE_BLOCKLIST.find(matchesEntry) ?? null;
}
|
||||||
|
|
||||||
|
/**
 * Checks the opening of the body text against BODY_PREFIX_BLOCKLIST.
 *
 * The content is normalized and cut to its first BODY_SNIFF_LENGTH
 * characters; a phrase matches when it occurs anywhere inside that window.
 *
 * @param {*} content - extracted article text
 * @returns {string|null} the first phrase (in list order) found, or null
 */
function bodyPrefixIsBlocked(content) {
  const window = normalizeForMatch(content).slice(0, BODY_SNIFF_LENGTH);
  if (!window) {
    return null;
  }

  const hit = BODY_PREFIX_BLOCKLIST.find((phrase) => window.includes(phrase));
  return hit ?? null;
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Decides whether an extraction result is a real article or a soft-error
 * page (challenge, block page, stub).
 *
 * Checks run in a fixed order and the first failure wins: missing article,
 * blocked title, error-looking final url, challenge marker in the html,
 * empty content, content shorter than MIN_CONTENT_LENGTH, blocked body
 * prefix, fewer than MIN_SENTENCE_TERMINATORS sentence endings.
 *
 * @param {object} input
 * @param {object|null} input.article - extractor output
 * @param {*} input.html - raw page html
 * @param {*} input.finalUrl - url the fetch ended on
 * @returns {{ok: true}|{ok: false, reason: string, retryable: boolean}}
 *   only a missing article is reported as non-retryable
 */
function validateExtractedArticle({ article, html, finalUrl }) {
  const reject = (reason, retryable = true) => ({ ok: false, reason, retryable });

  if (!article) {
    return reject("extractor-returned-null", false);
  }

  const body = typeof article.content === "string" ? article.content.trim() : "";
  const heading = typeof article.title === "string" ? article.title.trim() : "";

  // cheapest signal first: the title
  const titleHit = titleIsBlocked(heading);
  if (titleHit) {
    return reject(`title-blocklist:${titleHit}`);
  }

  if (hasErrorPath(finalUrl)) {
    return reject(`error-path:${finalUrl}`);
  }

  const markerHit = hasChallengeMarker(html);
  if (markerHit) {
    return reject(`challenge-marker:${markerHit}`);
  }

  if (!body) {
    return reject("no-content-extracted");
  }

  if (body.length < MIN_CONTENT_LENGTH) {
    return reject(`content-too-short:${body.length}`);
  }

  const prefixHit = bodyPrefixIsBlocked(body);
  if (prefixHit) {
    return reject(`body-prefix-blocklist:${prefixHit}`);
  }

  if (countSentenceTerminators(body) < MIN_SENTENCE_TERMINATORS) {
    return reject("too-few-sentences");
  }

  return { ok: true };
}
|
||||||
|
|
||||||
|
|
||||||
|
module.exports = {
|
||||||
|
validateExtractedArticle,
|
||||||
|
MIN_CONTENT_LENGTH,
|
||||||
|
};
|
||||||
18
src/db.js
18
src/db.js
|
|
@ -261,11 +261,29 @@ db.exec(`
|
||||||
);
|
);
|
||||||
`);
|
`);
|
||||||
|
|
||||||
|
// per-domain fetch policy — caches whether plain http or browser is needed
|
||||||
|
// so we dont waste a round trip on every article from a known js-only site.
|
||||||
|
// expires_at lets us re-probe domains that may have recovered
|
||||||
|
db.exec(`
|
||||||
|
CREATE TABLE IF NOT EXISTS domain_fetch_policy (
|
||||||
|
domain TEXT PRIMARY KEY,
|
||||||
|
policy TEXT NOT NULL DEFAULT 'auto',
|
||||||
|
consecutive_plain_failures INTEGER NOT NULL DEFAULT 0,
|
||||||
|
consecutive_browser_failures INTEGER NOT NULL DEFAULT 0,
|
||||||
|
plain_success_count INTEGER NOT NULL DEFAULT 0,
|
||||||
|
browser_success_count INTEGER NOT NULL DEFAULT 0,
|
||||||
|
expires_at TEXT,
|
||||||
|
updated_at TEXT NOT NULL DEFAULT (datetime('now'))
|
||||||
|
);
|
||||||
|
`);
|
||||||
|
|
||||||
for (const statement of [
|
for (const statement of [
|
||||||
'ALTER TABLE articles ADD COLUMN image TEXT',
|
'ALTER TABLE articles ADD COLUMN image TEXT',
|
||||||
'ALTER TABLE articles ADD COLUMN content_status TEXT',
|
'ALTER TABLE articles ADD COLUMN content_status TEXT',
|
||||||
'ALTER TABLE articles ADD COLUMN content_error TEXT',
|
'ALTER TABLE articles ADD COLUMN content_error TEXT',
|
||||||
'ALTER TABLE articles ADD COLUMN content_attempted_at TEXT',
|
'ALTER TABLE articles ADD COLUMN content_attempted_at TEXT',
|
||||||
|
'ALTER TABLE articles ADD COLUMN content_attempt_count INTEGER NOT NULL DEFAULT 0',
|
||||||
|
'ALTER TABLE articles ADD COLUMN content_retry_after TEXT',
|
||||||
'ALTER TABLE articles ADD COLUMN is_index_page INTEGER NOT NULL DEFAULT 0'
|
'ALTER TABLE articles ADD COLUMN is_index_page INTEGER NOT NULL DEFAULT 0'
|
||||||
]) {
|
]) {
|
||||||
try {
|
try {
|
||||||
|
|
|
||||||
181
src/domainPolicy.js
Normal file
181
src/domainPolicy.js
Normal file
|
|
@ -0,0 +1,181 @@
|
||||||
|
const db = require("./db");
|
||||||
|
|
||||||
|
|
||||||
|
// thresholds — kept in code rather than config because tuning these without
|
||||||
|
// understanding the consequences is a recipe for either a thundering herd
|
||||||
|
// against blocked domains or wasted plain-fetch attempts forever
|
||||||
|
const PLAIN_FAILURE_THRESHOLD = 5;
|
||||||
|
const BROWSER_FAILURE_THRESHOLD = 5;
|
||||||
|
const BROWSER_ONLY_TTL_MS = 7 * 24 * 60 * 60 * 1000;
|
||||||
|
const BLOCKED_TTL_MS = 24 * 60 * 60 * 1000;
|
||||||
|
|
||||||
|
|
||||||
|
const selectPolicy = db.prepare(`
|
||||||
|
SELECT domain, policy, consecutive_plain_failures, consecutive_browser_failures,
|
||||||
|
plain_success_count, browser_success_count, expires_at, updated_at
|
||||||
|
FROM domain_fetch_policy
|
||||||
|
WHERE domain = ?
|
||||||
|
`);
|
||||||
|
|
||||||
|
// prepared insert-or-replace of a domain's full policy row. on a domain
// conflict every column is overwritten with the supplied values, and
// updated_at is always stamped with the current time
const upsertPolicy = db.prepare(`
  INSERT INTO domain_fetch_policy (
    domain, policy, consecutive_plain_failures, consecutive_browser_failures,
    plain_success_count, browser_success_count, expires_at, updated_at
  ) VALUES (?, ?, ?, ?, ?, ?, ?, datetime('now'))
  ON CONFLICT(domain) DO UPDATE SET
    policy = excluded.policy,
    consecutive_plain_failures = excluded.consecutive_plain_failures,
    consecutive_browser_failures = excluded.consecutive_browser_failures,
    plain_success_count = excluded.plain_success_count,
    browser_success_count = excluded.browser_success_count,
    expires_at = excluded.expires_at,
    updated_at = datetime('now')
`);
|
||||||
|
|
||||||
|
|
||||||
|
// extracts the lowercased hostname from a url string. anything that the URL
// constructor rejects yields "" so callers can treat it as "no domain"
function getDomain(url) {
  let parsed;
  try {
    parsed = new URL(url);
  } catch {
    return "";
  }
  return parsed.hostname.toLowerCase();
}
|
||||||
|
|
||||||
|
// fetches the stored policy row for a domain. returns null both when the
// domain is empty and when no row exists
function loadRow(domain) {
  return domain ? selectPolicy.get(domain) || null : null;
}
|
||||||
|
|
||||||
|
// tells whether a policy row's expiry timestamp has passed.
//
// row: a policy row (or null/undefined) whose expires_at is a date string
// returns: false when there is no row or no expires_at (nothing to expire),
//          true once expires_at is at or before the current time
//
// an expires_at that cannot be parsed is treated as expired. comparing NaN
// against the clock is always false, which would otherwise pin the stored
// policy in place forever with no way for it to lapse
function isExpired(row) {
  if (!row || !row.expires_at) return false;

  const expiresAtMs = new Date(row.expires_at).getTime();
  if (Number.isNaN(expiresAtMs)) return true;

  return expiresAtMs <= Date.now();
}
|
||||||
|
|
||||||
|
|
||||||
|
// resolves the policy that applies to a url's domain right now.
//   - no stored row        -> { domain, policy: "auto" }
//   - stored row, expired  -> { domain, policy: "auto", wasExpired: true,
//                               previous: <stored policy> }
//   - stored row, live     -> { domain, policy: <stored policy> }
// an expired row is reported as "auto" so the domain gets re-probed, but the
// row itself is not rewritten here: this runs on every fetch and writes are
// expensive
function getEffectivePolicy(url) {
  const domain = getDomain(url);
  const stored = loadRow(domain);

  if (stored && isExpired(stored)) {
    return { domain, policy: "auto", wasExpired: true, previous: stored.policy };
  }

  return { domain, policy: stored ? stored.policy : "auto" };
}
|
||||||
|
|
||||||
|
|
||||||
|
// persists a domain's policy row, layering `updates` over what is stored.
// when the domain has no row yet, the base is an "auto" policy with zeroed
// counters and no expiry.
//
// domain:  hostname key of the row
// updates: partial row. a null/undefined policy or counter keeps the stored
//          value. expires_at keeps the stored value only when it is
//          undefined, so an explicit null clears the expiry
function writeRow(domain, updates) {
  const current = loadRow(domain) || {
    policy: "auto",
    consecutive_plain_failures: 0,
    consecutive_browser_failures: 0,
    plain_success_count: 0,
    browser_success_count: 0,
    expires_at: null,
  };

  // prefer the update's value, falling back to the stored one
  const pick = (column) => updates[column] ?? current[column];

  const expiresAt =
    updates.expires_at === undefined ? current.expires_at : updates.expires_at;

  upsertPolicy.run(
    domain,
    pick("policy"),
    pick("consecutive_plain_failures"),
    pick("consecutive_browser_failures"),
    pick("plain_success_count"),
    pick("browser_success_count"),
    expiresAt
  );
}
|
||||||
|
|
||||||
|
|
||||||
|
// notes a successful plain fetch for the url's domain: the policy goes back
// to "auto", the plain-failure streak and the expiry are cleared, and the
// plain success counter is bumped. urls without a parseable domain are ignored
function recordPlainSuccess(url) {
  const domain = getDomain(url);
  if (!domain) return;

  const stored = loadRow(domain);
  const previousSuccesses = stored?.plain_success_count || 0;

  writeRow(domain, {
    policy: "auto",
    consecutive_plain_failures: 0,
    plain_success_count: previousSuccesses + 1,
    expires_at: null,
  });
}
|
||||||
|
|
||||||
|
// notes a failed plain fetch for the url's domain by extending the
// consecutive plain-failure streak. once the streak reaches
// PLAIN_FAILURE_THRESHOLD the domain is switched to "browser_only" with an
// expiry BROWSER_ONLY_TTL_MS from now. urls without a parseable domain are
// ignored
function recordPlainFailure(url) {
  const domain = getDomain(url);
  if (!domain) return;

  const stored = loadRow(domain);
  const streak = (stored?.consecutive_plain_failures || 0) + 1;

  const updates = { consecutive_plain_failures: streak };
  if (streak >= PLAIN_FAILURE_THRESHOLD) {
    updates.policy = "browser_only";
    updates.expires_at = new Date(Date.now() + BROWSER_ONLY_TTL_MS).toISOString();
  }

  writeRow(domain, updates);
}
|
||||||
|
|
||||||
|
// notes a successful browser fetch for the url's domain: clears the
// browser-failure streak and bumps the browser success counter. urls without
// a parseable domain are ignored
function recordBrowserSuccess(url) {
  const domain = getDomain(url);
  if (!domain) return;

  const stored = loadRow(domain);
  const previousSuccesses = stored?.browser_success_count || 0;

  // deliberately leaves the plain-failure streak, the policy and the expiry
  // alone. a working browser path says nothing about plain fetch, which is
  // still considered broken until the stored expiry passes and plain gets
  // re-probed
  writeRow(domain, {
    consecutive_browser_failures: 0,
    browser_success_count: previousSuccesses + 1,
  });
}
|
||||||
|
|
||||||
|
// notes a failed browser fetch for the url's domain by extending the
// consecutive browser-failure streak. once the streak reaches
// BROWSER_FAILURE_THRESHOLD the domain is marked "blocked" with an expiry
// BLOCKED_TTL_MS from now. urls without a parseable domain are ignored
function recordBrowserFailure(url) {
  const domain = getDomain(url);
  if (!domain) return;

  const stored = loadRow(domain);
  const streak = (stored?.consecutive_browser_failures || 0) + 1;

  const updates = { consecutive_browser_failures: streak };
  if (streak >= BROWSER_FAILURE_THRESHOLD) {
    updates.policy = "blocked";
    updates.expires_at = new Date(Date.now() + BLOCKED_TTL_MS).toISOString();
  }

  writeRow(domain, updates);
}
|
||||||
|
|
||||||
|
|
||||||
|
module.exports = {
|
||||||
|
getEffectivePolicy,
|
||||||
|
recordPlainSuccess,
|
||||||
|
recordPlainFailure,
|
||||||
|
recordBrowserSuccess,
|
||||||
|
recordBrowserFailure,
|
||||||
|
};
|
||||||
Loading…
Add table
Reference in a new issue