436 lines
13 KiB
JavaScript
436 lines
13 KiB
JavaScript
const { extractFromHtml } = require("@extractus/article-extractor");
|
|
const db = require("./db");
|
|
const config = require("./config");
|
|
const { fetchWithPolicy } = require("./http");
|
|
const { getSharedBrowserSession } = require("./sources/browserCrawler");
|
|
const { validateExtractedArticle } = require("./contentValidation");
|
|
const {
|
|
getEffectivePolicy,
|
|
recordPlainSuccess,
|
|
recordPlainFailure,
|
|
recordBrowserSuccess,
|
|
recordBrowserFailure,
|
|
} = require("./domainPolicy");
|
|
|
|
|
|
// upper bound on how many characters of a plain-fetched page are kept
const MAX_PLAIN_HTML_LENGTH = 1_500_000;

// per-strategy timeouts handed to the fetch helpers (presumably milliseconds —
// the unit is defined by fetchWithPolicy / the browser session, not here)
const PLAIN_FETCH_TIMEOUT = 12_000;
const BROWSER_FETCH_TIMEOUT = 20_000;
const HEAD_PRECHECK_TIMEOUT = 6_000;

// retry delays in milliseconds: 24h after a retryable validation failure,
// 6h after a transient (408/429/5xx) fetch failure or a blocked domain
const VALIDATION_RETRY_AFTER_MS = 86_400_000;
const TRANSIENT_RETRY_AFTER_MS = 21_600_000;

// once this many attempts have been recorded a retryable failure turns terminal
const MAX_TERMINAL_ATTEMPTS = 3;

// flaky domains get a HEAD precheck before a body fetch is spent on them. it
// only kicks in once the domain's recorded consecutive failures reach this
// total; domains without failure history skip the extra round trip
const HEAD_PRECHECK_FAILURE_THRESHOLD = 2;
|
|
|
|
|
|
// success path: stores the extracted text, flips the row to 'ready' and clears
// any previous error / retry deadline. params: content, attempted-at, id
const updateArticleAssets = db.prepare(`
  UPDATE articles
  SET content = ?, content_status = 'ready', content_error = NULL,
      content_attempted_at = ?, content_attempt_count = content_attempt_count + 1,
      content_retry_after = NULL
  WHERE id = ?
`);

// overwrites title and description. params: title, description, id
const updateArticleTitleDescription = db.prepare(`
  UPDATE articles
  SET title = ?, description = ?
  WHERE id = ?
`);

// marks a row 'skipped' with a reason and no retry deadline. not referenced by
// the other definitions visible in this file. params: error, attempted-at, id
const markContentSkipped = db.prepare(`
  UPDATE articles
  SET content_status = 'skipped', content_error = ?, content_attempted_at = ?,
      content_attempt_count = content_attempt_count + 1, content_retry_after = NULL
  WHERE id = ?
`);

// terminal failure: the row leaves the pending queue because the selection
// queries only pick NULL / 'pending' statuses. params: error, attempted-at, id
const markContentFailed = db.prepare(`
  UPDATE articles
  SET content_status = 'failed', content_error = ?, content_attempted_at = ?,
      content_attempt_count = content_attempt_count + 1, content_retry_after = NULL
  WHERE id = ?
`);

// deferred retry: keeps the row 'pending' but records the error and the
// earliest time it may be picked again. params: error, attempted-at,
// retry-after, id
const markContentPending = db.prepare(`
  UPDATE articles
  SET content_status = 'pending', content_error = ?, content_attempted_at = ?,
      content_attempt_count = content_attempt_count + 1, content_retry_after = ?
  WHERE id = ?
`);
|
|
|
|
// pulls one worker's partition of articles that still need content.
// params: workerCount, workerIndex, perSource
//  - (id % workerCount) = workerIndex partitions by article id, so workers
//    given distinct indexes never receive the same row
//  - ROW_NUMBER() per source caps each source at perSource rows (newest first)
//    so no single source dominates the batch
//  - content_retry_after is written by this module as an ISO-8601 string
//    ("2025-01-01T12:00:00.000Z") while datetime('now') yields
//    "2025-01-01 12:00:00". comparing the raw text would keep a deferred row
//    ineligible until the UTC date changes because 'T' sorts after ' ', so the
//    stored value is normalised through datetime() before comparing
const selectPartitionedArticlesMissingContent = db.prepare(`
  SELECT id, url, title, description
  FROM (
    SELECT id, url, title, description, source, pub_date_effective,
           ROW_NUMBER() OVER (PARTITION BY source ORDER BY pub_date_effective DESC, id DESC) AS rn
    FROM articles
    WHERE (content IS NULL OR TRIM(content) = '')
      AND (content_status IS NULL OR content_status = 'pending')
      AND (content_retry_after IS NULL OR datetime(content_retry_after) <= datetime('now'))
      AND (id % ?) = ?
  )
  WHERE rn <= ?
  ORDER BY pub_date_effective DESC, rn, source
`);
|
|
|
|
// reads how many content attempts have been recorded for one article.
// params: id. yields { attempts } or no row when the id is unknown
const selectAttemptCount = db.prepare(`
  SELECT content_attempt_count AS attempts
  FROM articles
  WHERE id = ?
`);
|
|
|
|
|
|
// counting semaphore used to cap plain and browser fetch occupancy. instances
// are created at module scope so their limits apply across all workers in the
// process rather than per worker.
//
// acquire() resolves once a slot is held; release() gives a slot back;
// inFlight() reports how many slots are currently held.
//
// when a caller is waiting, release() hands its slot straight to that waiter
// instead of decrementing the counter. the waiter only resumes in a later
// microtask, so decrementing first would let any acquire() issued in between
// grab the "free" slot as well and push the holder count past `limit`.
function makeSemaphore(limit) {
  let active = 0;
  const waiters = [];

  return {
    async acquire() {
      if (active < limit) {
        active += 1;
        return;
      }
      // the slot is already counted in `active` when the releaser wakes us,
      // so nothing is incremented after the wait
      await new Promise((resolve) => waiters.push(resolve));
    },

    release() {
      const next = waiters.shift();
      if (next) {
        // ownership transfer: `active` stays the same, only the holder changes
        next();
        return;
      }
      // clamp at zero so an unmatched release() cannot drive the count negative
      active = Math.max(0, active - 1);
    },

    inFlight() {
      return active;
    },
  };
}
|
|
|
|
// concurrency caps come from config.contentBackfill; a missing, zero or
// non-numeric setting falls back to the default on the right
const backfillSettings = config.contentBackfill;
const PLAIN_CONCURRENCY = Number(backfillSettings?.plainConcurrency) || 50;
const BROWSER_CONCURRENCY = Number(backfillSettings?.browserConcurrency) || 8;

// one semaphore per fetch strategy, created once at module scope
const plainSemaphore = makeSemaphore(PLAIN_CONCURRENCY);
const browserSemaphore = makeSemaphore(BROWSER_CONCURRENCY);
|
|
|
|
|
|
// best-effort HTTP status for a thrown error. prefers an integer `status`
// property; otherwise scans the message for a standalone 401/403/404/408/429
// or 5xx number. returns null when neither yields a status
function getErrorStatus(error) {
  const explicitStatus = error?.status;
  if (Number.isInteger(explicitStatus)) {
    return explicitStatus;
  }

  const messageText = String(error?.message || "");
  const found = /\b(401|403|404|408|429|5\d\d)\b/.exec(messageText);
  if (found === null) {
    return null;
  }
  return Number(found[1]);
}
|
|
|
|
// current time as an ISO-8601 UTC string (Date.prototype.toISOString format)
function nowIso() {
  const current = new Date();
  return current.toISOString();
}
|
|
|
|
// ISO-8601 UTC string for the moment `ms` milliseconds from now. note the
// output uses the "T…Z" layout, not the "YYYY-MM-DD HH:MM:SS" layout that
// SQLite's datetime('now') produces
function futureIso(ms) {
  const target = new Date(Date.now() + ms);
  return target.toISOString();
}
|
|
|
|
|
|
// cheap HEAD request issued before pulling a body; the caller only uses it on
// domains with failure history. never throws: a failed request is reported as
// { status, error } where status is whatever getErrorStatus can recover
// (possibly null). a completed request yields { status, finalUrl }
async function headPrecheck(url) {
  const headOptions = {
    method: "HEAD",
    timeout: HEAD_PRECHECK_TIMEOUT,
    retries: 0,
  };

  try {
    const headResponse = await fetchWithPolicy(url, headOptions);
    const finalUrl = headResponse.url || url;
    return { status: headResponse.status, finalUrl };
  } catch (error) {
    const status = getErrorStatus(error);
    return { status, error };
  }
}
|
|
|
|
|
|
// plain HTTP fetch of a page, gated by the plain semaphore (always released,
// even on failure). resolves to { html, finalUrl } with html truncated to
// MAX_PLAIN_HTML_LENGTH characters. throws when the response is not ok (the
// error carries `status`) or when a content-type is present and mentions
// neither "html" nor "xml"
async function fetchPlainHtml(url) {
  await plainSemaphore.acquire();
  try {
    const requestOptions = { timeout: PLAIN_FETCH_TIMEOUT, retries: 1 };
    const response = await fetchWithPolicy(url, requestOptions);

    if (!response.ok) {
      const httpError = new Error(`plain fetch returned ${response.status}`);
      throw Object.assign(httpError, { status: response.status });
    }

    // a missing content-type header is tolerated; only an explicit non-markup
    // type is rejected
    const contentType = String(response.headers.get("content-type") || "").toLowerCase();
    const isMarkup = contentType.includes("html") || contentType.includes("xml");
    if (contentType !== "" && !isMarkup) {
      throw new Error(`plain fetch returned non-html content-type: ${contentType}`);
    }

    const body = await response.text();
    const html = body.slice(0, MAX_PLAIN_HTML_LENGTH);
    return { html, finalUrl: response.url || url };
  } finally {
    plainSemaphore.release();
  }
}
|
|
|
|
|
|
// rendered-page fetch through the shared browser session, gated by the
// browser semaphore (always released). resolves to { html, finalUrl } where
// finalUrl is simply the requested url — the session call does not report a
// post-redirect address here
async function fetchBrowserHtml(url) {
  await browserSemaphore.acquire();
  try {
    // a missing, zero or non-numeric config value falls back to 8 pages
    const pageLimit = Number(config.browser?.maxConcurrentPages) || 8;
    const sessionOptions = {
      requestTimeout: BROWSER_FETCH_TIMEOUT,
      maxConcurrentPages: pageLimit,
    };
    const browserSession = await getSharedBrowserSession(sessionOptions);

    const renderedHtml = await browserSession.fetchRenderedHtml(url, {
      timeout: BROWSER_FETCH_TIMEOUT,
    });
    return { html: renderedHtml, finalUrl: url };
  } finally {
    browserSemaphore.release();
  }
}
|
|
|
|
|
|
// reduces an HTML fragment to plain text: every <...> tag becomes a space,
// whitespace runs collapse to one space, and the ends are trimmed. returns
// null for non-string input or when nothing is left. entities are not decoded
function stripHtmlContent(value) {
  if (typeof value !== "string") {
    return null;
  }

  const withoutTags = value.replace(/<[^>]+>/g, " ");
  const collapsed = withoutTags.replace(/\s+/g, " ").trim();
  return collapsed === "" ? null : collapsed;
}
|
|
|
|
|
|
// runs one fetch strategy end to end: fetch → extract → strip tags → validate.
// never throws for fetch or extractor failures; always resolves to a result:
//   { ok: true,  article, html, finalUrl }
//   { ok: false, reason, error }                      fetcher threw
//   { ok: false, reason: "empty-html" }               fetcher produced no html
//   { ok: false, reason }                             extractor threw
//   { ok: false, reason, retryable, html, finalUrl }  validation rejected it
async function attemptFetch(url, fetcher) {
  let html;
  let finalUrl;
  try {
    const fetched = await fetcher(url);
    html = fetched.html;
    finalUrl = fetched.finalUrl;
  } catch (error) {
    const reason = `fetch-error:${error.message || "unknown"}`;
    return { ok: false, reason, error };
  }

  if (!html) {
    return { ok: false, reason: "empty-html" };
  }

  let rawArticle;
  try {
    // relative links are resolved against the final url when one is known
    rawArticle = await extractFromHtml(html, finalUrl || url);
  } catch (error) {
    return { ok: false, reason: `extractor-error:${error.message || "unknown"}` };
  }

  // keep a falsy extraction result as-is so validation can judge it
  const article = rawArticle
    ? { ...rawArticle, content: stripHtmlContent(rawArticle.content) }
    : rawArticle;

  const verdict = validateExtractedArticle({ article, html, finalUrl });
  if (verdict.ok) {
    return { ok: true, article, html, finalUrl };
  }

  return {
    ok: false,
    reason: verdict.reason,
    retryable: verdict.retryable,
    html,
    finalUrl,
  };
}
|
|
|
|
|
|
// number of content attempts recorded for an article; 0 when the row is
// missing or its counter is null/zero
function getAttemptCount(id) {
  const record = selectAttemptCount.get(id);
  return record?.attempts || 0;
}
|
|
|
|
|
|
// failure counters consulted by shouldPrecheck. the domain_fetch_policy row is
// read directly here rather than threading the lookup through the policy
// module. params: domain
const selectFailureCounts = db.prepare(`
  SELECT consecutive_plain_failures, consecutive_browser_failures
  FROM domain_fetch_policy
  WHERE domain = ?
`);
|
|
|
|
// true when the url's domain has a policy row whose plain and browser
// consecutive-failure counters add up to at least
// HEAD_PRECHECK_FAILURE_THRESHOLD. any problem along the way (unparseable
// url, lookup error) deliberately answers false so the precheck is skipped
function shouldPrecheck(url) {
  try {
    const { hostname } = new URL(url);
    const counters = selectFailureCounts.get(hostname.toLowerCase());
    if (!counters) {
      return false;
    }

    const totalFailures =
      counters.consecutive_plain_failures + counters.consecutive_browser_failures;
    return totalFailures >= HEAD_PRECHECK_FAILURE_THRESHOLD;
  } catch {
    return false;
  }
}
|
|
|
|
|
|
// fetches content for one article and records the outcome on its row.
//
// flow, in order:
//   1. policy "blocked"            → pending, retry after the transient delay
//   2. optional HEAD precheck      → 404/410/451 marks the row failed outright
//   3. plain fetch (policy "auto" or "plain_only")
//        success                   → commit
//        408 / 429 / 5xx           → pending, retry after the transient delay
//   4. policy "plain_only"         → validation-failure bookkeeping, stop
//   5. browser fetch               → same success / transient handling
//   6. anything else               → validation-failure bookkeeping
//
// any other policy value skips the plain step and goes straight to the
// browser. resolves to undefined; the effects are the database writes and the
// domain-policy counters
async function fetchAndStoreContent(id, url, storedTitle, storedDescription) {
  // statuses worth retrying later instead of escalating or giving up
  const isTransient = (status) =>
    status === 408 || status === 429 || Boolean(status && status >= 500);

  // keeps the row pending and schedules the next attempt
  const deferForRetry = (message) => {
    markContentPending.run(message, nowIso(), futureIso(TRANSIENT_RETRY_AFTER_MS), id);
  };

  const policy = getEffectivePolicy(url);
  const mode = policy.policy;

  if (mode === "blocked") {
    deferForRetry(`domain blocked by policy`);
    return;
  }

  // HEAD precheck for domains with failure history: a definitive 404/410/451
  // ends the attempt without spending a body fetch. every other outcome,
  // including a failed HEAD, falls through to the normal strategies
  if (shouldPrecheck(url)) {
    const head = await headPrecheck(url);
    if ([404, 410, 451].includes(head.status)) {
      markContentFailed.run(`head ${head.status}`, nowIso(), id);
      return;
    }
  }

  const plainAllowed = mode === "auto" || mode === "plain_only";
  let plainResult = null;

  if (plainAllowed) {
    plainResult = await attemptFetch(url, fetchPlainHtml);

    if (plainResult.ok) {
      recordPlainSuccess(url);
      commitArticle(id, url, plainResult, storedTitle, storedDescription);
      return;
    }

    recordPlainFailure(url);

    const plainStatus = plainResult.error && getErrorStatus(plainResult.error);
    if (isTransient(plainStatus)) {
      deferForRetry(`plain ${plainStatus}`);
      return;
    }
  }

  if (mode === "plain_only") {
    recordValidationFailure(id, plainResult);
    return;
  }

  const browserResult = await attemptFetch(url, fetchBrowserHtml);

  if (browserResult.ok) {
    recordBrowserSuccess(url);
    commitArticle(id, url, browserResult, storedTitle, storedDescription);
    return;
  }

  recordBrowserFailure(url);

  const browserStatus = browserResult.error && getErrorStatus(browserResult.error);
  if (isTransient(browserStatus)) {
    deferForRetry(`browser ${browserStatus}`);
    return;
  }

  recordValidationFailure(id, browserResult);
}
|
|
|
|
|
|
// bookkeeping for an attempt that produced no usable article. the row is
// marked failed when the result is explicitly non-retryable or when this
// attempt reaches MAX_TERMINAL_ATTEMPTS; otherwise it stays pending with a
// retry deadline VALIDATION_RETRY_AFTER_MS from now. a missing result or
// reason is recorded as "unknown" and treated as retryable
function recordValidationFailure(id, result) {
  const reason = result?.reason || "unknown";
  const explicitlyTerminal = result?.retryable === false;
  const priorAttempts = getAttemptCount(id);

  // the stored counter does not include the attempt being recorded now
  const attemptsExhausted = priorAttempts + 1 >= MAX_TERMINAL_ATTEMPTS;

  if (explicitlyTerminal || attemptsExhausted) {
    markContentFailed.run(reason, nowIso(), id);
  } else {
    markContentPending.run(reason, nowIso(), futureIso(VALIDATION_RETRY_AFTER_MS), id);
  }
}
|
|
|
|
|
|
// persists a successful fetch result. the content is always written and the
// row marked ready; title/description are only replaced when the stored title
// is itself a URL (starts with http:// or https://) and the scrape produced a
// non-empty title. `url` is accepted for call-site symmetry but is not used
function commitArticle(id, url, result, storedTitle, storedDescription) {
  const { article } = result;
  const content = article.content || null;

  const trimmedOrNull = (value) => (typeof value === "string" ? value.trim() : null);

  const storedTitleIsUrl = storedTitle && /^https?:\/\//i.test(storedTitle.trim());
  if (storedTitleIsUrl) {
    const scrapedTitle = trimmedOrNull(article.title);
    const scrapedDescription = trimmedOrNull(article.description);
    if (scrapedTitle) {
      // prefer the scraped description, then the stored one, then NULL
      const description = scrapedDescription || storedDescription || null;
      updateArticleTitleDescription.run(scrapedTitle, description, id);
    }
  }

  updateArticleAssets.run(content, nowIso(), id);

  // embedding generation is intentionally not started here — runEmbeddingLoop
  // in scheduler.js batches it in its own pipeline, which keeps slow
  // openrouter calls from limiting content fetch throughput
}
|
|
|
|
|
|
// runs one worker pass: pulls this worker's partition slice and processes it
// in chunks of `batchSize` concurrent fetchAndStoreContent calls. workers
// share the plain/browser semaphores, so total fetch concurrency stays
// bounded regardless of worker count.
//
// resolves to the number of rows selected for this pass (0 when there was
// nothing to do). per-article failures are logged and never abort the pass.
//
// batchSize is normalised to a positive integer. previously a missing or NaN
// value made the loop exit after dispatching nothing while still reporting
// rows.length, and 0 or a negative value never advanced the loop index.
async function runBackfillWorker({ workerIndex, workerCount, perSource, batchSize }) {
  const rows = selectPartitionedArticlesMissingContent.all(workerCount, workerIndex, perSource);
  if (rows.length === 0) return 0;

  // fallback mirrors the default concurrency of backfillMissingContent
  const fallbackChunkSize = 50;
  const requestedChunkSize = Math.floor(Number(batchSize));
  const chunkSize =
    Number.isFinite(requestedChunkSize) && requestedChunkSize > 0
      ? requestedChunkSize
      : fallbackChunkSize;

  // dispatch in chunks so thousands of unawaited promises are never allocated
  // at once; the semaphores do the fine-grained throttling further down
  for (let start = 0; start < rows.length; start += chunkSize) {
    const chunk = rows.slice(start, start + chunkSize);
    await Promise.all(
      chunk.map((row) =>
        fetchAndStoreContent(row.id, row.url, row.title, row.description).catch((error) => {
          // optional chaining: a rejection value of null/undefined must not
          // throw inside the handler and abort the whole chunk
          console.error(`backfill worker ${workerIndex} failed on ${row.url}:`, error?.message || error);
        })
      )
    );
  }

  return rows.length;
}
|
|
|
|
|
|
// prepared lazily on first use and then reused, instead of re-preparing the
// same statement on every call
let pendingContentProbe = null;

// true when at least one article is eligible for a content attempt right now:
// no content yet, status NULL or 'pending', and no retry deadline in the
// future. unlike the worker query this is not partitioned by worker.
//
// content_retry_after is stored as an ISO-8601 string ("…T…Z") whereas
// datetime('now') yields "YYYY-MM-DD HH:MM:SS"; comparing the raw text keeps
// a deferred row ineligible until the UTC date changes because 'T' sorts
// after ' '. the stored value is therefore normalised through datetime()
function hasPendingContent() {
  pendingContentProbe ??= db.prepare(`
    SELECT 1 FROM articles
    WHERE (content IS NULL OR TRIM(content) = '')
      AND (content_status IS NULL OR content_status = 'pending')
      AND (content_retry_after IS NULL OR datetime(content_retry_after) <= datetime('now'))
    LIMIT 1
  `);
  return Boolean(pendingContentProbe.get());
}
|
|
|
|
|
|
// kept for backwards compat with scheduler/runAllIngestions one-shot runs:
// performs a single un-partitioned worker pass (worker 0 of 1) where
// `concurrency` becomes the worker's chunk size. resolves to undefined
async function backfillMissingContent(perSource = 50, concurrency = 50) {
  const singleWorkerPass = {
    workerIndex: 0,
    workerCount: 1,
    perSource,
    batchSize: concurrency,
  };
  await runBackfillWorker(singleWorkerPass);
}
|
|
|
|
|
|
module.exports = {
|
|
fetchAndStoreContent,
|
|
backfillMissingContent,
|
|
runBackfillWorker,
|
|
hasPendingContent,
|
|
};
|