diff --git a/config.json b/config.json index f1a50f5..c9e50bb 100644 --- a/config.json +++ b/config.json @@ -42,7 +42,8 @@ "googleNews": "0 * * * *" }, "contentBackfill": { - "concurrency": 0 + "concurrency": 10, + "perSource": 50 }, "browser": { "maxConcurrentPages": 25 diff --git a/src/content.js b/src/content.js index 71c04ad..794254a 100644 --- a/src/content.js +++ b/src/content.js @@ -31,20 +31,17 @@ const markContentPending = db.prepare(` SET content_status = NULL, content_error = NULL, content_attempted_at = ? WHERE id = ? `); -const selectAllArticlesMissingContent = db.prepare(` +const selectRoundRobinArticlesMissingContent = db.prepare(` SELECT id, url, title, description - FROM articles - WHERE (content IS NULL OR TRIM(content) = '') - AND (content_status IS NULL OR content_status = 'pending') - ORDER BY ingested_at DESC, id DESC -`); -const selectArticlesMissingContent = db.prepare(` - SELECT id, url, title, description - FROM articles - WHERE (content IS NULL OR TRIM(content) = '') - AND (content_status IS NULL OR content_status = 'pending') - ORDER BY ingested_at DESC, id DESC - LIMIT ? + FROM ( + SELECT id, url, title, description, source, + ROW_NUMBER() OVER (PARTITION BY source ORDER BY ingested_at DESC, id DESC) AS rn + FROM articles + WHERE (content IS NULL OR TRIM(content) = '') + AND (content_status IS NULL OR content_status = 'pending') + ) + WHERE rn <= ? + ORDER BY rn, source `); const loggedBlockedDomains = new Set(); @@ -188,7 +185,7 @@ async function fetchAndStoreContent(id, url, storedTitle, storedDescription) { } } -async function backfillMissingContent(limit = 100, concurrency = 5) { +async function backfillMissingContent(perSource = 50, concurrency = 5) { if (contentBackfillRunning) { return; } @@ -196,9 +193,7 @@ async function backfillMissingContent(limit = 100, concurrency = 5) { contentBackfillRunning = true; try { - const rows = limit === -1 - ? selectAllArticlesMissingContent.all() - : selectArticlesMissingContent.all(limit); + const rows = selectRoundRobinArticlesMissingContent.all(perSource); for (let i = 0; i < rows.length; i += concurrency) { const batch = rows.slice(i, i + concurrency);