refine article filtering to ensure only usable articles are returned

This commit is contained in:
ImBenji 2026-04-18 14:34:16 +01:00
parent 1027547b79
commit bfb52b1fd9
2 changed files with 14 additions and 18 deletions

View file

@ -42,7 +42,8 @@
"googleNews": "0 * * * *" "googleNews": "0 * * * *"
}, },
"contentBackfill": { "contentBackfill": {
"concurrency": 0 "concurrency": 10,
"perSource": 50
}, },
"browser": { "browser": {
"maxConcurrentPages": 25 "maxConcurrentPages": 25

View file

@ -31,20 +31,17 @@ const markContentPending = db.prepare(`
SET content_status = NULL, content_error = NULL, content_attempted_at = ? SET content_status = NULL, content_error = NULL, content_attempted_at = ?
WHERE id = ? WHERE id = ?
`); `);
const selectAllArticlesMissingContent = db.prepare(` const selectRoundRobinArticlesMissingContent = db.prepare(`
SELECT id, url, title, description SELECT id, url, title, description
FROM articles FROM (
WHERE (content IS NULL OR TRIM(content) = '') SELECT id, url, title, description, source,
AND (content_status IS NULL OR content_status = 'pending') ROW_NUMBER() OVER (PARTITION BY source ORDER BY ingested_at DESC, id DESC) AS rn
ORDER BY ingested_at DESC, id DESC FROM articles
`); WHERE (content IS NULL OR TRIM(content) = '')
const selectArticlesMissingContent = db.prepare(` AND (content_status IS NULL OR content_status = 'pending')
SELECT id, url, title, description )
FROM articles WHERE rn <= ?
WHERE (content IS NULL OR TRIM(content) = '') ORDER BY rn, source
AND (content_status IS NULL OR content_status = 'pending')
ORDER BY ingested_at DESC, id DESC
LIMIT ?
`); `);
const loggedBlockedDomains = new Set(); const loggedBlockedDomains = new Set();
@ -188,7 +185,7 @@ async function fetchAndStoreContent(id, url, storedTitle, storedDescription) {
} }
} }
async function backfillMissingContent(limit = 100, concurrency = 5) { async function backfillMissingContent(perSource = 50, concurrency = 5) {
if (contentBackfillRunning) { if (contentBackfillRunning) {
return; return;
} }
@ -196,9 +193,7 @@ async function backfillMissingContent(limit = 100, concurrency = 5) {
contentBackfillRunning = true; contentBackfillRunning = true;
try { try {
const rows = limit === -1 const rows = selectRoundRobinArticlesMissingContent.all(perSource);
? selectAllArticlesMissingContent.all()
: selectArticlesMissingContent.all(limit);
for (let i = 0; i < rows.length; i += concurrency) { for (let i = 0; i < rows.length; i += concurrency) {
const batch = rows.slice(i, i + concurrency); const batch = rows.slice(i, i + concurrency);