refine article filtering to ensure only usable articles are returned

This commit is contained in:
ImBenji 2026-04-18 14:34:16 +01:00
parent 1027547b79
commit bfb52b1fd9
2 changed files with 14 additions and 18 deletions

View file

@@ -42,7 +42,8 @@
"googleNews": "0 * * * *"
},
"contentBackfill": {
"concurrency": 0
"concurrency": 10,
"perSource": 50
},
"browser": {
"maxConcurrentPages": 25

View file

@@ -31,20 +31,17 @@ const markContentPending = db.prepare(`
SET content_status = NULL, content_error = NULL, content_attempted_at = ?
WHERE id = ?
`);
const selectAllArticlesMissingContent = db.prepare(`
const selectRoundRobinArticlesMissingContent = db.prepare(`
SELECT id, url, title, description
FROM (
SELECT id, url, title, description, source,
ROW_NUMBER() OVER (PARTITION BY source ORDER BY ingested_at DESC, id DESC) AS rn
FROM articles
WHERE (content IS NULL OR TRIM(content) = '')
AND (content_status IS NULL OR content_status = 'pending')
ORDER BY ingested_at DESC, id DESC
`);
const selectArticlesMissingContent = db.prepare(`
SELECT id, url, title, description
FROM articles
WHERE (content IS NULL OR TRIM(content) = '')
AND (content_status IS NULL OR content_status = 'pending')
ORDER BY ingested_at DESC, id DESC
LIMIT ?
)
WHERE rn <= ?
ORDER BY rn, source
`);
const loggedBlockedDomains = new Set();
@@ -188,7 +185,7 @@ async function fetchAndStoreContent(id, url, storedTitle, storedDescription) {
}
}
async function backfillMissingContent(limit = 100, concurrency = 5) {
async function backfillMissingContent(perSource = 50, concurrency = 5) {
if (contentBackfillRunning) {
return;
}
@@ -196,9 +193,7 @@ async function backfillMissingContent(limit = 100, concurrency = 5) {
contentBackfillRunning = true;
try {
const rows = limit === -1
? selectAllArticlesMissingContent.all()
: selectArticlesMissingContent.all(limit);
const rows = selectRoundRobinArticlesMissingContent.all(perSource);
for (let i = 0; i < rows.length; i += concurrency) {
const batch = rows.slice(i, i + concurrency);