refine article filtering to ensure only usable articles are returned
This commit is contained in:
parent
1027547b79
commit
bfb52b1fd9
2 changed files with 14 additions and 18 deletions
|
|
@ -42,7 +42,8 @@
|
||||||
"googleNews": "0 * * * *"
|
"googleNews": "0 * * * *"
|
||||||
},
|
},
|
||||||
"contentBackfill": {
|
"contentBackfill": {
|
||||||
"concurrency": 0
|
"concurrency": 10,
|
||||||
|
"perSource": 50
|
||||||
},
|
},
|
||||||
"browser": {
|
"browser": {
|
||||||
"maxConcurrentPages": 25
|
"maxConcurrentPages": 25
|
||||||
|
|
|
||||||
|
|
@ -31,20 +31,17 @@ const markContentPending = db.prepare(`
|
||||||
SET content_status = NULL, content_error = NULL, content_attempted_at = ?
|
SET content_status = NULL, content_error = NULL, content_attempted_at = ?
|
||||||
WHERE id = ?
|
WHERE id = ?
|
||||||
`);
|
`);
|
||||||
const selectAllArticlesMissingContent = db.prepare(`
|
const selectRoundRobinArticlesMissingContent = db.prepare(`
|
||||||
SELECT id, url, title, description
|
SELECT id, url, title, description
|
||||||
|
FROM (
|
||||||
|
SELECT id, url, title, description, source,
|
||||||
|
ROW_NUMBER() OVER (PARTITION BY source ORDER BY ingested_at DESC, id DESC) AS rn
|
||||||
FROM articles
|
FROM articles
|
||||||
WHERE (content IS NULL OR TRIM(content) = '')
|
WHERE (content IS NULL OR TRIM(content) = '')
|
||||||
AND (content_status IS NULL OR content_status = 'pending')
|
AND (content_status IS NULL OR content_status = 'pending')
|
||||||
ORDER BY ingested_at DESC, id DESC
|
)
|
||||||
`);
|
WHERE rn <= ?
|
||||||
const selectArticlesMissingContent = db.prepare(`
|
ORDER BY rn, source
|
||||||
SELECT id, url, title, description
|
|
||||||
FROM articles
|
|
||||||
WHERE (content IS NULL OR TRIM(content) = '')
|
|
||||||
AND (content_status IS NULL OR content_status = 'pending')
|
|
||||||
ORDER BY ingested_at DESC, id DESC
|
|
||||||
LIMIT ?
|
|
||||||
`);
|
`);
|
||||||
|
|
||||||
const loggedBlockedDomains = new Set();
|
const loggedBlockedDomains = new Set();
|
||||||
|
|
@ -188,7 +185,7 @@ async function fetchAndStoreContent(id, url, storedTitle, storedDescription) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async function backfillMissingContent(limit = 100, concurrency = 5) {
|
async function backfillMissingContent(perSource = 50, concurrency = 5) {
|
||||||
if (contentBackfillRunning) {
|
if (contentBackfillRunning) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
@ -196,9 +193,7 @@ async function backfillMissingContent(limit = 100, concurrency = 5) {
|
||||||
contentBackfillRunning = true;
|
contentBackfillRunning = true;
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const rows = limit === -1
|
const rows = selectRoundRobinArticlesMissingContent.all(perSource);
|
||||||
? selectAllArticlesMissingContent.all()
|
|
||||||
: selectArticlesMissingContent.all(limit);
|
|
||||||
|
|
||||||
for (let i = 0; i < rows.length; i += concurrency) {
|
for (let i = 0; i < rows.length; i += concurrency) {
|
||||||
const batch = rows.slice(i, i + concurrency);
|
const batch = rows.slice(i, i + concurrency);
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue