update scheduler configuration for news crawler and other sources

This commit is contained in:
ImBenji 2026-04-17 00:11:50 +01:00
parent ed108a1959
commit c80558fd50
3 changed files with 16 additions and 3 deletions

View file

@ -658,5 +658,9 @@
"edgar": "15 * * * *",
"alphaVantage": "20 * * * *",
"finnhub": "25 * * * *"
},
"contentBackfill": {
"cron": "0 * * * *",
"batchSize": -1
}
}

View file

@ -24,6 +24,13 @@ const markContentPending = db.prepare(`
SET content_status = NULL, content_error = NULL, content_attempted_at = ?
WHERE id = ?
`);
// Prepared statement returning (id, url) for every article that still needs
// its content fetched: `content` is NULL or blank after TRIM, and
// `content_status` is either NULL or 'pending'.
// Rows come back newest-ingested first, with `id DESC` as the tie-breaker.
// The query takes no bound parameters and has no LIMIT clause, so it returns
// every matching row; backfillMissingContent uses it when its limit is -1.
const selectAllArticlesMissingContent = db.prepare(`
SELECT id, url
FROM articles
WHERE (content IS NULL OR TRIM(content) = '')
AND (content_status IS NULL OR content_status = 'pending')
ORDER BY ingested_at DESC, id DESC
`);
const selectArticlesMissingContent = db.prepare(`
SELECT id, url
FROM articles
@ -203,7 +210,7 @@ async function fetchAndStoreContent(id, url) {
}
}
async function backfillMissingContent(limit = 10) {
async function backfillMissingContent(limit = config.contentBackfill.batchSize || 10) {
if (contentBackfillRunning) {
return;
}
@ -211,7 +218,9 @@ async function backfillMissingContent(limit = 10) {
contentBackfillRunning = true;
try {
const rows = selectArticlesMissingContent.all(limit);
const rows = limit === -1
? selectAllArticlesMissingContent.all()
: selectArticlesMissingContent.all(limit);
for (const row of rows) {
await fetchAndStoreContent(row.id, row.url);

View file

@ -82,7 +82,7 @@ function startScheduler() {
});
}
cron.schedule('0 * * * *', async () => {
cron.schedule(config.contentBackfill.cron, async () => {
try {
await backfillMissingContent();
} catch (error) {