Update scheduler configuration for news crawler and other sources (make the content backfill cron schedule and batch size configurable)
This commit is contained in:
parent
ed108a1959
commit
c80558fd50
3 changed files with 16 additions and 3 deletions
|
|
@ -658,5 +658,9 @@
|
||||||
"edgar": "15 * * * *",
|
"edgar": "15 * * * *",
|
||||||
"alphaVantage": "20 * * * *",
|
"alphaVantage": "20 * * * *",
|
||||||
"finnhub": "25 * * * *"
|
"finnhub": "25 * * * *"
|
||||||
|
},
|
||||||
|
"contentBackfill": {
|
||||||
|
"cron": "0 * * * *",
|
||||||
|
"batchSize": -1
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -24,6 +24,13 @@ const markContentPending = db.prepare(`
|
||||||
SET content_status = NULL, content_error = NULL, content_attempted_at = ?
|
SET content_status = NULL, content_error = NULL, content_attempted_at = ?
|
||||||
WHERE id = ?
|
WHERE id = ?
|
||||||
`);
|
`);
|
||||||
|
const selectAllArticlesMissingContent = db.prepare(`
|
||||||
|
SELECT id, url
|
||||||
|
FROM articles
|
||||||
|
WHERE (content IS NULL OR TRIM(content) = '')
|
||||||
|
AND (content_status IS NULL OR content_status = 'pending')
|
||||||
|
ORDER BY ingested_at DESC, id DESC
|
||||||
|
`);
|
||||||
const selectArticlesMissingContent = db.prepare(`
|
const selectArticlesMissingContent = db.prepare(`
|
||||||
SELECT id, url
|
SELECT id, url
|
||||||
FROM articles
|
FROM articles
|
||||||
|
|
@ -203,7 +210,7 @@ async function fetchAndStoreContent(id, url) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async function backfillMissingContent(limit = 10) {
|
async function backfillMissingContent(limit = config.contentBackfill.batchSize || 10) {
|
||||||
if (contentBackfillRunning) {
|
if (contentBackfillRunning) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
@ -211,7 +218,9 @@ async function backfillMissingContent(limit = 10) {
|
||||||
contentBackfillRunning = true;
|
contentBackfillRunning = true;
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const rows = selectArticlesMissingContent.all(limit);
|
const rows = limit === -1
|
||||||
|
? selectAllArticlesMissingContent.all()
|
||||||
|
: selectArticlesMissingContent.all(limit);
|
||||||
|
|
||||||
for (const row of rows) {
|
for (const row of rows) {
|
||||||
await fetchAndStoreContent(row.id, row.url);
|
await fetchAndStoreContent(row.id, row.url);
|
||||||
|
|
|
||||||
|
|
@ -82,7 +82,7 @@ function startScheduler() {
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
cron.schedule('0 * * * *', async () => {
|
cron.schedule(config.contentBackfill.cron, async () => {
|
||||||
try {
|
try {
|
||||||
await backfillMissingContent();
|
await backfillMissingContent();
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue