migrate article embeddings to support multi-model architecture and enhance data integrity
This commit is contained in:
parent
1e442df426
commit
a10c5eb39f
4 changed files with 43 additions and 14 deletions
|
|
@ -43,7 +43,7 @@
|
||||||
"googleNews": "0 * * * *"
|
"googleNews": "0 * * * *"
|
||||||
},
|
},
|
||||||
"contentBackfill": {
|
"contentBackfill": {
|
||||||
"concurrency": 10,
|
"concurrency": 100,
|
||||||
"perSource": 50
|
"perSource": 50
|
||||||
},
|
},
|
||||||
"browser": {
|
"browser": {
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,5 @@
|
||||||
const db = require('./db');
|
const db = require('./db');
|
||||||
const { normalizeTitle } = require('./dedup');
|
const { normalizeTitle } = require('./dedup');
|
||||||
const { fetchAndStoreContent } = require('./content');
|
|
||||||
const { markSourceRun } = require('./state');
|
const { markSourceRun } = require('./state');
|
||||||
|
|
||||||
const insertArticle = db.prepare(`
|
const insertArticle = db.prepare(`
|
||||||
|
|
@ -95,7 +94,10 @@ function ingestArticle(article) {
|
||||||
ingestedAt
|
ingestedAt
|
||||||
);
|
);
|
||||||
|
|
||||||
fetchAndStoreContent(result.lastInsertRowid, url, title, description);
|
// don't kick off the content fetch here — it used to be fire-and-forget, which
|
||||||
|
// pinned thousands of pending render promises in memory during big gdelt
|
||||||
|
// backfills. the runContentLoop polls for pending rows and handles them
|
||||||
|
// with proper concurrency limits
|
||||||
|
|
||||||
return { inserted: true, id: result.lastInsertRowid };
|
return { inserted: true, id: result.lastInsertRowid };
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
|
|
|
||||||
|
|
@ -28,7 +28,22 @@ async function runAllIngestions() {
|
||||||
const results = [];
|
const results = [];
|
||||||
|
|
||||||
results.push(await runSource('rss', fetchRssArticles));
|
results.push(await runSource('rss', fetchRssArticles));
|
||||||
results.push(await runSource('gdelt', fetchGdeltArticles));
|
|
||||||
|
// gdelt streams via callback so we don't accumulate every article in memory
|
||||||
|
let gdeltInserted = 0;
|
||||||
|
let gdeltTotal = 0;
|
||||||
|
try {
|
||||||
|
await fetchGdeltArticles(async (articles) => {
|
||||||
|
const result = await ingestBatch("gdelt", articles);
|
||||||
|
gdeltInserted += result.inserted;
|
||||||
|
gdeltTotal += result.total;
|
||||||
|
});
|
||||||
|
results.push({ source: "gdelt", inserted: gdeltInserted, total: gdeltTotal });
|
||||||
|
} catch (error) {
|
||||||
|
console.error("gdelt ingestion failed:", error);
|
||||||
|
results.push({ source: "gdelt", inserted: gdeltInserted, total: gdeltTotal, error: error.message });
|
||||||
|
}
|
||||||
|
|
||||||
results.push(await runSource('edgar', fetchEdgarArticles));
|
results.push(await runSource('edgar', fetchEdgarArticles));
|
||||||
results.push(await runSource('alphavantage', fetchAlphaVantageArticles));
|
results.push(await runSource('alphavantage', fetchAlphaVantageArticles));
|
||||||
results.push(await runSource('finnhub', fetchFinnhubArticles));
|
results.push(await runSource('finnhub', fetchFinnhubArticles));
|
||||||
|
|
@ -61,14 +76,15 @@ function startScheduler() {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
const isBigQuery = String(config.gdelt?.source || 'api').toLowerCase() === 'bigquery';
|
|
||||||
|
|
||||||
if (isBigQuery) {
|
// both api and bigquery paths now stream per-window so we never hold the full
|
||||||
|
// result set across all sources × 52 weeks in memory at once
|
||||||
|
try {
|
||||||
await fetchGdeltArticles(async (articles) => {
|
await fetchGdeltArticles(async (articles) => {
|
||||||
await ingestBatch('gdelt', articles);
|
await ingestBatch("gdelt", articles);
|
||||||
});
|
});
|
||||||
} else {
|
} catch (error) {
|
||||||
await runSource('gdelt', fetchGdeltArticles);
|
console.error("gdelt ingestion failed:", error);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
|
||||||
|
|
@ -203,8 +203,12 @@ async function fetchGdeltArticlesBigQuery(onWindow) {
|
||||||
return allArticles;
|
return allArticles;
|
||||||
}
|
}
|
||||||
|
|
||||||
async function fetchGdeltArticles() {
|
async function fetchGdeltArticles(onWindow) {
|
||||||
const articles = [];
|
// when onWindow is provided we stream per-window and never hold the full set in memory.
|
||||||
|
// the legacy non-streaming caller still gets an array back, but only for tiny one-shot
|
||||||
|
// runs — anything that loops over many sources should be using the callback path
|
||||||
|
const articles = onWindow ? null : [];
|
||||||
|
|
||||||
const windows = buildWeeklyWindows();
|
const windows = buildWeeklyWindows();
|
||||||
const requestDelayMs = Math.max(0, Number(config.gdelt?.requestDelayMs) || 5500);
|
const requestDelayMs = Math.max(0, Number(config.gdelt?.requestDelayMs) || 5500);
|
||||||
const maxWindowsPerRun = Number(config.gdelt?.maxWindowsPerRun) || 0;
|
const maxWindowsPerRun = Number(config.gdelt?.maxWindowsPerRun) || 0;
|
||||||
|
|
@ -223,9 +227,16 @@ async function fetchGdeltArticles() {
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const windowArticles = await fetchWindow(source, window);
|
const windowArticles = await fetchWindow(source, window);
|
||||||
articles.push(...windowArticles);
|
|
||||||
markWindowCompleted(source.id, window);
|
markWindowCompleted(source.id, window);
|
||||||
windowsFetched += 1;
|
windowsFetched += 1;
|
||||||
|
|
||||||
|
if (onWindow) {
|
||||||
|
if (windowArticles.length > 0) {
|
||||||
|
await onWindow(windowArticles);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
articles.push(...windowArticles);
|
||||||
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
if (error && error.status === 429) {
|
if (error && error.status === 429) {
|
||||||
console.warn(`GDELT window rate-limited for ${source.id} ${window.startKey}-${window.endKey}`);
|
console.warn(`GDELT window rate-limited for ${source.id} ${window.startKey}-${window.endKey}`);
|
||||||
|
|
@ -246,7 +257,7 @@ async function fetchGdeltArticles() {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return articles;
|
return articles || [];
|
||||||
}
|
}
|
||||||
|
|
||||||
function hasPendingWindows() {
|
function hasPendingWindows() {
|
||||||
|
|
@ -266,7 +277,7 @@ function fetchGdeltArticlesRouted(onWindow) {
|
||||||
if (source === 'bigquery') {
|
if (source === 'bigquery') {
|
||||||
return fetchGdeltArticlesBigQuery(onWindow);
|
return fetchGdeltArticlesBigQuery(onWindow);
|
||||||
}
|
}
|
||||||
return fetchGdeltArticles();
|
return fetchGdeltArticles(onWindow);
|
||||||
}
|
}
|
||||||
|
|
||||||
module.exports = {
|
module.exports = {
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue