Add Google News integration and enhance crawler capabilities

This commit is contained in:
ImBenji 2026-04-18 06:51:36 +01:00
parent 3211c4c29e
commit 432c43254b

View file

@ -148,13 +148,24 @@ async function fetchGdeltArticlesBigQuery(onWindow) {
const windows = buildWeeklyWindows(); const windows = buildWeeklyWindows();
const maxWindowsPerRun = Number(config.gdelt?.maxWindowsPerRun) || 0; const maxWindowsPerRun = Number(config.gdelt?.maxWindowsPerRun) || 0;
const requestDelayMs = Math.max(0, Number(config.gdelt?.requestDelayMs) || 0); const requestDelayMs = Math.max(0, Number(config.gdelt?.requestDelayMs) || 0);
const maxRequestsPerSession = Number(config.gdelt?.bigQueryMaxRequestsPerSession) || 650;
const bigquery = getBigQueryClient(); const bigquery = getBigQueryClient();
const allArticles = []; const allArticles = [];
let totalRequests = 0;
for (const source of getBackfillSources()) { for (const source of getBackfillSources()) {
if (totalRequests >= maxRequestsPerSession) {
console.warn(`GDELT BigQuery: session request limit (${maxRequestsPerSession}) reached, stopping`);
break;
}
let windowsFetched = 0; let windowsFetched = 0;
for (const window of windows) { for (const window of windows) {
if (totalRequests >= maxRequestsPerSession) {
break;
}
if (maxWindowsPerRun > 0 && windowsFetched >= maxWindowsPerRun) { if (maxWindowsPerRun > 0 && windowsFetched >= maxWindowsPerRun) {
break; break;
} }
@ -167,6 +178,7 @@ async function fetchGdeltArticlesBigQuery(onWindow) {
const windowArticles = await fetchWindowBigQuery(source, window, bigquery); const windowArticles = await fetchWindowBigQuery(source, window, bigquery);
markWindowCompleted(source.id, window); markWindowCompleted(source.id, window);
windowsFetched += 1; windowsFetched += 1;
totalRequests += 1;
if (onWindow && windowArticles.length > 0) { if (onWindow && windowArticles.length > 0) {
await onWindow(windowArticles); await onWindow(windowArticles);