add Google News integration and enhance crawler capabilities

2026-04-18 13:59:46 +01:00
parent 14727cdfe3
commit 830766bcfe
5 changed files with 77 additions and 24 deletions
@@ -171,7 +171,7 @@ async function fetchAndStoreContent(id, url) {
  }
 }

-async function backfillMissingContent(limit = 10) {
+async function backfillMissingContent(limit = 100, concurrency = 5) {
  if (contentBackfillRunning) {
    return;
  }
@@ -183,15 +183,26 @@ async function backfillMissingContent(limit = 10) {
      ? selectAllArticlesMissingContent.all()
      : selectArticlesMissingContent.all(limit);

-    for (const row of rows) {
-      await fetchAndStoreContent(row.id, row.url);
+    for (let i = 0; i < rows.length; i += concurrency) {
+      const batch = rows.slice(i, i + concurrency);
+      await Promise.all(batch.map((row) => fetchAndStoreContent(row.id, row.url)));
    }
  } finally {
    contentBackfillRunning = false;
  }
 }

+function hasPendingContent() {
+  return Boolean(db.prepare(`
+    SELECT 1 FROM articles
+    WHERE (content IS NULL OR TRIM(content) = '')
+      AND (content_status IS NULL OR content_status = 'pending')
+    LIMIT 1
+  `).get());
+}
+
 module.exports = {
  fetchAndStoreContent,
  backfillMissingContent,
+  hasPendingContent,
 };
@@ -127,6 +127,14 @@ db.exec(`
  );
 `);

+db.exec(`
+  CREATE TABLE IF NOT EXISTS article_embedding_meta (
+    article_id INTEGER PRIMARY KEY,
+    model TEXT NOT NULL,
+    embedded_at TEXT NOT NULL DEFAULT (datetime('now'))
+  );
+`);
+
 db.exec(`
  CREATE TABLE IF NOT EXISTS gdelt_backfill_windows (
    source_id TEXT NOT NULL,
@@ -19,6 +19,16 @@ const selectEmbeddingBuffer = db.prepare(`
  FROM article_embeddings
  WHERE article_id = ?
 `);
+const upsertEmbeddingMeta = db.prepare(`
+  INSERT INTO article_embedding_meta (article_id, model, embedded_at)
+  VALUES (?, ?, datetime('now'))
+  ON CONFLICT(article_id) DO UPDATE SET
+    model = excluded.model,
+    embedded_at = excluded.embedded_at
+`);
+const selectEmbeddingModel = db.prepare(`
+  SELECT model FROM article_embedding_meta WHERE article_id = ?
+`);
 const nearestNeighbors = db.prepare(`
  SELECT article_id, distance
  FROM article_embeddings
@@ -35,10 +45,14 @@ const selectArticlesMissingEmbeddings = db.prepare(`
    AND TRIM(a.description) != ''
    AND a.content IS NOT NULL
    AND TRIM(a.content) != ''
-    AND NOT EXISTS (
-      SELECT 1
-      FROM article_embeddings e
-      WHERE e.article_id = a.id
+    AND (
+      NOT EXISTS (
+        SELECT 1 FROM article_embeddings e WHERE e.article_id = a.id
+      )
+      OR NOT EXISTS (
+        SELECT 1 FROM article_embedding_meta m
+        WHERE m.article_id = a.id AND m.model = ?
+      )
    )
  ORDER BY a.ingested_at ASC, a.id ASC
  LIMIT ?
@@ -56,7 +70,15 @@ const upsertQueryEmbedding = db.prepare(`
    created_at = datetime('now')
 `);

+const EMBEDDING_MODEL = 'perplexity/pplx-embed-v1-0.6b';
+
 let embeddingBackfillRunning = false;
+
+// backfill meta for any embeddings that existed before model tracking
+db.prepare(`
+  INSERT OR IGNORE INTO article_embedding_meta (article_id, model)
+  SELECT article_id, ? FROM article_embeddings
+`).run(EMBEDDING_MODEL);
 const embeddingJobsRunning = new Set();

 function buildEmbeddingInput(article) {
@@ -90,7 +112,7 @@ async function requestEmbedding(input) {
      'Content-Type': 'application/json',
    },
    body: JSON.stringify({
-      model: 'perplexity/pplx-embed-v1-0.6b',
+      model: EMBEDDING_MODEL,
      input,
    }),
  });
@@ -159,6 +181,7 @@ async function generateAndStoreEmbedding(id) {
    const embedding = await requestEmbedding(input);
    deleteEmbedding.run(BigInt(id));
    insertEmbedding.run(BigInt(id), serializeEmbedding(embedding));
+    upsertEmbeddingMeta.run(id, EMBEDDING_MODEL);
    return { stored: true, shouldPauseBatch: false };
  } catch (error) {
    console.error(`embedding generation failed for article ${id}:`, error);
@@ -179,7 +202,7 @@ async function backfillMissingEmbeddings(limit = 100) {
  embeddingBackfillRunning = true;

  try {
-    const rows = selectArticlesMissingEmbeddings.all(limit);
+    const rows = selectArticlesMissingEmbeddings.all(EMBEDDING_MODEL, limit);

    for (const row of rows) {
      const result = await generateAndStoreEmbedding(row.id);
@@ -7,7 +7,7 @@ const { fetchEdgarArticles } = require('./sources/edgar');
 const { fetchAlphaVantageArticles } = require('./sources/alphavantage');
 const { fetchFinnhubArticles } = require('./sources/finnhub');
 const { fetchGoogleNewsArticles } = require('./sources/googleNews');
-const { backfillMissingContent } = require('./content');
+const { backfillMissingContent, hasPendingContent } = require('./content');
 const { backfillMissingEmbeddings } = require('./embeddings');

 function sleep(ms) {
@@ -89,17 +89,24 @@ function startScheduler() {
    await runSource('googlenews', fetchGoogleNewsArticles);
  };

-  const runContentMaintenance = async () => {
-    try {
-      await backfillMissingContent();
-    } catch (error) {
-      console.error('content backfill failed:', error);
-    }
+  const runContentLoop = async () => {
+    while (true) {
+      if (!hasPendingContent()) {
+        await sleep(60 * 1000);
+        continue;
+      }

-    try {
-      await backfillMissingEmbeddings();
-    } catch (error) {
-      console.error('embedding backfill failed:', error);
+      try {
+        await backfillMissingContent();
+      } catch (error) {
+        console.error('content backfill failed:', error);
+      }
+
+      try {
+        await backfillMissingEmbeddings();
+      } catch (error) {
+        console.error('embedding backfill failed:', error);
+      }
    }
  };

@@ -109,14 +116,13 @@ function startScheduler() {
  runAlphaVantage();
  runFinnhub();
  // runGoogleNews();
-  runContentMaintenance();
+  runContentLoop();

  cron.schedule(config.scheduler.rss, runRss);
  cron.schedule(config.scheduler.edgar, runEdgar);
  cron.schedule(config.scheduler.alphaVantage, runAlphaVantage);
  cron.schedule(config.scheduler.finnhub, runFinnhub);

-cron.schedule(config.contentBackfill.cron, runContentMaintenance);
 }

 module.exports = {
@@ -114,14 +114,19 @@ async function fetchWindowBigQuery(source, window, bigquery) {
  const maxRecords = Math.max(1, Math.min(Number(config.gdelt?.maxRecords) || 100, 1000));
  const domainClauses = source.website.map((d) => `LOWER(DocumentIdentifier) LIKE '%${d}%'`).join(' OR ');

+  // use _PARTITIONTIME (not the INTEGER DATE column) so BigQuery can prune partitions
+  // and avoid full-table scans on the ~130TB gkg table
+  const startTs = window.start.toISOString();
+  const endTs = window.end.toISOString();
+
  const query = `
    SELECT
      DocumentIdentifier AS url,
      SourceCommonName AS domain,
      CAST(DATE AS STRING) AS seendate
    FROM \`gdelt-bq.gdeltv2.gkg\`
-    WHERE DATE >= ${window.startKey}
-      AND DATE < ${window.endKey}
+    WHERE _PARTITIONTIME >= TIMESTAMP("${startTs}")
+      AND _PARTITIONTIME < TIMESTAMP("${endTs}")
      AND (${domainClauses})
    LIMIT ${maxRecords}
  `;