refine article filtering to ensure only usable articles are returned

2026-04-18 14:30:35 +01:00
parent 9df24d44c9
commit 1027547b79
3 changed files with 27 additions and 7 deletions
@@ -42,7 +42,10 @@
    "googleNews": "0 * * * *"
  },
  "contentBackfill": {
-    "concurrency": 10
+    "concurrency": 0
+  },
+  "browser": {
+    "maxConcurrentPages": 25
  },
  "googleNews": {
    "queries": [
@@ -1,6 +1,7 @@
 const { extractFromHtml } = require('@extractus/article-extractor');
 const sharp = require('sharp');
 const db = require('./db');
+const config = require('./config');
 const { generateAndStoreEmbedding } = require('./embeddings');
 const { fetchWithPolicy } = require('./http');
 const { getSharedBrowserSession } = require('./sources/browserCrawler');
@@ -10,6 +11,11 @@ const updateArticleAssets = db.prepare(`
  SET content = ?, image = ?, content_status = 'ready', content_error = NULL, content_attempted_at = ?
  WHERE id = ?
 `);
+const updateArticleTitleDescription = db.prepare(`
+  UPDATE articles
+  SET title = ?, description = ?
+  WHERE id = ?
+`);
 const markContentSkipped = db.prepare(`
  UPDATE articles
  SET content_status = 'skipped', content_error = ?, content_attempted_at = ?
@@ -26,14 +32,14 @@ const markContentPending = db.prepare(`
  WHERE id = ?
 `);
 const selectAllArticlesMissingContent = db.prepare(`
-  SELECT id, url
+  SELECT id, url, title, description
  FROM articles
  WHERE (content IS NULL OR TRIM(content) = '')
    AND (content_status IS NULL OR content_status = 'pending')
  ORDER BY ingested_at DESC, id DESC
 `);
 const selectArticlesMissingContent = db.prepare(`
-  SELECT id, url
+  SELECT id, url, title, description
  FROM articles
  WHERE (content IS NULL OR TRIM(content) = '')
    AND (content_status IS NULL OR content_status = 'pending')
@@ -117,9 +123,10 @@ async function fetchCompressedImage(url) {
  return output.toString('base64');
 }

-async function fetchAndStoreContent(id, url) {
+async function fetchAndStoreContent(id, url, storedTitle, storedDescription) {
  try {
-    const browserSession = await getSharedBrowserSession({ requestTimeout: 20000, maxConcurrentPages: 2 });
+    const maxConcurrentPages = Number(config.browser?.maxConcurrentPages) || 25;
+    const browserSession = await getSharedBrowserSession({ requestTimeout: 20000, maxConcurrentPages });
    const html = await browserSession.fetchRenderedHtml(url, { timeout: 20000 });
    const article = await extractFromHtml(html, url);
    if (!article) {
@@ -131,6 +138,16 @@ async function fetchAndStoreContent(id, url) {
      ? article.content.replace(/<[^>]+>/g, ' ').replace(/\s+/g, ' ').trim() || null
      : null;

+    // If the stored title looks like a raw URL, replace it with the scraped title; the description is updated too (scraped value first, else the stored one).
+    const titleLooksLikeUrl = storedTitle && /^https?:\/\//i.test(storedTitle.trim());
+    if (titleLooksLikeUrl) {
+      const scrapedTitle = typeof article.title === 'string' ? article.title.trim() : null;
+      const scrapedDescription = typeof article.description === 'string' ? article.description.trim() : null;
+      if (scrapedTitle) {
+        updateArticleTitleDescription.run(scrapedTitle, scrapedDescription || storedDescription || null, id);
+      }
+    }
+
    let image = null;
    if (article.image) {
      try {
@@ -185,7 +202,7 @@ async function backfillMissingContent(limit = 100, concurrency = 5) {

    for (let i = 0; i < rows.length; i += concurrency) {
      const batch = rows.slice(i, i + concurrency);
-      await Promise.all(batch.map((row) => fetchAndStoreContent(row.id, row.url)));
+      await Promise.all(batch.map((row) => fetchAndStoreContent(row.id, row.url, row.title, row.description)));
    }
  } finally {
    contentBackfillRunning = false;
@@ -95,7 +95,7 @@ function ingestArticle(article) {
      ingestedAt
    );

-    fetchAndStoreContent(result.lastInsertRowid, url);
+    fetchAndStoreContent(result.lastInsertRowid, url, title, description);

    return { inserted: true, id: result.lastInsertRowid };
  } catch (error) {