diff --git a/config.json b/config.json index e9b36a1..f1a50f5 100644 --- a/config.json +++ b/config.json @@ -42,7 +42,10 @@ "googleNews": "0 * * * *" }, "contentBackfill": { - "concurrency": 10 + "concurrency": 0 + }, + "browser": { + "maxConcurrentPages": 25 }, "googleNews": { "queries": [ diff --git a/src/content.js b/src/content.js index d24b380..71c04ad 100644 --- a/src/content.js +++ b/src/content.js @@ -1,6 +1,7 @@ const { extractFromHtml } = require('@extractus/article-extractor'); const sharp = require('sharp'); const db = require('./db'); +const config = require('./config'); const { generateAndStoreEmbedding } = require('./embeddings'); const { fetchWithPolicy } = require('./http'); const { getSharedBrowserSession } = require('./sources/browserCrawler'); @@ -10,6 +11,11 @@ const updateArticleAssets = db.prepare(` SET content = ?, image = ?, content_status = 'ready', content_error = NULL, content_attempted_at = ? WHERE id = ? `); +const updateArticleTitleDescription = db.prepare(` + UPDATE articles + SET title = ?, description = ? + WHERE id = ? +`); const markContentSkipped = db.prepare(` UPDATE articles SET content_status = 'skipped', content_error = ?, content_attempted_at = ? @@ -26,14 +32,14 @@ const markContentPending = db.prepare(` WHERE id = ? `); const selectAllArticlesMissingContent = db.prepare(` - SELECT id, url + SELECT id, url, title, description FROM articles WHERE (content IS NULL OR TRIM(content) = '') AND (content_status IS NULL OR content_status = 'pending') ORDER BY ingested_at DESC, id DESC `); const selectArticlesMissingContent = db.prepare(` - SELECT id, url + SELECT id, url, title, description FROM articles WHERE (content IS NULL OR TRIM(content) = '') AND (content_status IS NULL OR content_status = 'pending') @@ -117,9 +123,10 @@ async function fetchCompressedImage(url) { return output.toString('base64'); } -async function fetchAndStoreContent(id, url) { +async function fetchAndStoreContent(id, url, storedTitle, storedDescription) { try { - const browserSession = await getSharedBrowserSession({ requestTimeout: 20000, maxConcurrentPages: 2 }); + const maxConcurrentPages = Number(config.browser?.maxConcurrentPages) || 25; + const browserSession = await getSharedBrowserSession({ requestTimeout: 20000, maxConcurrentPages }); const html = await browserSession.fetchRenderedHtml(url, { timeout: 20000 }); const article = await extractFromHtml(html, url); if (!article) { @@ -131,6 +138,16 @@ async function fetchAndStoreContent(id, url) { ? article.content.replace(/<[^>]+>/g, ' ').replace(/\s+/g, ' ').trim() || null : null; + // if stored title looks like a raw URL, try to replace with scraped title + const titleLooksLikeUrl = storedTitle && /^https?:\/\//i.test(storedTitle.trim()); + if (titleLooksLikeUrl) { + const scrapedTitle = typeof article.title === 'string' ? article.title.trim() : null; + const scrapedDescription = typeof article.description === 'string' ? article.description.trim() : null; + if (scrapedTitle) { + updateArticleTitleDescription.run(scrapedTitle, scrapedDescription || storedDescription || null, id); + } + } + let image = null; if (article.image) { try { @@ -185,7 +202,7 @@ async function backfillMissingContent(limit = 100, concurrency = 5) { for (let i = 0; i < rows.length; i += concurrency) { const batch = rows.slice(i, i + concurrency); - await Promise.all(batch.map((row) => fetchAndStoreContent(row.id, row.url))); + await Promise.all(batch.map((row) => fetchAndStoreContent(row.id, row.url, row.title, row.description))); } } finally { contentBackfillRunning = false; diff --git a/src/ingest.js b/src/ingest.js index 34887ce..2cf32cf 100644 --- a/src/ingest.js +++ b/src/ingest.js @@ -95,7 +95,7 @@ function ingestArticle(article) { ingestedAt ); - fetchAndStoreContent(result.lastInsertRowid, url); + fetchAndStoreContent(result.lastInsertRowid, url, title, description); return { inserted: true, id: result.lastInsertRowid }; } catch (error) {