refine article filtering to ensure only usable articles are returned

This commit is contained in:
ImBenji 2026-04-18 14:30:35 +01:00
parent 9df24d44c9
commit 1027547b79
3 changed files with 27 additions and 7 deletions

View file

@ -42,7 +42,10 @@
"googleNews": "0 * * * *" "googleNews": "0 * * * *"
}, },
"contentBackfill": { "contentBackfill": {
"concurrency": 10 "concurrency": 0
},
"browser": {
"maxConcurrentPages": 25
}, },
"googleNews": { "googleNews": {
"queries": [ "queries": [

View file

@ -1,6 +1,7 @@
const { extractFromHtml } = require('@extractus/article-extractor'); const { extractFromHtml } = require('@extractus/article-extractor');
const sharp = require('sharp'); const sharp = require('sharp');
const db = require('./db'); const db = require('./db');
const config = require('./config');
const { generateAndStoreEmbedding } = require('./embeddings'); const { generateAndStoreEmbedding } = require('./embeddings');
const { fetchWithPolicy } = require('./http'); const { fetchWithPolicy } = require('./http');
const { getSharedBrowserSession } = require('./sources/browserCrawler'); const { getSharedBrowserSession } = require('./sources/browserCrawler');
@ -10,6 +11,11 @@ const updateArticleAssets = db.prepare(`
SET content = ?, image = ?, content_status = 'ready', content_error = NULL, content_attempted_at = ? SET content = ?, image = ?, content_status = 'ready', content_error = NULL, content_attempted_at = ?
WHERE id = ? WHERE id = ?
`); `);
// Prepared statement that overwrites the title and description of a single
// article row, selected by id.
// Positional placeholders, in order: title, description, id.
// Run from fetchAndStoreContent when the stored title looks like a raw
// http(s) URL and the scraped page yields a non-empty title; the description
// passed there is the scraped one, else the previously stored one, else null.
const updateArticleTitleDescription = db.prepare(`
UPDATE articles
SET title = ?, description = ?
WHERE id = ?
`);
const markContentSkipped = db.prepare(` const markContentSkipped = db.prepare(`
UPDATE articles UPDATE articles
SET content_status = 'skipped', content_error = ?, content_attempted_at = ? SET content_status = 'skipped', content_error = ?, content_attempted_at = ?
@ -26,14 +32,14 @@ const markContentPending = db.prepare(`
WHERE id = ? WHERE id = ?
`); `);
const selectAllArticlesMissingContent = db.prepare(` const selectAllArticlesMissingContent = db.prepare(`
SELECT id, url SELECT id, url, title, description
FROM articles FROM articles
WHERE (content IS NULL OR TRIM(content) = '') WHERE (content IS NULL OR TRIM(content) = '')
AND (content_status IS NULL OR content_status = 'pending') AND (content_status IS NULL OR content_status = 'pending')
ORDER BY ingested_at DESC, id DESC ORDER BY ingested_at DESC, id DESC
`); `);
const selectArticlesMissingContent = db.prepare(` const selectArticlesMissingContent = db.prepare(`
SELECT id, url SELECT id, url, title, description
FROM articles FROM articles
WHERE (content IS NULL OR TRIM(content) = '') WHERE (content IS NULL OR TRIM(content) = '')
AND (content_status IS NULL OR content_status = 'pending') AND (content_status IS NULL OR content_status = 'pending')
@ -117,9 +123,10 @@ async function fetchCompressedImage(url) {
return output.toString('base64'); return output.toString('base64');
} }
async function fetchAndStoreContent(id, url) { async function fetchAndStoreContent(id, url, storedTitle, storedDescription) {
try { try {
const browserSession = await getSharedBrowserSession({ requestTimeout: 20000, maxConcurrentPages: 2 }); const maxConcurrentPages = Number(config.browser?.maxConcurrentPages) || 25;
const browserSession = await getSharedBrowserSession({ requestTimeout: 20000, maxConcurrentPages });
const html = await browserSession.fetchRenderedHtml(url, { timeout: 20000 }); const html = await browserSession.fetchRenderedHtml(url, { timeout: 20000 });
const article = await extractFromHtml(html, url); const article = await extractFromHtml(html, url);
if (!article) { if (!article) {
@ -131,6 +138,16 @@ async function fetchAndStoreContent(id, url) {
? article.content.replace(/<[^>]+>/g, ' ').replace(/\s+/g, ' ').trim() || null ? article.content.replace(/<[^>]+>/g, ' ').replace(/\s+/g, ' ').trim() || null
: null; : null;
// if stored title looks like a raw URL, try to replace with scraped title
const titleLooksLikeUrl = storedTitle && /^https?:\/\//i.test(storedTitle.trim());
if (titleLooksLikeUrl) {
const scrapedTitle = typeof article.title === 'string' ? article.title.trim() : null;
const scrapedDescription = typeof article.description === 'string' ? article.description.trim() : null;
if (scrapedTitle) {
updateArticleTitleDescription.run(scrapedTitle, scrapedDescription || storedDescription || null, id);
}
}
let image = null; let image = null;
if (article.image) { if (article.image) {
try { try {
@ -185,7 +202,7 @@ async function backfillMissingContent(limit = 100, concurrency = 5) {
for (let i = 0; i < rows.length; i += concurrency) { for (let i = 0; i < rows.length; i += concurrency) {
const batch = rows.slice(i, i + concurrency); const batch = rows.slice(i, i + concurrency);
await Promise.all(batch.map((row) => fetchAndStoreContent(row.id, row.url))); await Promise.all(batch.map((row) => fetchAndStoreContent(row.id, row.url, row.title, row.description)));
} }
} finally { } finally {
contentBackfillRunning = false; contentBackfillRunning = false;

View file

@ -95,7 +95,7 @@ function ingestArticle(article) {
ingestedAt ingestedAt
); );
fetchAndStoreContent(result.lastInsertRowid, url); fetchAndStoreContent(result.lastInsertRowid, url, title, description);
return { inserted: true, id: result.lastInsertRowid }; return { inserted: true, id: result.lastInsertRowid };
} catch (error) { } catch (error) {