refine article filtering to ensure only usable articles are returned

This commit is contained in:
ImBenji 2026-04-18 14:30:35 +01:00
parent 9df24d44c9
commit 1027547b79
3 changed files with 27 additions and 7 deletions

View file

@ -42,7 +42,10 @@
"googleNews": "0 * * * *"
},
"contentBackfill": {
"concurrency": 10
"concurrency": 0
},
"browser": {
"maxConcurrentPages": 25
},
"googleNews": {
"queries": [

View file

@ -1,6 +1,7 @@
const { extractFromHtml } = require('@extractus/article-extractor');
const sharp = require('sharp');
const db = require('./db');
const config = require('./config');
const { generateAndStoreEmbedding } = require('./embeddings');
const { fetchWithPolicy } = require('./http');
const { getSharedBrowserSession } = require('./sources/browserCrawler');
@ -10,6 +11,11 @@ const updateArticleAssets = db.prepare(`
SET content = ?, image = ?, content_status = 'ready', content_error = NULL, content_attempted_at = ?
WHERE id = ?
`);
// Prepared statement that overwrites the title and description of a single
// article row. Positional parameters, in order: title, description, id.
const updateArticleTitleDescription = db.prepare(`
UPDATE articles
SET title = ?, description = ?
WHERE id = ?
`);
const markContentSkipped = db.prepare(`
UPDATE articles
SET content_status = 'skipped', content_error = ?, content_attempted_at = ?
@ -26,14 +32,14 @@ const markContentPending = db.prepare(`
WHERE id = ?
`);
const selectAllArticlesMissingContent = db.prepare(`
SELECT id, url
SELECT id, url, title, description
FROM articles
WHERE (content IS NULL OR TRIM(content) = '')
AND (content_status IS NULL OR content_status = 'pending')
ORDER BY ingested_at DESC, id DESC
`);
const selectArticlesMissingContent = db.prepare(`
SELECT id, url
SELECT id, url, title, description
FROM articles
WHERE (content IS NULL OR TRIM(content) = '')
AND (content_status IS NULL OR content_status = 'pending')
@ -117,9 +123,10 @@ async function fetchCompressedImage(url) {
return output.toString('base64');
}
async function fetchAndStoreContent(id, url) {
async function fetchAndStoreContent(id, url, storedTitle, storedDescription) {
try {
const browserSession = await getSharedBrowserSession({ requestTimeout: 20000, maxConcurrentPages: 2 });
const maxConcurrentPages = Number(config.browser?.maxConcurrentPages) || 25;
const browserSession = await getSharedBrowserSession({ requestTimeout: 20000, maxConcurrentPages });
const html = await browserSession.fetchRenderedHtml(url, { timeout: 20000 });
const article = await extractFromHtml(html, url);
if (!article) {
@ -131,6 +138,16 @@ async function fetchAndStoreContent(id, url) {
? article.content.replace(/<[^>]+>/g, ' ').replace(/\s+/g, ' ').trim() || null
: null;
// if stored title looks like a raw URL, try to replace with scraped title
const titleLooksLikeUrl = storedTitle && /^https?:\/\//i.test(storedTitle.trim());
if (titleLooksLikeUrl) {
const scrapedTitle = typeof article.title === 'string' ? article.title.trim() : null;
const scrapedDescription = typeof article.description === 'string' ? article.description.trim() : null;
if (scrapedTitle) {
updateArticleTitleDescription.run(scrapedTitle, scrapedDescription || storedDescription || null, id);
}
}
let image = null;
if (article.image) {
try {
@ -185,7 +202,7 @@ async function backfillMissingContent(limit = 100, concurrency = 5) {
for (let i = 0; i < rows.length; i += concurrency) {
const batch = rows.slice(i, i + concurrency);
await Promise.all(batch.map((row) => fetchAndStoreContent(row.id, row.url)));
await Promise.all(batch.map((row) => fetchAndStoreContent(row.id, row.url, row.title, row.description)));
}
} finally {
contentBackfillRunning = false;

View file

@ -95,7 +95,7 @@ function ingestArticle(article) {
ingestedAt
);
fetchAndStoreContent(result.lastInsertRowid, url);
fetchAndStoreContent(result.lastInsertRowid, url, title, description);
return { inserted: true, id: result.lastInsertRowid };
} catch (error) {