refine article filtering to ensure only usable articles are returned
This commit is contained in:
parent
9df24d44c9
commit
1027547b79
3 changed files with 27 additions and 7 deletions
|
|
@ -42,7 +42,10 @@
|
|||
"googleNews": "0 * * * *"
|
||||
},
|
||||
"contentBackfill": {
|
||||
"concurrency": 10
|
||||
"concurrency": 0
|
||||
},
|
||||
"browser": {
|
||||
"maxConcurrentPages": 25
|
||||
},
|
||||
"googleNews": {
|
||||
"queries": [
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
const { extractFromHtml } = require('@extractus/article-extractor');
|
||||
const sharp = require('sharp');
|
||||
const db = require('./db');
|
||||
const config = require('./config');
|
||||
const { generateAndStoreEmbedding } = require('./embeddings');
|
||||
const { fetchWithPolicy } = require('./http');
|
||||
const { getSharedBrowserSession } = require('./sources/browserCrawler');
|
||||
|
|
@ -10,6 +11,11 @@ const updateArticleAssets = db.prepare(`
|
|||
SET content = ?, image = ?, content_status = 'ready', content_error = NULL, content_attempted_at = ?
|
||||
WHERE id = ?
|
||||
`);
|
||||
const updateArticleTitleDescription = db.prepare(`
|
||||
UPDATE articles
|
||||
SET title = ?, description = ?
|
||||
WHERE id = ?
|
||||
`);
|
||||
const markContentSkipped = db.prepare(`
|
||||
UPDATE articles
|
||||
SET content_status = 'skipped', content_error = ?, content_attempted_at = ?
|
||||
|
|
@ -26,14 +32,14 @@ const markContentPending = db.prepare(`
|
|||
WHERE id = ?
|
||||
`);
|
||||
const selectAllArticlesMissingContent = db.prepare(`
|
||||
SELECT id, url
|
||||
SELECT id, url, title, description
|
||||
FROM articles
|
||||
WHERE (content IS NULL OR TRIM(content) = '')
|
||||
AND (content_status IS NULL OR content_status = 'pending')
|
||||
ORDER BY ingested_at DESC, id DESC
|
||||
`);
|
||||
const selectArticlesMissingContent = db.prepare(`
|
||||
SELECT id, url
|
||||
SELECT id, url, title, description
|
||||
FROM articles
|
||||
WHERE (content IS NULL OR TRIM(content) = '')
|
||||
AND (content_status IS NULL OR content_status = 'pending')
|
||||
|
|
@ -117,9 +123,10 @@ async function fetchCompressedImage(url) {
|
|||
return output.toString('base64');
|
||||
}
|
||||
|
||||
async function fetchAndStoreContent(id, url) {
|
||||
async function fetchAndStoreContent(id, url, storedTitle, storedDescription) {
|
||||
try {
|
||||
const browserSession = await getSharedBrowserSession({ requestTimeout: 20000, maxConcurrentPages: 2 });
|
||||
const maxConcurrentPages = Number(config.browser?.maxConcurrentPages) || 25;
|
||||
const browserSession = await getSharedBrowserSession({ requestTimeout: 20000, maxConcurrentPages });
|
||||
const html = await browserSession.fetchRenderedHtml(url, { timeout: 20000 });
|
||||
const article = await extractFromHtml(html, url);
|
||||
if (!article) {
|
||||
|
|
@ -131,6 +138,16 @@ async function fetchAndStoreContent(id, url) {
|
|||
? article.content.replace(/<[^>]+>/g, ' ').replace(/\s+/g, ' ').trim() || null
|
||||
: null;
|
||||
|
||||
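// If the stored title looks like a raw URL (starts with http:// or https://), replace it with the scraped title; the description is updated too, preferring the scraped one and falling back to the stored one.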
|
||||
const titleLooksLikeUrl = storedTitle && /^https?:\/\//i.test(storedTitle.trim());
|
||||
if (titleLooksLikeUrl) {
|
||||
const scrapedTitle = typeof article.title === 'string' ? article.title.trim() : null;
|
||||
const scrapedDescription = typeof article.description === 'string' ? article.description.trim() : null;
|
||||
if (scrapedTitle) {
|
||||
updateArticleTitleDescription.run(scrapedTitle, scrapedDescription || storedDescription || null, id);
|
||||
}
|
||||
}
|
||||
|
||||
let image = null;
|
||||
if (article.image) {
|
||||
try {
|
||||
|
|
@ -185,7 +202,7 @@ async function backfillMissingContent(limit = 100, concurrency = 5) {
|
|||
|
||||
for (let i = 0; i < rows.length; i += concurrency) {
|
||||
const batch = rows.slice(i, i + concurrency);
|
||||
await Promise.all(batch.map((row) => fetchAndStoreContent(row.id, row.url)));
|
||||
await Promise.all(batch.map((row) => fetchAndStoreContent(row.id, row.url, row.title, row.description)));
|
||||
}
|
||||
} finally {
|
||||
contentBackfillRunning = false;
|
||||
|
|
|
|||
|
|
@ -95,7 +95,7 @@ function ingestArticle(article) {
|
|||
ingestedAt
|
||||
);
|
||||
|
||||
fetchAndStoreContent(result.lastInsertRowid, url);
|
||||
fetchAndStoreContent(result.lastInsertRowid, url, title, description);
|
||||
|
||||
return { inserted: true, id: result.lastInsertRowid };
|
||||
} catch (error) {
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue