Refine article filtering to ensure only usable articles are returned
This commit is contained in:
parent
9df24d44c9
commit
1027547b79
3 changed files with 27 additions and 7 deletions
|
|
@ -42,7 +42,10 @@
|
||||||
"googleNews": "0 * * * *"
|
"googleNews": "0 * * * *"
|
||||||
},
|
},
|
||||||
"contentBackfill": {
|
"contentBackfill": {
|
||||||
"concurrency": 10
|
"concurrency": 0
|
||||||
|
},
|
||||||
|
"browser": {
|
||||||
|
"maxConcurrentPages": 25
|
||||||
},
|
},
|
||||||
"googleNews": {
|
"googleNews": {
|
||||||
"queries": [
|
"queries": [
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,7 @@
|
||||||
const { extractFromHtml } = require('@extractus/article-extractor');
|
const { extractFromHtml } = require('@extractus/article-extractor');
|
||||||
const sharp = require('sharp');
|
const sharp = require('sharp');
|
||||||
const db = require('./db');
|
const db = require('./db');
|
||||||
|
const config = require('./config');
|
||||||
const { generateAndStoreEmbedding } = require('./embeddings');
|
const { generateAndStoreEmbedding } = require('./embeddings');
|
||||||
const { fetchWithPolicy } = require('./http');
|
const { fetchWithPolicy } = require('./http');
|
||||||
const { getSharedBrowserSession } = require('./sources/browserCrawler');
|
const { getSharedBrowserSession } = require('./sources/browserCrawler');
|
||||||
|
|
@ -10,6 +11,11 @@ const updateArticleAssets = db.prepare(`
|
||||||
SET content = ?, image = ?, content_status = 'ready', content_error = NULL, content_attempted_at = ?
|
SET content = ?, image = ?, content_status = 'ready', content_error = NULL, content_attempted_at = ?
|
||||||
WHERE id = ?
|
WHERE id = ?
|
||||||
`);
|
`);
|
||||||
|
const updateArticleTitleDescription = db.prepare(`
|
||||||
|
UPDATE articles
|
||||||
|
SET title = ?, description = ?
|
||||||
|
WHERE id = ?
|
||||||
|
`);
|
||||||
const markContentSkipped = db.prepare(`
|
const markContentSkipped = db.prepare(`
|
||||||
UPDATE articles
|
UPDATE articles
|
||||||
SET content_status = 'skipped', content_error = ?, content_attempted_at = ?
|
SET content_status = 'skipped', content_error = ?, content_attempted_at = ?
|
||||||
|
|
@ -26,14 +32,14 @@ const markContentPending = db.prepare(`
|
||||||
WHERE id = ?
|
WHERE id = ?
|
||||||
`);
|
`);
|
||||||
const selectAllArticlesMissingContent = db.prepare(`
|
const selectAllArticlesMissingContent = db.prepare(`
|
||||||
SELECT id, url
|
SELECT id, url, title, description
|
||||||
FROM articles
|
FROM articles
|
||||||
WHERE (content IS NULL OR TRIM(content) = '')
|
WHERE (content IS NULL OR TRIM(content) = '')
|
||||||
AND (content_status IS NULL OR content_status = 'pending')
|
AND (content_status IS NULL OR content_status = 'pending')
|
||||||
ORDER BY ingested_at DESC, id DESC
|
ORDER BY ingested_at DESC, id DESC
|
||||||
`);
|
`);
|
||||||
const selectArticlesMissingContent = db.prepare(`
|
const selectArticlesMissingContent = db.prepare(`
|
||||||
SELECT id, url
|
SELECT id, url, title, description
|
||||||
FROM articles
|
FROM articles
|
||||||
WHERE (content IS NULL OR TRIM(content) = '')
|
WHERE (content IS NULL OR TRIM(content) = '')
|
||||||
AND (content_status IS NULL OR content_status = 'pending')
|
AND (content_status IS NULL OR content_status = 'pending')
|
||||||
|
|
@ -117,9 +123,10 @@ async function fetchCompressedImage(url) {
|
||||||
return output.toString('base64');
|
return output.toString('base64');
|
||||||
}
|
}
|
||||||
|
|
||||||
async function fetchAndStoreContent(id, url) {
|
async function fetchAndStoreContent(id, url, storedTitle, storedDescription) {
|
||||||
try {
|
try {
|
||||||
const browserSession = await getSharedBrowserSession({ requestTimeout: 20000, maxConcurrentPages: 2 });
|
const maxConcurrentPages = Number(config.browser?.maxConcurrentPages) || 25;
|
||||||
|
const browserSession = await getSharedBrowserSession({ requestTimeout: 20000, maxConcurrentPages });
|
||||||
const html = await browserSession.fetchRenderedHtml(url, { timeout: 20000 });
|
const html = await browserSession.fetchRenderedHtml(url, { timeout: 20000 });
|
||||||
const article = await extractFromHtml(html, url);
|
const article = await extractFromHtml(html, url);
|
||||||
if (!article) {
|
if (!article) {
|
||||||
|
|
@ -131,6 +138,16 @@ async function fetchAndStoreContent(id, url) {
|
||||||
? article.content.replace(/<[^>]+>/g, ' ').replace(/\s+/g, ' ').trim() || null
|
? article.content.replace(/<[^>]+>/g, ' ').replace(/\s+/g, ' ').trim() || null
|
||||||
: null;
|
: null;
|
||||||
|
|
||||||
|
// If the stored title looks like a raw URL, replace it with the scraped title (and prefer the scraped description, falling back to the stored one).
|
||||||
|
const titleLooksLikeUrl = storedTitle && /^https?:\/\//i.test(storedTitle.trim());
|
||||||
|
if (titleLooksLikeUrl) {
|
||||||
|
const scrapedTitle = typeof article.title === 'string' ? article.title.trim() : null;
|
||||||
|
const scrapedDescription = typeof article.description === 'string' ? article.description.trim() : null;
|
||||||
|
if (scrapedTitle) {
|
||||||
|
updateArticleTitleDescription.run(scrapedTitle, scrapedDescription || storedDescription || null, id);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
let image = null;
|
let image = null;
|
||||||
if (article.image) {
|
if (article.image) {
|
||||||
try {
|
try {
|
||||||
|
|
@ -185,7 +202,7 @@ async function backfillMissingContent(limit = 100, concurrency = 5) {
|
||||||
|
|
||||||
for (let i = 0; i < rows.length; i += concurrency) {
|
for (let i = 0; i < rows.length; i += concurrency) {
|
||||||
const batch = rows.slice(i, i + concurrency);
|
const batch = rows.slice(i, i + concurrency);
|
||||||
await Promise.all(batch.map((row) => fetchAndStoreContent(row.id, row.url)));
|
await Promise.all(batch.map((row) => fetchAndStoreContent(row.id, row.url, row.title, row.description)));
|
||||||
}
|
}
|
||||||
} finally {
|
} finally {
|
||||||
contentBackfillRunning = false;
|
contentBackfillRunning = false;
|
||||||
|
|
|
||||||
|
|
@ -95,7 +95,7 @@ function ingestArticle(article) {
|
||||||
ingestedAt
|
ingestedAt
|
||||||
);
|
);
|
||||||
|
|
||||||
fetchAndStoreContent(result.lastInsertRowid, url);
|
fetchAndStoreContent(result.lastInsertRowid, url, title, description);
|
||||||
|
|
||||||
return { inserted: true, id: result.lastInsertRowid };
|
return { inserted: true, id: result.lastInsertRowid };
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue