From 01bc624a240bcf132cb101ac47a67de4ab15bce1 Mon Sep 17 00:00:00 2001 From: ImBenji Date: Fri, 17 Apr 2026 01:02:39 +0100 Subject: [PATCH] enhance article handling with index page detection and query improvements --- src/db.js | 31 +++++++++++++- src/ingest.js | 15 ++++++- src/routes/articles.js | 82 +++++++++++++++++++------------------- src/sources/newsCrawler.js | 1 + 4 files changed, 86 insertions(+), 43 deletions(-) diff --git a/src/db.js b/src/db.js index b74157f..ebc8ae7 100644 --- a/src/db.js +++ b/src/db.js @@ -19,6 +19,7 @@ db.exec(` content_status TEXT, content_error TEXT, content_attempted_at TEXT, + is_index_page INTEGER NOT NULL DEFAULT 0, url TEXT NOT NULL UNIQUE, normalized_title TEXT NOT NULL, source TEXT NOT NULL, @@ -54,6 +55,7 @@ function rebuildArticlesTableIfNeeded() { content_status TEXT, content_error TEXT, content_attempted_at TEXT, + is_index_page INTEGER NOT NULL DEFAULT 0, url TEXT NOT NULL UNIQUE, normalized_title TEXT NOT NULL, source TEXT NOT NULL, @@ -70,6 +72,7 @@ function rebuildArticlesTableIfNeeded() { content_status, content_error, content_attempted_at, + is_index_page, url, normalized_title, source, @@ -85,6 +88,7 @@ function rebuildArticlesTableIfNeeded() { content_status, content_error, content_attempted_at, + 0, url, normalized_title, source, @@ -127,7 +131,8 @@ for (const statement of [ 'ALTER TABLE articles ADD COLUMN image TEXT', 'ALTER TABLE articles ADD COLUMN content_status TEXT', 'ALTER TABLE articles ADD COLUMN content_error TEXT', - 'ALTER TABLE articles ADD COLUMN content_attempted_at TEXT' + 'ALTER TABLE articles ADD COLUMN content_attempted_at TEXT', + 'ALTER TABLE articles ADD COLUMN is_index_page INTEGER NOT NULL DEFAULT 0' ]) { try { db.exec(statement); @@ -138,4 +143,28 @@ for (const statement of [ } } +db.exec(` + UPDATE articles + SET is_index_page = 1 + WHERE is_index_page = 0 + AND ( + LOWER(url) LIKE '%/category/%' + OR LOWER(url) LIKE '%/categories/%' + OR LOWER(url) LIKE '%/tag/%' + OR LOWER(url) LIKE '%/tags/%' + OR LOWER(url) LIKE '%/topic/%' + OR LOWER(url) LIKE '%/topics/%' + OR LOWER(url) LIKE '%/section/%' + OR LOWER(url) LIKE '%/sections/%' + OR LOWER(url) LIKE '%/archive%' + OR LOWER(url) LIKE '%/archives/%' + OR LOWER(url) LIKE '%/authors/%' + OR LOWER(url) LIKE '%/search%' + OR LOWER(title) LIKE '%category%' + OR LOWER(title) LIKE '%archives%' + OR LOWER(title) LIKE '%archive%' + OR LOWER(title) LIKE '%latest news%' + ) +`); + module.exports = db; diff --git a/src/ingest.js b/src/ingest.js index c915eb1..34887ce 100644 --- a/src/ingest.js +++ b/src/ingest.js @@ -9,14 +9,17 @@ const insertArticle = db.prepare(` description, content, image, + is_index_page, url, normalized_title, source, pub_date, ingested_at - ) VALUES (?, ?, NULL, NULL, ?, ?, ?, ?, ?) + ) VALUES (?, ?, NULL, NULL, ?, ?, ?, ?, ?, ?) `); const findByUrl = db.prepare('SELECT id FROM articles WHERE url = ?'); +const INDEX_PAGE_URL_HINT = /\/(category|categories|tag|tags|topic|topics|section|sections|archive|archives|authors|search)(?:\/|$)/i; +const INDEX_PAGE_TITLE_HINT = /\b(category|archives?|latest news)\b/i; function normalizePubDate(value) { if (!value) { @@ -53,6 +56,14 @@ function normalizePubDate(value) { return Number.isNaN(parsed.getTime()) ? null : parsed.toISOString(); } +function inferIsIndexPage(article, title, url) { + if (article.isIndexPage != null) { + return article.isIndexPage ? 1 : 0; + } + + return INDEX_PAGE_URL_HINT.test(url) || INDEX_PAGE_TITLE_HINT.test(title) ? 1 : 0; +} + function ingestArticle(article) { const title = String(article.title || '').trim(); const url = String(article.url || '').trim(); @@ -68,6 +79,7 @@ function ingestArticle(article) { } const description = article.description == null ? null : String(article.description).trim() || null; + const isIndexPage = inferIsIndexPage(article, title, url); const pubDate = normalizePubDate(article.pubDate); const ingestedAt = new Date().toISOString(); @@ -75,6 +87,7 @@ function ingestArticle(article) { const result = insertArticle.run( title, description, + isIndexPage, url, normalizedTitle, source, diff --git a/src/routes/articles.js b/src/routes/articles.js index 072bf40..7c14dc4 100644 --- a/src/routes/articles.js +++ b/src/routes/articles.js @@ -51,6 +51,39 @@ function buildArticlesQuery(query) { }; } +function getRequestedLimit(value) { + const limit = Number.parseInt(value, 10); + return Number.isFinite(limit) && limit > 0 ? Math.min(limit, 100) : 20; +} + +function shouldExcludeIndexPages(query) { + return String(query.exclude_index_pages || '').toLowerCase() !== 'false'; +} + +function mapNeighborsToArticles(neighbors, excludeIndexPages, limit) { + const ids = neighbors.map((row) => row.articleId); + if (ids.length === 0) { + return []; + } + + const placeholders = ids.map(() => '?').join(', '); + const articles = db.prepare(` + SELECT id, title, description, content, image, url, normalized_title, source, pub_date, ingested_at + FROM articles + WHERE id IN (${placeholders}) + ${excludeIndexPages ? 'AND is_index_page = 0' : ''} + `).all(...ids); + const byId = new Map(articles.map((article) => [article.id, article])); + + return neighbors + .map((row) => { + const article = byId.get(row.articleId); + return article ? { ...article, distance: row.distance } : null; + }) + .filter(Boolean) + .slice(0, limit); +} + async function articleRoutes(fastify) { fastify.get('/articles', async (request, reply) => { const query = request.query || {}; @@ -60,7 +93,8 @@ async function articleRoutes(fastify) { } if (query.semantic !== undefined) { - const limit = Number.parseInt(query.limit, 10); + const limit = getRequestedLimit(query.limit); + const excludeIndexPages = shouldExcludeIndexPages(query); const embedding = await getOrCreateQueryEmbedding(query.semantic); if (!embedding) { @@ -70,35 +104,19 @@ async function articleRoutes(fastify) { const neighbors = findArticlesByEmbedding( embedding, - Number.isFinite(limit) && limit > 0 ? Math.min(limit, 100) : 20 + Math.min(limit * 5, 500) ); - const ids = neighbors.map((row) => row.articleId); - if (ids.length === 0) { - return []; - } - const placeholders = ids.map(() => '?').join(', '); - const articles = db.prepare(` - SELECT id, title, description, content, image, url, normalized_title, source, pub_date, ingested_at - FROM articles - WHERE id IN (${placeholders}) - `).all(...ids); - const byId = new Map(articles.map((article) => [article.id, article])); - - return neighbors - .map((row) => { - const article = byId.get(row.articleId); - return article ? { ...article, distance: row.distance } : null; - }) - .filter(Boolean); + return mapNeighborsToArticles(neighbors, excludeIndexPages, limit); } if (query.similar_to_article) { - const limit = Number.parseInt(query.limit, 10); + const limit = getRequestedLimit(query.limit); + const excludeIndexPages = shouldExcludeIndexPages(query); const articleId = Number.parseInt(query.similar_to_article, 10); const neighbors = findSimilarArticles( articleId, - Number.isFinite(limit) && limit > 0 ? Math.min(limit, 100) : 20 + Math.min(limit * 5, 500) ); if (neighbors.length === 0 && !getEmbeddingBuffer(articleId)) { @@ -106,25 +124,7 @@ async function articleRoutes(fastify) { return { error: 'Embedding not found for article' }; } - const ids = neighbors.map((row) => row.articleId); - if (ids.length === 0) { - return []; - } - - const placeholders = ids.map(() => '?').join(', '); - const articles = db.prepare(` - SELECT id, title, description, content, image, url, normalized_title, source, pub_date, ingested_at - FROM articles - WHERE id IN (${placeholders}) - `).all(...ids); - const byId = new Map(articles.map((article) => [article.id, article])); - - return neighbors - .map((row) => { - const article = byId.get(row.articleId); - return article ? { ...article, distance: row.distance } : null; - }) - .filter(Boolean); + return mapNeighborsToArticles(neighbors, excludeIndexPages, limit); } const { sql, params } = buildArticlesQuery(query); diff --git a/src/sources/newsCrawler.js b/src/sources/newsCrawler.js index 3ddab52..0a14f49 100644 --- a/src/sources/newsCrawler.js +++ b/src/sources/newsCrawler.js @@ -557,6 +557,7 @@ async function crawlSite(site) { url: canonicalUrl, source: normalizedSite.name, pubDate: selectPubDate(meta, jsonLdArticle, html), + isIndexPage: !isArticleCandidate, }); } }