From 01bc624a240bcf132cb101ac47a67de4ab15bce1 Mon Sep 17 00:00:00 2001
From: ImBenji <benjamin.watt@imbenji.net>
Date: Fri, 17 Apr 2026 01:02:39 +0100
Subject: [PATCH] enhance article handling with index page detection and query
 improvements

---
 src/db.js                  | 31 +++++++++++++-
 src/ingest.js              | 15 ++++++-
 src/routes/articles.js     | 82 +++++++++++++++++++-------------------
 src/sources/newsCrawler.js |  1 +
 4 files changed, 86 insertions(+), 43 deletions(-)

diff --git a/src/db.js b/src/db.js
index b74157f..ebc8ae7 100644
--- a/src/db.js
+++ b/src/db.js
@@ -19,6 +19,7 @@ db.exec(`
     content_status TEXT,
     content_error TEXT,
     content_attempted_at TEXT,
+    is_index_page INTEGER NOT NULL DEFAULT 0,
     url TEXT NOT NULL UNIQUE,
     normalized_title TEXT NOT NULL,
     source TEXT NOT NULL,
@@ -54,6 +55,7 @@ function rebuildArticlesTableIfNeeded() {
       content_status TEXT,
       content_error TEXT,
       content_attempted_at TEXT,
+      is_index_page INTEGER NOT NULL DEFAULT 0,
       url TEXT NOT NULL UNIQUE,
       normalized_title TEXT NOT NULL,
       source TEXT NOT NULL,
@@ -70,6 +72,7 @@ function rebuildArticlesTableIfNeeded() {
       content_status,
       content_error,
       content_attempted_at,
+      is_index_page,
       url,
       normalized_title,
       source,
@@ -85,6 +88,7 @@ function rebuildArticlesTableIfNeeded() {
       content_status,
       content_error,
       content_attempted_at,
+      0,
       url,
       normalized_title,
       source,
@@ -127,7 +131,8 @@ for (const statement of [
   'ALTER TABLE articles ADD COLUMN image TEXT',
   'ALTER TABLE articles ADD COLUMN content_status TEXT',
   'ALTER TABLE articles ADD COLUMN content_error TEXT',
-  'ALTER TABLE articles ADD COLUMN content_attempted_at TEXT'
+  'ALTER TABLE articles ADD COLUMN content_attempted_at TEXT',
+  'ALTER TABLE articles ADD COLUMN is_index_page INTEGER NOT NULL DEFAULT 0'
 ]) {
   try {
     db.exec(statement);
@@ -138,4 +143,32 @@ for (const statement of [
   }
 }
 
+// Flag already-stored rows whose URL or title looks like a listing page
+// (category, tag, topic, section, archive, author or search pages). Only rows
+// still at is_index_page = 0 are updated.
+// NOTE(review): this runs unconditionally at module load, so a row stored with
+// is_index_page = 0 that matches a pattern below is set to 1 on the next
+// start — confirm that overriding such rows is intended.
+db.exec(`
+  UPDATE articles
+  SET is_index_page = 1
+  WHERE is_index_page = 0
+    AND (
+      LOWER(url) LIKE '%/category/%'
+      OR LOWER(url) LIKE '%/categories/%'
+      OR LOWER(url) LIKE '%/tag/%'
+      OR LOWER(url) LIKE '%/tags/%'
+      OR LOWER(url) LIKE '%/topic/%'
+      OR LOWER(url) LIKE '%/topics/%'
+      OR LOWER(url) LIKE '%/section/%'
+      OR LOWER(url) LIKE '%/sections/%'
+      OR LOWER(url) LIKE '%/archive%'
+      OR LOWER(url) LIKE '%/authors/%'
+      OR LOWER(url) LIKE '%/search%'
+      OR LOWER(title) LIKE '%category%'
+      OR LOWER(title) LIKE '%archive%'
+      OR LOWER(title) LIKE '%latest news%'
+    )
+`);
+
 module.exports = db;
diff --git a/src/ingest.js b/src/ingest.js
index c915eb1..34887ce 100644
--- a/src/ingest.js
+++ b/src/ingest.js
@@ -9,14 +9,21 @@ const insertArticle = db.prepare(`
     description,
     content,
     image,
+    is_index_page,
     url,
     normalized_title,
     source,
     pub_date,
     ingested_at
-  ) VALUES (?, ?, NULL, NULL, ?, ?, ?, ?, ?)
+  ) VALUES (?, ?, NULL, NULL, ?, ?, ?, ?, ?, ?)
 `);
 const findByUrl = db.prepare('SELECT id FROM articles WHERE url = ?');
+// URL path segments that usually denote listing pages rather than articles.
+// The segment may be followed by "/", "?", "#" or the end of the URL so that
+// e.g. "/search?q=x" is recognised as well as "/search/".
+const INDEX_PAGE_URL_HINT = /\/(category|categories|tag|tags|topic|topics|section|sections|archive|archives|authors|search)(?:[\/?#]|$)/i;
+// Whole-word title phrases that suggest a listing page.
+const INDEX_PAGE_TITLE_HINT = /\b(category|archives?|latest news)\b/i;
 
 function normalizePubDate(value) {
   if (!value) {
@@ -53,6 +60,26 @@ function normalizePubDate(value) {
   return Number.isNaN(parsed.getTime()) ? null : parsed.toISOString();
 }
 
+/**
+ * Decide the is_index_page flag (0 or 1) to store for an article.
+ *
+ * An explicit `article.isIndexPage` (anything other than null/undefined) wins;
+ * otherwise the flag is guessed from the URL and title hint patterns.
+ *
+ * @param {object} article - Raw article record; only `isIndexPage` is read.
+ * @param {string} title - Trimmed article title.
+ * @param {string} url - Trimmed article URL.
+ * @returns {0|1} 1 when the article is treated as an index page, else 0.
+ */
+function inferIsIndexPage(article, title, url) {
+  if (article.isIndexPage != null) {
+    return article.isIndexPage ? 1 : 0;
+  }
+
+  const looksLikeIndex = INDEX_PAGE_URL_HINT.test(url) || INDEX_PAGE_TITLE_HINT.test(title);
+  return looksLikeIndex ? 1 : 0;
+}
+
 function ingestArticle(article) {
   const title = String(article.title || '').trim();
   const url = String(article.url || '').trim();
@@ -68,6 +79,7 @@ function ingestArticle(article) {
   }
 
   const description = article.description == null ? null : String(article.description).trim() || null;
+  const isIndexPage = inferIsIndexPage(article, title, url);
   const pubDate = normalizePubDate(article.pubDate);
   const ingestedAt = new Date().toISOString();
 
@@ -75,6 +87,7 @@ function ingestArticle(article) {
     const result = insertArticle.run(
       title,
       description,
+      isIndexPage,
       url,
       normalizedTitle,
       source,
diff --git a/src/routes/articles.js b/src/routes/articles.js
index 072bf40..7c14dc4 100644
--- a/src/routes/articles.js
+++ b/src/routes/articles.js
@@ -51,6 +51,73 @@ function buildArticlesQuery(query) {
   };
 }
 
+/**
+ * Parse the `limit` query parameter.
+ *
+ * @param {*} value - Raw query value.
+ * @returns {number} The parsed integer capped at 100, or 20 when the value
+ *   does not parse to a positive integer.
+ */
+function getRequestedLimit(value) {
+  const parsed = Number.parseInt(value, 10);
+  if (!Number.isFinite(parsed) || parsed <= 0) {
+    return 20;
+  }
+
+  return Math.min(parsed, 100);
+}
+
+/**
+ * Report whether index pages should be filtered out of the results.
+ *
+ * @param {object} query - Request query; only `exclude_index_pages` is read.
+ * @returns {boolean} false only when the parameter equals "false"
+ *   (case-insensitive); true for every other value, including when absent.
+ */
+function shouldExcludeIndexPages(query) {
+  const flag = String(query.exclude_index_pages || '').toLowerCase();
+  return flag !== 'false';
+}
+
+/**
+ * Load the article rows for a list of neighbour hits, keeping neighbour order.
+ *
+ * @param {Array<{articleId: *, distance: *}>} neighbors - Hits whose order is preserved.
+ * @param {boolean} excludeIndexPages - When truthy, only rows with is_index_page = 0 are loaded.
+ * @param {number} limit - Maximum number of articles to return.
+ * @returns {object[]} Article rows with the hit's `distance` added; hits without a loaded row are dropped.
+ */
+function mapNeighborsToArticles(neighbors, excludeIndexPages, limit) {
+  const ids = neighbors.map((row) => row.articleId);
+  if (ids.length === 0) {
+    return [];
+  }
+
+  const placeholders = ids.map(() => '?').join(', ');
+  const indexPageFilter = excludeIndexPages ? 'AND is_index_page = 0' : '';
+  const rows = db.prepare(`
+    SELECT id, title, description, content, image, url, normalized_title, source, pub_date, ingested_at
+    FROM articles
+    WHERE id IN (${placeholders})
+    ${indexPageFilter}
+  `).all(...ids);
+
+  const byId = new Map();
+  for (const row of rows) {
+    byId.set(row.id, row);
+  }
+
+  const results = [];
+  for (const neighbor of neighbors) {
+    const article = byId.get(neighbor.articleId);
+    if (article) {
+      results.push({ ...article, distance: neighbor.distance });
+    }
+  }
+
+  return results.slice(0, limit);
+}
+
 async function articleRoutes(fastify) {
   fastify.get('/articles', async (request, reply) => {
     const query = request.query || {};
@@ -60,7 +93,8 @@ async function articleRoutes(fastify) {
     }
 
     if (query.semantic !== undefined) {
-      const limit = Number.parseInt(query.limit, 10);
+      const limit = getRequestedLimit(query.limit);
+      const excludeIndexPages = shouldExcludeIndexPages(query);
       const embedding = await getOrCreateQueryEmbedding(query.semantic);
 
       if (!embedding) {
@@ -70,35 +104,19 @@ async function articleRoutes(fastify) {
 
       const neighbors = findArticlesByEmbedding(
         embedding,
-        Number.isFinite(limit) && limit > 0 ? Math.min(limit, 100) : 20
+        Math.min(limit * 5, 500)
       );
-      const ids = neighbors.map((row) => row.articleId);
-      if (ids.length === 0) {
-        return [];
-      }
 
-      const placeholders = ids.map(() => '?').join(', ');
-      const articles = db.prepare(`
-        SELECT id, title, description, content, image, url, normalized_title, source, pub_date, ingested_at
-        FROM articles
-        WHERE id IN (${placeholders})
-      `).all(...ids);
-      const byId = new Map(articles.map((article) => [article.id, article]));
-
-      return neighbors
-        .map((row) => {
-          const article = byId.get(row.articleId);
-          return article ? { ...article, distance: row.distance } : null;
-        })
-        .filter(Boolean);
+      return mapNeighborsToArticles(neighbors, excludeIndexPages, limit);
     }
 
     if (query.similar_to_article) {
-      const limit = Number.parseInt(query.limit, 10);
+      const limit = getRequestedLimit(query.limit);
+      const excludeIndexPages = shouldExcludeIndexPages(query);
       const articleId = Number.parseInt(query.similar_to_article, 10);
       const neighbors = findSimilarArticles(
         articleId,
-        Number.isFinite(limit) && limit > 0 ? Math.min(limit, 100) : 20
+        Math.min(limit * 5, 500)
       );
 
       if (neighbors.length === 0 && !getEmbeddingBuffer(articleId)) {
@@ -106,25 +124,7 @@ async function articleRoutes(fastify) {
         return { error: 'Embedding not found for article' };
       }
 
-      const ids = neighbors.map((row) => row.articleId);
-      if (ids.length === 0) {
-        return [];
-      }
-
-      const placeholders = ids.map(() => '?').join(', ');
-      const articles = db.prepare(`
-        SELECT id, title, description, content, image, url, normalized_title, source, pub_date, ingested_at
-        FROM articles
-        WHERE id IN (${placeholders})
-      `).all(...ids);
-      const byId = new Map(articles.map((article) => [article.id, article]));
-
-      return neighbors
-        .map((row) => {
-          const article = byId.get(row.articleId);
-          return article ? { ...article, distance: row.distance } : null;
-        })
-        .filter(Boolean);
+      return mapNeighborsToArticles(neighbors, excludeIndexPages, limit);
     }
 
     const { sql, params } = buildArticlesQuery(query);
diff --git a/src/sources/newsCrawler.js b/src/sources/newsCrawler.js
index 3ddab52..0a14f49 100644
--- a/src/sources/newsCrawler.js
+++ b/src/sources/newsCrawler.js
@@ -557,6 +557,7 @@ async function crawlSite(site) {
           url: canonicalUrl,
           source: normalizedSite.name,
           pubDate: selectPubDate(meta, jsonLdArticle, html),
+          isIndexPage: !isArticleCandidate,
         });
       }
     }