enhance article handling with index page detection and query improvements

This commit is contained in:
ImBenji 2026-04-17 01:02:39 +01:00
parent b298be6108
commit 01bc624a24
4 changed files with 86 additions and 43 deletions

View file

@ -19,6 +19,7 @@ db.exec(`
content_status TEXT, content_status TEXT,
content_error TEXT, content_error TEXT,
content_attempted_at TEXT, content_attempted_at TEXT,
is_index_page INTEGER NOT NULL DEFAULT 0,
url TEXT NOT NULL UNIQUE, url TEXT NOT NULL UNIQUE,
normalized_title TEXT NOT NULL, normalized_title TEXT NOT NULL,
source TEXT NOT NULL, source TEXT NOT NULL,
@ -54,6 +55,7 @@ function rebuildArticlesTableIfNeeded() {
content_status TEXT, content_status TEXT,
content_error TEXT, content_error TEXT,
content_attempted_at TEXT, content_attempted_at TEXT,
is_index_page INTEGER NOT NULL DEFAULT 0,
url TEXT NOT NULL UNIQUE, url TEXT NOT NULL UNIQUE,
normalized_title TEXT NOT NULL, normalized_title TEXT NOT NULL,
source TEXT NOT NULL, source TEXT NOT NULL,
@ -70,6 +72,7 @@ function rebuildArticlesTableIfNeeded() {
content_status, content_status,
content_error, content_error,
content_attempted_at, content_attempted_at,
is_index_page,
url, url,
normalized_title, normalized_title,
source, source,
@ -85,6 +88,7 @@ function rebuildArticlesTableIfNeeded() {
content_status, content_status,
content_error, content_error,
content_attempted_at, content_attempted_at,
0,
url, url,
normalized_title, normalized_title,
source, source,
@ -127,7 +131,8 @@ for (const statement of [
'ALTER TABLE articles ADD COLUMN image TEXT', 'ALTER TABLE articles ADD COLUMN image TEXT',
'ALTER TABLE articles ADD COLUMN content_status TEXT', 'ALTER TABLE articles ADD COLUMN content_status TEXT',
'ALTER TABLE articles ADD COLUMN content_error TEXT', 'ALTER TABLE articles ADD COLUMN content_error TEXT',
'ALTER TABLE articles ADD COLUMN content_attempted_at TEXT' 'ALTER TABLE articles ADD COLUMN content_attempted_at TEXT',
'ALTER TABLE articles ADD COLUMN is_index_page INTEGER NOT NULL DEFAULT 0'
]) { ]) {
try { try {
db.exec(statement); db.exec(statement);
@ -138,4 +143,28 @@ for (const statement of [
} }
} }
// Backfill: flag already-stored rows as index/listing pages based on URL and
// title substrings. Only rows whose is_index_page is currently 0 are touched,
// and the statement only ever sets the flag to 1 (it never clears it).
//
// NOTE(review): because the filter is `is_index_page = 0`, a row that was
// deliberately stored as 0 is still flipped to 1 if its URL or title matches
// one of the patterns below — confirm that is intended.
// NOTE(review): '%/archive%' and '%/search%' have no trailing slash, so they
// also match longer segments (e.g. "/archived-..." or "/search-..."), unlike
// the other URL patterns — confirm that is intended.
// NOTE(review): the title pattern '%archive%' already covers '%archives%'.
db.exec(`
UPDATE articles
SET is_index_page = 1
WHERE is_index_page = 0
AND (
LOWER(url) LIKE '%/category/%'
OR LOWER(url) LIKE '%/categories/%'
OR LOWER(url) LIKE '%/tag/%'
OR LOWER(url) LIKE '%/tags/%'
OR LOWER(url) LIKE '%/topic/%'
OR LOWER(url) LIKE '%/topics/%'
OR LOWER(url) LIKE '%/section/%'
OR LOWER(url) LIKE '%/sections/%'
OR LOWER(url) LIKE '%/archive%'
OR LOWER(url) LIKE '%/archives/%'
OR LOWER(url) LIKE '%/authors/%'
OR LOWER(url) LIKE '%/search%'
OR LOWER(title) LIKE '%category%'
OR LOWER(title) LIKE '%archives%'
OR LOWER(title) LIKE '%archive%'
OR LOWER(title) LIKE '%latest news%'
)
`);
module.exports = db; module.exports = db;

View file

@ -9,14 +9,17 @@ const insertArticle = db.prepare(`
description, description,
content, content,
image, image,
is_index_page,
url, url,
normalized_title, normalized_title,
source, source,
pub_date, pub_date,
ingested_at ingested_at
) VALUES (?, ?, NULL, NULL, ?, ?, ?, ?, ?) ) VALUES (?, ?, NULL, NULL, ?, ?, ?, ?, ?, ?)
`); `);
const findByUrl = db.prepare('SELECT id FROM articles WHERE url = ?'); const findByUrl = db.prepare('SELECT id FROM articles WHERE url = ?');
// URL heuristic for index/listing pages: a path segment from the list below,
// terminated by "/", "?", "#", or the end of the string (case-insensitive).
// "?" and "#" are accepted so that URLs such as "/search?q=..." or
// "/archive#2024" are classified at ingest time instead of being missed.
// A longer segment that merely starts with one of the words (e.g. "/tagline/")
// does not match.
const INDEX_PAGE_URL_HINT = /\/(category|categories|tag|tags|topic|topics|section|sections|archive|archives|authors|search)(?:[\/?#]|$)/i;
// Title heuristic for index/listing pages: the whole words "category",
// "archive"/"archives", or the phrase "latest news" (case-insensitive).
const INDEX_PAGE_TITLE_HINT = /\b(category|archives?|latest news)\b/i;
function normalizePubDate(value) { function normalizePubDate(value) {
if (!value) { if (!value) {
@ -53,6 +56,14 @@ function normalizePubDate(value) {
return Number.isNaN(parsed.getTime()) ? null : parsed.toISOString(); return Number.isNaN(parsed.getTime()) ? null : parsed.toISOString();
} }
/**
 * Decide whether an article should be stored as an index/listing page.
 *
 * An explicit `article.isIndexPage` (anything other than null/undefined) wins;
 * otherwise the URL and title hint patterns are consulted.
 *
 * @param {object} article - Incoming article; may carry `isIndexPage`.
 * @param {string} title - Article title to test against the title hint.
 * @param {string} url - Article URL to test against the URL hint.
 * @returns {number} 1 when the page is treated as an index page, else 0.
 */
function inferIsIndexPage(article, title, url) {
  const explicitFlag = article.isIndexPage;
  const looksLikeIndex = explicitFlag == null
    ? INDEX_PAGE_URL_HINT.test(url) || INDEX_PAGE_TITLE_HINT.test(title)
    : Boolean(explicitFlag);
  return Number(looksLikeIndex);
}
function ingestArticle(article) { function ingestArticle(article) {
const title = String(article.title || '').trim(); const title = String(article.title || '').trim();
const url = String(article.url || '').trim(); const url = String(article.url || '').trim();
@ -68,6 +79,7 @@ function ingestArticle(article) {
} }
const description = article.description == null ? null : String(article.description).trim() || null; const description = article.description == null ? null : String(article.description).trim() || null;
const isIndexPage = inferIsIndexPage(article, title, url);
const pubDate = normalizePubDate(article.pubDate); const pubDate = normalizePubDate(article.pubDate);
const ingestedAt = new Date().toISOString(); const ingestedAt = new Date().toISOString();
@ -75,6 +87,7 @@ function ingestArticle(article) {
const result = insertArticle.run( const result = insertArticle.run(
title, title,
description, description,
isIndexPage,
url, url,
normalizedTitle, normalizedTitle,
source, source,

View file

@ -51,6 +51,39 @@ function buildArticlesQuery(query) {
}; };
} }
/**
 * Parse a client-supplied `limit` value into a usable page size.
 *
 * @param {*} value - Raw limit, parsed as a base-10 integer.
 * @returns {number} The parsed limit capped at 100, or 20 when the value is
 *   missing, non-numeric, zero, or negative.
 */
function getRequestedLimit(value) {
  const DEFAULT_LIMIT = 20;
  const MAX_LIMIT = 100;

  const parsed = Number.parseInt(value, 10);
  if (!Number.isFinite(parsed) || parsed <= 0) {
    return DEFAULT_LIMIT;
  }
  return parsed > MAX_LIMIT ? MAX_LIMIT : parsed;
}
/**
 * Decide whether index/listing pages should be filtered out of results.
 *
 * Filtering is on by default and is switched off only when
 * `exclude_index_pages` is "false" (any letter case) or the boolean `false`.
 *
 * @param {object} query - Parsed query-string object.
 * @returns {boolean} True when index pages should be excluded.
 */
function shouldExcludeIndexPages(query) {
  const raw = query.exclude_index_pages;
  // Use a null check rather than `||`: a boolean `false` (e.g. after
  // schema-driven type coercion) must read as "false", not as "missing".
  const text = raw == null ? '' : String(raw);
  return text.toLowerCase() !== 'false';
}
/**
 * Resolve nearest-neighbour hits into article rows, keeping neighbour order.
 *
 * @param {Array<{articleId: *, distance: *}>} neighbors - Hits in ranked order.
 * @param {boolean} excludeIndexPages - When truthy, rows with
 *   `is_index_page` other than 0 are left out of the lookup.
 * @param {number} limit - Maximum number of articles to return.
 * @returns {Array<object>} Article rows (each with the hit's `distance`), in
 *   the same order as `neighbors`; hits with no matching row are dropped.
 */
function mapNeighborsToArticles(neighbors, excludeIndexPages, limit) {
  if (neighbors.length === 0) {
    return [];
  }

  const ids = [];
  for (const neighbor of neighbors) {
    ids.push(neighbor.articleId);
  }

  // One "?" per id so the values are bound rather than interpolated.
  const sql = [
    '',
    'SELECT id, title, description, content, image, url, normalized_title, source, pub_date, ingested_at',
    'FROM articles',
    `WHERE id IN (${Array(ids.length).fill('?').join(', ')})`,
    excludeIndexPages ? 'AND is_index_page = 0' : '',
    '',
  ].join('\n');
  const rows = db.prepare(sql).all(...ids);

  const articleById = new Map();
  for (const row of rows) {
    articleById.set(row.id, row);
  }

  // Walk the neighbours (not the rows) so the ranked order is preserved.
  const results = [];
  for (const { articleId, distance } of neighbors) {
    const article = articleById.get(articleId);
    if (article) {
      results.push({ ...article, distance });
    }
  }
  return results.slice(0, limit);
}
async function articleRoutes(fastify) { async function articleRoutes(fastify) {
fastify.get('/articles', async (request, reply) => { fastify.get('/articles', async (request, reply) => {
const query = request.query || {}; const query = request.query || {};
@ -60,7 +93,8 @@ async function articleRoutes(fastify) {
} }
if (query.semantic !== undefined) { if (query.semantic !== undefined) {
const limit = Number.parseInt(query.limit, 10); const limit = getRequestedLimit(query.limit);
const excludeIndexPages = shouldExcludeIndexPages(query);
const embedding = await getOrCreateQueryEmbedding(query.semantic); const embedding = await getOrCreateQueryEmbedding(query.semantic);
if (!embedding) { if (!embedding) {
@ -70,35 +104,19 @@ async function articleRoutes(fastify) {
const neighbors = findArticlesByEmbedding( const neighbors = findArticlesByEmbedding(
embedding, embedding,
Number.isFinite(limit) && limit > 0 ? Math.min(limit, 100) : 20 Math.min(limit * 5, 500)
); );
const ids = neighbors.map((row) => row.articleId);
if (ids.length === 0) {
return [];
}
const placeholders = ids.map(() => '?').join(', '); return mapNeighborsToArticles(neighbors, excludeIndexPages, limit);
const articles = db.prepare(`
SELECT id, title, description, content, image, url, normalized_title, source, pub_date, ingested_at
FROM articles
WHERE id IN (${placeholders})
`).all(...ids);
const byId = new Map(articles.map((article) => [article.id, article]));
return neighbors
.map((row) => {
const article = byId.get(row.articleId);
return article ? { ...article, distance: row.distance } : null;
})
.filter(Boolean);
} }
if (query.similar_to_article) { if (query.similar_to_article) {
const limit = Number.parseInt(query.limit, 10); const limit = getRequestedLimit(query.limit);
const excludeIndexPages = shouldExcludeIndexPages(query);
const articleId = Number.parseInt(query.similar_to_article, 10); const articleId = Number.parseInt(query.similar_to_article, 10);
const neighbors = findSimilarArticles( const neighbors = findSimilarArticles(
articleId, articleId,
Number.isFinite(limit) && limit > 0 ? Math.min(limit, 100) : 20 Math.min(limit * 5, 500)
); );
if (neighbors.length === 0 && !getEmbeddingBuffer(articleId)) { if (neighbors.length === 0 && !getEmbeddingBuffer(articleId)) {
@ -106,25 +124,7 @@ async function articleRoutes(fastify) {
return { error: 'Embedding not found for article' }; return { error: 'Embedding not found for article' };
} }
const ids = neighbors.map((row) => row.articleId); return mapNeighborsToArticles(neighbors, excludeIndexPages, limit);
if (ids.length === 0) {
return [];
}
const placeholders = ids.map(() => '?').join(', ');
const articles = db.prepare(`
SELECT id, title, description, content, image, url, normalized_title, source, pub_date, ingested_at
FROM articles
WHERE id IN (${placeholders})
`).all(...ids);
const byId = new Map(articles.map((article) => [article.id, article]));
return neighbors
.map((row) => {
const article = byId.get(row.articleId);
return article ? { ...article, distance: row.distance } : null;
})
.filter(Boolean);
} }
const { sql, params } = buildArticlesQuery(query); const { sql, params } = buildArticlesQuery(query);

View file

@ -557,6 +557,7 @@ async function crawlSite(site) {
url: canonicalUrl, url: canonicalUrl,
source: normalizedSite.name, source: normalizedSite.name,
pubDate: selectPubDate(meta, jsonLdArticle, html), pubDate: selectPubDate(meta, jsonLdArticle, html),
isIndexPage: !isArticleCandidate,
}); });
} }
} }