enhance article handling with index page detection and query improvements
This commit is contained in:
parent
b298be6108
commit
01bc624a24
4 changed files with 86 additions and 43 deletions
31
src/db.js
31
src/db.js
|
|
@ -19,6 +19,7 @@ db.exec(`
|
|||
content_status TEXT,
|
||||
content_error TEXT,
|
||||
content_attempted_at TEXT,
|
||||
is_index_page INTEGER NOT NULL DEFAULT 0,
|
||||
url TEXT NOT NULL UNIQUE,
|
||||
normalized_title TEXT NOT NULL,
|
||||
source TEXT NOT NULL,
|
||||
|
|
@ -54,6 +55,7 @@ function rebuildArticlesTableIfNeeded() {
|
|||
content_status TEXT,
|
||||
content_error TEXT,
|
||||
content_attempted_at TEXT,
|
||||
is_index_page INTEGER NOT NULL DEFAULT 0,
|
||||
url TEXT NOT NULL UNIQUE,
|
||||
normalized_title TEXT NOT NULL,
|
||||
source TEXT NOT NULL,
|
||||
|
|
@ -70,6 +72,7 @@ function rebuildArticlesTableIfNeeded() {
|
|||
content_status,
|
||||
content_error,
|
||||
content_attempted_at,
|
||||
is_index_page,
|
||||
url,
|
||||
normalized_title,
|
||||
source,
|
||||
|
|
@ -85,6 +88,7 @@ function rebuildArticlesTableIfNeeded() {
|
|||
content_status,
|
||||
content_error,
|
||||
content_attempted_at,
|
||||
0,
|
||||
url,
|
||||
normalized_title,
|
||||
source,
|
||||
|
|
@ -127,7 +131,8 @@ for (const statement of [
|
|||
'ALTER TABLE articles ADD COLUMN image TEXT',
|
||||
'ALTER TABLE articles ADD COLUMN content_status TEXT',
|
||||
'ALTER TABLE articles ADD COLUMN content_error TEXT',
|
||||
'ALTER TABLE articles ADD COLUMN content_attempted_at TEXT'
|
||||
'ALTER TABLE articles ADD COLUMN content_attempted_at TEXT',
|
||||
'ALTER TABLE articles ADD COLUMN is_index_page INTEGER NOT NULL DEFAULT 0'
|
||||
]) {
|
||||
try {
|
||||
db.exec(statement);
|
||||
|
|
@ -138,4 +143,28 @@ for (const statement of [
|
|||
}
|
||||
}
|
||||
|
||||
// Backfill: flag stored rows that look like index/listing pages (category,
// tag, topic, section, archive, author or search URLs, or listing-style
// titles). Only rows still at is_index_page = 0 are touched, so rows already
// flagged are never reset by this statement.
//
// '/archive' and '/search' are deliberately prefix matches (no trailing
// slash); '%/archive%' therefore also covers '/archives/...', and the title
// pattern '%archive%' also covers 'archives', so those narrower patterns are
// not repeated here.
db.exec(`
  UPDATE articles
  SET is_index_page = 1
  WHERE is_index_page = 0
    AND (
      LOWER(url) LIKE '%/category/%'
      OR LOWER(url) LIKE '%/categories/%'
      OR LOWER(url) LIKE '%/tag/%'
      OR LOWER(url) LIKE '%/tags/%'
      OR LOWER(url) LIKE '%/topic/%'
      OR LOWER(url) LIKE '%/topics/%'
      OR LOWER(url) LIKE '%/section/%'
      OR LOWER(url) LIKE '%/sections/%'
      OR LOWER(url) LIKE '%/archive%'
      OR LOWER(url) LIKE '%/authors/%'
      OR LOWER(url) LIKE '%/search%'
      OR LOWER(title) LIKE '%category%'
      OR LOWER(title) LIKE '%archive%'
      OR LOWER(title) LIKE '%latest news%'
    )
`);
|
||||
|
||||
module.exports = db;
|
||||
|
|
|
|||
|
|
@ -9,14 +9,17 @@ const insertArticle = db.prepare(`
|
|||
description,
|
||||
content,
|
||||
image,
|
||||
is_index_page,
|
||||
url,
|
||||
normalized_title,
|
||||
source,
|
||||
pub_date,
|
||||
ingested_at
|
||||
) VALUES (?, ?, NULL, NULL, ?, ?, ?, ?, ?)
|
||||
) VALUES (?, ?, NULL, NULL, ?, ?, ?, ?, ?, ?)
|
||||
`);
|
||||
const findByUrl = db.prepare('SELECT id FROM articles WHERE url = ?');
|
||||
// URL path segments that usually denote listing/index pages rather than a
// single article. The segment must be followed by '/', '?', '#', or the end of
// the string, so '/search?q=x' and '/archive#2021' match while
// '/searching-for-answers' does not.
const INDEX_PAGE_URL_HINT = /\/(category|categories|tag|tags|topic|topics|section|sections|archive|archives|authors|search)(?:[\/?#]|$)/i;

// Whole-word title hints for listing/index pages ("Category", "Archive(s)",
// "Latest News"), matched case-insensitively.
const INDEX_PAGE_TITLE_HINT = /\b(category|archives?|latest news)\b/i;
|
||||
|
||||
function normalizePubDate(value) {
|
||||
if (!value) {
|
||||
|
|
@ -53,6 +56,14 @@ function normalizePubDate(value) {
|
|||
return Number.isNaN(parsed.getTime()) ? null : parsed.toISOString();
|
||||
}
|
||||
|
||||
/**
 * Decide whether an article should be stored as an index/listing page.
 *
 * An explicit `article.isIndexPage` (anything other than null/undefined) wins
 * and is coerced to 0 or 1. Otherwise the URL and title are tested against
 * INDEX_PAGE_URL_HINT and INDEX_PAGE_TITLE_HINT.
 *
 * @param {object} article - Raw article object; only `isIndexPage` is read here.
 * @param {string} title - Title to test against the title hint.
 * @param {string} url - URL to test against the URL hint.
 * @returns {0|1} 1 when the article is treated as an index page, otherwise 0.
 */
function inferIsIndexPage(article, title, url) {
  if (article.isIndexPage != null) {
    return article.isIndexPage ? 1 : 0;
  }

  // Evaluate the heuristics first, then map to the integer flag, so the
  // `||` / ternary precedence is not left implicit.
  const looksLikeIndexPage = INDEX_PAGE_URL_HINT.test(url) || INDEX_PAGE_TITLE_HINT.test(title);
  return looksLikeIndexPage ? 1 : 0;
}
|
||||
|
||||
function ingestArticle(article) {
|
||||
const title = String(article.title || '').trim();
|
||||
const url = String(article.url || '').trim();
|
||||
|
|
@ -68,6 +79,7 @@ function ingestArticle(article) {
|
|||
}
|
||||
|
||||
const description = article.description == null ? null : String(article.description).trim() || null;
|
||||
const isIndexPage = inferIsIndexPage(article, title, url);
|
||||
const pubDate = normalizePubDate(article.pubDate);
|
||||
const ingestedAt = new Date().toISOString();
|
||||
|
||||
|
|
@ -75,6 +87,7 @@ function ingestArticle(article) {
|
|||
const result = insertArticle.run(
|
||||
title,
|
||||
description,
|
||||
isIndexPage,
|
||||
url,
|
||||
normalizedTitle,
|
||||
source,
|
||||
|
|
|
|||
|
|
@ -51,6 +51,39 @@ function buildArticlesQuery(query) {
|
|||
};
|
||||
}
|
||||
|
||||
/**
 * Parse a requested result limit (typically a query-string value).
 *
 * The value is parsed as a base-10 integer. Anything that does not parse to a
 * finite, positive integer yields `fallback`; larger values are capped at
 * `max`.
 *
 * @param {*} value - Raw limit value, e.g. `query.limit`.
 * @param {object} [options]
 * @param {number} [options.fallback=20] - Returned when `value` is missing or invalid.
 * @param {number} [options.max=100] - Upper bound applied to valid values.
 * @returns {number} The limit to use.
 */
function getRequestedLimit(value, { fallback = 20, max = 100 } = {}) {
  const parsed = Number.parseInt(value, 10);

  if (!Number.isFinite(parsed) || parsed <= 0) {
    return fallback;
  }

  return Math.min(parsed, max);
}
|
||||
|
||||
/**
 * Report whether index/listing pages should be filtered out of results.
 *
 * Exclusion is the default. It is switched off only when
 * `query.exclude_index_pages` is the string "false" (any letter case) or the
 * boolean `false`. A missing or empty value keeps the default.
 *
 * @param {object} query - Request query object.
 * @returns {boolean} true when index pages should be excluded.
 */
function shouldExcludeIndexPages(query) {
  const raw = query.exclude_index_pages;

  // Check for null/undefined explicitly: `raw || ''` would turn a boolean
  // `false` into '' and wrongly keep the exclusion enabled.
  if (raw == null) {
    return true;
  }

  return String(raw).toLowerCase() !== 'false';
}
|
||||
|
||||
/**
 * Load the article rows for a list of nearest-neighbour hits and return them
 * in neighbour order, each with its `distance` attached.
 *
 * Neighbours whose article row is not returned by the query (unknown id, or
 * filtered out as an index page) are dropped before `limit` is applied.
 *
 * @param {Array<{articleId: number, distance: number}>} neighbors - Hits in the order to preserve.
 * @param {boolean} excludeIndexPages - When true, only rows with is_index_page = 0 are selected.
 * @param {number} limit - Maximum number of articles to return.
 * @returns {Array<object>} Article rows extended with `distance`.
 */
function mapNeighborsToArticles(neighbors, excludeIndexPages, limit) {
  if (neighbors.length === 0) {
    return [];
  }

  const ids = neighbors.map((row) => row.articleId);
  // One bound parameter per id; only this fixed fragment and the placeholder
  // list are interpolated into the SQL text, never the ids themselves.
  const placeholders = ids.map(() => '?').join(', ');
  const indexPageFilter = excludeIndexPages ? 'AND is_index_page = 0' : '';

  const articles = db.prepare(`
    SELECT id, title, description, content, image, url, normalized_title, source, pub_date, ingested_at
    FROM articles
    WHERE id IN (${placeholders})
    ${indexPageFilter}
  `).all(...ids);
  const byId = new Map(articles.map((article) => [article.id, article]));

  return neighbors
    .map((row) => {
      const article = byId.get(row.articleId);
      return article ? { ...article, distance: row.distance } : null;
    })
    .filter(Boolean)
    .slice(0, limit);
}
|
||||
|
||||
async function articleRoutes(fastify) {
|
||||
fastify.get('/articles', async (request, reply) => {
|
||||
const query = request.query || {};
|
||||
|
|
@ -60,7 +93,8 @@ async function articleRoutes(fastify) {
|
|||
}
|
||||
|
||||
if (query.semantic !== undefined) {
|
||||
const limit = Number.parseInt(query.limit, 10);
|
||||
const limit = getRequestedLimit(query.limit);
|
||||
const excludeIndexPages = shouldExcludeIndexPages(query);
|
||||
const embedding = await getOrCreateQueryEmbedding(query.semantic);
|
||||
|
||||
if (!embedding) {
|
||||
|
|
@ -70,35 +104,19 @@ async function articleRoutes(fastify) {
|
|||
|
||||
const neighbors = findArticlesByEmbedding(
|
||||
embedding,
|
||||
Number.isFinite(limit) && limit > 0 ? Math.min(limit, 100) : 20
|
||||
Math.min(limit * 5, 500)
|
||||
);
|
||||
const ids = neighbors.map((row) => row.articleId);
|
||||
if (ids.length === 0) {
|
||||
return [];
|
||||
}
|
||||
|
||||
const placeholders = ids.map(() => '?').join(', ');
|
||||
const articles = db.prepare(`
|
||||
SELECT id, title, description, content, image, url, normalized_title, source, pub_date, ingested_at
|
||||
FROM articles
|
||||
WHERE id IN (${placeholders})
|
||||
`).all(...ids);
|
||||
const byId = new Map(articles.map((article) => [article.id, article]));
|
||||
|
||||
return neighbors
|
||||
.map((row) => {
|
||||
const article = byId.get(row.articleId);
|
||||
return article ? { ...article, distance: row.distance } : null;
|
||||
})
|
||||
.filter(Boolean);
|
||||
return mapNeighborsToArticles(neighbors, excludeIndexPages, limit);
|
||||
}
|
||||
|
||||
if (query.similar_to_article) {
|
||||
const limit = Number.parseInt(query.limit, 10);
|
||||
const limit = getRequestedLimit(query.limit);
|
||||
const excludeIndexPages = shouldExcludeIndexPages(query);
|
||||
const articleId = Number.parseInt(query.similar_to_article, 10);
|
||||
const neighbors = findSimilarArticles(
|
||||
articleId,
|
||||
Number.isFinite(limit) && limit > 0 ? Math.min(limit, 100) : 20
|
||||
Math.min(limit * 5, 500)
|
||||
);
|
||||
|
||||
if (neighbors.length === 0 && !getEmbeddingBuffer(articleId)) {
|
||||
|
|
@ -106,25 +124,7 @@ async function articleRoutes(fastify) {
|
|||
return { error: 'Embedding not found for article' };
|
||||
}
|
||||
|
||||
const ids = neighbors.map((row) => row.articleId);
|
||||
if (ids.length === 0) {
|
||||
return [];
|
||||
}
|
||||
|
||||
const placeholders = ids.map(() => '?').join(', ');
|
||||
const articles = db.prepare(`
|
||||
SELECT id, title, description, content, image, url, normalized_title, source, pub_date, ingested_at
|
||||
FROM articles
|
||||
WHERE id IN (${placeholders})
|
||||
`).all(...ids);
|
||||
const byId = new Map(articles.map((article) => [article.id, article]));
|
||||
|
||||
return neighbors
|
||||
.map((row) => {
|
||||
const article = byId.get(row.articleId);
|
||||
return article ? { ...article, distance: row.distance } : null;
|
||||
})
|
||||
.filter(Boolean);
|
||||
return mapNeighborsToArticles(neighbors, excludeIndexPages, limit);
|
||||
}
|
||||
|
||||
const { sql, params } = buildArticlesQuery(query);
|
||||
|
|
|
|||
|
|
@ -557,6 +557,7 @@ async function crawlSite(site) {
|
|||
url: canonicalUrl,
|
||||
source: normalizedSite.name,
|
||||
pubDate: selectPubDate(meta, jsonLdArticle, html),
|
||||
isIndexPage: !isArticleCandidate,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue