enhance article handling with index page detection and query improvements
This commit is contained in:
parent
b298be6108
commit
01bc624a24
4 changed files with 86 additions and 43 deletions
31
src/db.js
31
src/db.js
|
|
@ -19,6 +19,7 @@ db.exec(`
|
||||||
content_status TEXT,
|
content_status TEXT,
|
||||||
content_error TEXT,
|
content_error TEXT,
|
||||||
content_attempted_at TEXT,
|
content_attempted_at TEXT,
|
||||||
|
is_index_page INTEGER NOT NULL DEFAULT 0,
|
||||||
url TEXT NOT NULL UNIQUE,
|
url TEXT NOT NULL UNIQUE,
|
||||||
normalized_title TEXT NOT NULL,
|
normalized_title TEXT NOT NULL,
|
||||||
source TEXT NOT NULL,
|
source TEXT NOT NULL,
|
||||||
|
|
@ -54,6 +55,7 @@ function rebuildArticlesTableIfNeeded() {
|
||||||
content_status TEXT,
|
content_status TEXT,
|
||||||
content_error TEXT,
|
content_error TEXT,
|
||||||
content_attempted_at TEXT,
|
content_attempted_at TEXT,
|
||||||
|
is_index_page INTEGER NOT NULL DEFAULT 0,
|
||||||
url TEXT NOT NULL UNIQUE,
|
url TEXT NOT NULL UNIQUE,
|
||||||
normalized_title TEXT NOT NULL,
|
normalized_title TEXT NOT NULL,
|
||||||
source TEXT NOT NULL,
|
source TEXT NOT NULL,
|
||||||
|
|
@ -70,6 +72,7 @@ function rebuildArticlesTableIfNeeded() {
|
||||||
content_status,
|
content_status,
|
||||||
content_error,
|
content_error,
|
||||||
content_attempted_at,
|
content_attempted_at,
|
||||||
|
is_index_page,
|
||||||
url,
|
url,
|
||||||
normalized_title,
|
normalized_title,
|
||||||
source,
|
source,
|
||||||
|
|
@ -85,6 +88,7 @@ function rebuildArticlesTableIfNeeded() {
|
||||||
content_status,
|
content_status,
|
||||||
content_error,
|
content_error,
|
||||||
content_attempted_at,
|
content_attempted_at,
|
||||||
|
0,
|
||||||
url,
|
url,
|
||||||
normalized_title,
|
normalized_title,
|
||||||
source,
|
source,
|
||||||
|
|
@ -127,7 +131,8 @@ for (const statement of [
|
||||||
'ALTER TABLE articles ADD COLUMN image TEXT',
|
'ALTER TABLE articles ADD COLUMN image TEXT',
|
||||||
'ALTER TABLE articles ADD COLUMN content_status TEXT',
|
'ALTER TABLE articles ADD COLUMN content_status TEXT',
|
||||||
'ALTER TABLE articles ADD COLUMN content_error TEXT',
|
'ALTER TABLE articles ADD COLUMN content_error TEXT',
|
||||||
'ALTER TABLE articles ADD COLUMN content_attempted_at TEXT'
|
'ALTER TABLE articles ADD COLUMN content_attempted_at TEXT',
|
||||||
|
'ALTER TABLE articles ADD COLUMN is_index_page INTEGER NOT NULL DEFAULT 0'
|
||||||
]) {
|
]) {
|
||||||
try {
|
try {
|
||||||
db.exec(statement);
|
db.exec(statement);
|
||||||
|
|
@ -138,4 +143,28 @@ for (const statement of [
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Backfill for the is_index_page column: sets is_index_page = 1 on rows that
// are currently 0 when either the lower-cased URL contains a listing-style path
// segment (category, tag, topic, section, archive, authors, search) or the
// lower-cased title contains "category", "archive(s)" or "latest news".
// NOTE(review): the only guard is `is_index_page = 0`, so each time this
// statement runs it re-flags every matching row, including rows that were
// stored as 0 on purpose - confirm that overriding those is intended.
// NOTE(review): LIKE is plain substring matching, so '%/archive%' and
// '%/search%' also match longer slugs that merely start with those words, and
// '%/archive%' / title '%archive%' already cover the '%/archives/%' and
// '%archives%' patterns listed alongside them.
db.exec(`
|
||||||
|
UPDATE articles
|
||||||
|
SET is_index_page = 1
|
||||||
|
WHERE is_index_page = 0
|
||||||
|
AND (
|
||||||
|
LOWER(url) LIKE '%/category/%'
|
||||||
|
OR LOWER(url) LIKE '%/categories/%'
|
||||||
|
OR LOWER(url) LIKE '%/tag/%'
|
||||||
|
OR LOWER(url) LIKE '%/tags/%'
|
||||||
|
OR LOWER(url) LIKE '%/topic/%'
|
||||||
|
OR LOWER(url) LIKE '%/topics/%'
|
||||||
|
OR LOWER(url) LIKE '%/section/%'
|
||||||
|
OR LOWER(url) LIKE '%/sections/%'
|
||||||
|
OR LOWER(url) LIKE '%/archive%'
|
||||||
|
OR LOWER(url) LIKE '%/archives/%'
|
||||||
|
OR LOWER(url) LIKE '%/authors/%'
|
||||||
|
OR LOWER(url) LIKE '%/search%'
|
||||||
|
OR LOWER(title) LIKE '%category%'
|
||||||
|
OR LOWER(title) LIKE '%archives%'
|
||||||
|
OR LOWER(title) LIKE '%archive%'
|
||||||
|
OR LOWER(title) LIKE '%latest news%'
|
||||||
|
)
|
||||||
|
`);
|
||||||
|
|
||||||
module.exports = db;
|
module.exports = db;
|
||||||
|
|
|
||||||
|
|
@ -9,14 +9,17 @@ const insertArticle = db.prepare(`
|
||||||
description,
|
description,
|
||||||
content,
|
content,
|
||||||
image,
|
image,
|
||||||
|
is_index_page,
|
||||||
url,
|
url,
|
||||||
normalized_title,
|
normalized_title,
|
||||||
source,
|
source,
|
||||||
pub_date,
|
pub_date,
|
||||||
ingested_at
|
ingested_at
|
||||||
) VALUES (?, ?, NULL, NULL, ?, ?, ?, ?, ?)
|
) VALUES (?, ?, NULL, NULL, ?, ?, ?, ?, ?, ?)
|
||||||
`);
|
`);
|
||||||
const findByUrl = db.prepare('SELECT id FROM articles WHERE url = ?');
|
const findByUrl = db.prepare('SELECT id FROM articles WHERE url = ?');
|
||||||
|
// URL heuristic for listing/index pages: one of the known path segments must
// appear as a whole segment, i.e. be followed by "/", a query string ("?"),
// a fragment ("#") or the end of the URL. Accepting "?" and "#" keeps URLs
// such as /search?q=foo or /tag?page=2 from slipping through as articles.
const INDEX_PAGE_URL_HINT = /\/(category|categories|tag|tags|topic|topics|section|sections|archive|archives|authors|search)(?:[/?#]|$)/i;
// Title heuristic for listing/index pages: matches the hint words only as
// whole words (case-insensitive), so e.g. "subcategory" does not match.
const INDEX_PAGE_TITLE_HINT = /\b(category|archives?|latest news)\b/i;
|
||||||
|
|
||||||
function normalizePubDate(value) {
|
function normalizePubDate(value) {
|
||||||
if (!value) {
|
if (!value) {
|
||||||
|
|
@ -53,6 +56,14 @@ function normalizePubDate(value) {
|
||||||
return Number.isNaN(parsed.getTime()) ? null : parsed.toISOString();
|
return Number.isNaN(parsed.getTime()) ? null : parsed.toISOString();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Decide whether an article should be stored as an index/listing page.
 *
 * An explicit `article.isIndexPage` (anything other than null/undefined)
 * always wins and is coerced to 1 or 0. Otherwise the URL and title are
 * checked against the module's index-page hint patterns.
 *
 * @param {object} article - Incoming article; may carry `isIndexPage`.
 * @param {string} title - Title to test against the title hint.
 * @param {string} url - URL to test against the URL hint.
 * @returns {number} 1 when treated as an index page, otherwise 0.
 */
function inferIsIndexPage(article, title, url) {
  const explicitFlag = article.isIndexPage;
  if (explicitFlag !== null && explicitFlag !== undefined) {
    return explicitFlag ? 1 : 0;
  }

  // URL is checked first; the title is only consulted when the URL gives no hint.
  if (INDEX_PAGE_URL_HINT.test(url)) {
    return 1;
  }
  return INDEX_PAGE_TITLE_HINT.test(title) ? 1 : 0;
}
|
||||||
|
|
||||||
function ingestArticle(article) {
|
function ingestArticle(article) {
|
||||||
const title = String(article.title || '').trim();
|
const title = String(article.title || '').trim();
|
||||||
const url = String(article.url || '').trim();
|
const url = String(article.url || '').trim();
|
||||||
|
|
@ -68,6 +79,7 @@ function ingestArticle(article) {
|
||||||
}
|
}
|
||||||
|
|
||||||
const description = article.description == null ? null : String(article.description).trim() || null;
|
const description = article.description == null ? null : String(article.description).trim() || null;
|
||||||
|
const isIndexPage = inferIsIndexPage(article, title, url);
|
||||||
const pubDate = normalizePubDate(article.pubDate);
|
const pubDate = normalizePubDate(article.pubDate);
|
||||||
const ingestedAt = new Date().toISOString();
|
const ingestedAt = new Date().toISOString();
|
||||||
|
|
||||||
|
|
@ -75,6 +87,7 @@ function ingestArticle(article) {
|
||||||
const result = insertArticle.run(
|
const result = insertArticle.run(
|
||||||
title,
|
title,
|
||||||
description,
|
description,
|
||||||
|
isIndexPage,
|
||||||
url,
|
url,
|
||||||
normalizedTitle,
|
normalizedTitle,
|
||||||
source,
|
source,
|
||||||
|
|
|
||||||
|
|
@ -51,6 +51,39 @@ function buildArticlesQuery(query) {
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Parse a caller-supplied result limit.
 *
 * @param {*} value - Raw limit value (parsed as a base-10 integer).
 * @returns {number} The parsed limit capped at 100, or 20 when the value is
 *   not a finite positive integer.
 */
function getRequestedLimit(value) {
  const DEFAULT_LIMIT = 20;
  const MAX_LIMIT = 100;

  const parsed = Number.parseInt(value, 10);
  if (!Number.isFinite(parsed) || parsed <= 0) {
    return DEFAULT_LIMIT;
  }
  return parsed > MAX_LIMIT ? MAX_LIMIT : parsed;
}
|
||||||
|
|
||||||
|
/**
 * Read the `exclude_index_pages` query flag.
 *
 * Index pages are excluded unless the flag, once stringified and
 * lower-cased, is exactly "false". Missing or falsy values keep the
 * exclusion on.
 *
 * @param {object} query - Parsed query-string object.
 * @returns {boolean} true when index pages should be filtered out.
 */
function shouldExcludeIndexPages(query) {
  const rawFlag = query.exclude_index_pages || '';
  const optedOut = String(rawFlag).toLowerCase() === 'false';
  return !optedOut;
}
|
||||||
|
|
||||||
|
/**
 * Turn nearest-neighbour hits into article rows, keeping the neighbour order.
 *
 * Looks up the articles for the given neighbour ids in one query (optionally
 * restricted to rows with is_index_page = 0), attaches each neighbour's
 * `distance` to its article, drops neighbours with no matching row, and
 * returns at most `limit` results.
 *
 * @param {Array<{articleId: *, distance: *}>} neighbors - Hits in ranked order.
 * @param {boolean} excludeIndexPages - When truthy, filter out index pages.
 * @param {number} limit - Maximum number of articles to return.
 * @returns {Array<object>} Article rows with an added `distance` property.
 */
function mapNeighborsToArticles(neighbors, excludeIndexPages, limit) {
  if (neighbors.length === 0) {
    return [];
  }

  const articleIds = neighbors.map((neighbor) => neighbor.articleId);
  const placeholders = articleIds.map(() => '?').join(', ');
  const indexPageFilter = excludeIndexPages ? 'AND is_index_page = 0' : '';

  const rows = db.prepare(`
SELECT id, title, description, content, image, url, normalized_title, source, pub_date, ingested_at
FROM articles
WHERE id IN (${placeholders})
${indexPageFilter}
`).all(...articleIds);

  const articlesById = new Map();
  for (const row of rows) {
    articlesById.set(row.id, row);
  }

  // Walk the neighbours (not the SQL rows) so the ranking order is preserved.
  const ranked = [];
  for (const neighbor of neighbors) {
    const match = articlesById.get(neighbor.articleId);
    if (match) {
      ranked.push({ ...match, distance: neighbor.distance });
    }
  }

  return ranked.slice(0, limit);
}
|
||||||
|
|
||||||
async function articleRoutes(fastify) {
|
async function articleRoutes(fastify) {
|
||||||
fastify.get('/articles', async (request, reply) => {
|
fastify.get('/articles', async (request, reply) => {
|
||||||
const query = request.query || {};
|
const query = request.query || {};
|
||||||
|
|
@ -60,7 +93,8 @@ async function articleRoutes(fastify) {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (query.semantic !== undefined) {
|
if (query.semantic !== undefined) {
|
||||||
const limit = Number.parseInt(query.limit, 10);
|
const limit = getRequestedLimit(query.limit);
|
||||||
|
const excludeIndexPages = shouldExcludeIndexPages(query);
|
||||||
const embedding = await getOrCreateQueryEmbedding(query.semantic);
|
const embedding = await getOrCreateQueryEmbedding(query.semantic);
|
||||||
|
|
||||||
if (!embedding) {
|
if (!embedding) {
|
||||||
|
|
@ -70,35 +104,19 @@ async function articleRoutes(fastify) {
|
||||||
|
|
||||||
const neighbors = findArticlesByEmbedding(
|
const neighbors = findArticlesByEmbedding(
|
||||||
embedding,
|
embedding,
|
||||||
Number.isFinite(limit) && limit > 0 ? Math.min(limit, 100) : 20
|
Math.min(limit * 5, 500)
|
||||||
);
|
);
|
||||||
const ids = neighbors.map((row) => row.articleId);
|
|
||||||
if (ids.length === 0) {
|
|
||||||
return [];
|
|
||||||
}
|
|
||||||
|
|
||||||
const placeholders = ids.map(() => '?').join(', ');
|
return mapNeighborsToArticles(neighbors, excludeIndexPages, limit);
|
||||||
const articles = db.prepare(`
|
|
||||||
SELECT id, title, description, content, image, url, normalized_title, source, pub_date, ingested_at
|
|
||||||
FROM articles
|
|
||||||
WHERE id IN (${placeholders})
|
|
||||||
`).all(...ids);
|
|
||||||
const byId = new Map(articles.map((article) => [article.id, article]));
|
|
||||||
|
|
||||||
return neighbors
|
|
||||||
.map((row) => {
|
|
||||||
const article = byId.get(row.articleId);
|
|
||||||
return article ? { ...article, distance: row.distance } : null;
|
|
||||||
})
|
|
||||||
.filter(Boolean);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (query.similar_to_article) {
|
if (query.similar_to_article) {
|
||||||
const limit = Number.parseInt(query.limit, 10);
|
const limit = getRequestedLimit(query.limit);
|
||||||
|
const excludeIndexPages = shouldExcludeIndexPages(query);
|
||||||
const articleId = Number.parseInt(query.similar_to_article, 10);
|
const articleId = Number.parseInt(query.similar_to_article, 10);
|
||||||
const neighbors = findSimilarArticles(
|
const neighbors = findSimilarArticles(
|
||||||
articleId,
|
articleId,
|
||||||
Number.isFinite(limit) && limit > 0 ? Math.min(limit, 100) : 20
|
Math.min(limit * 5, 500)
|
||||||
);
|
);
|
||||||
|
|
||||||
if (neighbors.length === 0 && !getEmbeddingBuffer(articleId)) {
|
if (neighbors.length === 0 && !getEmbeddingBuffer(articleId)) {
|
||||||
|
|
@ -106,25 +124,7 @@ async function articleRoutes(fastify) {
|
||||||
return { error: 'Embedding not found for article' };
|
return { error: 'Embedding not found for article' };
|
||||||
}
|
}
|
||||||
|
|
||||||
const ids = neighbors.map((row) => row.articleId);
|
return mapNeighborsToArticles(neighbors, excludeIndexPages, limit);
|
||||||
if (ids.length === 0) {
|
|
||||||
return [];
|
|
||||||
}
|
|
||||||
|
|
||||||
const placeholders = ids.map(() => '?').join(', ');
|
|
||||||
const articles = db.prepare(`
|
|
||||||
SELECT id, title, description, content, image, url, normalized_title, source, pub_date, ingested_at
|
|
||||||
FROM articles
|
|
||||||
WHERE id IN (${placeholders})
|
|
||||||
`).all(...ids);
|
|
||||||
const byId = new Map(articles.map((article) => [article.id, article]));
|
|
||||||
|
|
||||||
return neighbors
|
|
||||||
.map((row) => {
|
|
||||||
const article = byId.get(row.articleId);
|
|
||||||
return article ? { ...article, distance: row.distance } : null;
|
|
||||||
})
|
|
||||||
.filter(Boolean);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
const { sql, params } = buildArticlesQuery(query);
|
const { sql, params } = buildArticlesQuery(query);
|
||||||
|
|
|
||||||
|
|
@ -557,6 +557,7 @@ async function crawlSite(site) {
|
||||||
url: canonicalUrl,
|
url: canonicalUrl,
|
||||||
source: normalizedSite.name,
|
source: normalizedSite.name,
|
||||||
pubDate: selectPubDate(meta, jsonLdArticle, html),
|
pubDate: selectPubDate(meta, jsonLdArticle, html),
|
||||||
|
isIndexPage: !isArticleCandidate,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue