diff --git a/README.md b/README.md index 9417240..b834a52 100644 --- a/README.md +++ b/README.md @@ -50,7 +50,7 @@ Use this to confirm the server is running, not to inspect ingestion state. ### `GET /articles` -Returns articles from the `articles` table. Behavior changes based on the query params you send. +Returns articles from the `articles` table. Only articles that are considered **usable** are exposed — meaning they have non-empty `content`, a stored embedding, and are not index/category pages. Behavior changes based on the query params you send. #### Query params @@ -237,6 +237,7 @@ Returns one article by numeric ID. **Behavior** - Looks up the article directly in SQLite. +- Same usability filter as the list endpoint — returns `404` if the article exists but is not usable. - Returns the same article fields as normal `/articles` list mode. - Does not return embedding data. - Returns `404` if the ID does not exist. @@ -257,16 +258,10 @@ Returns ingestion and archive summary information. 
**Response fields** -- `totalArticles`: total number of rows in `articles` -- `countsBySource`: article counts grouped by source name +- `total`: total number of rows in `articles` across all sources +- `usable`: number of articles that have non-empty `content`, a stored embedding, and are not index pages - `lastIngestionBySource`: in-memory timestamps of the last successful batch run per source -- `contentFetchCoverage.total`: total article count used for coverage math -- `contentFetchCoverage.withContent`: rows whose `content` is present and non-empty -- `contentFetchCoverage.withImage`: rows whose `image` is present and non-empty -- `contentFetchCoverage.withEmbedding`: rows that have an embedding in `article_embeddings` -- `contentFetchCoverage.contentRatio`: `withContent / total` -- `contentFetchCoverage.imageRatio`: `withImage / total` -- `contentFetchCoverage.embeddingRatio`: `withEmbedding / total` +- `bySource`: per-source breakdown, each with `total` and `usable` counts **Important detail** @@ -275,26 +270,18 @@ Returns ingestion and archive summary information. 
**Example response** ```json { - "totalArticles": 10234, - "countsBySource": { - "alphavantage": 120, - "edgar": 88, - "finnhub": 400, - "gdelt": 2100, - "rss": 7526 - }, + "total": 10234, + "usable": 8700, "lastIngestionBySource": { "rss": "2025-01-02T10:00:00.000Z", "gdelt": "2025-01-02T10:05:00.000Z" }, - "contentFetchCoverage": { - "withContent": 9000, - "withImage": 6500, - "withEmbedding": 8700, - "total": 10234, - "contentRatio": 0.8794, - "imageRatio": 0.6351, - "embeddingRatio": 0.8501 + "bySource": { + "alphavantage": { "total": 120, "usable": 98 }, + "edgar": { "total": 88, "usable": 70 }, + "finnhub": { "total": 400, "usable": 360 }, + "gdelt": { "total": 2100, "usable": 1800 }, + "rss": { "total": 7526, "usable": 6372 } } } ``` diff --git a/src/routes/articles.js b/src/routes/articles.js index 7c14dc4..298f43f 100644 --- a/src/routes/articles.js +++ b/src/routes/articles.js @@ -32,7 +32,11 @@ function buildArticlesQuery(query) { params.push(query.to); } - const whereClause = conditions.length > 0 ? `WHERE ${conditions.join(' AND ')}` : ''; + conditions.push('content IS NOT NULL AND content != \'\''); + conditions.push('is_index_page = 0'); + conditions.push('EXISTS (SELECT 1 FROM article_embeddings WHERE article_id = articles.id)'); + + const whereClause = `WHERE ${conditions.join(' AND ')}`; const limit = Number.parseInt(query.limit, 10); const offset = Number.parseInt(query.offset, 10); @@ -71,7 +75,9 @@ function mapNeighborsToArticles(neighbors, excludeIndexPages, limit) { SELECT id, title, description, content, image, url, normalized_title, source, pub_date, ingested_at FROM articles WHERE id IN (${placeholders}) - ${excludeIndexPages ? 'AND is_index_page = 0' : ''} + AND content IS NOT NULL AND content != '' + AND EXISTS (SELECT 1 FROM article_embeddings WHERE article_id = articles.id) + ${excludeIndexPages ? 
'AND is_index_page = 0' : ''} `).all(...ids); const byId = new Map(articles.map((article) => [article.id, article])); @@ -141,6 +147,9 @@ async function articleRoutes(fastify) { SELECT id, title, description, content, image, url, normalized_title, source, pub_date, ingested_at FROM articles WHERE id = ? + AND content IS NOT NULL AND content != '' + AND is_index_page = 0 + AND EXISTS (SELECT 1 FROM article_embeddings WHERE article_id = articles.id) `).get(request.params.id); if (!article) { diff --git a/src/routes/status.js b/src/routes/status.js index 62799a7..1f323a3 100644 --- a/src/routes/status.js +++ b/src/routes/status.js @@ -3,30 +3,38 @@ const { getLastIngestionBySource } = require('../state'); async function statusRoutes(fastify) { fastify.get('/status', async () => { - const total = db.prepare('SELECT COUNT(*) AS count FROM articles').get().count; - const bySourceRows = db.prepare('SELECT source, COUNT(*) AS count FROM articles GROUP BY source ORDER BY source').all(); - const contentCoverage = db.prepare(` + const bySourceRows = db.prepare(` SELECT + source, COUNT(*) AS total, - SUM(CASE WHEN content IS NOT NULL AND TRIM(content) <> '' THEN 1 ELSE 0 END) AS with_content, - SUM(CASE WHEN image IS NOT NULL AND TRIM(image) <> '' THEN 1 ELSE 0 END) AS with_image + + SUM(CASE + WHEN content IS NOT NULL AND content != '' + AND is_index_page = 0 + AND EXISTS (SELECT 1 FROM article_embeddings WHERE article_id = articles.id) + THEN 1 ELSE 0 + END) AS usable FROM articles - `).get(); - const embeddingCoverage = db.prepare('SELECT COUNT(*) AS count FROM article_embeddings').get(); + GROUP BY source + ORDER BY source + `).all(); + + const totals = bySourceRows.reduce( + (acc, row) => { + acc.total += row.total; + acc.usable += row.usable; + return acc; + }, + { total: 0, usable: 0 } + ); return { - totalArticles: total, - countsBySource: Object.fromEntries(bySourceRows.map((row) => [row.source, row.count])), + total: totals.total, + usable: totals.usable, 
lastIngestionBySource: getLastIngestionBySource(), - contentFetchCoverage: { - withContent: contentCoverage.with_content || 0, - withImage: contentCoverage.with_image || 0, - withEmbedding: embeddingCoverage.count || 0, - total: contentCoverage.total || 0, - contentRatio: contentCoverage.total ? Number((contentCoverage.with_content / contentCoverage.total).toFixed(4)) : 0, - imageRatio: contentCoverage.total ? Number((contentCoverage.with_image / contentCoverage.total).toFixed(4)) : 0, - embeddingRatio: contentCoverage.total ? Number((embeddingCoverage.count / contentCoverage.total).toFixed(4)) : 0, - }, + bySource: Object.fromEntries( + bySourceRows.map((row) => [row.source, { total: row.total, usable: row.usable }]) + ), }; }); }