refine article filtering to ensure only usable articles are returned

This commit is contained in:
ImBenji 2026-04-18 14:17:31 +01:00
parent 4883632e37
commit 9df24d44c9
3 changed files with 50 additions and 46 deletions

View file

@ -50,7 +50,7 @@ Use this to confirm the server is running, not to inspect ingestion state.
### `GET /articles` ### `GET /articles`
Returns articles from the `articles` table. Behavior changes based on the query params you send. Returns articles from the `articles` table. Only articles that are considered **usable** are exposed — meaning they have non-empty `content`, a stored embedding, and are not index/category pages. Behavior changes based on the query params you send.
#### Query params #### Query params
@ -237,6 +237,7 @@ Returns one article by numeric ID.
**Behavior** **Behavior**
- Looks up the article directly in SQLite. - Looks up the article directly in SQLite.
- Same usability filter as the list endpoint — returns `404` if the article exists but is not usable.
- Returns the same article fields as normal `/articles` list mode. - Returns the same article fields as normal `/articles` list mode.
- Does not return embedding data. - Does not return embedding data.
- Returns `404` if the ID does not exist. - Returns `404` if the ID does not exist.
@ -257,16 +258,10 @@ Returns ingestion and archive summary information.
**Response fields** **Response fields**
- `totalArticles`: total number of rows in `articles` - `total`: total number of rows in `articles` across all sources
- `countsBySource`: article counts grouped by source name - `usable`: articles that have content, an embedding, and are not index pages
- `lastIngestionBySource`: in-memory timestamps of the last successful batch run per source - `lastIngestionBySource`: in-memory timestamps of the last successful batch run per source
- `contentFetchCoverage.total`: total article count used for coverage math - `bySource`: per-source breakdown, each with `total` and `usable` counts
- `contentFetchCoverage.withContent`: rows whose `content` is present and non-empty
- `contentFetchCoverage.withImage`: rows whose `image` is present and non-empty
- `contentFetchCoverage.withEmbedding`: rows that have an embedding in `article_embeddings`
- `contentFetchCoverage.contentRatio`: `withContent / total`
- `contentFetchCoverage.imageRatio`: `withImage / total`
- `contentFetchCoverage.embeddingRatio`: `withEmbedding / total`
**Important detail** **Important detail**
@ -275,26 +270,18 @@ Returns ingestion and archive summary information.
**Example response** **Example response**
```json ```json
{ {
"totalArticles": 10234, "total": 10234,
"countsBySource": { "usable": 8700,
"alphavantage": 120,
"edgar": 88,
"finnhub": 400,
"gdelt": 2100,
"rss": 7526
},
"lastIngestionBySource": { "lastIngestionBySource": {
"rss": "2025-01-02T10:00:00.000Z", "rss": "2025-01-02T10:00:00.000Z",
"gdelt": "2025-01-02T10:05:00.000Z" "gdelt": "2025-01-02T10:05:00.000Z"
}, },
"contentFetchCoverage": { "bySource": {
"withContent": 9000, "alphavantage": { "total": 120, "usable": 98 },
"withImage": 6500, "edgar": { "total": 88, "usable": 70 },
"withEmbedding": 8700, "finnhub": { "total": 400, "usable": 360 },
"total": 10234, "gdelt": { "total": 2100, "usable": 1800 },
"contentRatio": 0.8794, "rss": { "total": 7526, "usable": 6372 }
"imageRatio": 0.6351,
"embeddingRatio": 0.8501
} }
} }
``` ```

View file

@ -32,7 +32,11 @@ function buildArticlesQuery(query) {
params.push(query.to); params.push(query.to);
} }
const whereClause = conditions.length > 0 ? `WHERE ${conditions.join(' AND ')}` : ''; conditions.push('content IS NOT NULL AND content != \'\'');
conditions.push('is_index_page = 0');
conditions.push('EXISTS (SELECT 1 FROM article_embeddings WHERE article_id = articles.id)');
const whereClause = `WHERE ${conditions.join(' AND ')}`;
const limit = Number.parseInt(query.limit, 10); const limit = Number.parseInt(query.limit, 10);
const offset = Number.parseInt(query.offset, 10); const offset = Number.parseInt(query.offset, 10);
@ -71,6 +75,8 @@ function mapNeighborsToArticles(neighbors, excludeIndexPages, limit) {
SELECT id, title, description, content, image, url, normalized_title, source, pub_date, ingested_at SELECT id, title, description, content, image, url, normalized_title, source, pub_date, ingested_at
FROM articles FROM articles
WHERE id IN (${placeholders}) WHERE id IN (${placeholders})
AND content IS NOT NULL AND content != ''
AND EXISTS (SELECT 1 FROM article_embeddings WHERE article_id = articles.id)
${excludeIndexPages ? 'AND is_index_page = 0' : ''} ${excludeIndexPages ? 'AND is_index_page = 0' : ''}
`).all(...ids); `).all(...ids);
const byId = new Map(articles.map((article) => [article.id, article])); const byId = new Map(articles.map((article) => [article.id, article]));
@ -141,6 +147,9 @@ async function articleRoutes(fastify) {
SELECT id, title, description, content, image, url, normalized_title, source, pub_date, ingested_at SELECT id, title, description, content, image, url, normalized_title, source, pub_date, ingested_at
FROM articles FROM articles
WHERE id = ? WHERE id = ?
AND content IS NOT NULL AND content != ''
AND is_index_page = 0
AND EXISTS (SELECT 1 FROM article_embeddings WHERE article_id = articles.id)
`).get(request.params.id); `).get(request.params.id);
if (!article) { if (!article) {

View file

@ -3,30 +3,38 @@ const { getLastIngestionBySource } = require('../state');
async function statusRoutes(fastify) { async function statusRoutes(fastify) {
fastify.get('/status', async () => { fastify.get('/status', async () => {
const total = db.prepare('SELECT COUNT(*) AS count FROM articles').get().count; const bySourceRows = db.prepare(`
const bySourceRows = db.prepare('SELECT source, COUNT(*) AS count FROM articles GROUP BY source ORDER BY source').all();
const contentCoverage = db.prepare(`
SELECT SELECT
source,
COUNT(*) AS total, COUNT(*) AS total,
SUM(CASE WHEN content IS NOT NULL AND TRIM(content) <> '' THEN 1 ELSE 0 END) AS with_content,
SUM(CASE WHEN image IS NOT NULL AND TRIM(image) <> '' THEN 1 ELSE 0 END) AS with_image SUM(CASE
WHEN content IS NOT NULL AND content != ''
AND is_index_page = 0
AND EXISTS (SELECT 1 FROM article_embeddings WHERE article_id = articles.id)
THEN 1 ELSE 0
END) AS usable
FROM articles FROM articles
`).get(); GROUP BY source
const embeddingCoverage = db.prepare('SELECT COUNT(*) AS count FROM article_embeddings').get(); ORDER BY source
`).all();
const totals = bySourceRows.reduce(
(acc, row) => {
acc.total += row.total;
acc.usable += row.usable;
return acc;
},
{ total: 0, usable: 0 }
);
return { return {
totalArticles: total, total: totals.total,
countsBySource: Object.fromEntries(bySourceRows.map((row) => [row.source, row.count])), usable: totals.usable,
lastIngestionBySource: getLastIngestionBySource(), lastIngestionBySource: getLastIngestionBySource(),
contentFetchCoverage: { bySource: Object.fromEntries(
withContent: contentCoverage.with_content || 0, bySourceRows.map((row) => [row.source, { total: row.total, usable: row.usable }])
withImage: contentCoverage.with_image || 0, ),
withEmbedding: embeddingCoverage.count || 0,
total: contentCoverage.total || 0,
contentRatio: contentCoverage.total ? Number((contentCoverage.with_content / contentCoverage.total).toFixed(4)) : 0,
imageRatio: contentCoverage.total ? Number((contentCoverage.with_image / contentCoverage.total).toFixed(4)) : 0,
embeddingRatio: contentCoverage.total ? Number((embeddingCoverage.count / contentCoverage.total).toFixed(4)) : 0,
},
}; };
}); });
} }