refine article filtering to ensure only usable articles are returned
This commit is contained in:
parent
4883632e37
commit
9df24d44c9
3 changed files with 50 additions and 46 deletions
39
README.md
39
README.md
|
|
@ -50,7 +50,7 @@ Use this to confirm the server is running, not to inspect ingestion state.
|
|||
|
||||
### `GET /articles`
|
||||
|
||||
Returns articles from the `articles` table. Behavior changes based on the query params you send.
|
||||
Returns articles from the `articles` table. Only articles that are considered **usable** are exposed — meaning they have non-empty `content`, a stored embedding, and are not index/category pages. Behavior changes based on the query params you send.
|
||||
|
||||
#### Query params
|
||||
|
||||
|
|
@ -237,6 +237,7 @@ Returns one article by numeric ID.
|
|||
**Behavior**
|
||||
|
||||
- Looks up the article directly in SQLite.
|
||||
- Applies the same usability filter as the list endpoint: returns `404` if the article exists but is not usable.
|
||||
- Returns the same article fields as normal `/articles` list mode.
|
||||
- Does not return embedding data.
|
||||
- Returns `404` if the ID does not exist.
|
||||
|
|
@ -257,16 +258,10 @@ Returns ingestion and archive summary information.
|
|||
|
||||
**Response fields**
|
||||
|
||||
- `totalArticles`: total number of rows in `articles`
|
||||
- `countsBySource`: article counts grouped by source name
|
||||
- `total`: total number of rows in `articles` across all sources
|
||||
- `usable`: number of articles that have non-empty `content`, a stored embedding, and are not index pages
|
||||
- `lastIngestionBySource`: in-memory timestamps of the last successful batch run per source
|
||||
- `contentFetchCoverage.total`: total article count used for coverage math
|
||||
- `contentFetchCoverage.withContent`: rows whose `content` is present and non-empty
|
||||
- `contentFetchCoverage.withImage`: rows whose `image` is present and non-empty
|
||||
- `contentFetchCoverage.withEmbedding`: rows that have an embedding in `article_embeddings`
|
||||
- `contentFetchCoverage.contentRatio`: `withContent / total`
|
||||
- `contentFetchCoverage.imageRatio`: `withImage / total`
|
||||
- `contentFetchCoverage.embeddingRatio`: `withEmbedding / total`
|
||||
- `bySource`: per-source breakdown keyed by source name; each entry has `total` and `usable` counts
|
||||
|
||||
**Important detail**
|
||||
|
||||
|
|
@ -275,26 +270,18 @@ Returns ingestion and archive summary information.
|
|||
**Example response**
|
||||
```json
|
||||
{
|
||||
"totalArticles": 10234,
|
||||
"countsBySource": {
|
||||
"alphavantage": 120,
|
||||
"edgar": 88,
|
||||
"finnhub": 400,
|
||||
"gdelt": 2100,
|
||||
"rss": 7526
|
||||
},
|
||||
"total": 10234,
|
||||
"usable": 8700,
|
||||
"lastIngestionBySource": {
|
||||
"rss": "2025-01-02T10:00:00.000Z",
|
||||
"gdelt": "2025-01-02T10:05:00.000Z"
|
||||
},
|
||||
"contentFetchCoverage": {
|
||||
"withContent": 9000,
|
||||
"withImage": 6500,
|
||||
"withEmbedding": 8700,
|
||||
"total": 10234,
|
||||
"contentRatio": 0.8794,
|
||||
"imageRatio": 0.6351,
|
||||
"embeddingRatio": 0.8501
|
||||
"bySource": {
|
||||
"alphavantage": { "total": 120, "usable": 98 },
|
||||
"edgar": { "total": 88, "usable": 70 },
|
||||
"finnhub": { "total": 400, "usable": 360 },
|
||||
"gdelt": { "total": 2100, "usable": 1800 },
|
||||
"rss": { "total": 7526, "usable": 6372 }
|
||||
}
|
||||
}
|
||||
```
|
||||
|
|
|
|||
|
|
@ -32,7 +32,11 @@ function buildArticlesQuery(query) {
|
|||
params.push(query.to);
|
||||
}
|
||||
|
||||
const whereClause = conditions.length > 0 ? `WHERE ${conditions.join(' AND ')}` : '';
|
||||
conditions.push('content IS NOT NULL AND content != \'\'');
|
||||
conditions.push('is_index_page = 0');
|
||||
conditions.push('EXISTS (SELECT 1 FROM article_embeddings WHERE article_id = articles.id)');
|
||||
|
||||
const whereClause = `WHERE ${conditions.join(' AND ')}`;
|
||||
const limit = Number.parseInt(query.limit, 10);
|
||||
const offset = Number.parseInt(query.offset, 10);
|
||||
|
||||
|
|
@ -71,6 +75,8 @@ function mapNeighborsToArticles(neighbors, excludeIndexPages, limit) {
|
|||
SELECT id, title, description, content, image, url, normalized_title, source, pub_date, ingested_at
|
||||
FROM articles
|
||||
WHERE id IN (${placeholders})
|
||||
AND content IS NOT NULL AND content != ''
|
||||
AND EXISTS (SELECT 1 FROM article_embeddings WHERE article_id = articles.id)
|
||||
${excludeIndexPages ? 'AND is_index_page = 0' : ''}
|
||||
`).all(...ids);
|
||||
const byId = new Map(articles.map((article) => [article.id, article]));
|
||||
|
|
@ -141,6 +147,9 @@ async function articleRoutes(fastify) {
|
|||
SELECT id, title, description, content, image, url, normalized_title, source, pub_date, ingested_at
|
||||
FROM articles
|
||||
WHERE id = ?
|
||||
AND content IS NOT NULL AND content != ''
|
||||
AND is_index_page = 0
|
||||
AND EXISTS (SELECT 1 FROM article_embeddings WHERE article_id = articles.id)
|
||||
`).get(request.params.id);
|
||||
|
||||
if (!article) {
|
||||
|
|
|
|||
|
|
@ -3,30 +3,38 @@ const { getLastIngestionBySource } = require('../state');
|
|||
|
||||
async function statusRoutes(fastify) {
|
||||
fastify.get('/status', async () => {
|
||||
const total = db.prepare('SELECT COUNT(*) AS count FROM articles').get().count;
|
||||
const bySourceRows = db.prepare('SELECT source, COUNT(*) AS count FROM articles GROUP BY source ORDER BY source').all();
|
||||
const contentCoverage = db.prepare(`
|
||||
const bySourceRows = db.prepare(`
|
||||
SELECT
|
||||
source,
|
||||
COUNT(*) AS total,
|
||||
SUM(CASE WHEN content IS NOT NULL AND TRIM(content) <> '' THEN 1 ELSE 0 END) AS with_content,
|
||||
SUM(CASE WHEN image IS NOT NULL AND TRIM(image) <> '' THEN 1 ELSE 0 END) AS with_image
|
||||
|
||||
SUM(CASE
|
||||
WHEN content IS NOT NULL AND content != ''
|
||||
AND is_index_page = 0
|
||||
AND EXISTS (SELECT 1 FROM article_embeddings WHERE article_id = articles.id)
|
||||
THEN 1 ELSE 0
|
||||
END) AS usable
|
||||
FROM articles
|
||||
`).get();
|
||||
const embeddingCoverage = db.prepare('SELECT COUNT(*) AS count FROM article_embeddings').get();
|
||||
GROUP BY source
|
||||
ORDER BY source
|
||||
`).all();
|
||||
|
||||
const totals = bySourceRows.reduce(
|
||||
(acc, row) => {
|
||||
acc.total += row.total;
|
||||
acc.usable += row.usable;
|
||||
return acc;
|
||||
},
|
||||
{ total: 0, usable: 0 }
|
||||
);
|
||||
|
||||
return {
|
||||
totalArticles: total,
|
||||
countsBySource: Object.fromEntries(bySourceRows.map((row) => [row.source, row.count])),
|
||||
total: totals.total,
|
||||
usable: totals.usable,
|
||||
lastIngestionBySource: getLastIngestionBySource(),
|
||||
contentFetchCoverage: {
|
||||
withContent: contentCoverage.with_content || 0,
|
||||
withImage: contentCoverage.with_image || 0,
|
||||
withEmbedding: embeddingCoverage.count || 0,
|
||||
total: contentCoverage.total || 0,
|
||||
contentRatio: contentCoverage.total ? Number((contentCoverage.with_content / contentCoverage.total).toFixed(4)) : 0,
|
||||
imageRatio: contentCoverage.total ? Number((contentCoverage.with_image / contentCoverage.total).toFixed(4)) : 0,
|
||||
embeddingRatio: contentCoverage.total ? Number((embeddingCoverage.count / contentCoverage.total).toFixed(4)) : 0,
|
||||
},
|
||||
bySource: Object.fromEntries(
|
||||
bySourceRows.map((row) => [row.source, { total: row.total, usable: row.usable }])
|
||||
),
|
||||
};
|
||||
});
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue