refine article filtering to ensure only usable articles are returned
This commit is contained in:
parent
4883632e37
commit
9df24d44c9
3 changed files with 50 additions and 46 deletions
39
README.md
39
README.md
|
|
@ -50,7 +50,7 @@ Use this to confirm the server is running, not to inspect ingestion state.
|
||||||
|
|
||||||
### `GET /articles`
|
### `GET /articles`
|
||||||
|
|
||||||
Returns articles from the `articles` table. Behavior changes based on the query params you send.
|
Returns articles from the `articles` table. Only articles that are considered **usable** are exposed — meaning they have non-empty `content`, a stored embedding, and are not index/category pages. Behavior changes based on the query params you send.
|
||||||
|
|
||||||
#### Query params
|
#### Query params
|
||||||
|
|
||||||
|
|
@ -237,6 +237,7 @@ Returns one article by numeric ID.
|
||||||
**Behavior**
|
**Behavior**
|
||||||
|
|
||||||
- Looks up the article directly in SQLite.
|
- Looks up the article directly in SQLite.
|
||||||
|
- Same usability filter as the list endpoint — returns `404` if the article exists but is not usable.
|
||||||
- Returns the same article fields as normal `/articles` list mode.
|
- Returns the same article fields as normal `/articles` list mode.
|
||||||
- Does not return embedding data.
|
- Does not return embedding data.
|
||||||
- Returns `404` if the ID does not exist.
|
- Returns `404` if the ID does not exist.
|
||||||
|
|
@ -257,16 +258,10 @@ Returns ingestion and archive summary information.
|
||||||
|
|
||||||
**Response fields**
|
**Response fields**
|
||||||
|
|
||||||
- `totalArticles`: total number of rows in `articles`
|
- `total`: total number of rows in `articles` across all sources
|
||||||
- `countsBySource`: article counts grouped by source name
|
- `usable`: number of articles that have non-empty `content`, a stored embedding, and are not index/category pages
|
||||||
- `lastIngestionBySource`: in-memory timestamps of the last successful batch run per source
|
- `lastIngestionBySource`: in-memory timestamps of the last successful batch run per source
|
||||||
- `contentFetchCoverage.total`: total article count used for coverage math
|
- `bySource`: per-source breakdown keyed by source name; each entry has `total` and `usable` counts
|
||||||
- `contentFetchCoverage.withContent`: rows whose `content` is present and non-empty
|
|
||||||
- `contentFetchCoverage.withImage`: rows whose `image` is present and non-empty
|
|
||||||
- `contentFetchCoverage.withEmbedding`: rows that have an embedding in `article_embeddings`
|
|
||||||
- `contentFetchCoverage.contentRatio`: `withContent / total`
|
|
||||||
- `contentFetchCoverage.imageRatio`: `withImage / total`
|
|
||||||
- `contentFetchCoverage.embeddingRatio`: `withEmbedding / total`
|
|
||||||
|
|
||||||
**Important detail**
|
**Important detail**
|
||||||
|
|
||||||
|
|
@ -275,26 +270,18 @@ Returns ingestion and archive summary information.
|
||||||
**Example response**
|
**Example response**
|
||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
"totalArticles": 10234,
|
"total": 10234,
|
||||||
"countsBySource": {
|
"usable": 8700,
|
||||||
"alphavantage": 120,
|
|
||||||
"edgar": 88,
|
|
||||||
"finnhub": 400,
|
|
||||||
"gdelt": 2100,
|
|
||||||
"rss": 7526
|
|
||||||
},
|
|
||||||
"lastIngestionBySource": {
|
"lastIngestionBySource": {
|
||||||
"rss": "2025-01-02T10:00:00.000Z",
|
"rss": "2025-01-02T10:00:00.000Z",
|
||||||
"gdelt": "2025-01-02T10:05:00.000Z"
|
"gdelt": "2025-01-02T10:05:00.000Z"
|
||||||
},
|
},
|
||||||
"contentFetchCoverage": {
|
"bySource": {
|
||||||
"withContent": 9000,
|
"alphavantage": { "total": 120, "usable": 98 },
|
||||||
"withImage": 6500,
|
"edgar": { "total": 88, "usable": 70 },
|
||||||
"withEmbedding": 8700,
|
"finnhub": { "total": 400, "usable": 360 },
|
||||||
"total": 10234,
|
"gdelt": { "total": 2100, "usable": 1800 },
|
||||||
"contentRatio": 0.8794,
|
"rss": { "total": 7526, "usable": 6372 }
|
||||||
"imageRatio": 0.6351,
|
|
||||||
"embeddingRatio": 0.8501
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
|
||||||
|
|
@ -32,7 +32,11 @@ function buildArticlesQuery(query) {
|
||||||
params.push(query.to);
|
params.push(query.to);
|
||||||
}
|
}
|
||||||
|
|
||||||
const whereClause = conditions.length > 0 ? `WHERE ${conditions.join(' AND ')}` : '';
|
conditions.push('content IS NOT NULL AND content != \'\'');
|
||||||
|
conditions.push('is_index_page = 0');
|
||||||
|
conditions.push('EXISTS (SELECT 1 FROM article_embeddings WHERE article_id = articles.id)');
|
||||||
|
|
||||||
|
const whereClause = `WHERE ${conditions.join(' AND ')}`;
|
||||||
const limit = Number.parseInt(query.limit, 10);
|
const limit = Number.parseInt(query.limit, 10);
|
||||||
const offset = Number.parseInt(query.offset, 10);
|
const offset = Number.parseInt(query.offset, 10);
|
||||||
|
|
||||||
|
|
@ -71,7 +75,9 @@ function mapNeighborsToArticles(neighbors, excludeIndexPages, limit) {
|
||||||
SELECT id, title, description, content, image, url, normalized_title, source, pub_date, ingested_at
|
SELECT id, title, description, content, image, url, normalized_title, source, pub_date, ingested_at
|
||||||
FROM articles
|
FROM articles
|
||||||
WHERE id IN (${placeholders})
|
WHERE id IN (${placeholders})
|
||||||
${excludeIndexPages ? 'AND is_index_page = 0' : ''}
|
AND content IS NOT NULL AND content != ''
|
||||||
|
AND EXISTS (SELECT 1 FROM article_embeddings WHERE article_id = articles.id)
|
||||||
|
${excludeIndexPages ? 'AND is_index_page = 0' : ''}
|
||||||
`).all(...ids);
|
`).all(...ids);
|
||||||
const byId = new Map(articles.map((article) => [article.id, article]));
|
const byId = new Map(articles.map((article) => [article.id, article]));
|
||||||
|
|
||||||
|
|
@ -141,6 +147,9 @@ async function articleRoutes(fastify) {
|
||||||
SELECT id, title, description, content, image, url, normalized_title, source, pub_date, ingested_at
|
SELECT id, title, description, content, image, url, normalized_title, source, pub_date, ingested_at
|
||||||
FROM articles
|
FROM articles
|
||||||
WHERE id = ?
|
WHERE id = ?
|
||||||
|
AND content IS NOT NULL AND content != ''
|
||||||
|
AND is_index_page = 0
|
||||||
|
AND EXISTS (SELECT 1 FROM article_embeddings WHERE article_id = articles.id)
|
||||||
`).get(request.params.id);
|
`).get(request.params.id);
|
||||||
|
|
||||||
if (!article) {
|
if (!article) {
|
||||||
|
|
|
||||||
|
|
@ -3,30 +3,38 @@ const { getLastIngestionBySource } = require('../state');
|
||||||
|
|
||||||
async function statusRoutes(fastify) {
|
async function statusRoutes(fastify) {
|
||||||
fastify.get('/status', async () => {
|
fastify.get('/status', async () => {
|
||||||
const total = db.prepare('SELECT COUNT(*) AS count FROM articles').get().count;
|
const bySourceRows = db.prepare(`
|
||||||
const bySourceRows = db.prepare('SELECT source, COUNT(*) AS count FROM articles GROUP BY source ORDER BY source').all();
|
|
||||||
const contentCoverage = db.prepare(`
|
|
||||||
SELECT
|
SELECT
|
||||||
|
source,
|
||||||
COUNT(*) AS total,
|
COUNT(*) AS total,
|
||||||
SUM(CASE WHEN content IS NOT NULL AND TRIM(content) <> '' THEN 1 ELSE 0 END) AS with_content,
|
|
||||||
SUM(CASE WHEN image IS NOT NULL AND TRIM(image) <> '' THEN 1 ELSE 0 END) AS with_image
|
SUM(CASE
|
||||||
|
WHEN content IS NOT NULL AND content != ''
|
||||||
|
AND is_index_page = 0
|
||||||
|
AND EXISTS (SELECT 1 FROM article_embeddings WHERE article_id = articles.id)
|
||||||
|
THEN 1 ELSE 0
|
||||||
|
END) AS usable
|
||||||
FROM articles
|
FROM articles
|
||||||
`).get();
|
GROUP BY source
|
||||||
const embeddingCoverage = db.prepare('SELECT COUNT(*) AS count FROM article_embeddings').get();
|
ORDER BY source
|
||||||
|
`).all();
|
||||||
|
|
||||||
|
const totals = bySourceRows.reduce(
|
||||||
|
(acc, row) => {
|
||||||
|
acc.total += row.total;
|
||||||
|
acc.usable += row.usable;
|
||||||
|
return acc;
|
||||||
|
},
|
||||||
|
{ total: 0, usable: 0 }
|
||||||
|
);
|
||||||
|
|
||||||
return {
|
return {
|
||||||
totalArticles: total,
|
total: totals.total,
|
||||||
countsBySource: Object.fromEntries(bySourceRows.map((row) => [row.source, row.count])),
|
usable: totals.usable,
|
||||||
lastIngestionBySource: getLastIngestionBySource(),
|
lastIngestionBySource: getLastIngestionBySource(),
|
||||||
contentFetchCoverage: {
|
bySource: Object.fromEntries(
|
||||||
withContent: contentCoverage.with_content || 0,
|
bySourceRows.map((row) => [row.source, { total: row.total, usable: row.usable }])
|
||||||
withImage: contentCoverage.with_image || 0,
|
),
|
||||||
withEmbedding: embeddingCoverage.count || 0,
|
|
||||||
total: contentCoverage.total || 0,
|
|
||||||
contentRatio: contentCoverage.total ? Number((contentCoverage.with_content / contentCoverage.total).toFixed(4)) : 0,
|
|
||||||
imageRatio: contentCoverage.total ? Number((contentCoverage.with_image / contentCoverage.total).toFixed(4)) : 0,
|
|
||||||
embeddingRatio: contentCoverage.total ? Number((embeddingCoverage.count / contentCoverage.total).toFixed(4)) : 0,
|
|
||||||
},
|
|
||||||
};
|
};
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue