// Patch summary: unified git diff touching eight files. Line breaks inside the
// hunks appear to have been collapsed in this copy, so hunk bodies run together.
//
// server.js                - requires ./src/routes/sources and registers it on the Fastify app.
// src/contentValidation.js - appends "today's news us politics world weather" to
//                            BODY_PREFIX_BLOCKLIST (the inline comment attributes it to the
//                            Yahoo Finance static nav shell).
// src/db.js                - adds articles.has_embedding (INTEGER NOT NULL DEFAULT 0) and
//                            articles.pub_date_effective (TEXT); backfills has_embedding from
//                            article_embedding_meta and pub_date_effective from
//                            COALESCE(pub_date, ingested_at); creates one index per new column;
//                            nulls the content/status/retry fields of rows whose content
//                            matches LIKE 'Today''s news US Politics World Weather%'.
// src/embeddings.js        - adds the prepared statement markArticleHasEmbedding and runs it
//                            right after upsertEmbeddingMeta in generateAndStoreEmbedding and
//                            commitEmbeddingBatch.
// src/ingest.js            - insertArticle gains a pub_date_effective column, bound to
//                            pubDate || ingestedAt.
// src/routes/articles.js   - replaces the EXISTS subquery on article_embeddings with
//                            has_embedding = 1 (three queries) and orders by
//                            pub_date_effective instead of COALESCE(pub_date, ingested_at).
// src/routes/sources.js    - new file: GET /sources returns, per catalog entry, summed counts
//                            (total, ready, skipped, failed, pending, untried, usable), a
//                            per-feed breakdown, and the domain_fetch_policy row for each
//                            website domain (policy 'auto' when no row exists).
// src/routes/status.js     - caches the /status response in module-level variables for
//                            30 seconds and replaces the EXISTS subquery on
//                            article_embedding_meta with has_embedding = 1.
//
// NOTE(review): has_embedding is backfilled from article_embedding_meta, while the article
// routes previously checked article_embeddings - confirm the two tables always agree.
// NOTE(review): the Yahoo content reset leaves has_embedding untouched - confirm those rows
// get their embeddings regenerated after the content is re-fetched.
diff --git a/server.js b/server.js index 87330bb..be60406 100644 --- a/server.js +++ b/server.js @@ -2,6 +2,7 @@ const Fastify = require('fastify'); const cors = require('@fastify/cors'); const articleRoutes = require('./src/routes/articles'); const statusRoutes = require('./src/routes/status'); +const sourcesRoutes = require('./src/routes/sources'); const config = require('./src/config'); const { startScheduler } = require('./src/scheduler'); @@ -10,6 +11,7 @@ const app = Fastify({ logger: true }); app.register(cors, { origin: true }); app.register(articleRoutes); app.register(statusRoutes); +app.register(sourcesRoutes); app.get('/', async () => ({ ok: true })); diff --git a/src/contentValidation.js b/src/contentValidation.js index 31bd9a6..d69fddd 100644 --- a/src/contentValidation.js +++ b/src/contentValidation.js @@ -68,6 +68,10 @@ const BODY_PREFIX_BLOCKLIST = [ "checking your browser before", "this site requires javascript", "please make sure your browser supports", + + // yahoo finance serves its global nav when the article body is js-rendered + // and the plain fetch only gets the static shell + "today's news us politics world weather", ]; diff --git a/src/db.js b/src/db.js index 9dc208b..8518521 100644 --- a/src/db.js +++ b/src/db.js @@ -286,7 +286,9 @@ for (const statement of [ 'ALTER TABLE articles ADD COLUMN content_attempted_at TEXT', 'ALTER TABLE articles ADD COLUMN content_attempt_count INTEGER NOT NULL DEFAULT 0', 'ALTER TABLE articles ADD COLUMN content_retry_after TEXT', - 'ALTER TABLE articles ADD COLUMN is_index_page INTEGER NOT NULL DEFAULT 0' + 'ALTER TABLE articles ADD COLUMN is_index_page INTEGER NOT NULL DEFAULT 0', + 'ALTER TABLE articles ADD COLUMN has_embedding INTEGER NOT NULL DEFAULT 0', + 'ALTER TABLE articles ADD COLUMN pub_date_effective TEXT' ]) { try { db.exec(statement); @@ -297,6 +299,24 @@ for (const statement of [ } } +// backfill has_embedding for existing rows — safe to re-run, only touches rows that need it +db.exec(` + 
UPDATE articles SET has_embedding = 1 + WHERE has_embedding = 0 + AND EXISTS (SELECT 1 FROM article_embedding_meta WHERE article_id = articles.id) +`); + +// backfill pub_date_effective for existing rows +db.exec(` + UPDATE articles SET pub_date_effective = COALESCE(pub_date, ingested_at) + WHERE pub_date_effective IS NULL +`); + +db.exec(` + CREATE INDEX IF NOT EXISTS idx_articles_has_embedding ON articles(has_embedding); + CREATE INDEX IF NOT EXISTS idx_articles_pub_date_effective ON articles(pub_date_effective DESC); +`); + db.exec(` UPDATE articles SET is_index_page = 1 @@ -321,4 +341,13 @@ db.exec(` ) `); +// reset articles that grabbed yahoo finance's nav shell instead of article body +db.exec(` + UPDATE articles + SET content = NULL, content_status = NULL, content_error = NULL, + content_attempted_at = NULL, content_attempt_count = 0, + content_retry_after = NULL + WHERE content LIKE 'Today''s news US Politics World Weather%' +`); + module.exports = db; diff --git a/src/embeddings.js b/src/embeddings.js index d821378..93cd535 100644 --- a/src/embeddings.js +++ b/src/embeddings.js @@ -35,6 +35,10 @@ const upsertEmbeddingMeta = db.prepare(` embedded_at = excluded.embedded_at `); +const markArticleHasEmbedding = db.prepare(` + UPDATE articles SET has_embedding = 1 WHERE id = ? 
AND has_embedding = 0 +`); + const upsertEmbeddingStore = db.prepare(` INSERT INTO article_embedding_store (article_id, model, embedding, embedded_at) VALUES (?, ?, ?, datetime('now')) @@ -364,6 +368,7 @@ async function generateAndStoreEmbedding(id) { deleteEmbedding.run(BigInt(id)); insertEmbedding.run(BigInt(id), padEmbeddingForVec0(embedding)); upsertEmbeddingMeta.run(id, EMBEDDING_MODEL); + markArticleHasEmbedding.run(id); return { stored: true, shouldPauseBatch: false }; } catch (error) { @@ -387,6 +392,7 @@ function commitEmbeddingBatch(rows) { deleteEmbedding.run(BigInt(entry.id)); insertEmbedding.run(BigInt(entry.id), padEmbeddingForVec0(entry.embedding)); upsertEmbeddingMeta.run(entry.id, EMBEDDING_MODEL); + markArticleHasEmbedding.run(entry.id); } }); diff --git a/src/ingest.js b/src/ingest.js index 62e670e..e0b876f 100644 --- a/src/ingest.js +++ b/src/ingest.js @@ -12,8 +12,9 @@ const insertArticle = db.prepare(` normalized_title, source, pub_date, - ingested_at - ) VALUES (?, ?, NULL, ?, ?, ?, ?, ?, ?) + ingested_at, + pub_date_effective + ) VALUES (?, ?, NULL, ?, ?, ?, ?, ?, ?, ?) 
`); const findByUrl = db.prepare('SELECT id FROM articles WHERE url = ?'); const INDEX_PAGE_URL_HINT = /\/(category|categories|tag|tags|topic|topics|section|sections|archive|archives|authors|search)(?:\/|$)/i; @@ -90,7 +91,8 @@ function ingestArticle(article) { normalizedTitle, source, pubDate, - ingestedAt + ingestedAt, + pubDate || ingestedAt ); // dont kick off the content fetch here — it used to be fire-and-forget which diff --git a/src/routes/articles.js b/src/routes/articles.js index 859ff43..8fc1543 100644 --- a/src/routes/articles.js +++ b/src/routes/articles.js @@ -34,7 +34,7 @@ function buildArticlesQuery(query) { conditions.push('content IS NOT NULL AND content != \'\''); conditions.push('is_index_page = 0'); - conditions.push('EXISTS (SELECT 1 FROM article_embeddings WHERE article_id = articles.id)'); + conditions.push('has_embedding = 1'); const whereClause = `WHERE ${conditions.join(' AND ')}`; const limit = Number.parseInt(query.limit, 10); @@ -48,7 +48,7 @@ function buildArticlesQuery(query) { SELECT id, title, description, content, ${includeEmbedding ? 'embedding,' : ''} url, normalized_title, source, pub_date, ingested_at FROM articles ${whereClause} - ORDER BY COALESCE(pub_date, ingested_at) DESC, id DESC + ORDER BY pub_date_effective DESC, id DESC LIMIT ? OFFSET ? `, params, @@ -76,7 +76,7 @@ function mapNeighborsToArticles(neighbors, excludeIndexPages, limit) { FROM articles WHERE id IN (${placeholders}) AND content IS NOT NULL AND content != '' - AND EXISTS (SELECT 1 FROM article_embeddings WHERE article_id = articles.id) + AND has_embedding = 1 ${excludeIndexPages ? 'AND is_index_page = 0' : ''} `).all(...ids); const byId = new Map(articles.map((article) => [article.id, article])); @@ -149,7 +149,7 @@ async function articleRoutes(fastify) { WHERE id = ? 
AND content IS NOT NULL AND content != '' AND is_index_page = 0 - AND EXISTS (SELECT 1 FROM article_embeddings WHERE article_id = articles.id) + AND has_embedding = 1 `).get(request.params.id); if (!article) { diff --git a/src/routes/sources.js b/src/routes/sources.js new file mode 100644 index 0000000..fe06f6e --- /dev/null +++ b/src/routes/sources.js @@ -0,0 +1,110 @@ +const db = require('../db'); +const { getSourceCatalog } = require('../sources/sourceCatalog'); + + +async function sourcesRoutes(fastify) { + fastify.get('/sources', async () => { + const catalog = getSourceCatalog(); + + + // bucket every article by source string and content_status so we can + // answer "why is this source unusable" without N queries + const statusRows = db.prepare(` + SELECT + source, + COUNT(*) AS total, + SUM(CASE WHEN content_status = 'ready' THEN 1 ELSE 0 END) AS ready, + SUM(CASE WHEN content_status = 'skipped' THEN 1 ELSE 0 END) AS skipped, + SUM(CASE WHEN content_status = 'failed' THEN 1 ELSE 0 END) AS failed, + SUM(CASE WHEN content_status = 'pending' THEN 1 ELSE 0 END) AS pending, + SUM(CASE WHEN content_status IS NULL THEN 1 ELSE 0 END) AS untried, + SUM(CASE + WHEN content IS NOT NULL AND content != '' + AND is_index_page = 0 + AND has_embedding = 1 + THEN 1 ELSE 0 + END) AS usable + FROM articles + GROUP BY source + `).all(); + + const statusByLabel = new Map(); + for (const row of statusRows) { + // source can be "rss:Al Jazeera", "gdelt:Al Jazeera", or just "alphavantage" + const idx = row.source.indexOf(':'); + const label = idx >= 0 ? row.source.slice(idx + 1) : row.source; + const feed = idx >= 0 ? 
row.source.slice(0, idx) : row.source; + + if (!statusByLabel.has(label)) statusByLabel.set(label, []); + statusByLabel.get(label).push({ feed, ...row }); + } + + + const policyRows = db.prepare(` + SELECT domain, policy, consecutive_plain_failures, consecutive_browser_failures, + plain_success_count, browser_success_count, expires_at, updated_at + FROM domain_fetch_policy + `).all(); + + const policyByDomain = new Map(policyRows.map((r) => [r.domain, r])); + + + return catalog.map((s) => { + const buckets = statusByLabel.get(s.label) || []; + + const counts = buckets.reduce( + (acc, b) => { + acc.total += b.total; + acc.ready += b.ready; + acc.skipped += b.skipped; + acc.failed += b.failed; + acc.pending += b.pending; + acc.untried += b.untried; + acc.usable += b.usable; + return acc; + }, + { total: 0, ready: 0, skipped: 0, failed: 0, pending: 0, untried: 0, usable: 0 } + ); + + const domains = s.website.map((d) => { + const row = policyByDomain.get(d); + if (!row) { + return { domain: d, policy: 'auto' }; + } + + return { + domain: d, + policy: row.policy, + plainFailures: row.consecutive_plain_failures, + browserFailures: row.consecutive_browser_failures, + plainSuccesses: row.plain_success_count, + browserSuccesses: row.browser_success_count, + expiresAt: row.expires_at, + updatedAt: row.updated_at, + }; + }); + + return { + id: s.id, + label: s.label, + websites: s.website, + backfill: s.backfill, + feeds: s.feedUrls, + counts, + byFeed: buckets.map((b) => ({ + feed: b.feed, + total: b.total, + ready: b.ready, + skipped: b.skipped, + failed: b.failed, + pending: b.pending, + untried: b.untried, + usable: b.usable, + })), + domains, + }; + }); + }); +} + +module.exports = sourcesRoutes; diff --git a/src/routes/status.js b/src/routes/status.js index 4be3aaa..9509575 100644 --- a/src/routes/status.js +++ b/src/routes/status.js @@ -1,8 +1,17 @@ const db = require('../db'); const { getLastIngestionBySource } = require('../state'); +let statusCache = null; +let 
statusCacheAt = 0; +const STATUS_CACHE_TTL_MS = 30 * 1000; + async function statusRoutes(fastify) { fastify.get('/status', async () => { + const now = Date.now(); + if (statusCache && now - statusCacheAt < STATUS_CACHE_TTL_MS) { + return statusCache; + } + const bySourceRows = db.prepare(` SELECT source, @@ -11,7 +20,7 @@ async function statusRoutes(fastify) { SUM(CASE WHEN content IS NOT NULL AND content != '' AND is_index_page = 0 - AND EXISTS (SELECT 1 FROM article_embedding_meta WHERE article_id = articles.id) + AND has_embedding = 1 THEN 1 ELSE 0 END) AS usable FROM articles @@ -38,7 +47,7 @@ async function statusRoutes(fastify) { ORDER BY article_count DESC `).all(); - return { + const result = { total: totals.total, usable: totals.usable, lastIngestionBySource: getLastIngestionBySource(), @@ -51,6 +60,10 @@ async function statusRoutes(fastify) { dimensions: row.sample_bytes ? row.sample_bytes / 4 : null, })), }; + + statusCache = result; + statusCacheAt = now; + return result; }); }