refactor content fetching and embedding processes for improved concurrency and error handling
This commit is contained in:
parent
6bf3a9282f
commit
37d9dfb083
8 changed files with 176 additions and 10 deletions
|
|
@ -2,6 +2,7 @@ const Fastify = require('fastify');
|
|||
const cors = require('@fastify/cors');
|
||||
const articleRoutes = require('./src/routes/articles');
|
||||
const statusRoutes = require('./src/routes/status');
|
||||
const sourcesRoutes = require('./src/routes/sources');
|
||||
const config = require('./src/config');
|
||||
const { startScheduler } = require('./src/scheduler');
|
||||
|
||||
|
|
@ -10,6 +11,7 @@ const app = Fastify({ logger: true });
|
|||
app.register(cors, { origin: true });
|
||||
app.register(articleRoutes);
|
||||
app.register(statusRoutes);
|
||||
app.register(sourcesRoutes);
|
||||
|
||||
app.get('/', async () => ({ ok: true }));
|
||||
|
||||
|
|
|
|||
|
|
@ -68,6 +68,10 @@ const BODY_PREFIX_BLOCKLIST = [
|
|||
"checking your browser before",
|
||||
"this site requires javascript",
|
||||
"please make sure your browser supports",
|
||||
|
||||
// yahoo finance serves its global nav when the article body is js-rendered
|
||||
// and the plain fetch only gets the static shell
|
||||
"today's news us politics world weather",
|
||||
];
|
||||
|
||||
|
||||
|
|
|
|||
31
src/db.js
31
src/db.js
|
|
@ -286,7 +286,9 @@ for (const statement of [
|
|||
'ALTER TABLE articles ADD COLUMN content_attempted_at TEXT',
|
||||
'ALTER TABLE articles ADD COLUMN content_attempt_count INTEGER NOT NULL DEFAULT 0',
|
||||
'ALTER TABLE articles ADD COLUMN content_retry_after TEXT',
|
||||
'ALTER TABLE articles ADD COLUMN is_index_page INTEGER NOT NULL DEFAULT 0'
|
||||
'ALTER TABLE articles ADD COLUMN is_index_page INTEGER NOT NULL DEFAULT 0',
|
||||
'ALTER TABLE articles ADD COLUMN has_embedding INTEGER NOT NULL DEFAULT 0',
|
||||
'ALTER TABLE articles ADD COLUMN pub_date_effective TEXT'
|
||||
]) {
|
||||
try {
|
||||
db.exec(statement);
|
||||
|
|
@ -297,6 +299,24 @@ for (const statement of [
|
|||
}
|
||||
}
|
||||
|
||||
// backfill has_embedding for existing rows — safe to re-run, only touches rows that need it
|
||||
db.exec(`
|
||||
UPDATE articles SET has_embedding = 1
|
||||
WHERE has_embedding = 0
|
||||
AND EXISTS (SELECT 1 FROM article_embedding_meta WHERE article_id = articles.id)
|
||||
`);
|
||||
|
||||
// backfill pub_date_effective for existing rows
|
||||
db.exec(`
|
||||
UPDATE articles SET pub_date_effective = COALESCE(pub_date, ingested_at)
|
||||
WHERE pub_date_effective IS NULL
|
||||
`);
|
||||
|
||||
db.exec(`
|
||||
CREATE INDEX IF NOT EXISTS idx_articles_has_embedding ON articles(has_embedding);
|
||||
CREATE INDEX IF NOT EXISTS idx_articles_pub_date_effective ON articles(pub_date_effective DESC);
|
||||
`);
|
||||
|
||||
db.exec(`
|
||||
UPDATE articles
|
||||
SET is_index_page = 1
|
||||
|
|
@ -321,4 +341,13 @@ db.exec(`
|
|||
)
|
||||
`);
|
||||
|
||||
// reset articles that grabbed yahoo finance's nav shell instead of article body
|
||||
db.exec(`
|
||||
UPDATE articles
|
||||
SET content = NULL, content_status = NULL, content_error = NULL,
|
||||
content_attempted_at = NULL, content_attempt_count = 0,
|
||||
content_retry_after = NULL
|
||||
WHERE content LIKE 'Today''s news US Politics World Weather%'
|
||||
`);
|
||||
|
||||
module.exports = db;
|
||||
|
|
|
|||
|
|
@ -35,6 +35,10 @@ const upsertEmbeddingMeta = db.prepare(`
|
|||
embedded_at = excluded.embedded_at
|
||||
`);
|
||||
|
||||
const markArticleHasEmbedding = db.prepare(`
|
||||
UPDATE articles SET has_embedding = 1 WHERE id = ? AND has_embedding = 0
|
||||
`);
|
||||
|
||||
const upsertEmbeddingStore = db.prepare(`
|
||||
INSERT INTO article_embedding_store (article_id, model, embedding, embedded_at)
|
||||
VALUES (?, ?, ?, datetime('now'))
|
||||
|
|
@ -364,6 +368,7 @@ async function generateAndStoreEmbedding(id) {
|
|||
deleteEmbedding.run(BigInt(id));
|
||||
insertEmbedding.run(BigInt(id), padEmbeddingForVec0(embedding));
|
||||
upsertEmbeddingMeta.run(id, EMBEDDING_MODEL);
|
||||
markArticleHasEmbedding.run(id);
|
||||
|
||||
return { stored: true, shouldPauseBatch: false };
|
||||
} catch (error) {
|
||||
|
|
@ -387,6 +392,7 @@ function commitEmbeddingBatch(rows) {
|
|||
deleteEmbedding.run(BigInt(entry.id));
|
||||
insertEmbedding.run(BigInt(entry.id), padEmbeddingForVec0(entry.embedding));
|
||||
upsertEmbeddingMeta.run(entry.id, EMBEDDING_MODEL);
|
||||
markArticleHasEmbedding.run(entry.id);
|
||||
}
|
||||
});
|
||||
|
||||
|
|
|
|||
|
|
@ -12,8 +12,9 @@ const insertArticle = db.prepare(`
|
|||
normalized_title,
|
||||
source,
|
||||
pub_date,
|
||||
ingested_at
|
||||
) VALUES (?, ?, NULL, ?, ?, ?, ?, ?, ?)
|
||||
ingested_at,
|
||||
pub_date_effective
|
||||
) VALUES (?, ?, NULL, ?, ?, ?, ?, ?, ?, ?)
|
||||
`);
|
||||
const findByUrl = db.prepare('SELECT id FROM articles WHERE url = ?');
|
||||
const INDEX_PAGE_URL_HINT = /\/(category|categories|tag|tags|topic|topics|section|sections|archive|archives|authors|search)(?:\/|$)/i;
|
||||
|
|
@ -90,7 +91,8 @@ function ingestArticle(article) {
|
|||
normalizedTitle,
|
||||
source,
|
||||
pubDate,
|
||||
ingestedAt
|
||||
ingestedAt,
|
||||
pubDate || ingestedAt
|
||||
);
|
||||
|
||||
// dont kick off the content fetch here — it used to be fire-and-forget which
|
||||
|
|
|
|||
|
|
@ -34,7 +34,7 @@ function buildArticlesQuery(query) {
|
|||
|
||||
conditions.push('content IS NOT NULL AND content != \'\'');
|
||||
conditions.push('is_index_page = 0');
|
||||
conditions.push('EXISTS (SELECT 1 FROM article_embeddings WHERE article_id = articles.id)');
|
||||
conditions.push('has_embedding = 1');
|
||||
|
||||
const whereClause = `WHERE ${conditions.join(' AND ')}`;
|
||||
const limit = Number.parseInt(query.limit, 10);
|
||||
|
|
@ -48,7 +48,7 @@ function buildArticlesQuery(query) {
|
|||
SELECT id, title, description, content, ${includeEmbedding ? 'embedding,' : ''} url, normalized_title, source, pub_date, ingested_at
|
||||
FROM articles
|
||||
${whereClause}
|
||||
ORDER BY COALESCE(pub_date, ingested_at) DESC, id DESC
|
||||
ORDER BY pub_date_effective DESC, id DESC
|
||||
LIMIT ? OFFSET ?
|
||||
`,
|
||||
params,
|
||||
|
|
@ -76,7 +76,7 @@ function mapNeighborsToArticles(neighbors, excludeIndexPages, limit) {
|
|||
FROM articles
|
||||
WHERE id IN (${placeholders})
|
||||
AND content IS NOT NULL AND content != ''
|
||||
AND EXISTS (SELECT 1 FROM article_embeddings WHERE article_id = articles.id)
|
||||
AND has_embedding = 1
|
||||
${excludeIndexPages ? 'AND is_index_page = 0' : ''}
|
||||
`).all(...ids);
|
||||
const byId = new Map(articles.map((article) => [article.id, article]));
|
||||
|
|
@ -149,7 +149,7 @@ async function articleRoutes(fastify) {
|
|||
WHERE id = ?
|
||||
AND content IS NOT NULL AND content != ''
|
||||
AND is_index_page = 0
|
||||
AND EXISTS (SELECT 1 FROM article_embeddings WHERE article_id = articles.id)
|
||||
AND has_embedding = 1
|
||||
`).get(request.params.id);
|
||||
|
||||
if (!article) {
|
||||
|
|
|
|||
110
src/routes/sources.js
Normal file
110
src/routes/sources.js
Normal file
|
|
@ -0,0 +1,110 @@
|
|||
const db = require('../db');
|
||||
const { getSourceCatalog } = require('../sources/sourceCatalog');
|
||||
|
||||
|
||||
async function sourcesRoutes(fastify) {
|
||||
fastify.get('/sources', async () => {
|
||||
const catalog = getSourceCatalog();
|
||||
|
||||
|
||||
// bucket every article by source string and content_status so we can
|
||||
// answer "why is this source unusable" without N queries
|
||||
const statusRows = db.prepare(`
|
||||
SELECT
|
||||
source,
|
||||
COUNT(*) AS total,
|
||||
SUM(CASE WHEN content_status = 'ready' THEN 1 ELSE 0 END) AS ready,
|
||||
SUM(CASE WHEN content_status = 'skipped' THEN 1 ELSE 0 END) AS skipped,
|
||||
SUM(CASE WHEN content_status = 'failed' THEN 1 ELSE 0 END) AS failed,
|
||||
SUM(CASE WHEN content_status = 'pending' THEN 1 ELSE 0 END) AS pending,
|
||||
SUM(CASE WHEN content_status IS NULL THEN 1 ELSE 0 END) AS untried,
|
||||
SUM(CASE
|
||||
WHEN content IS NOT NULL AND content != ''
|
||||
AND is_index_page = 0
|
||||
AND has_embedding = 1
|
||||
THEN 1 ELSE 0
|
||||
END) AS usable
|
||||
FROM articles
|
||||
GROUP BY source
|
||||
`).all();
|
||||
|
||||
const statusByLabel = new Map();
|
||||
for (const row of statusRows) {
|
||||
// source can be "rss:Al Jazeera", "gdelt:Al Jazeera", or just "alphavantage"
|
||||
const idx = row.source.indexOf(':');
|
||||
const label = idx >= 0 ? row.source.slice(idx + 1) : row.source;
|
||||
const feed = idx >= 0 ? row.source.slice(0, idx) : row.source;
|
||||
|
||||
if (!statusByLabel.has(label)) statusByLabel.set(label, []);
|
||||
statusByLabel.get(label).push({ feed, ...row });
|
||||
}
|
||||
|
||||
|
||||
const policyRows = db.prepare(`
|
||||
SELECT domain, policy, consecutive_plain_failures, consecutive_browser_failures,
|
||||
plain_success_count, browser_success_count, expires_at, updated_at
|
||||
FROM domain_fetch_policy
|
||||
`).all();
|
||||
|
||||
const policyByDomain = new Map(policyRows.map((r) => [r.domain, r]));
|
||||
|
||||
|
||||
return catalog.map((s) => {
|
||||
const buckets = statusByLabel.get(s.label) || [];
|
||||
|
||||
const counts = buckets.reduce(
|
||||
(acc, b) => {
|
||||
acc.total += b.total;
|
||||
acc.ready += b.ready;
|
||||
acc.skipped += b.skipped;
|
||||
acc.failed += b.failed;
|
||||
acc.pending += b.pending;
|
||||
acc.untried += b.untried;
|
||||
acc.usable += b.usable;
|
||||
return acc;
|
||||
},
|
||||
{ total: 0, ready: 0, skipped: 0, failed: 0, pending: 0, untried: 0, usable: 0 }
|
||||
);
|
||||
|
||||
const domains = s.website.map((d) => {
|
||||
const row = policyByDomain.get(d);
|
||||
if (!row) {
|
||||
return { domain: d, policy: 'auto' };
|
||||
}
|
||||
|
||||
return {
|
||||
domain: d,
|
||||
policy: row.policy,
|
||||
plainFailures: row.consecutive_plain_failures,
|
||||
browserFailures: row.consecutive_browser_failures,
|
||||
plainSuccesses: row.plain_success_count,
|
||||
browserSuccesses: row.browser_success_count,
|
||||
expiresAt: row.expires_at,
|
||||
updatedAt: row.updated_at,
|
||||
};
|
||||
});
|
||||
|
||||
return {
|
||||
id: s.id,
|
||||
label: s.label,
|
||||
websites: s.website,
|
||||
backfill: s.backfill,
|
||||
feeds: s.feedUrls,
|
||||
counts,
|
||||
byFeed: buckets.map((b) => ({
|
||||
feed: b.feed,
|
||||
total: b.total,
|
||||
ready: b.ready,
|
||||
skipped: b.skipped,
|
||||
failed: b.failed,
|
||||
pending: b.pending,
|
||||
untried: b.untried,
|
||||
usable: b.usable,
|
||||
})),
|
||||
domains,
|
||||
};
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
module.exports = sourcesRoutes;
|
||||
|
|
@ -1,8 +1,17 @@
|
|||
const db = require('../db');
|
||||
const { getLastIngestionBySource } = require('../state');
|
||||
|
||||
let statusCache = null;
|
||||
let statusCacheAt = 0;
|
||||
const STATUS_CACHE_TTL_MS = 30 * 1000;
|
||||
|
||||
async function statusRoutes(fastify) {
|
||||
fastify.get('/status', async () => {
|
||||
const now = Date.now();
|
||||
if (statusCache && now - statusCacheAt < STATUS_CACHE_TTL_MS) {
|
||||
return statusCache;
|
||||
}
|
||||
|
||||
const bySourceRows = db.prepare(`
|
||||
SELECT
|
||||
source,
|
||||
|
|
@ -11,7 +20,7 @@ async function statusRoutes(fastify) {
|
|||
SUM(CASE
|
||||
WHEN content IS NOT NULL AND content != ''
|
||||
AND is_index_page = 0
|
||||
AND EXISTS (SELECT 1 FROM article_embedding_meta WHERE article_id = articles.id)
|
||||
AND has_embedding = 1
|
||||
THEN 1 ELSE 0
|
||||
END) AS usable
|
||||
FROM articles
|
||||
|
|
@ -38,7 +47,7 @@ async function statusRoutes(fastify) {
|
|||
ORDER BY article_count DESC
|
||||
`).all();
|
||||
|
||||
return {
|
||||
const result = {
|
||||
total: totals.total,
|
||||
usable: totals.usable,
|
||||
lastIngestionBySource: getLastIngestionBySource(),
|
||||
|
|
@ -51,6 +60,10 @@ async function statusRoutes(fastify) {
|
|||
dimensions: row.sample_bytes ? row.sample_bytes / 4 : null,
|
||||
})),
|
||||
};
|
||||
|
||||
statusCache = result;
|
||||
statusCacheAt = now;
|
||||
return result;
|
||||
});
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue