diff --git a/README.md b/README.md index 0f4754d..b5ec5c5 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # duriin_api -Node.js Fastify server that ingests news articles from RSS, SEC EDGAR 8-K filings, Alpha Vantage News Sentiment, Finnhub company news, and GDELT into a local SQLite archive. +Node.js Fastify server that ingests news articles from RSS, SEC EDGAR 8-K filings, Alpha Vantage News Sentiment, Finnhub company news, GDELT, and configured publisher crawlers into a local SQLite archive. ## Setup @@ -8,27 +8,312 @@ Node.js Fastify server that ingests news articles from RSS, SEC EDGAR 8-K filing ```bash npm install ``` -2. Edit `config.json` with your API keys, including `openRouter.apiKey`, tickers, RSS feeds, and schedules. +2. Edit `config.json` with your API keys, tickers, RSS feeds, crawler settings, and schedules. 3. Start the server: ```bash npm start ``` -## API +The server listens on the host and port defined in `config.json`. -- `GET /articles?q=&source=&from=&to=&limit=&offset=` -- `GET /articles?similar_to={id}&limit=` -- `GET /articles?topic={query}&limit=` -- `GET /articles/:id` -- `GET /status` +## How the data pipeline works + +On startup the server: + +1. Opens the SQLite database. +2. Registers the article and status routes. +3. Starts the HTTP server. +4. Immediately runs all ingestion sources once. +5. Starts the cron scheduler for recurring ingestions, content backfill, and embedding backfill. + +When a new article is inserted: + +- the record is written immediately with `title`, `description`, `url`, `source`, and timestamps +- `content` and `image` start as `null` +- full article extraction runs asynchronously after insert +- vector embeddings are generated later, after title, description, and content are all available + +## API overview + +All exposed endpoints are `GET` endpoints. + +### `GET /` + +Simple health check. 
+ +**Response** +```json +{ "ok": true } +``` + +Use this to confirm the server is running, not to inspect ingestion state. + +### `GET /articles` + +Returns articles from the `articles` table. Behavior changes based on the query params you send. + +#### Query params + +##### `keyword` + +Plain keyword search. + +- matches `title`, `description`, and `content` +- uses SQL `LIKE` +- works like substring matching, not semantic search +- best when you want literal words or phrases to appear in the article text + +Example: +```http +GET /articles?keyword=earnings +``` + +##### `source` + +Exact match on the stored `source` field. + +Example: +```http +GET /articles?source=rss +``` + +##### `from` + +Only returns rows where `pub_date >= from`. + +Example: +```http +GET /articles?from=2025-01-01T00:00:00.000Z +``` + +##### `to` + +Only returns rows where `pub_date <= to`. + +Example: +```http +GET /articles?to=2025-01-31T23:59:59.999Z +``` + +##### `limit` + +Number of rows to return. + +- default: `20` +- max: `100` + +Example: +```http +GET /articles?limit=10 +``` + +##### `offset` + +Pagination offset. + +- default: `0` + +Example: +```http +GET /articles?limit=10&offset=20 +``` + +##### `similar_to_article` + +Runs vector similarity search instead of normal list mode. + +- value must be an existing article ID +- the server looks up that article's embedding +- nearest-neighbor search runs in `sqlite-vec` +- the source article is excluded from the result set +- each result includes a `distance` field +- lower `distance` means more similar +- returns `404` if the article has no stored embedding + +Example: +```http +GET /articles?similar_to_article=123&limit=5 +``` + +Not found response: +```json +{ "error": "Embedding not found for article" } +``` + +##### `semantic` + +Semantic search by meaning, not exact wording. 
+ +- use this when you want conceptually related results +- unlike `keyword`, the words do not need to appear literally in the article text +- the query text is normalized before embedding +- query embeddings are cached in SQLite +- on cache miss, the server requests an embedding from OpenRouter +- nearest article matches are returned from the embedding index +- each result includes a `distance` field +- lower `distance` means a closer semantic match +- returns `400` if `semantic` is empty + +Example: +```http +GET /articles?semantic=ai%20chip%20demand&limit=10 +``` + +Bad request response: +```json +{ "error": "Semantic query must not be empty" } +``` + +##### `include_embedding` + +Explicitly rejected on `/articles`. + +Response: +```json +{ "error": "Embeddings are not returned directly. Use similar_to_article for vector search." } +``` + +#### General behavior + +- If `semantic` is present, semantic search is used. +- Else if `similar_to_article` is present, similarity search is used. +- Otherwise normal list/search mode is used. +- `keyword` is literal keyword matching. +- `semantic` is semantic matching by meaning. +- Normal list/search results are ordered by `COALESCE(pub_date, ingested_at) DESC, id DESC`. +- `from` and `to` are compared against stored publication timestamps, so ISO-8601 values are the safest input. +- `source` must match the stored source name exactly. +- `keyword` is substring matching, not full-text search. 
+ +#### Normal list/search response shape + +```json +[ + { + "id": 123, + "title": "...", + "description": "...", + "content": "...", + "image": "...", + "url": "...", + "normalized_title": "...", + "source": "rss", + "pub_date": "2025-01-01T12:34:56.000Z", + "ingested_at": "2025-01-01T12:35:10.000Z" + } +] +``` + +#### Similarity/semantic search response shape + +```json +[ + { + "id": 456, + "title": "...", + "description": "...", + "content": "...", + "image": "...", + "url": "...", + "normalized_title": "...", + "source": "rss", + "pub_date": "2025-01-02T09:00:00.000Z", + "ingested_at": "2025-01-02T09:00:10.000Z", + "distance": 0.1234 + } +] +``` + +#### Combined example + +```http +GET /articles?keyword=earnings&source=rss&from=2025-01-01T00:00:00.000Z&limit=10&offset=0 +``` + +### `GET /articles/:id` + +Returns one article by numeric ID. + +**Behavior** + +- Looks up the article directly in SQLite. +- Returns the same article fields as normal `/articles` list mode. +- Does not return embedding data. +- Returns `404` if the ID does not exist. + +**Example** +```http +GET /articles/123 +``` + +**Not found response** +```json +{ "error": "Article not found" } +``` + +### `GET /status` + +Returns ingestion and archive summary information. 
+ +**Response fields** + +- `totalArticles`: total number of rows in `articles` +- `countsBySource`: article counts grouped by source name +- `lastIngestionBySource`: in-memory timestamps of the last successful batch run per source +- `contentFetchCoverage.total`: total article count used for coverage math +- `contentFetchCoverage.withContent`: rows whose `content` is present and non-empty +- `contentFetchCoverage.withImage`: rows whose `image` is present and non-empty +- `contentFetchCoverage.withEmbedding`: rows that have an embedding in `article_embeddings` +- `contentFetchCoverage.contentRatio`: `withContent / total` +- `contentFetchCoverage.imageRatio`: `withImage / total` +- `contentFetchCoverage.embeddingRatio`: `withEmbedding / total` + +**Important detail** + +`lastIngestionBySource` is kept in memory, so it resets when the process restarts. + +**Example response** +```json +{ + "totalArticles": 10234, + "countsBySource": { + "alphavantage": 120, + "edgar": 88, + "finnhub": 400, + "gdelt": 2100, + "rss": 7526 + }, + "lastIngestionBySource": { + "rss": "2025-01-02T10:00:00.000Z", + "gdelt": "2025-01-02T10:05:00.000Z" + }, + "contentFetchCoverage": { + "withContent": 9000, + "withImage": 6500, + "withEmbedding": 8700, + "total": 10234, + "contentRatio": 0.8794, + "imageRatio": 0.6351, + "embeddingRatio": 0.8501 + } +} +``` + +## Article field notes + +- `image` stores the extracted main image as ultra-compressed base64 WebP. +- `normalized_title` is stored for matching and indexing. +- `source` may be a shared source like `rss`, `gdelt`, `edgar`, `alphavantage`, or `finnhub`, or a crawler-derived source name for a configured publisher. +- `pub_date` is normalized to ISO-8601 when it can be parsed. +- `ingested_at` is the insert timestamp set by the server. ## Notes - SQLite archive file defaults to `./archive.sqlite`. - Deduplication is enforced on `url`; normalized titles are stored and indexed for matching but are not unique. 
- `newsCrawler` reuses `rssFeeds` as the publisher catalog, derives one crawler source per feed label, and supports `disabledLabels` plus per-label `overrides` for seeds and allowed hosts. -- Article body extraction runs asynchronously after insertion, with hourly retries for rows still missing content. -- Main article images are stored as ultra-compressed base64 WebP. +- Article body extraction runs asynchronously after insertion, with scheduled retries for rows still missing content. - Embeddings are generated asynchronously with OpenRouter `perplexity/pplx-embed-v1-0.6b` and indexed in `sqlite-vec` for similarity search. - Topic search caches normalized query embeddings in SQLite and falls back to OpenRouter on cache miss. - SEC requests use the configured `User-Agent`. +- Duplicate URLs are skipped rather than inserted again. diff --git a/config.json b/config.json index af2b47a..a7b0d4f 100644 --- a/config.json +++ b/config.json @@ -489,7 +489,8 @@ "https://jamaica-gleaner.com/", "https://jamaica-gleaner.com/news", "https://jamaica-gleaner.com/business" - ] + ], + "requestTimeout": 25000 }, "Jamaica Observer": { "allowedHosts": [ diff --git a/src/http.js b/src/http.js index 6f9de11..9e12ae2 100644 --- a/src/http.js +++ b/src/http.js @@ -41,6 +41,33 @@ function getRetryDelay(attempt, response) { return baseDelay + Math.floor(Math.random() * 250); } +function getErrorCode(error) { + return String(error?.code || error?.cause?.code || '').trim(); +} + +function isRetryableError(error) { + const code = getErrorCode(error); + const message = String(error?.message || '').toLowerCase(); + + if (code === 'UNABLE_TO_VERIFY_LEAF_SIGNATURE') { + return false; + } + + if (error?.name === 'TimeoutError') { + return true; + } + + return [ + 'UND_ERR_SOCKET', + 'UND_ERR_CONNECT_TIMEOUT', + 'ECONNRESET', + 'ECONNREFUSED', + 'ETIMEDOUT', + 'EAI_AGAIN', + 'ENETUNREACH', + ].includes(code) || message.includes('other side closed'); +} + async function fetchWithPolicy(url, options = 
{}) { const { timeout = 20000, @@ -73,6 +100,10 @@ async function fetchWithPolicy(url, options = {}) { lastError = error; } catch (error) { lastError = error; + + if (!isRetryableError(error)) { + throw error; + } } if (attempt < retries) { diff --git a/src/routes/articles.js b/src/routes/articles.js index 4f0b9b9..072bf40 100644 --- a/src/routes/articles.js +++ b/src/routes/articles.js @@ -11,9 +11,9 @@ function buildArticlesQuery(query) { const params = []; const includeEmbedding = String(query.include_embedding || '').toLowerCase() === 'true'; - if (query.q) { + if (query.keyword) { conditions.push('(title LIKE ? OR description LIKE ? OR content LIKE ?)'); - const keyword = `%${query.q}%`; + const keyword = `%${query.keyword}%`; params.push(keyword, keyword, keyword); } @@ -56,16 +56,16 @@ async function articleRoutes(fastify) { const query = request.query || {}; if (query.include_embedding) { reply.code(400); - return { error: 'Embeddings are not returned directly. Use similar_to for vector search.' }; + return { error: 'Embeddings are not returned directly. Use similar_to_article for vector search.' }; } - if (query.topic !== undefined) { + if (query.semantic !== undefined) { const limit = Number.parseInt(query.limit, 10); - const embedding = await getOrCreateQueryEmbedding(query.topic); + const embedding = await getOrCreateQueryEmbedding(query.semantic); if (!embedding) { reply.code(400); - return { error: 'Topic must not be empty' }; + return { error: 'Semantic query must not be empty' }; } const neighbors = findArticlesByEmbedding( @@ -93,9 +93,9 @@ async function articleRoutes(fastify) { .filter(Boolean); } - if (query.similar_to) { + if (query.similar_to_article) { const limit = Number.parseInt(query.limit, 10); - const articleId = Number.parseInt(query.similar_to, 10); + const articleId = Number.parseInt(query.similar_to_article, 10); const neighbors = findSimilarArticles( articleId, Number.isFinite(limit) && limit > 0 ? 
Math.min(limit, 100) : 20 @@ -134,7 +134,7 @@ async function articleRoutes(fastify) { fastify.get('/articles/:id', async (request, reply) => { if (String((request.query || {}).include_embedding || '').toLowerCase() === 'true') { reply.code(400); - return { error: 'Embeddings are not returned directly. Use similar_to for vector search.' }; + return { error: 'Embeddings are not returned directly. Use similar_to_article for vector search.' }; } const article = db.prepare(`