refactor article query parameters and improve error messages

2026-04-17 00:27:32 +01:00 · 2026-04-17 00:27:32 +01:00 · b298be6108
commit b298be6108
parent c80558fd50
4 changed files with 337 additions and 20 deletions
--- a/README.md
+++ b/README.md
@ -1,6 +1,6 @@
 # duriin_api

-Node.js Fastify server that ingests news articles from RSS, SEC EDGAR 8-K filings, Alpha Vantage News Sentiment, Finnhub company news, and GDELT into a local SQLite archive.
+Node.js Fastify server that ingests news articles from RSS, SEC EDGAR 8-K filings, Alpha Vantage News Sentiment, Finnhub company news, GDELT, and configured publisher crawlers into a local SQLite archive.

 ## Setup

@ -8,27 +8,312 @@ Node.js Fastify server that ingests news articles from RSS, SEC EDGAR 8-K filing
   ```bash
   npm install
   ```
-2. Edit `config.json` with your API keys, including `openRouter.apiKey`, tickers, RSS feeds, and schedules.
+2. Edit `config.json` with your API keys, tickers, RSS feeds, crawler settings, and schedules.
 3. Start the server:
   ```bash
   npm start
   ```

-## API
+The server listens on the host and port defined in `config.json`.

- `GET /articles?q=&source=&from=&to=&limit=&offset=`
- `GET /articles?similar_to={id}&limit=`
- `GET /articles?topic={query}&limit=`
- `GET /articles/:id`
- `GET /status`
+## How the data pipeline works
+
+On startup the server:
+
+1. Opens the SQLite database.
+2. Registers the article and status routes.
+3. Starts the HTTP server.
+4. Immediately runs all ingestion sources once.
+5. Starts the cron scheduler for recurring ingestions, content backfill, and embedding backfill.
+
+When a new article is inserted:
+
+- the record is written immediately with `title`, `description`, `url`, `source`, and timestamps
+- `content` and `image` start as `null`
+- full article extraction runs asynchronously after insert
+- vector embeddings are generated later, after title, description, and content are all available
+
+## API overview
+
+All exposed endpoints are `GET` endpoints.
+
+### `GET /`
+
+Simple health check.
+
+**Response**
+```json
+{ "ok": true }
+```
+
+Use this to confirm the server is running, not to inspect ingestion state.
+
+### `GET /articles`
+
+Returns articles from the `articles` table. Behavior changes based on the query params you send.
+
+#### Query params
+
+##### `keyword`
+
+Plain keyword search.
+
+- matches `title`, `description`, and `content`
+- uses SQL `LIKE`
+- works like substring matching, not semantic search
+- best when you want literal words or phrases to appear in the article text
+
+Example:
+```http
+GET /articles?keyword=earnings
+```
+
+##### `source`
+
+Exact match on the stored `source` field.
+
+Example:
+```http
+GET /articles?source=rss
+```
+
+##### `from`
+
+Only returns rows where `pub_date >= from`.
+
+Example:
+```http
+GET /articles?from=2025-01-01T00:00:00.000Z
+```
+
+##### `to`
+
+Only returns rows where `pub_date <= to`.
+
+Example:
+```http
+GET /articles?to=2025-01-31T23:59:59.999Z
+```
+
+##### `limit`
+
+Number of rows to return.
+
+- default: `20`
+- max: `100`
+
+Example:
+```http
+GET /articles?limit=10
+```
+
+##### `offset`
+
+Pagination offset.
+
+- default: `0`
+
+Example:
+```http
+GET /articles?limit=10&offset=20
+```
+
+##### `similar_to_article`
+
+Runs vector similarity search instead of normal list mode.
+
+- value must be an existing article ID
+- the server looks up that article's embedding
+- nearest-neighbor search runs in `sqlite-vec`
+- the source article is excluded from the result set
+- each result includes a `distance` field
+- lower `distance` means more similar
+- returns `404` if the article has no stored embedding
+
+Example:
+```http
+GET /articles?similar_to_article=123&limit=5
+```
+
+Not found response:
+```json
+{ "error": "Embedding not found for article" }
+```
+
+##### `semantic`
+
+Semantic search by meaning, not exact wording.
+
+- use this when you want conceptually related results
+- unlike `keyword`, the words do not need to appear literally in the article text
+- the query text is normalized before embedding
+- query embeddings are cached in SQLite
+- on cache miss, the server requests an embedding from OpenRouter
+- nearest article matches are returned from the embedding index
+- each result includes a `distance` field
+- lower `distance` means a closer semantic match
+- returns `400` if `semantic` is empty
+
+Example:
+```http
+GET /articles?semantic=ai%20chip%20demand&limit=10
+```
+
+Bad request response:
+```json
+{ "error": "Semantic query must not be empty" }
+```
+
+##### `include_embedding`
+
+Explicitly rejected on `/articles` with a `400` status.
+
+Response:
+```json
+{ "error": "Embeddings are not returned directly. Use similar_to_article for vector search." }
+```
+
+#### General behavior
+
+- If `semantic` is present, semantic search is used.
+- Else if `similar_to_article` is present, similarity search is used.
+- Otherwise normal list/search mode is used.
+- `keyword` is literal keyword matching.
+- `semantic` is semantic matching by meaning.
+- Normal list/search results are ordered by `COALESCE(pub_date, ingested_at) DESC, id DESC`.
+- `from` and `to` are compared against stored publication timestamps, so ISO-8601 values are the safest input.
+- `source` must match the stored source name exactly.
+- `keyword` is substring matching, not full-text search.
+
+#### Normal list/search response shape
+
+```json
+[
+  {
+    "id": 123,
+    "title": "...",
+    "description": "...",
+    "content": "...",
+    "image": "...",
+    "url": "...",
+    "normalized_title": "...",
+    "source": "rss",
+    "pub_date": "2025-01-01T12:34:56.000Z",
+    "ingested_at": "2025-01-01T12:35:10.000Z"
+  }
+]
+```
+
+#### Similarity/semantic search response shape
+
+```json
+[
+  {
+    "id": 456,
+    "title": "...",
+    "description": "...",
+    "content": "...",
+    "image": "...",
+    "url": "...",
+    "normalized_title": "...",
+    "source": "rss",
+    "pub_date": "2025-01-02T09:00:00.000Z",
+    "ingested_at": "2025-01-02T09:00:10.000Z",
+    "distance": 0.1234
+  }
+]
+```
+
+#### Combined example
+
+```http
+GET /articles?keyword=earnings&source=rss&from=2025-01-01T00:00:00.000Z&limit=10&offset=0
+```
+
+### `GET /articles/:id`
+
+Returns one article by numeric ID.
+
+**Behavior**
+
+- Looks up the article directly in SQLite.
+- Returns the same article fields as normal `/articles` list mode.
+- Does not return embedding data.
+- Returns `404` if the ID does not exist.
+
+**Example**
+```http
+GET /articles/123
+```
+
+**Not found response**
+```json
+{ "error": "Article not found" }
+```
+
+### `GET /status`
+
+Returns ingestion and archive summary information.
+
+**Response fields**
+
+- `totalArticles`: total number of rows in `articles`
+- `countsBySource`: article counts grouped by source name
+- `lastIngestionBySource`: in-memory timestamps of the last successful batch run per source
+- `contentFetchCoverage.total`: total article count used for coverage math
+- `contentFetchCoverage.withContent`: rows whose `content` is present and non-empty
+- `contentFetchCoverage.withImage`: rows whose `image` is present and non-empty
+- `contentFetchCoverage.withEmbedding`: rows that have an embedding in `article_embeddings`
+- `contentFetchCoverage.contentRatio`: `withContent / total`
+- `contentFetchCoverage.imageRatio`: `withImage / total`
+- `contentFetchCoverage.embeddingRatio`: `withEmbedding / total`
+
+**Important detail**
+
+`lastIngestionBySource` is kept in memory, so it resets when the process restarts.
+
+**Example response**
+```json
+{
+  "totalArticles": 10234,
+  "countsBySource": {
+    "alphavantage": 120,
+    "edgar": 88,
+    "finnhub": 400,
+    "gdelt": 2100,
+    "rss": 7526
+  },
+  "lastIngestionBySource": {
+    "rss": "2025-01-02T10:00:00.000Z",
+    "gdelt": "2025-01-02T10:05:00.000Z"
+  },
+  "contentFetchCoverage": {
+    "withContent": 9000,
+    "withImage": 6500,
+    "withEmbedding": 8700,
+    "total": 10234,
+    "contentRatio": 0.8794,
+    "imageRatio": 0.6351,
+    "embeddingRatio": 0.8501
+  }
+}
+```
+
+## Article field notes
+
+- `image` stores the extracted main image as ultra-compressed base64 WebP.
+- `normalized_title` is stored for matching and indexing.
+- `source` may be a shared source like `rss`, `gdelt`, `edgar`, `alphavantage`, or `finnhub`, or a crawler-derived source name for a configured publisher.
+- `pub_date` is normalized to ISO-8601 when it can be parsed.
+- `ingested_at` is the insert timestamp set by the server.

 ## Notes

 - SQLite archive file defaults to `./archive.sqlite`.
 - Deduplication is enforced on `url`; normalized titles are stored and indexed for matching but are not unique.
 - `newsCrawler` reuses `rssFeeds` as the publisher catalog, derives one crawler source per feed label, and supports `disabledLabels` plus per-label `overrides` for seeds and allowed hosts.
- Article body extraction runs asynchronously after insertion, with hourly retries for rows still missing content.
- Main article images are stored as ultra-compressed base64 WebP.
+- Article body extraction runs asynchronously after insertion, with scheduled retries for rows still missing content.
 - Embeddings are generated asynchronously with OpenRouter `perplexity/pplx-embed-v1-0.6b` and indexed in `sqlite-vec` for similarity search.
 - Topic search caches normalized query embeddings in SQLite and falls back to OpenRouter on cache miss.
 - SEC requests use the configured `User-Agent`.
+- Duplicate URLs are skipped rather than inserted again.
--- a/config.json
+++ b/config.json
@ -489,7 +489,8 @@
          "https://jamaica-gleaner.com/",
          "https://jamaica-gleaner.com/news",
          "https://jamaica-gleaner.com/business"
-        ]
+        ],
+        "requestTimeout": 25000
      },
      "Jamaica Observer": {
        "allowedHosts": [
--- a/src/http.js
+++ b/src/http.js
@ -41,6 +41,33 @@ function getRetryDelay(attempt, response) {
  return baseDelay + Math.floor(Math.random() * 250);
 }

+function getErrorCode(error) { // Returns error.code, else error.cause.code, as a trimmed string ('' when neither is set).
+  return String(error?.code || error?.cause?.code || '').trim();
+}
+
+function isRetryableError(error) { // Returns true when the error is classified as retryable, false otherwise.
+  const code = getErrorCode(error);
+  const message = String(error?.message || '').toLowerCase(); // lowercased so the message match below is case-insensitive
+
+  if (code === 'UNABLE_TO_VERIFY_LEAF_SIGNATURE') { // this code is checked first and is never retried
+    return false;
+  }
+
+  if (error?.name === 'TimeoutError') { // errors named TimeoutError are retried
+    return true;
+  }
+
+  return [ // error codes treated as retryable
+    'UND_ERR_SOCKET',
+    'UND_ERR_CONNECT_TIMEOUT',
+    'ECONNRESET',
+    'ECONNREFUSED',
+    'ETIMEDOUT',
+    'EAI_AGAIN',
+    'ENETUNREACH',
+  ].includes(code) || message.includes('other side closed'); // also retryable when the message contains 'other side closed'
+}
+
 async function fetchWithPolicy(url, options = {}) {
  const {
    timeout = 20000,
@ -73,6 +100,10 @@ async function fetchWithPolicy(url, options = {}) {
      lastError = error;
    } catch (error) {
      lastError = error;
+
+      if (!isRetryableError(error)) {
+        throw error;
+      }
    }

    if (attempt < retries) {
--- a/src/routes/articles.js
+++ b/src/routes/articles.js
@ -11,9 +11,9 @@ function buildArticlesQuery(query) {
  const params = [];
  const includeEmbedding = String(query.include_embedding || '').toLowerCase() === 'true';

-  if (query.q) {
+  if (query.keyword) {
    conditions.push('(title LIKE ? OR description LIKE ? OR content LIKE ?)');
-    const keyword = `%${query.q}%`;
+    const keyword = `%${query.keyword}%`;
    params.push(keyword, keyword, keyword);
  }

@ -56,16 +56,16 @@ async function articleRoutes(fastify) {
    const query = request.query || {};
    if (query.include_embedding) {
      reply.code(400);
-      return { error: 'Embeddings are not returned directly. Use similar_to for vector search.' };
+      return { error: 'Embeddings are not returned directly. Use similar_to_article for vector search.' };
    }

-    if (query.topic !== undefined) {
+    if (query.semantic !== undefined) {
      const limit = Number.parseInt(query.limit, 10);
-      const embedding = await getOrCreateQueryEmbedding(query.topic);
+      const embedding = await getOrCreateQueryEmbedding(query.semantic);

      if (!embedding) {
        reply.code(400);
-        return { error: 'Topic must not be empty' };
+        return { error: 'Semantic query must not be empty' };
      }

      const neighbors = findArticlesByEmbedding(
@ -93,9 +93,9 @@ async function articleRoutes(fastify) {
        .filter(Boolean);
    }

-    if (query.similar_to) {
+    if (query.similar_to_article) {
      const limit = Number.parseInt(query.limit, 10);
-      const articleId = Number.parseInt(query.similar_to, 10);
+      const articleId = Number.parseInt(query.similar_to_article, 10);
      const neighbors = findSimilarArticles(
        articleId,
        Number.isFinite(limit) && limit > 0 ? Math.min(limit, 100) : 20
@ -134,7 +134,7 @@ async function articleRoutes(fastify) {
  fastify.get('/articles/:id', async (request, reply) => {
    if (String((request.query || {}).include_embedding || '').toLowerCase() === 'true') {
      reply.code(400);
-      return { error: 'Embeddings are not returned directly. Use similar_to for vector search.' };
+      return { error: 'Embeddings are not returned directly. Use similar_to_article for vector search.' };
    }

    const article = db.prepare(`