diff --git a/README.md b/README.md
index 0f4754d..b5ec5c5 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # duriin_api
 
-Node.js Fastify server that ingests news articles from RSS, SEC EDGAR 8-K filings, Alpha Vantage News Sentiment, Finnhub company news, and GDELT into a local SQLite archive.
+Node.js Fastify server that ingests news articles from RSS, SEC EDGAR 8-K filings, Alpha Vantage News Sentiment, Finnhub company news, GDELT, and configured publisher crawlers into a local SQLite archive.
 
 ## Setup
 
@@ -8,27 +8,312 @@ Node.js Fastify server that ingests news articles from RSS, SEC EDGAR 8-K filing
    ```bash
    npm install
    ```
-2. Edit `config.json` with your API keys, including `openRouter.apiKey`, tickers, RSS feeds, and schedules.
+2. Edit `config.json` with your API keys, tickers, RSS feeds, crawler settings, and schedules.
 3. Start the server:
    ```bash
    npm start
    ```
 
-## API
+The server listens on the host and port defined in `config.json`.
 
-- `GET /articles?q=&source=&from=&to=&limit=&offset=`
-- `GET /articles?similar_to={id}&limit=`
-- `GET /articles?topic={query}&limit=`
-- `GET /articles/:id`
-- `GET /status`
+## How the data pipeline works
+
+On startup the server:
+
+1. Opens the SQLite database.
+2. Registers the article and status routes.
+3. Starts the HTTP server.
+4. Immediately runs all ingestion sources once.
+5. Starts the cron scheduler for recurring ingestions, content backfill, and embedding backfill.
+
+When a new article is inserted:
+
+- the record is written immediately with `title`, `description`, `url`, `source`, and timestamps
+- `content` and `image` start as `null`
+- full article extraction runs asynchronously after insert
+- vector embeddings are generated later, after title, description, and content are all available
+
+## API overview
+
+All exposed endpoints are `GET` endpoints.
+
+### `GET /`
+
+Simple health check.
+
+**Response**
+```json
+{ "ok": true }
+```
+
+Use this to confirm the server is running; to inspect ingestion state, use `GET /status` instead.
+
+### `GET /articles`
+
+Returns articles from the `articles` table. Behavior changes based on the query params you send.
+
+#### Query params
+
+##### `keyword`
+
+Plain keyword search.
+
+- matches `title`, `description`, and `content`
+- uses SQL `LIKE`
+- works like substring matching, not semantic search
+- best when you want literal words or phrases to appear in the article text
+
+Example:
+```http
+GET /articles?keyword=earnings
+```
+
+##### `source`
+
+Exact match on the stored `source` field.
+
+Example:
+```http
+GET /articles?source=rss
+```
+
+##### `from`
+
+Only returns rows where `pub_date >= from`.
+
+Example:
+```http
+GET /articles?from=2025-01-01T00:00:00.000Z
+```
+
+##### `to`
+
+Only returns rows where `pub_date <= to`.
+
+Example:
+```http
+GET /articles?to=2025-01-31T23:59:59.999Z
+```
+
+##### `limit`
+
+Number of rows to return.
+
+- default: `20`
+- max: `100`
+
+Example:
+```http
+GET /articles?limit=10
+```
+
+##### `offset`
+
+Pagination offset.
+
+- default: `0`
+
+Example:
+```http
+GET /articles?limit=10&offset=20
+```
+
+##### `similar_to_article`
+
+Runs vector similarity search instead of normal list mode.
+
+- value must be an existing article ID
+- the server looks up that article's embedding
+- nearest-neighbor search runs in `sqlite-vec`
+- the source article is excluded from the result set
+- each result includes a `distance` field
+- lower `distance` means more similar
+- returns `404` if the article has no stored embedding
+
+Example:
+```http
+GET /articles?similar_to_article=123&limit=5
+```
+
+Not found response:
+```json
+{ "error": "Embedding not found for article" }
+```
+
+##### `semantic`
+
+Semantic search by meaning, not exact wording.
+
+- use this when you want conceptually related results
+- unlike `keyword`, the words do not need to appear literally in the article text
+- the query text is normalized before embedding
+- query embeddings are cached in SQLite
+- on cache miss, the server requests an embedding from OpenRouter
+- nearest article matches are returned from the embedding index
+- each result includes a `distance` field
+- lower `distance` means a closer semantic match
+- returns `400` if `semantic` is empty
+
+Example:
+```http
+GET /articles?semantic=ai%20chip%20demand&limit=10
+```
+
+Bad request response:
+```json
+{ "error": "Semantic query must not be empty" }
+```
+
+##### `include_embedding`
+
+Explicitly rejected on `/articles` with a `400` response.
+
+Response:
+```json
+{ "error": "Embeddings are not returned directly. Use similar_to_article for vector search." }
+```
+
+#### General behavior
+
+- If `semantic` is present, semantic search is used.
+- Else if `similar_to_article` is present, similarity search is used.
+- Otherwise normal list/search mode is used.
+- `keyword` is literal keyword matching.
+- `semantic` is semantic matching by meaning.
+- Normal list/search results are ordered by `COALESCE(pub_date, ingested_at) DESC, id DESC`.
+- `from` and `to` are compared against stored publication timestamps, so ISO-8601 values are the safest input.
+- `source` must match the stored source name exactly.
+- `keyword` is substring matching, not full-text search.
+
+#### Normal list/search response shape
+
+```json
+[
+  {
+    "id": 123,
+    "title": "...",
+    "description": "...",
+    "content": "...",
+    "image": "...",
+    "url": "...",
+    "normalized_title": "...",
+    "source": "rss",
+    "pub_date": "2025-01-01T12:34:56.000Z",
+    "ingested_at": "2025-01-01T12:35:10.000Z"
+  }
+]
+```
+
+#### Similarity/semantic search response shape
+
+```json
+[
+  {
+    "id": 456,
+    "title": "...",
+    "description": "...",
+    "content": "...",
+    "image": "...",
+    "url": "...",
+    "normalized_title": "...",
+    "source": "rss",
+    "pub_date": "2025-01-02T09:00:00.000Z",
+    "ingested_at": "2025-01-02T09:00:10.000Z",
+    "distance": 0.1234
+  }
+]
+```
+
+#### Combined example
+
+```http
+GET /articles?keyword=earnings&source=rss&from=2025-01-01T00:00:00.000Z&limit=10&offset=0
+```
+
+### `GET /articles/:id`
+
+Returns one article by numeric ID.
+
+**Behavior**
+
+- Looks up the article directly in SQLite.
+- Returns the same article fields as normal `/articles` list mode.
+- Does not return embedding data.
+- Returns `404` if the ID does not exist.
+
+**Example**
+```http
+GET /articles/123
+```
+
+**Not found response**
+```json
+{ "error": "Article not found" }
+```
+
+### `GET /status`
+
+Returns ingestion and archive summary information.
+
+**Response fields**
+
+- `totalArticles`: total number of rows in `articles`
+- `countsBySource`: article counts grouped by source name
+- `lastIngestionBySource`: in-memory timestamps of the last successful batch run per source
+- `contentFetchCoverage.total`: total article count used for coverage math
+- `contentFetchCoverage.withContent`: rows whose `content` is present and non-empty
+- `contentFetchCoverage.withImage`: rows whose `image` is present and non-empty
+- `contentFetchCoverage.withEmbedding`: rows that have an embedding in `article_embeddings`
+- `contentFetchCoverage.contentRatio`: `withContent / total`
+- `contentFetchCoverage.imageRatio`: `withImage / total`
+- `contentFetchCoverage.embeddingRatio`: `withEmbedding / total`
+
+**Important detail**
+
+`lastIngestionBySource` is kept in memory, so it resets when the process restarts.
+
+**Example response**
+```json
+{
+  "totalArticles": 10234,
+  "countsBySource": {
+    "alphavantage": 120,
+    "edgar": 88,
+    "finnhub": 400,
+    "gdelt": 2100,
+    "rss": 7526
+  },
+  "lastIngestionBySource": {
+    "rss": "2025-01-02T10:00:00.000Z",
+    "gdelt": "2025-01-02T10:05:00.000Z"
+  },
+  "contentFetchCoverage": {
+    "withContent": 9000,
+    "withImage": 6500,
+    "withEmbedding": 8700,
+    "total": 10234,
+    "contentRatio": 0.8794,
+    "imageRatio": 0.6351,
+    "embeddingRatio": 0.8501
+  }
+}
+```
+
+## Article field notes
+
+- `image` stores the extracted main image as ultra-compressed base64 WebP.
+- `normalized_title` is stored for matching and indexing.
+- `source` may be a shared source like `rss`, `gdelt`, `edgar`, `alphavantage`, or `finnhub`, or a crawler-derived source name for a configured publisher.
+- `pub_date` is normalized to ISO-8601 when it can be parsed.
+- `ingested_at` is the insert timestamp set by the server.
 
 ## Notes
 
 - SQLite archive file defaults to `./archive.sqlite`.
 - Deduplication is enforced on `url`; normalized titles are stored and indexed for matching but are not unique.
 - `newsCrawler` reuses `rssFeeds` as the publisher catalog, derives one crawler source per feed label, and supports `disabledLabels` plus per-label `overrides` for seeds and allowed hosts.
-- Article body extraction runs asynchronously after insertion, with hourly retries for rows still missing content.
-- Main article images are stored as ultra-compressed base64 WebP.
+- Article body extraction runs asynchronously after insertion, with scheduled retries for rows still missing content.
 - Embeddings are generated asynchronously with OpenRouter `perplexity/pplx-embed-v1-0.6b` and indexed in `sqlite-vec` for similarity search.
 - Topic search caches normalized query embeddings in SQLite and falls back to OpenRouter on cache miss.
 - SEC requests use the configured `User-Agent`.
+- Duplicate URLs are skipped rather than inserted again.
diff --git a/config.json b/config.json
index af2b47a..a7b0d4f 100644
--- a/config.json
+++ b/config.json
@@ -489,7 +489,8 @@
           "https://jamaica-gleaner.com/",
           "https://jamaica-gleaner.com/news",
           "https://jamaica-gleaner.com/business"
-        ]
+        ],
+        "requestTimeout": 25000
       },
       "Jamaica Observer": {
         "allowedHosts": [
diff --git a/src/http.js b/src/http.js
index 6f9de11..9e12ae2 100644
--- a/src/http.js
+++ b/src/http.js
@@ -41,6 +41,33 @@ function getRetryDelay(attempt, response) {
   return baseDelay + Math.floor(Math.random() * 250);
 }
 
+function getErrorCode(error) {
+  return String(error?.code || error?.cause?.code || '').trim();
+}
+
+function isRetryableError(error) {
+  const code = getErrorCode(error);
+  const message = String(error?.message || '').toLowerCase();
+
+  if (code === 'UNABLE_TO_VERIFY_LEAF_SIGNATURE') {
+    return false;
+  }
+
+  if (error?.name === 'TimeoutError') {
+    return true;
+  }
+
+  return [
+    'UND_ERR_SOCKET',
+    'UND_ERR_CONNECT_TIMEOUT',
+    'ECONNRESET',
+    'ECONNREFUSED',
+    'ETIMEDOUT',
+    'EAI_AGAIN',
+    'ENETUNREACH',
+  ].includes(code) || message.includes('other side closed');
+}
+
 async function fetchWithPolicy(url, options = {}) {
   const {
     timeout = 20000,
@@ -73,6 +100,10 @@ async function fetchWithPolicy(url, options = {}) {
       lastError = error;
     } catch (error) {
       lastError = error;
+
+      if (!isRetryableError(error)) {
+        throw error;
+      }
     }
 
     if (attempt < retries) {
diff --git a/src/routes/articles.js b/src/routes/articles.js
index 4f0b9b9..072bf40 100644
--- a/src/routes/articles.js
+++ b/src/routes/articles.js
@@ -11,9 +11,9 @@ function buildArticlesQuery(query) {
   const params = [];
   const includeEmbedding = String(query.include_embedding || '').toLowerCase() === 'true';
 
-  if (query.q) {
+  if (query.keyword) {
     conditions.push('(title LIKE ? OR description LIKE ? OR content LIKE ?)');
-    const keyword = `%${query.q}%`;
+    const keyword = `%${query.keyword}%`;
     params.push(keyword, keyword, keyword);
   }
 
@@ -56,16 +56,16 @@ async function articleRoutes(fastify) {
     const query = request.query || {};
     if (query.include_embedding) {
       reply.code(400);
-      return { error: 'Embeddings are not returned directly. Use similar_to for vector search.' };
+      return { error: 'Embeddings are not returned directly. Use similar_to_article for vector search.' };
     }
 
-    if (query.topic !== undefined) {
+    if (query.semantic !== undefined) {
       const limit = Number.parseInt(query.limit, 10);
-      const embedding = await getOrCreateQueryEmbedding(query.topic);
+      const embedding = await getOrCreateQueryEmbedding(query.semantic);
 
       if (!embedding) {
         reply.code(400);
-        return { error: 'Topic must not be empty' };
+        return { error: 'Semantic query must not be empty' };
       }
 
       const neighbors = findArticlesByEmbedding(
@@ -93,9 +93,9 @@ async function articleRoutes(fastify) {
         .filter(Boolean);
     }
 
-    if (query.similar_to) {
+    if (query.similar_to_article) {
       const limit = Number.parseInt(query.limit, 10);
-      const articleId = Number.parseInt(query.similar_to, 10);
+      const articleId = Number.parseInt(query.similar_to_article, 10);
       const neighbors = findSimilarArticles(
         articleId,
         Number.isFinite(limit) && limit > 0 ? Math.min(limit, 100) : 20
@@ -134,7 +134,7 @@ async function articleRoutes(fastify) {
   fastify.get('/articles/:id', async (request, reply) => {
     if (String((request.query || {}).include_embedding || '').toLowerCase() === 'true') {
       reply.code(400);
-      return { error: 'Embeddings are not returned directly. Use similar_to for vector search.' };
+      return { error: 'Embeddings are not returned directly. Use similar_to_article for vector search.' };
     }
 
     const article = db.prepare(`