From 8805d3a3fca01a232b76169c3c0e80db1dc3d25b Mon Sep 17 00:00:00 2001 From: ImBenji Date: Mon, 20 Apr 2026 03:41:10 +0100 Subject: [PATCH] enhance article processing by adding language support and adjusting embedding parameters --- CLAUDE.md | 7 ++++ config.json | 2 +- sources.json | 91 +++++++++++++++++++++++++++++++++++++++++++++++ src/content.js | 6 ++-- src/db.js | 17 ++++++++- src/embeddings.js | 2 +- src/ingest.js | 13 +++++-- 7 files changed, 129 insertions(+), 9 deletions(-) create mode 100644 CLAUDE.md diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..2488f24 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,7 @@ +# Database Policy + +When making any changes to the database schema or data, a strict no-data-loss policy must be followed. This means: +- Never DROP columns, tables, or indexes that contain data without first migrating that data elsewhere +- All schema changes must be additive or safe migrations (e.g. ADD COLUMN, rename via copy+verify+drop) +- Always back up or verify row counts before and after any bulk UPDATE or DELETE +- Destructive operations require explicit user confirmation before executing diff --git a/config.json b/config.json index f79e684..14cb93f 100644 --- a/config.json +++ b/config.json @@ -51,7 +51,7 @@ }, "embeddingBackfill": { "perRound": 256, - "batchSize": 16 + "batchSize": 128 }, "browser": { "maxConcurrentPages": 8 diff --git a/sources.json b/sources.json index c9908f7..5f0f290 100644 --- a/sources.json +++ b/sources.json @@ -4,6 +4,7 @@ "label": "Al Jazeera", "feedUrl": "https://www.aljazeera.com/xml/rss/all.xml", "website": "aljazeera.com", + "language": "en", "backfill": true }, { @@ -14,6 +15,7 @@ "bbc.com", "bbc.co.uk" ], + "language": "en", "backfill": true }, { @@ -21,6 +23,7 @@ "label": "Business Insider", "feedUrl": "https://feeds.businessinsider.com/custom/all", "website": "businessinsider.com", + "language": "en", "backfill": true }, { @@ -28,6 +31,7 @@ "label": "Bloomberg Markets", "feedUrl": 
"https://feeds.bloomberg.com/markets/news.rss", "website": "bloomberg.com", + "language": "en", "backfill": true }, { @@ -35,6 +39,7 @@ "label": "CNBC", "feedUrl": "https://www.cnbc.com/id/100003114/device/rss/rss.html", "website": "cnbc.com", + "language": "en", "backfill": true }, { @@ -42,6 +47,7 @@ "label": "Wall Street Journal", "feedUrl": "https://feeds.a.dj.com/rss/RSSMarketsMain.xml", "website": "wsj.com", + "language": "en", "backfill": true }, { @@ -49,6 +55,7 @@ "label": "MarketWatch", "feedUrl": "https://feeds.marketwatch.com/marketwatch/topstories/", "website": "marketwatch.com", + "language": "en", "backfill": true }, { @@ -59,6 +66,7 @@ "finance.yahoo.com", "yahoo.com" ], + "language": "en", "backfill": true }, { @@ -66,6 +74,7 @@ "label": "Seeking Alpha", "feedUrl": "https://seekingalpha.com/feed.xml", "website": "seekingalpha.com", + "language": "en", "backfill": true }, { @@ -73,6 +82,7 @@ "label": "Financial Times", "feedUrl": "https://www.ft.com/?format=rss", "website": "ft.com", + "language": "en", "backfill": true }, { @@ -80,6 +90,7 @@ "label": "The Economist", "feedUrl": "https://www.economist.com/finance-and-economics/rss.xml", "website": "economist.com", + "language": "en", "backfill": true }, { @@ -87,6 +98,7 @@ "label": "Fortune", "feedUrl": "https://fortune.com/feed", "website": "fortune.com", + "language": "en", "backfill": true }, { @@ -94,6 +106,7 @@ "label": "Forbes Business", "feedUrl": "https://www.forbes.com/business/feed/", "website": "forbes.com", + "language": "en", "backfill": true }, { @@ -101,6 +114,7 @@ "label": "Inc Magazine", "feedUrl": "https://www.inc.com/rss", "website": "inc.com", + "language": "en", "backfill": true }, { @@ -108,6 +122,7 @@ "label": "Fast Company", "feedUrl": "https://www.fastcompany.com/latest/rss", "website": "fastcompany.com", + "language": "en", "backfill": true }, { @@ -115,6 +130,7 @@ "label": "Entrepreneur", "feedUrl": "https://www.entrepreneur.com/latest.rss", "website": "entrepreneur.com", 
+ "language": "en", "backfill": true }, { @@ -122,6 +138,7 @@ "label": "Axios", "feedUrl": "https://api.axios.com/feed/", "website": "axios.com", + "language": "en", "backfill": true }, { @@ -129,6 +146,7 @@ "label": "Wired Business", "feedUrl": "https://www.wired.com/feed/category/business/latest/rss", "website": "wired.com", + "language": "en", "backfill": true }, { @@ -136,6 +154,7 @@ "label": "NPR Business", "feedUrl": "https://feeds.npr.org/1006/rss.xml", "website": "npr.org", + "language": "en", "backfill": true }, { @@ -143,6 +162,7 @@ "label": "Federal Reserve", "feedUrl": "https://www.federalreserve.gov/feeds/press_all.xml", "website": "federalreserve.gov", + "language": "en", "backfill": true }, { @@ -150,6 +170,7 @@ "label": "TechCrunch", "feedUrl": "https://techcrunch.com/feed/", "website": "techcrunch.com", + "language": "en", "backfill": true }, { @@ -157,6 +178,7 @@ "label": "The Verge", "feedUrl": "https://www.theverge.com/rss/index.xml", "website": "theverge.com", + "language": "en", "backfill": true }, { @@ -164,6 +186,7 @@ "label": "Ars Technica", "feedUrl": "https://feeds.arstechnica.com/arstechnica/index", "website": "arstechnica.com", + "language": "en", "backfill": true }, { @@ -171,6 +194,7 @@ "label": "Retail Dive", "feedUrl": "https://www.retaildive.com/feeds/news/", "website": "retaildive.com", + "language": "en", "backfill": true }, { @@ -178,6 +202,7 @@ "label": "Manufacturing Dive", "feedUrl": "https://www.manufacturingdive.com/feeds/news/", "website": "manufacturingdive.com", + "language": "en", "backfill": true }, { @@ -185,6 +210,7 @@ "label": "Banking Dive", "feedUrl": "https://www.bankingdive.com/feeds/news/", "website": "bankingdive.com", + "language": "en", "backfill": true }, { @@ -192,6 +218,7 @@ "label": "Financial Post CA", "feedUrl": "https://financialpost.com/feed", "website": "financialpost.com", + "language": "en", "backfill": true }, { @@ -199,6 +226,7 @@ "label": "Globe and Mail", "feedUrl": 
"https://www.theglobeandmail.com/arc/outboundfeeds/rss/category/business/", "website": "theglobeandmail.com", + "language": "en", "backfill": true }, { @@ -206,6 +234,7 @@ "label": "Guardian Business", "feedUrl": "https://www.theguardian.com/uk/business/rss", "website": "theguardian.com", + "language": "en", "backfill": true }, { @@ -213,6 +242,7 @@ "label": "Sky News Business", "feedUrl": "https://feeds.skynews.com/feeds/rss/business.xml", "website": "skynews.com", + "language": "en", "backfill": true }, { @@ -220,6 +250,7 @@ "label": "This Is Money", "feedUrl": "[FAILED] https://www.thisismoney.co.uk/money/news/index.rss", "website": "thisismoney.co.uk", + "language": "en", "backfill": true }, { @@ -227,6 +258,7 @@ "label": "City A.M.", "feedUrl": "https://www.cityam.com/feed/", "website": "cityam.com", + "language": "en", "backfill": true }, { @@ -234,6 +266,7 @@ "label": "Spiegel Wirtschaft", "feedUrl": "https://www.spiegel.de/wirtschaft/index.rss", "website": "spiegel.de", + "language": "de", "backfill": true }, { @@ -241,6 +274,7 @@ "label": "Handelsblatt", "feedUrl": "https://www.handelsblatt.com/contentexport/feed/schlagzeilen", "website": "handelsblatt.com", + "language": "de", "backfill": true }, { @@ -248,6 +282,7 @@ "label": "FAZ Wirtschaft", "feedUrl": "https://www.faz.net/rss/aktuell/wirtschaft/", "website": "faz.net", + "language": "de", "backfill": true }, { @@ -255,6 +290,7 @@ "label": "Die Welt Wirtschaft", "feedUrl": "https://www.welt.de/feeds/section/wirtschaft.rss", "website": "welt.de", + "language": "de", "backfill": true }, { @@ -262,6 +298,7 @@ "label": "Les Echos", "feedUrl": "[FAILED] https://feeds.lesechos.fr/rss/rss_la_une.xml", "website": "lesechos.fr", + "language": "fr", "backfill": true }, { @@ -269,6 +306,7 @@ "label": "Le Monde Economie", "feedUrl": "https://www.lemonde.fr/economie/rss_full.xml", "website": "lemonde.fr", + "language": "fr", "backfill": true }, { @@ -276,6 +314,7 @@ "label": "BFM Business", "feedUrl": "[FAILED] 
https://bfmbusiness.bfmtv.com/rss/news-flux-rss/", "website": "bfmbusiness.bfmtv.com", + "language": "fr", "backfill": true }, { @@ -283,6 +322,7 @@ "label": "El Economista ES", "feedUrl": "[FAILED] https://www.eleconomista.es/rss/rss-de-portada.php", "website": "eleconomista.es", + "language": "es", "backfill": true }, { @@ -290,6 +330,7 @@ "label": "Expansion ES", "feedUrl": "https://e00-expansion.uecdn.es/rss/portada.xml", "website": "expansion.com", + "language": "es", "backfill": true }, { @@ -297,6 +338,7 @@ "label": "Cinco Dias", "feedUrl": "[FAILED] https://cincodias.elpais.com/rss/cincodias/ultima_hora_mercados.xml", "website": "cincodias.elpais.com", + "language": "es", "backfill": true }, { @@ -304,6 +346,7 @@ "label": "Il Sole 24 Ore", "feedUrl": "[FAILED] https://www.ilsole24ore.com/rss/economia--finanza.xml", "website": "ilsole24ore.com", + "language": "it", "backfill": true }, { @@ -311,6 +354,7 @@ "label": "FD.nl", "feedUrl": "[FAILED] https://fd.nl/rss", "website": "fd.nl", + "language": "nl", "backfill": true }, { @@ -318,6 +362,7 @@ "label": "NZZ Wirtschaft", "feedUrl": "https://www.nzz.ch/wirtschaft.rss", "website": "nzz.ch", + "language": "de", "backfill": true }, { @@ -325,6 +370,7 @@ "label": "Moscow Times", "feedUrl": "https://www.themoscowtimes.com/rss/news", "website": "themoscowtimes.com", + "language": "en", "backfill": true }, { @@ -332,6 +378,7 @@ "label": "RBC Russia", "feedUrl": "https://rssexport.rbc.ru/rbcnews/news/30/full.rss", "website": "rbc.ru", + "language": "ru", "backfill": true }, { @@ -339,6 +386,7 @@ "label": "Economic Times India", "feedUrl": "https://economictimes.indiatimes.com/rssfeedstopstories.cms", "website": "economictimes.indiatimes.com", + "language": "en", "backfill": true }, { @@ -346,6 +394,7 @@ "label": "Business Standard IN", "feedUrl": "https://www.business-standard.com/rss/home_page_top_stories.rss", "website": "business-standard.com", + "language": "en", "backfill": true }, { @@ -353,6 +402,7 @@ "label": 
"Live Mint", "feedUrl": "[FAILED] https://www.livemint.com/rss/headlines", "website": "livemint.com", + "language": "en", "backfill": true }, { @@ -360,6 +410,7 @@ "label": "Moneycontrol", "feedUrl": "https://www.moneycontrol.com/rss/MCtopnews.xml", "website": "moneycontrol.com", + "language": "en", "backfill": true }, { @@ -367,6 +418,7 @@ "label": "Hindu Business Line", "feedUrl": "https://www.thehindubusinessline.com/feeder/default.rss", "website": "thehindubusinessline.com", + "language": "en", "backfill": true }, { @@ -374,6 +426,7 @@ "label": "Caixin Global", "feedUrl": "[FAILED] https://www.caixinglobal.com/rss/newsfeeds/", "website": "caixinglobal.com", + "language": "en", "backfill": true }, { @@ -381,6 +434,7 @@ "label": "China Daily Business", "feedUrl": "https://www.chinadaily.com.cn/rss/bizchina_rss.xml", "website": "chinadaily.com.cn", + "language": "en", "backfill": true }, { @@ -388,6 +442,7 @@ "label": "Xinhua Business", "feedUrl": "[FAILED] https://english.news.cn/rss/business.xml", "website": "news.cn", + "language": "en", "backfill": true }, { @@ -395,6 +450,7 @@ "label": "South China Morning Post", "feedUrl": "https://www.scmp.com/rss/91/feed", "website": "scmp.com", + "language": "en", "backfill": true }, { @@ -402,6 +458,7 @@ "label": "Nikkei Asia", "feedUrl": "https://asia.nikkei.com/rss/feed/nar", "website": "asia.nikkei.com", + "language": "en", "backfill": true }, { @@ -409,6 +466,7 @@ "label": "Japan Times Business", "feedUrl": "[FAILED] https://www.japantimes.co.jp/feed/business/", "website": "japantimes.co.jp", + "language": "en", "backfill": true }, { @@ -416,6 +474,7 @@ "label": "Korea Herald", "feedUrl": "https://www.koreaherald.com/rss/010000000000.xml", "website": "koreaherald.com", + "language": "en", "backfill": true }, { @@ -423,6 +482,7 @@ "label": "Korea JoongAng Daily", "feedUrl": "[FAILED] https://koreajoongangdaily.joins.com/rss/", "website": "koreajoongangdaily.joins.com", + "language": "en", "backfill": true }, { @@ 
-430,6 +490,7 @@ "label": "Business Times SG", "feedUrl": "https://www.businesstimes.com.sg/rss.xml", "website": "businesstimes.com.sg", + "language": "en", "backfill": true }, { @@ -437,6 +498,7 @@ "label": "Straits Times Business", "feedUrl": "https://www.straitstimes.com/news/business/rss.xml", "website": "straitstimes.com", + "language": "en", "backfill": true }, { @@ -444,6 +506,7 @@ "label": "Channel NewsAsia", "feedUrl": "https://www.channelnewsasia.com/rssfeeds/8395986", "website": "channelnewsasia.com", + "language": "en", "backfill": true }, { @@ -451,6 +514,7 @@ "label": "Bangkok Post Business", "feedUrl": "https://www.bangkokpost.com/rss/data/business.xml", "website": "bangkokpost.com", + "language": "en", "backfill": true }, { @@ -458,6 +522,7 @@ "label": "The Star Malaysia", "feedUrl": "[FAILED] https://www.thestar.com.my/rss/Business/Business-News", "website": "thestar.com.my", + "language": "en", "backfill": true }, { @@ -465,6 +530,7 @@ "label": "Australian Fin Review", "feedUrl": "[FAILED] https://www.afr.com/rss", "website": "afr.com", + "language": "en", "backfill": true }, { @@ -472,6 +538,7 @@ "label": "ABC Business AU", "feedUrl": "[FAILED] https://www.abc.net.au/news/feed/52278/rss.xml", "website": "abc.net.au", + "language": "en", "backfill": true }, { @@ -479,6 +546,7 @@ "label": "NZ Herald Business", "feedUrl": "https://www.nzherald.co.nz/arc/outboundfeeds/rss/section/business/", "website": "nzherald.co.nz", + "language": "en", "backfill": true }, { @@ -486,6 +554,7 @@ "label": "Arabian Business", "feedUrl": "[FAILED] https://www.arabianbusiness.com/rss.xml", "website": "arabianbusiness.com", + "language": "en", "backfill": true }, { @@ -493,6 +562,7 @@ "label": "Gulf News Business", "feedUrl": "[FAILED] https://gulfnews.com/rss/business", "website": "gulfnews.com", + "language": "en", "backfill": true }, { @@ -500,6 +570,7 @@ "label": "Arab News", "feedUrl": "[FAILED] https://www.arabnews.com/rss/front_page.xml", "website": 
"arabnews.com", + "language": "en", "backfill": true }, { @@ -507,6 +578,7 @@ "label": "The National UAE", "feedUrl": "https://www.thenationalnews.com/arc/outboundfeeds/rss/?outputType=xml", "website": "thenationalnews.com", + "language": "en", "backfill": true }, { @@ -514,6 +586,7 @@ "label": "BusinessDay Nigeria", "feedUrl": "https://businessday.ng/feed/", "website": "businessday.ng", + "language": "en", "backfill": true }, { @@ -521,6 +594,7 @@ "label": "Moneyweb SA", "feedUrl": "https://www.moneyweb.co.za/feed/", "website": "moneyweb.co.za", + "language": "en", "backfill": true }, { @@ -528,6 +602,7 @@ "label": "BusinessLive SA", "feedUrl": "[FAILED] https://www.businesslive.co.za/rss/bd/", "website": "businesslive.co.za", + "language": "en", "backfill": true }, { @@ -535,6 +610,7 @@ "label": "Business Daily Africa", "feedUrl": "[FAILED] https://www.businessdailyafrica.com/rss/", "website": "businessdailyafrica.com", + "language": "en", "backfill": true }, { @@ -542,6 +618,7 @@ "label": "Vanguard Business NG", "feedUrl": "https://www.vanguardngr.com/category/business/feed/", "website": "vanguardngr.com", + "language": "en", "backfill": true }, { @@ -549,6 +626,7 @@ "label": "Folha Mercado BR", "feedUrl": "https://feeds.folha.uol.com.br/mercado/rss091.xml", "website": "folha.uol.com.br", + "language": "pt", "backfill": true }, { @@ -556,6 +634,7 @@ "label": "G1 Economia BR", "feedUrl": "https://g1.globo.com/dynamo/economia/rss2.xml", "website": "g1.globo.com", + "language": "pt", "backfill": true }, { @@ -563,6 +642,7 @@ "label": "Exame BR", "feedUrl": "https://exame.com/feed/", "website": "exame.com", + "language": "pt", "backfill": true }, { @@ -570,6 +650,7 @@ "label": "El Economista MX", "feedUrl": "[FAILED] https://www.eleconomista.com.mx/rss/rss.html", "website": "eleconomista.com.mx", + "language": "es", "backfill": true }, { @@ -577,6 +658,7 @@ "label": "Expansion MX", "feedUrl": "https://expansion.mx/rss", "website": "expansion.mx", + "language": "es", 
"backfill": true }, { @@ -584,6 +666,7 @@ "label": "La Nacion AR", "feedUrl": "https://www.lanacion.com.ar/arc/outboundfeeds/rss/category/economia/", "website": "lanacion.com.ar", + "language": "es", "backfill": true }, { @@ -591,6 +674,7 @@ "label": "Infobae Economia AR", "feedUrl": "[FAILED] https://www.infobae.com/feeds/rss/economia/", "website": "infobae.com", + "language": "es", "backfill": true }, { @@ -598,6 +682,7 @@ "label": "Portafolio Colombia", "feedUrl": "[FAILED] https://www.portafolio.co/rss/portafolio.xml", "website": "portafolio.co", + "language": "es", "backfill": true }, { @@ -605,6 +690,7 @@ "label": "El Comercio Peru", "feedUrl": "[FAILED] https://elcomercio.pe/arc/outboundfeeds/rss/section/economia/", "website": "elcomercio.pe", + "language": "es", "backfill": true }, { @@ -615,6 +701,7 @@ "jamaica-gleaner.com", "jamaicagleaner.com" ], + "language": "en", "backfill": true }, { @@ -622,6 +709,7 @@ "label": "Jamaica Observer", "feedUrl": "https://www.jamaicaobserver.com/app/business/", "website": "jamaicaobserver.com", + "language": "en", "backfill": true }, { @@ -629,6 +717,7 @@ "label": "Stabroek News", "feedUrl": "[FAILED] https://www.stabroeknews.com/feed/", "website": "stabroeknews.com", + "language": "en", "backfill": true }, { @@ -636,6 +725,7 @@ "label": "Nation News Barbados", "feedUrl": "[FAILED] https://nationnews.com/rss-feed/", "website": "nationnews.com", + "language": "en", "backfill": true }, { @@ -643,6 +733,7 @@ "label": "Google News", "feedUrl": "https://news.google.com/rss?hl=en-GB&gl=GB&ceid=GB:en", "website": "news.google.com", + "language": "en", "backfill": false } ] diff --git a/src/content.js b/src/content.js index c1c8f46..4ff14b2 100644 --- a/src/content.js +++ b/src/content.js @@ -65,8 +65,8 @@ const markContentPending = db.prepare(` const selectPartitionedArticlesMissingContent = db.prepare(` SELECT id, url, title, description FROM ( - SELECT id, url, title, description, source, - ROW_NUMBER() OVER (PARTITION BY 
source ORDER BY ingested_at DESC, id DESC) AS rn + SELECT id, url, title, description, source, pub_date_effective, + ROW_NUMBER() OVER (PARTITION BY source ORDER BY pub_date_effective DESC, id DESC) AS rn FROM articles WHERE (content IS NULL OR TRIM(content) = '') AND (content_status IS NULL OR content_status = 'pending') @@ -74,7 +74,7 @@ const selectPartitionedArticlesMissingContent = db.prepare(` AND (id % ?) = ? ) WHERE rn <= ? - ORDER BY rn, source + ORDER BY pub_date_effective DESC, rn, source `); const selectAttemptCount = db.prepare(` diff --git a/src/db.js b/src/db.js index 8518521..95c8f11 100644 --- a/src/db.js +++ b/src/db.js @@ -288,7 +288,8 @@ for (const statement of [ 'ALTER TABLE articles ADD COLUMN content_retry_after TEXT', 'ALTER TABLE articles ADD COLUMN is_index_page INTEGER NOT NULL DEFAULT 0', 'ALTER TABLE articles ADD COLUMN has_embedding INTEGER NOT NULL DEFAULT 0', - 'ALTER TABLE articles ADD COLUMN pub_date_effective TEXT' + 'ALTER TABLE articles ADD COLUMN pub_date_effective TEXT', + 'ALTER TABLE articles ADD COLUMN language TEXT' ]) { try { db.exec(statement); @@ -312,6 +313,20 @@ db.exec(` WHERE pub_date_effective IS NULL `); +// backfill language from sources.json for existing rows +{ + const sources = require('../sources.json'); + const updateLang = db.prepare(`UPDATE articles SET language = ? WHERE source = ? 
AND language IS NULL`); + const backfillLang = db.transaction(() => { + for (const src of sources) { + if (src.language) { + updateLang.run(src.language, src.id); + } + } + }); + backfillLang(); +} + db.exec(` CREATE INDEX IF NOT EXISTS idx_articles_has_embedding ON articles(has_embedding); CREATE INDEX IF NOT EXISTS idx_articles_pub_date_effective ON articles(pub_date_effective DESC); diff --git a/src/embeddings.js b/src/embeddings.js index 93cd535..2fc7fb6 100644 --- a/src/embeddings.js +++ b/src/embeddings.js @@ -73,7 +73,7 @@ const selectArticlesMissingEmbeddings = db.prepare(` SELECT 1 FROM article_embedding_store s WHERE s.article_id = a.id AND s.model = ? ) - ORDER BY a.ingested_at ASC, a.id ASC + ORDER BY a.pub_date_effective DESC, a.id DESC LIMIT ? `); diff --git a/src/ingest.js b/src/ingest.js index e0b876f..1d63007 100644 --- a/src/ingest.js +++ b/src/ingest.js @@ -2,6 +2,10 @@ const db = require('./db'); const { normalizeTitle } = require('./dedup'); const { markSourceRun } = require('./state'); +const sourcesById = Object.fromEntries( + require('../sources.json').map((s) => [s.id, s]) +); + const insertArticle = db.prepare(` INSERT INTO articles ( title, @@ -13,8 +17,9 @@ const insertArticle = db.prepare(` source, pub_date, ingested_at, - pub_date_effective - ) VALUES (?, ?, NULL, ?, ?, ?, ?, ?, ?, ?) + pub_date_effective, + language + ) VALUES (?, ?, NULL, ?, ?, ?, ?, ?, ?, ?, ?) 
`); const findByUrl = db.prepare('SELECT id FROM articles WHERE url = ?'); const INDEX_PAGE_URL_HINT = /\/(category|categories|tag|tags|topic|topics|section|sections|archive|archives|authors|search)(?:\/|$)/i; @@ -81,6 +86,7 @@ function ingestArticle(article) { const isIndexPage = inferIsIndexPage(article, title, url); const pubDate = normalizePubDate(article.pubDate); const ingestedAt = new Date().toISOString(); + const language = (sourcesById[source] && sourcesById[source].language) || null; try { const result = insertArticle.run( @@ -92,7 +98,8 @@ function ingestArticle(article) { source, pubDate, ingestedAt, - pubDate || ingestedAt + pubDate || ingestedAt, + language ); // dont kick off the content fetch here — it used to be fire-and-forget which