enhance article processing by adding language support and adjusting embedding parameters

This commit is contained in:
ImBenji 2026-04-20 03:41:10 +01:00
parent 37d9dfb083
commit 8805d3a3fc
7 changed files with 129 additions and 9 deletions

7
CLAUDE.md Normal file
View file

@ -0,0 +1,7 @@
# Database Policy
When making any changes to the database schema or data, a strict no-data-loss policy must be followed. This means:
- Never DROP columns, tables, or indexes that contain data without first migrating that data elsewhere
- All schema changes must be additive or safe migrations (e.g. ADD COLUMN, rename via copy+verify+drop)
- Always back up or verify row counts before and after any bulk UPDATE or DELETE
- Destructive operations require explicit user confirmation before executing

View file

@ -51,7 +51,7 @@
}, },
"embeddingBackfill": { "embeddingBackfill": {
"perRound": 256, "perRound": 256,
"batchSize": 16 "batchSize": 128
}, },
"browser": { "browser": {
"maxConcurrentPages": 8 "maxConcurrentPages": 8

View file

@ -4,6 +4,7 @@
"label": "Al Jazeera", "label": "Al Jazeera",
"feedUrl": "https://www.aljazeera.com/xml/rss/all.xml", "feedUrl": "https://www.aljazeera.com/xml/rss/all.xml",
"website": "aljazeera.com", "website": "aljazeera.com",
"language": "en",
"backfill": true "backfill": true
}, },
{ {
@ -14,6 +15,7 @@
"bbc.com", "bbc.com",
"bbc.co.uk" "bbc.co.uk"
], ],
"language": "en",
"backfill": true "backfill": true
}, },
{ {
@ -21,6 +23,7 @@
"label": "Business Insider", "label": "Business Insider",
"feedUrl": "https://feeds.businessinsider.com/custom/all", "feedUrl": "https://feeds.businessinsider.com/custom/all",
"website": "businessinsider.com", "website": "businessinsider.com",
"language": "en",
"backfill": true "backfill": true
}, },
{ {
@ -28,6 +31,7 @@
"label": "Bloomberg Markets", "label": "Bloomberg Markets",
"feedUrl": "https://feeds.bloomberg.com/markets/news.rss", "feedUrl": "https://feeds.bloomberg.com/markets/news.rss",
"website": "bloomberg.com", "website": "bloomberg.com",
"language": "en",
"backfill": true "backfill": true
}, },
{ {
@ -35,6 +39,7 @@
"label": "CNBC", "label": "CNBC",
"feedUrl": "https://www.cnbc.com/id/100003114/device/rss/rss.html", "feedUrl": "https://www.cnbc.com/id/100003114/device/rss/rss.html",
"website": "cnbc.com", "website": "cnbc.com",
"language": "en",
"backfill": true "backfill": true
}, },
{ {
@ -42,6 +47,7 @@
"label": "Wall Street Journal", "label": "Wall Street Journal",
"feedUrl": "https://feeds.a.dj.com/rss/RSSMarketsMain.xml", "feedUrl": "https://feeds.a.dj.com/rss/RSSMarketsMain.xml",
"website": "wsj.com", "website": "wsj.com",
"language": "en",
"backfill": true "backfill": true
}, },
{ {
@ -49,6 +55,7 @@
"label": "MarketWatch", "label": "MarketWatch",
"feedUrl": "https://feeds.marketwatch.com/marketwatch/topstories/", "feedUrl": "https://feeds.marketwatch.com/marketwatch/topstories/",
"website": "marketwatch.com", "website": "marketwatch.com",
"language": "en",
"backfill": true "backfill": true
}, },
{ {
@ -59,6 +66,7 @@
"finance.yahoo.com", "finance.yahoo.com",
"yahoo.com" "yahoo.com"
], ],
"language": "en",
"backfill": true "backfill": true
}, },
{ {
@ -66,6 +74,7 @@
"label": "Seeking Alpha", "label": "Seeking Alpha",
"feedUrl": "https://seekingalpha.com/feed.xml", "feedUrl": "https://seekingalpha.com/feed.xml",
"website": "seekingalpha.com", "website": "seekingalpha.com",
"language": "en",
"backfill": true "backfill": true
}, },
{ {
@ -73,6 +82,7 @@
"label": "Financial Times", "label": "Financial Times",
"feedUrl": "https://www.ft.com/?format=rss", "feedUrl": "https://www.ft.com/?format=rss",
"website": "ft.com", "website": "ft.com",
"language": "en",
"backfill": true "backfill": true
}, },
{ {
@ -80,6 +90,7 @@
"label": "The Economist", "label": "The Economist",
"feedUrl": "https://www.economist.com/finance-and-economics/rss.xml", "feedUrl": "https://www.economist.com/finance-and-economics/rss.xml",
"website": "economist.com", "website": "economist.com",
"language": "en",
"backfill": true "backfill": true
}, },
{ {
@ -87,6 +98,7 @@
"label": "Fortune", "label": "Fortune",
"feedUrl": "https://fortune.com/feed", "feedUrl": "https://fortune.com/feed",
"website": "fortune.com", "website": "fortune.com",
"language": "en",
"backfill": true "backfill": true
}, },
{ {
@ -94,6 +106,7 @@
"label": "Forbes Business", "label": "Forbes Business",
"feedUrl": "https://www.forbes.com/business/feed/", "feedUrl": "https://www.forbes.com/business/feed/",
"website": "forbes.com", "website": "forbes.com",
"language": "en",
"backfill": true "backfill": true
}, },
{ {
@ -101,6 +114,7 @@
"label": "Inc Magazine", "label": "Inc Magazine",
"feedUrl": "https://www.inc.com/rss", "feedUrl": "https://www.inc.com/rss",
"website": "inc.com", "website": "inc.com",
"language": "en",
"backfill": true "backfill": true
}, },
{ {
@ -108,6 +122,7 @@
"label": "Fast Company", "label": "Fast Company",
"feedUrl": "https://www.fastcompany.com/latest/rss", "feedUrl": "https://www.fastcompany.com/latest/rss",
"website": "fastcompany.com", "website": "fastcompany.com",
"language": "en",
"backfill": true "backfill": true
}, },
{ {
@ -115,6 +130,7 @@
"label": "Entrepreneur", "label": "Entrepreneur",
"feedUrl": "https://www.entrepreneur.com/latest.rss", "feedUrl": "https://www.entrepreneur.com/latest.rss",
"website": "entrepreneur.com", "website": "entrepreneur.com",
"language": "en",
"backfill": true "backfill": true
}, },
{ {
@ -122,6 +138,7 @@
"label": "Axios", "label": "Axios",
"feedUrl": "https://api.axios.com/feed/", "feedUrl": "https://api.axios.com/feed/",
"website": "axios.com", "website": "axios.com",
"language": "en",
"backfill": true "backfill": true
}, },
{ {
@ -129,6 +146,7 @@
"label": "Wired Business", "label": "Wired Business",
"feedUrl": "https://www.wired.com/feed/category/business/latest/rss", "feedUrl": "https://www.wired.com/feed/category/business/latest/rss",
"website": "wired.com", "website": "wired.com",
"language": "en",
"backfill": true "backfill": true
}, },
{ {
@ -136,6 +154,7 @@
"label": "NPR Business", "label": "NPR Business",
"feedUrl": "https://feeds.npr.org/1006/rss.xml", "feedUrl": "https://feeds.npr.org/1006/rss.xml",
"website": "npr.org", "website": "npr.org",
"language": "en",
"backfill": true "backfill": true
}, },
{ {
@ -143,6 +162,7 @@
"label": "Federal Reserve", "label": "Federal Reserve",
"feedUrl": "https://www.federalreserve.gov/feeds/press_all.xml", "feedUrl": "https://www.federalreserve.gov/feeds/press_all.xml",
"website": "federalreserve.gov", "website": "federalreserve.gov",
"language": "en",
"backfill": true "backfill": true
}, },
{ {
@ -150,6 +170,7 @@
"label": "TechCrunch", "label": "TechCrunch",
"feedUrl": "https://techcrunch.com/feed/", "feedUrl": "https://techcrunch.com/feed/",
"website": "techcrunch.com", "website": "techcrunch.com",
"language": "en",
"backfill": true "backfill": true
}, },
{ {
@ -157,6 +178,7 @@
"label": "The Verge", "label": "The Verge",
"feedUrl": "https://www.theverge.com/rss/index.xml", "feedUrl": "https://www.theverge.com/rss/index.xml",
"website": "theverge.com", "website": "theverge.com",
"language": "en",
"backfill": true "backfill": true
}, },
{ {
@ -164,6 +186,7 @@
"label": "Ars Technica", "label": "Ars Technica",
"feedUrl": "https://feeds.arstechnica.com/arstechnica/index", "feedUrl": "https://feeds.arstechnica.com/arstechnica/index",
"website": "arstechnica.com", "website": "arstechnica.com",
"language": "en",
"backfill": true "backfill": true
}, },
{ {
@ -171,6 +194,7 @@
"label": "Retail Dive", "label": "Retail Dive",
"feedUrl": "https://www.retaildive.com/feeds/news/", "feedUrl": "https://www.retaildive.com/feeds/news/",
"website": "retaildive.com", "website": "retaildive.com",
"language": "en",
"backfill": true "backfill": true
}, },
{ {
@ -178,6 +202,7 @@
"label": "Manufacturing Dive", "label": "Manufacturing Dive",
"feedUrl": "https://www.manufacturingdive.com/feeds/news/", "feedUrl": "https://www.manufacturingdive.com/feeds/news/",
"website": "manufacturingdive.com", "website": "manufacturingdive.com",
"language": "en",
"backfill": true "backfill": true
}, },
{ {
@ -185,6 +210,7 @@
"label": "Banking Dive", "label": "Banking Dive",
"feedUrl": "https://www.bankingdive.com/feeds/news/", "feedUrl": "https://www.bankingdive.com/feeds/news/",
"website": "bankingdive.com", "website": "bankingdive.com",
"language": "en",
"backfill": true "backfill": true
}, },
{ {
@ -192,6 +218,7 @@
"label": "Financial Post CA", "label": "Financial Post CA",
"feedUrl": "https://financialpost.com/feed", "feedUrl": "https://financialpost.com/feed",
"website": "financialpost.com", "website": "financialpost.com",
"language": "en",
"backfill": true "backfill": true
}, },
{ {
@ -199,6 +226,7 @@
"label": "Globe and Mail", "label": "Globe and Mail",
"feedUrl": "https://www.theglobeandmail.com/arc/outboundfeeds/rss/category/business/", "feedUrl": "https://www.theglobeandmail.com/arc/outboundfeeds/rss/category/business/",
"website": "theglobeandmail.com", "website": "theglobeandmail.com",
"language": "en",
"backfill": true "backfill": true
}, },
{ {
@ -206,6 +234,7 @@
"label": "Guardian Business", "label": "Guardian Business",
"feedUrl": "https://www.theguardian.com/uk/business/rss", "feedUrl": "https://www.theguardian.com/uk/business/rss",
"website": "theguardian.com", "website": "theguardian.com",
"language": "en",
"backfill": true "backfill": true
}, },
{ {
@ -213,6 +242,7 @@
"label": "Sky News Business", "label": "Sky News Business",
"feedUrl": "https://feeds.skynews.com/feeds/rss/business.xml", "feedUrl": "https://feeds.skynews.com/feeds/rss/business.xml",
"website": "skynews.com", "website": "skynews.com",
"language": "en",
"backfill": true "backfill": true
}, },
{ {
@ -220,6 +250,7 @@
"label": "This Is Money", "label": "This Is Money",
"feedUrl": "[FAILED] https://www.thisismoney.co.uk/money/news/index.rss", "feedUrl": "[FAILED] https://www.thisismoney.co.uk/money/news/index.rss",
"website": "thisismoney.co.uk", "website": "thisismoney.co.uk",
"language": "en",
"backfill": true "backfill": true
}, },
{ {
@ -227,6 +258,7 @@
"label": "City A.M.", "label": "City A.M.",
"feedUrl": "https://www.cityam.com/feed/", "feedUrl": "https://www.cityam.com/feed/",
"website": "cityam.com", "website": "cityam.com",
"language": "en",
"backfill": true "backfill": true
}, },
{ {
@ -234,6 +266,7 @@
"label": "Spiegel Wirtschaft", "label": "Spiegel Wirtschaft",
"feedUrl": "https://www.spiegel.de/wirtschaft/index.rss", "feedUrl": "https://www.spiegel.de/wirtschaft/index.rss",
"website": "spiegel.de", "website": "spiegel.de",
"language": "de",
"backfill": true "backfill": true
}, },
{ {
@ -241,6 +274,7 @@
"label": "Handelsblatt", "label": "Handelsblatt",
"feedUrl": "https://www.handelsblatt.com/contentexport/feed/schlagzeilen", "feedUrl": "https://www.handelsblatt.com/contentexport/feed/schlagzeilen",
"website": "handelsblatt.com", "website": "handelsblatt.com",
"language": "de",
"backfill": true "backfill": true
}, },
{ {
@ -248,6 +282,7 @@
"label": "FAZ Wirtschaft", "label": "FAZ Wirtschaft",
"feedUrl": "https://www.faz.net/rss/aktuell/wirtschaft/", "feedUrl": "https://www.faz.net/rss/aktuell/wirtschaft/",
"website": "faz.net", "website": "faz.net",
"language": "de",
"backfill": true "backfill": true
}, },
{ {
@ -255,6 +290,7 @@
"label": "Die Welt Wirtschaft", "label": "Die Welt Wirtschaft",
"feedUrl": "https://www.welt.de/feeds/section/wirtschaft.rss", "feedUrl": "https://www.welt.de/feeds/section/wirtschaft.rss",
"website": "welt.de", "website": "welt.de",
"language": "de",
"backfill": true "backfill": true
}, },
{ {
@ -262,6 +298,7 @@
"label": "Les Echos", "label": "Les Echos",
"feedUrl": "[FAILED] https://feeds.lesechos.fr/rss/rss_la_une.xml", "feedUrl": "[FAILED] https://feeds.lesechos.fr/rss/rss_la_une.xml",
"website": "lesechos.fr", "website": "lesechos.fr",
"language": "fr",
"backfill": true "backfill": true
}, },
{ {
@ -269,6 +306,7 @@
"label": "Le Monde Economie", "label": "Le Monde Economie",
"feedUrl": "https://www.lemonde.fr/economie/rss_full.xml", "feedUrl": "https://www.lemonde.fr/economie/rss_full.xml",
"website": "lemonde.fr", "website": "lemonde.fr",
"language": "fr",
"backfill": true "backfill": true
}, },
{ {
@ -276,6 +314,7 @@
"label": "BFM Business", "label": "BFM Business",
"feedUrl": "[FAILED] https://bfmbusiness.bfmtv.com/rss/news-flux-rss/", "feedUrl": "[FAILED] https://bfmbusiness.bfmtv.com/rss/news-flux-rss/",
"website": "bfmbusiness.bfmtv.com", "website": "bfmbusiness.bfmtv.com",
"language": "fr",
"backfill": true "backfill": true
}, },
{ {
@ -283,6 +322,7 @@
"label": "El Economista ES", "label": "El Economista ES",
"feedUrl": "[FAILED] https://www.eleconomista.es/rss/rss-de-portada.php", "feedUrl": "[FAILED] https://www.eleconomista.es/rss/rss-de-portada.php",
"website": "eleconomista.es", "website": "eleconomista.es",
"language": "es",
"backfill": true "backfill": true
}, },
{ {
@ -290,6 +330,7 @@
"label": "Expansion ES", "label": "Expansion ES",
"feedUrl": "https://e00-expansion.uecdn.es/rss/portada.xml", "feedUrl": "https://e00-expansion.uecdn.es/rss/portada.xml",
"website": "expansion.com", "website": "expansion.com",
"language": "es",
"backfill": true "backfill": true
}, },
{ {
@ -297,6 +338,7 @@
"label": "Cinco Dias", "label": "Cinco Dias",
"feedUrl": "[FAILED] https://cincodias.elpais.com/rss/cincodias/ultima_hora_mercados.xml", "feedUrl": "[FAILED] https://cincodias.elpais.com/rss/cincodias/ultima_hora_mercados.xml",
"website": "cincodias.elpais.com", "website": "cincodias.elpais.com",
"language": "es",
"backfill": true "backfill": true
}, },
{ {
@ -304,6 +346,7 @@
"label": "Il Sole 24 Ore", "label": "Il Sole 24 Ore",
"feedUrl": "[FAILED] https://www.ilsole24ore.com/rss/economia--finanza.xml", "feedUrl": "[FAILED] https://www.ilsole24ore.com/rss/economia--finanza.xml",
"website": "ilsole24ore.com", "website": "ilsole24ore.com",
"language": "it",
"backfill": true "backfill": true
}, },
{ {
@ -311,6 +354,7 @@
"label": "FD.nl", "label": "FD.nl",
"feedUrl": "[FAILED] https://fd.nl/rss", "feedUrl": "[FAILED] https://fd.nl/rss",
"website": "fd.nl", "website": "fd.nl",
"language": "nl",
"backfill": true "backfill": true
}, },
{ {
@ -318,6 +362,7 @@
"label": "NZZ Wirtschaft", "label": "NZZ Wirtschaft",
"feedUrl": "https://www.nzz.ch/wirtschaft.rss", "feedUrl": "https://www.nzz.ch/wirtschaft.rss",
"website": "nzz.ch", "website": "nzz.ch",
"language": "de",
"backfill": true "backfill": true
}, },
{ {
@ -325,6 +370,7 @@
"label": "Moscow Times", "label": "Moscow Times",
"feedUrl": "https://www.themoscowtimes.com/rss/news", "feedUrl": "https://www.themoscowtimes.com/rss/news",
"website": "themoscowtimes.com", "website": "themoscowtimes.com",
"language": "en",
"backfill": true "backfill": true
}, },
{ {
@ -332,6 +378,7 @@
"label": "RBC Russia", "label": "RBC Russia",
"feedUrl": "https://rssexport.rbc.ru/rbcnews/news/30/full.rss", "feedUrl": "https://rssexport.rbc.ru/rbcnews/news/30/full.rss",
"website": "rbc.ru", "website": "rbc.ru",
"language": "ru",
"backfill": true "backfill": true
}, },
{ {
@ -339,6 +386,7 @@
"label": "Economic Times India", "label": "Economic Times India",
"feedUrl": "https://economictimes.indiatimes.com/rssfeedstopstories.cms", "feedUrl": "https://economictimes.indiatimes.com/rssfeedstopstories.cms",
"website": "economictimes.indiatimes.com", "website": "economictimes.indiatimes.com",
"language": "en",
"backfill": true "backfill": true
}, },
{ {
@ -346,6 +394,7 @@
"label": "Business Standard IN", "label": "Business Standard IN",
"feedUrl": "https://www.business-standard.com/rss/home_page_top_stories.rss", "feedUrl": "https://www.business-standard.com/rss/home_page_top_stories.rss",
"website": "business-standard.com", "website": "business-standard.com",
"language": "en",
"backfill": true "backfill": true
}, },
{ {
@ -353,6 +402,7 @@
"label": "Live Mint", "label": "Live Mint",
"feedUrl": "[FAILED] https://www.livemint.com/rss/headlines", "feedUrl": "[FAILED] https://www.livemint.com/rss/headlines",
"website": "livemint.com", "website": "livemint.com",
"language": "en",
"backfill": true "backfill": true
}, },
{ {
@ -360,6 +410,7 @@
"label": "Moneycontrol", "label": "Moneycontrol",
"feedUrl": "https://www.moneycontrol.com/rss/MCtopnews.xml", "feedUrl": "https://www.moneycontrol.com/rss/MCtopnews.xml",
"website": "moneycontrol.com", "website": "moneycontrol.com",
"language": "en",
"backfill": true "backfill": true
}, },
{ {
@ -367,6 +418,7 @@
"label": "Hindu Business Line", "label": "Hindu Business Line",
"feedUrl": "https://www.thehindubusinessline.com/feeder/default.rss", "feedUrl": "https://www.thehindubusinessline.com/feeder/default.rss",
"website": "thehindubusinessline.com", "website": "thehindubusinessline.com",
"language": "en",
"backfill": true "backfill": true
}, },
{ {
@ -374,6 +426,7 @@
"label": "Caixin Global", "label": "Caixin Global",
"feedUrl": "[FAILED] https://www.caixinglobal.com/rss/newsfeeds/", "feedUrl": "[FAILED] https://www.caixinglobal.com/rss/newsfeeds/",
"website": "caixinglobal.com", "website": "caixinglobal.com",
"language": "en",
"backfill": true "backfill": true
}, },
{ {
@ -381,6 +434,7 @@
"label": "China Daily Business", "label": "China Daily Business",
"feedUrl": "https://www.chinadaily.com.cn/rss/bizchina_rss.xml", "feedUrl": "https://www.chinadaily.com.cn/rss/bizchina_rss.xml",
"website": "chinadaily.com.cn", "website": "chinadaily.com.cn",
"language": "en",
"backfill": true "backfill": true
}, },
{ {
@ -388,6 +442,7 @@
"label": "Xinhua Business", "label": "Xinhua Business",
"feedUrl": "[FAILED] https://english.news.cn/rss/business.xml", "feedUrl": "[FAILED] https://english.news.cn/rss/business.xml",
"website": "news.cn", "website": "news.cn",
"language": "en",
"backfill": true "backfill": true
}, },
{ {
@ -395,6 +450,7 @@
"label": "South China Morning Post", "label": "South China Morning Post",
"feedUrl": "https://www.scmp.com/rss/91/feed", "feedUrl": "https://www.scmp.com/rss/91/feed",
"website": "scmp.com", "website": "scmp.com",
"language": "en",
"backfill": true "backfill": true
}, },
{ {
@ -402,6 +458,7 @@
"label": "Nikkei Asia", "label": "Nikkei Asia",
"feedUrl": "https://asia.nikkei.com/rss/feed/nar", "feedUrl": "https://asia.nikkei.com/rss/feed/nar",
"website": "asia.nikkei.com", "website": "asia.nikkei.com",
"language": "en",
"backfill": true "backfill": true
}, },
{ {
@ -409,6 +466,7 @@
"label": "Japan Times Business", "label": "Japan Times Business",
"feedUrl": "[FAILED] https://www.japantimes.co.jp/feed/business/", "feedUrl": "[FAILED] https://www.japantimes.co.jp/feed/business/",
"website": "japantimes.co.jp", "website": "japantimes.co.jp",
"language": "en",
"backfill": true "backfill": true
}, },
{ {
@ -416,6 +474,7 @@
"label": "Korea Herald", "label": "Korea Herald",
"feedUrl": "https://www.koreaherald.com/rss/010000000000.xml", "feedUrl": "https://www.koreaherald.com/rss/010000000000.xml",
"website": "koreaherald.com", "website": "koreaherald.com",
"language": "en",
"backfill": true "backfill": true
}, },
{ {
@ -423,6 +482,7 @@
"label": "Korea JoongAng Daily", "label": "Korea JoongAng Daily",
"feedUrl": "[FAILED] https://koreajoongangdaily.joins.com/rss/", "feedUrl": "[FAILED] https://koreajoongangdaily.joins.com/rss/",
"website": "koreajoongangdaily.joins.com", "website": "koreajoongangdaily.joins.com",
"language": "en",
"backfill": true "backfill": true
}, },
{ {
@ -430,6 +490,7 @@
"label": "Business Times SG", "label": "Business Times SG",
"feedUrl": "https://www.businesstimes.com.sg/rss.xml", "feedUrl": "https://www.businesstimes.com.sg/rss.xml",
"website": "businesstimes.com.sg", "website": "businesstimes.com.sg",
"language": "en",
"backfill": true "backfill": true
}, },
{ {
@ -437,6 +498,7 @@
"label": "Straits Times Business", "label": "Straits Times Business",
"feedUrl": "https://www.straitstimes.com/news/business/rss.xml", "feedUrl": "https://www.straitstimes.com/news/business/rss.xml",
"website": "straitstimes.com", "website": "straitstimes.com",
"language": "en",
"backfill": true "backfill": true
}, },
{ {
@ -444,6 +506,7 @@
"label": "Channel NewsAsia", "label": "Channel NewsAsia",
"feedUrl": "https://www.channelnewsasia.com/rssfeeds/8395986", "feedUrl": "https://www.channelnewsasia.com/rssfeeds/8395986",
"website": "channelnewsasia.com", "website": "channelnewsasia.com",
"language": "en",
"backfill": true "backfill": true
}, },
{ {
@ -451,6 +514,7 @@
"label": "Bangkok Post Business", "label": "Bangkok Post Business",
"feedUrl": "https://www.bangkokpost.com/rss/data/business.xml", "feedUrl": "https://www.bangkokpost.com/rss/data/business.xml",
"website": "bangkokpost.com", "website": "bangkokpost.com",
"language": "en",
"backfill": true "backfill": true
}, },
{ {
@ -458,6 +522,7 @@
"label": "The Star Malaysia", "label": "The Star Malaysia",
"feedUrl": "[FAILED] https://www.thestar.com.my/rss/Business/Business-News", "feedUrl": "[FAILED] https://www.thestar.com.my/rss/Business/Business-News",
"website": "thestar.com.my", "website": "thestar.com.my",
"language": "en",
"backfill": true "backfill": true
}, },
{ {
@ -465,6 +530,7 @@
"label": "Australian Fin Review", "label": "Australian Fin Review",
"feedUrl": "[FAILED] https://www.afr.com/rss", "feedUrl": "[FAILED] https://www.afr.com/rss",
"website": "afr.com", "website": "afr.com",
"language": "en",
"backfill": true "backfill": true
}, },
{ {
@ -472,6 +538,7 @@
"label": "ABC Business AU", "label": "ABC Business AU",
"feedUrl": "[FAILED] https://www.abc.net.au/news/feed/52278/rss.xml", "feedUrl": "[FAILED] https://www.abc.net.au/news/feed/52278/rss.xml",
"website": "abc.net.au", "website": "abc.net.au",
"language": "en",
"backfill": true "backfill": true
}, },
{ {
@ -479,6 +546,7 @@
"label": "NZ Herald Business", "label": "NZ Herald Business",
"feedUrl": "https://www.nzherald.co.nz/arc/outboundfeeds/rss/section/business/", "feedUrl": "https://www.nzherald.co.nz/arc/outboundfeeds/rss/section/business/",
"website": "nzherald.co.nz", "website": "nzherald.co.nz",
"language": "en",
"backfill": true "backfill": true
}, },
{ {
@ -486,6 +554,7 @@
"label": "Arabian Business", "label": "Arabian Business",
"feedUrl": "[FAILED] https://www.arabianbusiness.com/rss.xml", "feedUrl": "[FAILED] https://www.arabianbusiness.com/rss.xml",
"website": "arabianbusiness.com", "website": "arabianbusiness.com",
"language": "en",
"backfill": true "backfill": true
}, },
{ {
@ -493,6 +562,7 @@
"label": "Gulf News Business", "label": "Gulf News Business",
"feedUrl": "[FAILED] https://gulfnews.com/rss/business", "feedUrl": "[FAILED] https://gulfnews.com/rss/business",
"website": "gulfnews.com", "website": "gulfnews.com",
"language": "en",
"backfill": true "backfill": true
}, },
{ {
@ -500,6 +570,7 @@
"label": "Arab News", "label": "Arab News",
"feedUrl": "[FAILED] https://www.arabnews.com/rss/front_page.xml", "feedUrl": "[FAILED] https://www.arabnews.com/rss/front_page.xml",
"website": "arabnews.com", "website": "arabnews.com",
"language": "en",
"backfill": true "backfill": true
}, },
{ {
@ -507,6 +578,7 @@
"label": "The National UAE", "label": "The National UAE",
"feedUrl": "https://www.thenationalnews.com/arc/outboundfeeds/rss/?outputType=xml", "feedUrl": "https://www.thenationalnews.com/arc/outboundfeeds/rss/?outputType=xml",
"website": "thenationalnews.com", "website": "thenationalnews.com",
"language": "en",
"backfill": true "backfill": true
}, },
{ {
@ -514,6 +586,7 @@
"label": "BusinessDay Nigeria", "label": "BusinessDay Nigeria",
"feedUrl": "https://businessday.ng/feed/", "feedUrl": "https://businessday.ng/feed/",
"website": "businessday.ng", "website": "businessday.ng",
"language": "en",
"backfill": true "backfill": true
}, },
{ {
@ -521,6 +594,7 @@
"label": "Moneyweb SA", "label": "Moneyweb SA",
"feedUrl": "https://www.moneyweb.co.za/feed/", "feedUrl": "https://www.moneyweb.co.za/feed/",
"website": "moneyweb.co.za", "website": "moneyweb.co.za",
"language": "en",
"backfill": true "backfill": true
}, },
{ {
@ -528,6 +602,7 @@
"label": "BusinessLive SA", "label": "BusinessLive SA",
"feedUrl": "[FAILED] https://www.businesslive.co.za/rss/bd/", "feedUrl": "[FAILED] https://www.businesslive.co.za/rss/bd/",
"website": "businesslive.co.za", "website": "businesslive.co.za",
"language": "en",
"backfill": true "backfill": true
}, },
{ {
@ -535,6 +610,7 @@
"label": "Business Daily Africa", "label": "Business Daily Africa",
"feedUrl": "[FAILED] https://www.businessdailyafrica.com/rss/", "feedUrl": "[FAILED] https://www.businessdailyafrica.com/rss/",
"website": "businessdailyafrica.com", "website": "businessdailyafrica.com",
"language": "en",
"backfill": true "backfill": true
}, },
{ {
@ -542,6 +618,7 @@
"label": "Vanguard Business NG", "label": "Vanguard Business NG",
"feedUrl": "https://www.vanguardngr.com/category/business/feed/", "feedUrl": "https://www.vanguardngr.com/category/business/feed/",
"website": "vanguardngr.com", "website": "vanguardngr.com",
"language": "en",
"backfill": true "backfill": true
}, },
{ {
@ -549,6 +626,7 @@
"label": "Folha Mercado BR", "label": "Folha Mercado BR",
"feedUrl": "https://feeds.folha.uol.com.br/mercado/rss091.xml", "feedUrl": "https://feeds.folha.uol.com.br/mercado/rss091.xml",
"website": "folha.uol.com.br", "website": "folha.uol.com.br",
"language": "pt",
"backfill": true "backfill": true
}, },
{ {
@ -556,6 +634,7 @@
"label": "G1 Economia BR", "label": "G1 Economia BR",
"feedUrl": "https://g1.globo.com/dynamo/economia/rss2.xml", "feedUrl": "https://g1.globo.com/dynamo/economia/rss2.xml",
"website": "g1.globo.com", "website": "g1.globo.com",
"language": "pt",
"backfill": true "backfill": true
}, },
{ {
@ -563,6 +642,7 @@
"label": "Exame BR", "label": "Exame BR",
"feedUrl": "https://exame.com/feed/", "feedUrl": "https://exame.com/feed/",
"website": "exame.com", "website": "exame.com",
"language": "pt",
"backfill": true "backfill": true
}, },
{ {
@ -570,6 +650,7 @@
"label": "El Economista MX", "label": "El Economista MX",
"feedUrl": "[FAILED] https://www.eleconomista.com.mx/rss/rss.html", "feedUrl": "[FAILED] https://www.eleconomista.com.mx/rss/rss.html",
"website": "eleconomista.com.mx", "website": "eleconomista.com.mx",
"language": "es",
"backfill": true "backfill": true
}, },
{ {
@ -577,6 +658,7 @@
"label": "Expansion MX", "label": "Expansion MX",
"feedUrl": "https://expansion.mx/rss", "feedUrl": "https://expansion.mx/rss",
"website": "expansion.mx", "website": "expansion.mx",
"language": "es",
"backfill": true "backfill": true
}, },
{ {
@ -584,6 +666,7 @@
"label": "La Nacion AR", "label": "La Nacion AR",
"feedUrl": "https://www.lanacion.com.ar/arc/outboundfeeds/rss/category/economia/", "feedUrl": "https://www.lanacion.com.ar/arc/outboundfeeds/rss/category/economia/",
"website": "lanacion.com.ar", "website": "lanacion.com.ar",
"language": "es",
"backfill": true "backfill": true
}, },
{ {
@ -591,6 +674,7 @@
"label": "Infobae Economia AR", "label": "Infobae Economia AR",
"feedUrl": "[FAILED] https://www.infobae.com/feeds/rss/economia/", "feedUrl": "[FAILED] https://www.infobae.com/feeds/rss/economia/",
"website": "infobae.com", "website": "infobae.com",
"language": "es",
"backfill": true "backfill": true
}, },
{ {
@ -598,6 +682,7 @@
"label": "Portafolio Colombia", "label": "Portafolio Colombia",
"feedUrl": "[FAILED] https://www.portafolio.co/rss/portafolio.xml", "feedUrl": "[FAILED] https://www.portafolio.co/rss/portafolio.xml",
"website": "portafolio.co", "website": "portafolio.co",
"language": "es",
"backfill": true "backfill": true
}, },
{ {
@ -605,6 +690,7 @@
"label": "El Comercio Peru", "label": "El Comercio Peru",
"feedUrl": "[FAILED] https://elcomercio.pe/arc/outboundfeeds/rss/section/economia/", "feedUrl": "[FAILED] https://elcomercio.pe/arc/outboundfeeds/rss/section/economia/",
"website": "elcomercio.pe", "website": "elcomercio.pe",
"language": "es",
"backfill": true "backfill": true
}, },
{ {
@ -615,6 +701,7 @@
"jamaica-gleaner.com", "jamaica-gleaner.com",
"jamaicagleaner.com" "jamaicagleaner.com"
], ],
"language": "en",
"backfill": true "backfill": true
}, },
{ {
@ -622,6 +709,7 @@
"label": "Jamaica Observer", "label": "Jamaica Observer",
"feedUrl": "https://www.jamaicaobserver.com/app/business/", "feedUrl": "https://www.jamaicaobserver.com/app/business/",
"website": "jamaicaobserver.com", "website": "jamaicaobserver.com",
"language": "en",
"backfill": true "backfill": true
}, },
{ {
@ -629,6 +717,7 @@
"label": "Stabroek News", "label": "Stabroek News",
"feedUrl": "[FAILED] https://www.stabroeknews.com/feed/", "feedUrl": "[FAILED] https://www.stabroeknews.com/feed/",
"website": "stabroeknews.com", "website": "stabroeknews.com",
"language": "en",
"backfill": true "backfill": true
}, },
{ {
@ -636,6 +725,7 @@
"label": "Nation News Barbados", "label": "Nation News Barbados",
"feedUrl": "[FAILED] https://nationnews.com/rss-feed/", "feedUrl": "[FAILED] https://nationnews.com/rss-feed/",
"website": "nationnews.com", "website": "nationnews.com",
"language": "en",
"backfill": true "backfill": true
}, },
{ {
@ -643,6 +733,7 @@
"label": "Google News", "label": "Google News",
"feedUrl": "https://news.google.com/rss?hl=en-GB&gl=GB&ceid=GB:en", "feedUrl": "https://news.google.com/rss?hl=en-GB&gl=GB&ceid=GB:en",
"website": "news.google.com", "website": "news.google.com",
"language": "en",
"backfill": false "backfill": false
} }
] ]

View file

@ -65,8 +65,8 @@ const markContentPending = db.prepare(`
const selectPartitionedArticlesMissingContent = db.prepare(` const selectPartitionedArticlesMissingContent = db.prepare(`
SELECT id, url, title, description SELECT id, url, title, description
FROM ( FROM (
SELECT id, url, title, description, source, SELECT id, url, title, description, source, pub_date_effective,
ROW_NUMBER() OVER (PARTITION BY source ORDER BY ingested_at DESC, id DESC) AS rn ROW_NUMBER() OVER (PARTITION BY source ORDER BY pub_date_effective DESC, id DESC) AS rn
FROM articles FROM articles
WHERE (content IS NULL OR TRIM(content) = '') WHERE (content IS NULL OR TRIM(content) = '')
AND (content_status IS NULL OR content_status = 'pending') AND (content_status IS NULL OR content_status = 'pending')
@ -74,7 +74,7 @@ const selectPartitionedArticlesMissingContent = db.prepare(`
AND (id % ?) = ? AND (id % ?) = ?
) )
WHERE rn <= ? WHERE rn <= ?
ORDER BY rn, source ORDER BY pub_date_effective DESC, rn, source
`); `);
const selectAttemptCount = db.prepare(` const selectAttemptCount = db.prepare(`

View file

@ -288,7 +288,8 @@ for (const statement of [
'ALTER TABLE articles ADD COLUMN content_retry_after TEXT', 'ALTER TABLE articles ADD COLUMN content_retry_after TEXT',
'ALTER TABLE articles ADD COLUMN is_index_page INTEGER NOT NULL DEFAULT 0', 'ALTER TABLE articles ADD COLUMN is_index_page INTEGER NOT NULL DEFAULT 0',
'ALTER TABLE articles ADD COLUMN has_embedding INTEGER NOT NULL DEFAULT 0', 'ALTER TABLE articles ADD COLUMN has_embedding INTEGER NOT NULL DEFAULT 0',
'ALTER TABLE articles ADD COLUMN pub_date_effective TEXT' 'ALTER TABLE articles ADD COLUMN pub_date_effective TEXT',
'ALTER TABLE articles ADD COLUMN language TEXT'
]) { ]) {
try { try {
db.exec(statement); db.exec(statement);
@ -312,6 +313,20 @@ db.exec(`
WHERE pub_date_effective IS NULL WHERE pub_date_effective IS NULL
`); `);
// Populate `language` on rows where it is still NULL, using the language
// configured for each source in sources.json. Rows that already have a
// language are excluded by the `language IS NULL` predicate, and sources
// without a configured language are skipped.
{
  const configuredSources = require('../sources.json');
  const setLanguageForSource = db.prepare(`UPDATE articles SET language = ? WHERE source = ? AND language IS NULL`);
  // All per-source updates are issued inside a single db.transaction() call.
  const runLanguageBackfill = db.transaction(() => {
    for (const { id, language } of configuredSources) {
      if (!language) {
        continue;
      }
      setLanguageForSource.run(language, id);
    }
  });
  runLanguageBackfill();
}
db.exec(` db.exec(`
CREATE INDEX IF NOT EXISTS idx_articles_has_embedding ON articles(has_embedding); CREATE INDEX IF NOT EXISTS idx_articles_has_embedding ON articles(has_embedding);
CREATE INDEX IF NOT EXISTS idx_articles_pub_date_effective ON articles(pub_date_effective DESC); CREATE INDEX IF NOT EXISTS idx_articles_pub_date_effective ON articles(pub_date_effective DESC);

View file

@ -73,7 +73,7 @@ const selectArticlesMissingEmbeddings = db.prepare(`
SELECT 1 FROM article_embedding_store s SELECT 1 FROM article_embedding_store s
WHERE s.article_id = a.id AND s.model = ? WHERE s.article_id = a.id AND s.model = ?
) )
ORDER BY a.ingested_at ASC, a.id ASC ORDER BY a.pub_date_effective DESC, a.id DESC
LIMIT ? LIMIT ?
`); `);

View file

@ -2,6 +2,10 @@ const db = require('./db');
const { normalizeTitle } = require('./dedup'); const { normalizeTitle } = require('./dedup');
const { markSourceRun } = require('./state'); const { markSourceRun } = require('./state');
// Lookup table from source id to its full sources.json entry, built once
// when this module is loaded.
const sourcesById = Object.fromEntries(
  require('../sources.json').map(function toIdEntryPair(sourceConfig) {
    return [sourceConfig.id, sourceConfig];
  })
);
const insertArticle = db.prepare(` const insertArticle = db.prepare(`
INSERT INTO articles ( INSERT INTO articles (
title, title,
@ -13,8 +17,9 @@ const insertArticle = db.prepare(`
source, source,
pub_date, pub_date,
ingested_at, ingested_at,
pub_date_effective pub_date_effective,
) VALUES (?, ?, NULL, ?, ?, ?, ?, ?, ?, ?) language
) VALUES (?, ?, NULL, ?, ?, ?, ?, ?, ?, ?, ?)
`); `);
const findByUrl = db.prepare('SELECT id FROM articles WHERE url = ?'); const findByUrl = db.prepare('SELECT id FROM articles WHERE url = ?');
const INDEX_PAGE_URL_HINT = /\/(category|categories|tag|tags|topic|topics|section|sections|archive|archives|authors|search)(?:\/|$)/i; const INDEX_PAGE_URL_HINT = /\/(category|categories|tag|tags|topic|topics|section|sections|archive|archives|authors|search)(?:\/|$)/i;
@ -81,6 +86,7 @@ function ingestArticle(article) {
const isIndexPage = inferIsIndexPage(article, title, url); const isIndexPage = inferIsIndexPage(article, title, url);
const pubDate = normalizePubDate(article.pubDate); const pubDate = normalizePubDate(article.pubDate);
const ingestedAt = new Date().toISOString(); const ingestedAt = new Date().toISOString();
const language = (sourcesById[source] && sourcesById[source].language) || null;
try { try {
const result = insertArticle.run( const result = insertArticle.run(
@ -92,7 +98,8 @@ function ingestArticle(article) {
source, source,
pubDate, pubDate,
ingestedAt, ingestedAt,
pubDate || ingestedAt pubDate || ingestedAt,
language
); );
// dont kick off the content fetch here — it used to be fire-and-forget which // dont kick off the content fetch here — it used to be fire-and-forget which