add Google News integration and enhance crawler capabilities
This commit is contained in:
parent
1a8504389a
commit
c3f9e59c5e
16 changed files with 3020 additions and 904 deletions
|
|
@ -1,6 +1,6 @@
|
||||||
# duriin_api
|
# duriin_api
|
||||||
|
|
||||||
Node.js Fastify server that ingests news articles from RSS, SEC EDGAR 8-K filings, Alpha Vantage News Sentiment, Finnhub company news, GDELT, and configured publisher crawlers into a local SQLite archive.
|
Node.js Fastify server that ingests news articles from RSS, Google News RSS, SEC EDGAR 8-K filings, Alpha Vantage News Sentiment, Finnhub company news, and GDELT into a local SQLite archive.
|
||||||
|
|
||||||
## Setup
|
## Setup
|
||||||
|
|
||||||
|
|
@ -8,7 +8,7 @@ Node.js Fastify server that ingests news articles from RSS, SEC EDGAR 8-K filing
|
||||||
```bash
|
```bash
|
||||||
npm install
|
npm install
|
||||||
```
|
```
|
||||||
2. Edit `config.json` with your API keys, tickers, RSS feeds, crawler settings, and schedules.
|
2. Edit `config.json` with your API keys, tickers, RSS feeds, Google News settings, and schedules.
|
||||||
3. Start the server:
|
3. Start the server:
|
||||||
```bash
|
```bash
|
||||||
npm start
|
npm start
|
||||||
|
|
@ -303,7 +303,7 @@ Returns ingestion and archive summary information.
|
||||||
|
|
||||||
- `image` stores the extracted main image as ultra-compressed base64 WebP.
|
- `image` stores the extracted main image as ultra-compressed base64 WebP.
|
||||||
- `normalized_title` is stored for matching and indexing.
|
- `normalized_title` is stored for matching and indexing.
|
||||||
- `source` may be a shared source like `rss`, `gdelt`, `edgar`, `alphavantage`, or `finnhub`, or a crawler-derived source name for a configured publisher.
|
- `source` may be a shared source like `rss`, `googlenews`, `gdelt`, `edgar`, `alphavantage`, or `finnhub`.
|
||||||
- `pub_date` is normalized to ISO-8601 when it can be parsed.
|
- `pub_date` is normalized to ISO-8601 when it can be parsed.
|
||||||
- `ingested_at` is the insert timestamp set by the server.
|
- `ingested_at` is the insert timestamp set by the server.
|
||||||
|
|
||||||
|
|
@ -311,7 +311,7 @@ Returns ingestion and archive summary information.
|
||||||
|
|
||||||
- SQLite archive file defaults to `./archive.sqlite`.
|
- SQLite archive file defaults to `./archive.sqlite`.
|
||||||
- Deduplication is enforced on `url`; normalized titles are stored and indexed for matching but are not unique.
|
- Deduplication is enforced on `url`; normalized titles are stored and indexed for matching but are not unique.
|
||||||
- `newsCrawler` reuses `rssFeeds` as the publisher catalog, derives one crawler source per feed label, and supports `disabledLabels` plus per-label `overrides` for seeds and allowed hosts.
|
- `googleNews` accepts `queries`, `topics`, `language`, and `country`, and resolves Google redirect URLs to publisher URLs before ingestion.
|
||||||
- Article body extraction runs asynchronously after insertion, with scheduled retries for rows still missing content.
|
- Article body extraction runs asynchronously after insertion, with scheduled retries for rows still missing content.
|
||||||
- Embeddings are generated asynchronously with OpenRouter `perplexity/pplx-embed-v1-0.6b` and indexed in `sqlite-vec` for similarity search.
|
- Embeddings are generated asynchronously with OpenRouter `perplexity/pplx-embed-v1-0.6b` and indexed in `sqlite-vec` for similarity search.
|
||||||
- Topic search caches normalized query embeddings in SQLite and falls back to OpenRouter on cache miss.
|
- Topic search caches normalized query embeddings in SQLite and falls back to OpenRouter on cache miss.
|
||||||
|
|
|
||||||
669
config.json
669
config.json
|
|
@ -21,664 +21,39 @@
|
||||||
"openRouter": {
|
"openRouter": {
|
||||||
"apiKey": "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"
|
"apiKey": "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"
|
||||||
},
|
},
|
||||||
"rssFeeds": [
|
|
||||||
{
|
|
||||||
"url": "https://www.aljazeera.com/xml/rss/all.xml",
|
|
||||||
"label": "Al Jazeera"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://feeds.bbci.co.uk/news/business/rss.xml",
|
|
||||||
"label": "BBC Business"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://feeds.businessinsider.com/custom/all",
|
|
||||||
"label": "Business Insider"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://feeds.bloomberg.com/markets/news.rss",
|
|
||||||
"label": "Bloomberg Markets"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://www.cnbc.com/id/100003114/device/rss/rss.html",
|
|
||||||
"label": "CNBC"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://feeds.a.dj.com/rss/RSSMarketsMain.xml",
|
|
||||||
"label": "Wall Street Journal"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://feeds.marketwatch.com/marketwatch/topstories/",
|
|
||||||
"label": "MarketWatch"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://finance.yahoo.com/news/rssindex",
|
|
||||||
"label": "Yahoo Finance"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://seekingalpha.com/feed.xml",
|
|
||||||
"label": "Seeking Alpha"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://www.ft.com/?format=rss",
|
|
||||||
"label": "Financial Times"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://www.economist.com/finance-and-economics/rss.xml",
|
|
||||||
"label": "The Economist"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://fortune.com/feed",
|
|
||||||
"label": "Fortune"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://www.forbes.com/business/feed/",
|
|
||||||
"label": "Forbes Business"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://www.inc.com/rss",
|
|
||||||
"label": "Inc Magazine"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://www.fastcompany.com/latest/rss",
|
|
||||||
"label": "Fast Company"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://www.entrepreneur.com/latest.rss",
|
|
||||||
"label": "Entrepreneur"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://api.axios.com/feed/",
|
|
||||||
"label": "Axios"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://www.wired.com/feed/category/business/latest/rss",
|
|
||||||
"label": "Wired Business"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://feeds.npr.org/1006/rss.xml",
|
|
||||||
"label": "NPR Business"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://www.federalreserve.gov/feeds/press_all.xml",
|
|
||||||
"label": "Federal Reserve"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://techcrunch.com/feed/",
|
|
||||||
"label": "TechCrunch"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://www.theverge.com/rss/index.xml",
|
|
||||||
"label": "The Verge"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://feeds.arstechnica.com/arstechnica/index",
|
|
||||||
"label": "Ars Technica"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://www.retaildive.com/feeds/news/",
|
|
||||||
"label": "Retail Dive"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://www.manufacturingdive.com/feeds/news/",
|
|
||||||
"label": "Manufacturing Dive"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://www.bankingdive.com/feeds/news/",
|
|
||||||
"label": "Banking Dive"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://financialpost.com/feed",
|
|
||||||
"label": "Financial Post CA"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://www.theglobeandmail.com/arc/outboundfeeds/rss/category/business/",
|
|
||||||
"label": "Globe and Mail"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://www.theguardian.com/uk/business/rss",
|
|
||||||
"label": "Guardian Business"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://feeds.skynews.com/feeds/rss/business.xml",
|
|
||||||
"label": "Sky News Business"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://www.thisismoney.co.uk/money/news/index.rss",
|
|
||||||
"label": "This Is Money"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://www.cityam.com/feed/",
|
|
||||||
"label": "City A.M."
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://www.spiegel.de/wirtschaft/index.rss",
|
|
||||||
"label": "Spiegel Wirtschaft"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://www.handelsblatt.com/contentexport/feed/schlagzeilen",
|
|
||||||
"label": "Handelsblatt"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://www.faz.net/rss/aktuell/wirtschaft/",
|
|
||||||
"label": "FAZ Wirtschaft"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://www.welt.de/feeds/section/wirtschaft.rss",
|
|
||||||
"label": "Die Welt Wirtschaft"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://feeds.lesechos.fr/rss/rss_la_une.xml",
|
|
||||||
"label": "Les Echos"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://www.lemonde.fr/economie/rss_full.xml",
|
|
||||||
"label": "Le Monde Economie"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://bfmbusiness.bfmtv.com/rss/news-flux-rss/",
|
|
||||||
"label": "BFM Business"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://www.eleconomista.es/rss/rss-de-portada.php",
|
|
||||||
"label": "El Economista ES"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://e00-expansion.uecdn.es/rss/portada.xml",
|
|
||||||
"label": "Expansion ES"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://cincodias.elpais.com/rss/cincodias/ultima_hora_mercados.xml",
|
|
||||||
"label": "Cinco Dias"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://www.ilsole24ore.com/rss/economia--finanza.xml",
|
|
||||||
"label": "Il Sole 24 Ore"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://fd.nl/rss",
|
|
||||||
"label": "FD.nl"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://www.nzz.ch/wirtschaft.rss",
|
|
||||||
"label": "NZZ Wirtschaft"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://www.themoscowtimes.com/rss/news",
|
|
||||||
"label": "Moscow Times"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://rssexport.rbc.ru/rbcnews/news/30/full.rss",
|
|
||||||
"label": "RBC Russia"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://economictimes.indiatimes.com/rssfeedstopstories.cms",
|
|
||||||
"label": "Economic Times India"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://www.business-standard.com/rss/home_page_top_stories.rss",
|
|
||||||
"label": "Business Standard IN"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://www.livemint.com/rss/headlines",
|
|
||||||
"label": "Live Mint"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://www.moneycontrol.com/rss/MCtopnews.xml",
|
|
||||||
"label": "Moneycontrol"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://www.thehindubusinessline.com/feeder/default.rss",
|
|
||||||
"label": "Hindu Business Line"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://www.caixinglobal.com/rss/newsfeeds/",
|
|
||||||
"label": "Caixin Global"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://www.chinadaily.com.cn/rss/bizchina_rss.xml",
|
|
||||||
"label": "China Daily Business"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://english.news.cn/rss/business.xml",
|
|
||||||
"label": "Xinhua Business"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://www.scmp.com/rss/91/feed",
|
|
||||||
"label": "South China Morning Post"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://asia.nikkei.com/rss/feed/nar",
|
|
||||||
"label": "Nikkei Asia"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://www.japantimes.co.jp/feed/business/",
|
|
||||||
"label": "Japan Times Business"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://www.koreaherald.com/rss/010000000000.xml",
|
|
||||||
"label": "Korea Herald"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://koreajoongangdaily.joins.com/rss/",
|
|
||||||
"label": "Korea JoongAng Daily"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://www.businesstimes.com.sg/rss.xml",
|
|
||||||
"label": "Business Times SG"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://www.straitstimes.com/news/business/rss.xml",
|
|
||||||
"label": "Straits Times Business"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://www.channelnewsasia.com/rssfeeds/8395986",
|
|
||||||
"label": "Channel NewsAsia"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://www.bangkokpost.com/rss/data/business.xml",
|
|
||||||
"label": "Bangkok Post Business"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://www.thestar.com.my/rss/Business/Business-News",
|
|
||||||
"label": "The Star Malaysia"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://www.afr.com/rss",
|
|
||||||
"label": "Australian Fin Review"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://www.abc.net.au/news/feed/52278/rss.xml",
|
|
||||||
"label": "ABC Business AU"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://www.nzherald.co.nz/arc/outboundfeeds/rss/section/business/",
|
|
||||||
"label": "NZ Herald Business"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://www.arabianbusiness.com/rss.xml",
|
|
||||||
"label": "Arabian Business"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://gulfnews.com/rss/business",
|
|
||||||
"label": "Gulf News Business"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://www.arabnews.com/rss/front_page.xml",
|
|
||||||
"label": "Arab News"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://www.thenationalnews.com/arc/outboundfeeds/rss/?outputType=xml",
|
|
||||||
"label": "The National UAE"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://businessday.ng/feed/",
|
|
||||||
"label": "BusinessDay Nigeria"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://www.moneyweb.co.za/feed/",
|
|
||||||
"label": "Moneyweb SA"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://www.businesslive.co.za/rss/bd/",
|
|
||||||
"label": "BusinessLive SA"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://www.businessdailyafrica.com/rss/",
|
|
||||||
"label": "Business Daily Africa"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://www.vanguardngr.com/category/business/feed/",
|
|
||||||
"label": "Vanguard Business NG"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://feeds.folha.uol.com.br/mercado/rss091.xml",
|
|
||||||
"label": "Folha Mercado BR"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://g1.globo.com/dynamo/economia/rss2.xml",
|
|
||||||
"label": "G1 Economia BR"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://exame.com/feed/",
|
|
||||||
"label": "Exame BR"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://www.eleconomista.com.mx/rss/rss.html",
|
|
||||||
"label": "El Economista MX"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://expansion.mx/rss",
|
|
||||||
"label": "Expansion MX"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://www.lanacion.com.ar/arc/outboundfeeds/rss/category/economia/",
|
|
||||||
"label": "La Nacion AR"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://www.infobae.com/feeds/rss/economia/",
|
|
||||||
"label": "Infobae Economia AR"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://www.portafolio.co/rss/portafolio.xml",
|
|
||||||
"label": "Portafolio Colombia"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://elcomercio.pe/arc/outboundfeeds/rss/section/economia/",
|
|
||||||
"label": "El Comercio Peru"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://jamaica-gleaner.com/feed/business.xml",
|
|
||||||
"label": "Jamaica Gleaner"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://www.jamaicaobserver.com/app/business/",
|
|
||||||
"label": "Jamaica Observer"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://www.stabroeknews.com/feed/",
|
|
||||||
"label": "Stabroek News"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"url": "https://nationnews.com/rss-feed/",
|
|
||||||
"label": "Nation News Barbados"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"gdelt": {
|
"gdelt": {
|
||||||
"queries": [
|
"source": "bigquery",
|
||||||
"technology"
|
|
||||||
],
|
|
||||||
"mode": "ArtList",
|
"mode": "ArtList",
|
||||||
"maxRecords": 50,
|
"maxRecords": 100,
|
||||||
"format": "json"
|
"format": "json",
|
||||||
},
|
"windowDays": 7,
|
||||||
"newsCrawler": {
|
"lookbackWeeks": 312,
|
||||||
"maxPages": -1,
|
"requestDelayMs": 6500,
|
||||||
"maxDepth": 10,
|
"maxWindowsPerRun": 4,
|
||||||
"pageConcurrency": 4,
|
"bigQueryProject": "duriin",
|
||||||
"requestTimeout": 15000,
|
"bigQueryKeyFile": "./gdelt-credentials.json"
|
||||||
"disabledLabels": [
|
|
||||||
"Arab News",
|
|
||||||
"Arabian Business",
|
|
||||||
"Australian Fin Review",
|
|
||||||
"BFM Business",
|
|
||||||
"Business Daily Africa",
|
|
||||||
"Business Standard IN",
|
|
||||||
"BusinessLive SA",
|
|
||||||
"Caixin Global",
|
|
||||||
"Cinco Dias",
|
|
||||||
"City A.M.",
|
|
||||||
"El Comercio Peru",
|
|
||||||
"El Economista ES",
|
|
||||||
"El Economista MX",
|
|
||||||
"FD.nl",
|
|
||||||
"Gulf News Business",
|
|
||||||
"Il Sole 24 Ore",
|
|
||||||
"Infobae Economia AR",
|
|
||||||
"Japan Times Business",
|
|
||||||
"Korea JoongAng Daily",
|
|
||||||
"Les Echos",
|
|
||||||
"Live Mint",
|
|
||||||
"Moneycontrol",
|
|
||||||
"NZ Herald Business",
|
|
||||||
"Portafolio Colombia",
|
|
||||||
"Reuters",
|
|
||||||
"The Star Malaysia",
|
|
||||||
"This Is Money",
|
|
||||||
"Xinhua Business"
|
|
||||||
],
|
|
||||||
"overrides": {
|
|
||||||
"Al Jazeera": {
|
|
||||||
"allowedHosts": [
|
|
||||||
"www.aljazeera.com",
|
|
||||||
"aljazeera.com"
|
|
||||||
],
|
|
||||||
"seeds": [
|
|
||||||
"https://www.aljazeera.com/",
|
|
||||||
"https://www.aljazeera.com/economy/",
|
|
||||||
"https://www.aljazeera.com/tag/technology/"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"Ars Technica": {
|
|
||||||
"allowedHosts": [
|
|
||||||
"arstechnica.com",
|
|
||||||
"www.arstechnica.com"
|
|
||||||
],
|
|
||||||
"seeds": [
|
|
||||||
"https://arstechnica.com/",
|
|
||||||
"https://arstechnica.com/tech-policy/",
|
|
||||||
"https://arstechnica.com/information-technology/"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"BBC Business": {
|
|
||||||
"allowedHosts": [
|
|
||||||
"www.bbc.com",
|
|
||||||
"bbc.com"
|
|
||||||
],
|
|
||||||
"seeds": [
|
|
||||||
"https://www.bbc.com/news/business",
|
|
||||||
"https://www.bbc.com/news/technology"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"CNBC": {
|
|
||||||
"allowedHosts": [
|
|
||||||
"www.cnbc.com",
|
|
||||||
"cnbc.com"
|
|
||||||
],
|
|
||||||
"renderMode": "browser",
|
|
||||||
"seeds": [
|
|
||||||
"https://www.cnbc.com/world/",
|
|
||||||
"https://www.cnbc.com/business/",
|
|
||||||
"https://www.cnbc.com/technology/"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"Guardian Business": {
|
|
||||||
"allowedHosts": [
|
|
||||||
"www.theguardian.com",
|
|
||||||
"theguardian.com"
|
|
||||||
],
|
|
||||||
"seeds": [
|
|
||||||
"https://www.theguardian.com/",
|
|
||||||
"https://www.theguardian.com/business",
|
|
||||||
"https://www.theguardian.com/technology"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"Jamaica Gleaner": {
|
|
||||||
"allowedHosts": [
|
|
||||||
"jamaica-gleaner.com",
|
|
||||||
"www.jamaica-gleaner.com"
|
|
||||||
],
|
|
||||||
"seeds": [
|
|
||||||
"https://jamaica-gleaner.com/",
|
|
||||||
"https://jamaica-gleaner.com/news",
|
|
||||||
"https://jamaica-gleaner.com/business"
|
|
||||||
],
|
|
||||||
"requestTimeout": 25000
|
|
||||||
},
|
|
||||||
"Jamaica Observer": {
|
|
||||||
"allowedHosts": [
|
|
||||||
"www.jamaicaobserver.com",
|
|
||||||
"jamaicaobserver.com"
|
|
||||||
],
|
|
||||||
"seeds": [
|
|
||||||
"https://www.jamaicaobserver.com/",
|
|
||||||
"https://www.jamaicaobserver.com/news/",
|
|
||||||
"https://www.jamaicaobserver.com/business/"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"Nation News Barbados": {
|
|
||||||
"allowedHosts": [
|
|
||||||
"nationnews.com",
|
|
||||||
"www.nationnews.com"
|
|
||||||
],
|
|
||||||
"seeds": [
|
|
||||||
"https://nationnews.com/",
|
|
||||||
"https://nationnews.com/category/business/",
|
|
||||||
"https://nationnews.com/category/news/"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"NPR Business": {
|
|
||||||
"allowedHosts": [
|
|
||||||
"www.npr.org",
|
|
||||||
"npr.org"
|
|
||||||
],
|
|
||||||
"seeds": [
|
|
||||||
"https://www.npr.org/sections/business/",
|
|
||||||
"https://www.npr.org/sections/technology/"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"The Verge": {
|
|
||||||
"allowedHosts": [
|
|
||||||
"www.theverge.com",
|
|
||||||
"theverge.com"
|
|
||||||
],
|
|
||||||
"seeds": [
|
|
||||||
"https://www.theverge.com/tech",
|
|
||||||
"https://www.theverge.com/business",
|
|
||||||
"https://www.theverge.com/archives"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"TechCrunch": {
|
|
||||||
"allowedHosts": [
|
|
||||||
"techcrunch.com",
|
|
||||||
"www.techcrunch.com"
|
|
||||||
],
|
|
||||||
"seeds": [
|
|
||||||
"https://techcrunch.com/",
|
|
||||||
"https://techcrunch.com/category/startups/",
|
|
||||||
"https://techcrunch.com/category/venture/"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"The Economist": {
|
|
||||||
"allowedHosts": [
|
|
||||||
"www.economist.com",
|
|
||||||
"economist.com"
|
|
||||||
],
|
|
||||||
"seeds": [
|
|
||||||
"https://www.economist.com/finance-and-economics",
|
|
||||||
"https://www.economist.com/business",
|
|
||||||
"https://www.economist.com/science-and-technology"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"Federal Reserve": {
|
|
||||||
"allowedHosts": [
|
|
||||||
"www.federalreserve.gov",
|
|
||||||
"federalreserve.gov"
|
|
||||||
],
|
|
||||||
"seeds": [
|
|
||||||
"https://www.federalreserve.gov/newsevents.htm",
|
|
||||||
"https://www.federalreserve.gov/monetarypolicy.htm"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"Fortune": {
|
|
||||||
"allowedHosts": [
|
|
||||||
"fortune.com",
|
|
||||||
"www.fortune.com"
|
|
||||||
],
|
|
||||||
"renderMode": "browser",
|
|
||||||
"seeds": [
|
|
||||||
"https://fortune.com/",
|
|
||||||
"https://fortune.com/section/tech/",
|
|
||||||
"https://fortune.com/section/finance/"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"Forbes Business": {
|
|
||||||
"allowedHosts": [
|
|
||||||
"www.forbes.com",
|
|
||||||
"forbes.com"
|
|
||||||
],
|
|
||||||
"renderMode": "browser",
|
|
||||||
"seeds": [
|
|
||||||
"https://www.forbes.com/business/",
|
|
||||||
"https://www.forbes.com/innovation/"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"Financial Times": {
|
|
||||||
"allowedHosts": [
|
|
||||||
"www.ft.com",
|
|
||||||
"ft.com"
|
|
||||||
],
|
|
||||||
"renderMode": "browser",
|
|
||||||
"seeds": [
|
|
||||||
"https://www.ft.com/world/us",
|
|
||||||
"https://www.ft.com/technology"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"Nikkei Asia": {
|
|
||||||
"allowedHosts": [
|
|
||||||
"asia.nikkei.com"
|
|
||||||
],
|
|
||||||
"seeds": [
|
|
||||||
"https://asia.nikkei.com/",
|
|
||||||
"https://asia.nikkei.com/Business",
|
|
||||||
"https://asia.nikkei.com/Technology"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"South China Morning Post": {
|
|
||||||
"allowedHosts": [
|
|
||||||
"www.scmp.com",
|
|
||||||
"scmp.com"
|
|
||||||
],
|
|
||||||
"seeds": [
|
|
||||||
"https://www.scmp.com/",
|
|
||||||
"https://www.scmp.com/business",
|
|
||||||
"https://www.scmp.com/tech"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"Stabroek News": {
|
|
||||||
"allowedHosts": [
|
|
||||||
"www.stabroeknews.com",
|
|
||||||
"stabroeknews.com"
|
|
||||||
],
|
|
||||||
"seeds": [
|
|
||||||
"https://www.stabroeknews.com/",
|
|
||||||
"https://www.stabroeknews.com/category/business/",
|
|
||||||
"https://www.stabroeknews.com/category/news/"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"Wall Street Journal": {
|
|
||||||
"allowedHosts": [
|
|
||||||
"www.wsj.com",
|
|
||||||
"wsj.com"
|
|
||||||
],
|
|
||||||
"seeds": [
|
|
||||||
"https://www.wsj.com/news/business",
|
|
||||||
"https://www.wsj.com/tech"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"Wired Business": {
|
|
||||||
"allowedHosts": [
|
|
||||||
"www.wired.com",
|
|
||||||
"wired.com"
|
|
||||||
],
|
|
||||||
"renderMode": "browser",
|
|
||||||
"seeds": [
|
|
||||||
"https://www.wired.com/category/business/",
|
|
||||||
"https://www.wired.com/category/security/"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"Yahoo Finance": {
|
|
||||||
"allowedHosts": [
|
|
||||||
"finance.yahoo.com"
|
|
||||||
],
|
|
||||||
"renderMode": "browser",
|
|
||||||
"seeds": [
|
|
||||||
"https://finance.yahoo.com/",
|
|
||||||
"https://finance.yahoo.com/news/",
|
|
||||||
"https://finance.yahoo.com/topic/tech/"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
},
|
||||||
"scheduler": {
|
"scheduler": {
|
||||||
"newsCrawler": "0 * * * *",
|
|
||||||
"rss": "5 * * * *",
|
"rss": "5 * * * *",
|
||||||
"gdelt": "10 * * * *",
|
"gdelt": "10 * * * *",
|
||||||
"edgar": "15 * * * *",
|
"edgar": "15 * * * *",
|
||||||
"alphaVantage": "20 * * * *",
|
"alphaVantage": "20 * * * *",
|
||||||
"finnhub": "25 * * * *"
|
"finnhub": "25 * * * *",
|
||||||
|
"googleNews": "0 * * * *"
|
||||||
},
|
},
|
||||||
"contentBackfill": {
|
"contentBackfill": {
|
||||||
"cron": "0 * * * *",
|
"cron": "0 * * * *",
|
||||||
"batchSize": -1
|
"batchSize": -1
|
||||||
|
},
|
||||||
|
"googleNews": {
|
||||||
|
"queries": [
|
||||||
|
"technology"
|
||||||
|
],
|
||||||
|
"topics": [
|
||||||
|
"BUSINESS",
|
||||||
|
"TECHNOLOGY"
|
||||||
|
],
|
||||||
|
"language": "en",
|
||||||
|
"country": "US"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
13
gdelt-credentials.json
Normal file
13
gdelt-credentials.json
Normal file
|
|
@ -0,0 +1,13 @@
|
||||||
|
{
|
||||||
|
"type": "service_account",
|
||||||
|
"project_id": "duriin",
|
||||||
|
"private_key_id": "28c6050948e703e3443203ac1a8c10e2e3009793",
|
||||||
|
"private_key": "-----BEGIN PRIVATE KEY-----\nMIIEvgIBADANBgkqhkiG9w0BAQEFAASCBKgwggSkAgEAAoIBAQDQlDtHQASuKDOn\nUWejQA6lbbPGn5AV8Xc+mQ7G+JgmxqeVxmqmQU6Eh+Q/8aUHL2oMwbaVi1Z8Dfbl\nhjjKaTYuRB1N3MAr94hahxQBk7EvSLUtzoMv36aMtZNcvx1W57hKQtW6Qou6N6Y4\ncVlNOS+lj9Jl5WWr88zkDhS2FH5G0WvBp7fWtZFhSi2hganaZtXdYnvJ3LZ3x/FP\nbBdrthF5mRVkNm89ylHqCDa04ZZjqJ8RTr5XmNTlS4T6vCLBYNwHdO0xYTh2PxVo\nlWUUrIyoV6b1xn/H/ko04E/hcfG/jehhvxBUqUPJHb7/bt7DAQqNIqQRLPcjXWwH\n2leSlUTnAgMBAAECggEASgizWcDxaFfArd0JKjrsd++OZivw2rkQmFl/k0RdzTWp\n+lGpPUXk9sm9TK0a5IgB3nFLu5zvn6zdO+7+bWoW3ykyNZbrZy+/aFKV2VFxDNWD\n6bRpgC6kUUGKAtubMGOjWEiM0EYajoh+KX6iMfTgYqXACob4JaatzSzqUQ7JG51J\nzZnzsNXBTr21kKkxtfTAIrCXjmy5ogAJhYCNgeoqVd8ILhrYHluK8F8WCqk+BPnA\nfZ4vPTeTEvxsh0uYFmRY8wA3TGwy9Q40Lsg+oHEcs/XmUDJHXFryNryknUHYwsA7\nWqbUi26/SHPKRs0w6y17f+LxCn1vg6MxOG7M3LcTcQKBgQDrq8grI8EbBLo6gdY9\n8mjBklTnkvEpxmYDmjzVWX1XCdZIj4xJYyt1Y3PDbBEeEbjwqrlrTQgk+R0tC3ed\nW3jLEbioUfulnri8dWfeuAr9xhCJc8qSxDeLNfnJQc7rpUzyhyD5KhNI8GTJ1zwW\n2JzJGWaPAu5KoNAY2SA//sfRiwKBgQDikjBx1oeIVqFg3Lp47G734l1ikgn9LcP3\njSW3cjYg2XBIlM1LrRt39ljdRByvA0vo3dlN5cFZMknzFlWV8ymvjNghbeVfD2r0\nBcWOMJ0ZeFB3cK127GBN+iMJ9Y8xR6ZWg0d9SmBVUSwVrndS1u4kS6vfYRBSCVWb\nmZujR/TNlQKBgQDR/t69Of3O8nZyvdDGoCMiIR8QvgmwfL3YBe6g+T3LedN8EpUh\nq4FE95pmjvvtvEL8CFRyPVC9iVCrG6W5DJHk+OR+75Z5bKYWH9OvTHVWzc9ce1YN\nU1Re8niiEcasiT24ehoyi4BlpPdaNzSu8tM6Ci0tz6G/0+25xneLLp6kowKBgCuc\npjSTbd1Bh6jEdCRopmeSrBUYNVIFqC4TfkoUcvTZxfJCqk3B0YLC6ZIV1Uue39LA\nOV70NcZ8lp1zFCBcAQ8olkXBCKDGr/iuz7syAltvvFVxXAKDN3prBqmZGeoLd6o5\ndN5aHbbufATkY1WPx6E266uA3Ipd/5uG8t14MVgNAoGBAJsE7YhgTtMfn1eYJiDG\nRhM8YbJ5njFezrug1Fzhq3BeXLTclpQXUlQC/hNDfOsDBQS7bQlmXNkKaN5Pc0G6\ngRKzuZucKJGEMpce0ZaM5mN1j+wRnZUH096O286X/M35WONI3iYD3atqyiR6meUp\nTHSWlR6A3P5xGWVKrNOs04ck\n-----END PRIVATE KEY-----\n",
|
||||||
|
"client_email": "duriin-gdelt@duriin.iam.gserviceaccount.com",
|
||||||
|
"client_id": "101084071372544178772",
|
||||||
|
"auth_uri": "https://accounts.google.com/o/oauth2/auth",
|
||||||
|
"token_uri": "https://oauth2.googleapis.com/token",
|
||||||
|
"auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
|
||||||
|
"client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/duriin-gdelt%40duriin.iam.gserviceaccount.com",
|
||||||
|
"universe_domain": "googleapis.com"
|
||||||
|
}
|
||||||
482
package-lock.json
generated
482
package-lock.json
generated
|
|
@ -11,6 +11,7 @@
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"@extractus/article-extractor": "^8.0.18",
|
"@extractus/article-extractor": "^8.0.18",
|
||||||
"@fastify/cors": "^11.2.0",
|
"@fastify/cors": "^11.2.0",
|
||||||
|
"@google-cloud/bigquery": "^8.1.1",
|
||||||
"better-sqlite3": "^12.4.1",
|
"better-sqlite3": "^12.4.1",
|
||||||
"fastify": "^5.6.1",
|
"fastify": "^5.6.1",
|
||||||
"node-cron": "^4.2.1",
|
"node-cron": "^4.2.1",
|
||||||
|
|
@ -177,6 +178,104 @@
|
||||||
"ipaddr.js": "^2.1.0"
|
"ipaddr.js": "^2.1.0"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"node_modules/@google-cloud/bigquery": {
|
||||||
|
"version": "8.1.1",
|
||||||
|
"resolved": "https://registry.npmjs.org/@google-cloud/bigquery/-/bigquery-8.1.1.tgz",
|
||||||
|
"integrity": "sha512-2GHlohfA/VJffTvibMazMsZi6jPRx8MmaMberyDTL8rnhVs/frKSXVVRtLU83uSAy2j/5SD4mOs4jMQgJPON2g==",
|
||||||
|
"license": "Apache-2.0",
|
||||||
|
"dependencies": {
|
||||||
|
"@google-cloud/common": "^6.0.0",
|
||||||
|
"@google-cloud/paginator": "^6.0.0",
|
||||||
|
"@google-cloud/precise-date": "^5.0.0",
|
||||||
|
"@google-cloud/promisify": "^5.0.0",
|
||||||
|
"arrify": "^3.0.0",
|
||||||
|
"big.js": "^6.2.2",
|
||||||
|
"duplexify": "^4.1.3",
|
||||||
|
"extend": "^3.0.2",
|
||||||
|
"stream-events": "^1.0.5",
|
||||||
|
"teeny-request": "^10.0.0"
|
||||||
|
},
|
||||||
|
"engines": {
|
||||||
|
"node": ">=18"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/@google-cloud/common": {
|
||||||
|
"version": "6.0.0",
|
||||||
|
"resolved": "https://registry.npmjs.org/@google-cloud/common/-/common-6.0.0.tgz",
|
||||||
|
"integrity": "sha512-IXh04DlkLMxWgYLIUYuHHKXKOUwPDzDgke1ykkkJPe48cGIS9kkL2U/o0pm4ankHLlvzLF/ma1eO86n/bkumIA==",
|
||||||
|
"license": "Apache-2.0",
|
||||||
|
"dependencies": {
|
||||||
|
"@google-cloud/projectify": "^4.0.0",
|
||||||
|
"@google-cloud/promisify": "^4.0.0",
|
||||||
|
"arrify": "^2.0.0",
|
||||||
|
"duplexify": "^4.1.3",
|
||||||
|
"extend": "^3.0.2",
|
||||||
|
"google-auth-library": "^10.0.0-rc.1",
|
||||||
|
"html-entities": "^2.5.2",
|
||||||
|
"retry-request": "^8.0.0",
|
||||||
|
"teeny-request": "^10.0.0"
|
||||||
|
},
|
||||||
|
"engines": {
|
||||||
|
"node": ">=18"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/@google-cloud/common/node_modules/@google-cloud/promisify": {
|
||||||
|
"version": "4.1.0",
|
||||||
|
"resolved": "https://registry.npmjs.org/@google-cloud/promisify/-/promisify-4.1.0.tgz",
|
||||||
|
"integrity": "sha512-G/FQx5cE/+DqBbOpA5jKsegGwdPniU6PuIEMt+qxWgFxvxuFOzVmp6zYchtYuwAWV5/8Dgs0yAmjvNZv3uXLQg==",
|
||||||
|
"license": "Apache-2.0",
|
||||||
|
"engines": {
|
||||||
|
"node": ">=18"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/@google-cloud/common/node_modules/arrify": {
|
||||||
|
"version": "2.0.1",
|
||||||
|
"resolved": "https://registry.npmjs.org/arrify/-/arrify-2.0.1.tgz",
|
||||||
|
"integrity": "sha512-3duEwti880xqi4eAMN8AyR4a0ByT90zoYdLlevfrvU43vb0YZwZVfxOgxWrLXXXpyugL0hNZc9G6BiB5B3nUug==",
|
||||||
|
"license": "MIT",
|
||||||
|
"engines": {
|
||||||
|
"node": ">=8"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/@google-cloud/paginator": {
|
||||||
|
"version": "6.0.0",
|
||||||
|
"resolved": "https://registry.npmjs.org/@google-cloud/paginator/-/paginator-6.0.0.tgz",
|
||||||
|
"integrity": "sha512-g5nmMnzC+94kBxOKkLGpK1ikvolTFCC3s2qtE4F+1EuArcJ7HHC23RDQVt3Ra3CqpUYZ+oXNKZ8n5Cn5yug8DA==",
|
||||||
|
"license": "Apache-2.0",
|
||||||
|
"dependencies": {
|
||||||
|
"extend": "^3.0.2"
|
||||||
|
},
|
||||||
|
"engines": {
|
||||||
|
"node": ">=18"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/@google-cloud/precise-date": {
|
||||||
|
"version": "5.0.0",
|
||||||
|
"resolved": "https://registry.npmjs.org/@google-cloud/precise-date/-/precise-date-5.0.0.tgz",
|
||||||
|
"integrity": "sha512-9h0Gvw92EvPdE8AK8AgZPbMnH5ftDyPtKm7/KUfcJVaPEPjwGDsJd1QV0H8esBDV4II41R/2lDWH1epBqIoKUw==",
|
||||||
|
"license": "Apache-2.0",
|
||||||
|
"engines": {
|
||||||
|
"node": ">=18"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/@google-cloud/projectify": {
|
||||||
|
"version": "4.0.0",
|
||||||
|
"resolved": "https://registry.npmjs.org/@google-cloud/projectify/-/projectify-4.0.0.tgz",
|
||||||
|
"integrity": "sha512-MmaX6HeSvyPbWGwFq7mXdo0uQZLGBYCwziiLIGq5JVX+/bdI3SAq6bP98trV5eTWfLuvsMcIC1YJOF2vfteLFA==",
|
||||||
|
"license": "Apache-2.0",
|
||||||
|
"engines": {
|
||||||
|
"node": ">=14.0.0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/@google-cloud/promisify": {
|
||||||
|
"version": "5.0.0",
|
||||||
|
"resolved": "https://registry.npmjs.org/@google-cloud/promisify/-/promisify-5.0.0.tgz",
|
||||||
|
"integrity": "sha512-N8qS6dlORGHwk7WjGXKOSsLjIjNINCPicsOX6gyyLiYk7mq3MtII96NZ9N2ahwA2vnkLmZODOIH9rlNniYWvCQ==",
|
||||||
|
"license": "Apache-2.0",
|
||||||
|
"engines": {
|
||||||
|
"node": ">=18"
|
||||||
|
}
|
||||||
|
},
|
||||||
"node_modules/@img/colour": {
|
"node_modules/@img/colour": {
|
||||||
"version": "1.1.0",
|
"version": "1.1.0",
|
||||||
"resolved": "https://registry.npmjs.org/@img/colour/-/colour-1.1.0.tgz",
|
"resolved": "https://registry.npmjs.org/@img/colour/-/colour-1.1.0.tgz",
|
||||||
|
|
@ -669,6 +768,15 @@
|
||||||
"integrity": "sha512-2BjRTZxTPvheOvGbBslFSYOUkr+SjPtOnrLP33f+VIWLzezQpZcqVg7ja3L4dBXmzzgwT+a029jRx5PCi3JuiA==",
|
"integrity": "sha512-2BjRTZxTPvheOvGbBslFSYOUkr+SjPtOnrLP33f+VIWLzezQpZcqVg7ja3L4dBXmzzgwT+a029jRx5PCi3JuiA==",
|
||||||
"license": "MIT"
|
"license": "MIT"
|
||||||
},
|
},
|
||||||
|
"node_modules/agent-base": {
|
||||||
|
"version": "7.1.4",
|
||||||
|
"resolved": "https://registry.npmjs.org/agent-base/-/agent-base-7.1.4.tgz",
|
||||||
|
"integrity": "sha512-MnA+YT8fwfJPgBx3m60MNqakm30XOkyIoH1y6huTQvC0PwZG7ki8NacLBcrPbNoo8vEZy7Jpuk7+jMO+CUovTQ==",
|
||||||
|
"license": "MIT",
|
||||||
|
"engines": {
|
||||||
|
"node": ">= 14"
|
||||||
|
}
|
||||||
|
},
|
||||||
"node_modules/ajv": {
|
"node_modules/ajv": {
|
||||||
"version": "8.18.0",
|
"version": "8.18.0",
|
||||||
"resolved": "https://registry.npmjs.org/ajv/-/ajv-8.18.0.tgz",
|
"resolved": "https://registry.npmjs.org/ajv/-/ajv-8.18.0.tgz",
|
||||||
|
|
@ -702,6 +810,18 @@
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"node_modules/arrify": {
|
||||||
|
"version": "3.0.0",
|
||||||
|
"resolved": "https://registry.npmjs.org/arrify/-/arrify-3.0.0.tgz",
|
||||||
|
"integrity": "sha512-tLkvA81vQG/XqE2mjDkGQHoOINtMHtysSnemrmoGe6PydDPMRbVugqyk4A6V/WDWEfm3l+0d8anA9r8cv/5Jaw==",
|
||||||
|
"license": "MIT",
|
||||||
|
"engines": {
|
||||||
|
"node": ">=12"
|
||||||
|
},
|
||||||
|
"funding": {
|
||||||
|
"url": "https://github.com/sponsors/sindresorhus"
|
||||||
|
}
|
||||||
|
},
|
||||||
"node_modules/atomic-sleep": {
|
"node_modules/atomic-sleep": {
|
||||||
"version": "1.0.0",
|
"version": "1.0.0",
|
||||||
"resolved": "https://registry.npmjs.org/atomic-sleep/-/atomic-sleep-1.0.0.tgz",
|
"resolved": "https://registry.npmjs.org/atomic-sleep/-/atomic-sleep-1.0.0.tgz",
|
||||||
|
|
@ -765,6 +885,28 @@
|
||||||
"node": "20.x || 22.x || 23.x || 24.x || 25.x"
|
"node": "20.x || 22.x || 23.x || 24.x || 25.x"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"node_modules/big.js": {
|
||||||
|
"version": "6.2.2",
|
||||||
|
"resolved": "https://registry.npmjs.org/big.js/-/big.js-6.2.2.tgz",
|
||||||
|
"integrity": "sha512-y/ie+Faknx7sZA5MfGA2xKlu0GDv8RWrXGsmlteyJQ2lvoKv9GBK/fpRMc2qlSoBAgNxrixICFCBefIq8WCQpQ==",
|
||||||
|
"license": "MIT",
|
||||||
|
"engines": {
|
||||||
|
"node": "*"
|
||||||
|
},
|
||||||
|
"funding": {
|
||||||
|
"type": "opencollective",
|
||||||
|
"url": "https://opencollective.com/bigjs"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/bignumber.js": {
|
||||||
|
"version": "9.3.1",
|
||||||
|
"resolved": "https://registry.npmjs.org/bignumber.js/-/bignumber.js-9.3.1.tgz",
|
||||||
|
"integrity": "sha512-Ko0uX15oIUS7wJ3Rb30Fs6SkVbLmPBAKdlm7q9+ak9bbIeFf0MwuBsQV6z7+X768/cHsfg+WlysDWJcmthjsjQ==",
|
||||||
|
"license": "MIT",
|
||||||
|
"engines": {
|
||||||
|
"node": "*"
|
||||||
|
}
|
||||||
|
},
|
||||||
"node_modules/bindings": {
|
"node_modules/bindings": {
|
||||||
"version": "1.5.0",
|
"version": "1.5.0",
|
||||||
"resolved": "https://registry.npmjs.org/bindings/-/bindings-1.5.0.tgz",
|
"resolved": "https://registry.npmjs.org/bindings/-/bindings-1.5.0.tgz",
|
||||||
|
|
@ -815,6 +957,12 @@
|
||||||
"ieee754": "^1.1.13"
|
"ieee754": "^1.1.13"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"node_modules/buffer-equal-constant-time": {
|
||||||
|
"version": "1.0.1",
|
||||||
|
"resolved": "https://registry.npmjs.org/buffer-equal-constant-time/-/buffer-equal-constant-time-1.0.1.tgz",
|
||||||
|
"integrity": "sha512-zRpUiDwd/xk6ADqPMATG8vc9VPrkck7T07OIx0gnjmJAnHnTVXNQG3vfvWNuiZIkwu9KrKdA1iJKfsfTVxE6NA==",
|
||||||
|
"license": "BSD-3-Clause"
|
||||||
|
},
|
||||||
"node_modules/chownr": {
|
"node_modules/chownr": {
|
||||||
"version": "1.1.4",
|
"version": "1.1.4",
|
||||||
"resolved": "https://registry.npmjs.org/chownr/-/chownr-1.1.4.tgz",
|
"resolved": "https://registry.npmjs.org/chownr/-/chownr-1.1.4.tgz",
|
||||||
|
|
@ -877,6 +1025,32 @@
|
||||||
"integrity": "sha512-iKuQcq+NdHqlAcwUY0o/HL69XQrUaQdMjmStJ8JFmUaiiQErlhrmuigkg/CU4E2J0IyUKUrMAgl36TvN67MqTw==",
|
"integrity": "sha512-iKuQcq+NdHqlAcwUY0o/HL69XQrUaQdMjmStJ8JFmUaiiQErlhrmuigkg/CU4E2J0IyUKUrMAgl36TvN67MqTw==",
|
||||||
"license": "MIT"
|
"license": "MIT"
|
||||||
},
|
},
|
||||||
|
"node_modules/data-uri-to-buffer": {
|
||||||
|
"version": "4.0.1",
|
||||||
|
"resolved": "https://registry.npmjs.org/data-uri-to-buffer/-/data-uri-to-buffer-4.0.1.tgz",
|
||||||
|
"integrity": "sha512-0R9ikRb668HB7QDxT1vkpuUBtqc53YyAwMwGeUFKRojY/NWKvdZ+9UYtRfGmhqNbRkTSVpMbmyhXipFFv2cb/A==",
|
||||||
|
"license": "MIT",
|
||||||
|
"engines": {
|
||||||
|
"node": ">= 12"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/debug": {
|
||||||
|
"version": "4.4.3",
|
||||||
|
"resolved": "https://registry.npmjs.org/debug/-/debug-4.4.3.tgz",
|
||||||
|
"integrity": "sha512-RGwwWnwQvkVfavKVt22FGLw+xYSdzARwm0ru6DhTVA3umU5hZc28V3kO4stgYryrTlLpuvgI9GiijltAjNbcqA==",
|
||||||
|
"license": "MIT",
|
||||||
|
"dependencies": {
|
||||||
|
"ms": "^2.1.3"
|
||||||
|
},
|
||||||
|
"engines": {
|
||||||
|
"node": ">=6.0"
|
||||||
|
},
|
||||||
|
"peerDependenciesMeta": {
|
||||||
|
"supports-color": {
|
||||||
|
"optional": true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
"node_modules/decompress-response": {
|
"node_modules/decompress-response": {
|
||||||
"version": "6.0.0",
|
"version": "6.0.0",
|
||||||
"resolved": "https://registry.npmjs.org/decompress-response/-/decompress-response-6.0.0.tgz",
|
"resolved": "https://registry.npmjs.org/decompress-response/-/decompress-response-6.0.0.tgz",
|
||||||
|
|
@ -983,6 +1157,27 @@
|
||||||
"url": "https://github.com/fb55/domutils?sponsor=1"
|
"url": "https://github.com/fb55/domutils?sponsor=1"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"node_modules/duplexify": {
|
||||||
|
"version": "4.1.3",
|
||||||
|
"resolved": "https://registry.npmjs.org/duplexify/-/duplexify-4.1.3.tgz",
|
||||||
|
"integrity": "sha512-M3BmBhwJRZsSx38lZyhE53Csddgzl5R7xGJNk7CVddZD6CcmwMCH8J+7AprIrQKH7TonKxaCjcv27Qmf+sQ+oA==",
|
||||||
|
"license": "MIT",
|
||||||
|
"dependencies": {
|
||||||
|
"end-of-stream": "^1.4.1",
|
||||||
|
"inherits": "^2.0.3",
|
||||||
|
"readable-stream": "^3.1.1",
|
||||||
|
"stream-shift": "^1.0.2"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/ecdsa-sig-formatter": {
|
||||||
|
"version": "1.0.11",
|
||||||
|
"resolved": "https://registry.npmjs.org/ecdsa-sig-formatter/-/ecdsa-sig-formatter-1.0.11.tgz",
|
||||||
|
"integrity": "sha512-nagl3RYrbNv6kQkeJIpt6NJZy8twLB/2vtz6yN9Z4vRKHN4/QZJIEbqohALSgwKdnksuY3k5Addp5lg8sVoVcQ==",
|
||||||
|
"license": "Apache-2.0",
|
||||||
|
"dependencies": {
|
||||||
|
"safe-buffer": "^5.0.1"
|
||||||
|
}
|
||||||
|
},
|
||||||
"node_modules/end-of-stream": {
|
"node_modules/end-of-stream": {
|
||||||
"version": "1.4.5",
|
"version": "1.4.5",
|
||||||
"resolved": "https://registry.npmjs.org/end-of-stream/-/end-of-stream-1.4.5.tgz",
|
"resolved": "https://registry.npmjs.org/end-of-stream/-/end-of-stream-1.4.5.tgz",
|
||||||
|
|
@ -1025,6 +1220,12 @@
|
||||||
"node": ">=6"
|
"node": ">=6"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"node_modules/extend": {
|
||||||
|
"version": "3.0.2",
|
||||||
|
"resolved": "https://registry.npmjs.org/extend/-/extend-3.0.2.tgz",
|
||||||
|
"integrity": "sha512-fjquC59cD7CyW6urNXK0FBufkZcoiGG80wTuPujX590cB5Ttln20E2UB4S/WARVqhXffZl2LNgS+gQdPIIim/g==",
|
||||||
|
"license": "MIT"
|
||||||
|
},
|
||||||
"node_modules/fast-decode-uri-component": {
|
"node_modules/fast-decode-uri-component": {
|
||||||
"version": "1.0.1",
|
"version": "1.0.1",
|
||||||
"resolved": "https://registry.npmjs.org/fast-decode-uri-component/-/fast-decode-uri-component-1.0.1.tgz",
|
"resolved": "https://registry.npmjs.org/fast-decode-uri-component/-/fast-decode-uri-component-1.0.1.tgz",
|
||||||
|
|
@ -1144,6 +1345,29 @@
|
||||||
"reusify": "^1.0.4"
|
"reusify": "^1.0.4"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"node_modules/fetch-blob": {
|
||||||
|
"version": "3.2.0",
|
||||||
|
"resolved": "https://registry.npmjs.org/fetch-blob/-/fetch-blob-3.2.0.tgz",
|
||||||
|
"integrity": "sha512-7yAQpD2UMJzLi1Dqv7qFYnPbaPx7ZfFK6PiIxQ4PfkGPyNyl2Ugx+a/umUonmKqjhM4DnfbMvdX6otXq83soQQ==",
|
||||||
|
"funding": [
|
||||||
|
{
|
||||||
|
"type": "github",
|
||||||
|
"url": "https://github.com/sponsors/jimmywarting"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "paypal",
|
||||||
|
"url": "https://paypal.me/jimmywarting"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"license": "MIT",
|
||||||
|
"dependencies": {
|
||||||
|
"node-domexception": "^1.0.0",
|
||||||
|
"web-streams-polyfill": "^3.0.3"
|
||||||
|
},
|
||||||
|
"engines": {
|
||||||
|
"node": "^12.20 || >= 14.13"
|
||||||
|
}
|
||||||
|
},
|
||||||
"node_modules/file-uri-to-path": {
|
"node_modules/file-uri-to-path": {
|
||||||
"version": "1.0.0",
|
"version": "1.0.0",
|
||||||
"resolved": "https://registry.npmjs.org/file-uri-to-path/-/file-uri-to-path-1.0.0.tgz",
|
"resolved": "https://registry.npmjs.org/file-uri-to-path/-/file-uri-to-path-1.0.0.tgz",
|
||||||
|
|
@ -1164,6 +1388,18 @@
|
||||||
"node": ">=20"
|
"node": ">=20"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"node_modules/formdata-polyfill": {
|
||||||
|
"version": "4.0.10",
|
||||||
|
"resolved": "https://registry.npmjs.org/formdata-polyfill/-/formdata-polyfill-4.0.10.tgz",
|
||||||
|
"integrity": "sha512-buewHzMvYL29jdeQTVILecSaZKnt/RJWjoZCF5OW60Z67/GmSLBkOFM7qh1PI3zFNtJbaZL5eQu1vLfazOwj4g==",
|
||||||
|
"license": "MIT",
|
||||||
|
"dependencies": {
|
||||||
|
"fetch-blob": "^3.1.2"
|
||||||
|
},
|
||||||
|
"engines": {
|
||||||
|
"node": ">=12.20.0"
|
||||||
|
}
|
||||||
|
},
|
||||||
"node_modules/fs-constants": {
|
"node_modules/fs-constants": {
|
||||||
"version": "1.0.0",
|
"version": "1.0.0",
|
||||||
"resolved": "https://registry.npmjs.org/fs-constants/-/fs-constants-1.0.0.tgz",
|
"resolved": "https://registry.npmjs.org/fs-constants/-/fs-constants-1.0.0.tgz",
|
||||||
|
|
@ -1184,12 +1420,100 @@
|
||||||
"node": "^8.16.0 || ^10.6.0 || >=11.0.0"
|
"node": "^8.16.0 || ^10.6.0 || >=11.0.0"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"node_modules/gaxios": {
|
||||||
|
"version": "7.1.4",
|
||||||
|
"resolved": "https://registry.npmjs.org/gaxios/-/gaxios-7.1.4.tgz",
|
||||||
|
"integrity": "sha512-bTIgTsM2bWn3XklZISBTQX7ZSddGW+IO3bMdGaemHZ3tbqExMENHLx6kKZ/KlejgrMtj8q7wBItt51yegqalrA==",
|
||||||
|
"license": "Apache-2.0",
|
||||||
|
"dependencies": {
|
||||||
|
"extend": "^3.0.2",
|
||||||
|
"https-proxy-agent": "^7.0.1",
|
||||||
|
"node-fetch": "^3.3.2"
|
||||||
|
},
|
||||||
|
"engines": {
|
||||||
|
"node": ">=18"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/gaxios/node_modules/node-fetch": {
|
||||||
|
"version": "3.3.2",
|
||||||
|
"resolved": "https://registry.npmjs.org/node-fetch/-/node-fetch-3.3.2.tgz",
|
||||||
|
"integrity": "sha512-dRB78srN/l6gqWulah9SrxeYnxeddIG30+GOqK/9OlLVyLg3HPnr6SqOWTWOXKRwC2eGYCkZ59NNuSgvSrpgOA==",
|
||||||
|
"license": "MIT",
|
||||||
|
"dependencies": {
|
||||||
|
"data-uri-to-buffer": "^4.0.0",
|
||||||
|
"fetch-blob": "^3.1.4",
|
||||||
|
"formdata-polyfill": "^4.0.10"
|
||||||
|
},
|
||||||
|
"engines": {
|
||||||
|
"node": "^12.20.0 || ^14.13.1 || >=16.0.0"
|
||||||
|
},
|
||||||
|
"funding": {
|
||||||
|
"type": "opencollective",
|
||||||
|
"url": "https://opencollective.com/node-fetch"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/gcp-metadata": {
|
||||||
|
"version": "8.1.2",
|
||||||
|
"resolved": "https://registry.npmjs.org/gcp-metadata/-/gcp-metadata-8.1.2.tgz",
|
||||||
|
"integrity": "sha512-zV/5HKTfCeKWnxG0Dmrw51hEWFGfcF2xiXqcA3+J90WDuP0SvoiSO5ORvcBsifmx/FoIjgQN3oNOGaQ5PhLFkg==",
|
||||||
|
"license": "Apache-2.0",
|
||||||
|
"dependencies": {
|
||||||
|
"gaxios": "^7.0.0",
|
||||||
|
"google-logging-utils": "^1.0.0",
|
||||||
|
"json-bigint": "^1.0.0"
|
||||||
|
},
|
||||||
|
"engines": {
|
||||||
|
"node": ">=18"
|
||||||
|
}
|
||||||
|
},
|
||||||
"node_modules/github-from-package": {
|
"node_modules/github-from-package": {
|
||||||
"version": "0.0.0",
|
"version": "0.0.0",
|
||||||
"resolved": "https://registry.npmjs.org/github-from-package/-/github-from-package-0.0.0.tgz",
|
"resolved": "https://registry.npmjs.org/github-from-package/-/github-from-package-0.0.0.tgz",
|
||||||
"integrity": "sha512-SyHy3T1v2NUXn29OsWdxmK6RwHD+vkj3v8en8AOBZ1wBQ/hCAQ5bAQTD02kW4W9tUp/3Qh6J8r9EvntiyCmOOw==",
|
"integrity": "sha512-SyHy3T1v2NUXn29OsWdxmK6RwHD+vkj3v8en8AOBZ1wBQ/hCAQ5bAQTD02kW4W9tUp/3Qh6J8r9EvntiyCmOOw==",
|
||||||
"license": "MIT"
|
"license": "MIT"
|
||||||
},
|
},
|
||||||
|
"node_modules/google-auth-library": {
|
||||||
|
"version": "10.6.2",
|
||||||
|
"resolved": "https://registry.npmjs.org/google-auth-library/-/google-auth-library-10.6.2.tgz",
|
||||||
|
"integrity": "sha512-e27Z6EThmVNNvtYASwQxose/G57rkRuaRbQyxM2bvYLLX/GqWZ5chWq2EBoUchJbCc57eC9ArzO5wMsEmWftCw==",
|
||||||
|
"license": "Apache-2.0",
|
||||||
|
"dependencies": {
|
||||||
|
"base64-js": "^1.3.0",
|
||||||
|
"ecdsa-sig-formatter": "^1.0.11",
|
||||||
|
"gaxios": "^7.1.4",
|
||||||
|
"gcp-metadata": "8.1.2",
|
||||||
|
"google-logging-utils": "1.1.3",
|
||||||
|
"jws": "^4.0.0"
|
||||||
|
},
|
||||||
|
"engines": {
|
||||||
|
"node": ">=18"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/google-logging-utils": {
|
||||||
|
"version": "1.1.3",
|
||||||
|
"resolved": "https://registry.npmjs.org/google-logging-utils/-/google-logging-utils-1.1.3.tgz",
|
||||||
|
"integrity": "sha512-eAmLkjDjAFCVXg7A1unxHsLf961m6y17QFqXqAXGj/gVkKFrEICfStRfwUlGNfeCEjNRa32JEWOUTlYXPyyKvA==",
|
||||||
|
"license": "Apache-2.0",
|
||||||
|
"engines": {
|
||||||
|
"node": ">=14"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/html-entities": {
|
||||||
|
"version": "2.6.0",
|
||||||
|
"resolved": "https://registry.npmjs.org/html-entities/-/html-entities-2.6.0.tgz",
|
||||||
|
"integrity": "sha512-kig+rMn/QOVRvr7c86gQ8lWXq+Hkv6CbAH1hLu+RG338StTpE8Z0b44SDVaqVu7HGKf27frdmUYEs9hTUX/cLQ==",
|
||||||
|
"funding": [
|
||||||
|
{
|
||||||
|
"type": "github",
|
||||||
|
"url": "https://github.com/sponsors/mdevils"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "patreon",
|
||||||
|
"url": "https://patreon.com/mdevils"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"license": "MIT"
|
||||||
|
},
|
||||||
"node_modules/html-escaper": {
|
"node_modules/html-escaper": {
|
||||||
"version": "3.0.3",
|
"version": "3.0.3",
|
||||||
"resolved": "https://registry.npmjs.org/html-escaper/-/html-escaper-3.0.3.tgz",
|
"resolved": "https://registry.npmjs.org/html-escaper/-/html-escaper-3.0.3.tgz",
|
||||||
|
|
@ -1227,6 +1551,32 @@
|
||||||
"url": "https://github.com/fb55/entities?sponsor=1"
|
"url": "https://github.com/fb55/entities?sponsor=1"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"node_modules/http-proxy-agent": {
|
||||||
|
"version": "7.0.2",
|
||||||
|
"resolved": "https://registry.npmjs.org/http-proxy-agent/-/http-proxy-agent-7.0.2.tgz",
|
||||||
|
"integrity": "sha512-T1gkAiYYDWYx3V5Bmyu7HcfcvL7mUrTWiM6yOfa3PIphViJ/gFPbvidQ+veqSOHci/PxBcDabeUNCzpOODJZig==",
|
||||||
|
"license": "MIT",
|
||||||
|
"dependencies": {
|
||||||
|
"agent-base": "^7.1.0",
|
||||||
|
"debug": "^4.3.4"
|
||||||
|
},
|
||||||
|
"engines": {
|
||||||
|
"node": ">= 14"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/https-proxy-agent": {
|
||||||
|
"version": "7.0.6",
|
||||||
|
"resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-7.0.6.tgz",
|
||||||
|
"integrity": "sha512-vK9P5/iUfdl95AI+JVyUuIcVtd4ofvtrOr3HNtM2yxC9bnMbEdp3x01OhQNnjb8IJYi38VlTE3mBXwcfvywuSw==",
|
||||||
|
"license": "MIT",
|
||||||
|
"dependencies": {
|
||||||
|
"agent-base": "^7.1.2",
|
||||||
|
"debug": "4"
|
||||||
|
},
|
||||||
|
"engines": {
|
||||||
|
"node": ">= 14"
|
||||||
|
}
|
||||||
|
},
|
||||||
"node_modules/ieee754": {
|
"node_modules/ieee754": {
|
||||||
"version": "1.2.1",
|
"version": "1.2.1",
|
||||||
"resolved": "https://registry.npmjs.org/ieee754/-/ieee754-1.2.1.tgz",
|
"resolved": "https://registry.npmjs.org/ieee754/-/ieee754-1.2.1.tgz",
|
||||||
|
|
@ -1277,6 +1627,15 @@
|
||||||
"node": ">=0.10.0"
|
"node": ">=0.10.0"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"node_modules/json-bigint": {
|
||||||
|
"version": "1.0.0",
|
||||||
|
"resolved": "https://registry.npmjs.org/json-bigint/-/json-bigint-1.0.0.tgz",
|
||||||
|
"integrity": "sha512-SiPv/8VpZuWbvLSMtTDU8hEfrZWg/mH/nV/b4o0CYbSxu1UIQPLdwKOCIyLQX+VIPO5vrLX3i8qtqFyhdPSUSQ==",
|
||||||
|
"license": "MIT",
|
||||||
|
"dependencies": {
|
||||||
|
"bignumber.js": "^9.0.0"
|
||||||
|
}
|
||||||
|
},
|
||||||
"node_modules/json-schema-ref-resolver": {
|
"node_modules/json-schema-ref-resolver": {
|
||||||
"version": "3.0.0",
|
"version": "3.0.0",
|
||||||
"resolved": "https://registry.npmjs.org/json-schema-ref-resolver/-/json-schema-ref-resolver-3.0.0.tgz",
|
"resolved": "https://registry.npmjs.org/json-schema-ref-resolver/-/json-schema-ref-resolver-3.0.0.tgz",
|
||||||
|
|
@ -1302,6 +1661,27 @@
|
||||||
"integrity": "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug==",
|
"integrity": "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug==",
|
||||||
"license": "MIT"
|
"license": "MIT"
|
||||||
},
|
},
|
||||||
|
"node_modules/jwa": {
|
||||||
|
"version": "2.0.1",
|
||||||
|
"resolved": "https://registry.npmjs.org/jwa/-/jwa-2.0.1.tgz",
|
||||||
|
"integrity": "sha512-hRF04fqJIP8Abbkq5NKGN0Bbr3JxlQ+qhZufXVr0DvujKy93ZCbXZMHDL4EOtodSbCWxOqR8MS1tXA5hwqCXDg==",
|
||||||
|
"license": "MIT",
|
||||||
|
"dependencies": {
|
||||||
|
"buffer-equal-constant-time": "^1.0.1",
|
||||||
|
"ecdsa-sig-formatter": "1.0.11",
|
||||||
|
"safe-buffer": "^5.0.1"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/jws": {
|
||||||
|
"version": "4.0.1",
|
||||||
|
"resolved": "https://registry.npmjs.org/jws/-/jws-4.0.1.tgz",
|
||||||
|
"integrity": "sha512-EKI/M/yqPncGUUh44xz0PxSidXFr/+r0pA70+gIYhjv+et7yxM+s29Y+VGDkovRofQem0fs7Uvf4+YmAdyRduA==",
|
||||||
|
"license": "MIT",
|
||||||
|
"dependencies": {
|
||||||
|
"jwa": "^2.0.1",
|
||||||
|
"safe-buffer": "^5.0.1"
|
||||||
|
}
|
||||||
|
},
|
||||||
"node_modules/light-my-request": {
|
"node_modules/light-my-request": {
|
||||||
"version": "6.6.0",
|
"version": "6.6.0",
|
||||||
"resolved": "https://registry.npmjs.org/light-my-request/-/light-my-request-6.6.0.tgz",
|
"resolved": "https://registry.npmjs.org/light-my-request/-/light-my-request-6.6.0.tgz",
|
||||||
|
|
@ -1390,6 +1770,12 @@
|
||||||
"integrity": "sha512-gKLcREMhtuZRwRAfqP3RFW+TK4JqApVBtOIftVgjuABpAtpxhPGaDcfvbhNvD0B8iD1oUr/txX35NjcaY6Ns/A==",
|
"integrity": "sha512-gKLcREMhtuZRwRAfqP3RFW+TK4JqApVBtOIftVgjuABpAtpxhPGaDcfvbhNvD0B8iD1oUr/txX35NjcaY6Ns/A==",
|
||||||
"license": "MIT"
|
"license": "MIT"
|
||||||
},
|
},
|
||||||
|
"node_modules/ms": {
|
||||||
|
"version": "2.1.3",
|
||||||
|
"resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz",
|
||||||
|
"integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==",
|
||||||
|
"license": "MIT"
|
||||||
|
},
|
||||||
"node_modules/nanoid": {
|
"node_modules/nanoid": {
|
||||||
"version": "3.3.11",
|
"version": "3.3.11",
|
||||||
"resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.3.11.tgz",
|
"resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.3.11.tgz",
|
||||||
|
|
@ -1435,6 +1821,26 @@
|
||||||
"node": ">=6.0.0"
|
"node": ">=6.0.0"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"node_modules/node-domexception": {
|
||||||
|
"version": "1.0.0",
|
||||||
|
"resolved": "https://registry.npmjs.org/node-domexception/-/node-domexception-1.0.0.tgz",
|
||||||
|
"integrity": "sha512-/jKZoMpw0F8GRwl4/eLROPA3cfcXtLApP0QzLmUT/HuPCZWyB7IY9ZrMeKw2O/nFIqPQB3PVM9aYm0F312AXDQ==",
|
||||||
|
"deprecated": "Use your platform's native DOMException instead",
|
||||||
|
"funding": [
|
||||||
|
{
|
||||||
|
"type": "github",
|
||||||
|
"url": "https://github.com/sponsors/jimmywarting"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "github",
|
||||||
|
"url": "https://paypal.me/jimmywarting"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"license": "MIT",
|
||||||
|
"engines": {
|
||||||
|
"node": ">=10.5.0"
|
||||||
|
}
|
||||||
|
},
|
||||||
"node_modules/node-fetch": {
|
"node_modules/node-fetch": {
|
||||||
"version": "2.7.0",
|
"version": "2.7.0",
|
||||||
"resolved": "https://registry.npmjs.org/node-fetch/-/node-fetch-2.7.0.tgz",
|
"resolved": "https://registry.npmjs.org/node-fetch/-/node-fetch-2.7.0.tgz",
|
||||||
|
|
@ -1707,6 +2113,19 @@
|
||||||
"node": ">=10"
|
"node": ">=10"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"node_modules/retry-request": {
|
||||||
|
"version": "8.0.2",
|
||||||
|
"resolved": "https://registry.npmjs.org/retry-request/-/retry-request-8.0.2.tgz",
|
||||||
|
"integrity": "sha512-JzFPAfklk1kjR1w76f0QOIhoDkNkSqW8wYKT08n9yysTmZfB+RQ2QoXoTAeOi1HD9ZipTyTAZg3c4pM/jeqgSw==",
|
||||||
|
"license": "MIT",
|
||||||
|
"dependencies": {
|
||||||
|
"extend": "^3.0.2",
|
||||||
|
"teeny-request": "^10.0.0"
|
||||||
|
},
|
||||||
|
"engines": {
|
||||||
|
"node": ">=18"
|
||||||
|
}
|
||||||
|
},
|
||||||
"node_modules/reusify": {
|
"node_modules/reusify": {
|
||||||
"version": "1.1.0",
|
"version": "1.1.0",
|
||||||
"resolved": "https://registry.npmjs.org/reusify/-/reusify-1.1.0.tgz",
|
"resolved": "https://registry.npmjs.org/reusify/-/reusify-1.1.0.tgz",
|
||||||
|
|
@ -2063,6 +2482,21 @@
|
||||||
"win32"
|
"win32"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
"node_modules/stream-events": {
|
||||||
|
"version": "1.0.5",
|
||||||
|
"resolved": "https://registry.npmjs.org/stream-events/-/stream-events-1.0.5.tgz",
|
||||||
|
"integrity": "sha512-E1GUzBSgvct8Jsb3v2X15pjzN1tYebtbLaMg+eBOUOAxgbLoSbT2NS91ckc5lJD1KfLjId+jXJRgo0qnV5Nerg==",
|
||||||
|
"license": "MIT",
|
||||||
|
"dependencies": {
|
||||||
|
"stubs": "^3.0.0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/stream-shift": {
|
||||||
|
"version": "1.0.3",
|
||||||
|
"resolved": "https://registry.npmjs.org/stream-shift/-/stream-shift-1.0.3.tgz",
|
||||||
|
"integrity": "sha512-76ORR0DO1o1hlKwTbi/DM3EXWGf3ZJYO8cXX5RJwnul2DEg2oyoZyjLNoQM8WsvZiFKCRfC1O0J7iCvie3RZmQ==",
|
||||||
|
"license": "MIT"
|
||||||
|
},
|
||||||
"node_modules/string_decoder": {
|
"node_modules/string_decoder": {
|
||||||
"version": "1.3.0",
|
"version": "1.3.0",
|
||||||
"resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.3.0.tgz",
|
"resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.3.0.tgz",
|
||||||
|
|
@ -2081,6 +2515,12 @@
|
||||||
"node": ">=0.10.0"
|
"node": ">=0.10.0"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"node_modules/stubs": {
|
||||||
|
"version": "3.0.0",
|
||||||
|
"resolved": "https://registry.npmjs.org/stubs/-/stubs-3.0.0.tgz",
|
||||||
|
"integrity": "sha512-PdHt7hHUJKxvTCgbKX9C1V/ftOcjJQgz8BZwNfV5c4B6dcGqlpelTbJ999jBGZ2jYiPAwcX5dP6oBwVlBlUbxw==",
|
||||||
|
"license": "MIT"
|
||||||
|
},
|
||||||
"node_modules/tar-fs": {
|
"node_modules/tar-fs": {
|
||||||
"version": "2.1.4",
|
"version": "2.1.4",
|
||||||
"resolved": "https://registry.npmjs.org/tar-fs/-/tar-fs-2.1.4.tgz",
|
"resolved": "https://registry.npmjs.org/tar-fs/-/tar-fs-2.1.4.tgz",
|
||||||
|
|
@ -2109,6 +2549,39 @@
|
||||||
"node": ">=6"
|
"node": ">=6"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"node_modules/teeny-request": {
|
||||||
|
"version": "10.1.2",
|
||||||
|
"resolved": "https://registry.npmjs.org/teeny-request/-/teeny-request-10.1.2.tgz",
|
||||||
|
"integrity": "sha512-Xj0ZAQ0CeuQn6UxCDPLbFRlgcSTUEyO3+wiepr2grjIjyL/lMMs1Z4OwXn8kLvn/V1OuaEP0UY7Na6UDNNsYrQ==",
|
||||||
|
"license": "Apache-2.0",
|
||||||
|
"dependencies": {
|
||||||
|
"http-proxy-agent": "^7.0.0",
|
||||||
|
"https-proxy-agent": "^7.0.1",
|
||||||
|
"node-fetch": "^3.3.2",
|
||||||
|
"stream-events": "^1.0.5"
|
||||||
|
},
|
||||||
|
"engines": {
|
||||||
|
"node": ">=18"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/teeny-request/node_modules/node-fetch": {
|
||||||
|
"version": "3.3.2",
|
||||||
|
"resolved": "https://registry.npmjs.org/node-fetch/-/node-fetch-3.3.2.tgz",
|
||||||
|
"integrity": "sha512-dRB78srN/l6gqWulah9SrxeYnxeddIG30+GOqK/9OlLVyLg3HPnr6SqOWTWOXKRwC2eGYCkZ59NNuSgvSrpgOA==",
|
||||||
|
"license": "MIT",
|
||||||
|
"dependencies": {
|
||||||
|
"data-uri-to-buffer": "^4.0.0",
|
||||||
|
"fetch-blob": "^3.1.4",
|
||||||
|
"formdata-polyfill": "^4.0.10"
|
||||||
|
},
|
||||||
|
"engines": {
|
||||||
|
"node": "^12.20.0 || ^14.13.1 || >=16.0.0"
|
||||||
|
},
|
||||||
|
"funding": {
|
||||||
|
"type": "opencollective",
|
||||||
|
"url": "https://opencollective.com/node-fetch"
|
||||||
|
}
|
||||||
|
},
|
||||||
"node_modules/thread-stream": {
|
"node_modules/thread-stream": {
|
||||||
"version": "4.0.0",
|
"version": "4.0.0",
|
||||||
"resolved": "https://registry.npmjs.org/thread-stream/-/thread-stream-4.0.0.tgz",
|
"resolved": "https://registry.npmjs.org/thread-stream/-/thread-stream-4.0.0.tgz",
|
||||||
|
|
@ -2167,6 +2640,15 @@
|
||||||
"integrity": "sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw==",
|
"integrity": "sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw==",
|
||||||
"license": "MIT"
|
"license": "MIT"
|
||||||
},
|
},
|
||||||
|
"node_modules/web-streams-polyfill": {
|
||||||
|
"version": "3.3.3",
|
||||||
|
"resolved": "https://registry.npmjs.org/web-streams-polyfill/-/web-streams-polyfill-3.3.3.tgz",
|
||||||
|
"integrity": "sha512-d2JWLCivmZYTSIoge9MsgFCZrt571BikcWGYkjC1khllbTeDlGqZ2D8vD8E/lJa8WGWbb7Plm8/XJYV7IJHZZw==",
|
||||||
|
"license": "MIT",
|
||||||
|
"engines": {
|
||||||
|
"node": ">= 8"
|
||||||
|
}
|
||||||
|
},
|
||||||
"node_modules/webidl-conversions": {
|
"node_modules/webidl-conversions": {
|
||||||
"version": "3.0.1",
|
"version": "3.0.1",
|
||||||
"resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-3.0.1.tgz",
|
"resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-3.0.1.tgz",
|
||||||
|
|
|
||||||
|
|
@ -13,6 +13,7 @@
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"@extractus/article-extractor": "^8.0.18",
|
"@extractus/article-extractor": "^8.0.18",
|
||||||
"@fastify/cors": "^11.2.0",
|
"@fastify/cors": "^11.2.0",
|
||||||
|
"@google-cloud/bigquery": "^8.1.1",
|
||||||
"better-sqlite3": "^12.4.1",
|
"better-sqlite3": "^12.4.1",
|
||||||
"fastify": "^5.6.1",
|
"fastify": "^5.6.1",
|
||||||
"node-cron": "^4.2.1",
|
"node-cron": "^4.2.1",
|
||||||
|
|
|
||||||
648
sources.json
Normal file
648
sources.json
Normal file
|
|
@ -0,0 +1,648 @@
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"id": "al_jazeera",
|
||||||
|
"label": "Al Jazeera",
|
||||||
|
"feedUrl": "https://www.aljazeera.com/xml/rss/all.xml",
|
||||||
|
"website": "aljazeera.com",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "bbc",
|
||||||
|
"label": "BBC",
|
||||||
|
"feedUrl": "https://feeds.bbci.co.uk/news/business/rss.xml",
|
||||||
|
"website": [
|
||||||
|
"bbc.com",
|
||||||
|
"bbc.co.uk"
|
||||||
|
],
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "business_insider",
|
||||||
|
"label": "Business Insider",
|
||||||
|
"feedUrl": "https://feeds.businessinsider.com/custom/all",
|
||||||
|
"website": "businessinsider.com",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "bloomberg_markets",
|
||||||
|
"label": "Bloomberg Markets",
|
||||||
|
"feedUrl": "https://feeds.bloomberg.com/markets/news.rss",
|
||||||
|
"website": "bloomberg.com",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "cnbc",
|
||||||
|
"label": "CNBC",
|
||||||
|
"feedUrl": "https://www.cnbc.com/id/100003114/device/rss/rss.html",
|
||||||
|
"website": "cnbc.com",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "wall_street_journal",
|
||||||
|
"label": "Wall Street Journal",
|
||||||
|
"feedUrl": "https://feeds.a.dj.com/rss/RSSMarketsMain.xml",
|
||||||
|
"website": "wsj.com",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "marketwatch",
|
||||||
|
"label": "MarketWatch",
|
||||||
|
"feedUrl": "https://feeds.marketwatch.com/marketwatch/topstories/",
|
||||||
|
"website": "marketwatch.com",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "yahoo_finance",
|
||||||
|
"label": "Yahoo Finance",
|
||||||
|
"feedUrl": "https://finance.yahoo.com/news/rssindex",
|
||||||
|
"website": [
|
||||||
|
"finance.yahoo.com",
|
||||||
|
"yahoo.com"
|
||||||
|
],
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "seeking_alpha",
|
||||||
|
"label": "Seeking Alpha",
|
||||||
|
"feedUrl": "https://seekingalpha.com/feed.xml",
|
||||||
|
"website": "seekingalpha.com",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "financial_times",
|
||||||
|
"label": "Financial Times",
|
||||||
|
"feedUrl": "https://www.ft.com/?format=rss",
|
||||||
|
"website": "ft.com",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "the_economist",
|
||||||
|
"label": "The Economist",
|
||||||
|
"feedUrl": "https://www.economist.com/finance-and-economics/rss.xml",
|
||||||
|
"website": "economist.com",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "fortune",
|
||||||
|
"label": "Fortune",
|
||||||
|
"feedUrl": "https://fortune.com/feed",
|
||||||
|
"website": "fortune.com",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "forbes_business",
|
||||||
|
"label": "Forbes Business",
|
||||||
|
"feedUrl": "https://www.forbes.com/business/feed/",
|
||||||
|
"website": "forbes.com",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "inc_magazine",
|
||||||
|
"label": "Inc Magazine",
|
||||||
|
"feedUrl": "https://www.inc.com/rss",
|
||||||
|
"website": "inc.com",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "fast_company",
|
||||||
|
"label": "Fast Company",
|
||||||
|
"feedUrl": "https://www.fastcompany.com/latest/rss",
|
||||||
|
"website": "fastcompany.com",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "entrepreneur",
|
||||||
|
"label": "Entrepreneur",
|
||||||
|
"feedUrl": "https://www.entrepreneur.com/latest.rss",
|
||||||
|
"website": "entrepreneur.com",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "axios",
|
||||||
|
"label": "Axios",
|
||||||
|
"feedUrl": "https://api.axios.com/feed/",
|
||||||
|
"website": "axios.com",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "wired_business",
|
||||||
|
"label": "Wired Business",
|
||||||
|
"feedUrl": "https://www.wired.com/feed/category/business/latest/rss",
|
||||||
|
"website": "wired.com",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "npr_business",
|
||||||
|
"label": "NPR Business",
|
||||||
|
"feedUrl": "https://feeds.npr.org/1006/rss.xml",
|
||||||
|
"website": "npr.org",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "federal_reserve",
|
||||||
|
"label": "Federal Reserve",
|
||||||
|
"feedUrl": "https://www.federalreserve.gov/feeds/press_all.xml",
|
||||||
|
"website": "federalreserve.gov",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "techcrunch",
|
||||||
|
"label": "TechCrunch",
|
||||||
|
"feedUrl": "https://techcrunch.com/feed/",
|
||||||
|
"website": "techcrunch.com",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "the_verge",
|
||||||
|
"label": "The Verge",
|
||||||
|
"feedUrl": "https://www.theverge.com/rss/index.xml",
|
||||||
|
"website": "theverge.com",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "ars_technica",
|
||||||
|
"label": "Ars Technica",
|
||||||
|
"feedUrl": "https://feeds.arstechnica.com/arstechnica/index",
|
||||||
|
"website": "arstechnica.com",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "retail_dive",
|
||||||
|
"label": "Retail Dive",
|
||||||
|
"feedUrl": "https://www.retaildive.com/feeds/news/",
|
||||||
|
"website": "retaildive.com",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "manufacturing_dive",
|
||||||
|
"label": "Manufacturing Dive",
|
||||||
|
"feedUrl": "https://www.manufacturingdive.com/feeds/news/",
|
||||||
|
"website": "manufacturingdive.com",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "banking_dive",
|
||||||
|
"label": "Banking Dive",
|
||||||
|
"feedUrl": "https://www.bankingdive.com/feeds/news/",
|
||||||
|
"website": "bankingdive.com",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "financial_post_ca",
|
||||||
|
"label": "Financial Post CA",
|
||||||
|
"feedUrl": "https://financialpost.com/feed",
|
||||||
|
"website": "financialpost.com",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "globe_and_mail",
|
||||||
|
"label": "Globe and Mail",
|
||||||
|
"feedUrl": "https://www.theglobeandmail.com/arc/outboundfeeds/rss/category/business/",
|
||||||
|
"website": "theglobeandmail.com",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "guardian_business",
|
||||||
|
"label": "Guardian Business",
|
||||||
|
"feedUrl": "https://www.theguardian.com/uk/business/rss",
|
||||||
|
"website": "theguardian.com",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "sky_news_business",
|
||||||
|
"label": "Sky News Business",
|
||||||
|
"feedUrl": "https://feeds.skynews.com/feeds/rss/business.xml",
|
||||||
|
"website": "skynews.com",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "this_is_money",
|
||||||
|
"label": "This Is Money",
|
||||||
|
"feedUrl": "[FAILED] https://www.thisismoney.co.uk/money/news/index.rss",
|
||||||
|
"website": "thisismoney.co.uk",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "city_a_m",
|
||||||
|
"label": "City A.M.",
|
||||||
|
"feedUrl": "https://www.cityam.com/feed/",
|
||||||
|
"website": "cityam.com",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "spiegel_wirtschaft",
|
||||||
|
"label": "Spiegel Wirtschaft",
|
||||||
|
"feedUrl": "https://www.spiegel.de/wirtschaft/index.rss",
|
||||||
|
"website": "spiegel.de",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "handelsblatt",
|
||||||
|
"label": "Handelsblatt",
|
||||||
|
"feedUrl": "https://www.handelsblatt.com/contentexport/feed/schlagzeilen",
|
||||||
|
"website": "handelsblatt.com",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "faz_wirtschaft",
|
||||||
|
"label": "FAZ Wirtschaft",
|
||||||
|
"feedUrl": "https://www.faz.net/rss/aktuell/wirtschaft/",
|
||||||
|
"website": "faz.net",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "die_welt_wirtschaft",
|
||||||
|
"label": "Die Welt Wirtschaft",
|
||||||
|
"feedUrl": "https://www.welt.de/feeds/section/wirtschaft.rss",
|
||||||
|
"website": "welt.de",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "les_echos",
|
||||||
|
"label": "Les Echos",
|
||||||
|
"feedUrl": "[FAILED] https://feeds.lesechos.fr/rss/rss_la_une.xml",
|
||||||
|
"website": "lesechos.fr",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "le_monde_economie",
|
||||||
|
"label": "Le Monde Economie",
|
||||||
|
"feedUrl": "https://www.lemonde.fr/economie/rss_full.xml",
|
||||||
|
"website": "lemonde.fr",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "bfm_business",
|
||||||
|
"label": "BFM Business",
|
||||||
|
"feedUrl": "[FAILED] https://bfmbusiness.bfmtv.com/rss/news-flux-rss/",
|
||||||
|
"website": "bfmbusiness.bfmtv.com",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "el_economista_es",
|
||||||
|
"label": "El Economista ES",
|
||||||
|
"feedUrl": "[FAILED] https://www.eleconomista.es/rss/rss-de-portada.php",
|
||||||
|
"website": "eleconomista.es",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "expansion_es",
|
||||||
|
"label": "Expansion ES",
|
||||||
|
"feedUrl": "https://e00-expansion.uecdn.es/rss/portada.xml",
|
||||||
|
"website": "expansion.com",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "cinco_dias",
|
||||||
|
"label": "Cinco Dias",
|
||||||
|
"feedUrl": "[FAILED] https://cincodias.elpais.com/rss/cincodias/ultima_hora_mercados.xml",
|
||||||
|
"website": "cincodias.elpais.com",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "il_sole_24_ore",
|
||||||
|
"label": "Il Sole 24 Ore",
|
||||||
|
"feedUrl": "[FAILED] https://www.ilsole24ore.com/rss/economia--finanza.xml",
|
||||||
|
"website": "ilsole24ore.com",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "fd_nl",
|
||||||
|
"label": "FD.nl",
|
||||||
|
"feedUrl": "[FAILED] https://fd.nl/rss",
|
||||||
|
"website": "fd.nl",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "nzz_wirtschaft",
|
||||||
|
"label": "NZZ Wirtschaft",
|
||||||
|
"feedUrl": "https://www.nzz.ch/wirtschaft.rss",
|
||||||
|
"website": "nzz.ch",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "moscow_times",
|
||||||
|
"label": "Moscow Times",
|
||||||
|
"feedUrl": "https://www.themoscowtimes.com/rss/news",
|
||||||
|
"website": "themoscowtimes.com",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "rbc_russia",
|
||||||
|
"label": "RBC Russia",
|
||||||
|
"feedUrl": "https://rssexport.rbc.ru/rbcnews/news/30/full.rss",
|
||||||
|
"website": "rbc.ru",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "economic_times_india",
|
||||||
|
"label": "Economic Times India",
|
||||||
|
"feedUrl": "https://economictimes.indiatimes.com/rssfeedstopstories.cms",
|
||||||
|
"website": "economictimes.indiatimes.com",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "business_standard_in",
|
||||||
|
"label": "Business Standard IN",
|
||||||
|
"feedUrl": "https://www.business-standard.com/rss/home_page_top_stories.rss",
|
||||||
|
"website": "business-standard.com",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "live_mint",
|
||||||
|
"label": "Live Mint",
|
||||||
|
"feedUrl": "[FAILED] https://www.livemint.com/rss/headlines",
|
||||||
|
"website": "livemint.com",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "moneycontrol",
|
||||||
|
"label": "Moneycontrol",
|
||||||
|
"feedUrl": "https://www.moneycontrol.com/rss/MCtopnews.xml",
|
||||||
|
"website": "moneycontrol.com",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "hindu_business_line",
|
||||||
|
"label": "Hindu Business Line",
|
||||||
|
"feedUrl": "https://www.thehindubusinessline.com/feeder/default.rss",
|
||||||
|
"website": "thehindubusinessline.com",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "caixin_global",
|
||||||
|
"label": "Caixin Global",
|
||||||
|
"feedUrl": "[FAILED] https://www.caixinglobal.com/rss/newsfeeds/",
|
||||||
|
"website": "caixinglobal.com",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "china_daily_business",
|
||||||
|
"label": "China Daily Business",
|
||||||
|
"feedUrl": "https://www.chinadaily.com.cn/rss/bizchina_rss.xml",
|
||||||
|
"website": "chinadaily.com.cn",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "xinhua_business",
|
||||||
|
"label": "Xinhua Business",
|
||||||
|
"feedUrl": "[FAILED] https://english.news.cn/rss/business.xml",
|
||||||
|
"website": "news.cn",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "south_china_morning_post",
|
||||||
|
"label": "South China Morning Post",
|
||||||
|
"feedUrl": "https://www.scmp.com/rss/91/feed",
|
||||||
|
"website": "scmp.com",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "nikkei_asia",
|
||||||
|
"label": "Nikkei Asia",
|
||||||
|
"feedUrl": "https://asia.nikkei.com/rss/feed/nar",
|
||||||
|
"website": "asia.nikkei.com",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "japan_times_business",
|
||||||
|
"label": "Japan Times Business",
|
||||||
|
"feedUrl": "[FAILED] https://www.japantimes.co.jp/feed/business/",
|
||||||
|
"website": "japantimes.co.jp",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "korea_herald",
|
||||||
|
"label": "Korea Herald",
|
||||||
|
"feedUrl": "https://www.koreaherald.com/rss/010000000000.xml",
|
||||||
|
"website": "koreaherald.com",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "korea_joongang_daily",
|
||||||
|
"label": "Korea JoongAng Daily",
|
||||||
|
"feedUrl": "[FAILED] https://koreajoongangdaily.joins.com/rss/",
|
||||||
|
"website": "koreajoongangdaily.joins.com",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "business_times_sg",
|
||||||
|
"label": "Business Times SG",
|
||||||
|
"feedUrl": "https://www.businesstimes.com.sg/rss.xml",
|
||||||
|
"website": "businesstimes.com.sg",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "straits_times_business",
|
||||||
|
"label": "Straits Times Business",
|
||||||
|
"feedUrl": "https://www.straitstimes.com/news/business/rss.xml",
|
||||||
|
"website": "straitstimes.com",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "channel_newsasia",
|
||||||
|
"label": "Channel NewsAsia",
|
||||||
|
"feedUrl": "https://www.channelnewsasia.com/rssfeeds/8395986",
|
||||||
|
"website": "channelnewsasia.com",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "bangkok_post_business",
|
||||||
|
"label": "Bangkok Post Business",
|
||||||
|
"feedUrl": "https://www.bangkokpost.com/rss/data/business.xml",
|
||||||
|
"website": "bangkokpost.com",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "the_star_malaysia",
|
||||||
|
"label": "The Star Malaysia",
|
||||||
|
"feedUrl": "[FAILED] https://www.thestar.com.my/rss/Business/Business-News",
|
||||||
|
"website": "thestar.com.my",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "australian_fin_review",
|
||||||
|
"label": "Australian Fin Review",
|
||||||
|
"feedUrl": "[FAILED] https://www.afr.com/rss",
|
||||||
|
"website": "afr.com",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "abc_business_au",
|
||||||
|
"label": "ABC Business AU",
|
||||||
|
"feedUrl": "[FAILED] https://www.abc.net.au/news/feed/52278/rss.xml",
|
||||||
|
"website": "abc.net.au",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "nz_herald_business",
|
||||||
|
"label": "NZ Herald Business",
|
||||||
|
"feedUrl": "https://www.nzherald.co.nz/arc/outboundfeeds/rss/section/business/",
|
||||||
|
"website": "nzherald.co.nz",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "arabian_business",
|
||||||
|
"label": "Arabian Business",
|
||||||
|
"feedUrl": "[FAILED] https://www.arabianbusiness.com/rss.xml",
|
||||||
|
"website": "arabianbusiness.com",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "gulf_news_business",
|
||||||
|
"label": "Gulf News Business",
|
||||||
|
"feedUrl": "[FAILED] https://gulfnews.com/rss/business",
|
||||||
|
"website": "gulfnews.com",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "arab_news",
|
||||||
|
"label": "Arab News",
|
||||||
|
"feedUrl": "[FAILED] https://www.arabnews.com/rss/front_page.xml",
|
||||||
|
"website": "arabnews.com",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "the_national_uae",
|
||||||
|
"label": "The National UAE",
|
||||||
|
"feedUrl": "https://www.thenationalnews.com/arc/outboundfeeds/rss/?outputType=xml",
|
||||||
|
"website": "thenationalnews.com",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "businessday_nigeria",
|
||||||
|
"label": "BusinessDay Nigeria",
|
||||||
|
"feedUrl": "https://businessday.ng/feed/",
|
||||||
|
"website": "businessday.ng",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "moneyweb_sa",
|
||||||
|
"label": "Moneyweb SA",
|
||||||
|
"feedUrl": "https://www.moneyweb.co.za/feed/",
|
||||||
|
"website": "moneyweb.co.za",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "businesslive_sa",
|
||||||
|
"label": "BusinessLive SA",
|
||||||
|
"feedUrl": "[FAILED] https://www.businesslive.co.za/rss/bd/",
|
||||||
|
"website": "businesslive.co.za",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "business_daily_africa",
|
||||||
|
"label": "Business Daily Africa",
|
||||||
|
"feedUrl": "[FAILED] https://www.businessdailyafrica.com/rss/",
|
||||||
|
"website": "businessdailyafrica.com",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "vanguard_business_ng",
|
||||||
|
"label": "Vanguard Business NG",
|
||||||
|
"feedUrl": "https://www.vanguardngr.com/category/business/feed/",
|
||||||
|
"website": "vanguardngr.com",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "folha_mercado_br",
|
||||||
|
"label": "Folha Mercado BR",
|
||||||
|
"feedUrl": "https://feeds.folha.uol.com.br/mercado/rss091.xml",
|
||||||
|
"website": "folha.uol.com.br",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "g1_economia_br",
|
||||||
|
"label": "G1 Economia BR",
|
||||||
|
"feedUrl": "https://g1.globo.com/dynamo/economia/rss2.xml",
|
||||||
|
"website": "g1.globo.com",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "exame_br",
|
||||||
|
"label": "Exame BR",
|
||||||
|
"feedUrl": "https://exame.com/feed/",
|
||||||
|
"website": "exame.com",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "el_economista_mx",
|
||||||
|
"label": "El Economista MX",
|
||||||
|
"feedUrl": "[FAILED] https://www.eleconomista.com.mx/rss/rss.html",
|
||||||
|
"website": "eleconomista.com.mx",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "expansion_mx",
|
||||||
|
"label": "Expansion MX",
|
||||||
|
"feedUrl": "https://expansion.mx/rss",
|
||||||
|
"website": "expansion.mx",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "la_nacion_ar",
|
||||||
|
"label": "La Nacion AR",
|
||||||
|
"feedUrl": "https://www.lanacion.com.ar/arc/outboundfeeds/rss/category/economia/",
|
||||||
|
"website": "lanacion.com.ar",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "infobae_economia_ar",
|
||||||
|
"label": "Infobae Economia AR",
|
||||||
|
"feedUrl": "[FAILED] https://www.infobae.com/feeds/rss/economia/",
|
||||||
|
"website": "infobae.com",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "portafolio_colombia",
|
||||||
|
"label": "Portafolio Colombia",
|
||||||
|
"feedUrl": "[FAILED] https://www.portafolio.co/rss/portafolio.xml",
|
||||||
|
"website": "portafolio.co",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "el_comercio_peru",
|
||||||
|
"label": "El Comercio Peru",
|
||||||
|
"feedUrl": "[FAILED] https://elcomercio.pe/arc/outboundfeeds/rss/section/economia/",
|
||||||
|
"website": "elcomercio.pe",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "jamaica_gleaner",
|
||||||
|
"label": "Jamaica Gleaner",
|
||||||
|
"feedUrl": "https://jamaica-gleaner.com/feed/business.xml",
|
||||||
|
"website": [
|
||||||
|
"jamaica-gleaner.com",
|
||||||
|
"jamaicagleaner.com"
|
||||||
|
],
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "jamaica_observer",
|
||||||
|
"label": "Jamaica Observer",
|
||||||
|
"feedUrl": "https://www.jamaicaobserver.com/app/business/",
|
||||||
|
"website": "jamaicaobserver.com",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "stabroek_news",
|
||||||
|
"label": "Stabroek News",
|
||||||
|
"feedUrl": "[FAILED] https://www.stabroeknews.com/feed/",
|
||||||
|
"website": "stabroeknews.com",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "nation_news_barbados",
|
||||||
|
"label": "Nation News Barbados",
|
||||||
|
"feedUrl": "[FAILED] https://nationnews.com/rss-feed/",
|
||||||
|
"website": "nationnews.com",
|
||||||
|
"backfill": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "google_news",
|
||||||
|
"label": "Google News",
|
||||||
|
"feedUrl": "https://news.google.com/rss?hl=en-GB&gl=GB&ceid=GB:en",
|
||||||
|
"website": "news.google.com",
|
||||||
|
"backfill": false
|
||||||
|
}
|
||||||
|
]
|
||||||
|
|
@ -1,8 +1,9 @@
|
||||||
const { extract } = require('@extractus/article-extractor');
|
const { extractFromHtml } = require('@extractus/article-extractor');
|
||||||
const sharp = require('sharp');
|
const sharp = require('sharp');
|
||||||
const db = require('./db');
|
const db = require('./db');
|
||||||
const { generateAndStoreEmbedding } = require('./embeddings');
|
const { generateAndStoreEmbedding } = require('./embeddings');
|
||||||
const { fetchWithPolicy } = require('./http');
|
const { fetchWithPolicy } = require('./http');
|
||||||
|
const { getSharedBrowserSession } = require('./sources/browserCrawler');
|
||||||
|
|
||||||
const updateArticleAssets = db.prepare(`
|
const updateArticleAssets = db.prepare(`
|
||||||
UPDATE articles
|
UPDATE articles
|
||||||
|
|
@ -40,32 +41,7 @@ const selectArticlesMissingContent = db.prepare(`
|
||||||
LIMIT ?
|
LIMIT ?
|
||||||
`);
|
`);
|
||||||
|
|
||||||
const blockedContentDomains = [
|
|
||||||
'axios.com',
|
|
||||||
'bizjournals.com',
|
|
||||||
'fastcompany.com',
|
|
||||||
'gurufocus.com',
|
|
||||||
'investing.com',
|
|
||||||
'rbc.ru',
|
|
||||||
'stocktitan.net',
|
|
||||||
];
|
|
||||||
const loggedBlockedDomains = new Set();
|
const loggedBlockedDomains = new Set();
|
||||||
const articleFetchHeaders = {
|
|
||||||
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36',
|
|
||||||
Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
|
|
||||||
'Accept-Language': 'en-US,en;q=0.9',
|
|
||||||
'Cache-Control': 'no-cache',
|
|
||||||
Pragma: 'no-cache',
|
|
||||||
'Upgrade-Insecure-Requests': '1',
|
|
||||||
'sec-ch-ua': '"Google Chrome";v="135", "Chromium";v="135", "Not.A/Brand";v="24"',
|
|
||||||
'sec-ch-ua-mobile': '?0',
|
|
||||||
'sec-ch-ua-platform': '"macOS"',
|
|
||||||
'Sec-Fetch-Dest': 'document',
|
|
||||||
'Sec-Fetch-Mode': 'navigate',
|
|
||||||
'Sec-Fetch-Site': 'none',
|
|
||||||
'Sec-Fetch-User': '?1',
|
|
||||||
};
|
|
||||||
|
|
||||||
let contentBackfillRunning = false;
|
let contentBackfillRunning = false;
|
||||||
|
|
||||||
function getHostname(url) {
|
function getHostname(url) {
|
||||||
|
|
@ -76,10 +52,6 @@ function getHostname(url) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
function isBlockedContentUrl(url) {
|
|
||||||
const hostname = getHostname(url);
|
|
||||||
return blockedContentDomains.some((domain) => hostname === domain || hostname.endsWith(`.${domain}`));
|
|
||||||
}
|
|
||||||
|
|
||||||
function getErrorStatus(error) {
|
function getErrorStatus(error) {
|
||||||
if (error && Number.isInteger(error.status)) {
|
if (error && Number.isInteger(error.status)) {
|
||||||
|
|
@ -147,20 +119,9 @@ async function fetchCompressedImage(url) {
|
||||||
|
|
||||||
async function fetchAndStoreContent(id, url) {
|
async function fetchAndStoreContent(id, url) {
|
||||||
try {
|
try {
|
||||||
if (isBlockedContentUrl(url)) {
|
const browserSession = await getSharedBrowserSession({ requestTimeout: 20000, maxConcurrentPages: 2 });
|
||||||
const hostname = getHostname(url);
|
const html = await browserSession.fetchRenderedHtml(url, { timeout: 20000 });
|
||||||
if (hostname && !loggedBlockedDomains.has(hostname)) {
|
const article = await extractFromHtml(html, url);
|
||||||
loggedBlockedDomains.add(hostname);
|
|
||||||
console.warn(`content extraction skipped for blocked domain ${hostname}`);
|
|
||||||
}
|
|
||||||
markArticleStatus(markContentSkipped, id, `blocked domain: ${hostname || 'unknown'}`);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
const article = await extract(url, {}, {
|
|
||||||
headers: articleFetchHeaders,
|
|
||||||
signal: AbortSignal.timeout(20000),
|
|
||||||
});
|
|
||||||
if (!article) {
|
if (!article) {
|
||||||
markArticleStatus(markContentSkipped, id, 'extractor returned no article');
|
markArticleStatus(markContentSkipped, id, 'extractor returned no article');
|
||||||
return;
|
return;
|
||||||
|
|
|
||||||
43
src/db.js
43
src/db.js
|
|
@ -127,6 +127,49 @@ db.exec(`
|
||||||
);
|
);
|
||||||
`);
|
`);
|
||||||
|
|
||||||
|
// Create the gdelt_backfill_windows table if it does not exist yet.
// Rows are keyed by (source_id, window_start, window_end); completed_at
// defaults to the SQLite datetime('now') value at insert time.
// NOTE(review): presumably one row is written per finished GDELT backfill
// window — confirm against the GDELT source module.
db.exec(`
|
||||||
|
CREATE TABLE IF NOT EXISTS gdelt_backfill_windows (
|
||||||
|
source_id TEXT NOT NULL,
|
||||||
|
window_start TEXT NOT NULL,
|
||||||
|
window_end TEXT NOT NULL,
|
||||||
|
completed_at TEXT NOT NULL DEFAULT (datetime('now')),
|
||||||
|
PRIMARY KEY (source_id, window_start, window_end)
|
||||||
|
);
|
||||||
|
`);
|
||||||
|
|
||||||
|
// Create the crawler_page_classifications table if it does not exist yet.
// One row per URL (primary key) holding the site name, the classification
// assigned to that URL, an optional pattern, and a classified_at timestamp
// that defaults to datetime('now').
db.exec(`
|
||||||
|
CREATE TABLE IF NOT EXISTS crawler_page_classifications (
|
||||||
|
url TEXT PRIMARY KEY,
|
||||||
|
site_name TEXT NOT NULL,
|
||||||
|
classification TEXT NOT NULL,
|
||||||
|
pattern TEXT,
|
||||||
|
classified_at TEXT NOT NULL DEFAULT (datetime('now'))
|
||||||
|
);
|
||||||
|
`);
|
||||||
|
|
||||||
|
// Create the crawler_url_patterns table if it does not exist yet.
// Rows are keyed by (site_name, pattern) and carry a classification, a
// hit_count that starts at 1, and an updated_at timestamp that defaults to
// datetime('now').
db.exec(`
|
||||||
|
CREATE TABLE IF NOT EXISTS crawler_url_patterns (
|
||||||
|
site_name TEXT NOT NULL,
|
||||||
|
pattern TEXT NOT NULL,
|
||||||
|
classification TEXT NOT NULL,
|
||||||
|
hit_count INTEGER NOT NULL DEFAULT 1,
|
||||||
|
updated_at TEXT NOT NULL DEFAULT (datetime('now')),
|
||||||
|
PRIMARY KEY (site_name, pattern)
|
||||||
|
);
|
||||||
|
`);
|
||||||
|
|
||||||
|
// Create the crawler_site_rules table if it does not exist yet.
// Rows are keyed by (site_name, rule_type, rule_value) and carry a
// classification, a hit_count that starts at 1, and an updated_at timestamp
// that defaults to datetime('now').
db.exec(`
|
||||||
|
CREATE TABLE IF NOT EXISTS crawler_site_rules (
|
||||||
|
site_name TEXT NOT NULL,
|
||||||
|
rule_type TEXT NOT NULL,
|
||||||
|
rule_value TEXT NOT NULL,
|
||||||
|
classification TEXT NOT NULL,
|
||||||
|
hit_count INTEGER NOT NULL DEFAULT 1,
|
||||||
|
updated_at TEXT NOT NULL DEFAULT (datetime('now')),
|
||||||
|
PRIMARY KEY (site_name, rule_type, rule_value)
|
||||||
|
);
|
||||||
|
`);
|
||||||
|
|
||||||
for (const statement of [
|
for (const statement of [
|
||||||
'ALTER TABLE articles ADD COLUMN image TEXT',
|
'ALTER TABLE articles ADD COLUMN image TEXT',
|
||||||
'ALTER TABLE articles ADD COLUMN content_status TEXT',
|
'ALTER TABLE articles ADD COLUMN content_status TEXT',
|
||||||
|
|
|
||||||
|
|
@ -2,14 +2,18 @@ const cron = require('node-cron');
|
||||||
const config = require('./config');
|
const config = require('./config');
|
||||||
const { ingestBatch } = require('./ingest');
|
const { ingestBatch } = require('./ingest');
|
||||||
const { fetchRssArticles } = require('./sources/rss');
|
const { fetchRssArticles } = require('./sources/rss');
|
||||||
const { fetchGdeltArticles } = require('./sources/gdelt');
|
const { fetchGdeltArticles, hasPendingWindows } = require('./sources/gdelt');
|
||||||
const { fetchEdgarArticles } = require('./sources/edgar');
|
const { fetchEdgarArticles } = require('./sources/edgar');
|
||||||
const { fetchAlphaVantageArticles } = require('./sources/alphavantage');
|
const { fetchAlphaVantageArticles } = require('./sources/alphavantage');
|
||||||
const { fetchFinnhubArticles } = require('./sources/finnhub');
|
const { fetchFinnhubArticles } = require('./sources/finnhub');
|
||||||
const { crawlSite, getConfiguredCrawlerSites } = require('./sources/newsCrawler');
|
const { fetchGoogleNewsArticles } = require('./sources/googleNews');
|
||||||
const { backfillMissingContent } = require('./content');
|
const { backfillMissingContent } = require('./content');
|
||||||
const { backfillMissingEmbeddings } = require('./embeddings');
|
const { backfillMissingEmbeddings } = require('./embeddings');
|
||||||
|
|
||||||
|
/**
 * Pause the calling async function for a fixed duration.
 *
 * @param {number} ms - Delay in milliseconds before the promise settles.
 * @returns {Promise<void>} Resolves (never rejects) once the timer fires.
 */
function sleep(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
|
||||||
|
|
||||||
async function runSource(source, fetcher) {
|
async function runSource(source, fetcher) {
|
||||||
try {
|
try {
|
||||||
const articles = await fetcher();
|
const articles = await fetcher();
|
||||||
|
|
@ -20,16 +24,6 @@ async function runSource(source, fetcher) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async function runCrawlerSources() {
|
|
||||||
const results = [];
|
|
||||||
|
|
||||||
for (const site of getConfiguredCrawlerSites()) {
|
|
||||||
results.push(await runSource(site.name, () => crawlSite(site)));
|
|
||||||
}
|
|
||||||
|
|
||||||
return results;
|
|
||||||
}
|
|
||||||
|
|
||||||
async function runAllIngestions() {
|
async function runAllIngestions() {
|
||||||
const results = [];
|
const results = [];
|
||||||
|
|
||||||
|
|
@ -38,7 +32,7 @@ async function runAllIngestions() {
|
||||||
results.push(await runSource('edgar', fetchEdgarArticles));
|
results.push(await runSource('edgar', fetchEdgarArticles));
|
||||||
results.push(await runSource('alphavantage', fetchAlphaVantageArticles));
|
results.push(await runSource('alphavantage', fetchAlphaVantageArticles));
|
||||||
results.push(await runSource('finnhub', fetchFinnhubArticles));
|
results.push(await runSource('finnhub', fetchFinnhubArticles));
|
||||||
results.push(...await runCrawlerSources());
|
results.push(await runSource('googlenews', fetchGoogleNewsArticles));
|
||||||
|
|
||||||
try {
|
try {
|
||||||
await backfillMissingContent();
|
await backfillMissingContent();
|
||||||
|
|
@ -60,8 +54,23 @@ function startScheduler() {
|
||||||
await runSource('rss', fetchRssArticles);
|
await runSource('rss', fetchRssArticles);
|
||||||
};
|
};
|
||||||
|
|
||||||
const runGdelt = async () => {
|
/**
 * Continuously drain pending GDELT backfill windows for the lifetime of the
 * process.
 *
 * Each pass checks `hasPendingWindows()`; when nothing is pending the loop
 * idles for one minute before checking again. With `config.gdelt.source` set
 * to `bigquery` (case-insensitive) articles are ingested through the callback
 * handed to `fetchGdeltArticles`; any other value (default `api`) goes through
 * `runSource`.
 *
 * The loop is started without `await` or `.catch()`, so a failure inside a
 * pass is logged and followed by a one-minute back-off rather than thrown: an
 * escaping error would end the loop permanently and surface as an unhandled
 * promise rejection.
 *
 * @returns {Promise<never>} Never settles under normal operation.
 */
const runGdeltLoop = async () => {
  const idleDelayMs = 60 * 1000;

  while (true) {
    try {
      if (!hasPendingWindows()) {
        await sleep(idleDelayMs);
        continue;
      }

      // Evaluated on every pass, exactly as before.
      const isBigQuery = String(config.gdelt?.source || 'api').toLowerCase() === 'bigquery';

      if (isBigQuery) {
        await fetchGdeltArticles(async (articles) => {
          await ingestBatch('gdelt', articles);
        });
      } else {
        await runSource('gdelt', fetchGdeltArticles);
      }
    } catch (error) {
      // Keep the loop alive and avoid hammering the upstream in a tight retry.
      console.error('gdelt backfill pass failed; retrying after back-off', error);
      await sleep(idleDelayMs);
    }
  }
};
|
||||||
|
|
||||||
const runEdgar = async () => {
|
const runEdgar = async () => {
|
||||||
|
|
@ -76,6 +85,10 @@ function startScheduler() {
|
||||||
await runSource('finnhub', fetchFinnhubArticles);
|
await runSource('finnhub', fetchFinnhubArticles);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Run one Google News ingestion pass through the shared runSource wrapper,
// using the 'googlenews' source label.
// NOTE(review): within startScheduler the only visible call to this helper is
// commented out and no cron.schedule entry references it — confirm whether
// Google News is meant to run on a schedule.
const runGoogleNews = async () => {
|
||||||
|
await runSource('googlenews', fetchGoogleNewsArticles);
|
||||||
|
};
|
||||||
|
|
||||||
const runContentMaintenance = async () => {
|
const runContentMaintenance = async () => {
|
||||||
try {
|
try {
|
||||||
await backfillMissingContent();
|
await backfillMissingContent();
|
||||||
|
|
@ -91,23 +104,18 @@ function startScheduler() {
|
||||||
};
|
};
|
||||||
|
|
||||||
runRss();
|
runRss();
|
||||||
runGdelt();
|
runGdeltLoop();
|
||||||
runEdgar();
|
runEdgar();
|
||||||
runAlphaVantage();
|
runAlphaVantage();
|
||||||
runFinnhub();
|
runFinnhub();
|
||||||
|
// runGoogleNews();
|
||||||
runContentMaintenance();
|
runContentMaintenance();
|
||||||
|
|
||||||
cron.schedule(config.scheduler.rss, runRss);
|
cron.schedule(config.scheduler.rss, runRss);
|
||||||
cron.schedule(config.scheduler.gdelt, runGdelt);
|
|
||||||
cron.schedule(config.scheduler.edgar, runEdgar);
|
cron.schedule(config.scheduler.edgar, runEdgar);
|
||||||
cron.schedule(config.scheduler.alphaVantage, runAlphaVantage);
|
cron.schedule(config.scheduler.alphaVantage, runAlphaVantage);
|
||||||
cron.schedule(config.scheduler.finnhub, runFinnhub);
|
cron.schedule(config.scheduler.finnhub, runFinnhub);
|
||||||
|
|
||||||
if (config.scheduler.newsCrawler) {
|
|
||||||
runCrawlerSources();
|
|
||||||
cron.schedule(config.scheduler.newsCrawler, runCrawlerSources);
|
|
||||||
}
|
|
||||||
|
|
||||||
cron.schedule(config.contentBackfill.cron, runContentMaintenance);
|
cron.schedule(config.contentBackfill.cron, runContentMaintenance);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,43 +1,131 @@
|
||||||
const { chromium } = require('playwright');
|
const { chromium } = require('playwright');
|
||||||
|
|
||||||
const BROWSER_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36';
|
const BROWSER_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36';
|
||||||
let browserPromise = null;
|
const MAX_RENDERED_HTML_LENGTH = 1_500_000;
|
||||||
|
const DEFAULT_REQUEST_TIMEOUT = 20000;
|
||||||
|
// Selectors for consent / cookie-banner accept controls.
// acceptConsentIfPresent walks this list in order and clicks the first
// match that is visible, so earlier entries take precedence.
const CONSENT_BUTTON_SELECTORS = [
|
||||||
|
'button[name="agree"]',
|
||||||
|
'input[name="agree"]',
|
||||||
|
'button:has-text("Accept all")',
|
||||||
|
'button:has-text("Accept All")',
|
||||||
|
'button:has-text("Accept")',
|
||||||
|
'button:has-text("Accept cookies")',
|
||||||
|
'button:has-text("I agree")',
|
||||||
|
'button:has-text("Agree")',
|
||||||
|
'button:has-text("Consent")',
|
||||||
|
'[data-action="agree"]',
|
||||||
|
'[data-testid="consent-accept"]',
|
||||||
|
'[data-testid="accept-button"]',
|
||||||
|
];
|
||||||
|
|
||||||
async function getBrowser() {
|
let sharedBrowserSessionPromise = null;
|
||||||
if (!browserPromise) {
|
let sharedBrowserShutdownInstalled = false;
|
||||||
browserPromise = chromium.launch({
|
|
||||||
headless: false,
|
/**
 * Coerce a caller-supplied timeout into a usable number of milliseconds.
 *
 * Values that are missing, non-numeric, or zero fall back to
 * `DEFAULT_REQUEST_TIMEOUT`; the result is then clamped to the range
 * 1000–30000 ms.
 *
 * @param {*} value - Requested timeout.
 * @returns {number} Timeout in milliseconds, always within 1000–30000.
 */
function normalizeTimeout(value) {
  const requested = Number(value) || DEFAULT_REQUEST_TIMEOUT;
  const capped = Math.min(requested, 30000);
  return Math.max(1000, capped);
}
|
||||||
|
|
||||||
return browserPromise;
|
/**
 * Give a freshly navigated page a short, bounded chance to finish rendering.
 *
 * Two best-effort waits run in sequence, each capped at 5 seconds (or less
 * when the normalized request timeout is smaller): first for the
 * `networkidle` load state, then for the document to contain more than 20
 * `a[href]` elements. A wait that times out or fails is ignored so the caller
 * can still use whatever has rendered.
 *
 * @param {object} page - Playwright page to wait on.
 * @param {number} [requestTimeout] - Caller's request timeout in milliseconds;
 *   passed through `normalizeTimeout` before the 5 second cap is applied.
 * @returns {Promise<void>}
 */
async function waitForUsefulDom(page, requestTimeout) {
  const timeout = Math.min(normalizeTimeout(requestTimeout), 5000);

  try {
    await page.waitForLoadState('networkidle', { timeout });
  } catch {
    // Best effort: continue with whatever has loaded.
  }

  try {
    // Playwright's signature is waitForFunction(pageFunction, arg, options).
    // The options object must be the third argument; passed second, it is
    // handed to the page function as its argument and the default timeout
    // applies instead of the intended cap.
    await page.waitForFunction(
      () => document.querySelectorAll('a[href]').length > 20,
      undefined,
      { timeout },
    );
  } catch {
    // Best effort: link-sparse pages are still usable.
  }
}
|
||||||
|
|
||||||
async function createBrowserSession(site) {
|
/**
 * Dismiss a consent / cookie interstitial when the current page looks like one.
 *
 * The page is only inspected when its URL hints at a consent flow: the
 * hostname contains `consent.` or `yahoo.com`, or the hostname or path matches
 * `cookie`, `consent`, or `privacy`. The selectors in
 * `CONSENT_BUTTON_SELECTORS` are then tried in order; the first visible match
 * is clicked, the function waits up to 5 seconds for `domcontentloaded`, and
 * returns. A selector whose visibility check or click throws is skipped.
 *
 * @param {object} page - Playwright page that has already navigated.
 * @returns {Promise<void>} Resolves whether or not a button was clicked.
 */
async function acceptConsentIfPresent(page) {
  let parsed;
  try {
    parsed = new URL(page.url());
  } catch {
    // No parseable URL to inspect, so there is nothing to decide on.
    return;
  }

  const consentHint = /cookie|consent|privacy/;
  const host = parsed.hostname.toLowerCase();
  const path = parsed.pathname.toLowerCase();
  const looksLikeConsentPage = [
    host.includes('consent.'),
    host.includes('yahoo.com'),
    consentHint.test(host),
    consentHint.test(path),
  ].some(Boolean);

  if (!looksLikeConsentPage) {
    return;
  }

  for (const selector of CONSENT_BUTTON_SELECTORS) {
    const button = page.locator(selector).first();
    let clicked = false;

    try {
      // NOTE(review): Playwright documents isVisible() as returning
      // immediately, so this timeout option may have no effect — confirm.
      if (await button.isVisible({ timeout: 750 })) {
        await button.click({ timeout: 2000 });
        clicked = true;
      }
    } catch {
      // This selector could not be checked or clicked; try the next one.
      continue;
    }

    if (!clicked) {
      continue;
    }

    try {
      await page.waitForLoadState('domcontentloaded', { timeout: 5000 });
    } catch {
      // Best effort: the click may not trigger a navigation.
    }
    return;
  }
}
|
||||||
|
|
||||||
|
async function buildBrowserSession(options = {}) {
|
||||||
|
const requestTimeout = normalizeTimeout(options.requestTimeout);
|
||||||
|
const maxConcurrentPages = Math.max(1, Math.min(Number(options.maxConcurrentPages) || 2, 8));
|
||||||
|
const browser = await chromium.launch({
|
||||||
|
headless: true,
|
||||||
|
args: [
|
||||||
|
'--disable-blink-features=AutomationControlled',
|
||||||
|
'--no-sandbox',
|
||||||
|
'--disable-dev-shm-usage',
|
||||||
|
],
|
||||||
|
});
|
||||||
const context = await browser.newContext({
|
const context = await browser.newContext({
|
||||||
userAgent: BROWSER_USER_AGENT,
|
userAgent: BROWSER_USER_AGENT,
|
||||||
viewport: { width: 1440, height: 1200 },
|
viewport: { width: 1440, height: 1200 },
|
||||||
javaScriptEnabled: true,
|
javaScriptEnabled: true,
|
||||||
|
extraHTTPHeaders: {
|
||||||
|
'Accept-Language': 'en-US,en;q=0.9',
|
||||||
|
},
|
||||||
});
|
});
|
||||||
|
|
||||||
|
await context.addInitScript(() => {
|
||||||
|
Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
|
||||||
|
Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] });
|
||||||
|
Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3] });
|
||||||
|
});
|
||||||
|
const waiters = [];
|
||||||
|
let activePages = 0;
|
||||||
|
let closed = false;
|
||||||
|
|
||||||
|
async function acquirePageSlot() {
|
||||||
|
if (activePages < maxConcurrentPages) {
|
||||||
|
activePages += 1;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
await new Promise((resolve) => {
|
||||||
|
waiters.push(resolve);
|
||||||
|
});
|
||||||
|
activePages += 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
function releasePageSlot() {
|
||||||
|
activePages = Math.max(0, activePages - 1);
|
||||||
|
const next = waiters.shift();
|
||||||
|
if (next) {
|
||||||
|
next();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
await context.route('**/*', async (route) => {
|
await context.route('**/*', async (route) => {
|
||||||
const request = route.request();
|
const resourceType = route.request().resourceType();
|
||||||
const resourceType = request.resourceType();
|
|
||||||
|
|
||||||
if (['image', 'media', 'font'].includes(resourceType)) {
|
if (['image', 'media', 'font'].includes(resourceType)) {
|
||||||
await route.abort();
|
await route.abort();
|
||||||
|
|
@ -49,16 +137,23 @@ async function createBrowserSession(site) {
|
||||||
|
|
||||||
return {
|
return {
|
||||||
async fetchRenderedHtml(url, options = {}) {
|
async fetchRenderedHtml(url, options = {}) {
|
||||||
|
if (closed) {
|
||||||
|
throw new Error('browser session is closed');
|
||||||
|
}
|
||||||
|
|
||||||
|
await acquirePageSlot();
|
||||||
const page = await context.newPage();
|
const page = await context.newPage();
|
||||||
|
const timeout = normalizeTimeout(options.timeout || requestTimeout);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
await page.goto(url, {
|
await page.goto(url, {
|
||||||
waitUntil: 'domcontentloaded',
|
waitUntil: 'domcontentloaded',
|
||||||
timeout: site.requestTimeout,
|
timeout,
|
||||||
});
|
});
|
||||||
|
|
||||||
await waitForUsefulDom(page, site);
|
await acceptConsentIfPresent(page);
|
||||||
const html = await page.content();
|
await waitForUsefulDom(page, timeout);
|
||||||
|
const html = (await page.content()).slice(0, MAX_RENDERED_HTML_LENGTH);
|
||||||
|
|
||||||
if (options.includeDebug) {
|
if (options.includeDebug) {
|
||||||
return {
|
return {
|
||||||
|
|
@ -71,20 +166,73 @@ async function createBrowserSession(site) {
|
||||||
|
|
||||||
return html;
|
return html;
|
||||||
} finally {
|
} finally {
|
||||||
|
try {
|
||||||
await page.close();
|
await page.close();
|
||||||
|
} finally {
|
||||||
|
releasePageSlot();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
async close() {
|
async close() {
|
||||||
|
if (closed) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
closed = true;
|
||||||
await context.close();
|
await context.close();
|
||||||
|
await browser.close();
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
function shouldUseBrowser(site) {
|
function installSharedBrowserShutdown() {
|
||||||
return site.renderMode === 'browser';
|
if (sharedBrowserShutdownInstalled) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
sharedBrowserShutdownInstalled = true;
|
||||||
|
const shutdown = () => {
|
||||||
|
if (!sharedBrowserSessionPromise) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
const sessionPromise = sharedBrowserSessionPromise;
|
||||||
|
sharedBrowserSessionPromise = null;
|
||||||
|
sessionPromise
|
||||||
|
.then((session) => session.close())
|
||||||
|
.catch((error) => console.error('shared browser shutdown failed:', error));
|
||||||
|
};
|
||||||
|
|
||||||
|
process.once('beforeExit', shutdown);
|
||||||
|
process.once('SIGINT', shutdown);
|
||||||
|
process.once('SIGTERM', shutdown);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Creates a dedicated browser session for one crawler site.
 *
 * @param {object} [site] - Site configuration; only `requestTimeout` and
 *   `pageConcurrency` are read here.
 * @returns {Promise<object>} Whatever `buildBrowserSession` resolves to.
 */
async function createBrowserSession(site = {}) {
  const { requestTimeout, pageConcurrency: maxConcurrentPages } = site;

  return buildBrowserSession({ requestTimeout, maxConcurrentPages });
}
|
||||||
|
|
||||||
|
/**
 * Returns the lazily created, module-wide shared browser session.
 *
 * The session is built on the first call and the same promise is handed to
 * every later caller, so `options` only take effect on the call that actually
 * creates the session.
 *
 * If creation fails, the cached promise is cleared so that the next call
 * retries instead of replaying the same rejection forever.
 *
 * @param {object} [options]
 * @param {number} [options.requestTimeout] - Forwarded to `buildBrowserSession`.
 * @param {number} [options.maxConcurrentPages] - Forwarded to
 *   `buildBrowserSession`; falls back to 2 when falsy.
 * @returns {Promise<object>} The shared session.
 */
async function getSharedBrowserSession(options = {}) {
  if (!sharedBrowserSessionPromise) {
    installSharedBrowserShutdown();

    const pendingSession = buildBrowserSession({
      requestTimeout: options.requestTimeout,
      maxConcurrentPages: options.maxConcurrentPages || 2,
    });
    sharedBrowserSessionPromise = pendingSession;

    // Forget a failed launch. The identity check avoids clobbering a newer
    // session that may have been installed in the meantime. The caller still
    // observes the rejection through the promise returned below.
    pendingSession.catch(() => {
      if (sharedBrowserSessionPromise === pendingSession) {
        sharedBrowserSessionPromise = null;
      }
    });
  }

  return sharedBrowserSessionPromise;
}
|
||||||
|
|
||||||
|
function shouldUseBrowser() {
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
module.exports = {
|
module.exports = {
|
||||||
createBrowserSession,
|
createBrowserSession,
|
||||||
|
getSharedBrowserSession,
|
||||||
shouldUseBrowser,
|
shouldUseBrowser,
|
||||||
};
|
};
|
||||||
|
|
|
||||||
804
src/sources/crawlerClassifier.js
Normal file
804
src/sources/crawlerClassifier.js
Normal file
|
|
@ -0,0 +1,804 @@
|
||||||
|
const db = require('../db');
|
||||||
|
const config = require('../config');
|
||||||
|
|
||||||
|
// Rule types known to the classifier. The same names appear in the
// "Allowed types" list of the LLM system prompt further down in this file.
const POSITIVE_RULE_TYPES = new Set([
  // Metadata and structured data
  'meta_og_type',
  'meta_has_publish_time',
  'jsonld_type',
  'has_tag',
  // URL shape
  'url_pattern',
  'path_segment',
  // Generic markup probes
  'meta_presence',
  'meta_value_pattern',
  'selector_present',
  'class_token_present',
  'attr_presence',
  // Page-level buckets and layout patterns
  'link_density_bucket',
  'paragraph_count_bucket',
  'headline_container_pattern',
  'byline_signal',
  'time_signal',
  'body_container_signal',
  'listing_container_signal',
  'pagination_signal',
  'url_prefix_pattern',
  'canonical_pattern',
  'shallow_text_signal',
  'repeated_card_signal',
  'nav_density_bucket',
  'utility_path_signal',
  'commercial_signal',
  'media_signal',
]);
|
||||||
|
|
||||||
|
// Looks up the stored classification for an exact URL.
const selectCachedClassification = db.prepare(`
  SELECT classification
  FROM crawler_page_classifications
  WHERE url = ?
`);

// Inserts or refreshes the stored classification for a URL.
// Parameters: url, site_name, classification, pattern.
const upsertCachedClassification = db.prepare(`
  INSERT INTO crawler_page_classifications (url, site_name, classification, pattern)
  VALUES (?, ?, ?, ?)
  ON CONFLICT(url) DO UPDATE SET
    site_name = excluded.site_name,
    classification = excluded.classification,
    pattern = excluded.pattern,
    classified_at = datetime('now')
`);

// Lists a site's URL patterns whose hit_count is at least the given minimum,
// most frequently hit first. Parameters: site_name, minimum hit_count.
const selectPatternsForSite = db.prepare(`
  SELECT pattern, classification, hit_count
  FROM crawler_url_patterns
  WHERE site_name = ?
    AND hit_count >= ?
  ORDER BY hit_count DESC, pattern ASC
`);

// Records a URL pattern observation. The counter grows while the
// classification stays the same and restarts at 1 when it changes.
// Parameters: site_name, pattern, classification.
const upsertPattern = db.prepare(`
  INSERT INTO crawler_url_patterns (site_name, pattern, classification, hit_count)
  VALUES (?, ?, ?, 1)
  ON CONFLICT(site_name, pattern) DO UPDATE SET
    classification = excluded.classification,
    hit_count = CASE
      WHEN crawler_url_patterns.classification = excluded.classification THEN crawler_url_patterns.hit_count + 1
      ELSE 1
    END,
    updated_at = datetime('now')
`);

// Lists a site's rules whose hit_count is at least the given minimum,
// most frequently hit first. Parameters: site_name, minimum hit_count.
const selectRulesForSite = db.prepare(`
  SELECT rule_type, rule_value, classification, hit_count
  FROM crawler_site_rules
  WHERE site_name = ?
    AND hit_count >= ?
  ORDER BY hit_count DESC, rule_type ASC, rule_value ASC
`);

// Records a rule observation with the same counter semantics as upsertPattern.
// Parameters: site_name, rule_type, rule_value, classification.
const upsertRule = db.prepare(`
  INSERT INTO crawler_site_rules (site_name, rule_type, rule_value, classification, hit_count)
  VALUES (?, ?, ?, ?, 1)
  ON CONFLICT(site_name, rule_type, rule_value) DO UPDATE SET
    classification = excluded.classification,
    hit_count = CASE
      WHEN crawler_site_rules.classification = excluded.classification THEN crawler_site_rules.hit_count + 1
      ELSE 1
    END,
    updated_at = datetime('now')
`);
|
||||||
|
|
||||||
|
/**
 * Collapses one URL path segment into a placeholder when it looks like a
 * variable value, otherwise returns it lower-cased.
 *
 * Checks run in order: 4 digits -> `{year}`, 2 digits -> `{num2}`, any other
 * all-digit run -> `{id}`, 8+ hex characters -> `{hex}`.
 *
 * @param {string} segment - A single path segment without slashes.
 * @returns {string} Placeholder or lower-cased segment ('' for falsy input).
 */
function normalizePathSegment(segment) {
  const placeholderRules = [
    [/^\d{4}$/, '{year}'],
    [/^\d{2}$/, '{num2}'],
    [/^\d+$/, '{id}'],
    [/^[a-f0-9]{8,}$/i, '{hex}'],
  ];

  const matched = placeholderRules.find(([shape]) => shape.test(segment));
  if (matched) {
    return matched[1];
  }

  return String(segment || '').toLowerCase();
}
|
||||||
|
|
||||||
|
/**
 * Builds a normalized path pattern for a URL by replacing variable-looking
 * segments with placeholders (see `normalizePathSegment`).
 *
 * @param {string} url - Absolute URL.
 * @returns {string|null} Pattern such as `/news/{year}/{id}`, `/` for a URL
 *   with no path segments, or null when the URL cannot be parsed.
 */
function buildUrlPattern(url) {
  let pathname;
  try {
    ({ pathname } = new URL(url));
  } catch {
    return null;
  }

  const pieces = [];
  for (const rawSegment of pathname.split('/')) {
    if (rawSegment) {
      pieces.push(normalizePathSegment(rawSegment));
    }
  }

  // With no segments this is already '/', so no extra fallback is needed.
  return `/${pieces.join('/')}`;
}
|
||||||
|
|
||||||
|
/**
 * Compiles a stored URL pattern (as produced by `buildUrlPattern`) into an
 * anchored, case-insensitive regular expression.
 *
 * Placeholders expand as follows: `{year}` -> 4 digits, `{num2}` -> 2 digits,
 * `{id}` -> one or more digits, `{hex}` -> one or more hex characters.
 * Everything else is matched literally.
 *
 * The pattern is split on placeholders BEFORE escaping. Escaping first would
 * turn `{year}` into `\{year\}`, which the placeholder substitutions could
 * then never find, leaving a regex that only matches the literal text
 * "{year}".
 *
 * @param {string} pattern - Path pattern, e.g. `/news/{year}/{num2}/{id}`.
 * @returns {RegExp} Regex matching whole paths that fit the pattern.
 */
function patternToRegex(pattern) {
  const placeholderSources = new Map([
    ['{year}', '\\d{4}'],
    ['{num2}', '\\d{2}'],
    ['{id}', '\\d+'],
    ['{hex}', '[a-f0-9]+'],
  ]);

  // The capture group keeps the placeholders in the split output.
  const source = String(pattern || '')
    .split(/(\{(?:year|num2|id|hex)\})/)
    .map((part) => {
      const expansion = placeholderSources.get(part);
      if (expansion !== undefined) {
        return expansion;
      }
      return part.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    })
    .join('');

  return new RegExp(`^${source}$`, 'i');
}
|
||||||
|
|
||||||
|
/**
 * Reduces an HTML or text fragment to a short single-line plain string.
 *
 * Tags are replaced by spaces, whitespace runs collapse to one space, the
 * result is trimmed and then cut to `maxLength` characters.
 *
 * @param {*} value - Fragment to clean; falsy values become ''.
 * @param {number} [maxLength=200] - Maximum length of the result.
 * @returns {string} Cleaned text.
 */
function sanitizeText(value, maxLength = 200) {
  const raw = String(value || '');
  const withoutTags = raw.replace(/<[^>]*>/g, ' ');
  const singleSpaced = withoutTags.replace(/\s+/g, ' ').trim();

  return singleSpaced.slice(0, maxLength);
}
|
||||||
|
|
||||||
|
/**
 * Canonical form of a rule value: lower-cased, passed through `sanitizeText`
 * and limited to 160 characters.
 *
 * @param {*} value - Raw rule value; falsy values become ''.
 * @returns {string} Normalized rule value.
 */
function normalizeRuleValue(value) {
  const lowered = String(value || '').toLowerCase();

  return sanitizeText(lowered, 160);
}
|
||||||
|
|
||||||
|
/**
 * Appends a `{ ruleType, ruleValue }` entry to `signals`, storing the value
 * in normalized form. Nothing is added when the type is falsy or the value
 * normalizes to an empty string.
 *
 * @param {Array<{ruleType: string, ruleValue: string}>} signals - Mutated in place.
 * @param {string} ruleType - Signal category.
 * @param {*} ruleValue - Raw signal value.
 * @returns {void}
 */
function pushSignal(signals, ruleType, ruleValue) {
  const cleanedValue = normalizeRuleValue(ruleValue);
  const isUsable = Boolean(ruleType) && Boolean(cleanedValue);

  if (isUsable) {
    signals.push({ ruleType, ruleValue: cleanedValue });
  }
}
|
||||||
|
|
||||||
|
/**
 * Removes duplicate signals, keeping the first occurrence of each
 * `ruleType:ruleValue` pair and preserving the original order.
 *
 * @param {Iterable<{ruleType: string, ruleValue: string}>} signals
 * @returns {Array<{ruleType: string, ruleValue: string}>} New array of the
 *   original signal objects.
 */
function uniqueSignals(signals) {
  const firstByKey = new Map();

  for (const entry of signals) {
    const identity = `${entry.ruleType}:${entry.ruleValue}`;
    if (!firstByKey.has(identity)) {
      firstByKey.set(identity, entry);
    }
  }

  return [...firstByKey.values()];
}
|
||||||
|
|
||||||
|
/**
 * Extracts the first brace-balanced `{ ... }` span from a text.
 *
 * Scanning starts at the first `{`. Braces inside double-quoted strings are
 * ignored, and the character after a backslash is always skipped. If the
 * braces never balance, everything from the first `{` to the end is returned.
 *
 * @param {*} value - Text that may embed a JSON object.
 * @returns {string} The object text, or '' when there is no `{`.
 */
function extractJsonObjectString(value) {
  const source = String(value || '').trim();
  const begin = source.indexOf('{');
  if (begin < 0) {
    return '';
  }

  let nesting = 0;
  let insideString = false;
  let skipNext = false;

  for (let pos = begin; pos < source.length; pos += 1) {
    const ch = source.charAt(pos);

    if (skipNext) {
      skipNext = false;
    } else if (ch === '\\') {
      skipNext = true;
    } else if (ch === '"') {
      insideString = !insideString;
    } else if (!insideString) {
      if (ch === '{') {
        nesting += 1;
      } else if (ch === '}') {
        nesting -= 1;
        if (nesting === 0) {
          return source.slice(begin, pos + 1);
        }
      }
    }
  }

  // Unbalanced input: hand back the tail so the caller can try to repair it.
  return source.slice(begin);
}
|
||||||
|
|
||||||
|
/**
 * Best-effort clean-up of an almost-JSON payload so that `JSON.parse` has a
 * chance of accepting it.
 *
 * Steps, in order:
 *  1. strip a leading/trailing Markdown code fence (``` or ```json);
 *  2. keep only the first `{ ... }` span (`extractJsonObjectString`);
 *  3. replace control characters with spaces, drop commas that directly
 *     precede `}` or `]`, and turn `undefined` / `NaN` / `Infinity` values
 *     into `null`;
 *  4. drop a dangling comma at the very end, so that a payload truncated
 *     right after a comma does not become `...,}` once closers are appended;
 *  5. append any missing `]` and then `}` closers, based on raw bracket counts.
 *
 * The bracket counting does not understand strings or nesting order, so the
 * result is not guaranteed to be valid JSON.
 *
 * @param {*} value - Raw payload text.
 * @returns {string} Repaired text, or '' when no object could be found.
 */
function repairJsonString(value) {
  let repaired = String(value || '').trim();
  if (!repaired) {
    return '';
  }

  repaired = repaired
    .replace(/^```(?:json)?\s*/i, '')
    .replace(/\s*```$/i, '')
    .trim();

  repaired = extractJsonObjectString(repaired);
  if (!repaired) {
    return '';
  }

  repaired = repaired
    .replace(/[\u0000-\u001f]+/g, ' ')
    .replace(/,\s*([}\]])/g, '$1')
    .replace(/:\s*undefined\b/g, ': null')
    .replace(/:\s*NaN\b/g, ': null')
    .replace(/:\s*Infinity\b/g, ': null')
    // A truncated payload may end on a comma; closing after it would be invalid.
    .replace(/,\s*$/, '');

  const countOf = (bracket) => repaired.split(bracket).length - 1;
  const missingSquare = countOf('[') - countOf(']');
  const missingCurly = countOf('{') - countOf('}');

  if (missingSquare > 0) {
    repaired += ']'.repeat(missingSquare);
  }

  if (missingCurly > 0) {
    repaired += '}'.repeat(missingCurly);
  }

  return repaired;
}
|
||||||
|
|
||||||
|
/**
 * Parses a JSON payload leniently.
 *
 * The trimmed text is parsed as-is first. If that fails, it is run through
 * `repairJsonString` and parsed again. A failure of the second attempt is
 * logged with `console.error`.
 *
 * @param {*} value - Raw payload text.
 * @returns {*} The parsed value, or `{}` when the input is empty or cannot be
 *   parsed even after repair.
 */
function parseJsonLoose(value) {
  const payload = String(value || '').trim();
  if (!payload) {
    return {};
  }

  try {
    return JSON.parse(payload);
  } catch {
    // Not valid as-is; fall through to the repair path.
  }

  const candidate = repairJsonString(payload);
  if (!candidate) {
    return {};
  }

  let parsed = {};
  try {
    parsed = JSON.parse(candidate);
  } catch (error) {
    console.error('failed to parse crawler classification payload:', error, payload);
    parsed = {};
  }

  return parsed;
}
|
||||||
|
|
||||||
|
/**
 * Collects the distinct, lower-cased class names from quoted `class="..."`
 * attributes in an HTML string, in order of first appearance.
 *
 * A class name is kept only if it is 3-40 characters long, contains a letter
 * and does not look machine-generated (`jsx-123`, `sc-...`, `css-...`,
 * `_...`, or 10+ hex characters). For each kept name, its `-`/`_` separated
 * parts are also added when they are 4-24 characters long and contain a
 * letter.
 *
 * @param {string} html - Page markup.
 * @returns {string[]} Unique tokens.
 */
function extractClassTokens(html) {
  const generatedName = /^(jsx-\d+|sc-[a-z0-9]+|css-[a-z0-9]+|_[a-z0-9]+|[a-f0-9]{10,})$/i;
  const collected = new Set();

  for (const [, , attributeValue] of html.matchAll(/\bclass\s*=\s*(["'])(.*?)\1/gi)) {
    for (const rawName of attributeValue.split(/\s+/)) {
      const name = String(rawName || '').trim().toLowerCase();
      const lengthOk = name.length >= 3 && name.length <= 40;

      if (!lengthOk || !/[a-z]/.test(name) || generatedName.test(name)) {
        continue;
      }

      collected.add(name);

      name
        .split(/[_-]+/)
        .filter((piece) => piece.length >= 4 && piece.length <= 24 && /[a-z]/.test(piece) && !/^\d+$/.test(piece))
        .forEach((piece) => collected.add(piece));
    }
  }

  return [...collected];
}
|
||||||
|
|
||||||
|
/**
 * Lists the distinct opening tag names found in an HTML string, lower-cased
 * and in order of first appearance. Scanning stops once 50 distinct names
 * have been collected.
 *
 * @param {string} html - Page markup.
 * @returns {string[]} Up to 50 tag names.
 */
function extractTagSummary(html) {
  const limit = 50;
  const names = new Set();

  for (const [, tagName] of html.matchAll(/<([a-z0-9:-]+)\b/gi)) {
    if (names.size >= limit) {
      break;
    }
    names.add(String(tagName || '').toLowerCase());
  }

  return Array.from(names);
}
|
||||||
|
|
||||||
|
/**
 * Returns the trimmed values of every quoted occurrence of an attribute in
 * an HTML string, in document order (duplicates included).
 *
 * `attrName` is inserted into a regular expression without escaping, so it
 * should be a plain attribute name.
 *
 * @param {string} html - Page markup.
 * @param {string} attrName - Attribute to look for, e.g. `rel`.
 * @returns {string[]} Attribute values.
 */
function extractAttributeValues(html, attrName) {
  const attributePattern = new RegExp(`\\b${attrName}\\s*=\\s*(["'])(.*?)\\1`, 'gi');

  return Array.from(
    html.matchAll(attributePattern),
    (found) => String(found[2] || '').trim(),
  );
}
|
||||||
|
|
||||||
|
/**
 * Buckets a page by how many links it has relative to its paragraph text.
 *
 * Without paragraph text: `high` for 15+ links, otherwise `medium`.
 * With text, using links per 1000 characters: `high` at ratio >= 18 or 60+
 * links, `medium` at ratio >= 8 or 25+ links, otherwise `low`.
 *
 * @param {Array} links - Links found on the page; only `length` is used.
 * @param {number} paragraphTextLength - Total paragraph text length.
 * @returns {'high'|'medium'|'low'} Density bucket.
 */
function detectLinkDensityBucket(links, paragraphTextLength) {
  const linkCount = links.length;

  if (!paragraphTextLength) {
    return linkCount >= 15 ? 'high' : 'medium';
  }

  const linksPerThousandChars = (linkCount * 1000) / Math.max(paragraphTextLength, 1);

  const isHigh = linksPerThousandChars >= 18 || linkCount >= 60;
  if (isHigh) {
    return 'high';
  }

  const isMedium = linksPerThousandChars >= 8 || linkCount >= 25;
  return isMedium ? 'medium' : 'low';
}
|
||||||
|
|
||||||
|
/**
 * Maps a paragraph count to a coarse bucket label.
 *
 * @param {number} paragraphCount - Number of paragraphs on the page.
 * @returns {'0'|'1-2'|'3-7'|'8+'} Bucket label.
 */
function detectParagraphBucket(paragraphCount) {
  if (paragraphCount === 0) {
    return '0';
  }

  const upperBounds = [
    [2, '1-2'],
    [7, '3-7'],
  ];
  const bucket = upperBounds.find(([maxCount]) => paragraphCount <= maxCount);

  return bucket ? bucket[1] : '8+';
}
|
||||||
|
|
||||||
|
/**
 * Names the headline layout of a page from its heading counts and the number
 * of headline links.
 *
 * - `single_h1`: exactly one `<h1>`, at most 6 headline links, at most 4 `<h2>`
 * - `repeated_h2_cards`: 6+ `<h2>` or 8+ `<h3>`
 * - `multiple_headline_links`: 10+ headline links
 * - `mixed`: none of the above
 *
 * @param {string} html - Page markup.
 * @param {number} headlineLinks - Count of headline-like links.
 * @returns {string} Layout label.
 */
function detectHeadlineContainerPattern(html, headlineLinks) {
  const countOpeningTags = (tag) => (html.match(new RegExp(`<${tag}\\b`, 'gi')) || []).length;

  const h1Total = countOpeningTags('h1');
  const h2Total = countOpeningTags('h2');
  const h3Total = countOpeningTags('h3');

  const looksLikeSingleStory = h1Total === 1 && headlineLinks <= 6 && h2Total <= 4;
  if (looksLikeSingleStory) {
    return 'single_h1';
  }

  const looksLikeCardGrid = h2Total >= 6 || h3Total >= 8;
  if (looksLikeCardGrid) {
    return 'repeated_h2_cards';
  }

  return headlineLinks >= 10 ? 'multiple_headline_links' : 'mixed';
}
|
||||||
|
|
||||||
|
/**
 * Labels the overall shape of a URL's path.
 *
 * - `root`: no path segments
 * - `dated_article`: first segment is four digits
 * - `section_root`: one segment
 * - `short_section_slug`: two segments
 * - `multi_segment_slug`: three or more segments
 * - `unknown`: the URL could not be parsed
 *
 * @param {string} url - Absolute URL.
 * @returns {string} Shape label.
 */
function detectCanonicalPattern(url) {
  let parts;
  try {
    parts = (new URL(url).pathname || '/').split('/').filter(Boolean);
  } catch {
    return 'unknown';
  }

  if (parts.length === 0) {
    return 'root';
  }

  if (/^\d{4}$/.test(parts[0])) {
    return 'dated_article';
  }

  switch (parts.length) {
    case 1:
      return 'section_root';
    case 2:
      return 'short_section_slug';
    default:
      return 'multi_segment_slug';
  }
}
|
||||||
|
|
||||||
|
/**
 * Derives the de-duplicated list of structural signals for one page.
 *
 * Signals are emitted in a fixed order (metadata, tags and attributes, page
 * buckets, bylines, class tokens, pagination, path categories, URL patterns).
 * `uniqueSignals` keeps first occurrences, so that order is what callers see.
 *
 * @param {string} url - Page URL; an unparsable URL is treated as path `/`.
 * @param {{get: function(string): *}} meta - Meta tag lookup, read via `get(key)`.
 * @param {string} html - Page markup.
 * @param {object|null} jsonLdArticle - JSON-LD object; `@type`,
 *   `datePublished` and `dateModified` are read when present.
 * @param {Array} links - Page links, used for link density.
 * @param {object} heuristic - Reads `paragraphCount`, `paragraphTextLength`
 *   and `headlineLinks`.
 * @returns {Array<{ruleType: string, ruleValue: string}>} Unique signals.
 */
function buildRuleSignals(url, meta, html, jsonLdArticle, links, heuristic) {
  const collected = [];
  const emit = (ruleType, ruleValue) => pushSignal(collected, ruleType, ruleValue);

  const classTokens = extractClassTokens(html);
  const presentTags = extractTagSummary(html);

  let pathname = '/';
  try {
    pathname = new URL(url).pathname || '/';
  } catch {
    pathname = '/';
  }
  const segments = pathname.split('/').filter(Boolean);

  // --- Metadata -----------------------------------------------------------
  const trackedMetaKeys = [
    'og:type',
    'og:title',
    'article:published_time',
    'og:article:published_time',
    'author',
    'article:author',
    'twitter:title',
    'description',
    'article:section',
  ];

  for (const key of trackedMetaKeys) {
    const value = meta.get(key);
    if (value) {
      emit('meta_presence', key);
      emit('meta_value_pattern', `${key}:${value}`);
    }
  }

  const ogType = String(meta.get('og:type') || '').trim().toLowerCase();
  const articlePublishedTime = meta.get('article:published_time');
  const publishTime = String(articlePublishedTime || meta.get('og:article:published_time') || '').trim();
  const jsonLdType = String(jsonLdArticle && jsonLdArticle['@type'] || '').trim().toLowerCase();

  if (ogType) {
    emit('meta_og_type', ogType);
  }

  if (publishTime) {
    emit('meta_has_publish_time', 'true');
    emit('time_signal', articlePublishedTime ? 'meta_article_published_time' : 'meta_og_article_published_time');
  }

  if (jsonLdType) {
    emit('jsonld_type', jsonLdType);
  }

  // --- Tags and attributes ------------------------------------------------
  for (const tag of ['article', 'main', 'nav', 'aside', 'section', 'time', 'h1', 'h2']) {
    if (presentTags.includes(tag)) {
      emit('has_tag', tag);
      emit('selector_present', tag);
    }
  }

  const mainNearArticle = /<main\b[\s\S]{0,800}<article\b/i.test(html) || /<article\b[\s\S]{0,800}<main\b/i.test(html);
  if (mainNearArticle) {
    emit('selector_present', 'main article');
    emit('body_container_signal', 'main_article');
  }

  if (/itemprop\s*=\s*(["'])articlebody\1/i.test(html)) {
    emit('selector_present', '[itemprop="articlebody"]');
    emit('attr_presence', 'itemprop:articlebody');
    emit('body_container_signal', 'itemprop_articlebody');
  }

  if (/itemprop\s*=\s*(["'])headline\1/i.test(html)) {
    emit('attr_presence', 'itemprop:headline');
  }

  if (/rel\s*=\s*(["'])author\1/i.test(html)) {
    emit('selector_present', '[rel="author"]');
    emit('attr_presence', 'rel:author');
    emit('byline_signal', 'rel_author');
  }

  if (/role\s*=\s*(["'])main\1/i.test(html)) {
    emit('attr_presence', 'role:main');
  }

  if (/role\s*=\s*(["'])navigation\1/i.test(html)) {
    emit('attr_presence', 'role:navigation');
  }

  if (/<time\b[^>]*datetime\s*=/i.test(html)) {
    emit('selector_present', 'time[datetime]');
    emit('time_signal', 'time_datetime');
  }

  if (jsonLdArticle && jsonLdArticle.datePublished) {
    emit('time_signal', 'jsonld_datepublished');
  }

  if (jsonLdArticle && jsonLdArticle.dateModified) {
    emit('time_signal', 'jsonld_datemodified');
  }

  // --- Page-level buckets -------------------------------------------------
  const linkDensityBucket = detectLinkDensityBucket(links, heuristic.paragraphTextLength);

  emit('paragraph_count_bucket', detectParagraphBucket(Number(heuristic.paragraphCount || 0)));
  emit('link_density_bucket', linkDensityBucket);
  emit('headline_container_pattern', detectHeadlineContainerPattern(html, heuristic.headlineLinks));
  emit('canonical_pattern', detectCanonicalPattern(url));

  let textDepth = 'substantial';
  if (heuristic.paragraphTextLength < 200) {
    textDepth = 'very_low';
  } else if (heuristic.paragraphTextLength < 600) {
    textDepth = 'low';
  }
  emit('shallow_text_signal', textDepth);

  if (heuristic.headlineLinks >= 8 || (html.match(/<h2\b/gi) || []).length >= 6) {
    emit('repeated_card_signal', 'present');
  }

  if (presentTags.includes('nav')) {
    emit('nav_density_bucket', linkDensityBucket === 'high' ? 'high' : 'present');
  }

  // --- Bylines ------------------------------------------------------------
  if (meta.get('author') || meta.get('article:author')) {
    emit('byline_signal', 'meta_author');
  }
  if (classTokens.some((token) => token.includes('byline'))) {
    emit('byline_signal', 'class_token_byline');
  }
  if (classTokens.some((token) => token === 'author' || token.endsWith('author'))) {
    emit('byline_signal', 'class_token_author');
  }
  if (/\bby\s+[A-Z][a-z]+(?:\s+[A-Z][a-z]+){0,2}\b/.test(html)) {
    emit('byline_signal', 'text_prefix_by');
  }

  // --- Class tokens (first 60 only) ---------------------------------------
  const tokenCategories = [
    ['body_container_signal', ['article', 'story', 'headline', 'content', 'body', 'byline', 'author']],
    ['listing_container_signal', ['latest', 'archive', 'category', 'topic', 'topics', 'feed', 'river', 'grid', 'cards', 'listing', 'section', 'stream']],
    ['commercial_signal', ['subscribe', 'subscription', 'newsletter', 'advertise', 'sponsored']],
    ['media_signal', ['video', 'podcast', 'live', 'watch', 'media']],
  ];

  for (const token of classTokens.slice(0, 60)) {
    emit('class_token_present', token);

    for (const [signalType, vocabulary] of tokenCategories) {
      if (vocabulary.includes(token)) {
        emit(signalType, token);
      }
    }
  }

  // --- Pagination ---------------------------------------------------------
  const relTokens = new Set(
    extractAttributeValues(html, 'rel').flatMap((value) => value.toLowerCase().split(/\s+/)),
  );
  if (relTokens.has('next')) {
    emit('pagination_signal', 'rel_next');
  }
  if (relTokens.has('prev')) {
    emit('pagination_signal', 'rel_prev');
  }
  if (/[?&]page=\d+/i.test(url) || /\/page\/\d+(?:\/|$)/i.test(pathname)) {
    emit('pagination_signal', 'page_param');
  }
  if (/load more/i.test(html)) {
    emit('pagination_signal', 'load_more');
  }
  if (/(next page|older posts|older stories)/i.test(html)) {
    emit('pagination_signal', 'next_page_text');
  }

  // --- Path categories (all utility first, then commercial, then media) ---
  const pathCategories = [
    ['utility_path_signal', /^(login|signin|search|account|about|contact|privacy|terms)$/i],
    ['commercial_signal', /^(subscribe|subscription|newsletter|advertise|sponsored)$/i],
    ['media_signal', /^(video|videos|podcast|podcasts|live|watch)$/i],
  ];

  for (const [signalType, vocabulary] of pathCategories) {
    for (const segment of segments) {
      if (vocabulary.test(segment)) {
        emit(signalType, normalizePathSegment(segment));
      }
    }
  }

  // --- URL patterns -------------------------------------------------------
  const urlPattern = buildUrlPattern(url);
  if (urlPattern) {
    emit('url_pattern', urlPattern);
  }

  const maxPrefixDepth = Math.min(segments.length, 3);
  for (let depth = 1; depth <= maxPrefixDepth; depth += 1) {
    const prefix = segments
      .slice(0, depth)
      .map((segment) => normalizePathSegment(segment))
      .join('/');
    emit('url_prefix_pattern', `/${prefix}`);
  }

  for (const segment of segments.slice(0, 5)) {
    emit('path_segment', normalizePathSegment(segment));
  }

  return uniqueSignals(collected);
}
|
||||||
|
|
||||||
|
/**
 * Tells whether a stored rule row is satisfied by a page's signals, i.e.
 * whether some signal has the same type and the same value as the rule.
 *
 * @param {{rule_type: string, rule_value: string}} rule - Stored rule row.
 * @param {Array<{ruleType: string, ruleValue: string}>} signals - Page signals.
 * @returns {boolean} True when a matching signal exists.
 */
function matchRule(rule, signals) {
  const { rule_type: wantedType, rule_value: wantedValue } = rule;

  return signals.some(({ ruleType, ruleValue }) => ruleType === wantedType && ruleValue === wantedValue);
}
|
||||||
|
|
||||||
|
/**
 * Renders at most the first 120 signals as `type:value` lines joined by
 * newlines.
 *
 * @param {Array<{ruleType: string, ruleValue: string}>} signals
 * @returns {string} One signal per line; '' for an empty list.
 */
function formatSignalsForPrompt(signals) {
  const lines = [];

  for (const { ruleType, ruleValue } of signals.slice(0, 120)) {
    lines.push(`${ruleType}:${ruleValue}`);
  }

  return lines.join('\n');
}
|
||||||
|
|
||||||
|
/**
 * Builds the compact plain-text page summary that is sent to the LLM.
 *
 * The summary has one `KEY: value` line per field (URL, title, first H1,
 * selected meta and JSON-LD values, heuristic counters), then up to 12 sample
 * links, up to 5 cleaned paragraphs and up to 40 signals. The whole text is
 * cut to 4200 characters.
 *
 * @param {string} url - Page URL.
 * @param {string} html - Page markup.
 * @param {{get: function(string): *}} meta - Meta tag lookup.
 * @param {object|null} jsonLdArticle - JSON-LD object, if any.
 * @param {Array<{url: string, text: string}>} links - Page links.
 * @param {object} heuristic - Reads `paragraphCount`, `paragraphTextLength`
 *   and `headlineLinks`.
 * @param {Array<{ruleType: string, ruleValue: string}>} signals - Page signals.
 * @returns {string} Prompt-ready summary.
 */
function sanitizeForLlm(url, html, meta, jsonLdArticle, links, heuristic, signals) {
  // Returns the requested capture group of the first match, or null.
  const firstCapture = (pattern, group) => {
    const found = html.match(pattern);
    return found ? found[group] : null;
  };

  const titleText = firstCapture(/<title[^>]*>([\s\S]*?)<\/title>/i, 1);
  const h1Text = firstCapture(/<h1\b[^>]*>([\s\S]*?)<\/h1>/i, 1);
  const timeAttribute = firstCapture(/<time\b[^>]*datetime\s*=\s*(["'])(.*?)\1/i, 2);

  const paragraphs = [];
  for (const rawParagraph of html.match(/<p\b[^>]*>[\s\S]*?<\/p>/gi) || []) {
    const cleaned = sanitizeText(rawParagraph, 180);
    if (cleaned) {
      paragraphs.push(cleaned);
    }
    if (paragraphs.length === 5) {
      break;
    }
  }

  const sampleLinks = links
    .slice(0, 12)
    .map((link) => `${link.url} | ${sanitizeText(link.text, 120)}`)
    .join('\n');

  const published = meta.get('article:published_time')
    || meta.get('og:article:published_time')
    || timeAttribute
    || '';

  const lines = [];
  lines.push(`URL: ${url}`);
  lines.push(`TITLE: ${sanitizeText(titleText)}`);
  lines.push(`H1: ${sanitizeText(h1Text)}`);
  lines.push(`OG_TYPE: ${String(meta.get('og:type') || '').slice(0, 80)}`);
  lines.push(`OG_TITLE: ${sanitizeText(meta.get('og:title') || '')}`);
  lines.push(`PUBLISHED: ${String(published).slice(0, 80)}`);
  lines.push(`JSONLD_TYPE: ${jsonLdArticle ? String(jsonLdArticle['@type'] || '').slice(0, 80) : ''}`);
  lines.push(`JSONLD_HEADLINE: ${jsonLdArticle ? sanitizeText(jsonLdArticle.headline || '') : ''}`);
  lines.push(`LINK_COUNT: ${links.length}`);
  lines.push(`PARAGRAPH_COUNT: ${heuristic.paragraphCount}`);
  lines.push(`PARAGRAPH_TEXT_LENGTH: ${heuristic.paragraphTextLength}`);
  lines.push(`HEADLINE_LINKS: ${heuristic.headlineLinks}`);
  if (sampleLinks) {
    lines.push(`LINKS:\n${sampleLinks}`);
  }
  paragraphs.forEach((paragraph, index) => {
    lines.push(`P${index + 1}: ${paragraph}`);
  });
  lines.push(`AVAILABLE_SIGNALS:\n${formatSignalsForPrompt(signals.slice(0, 40))}`);

  return lines.join('\n').slice(0, 4200);
}
|
||||||
|
|
||||||
|
/**
 * Ask the OpenRouter chat-completions endpoint to classify a crawled page.
 *
 * @param {string} url - Page URL (not read by this function; the URL is
 *   expected to be embedded in `sanitizedHtml` by the caller).
 * @param {string} sanitizedHtml - Compact page summary appended to the prompt.
 * @param {{articleScore: *, listingScore: *, shouldAskLlm: *}} heuristic -
 *   Heuristic scores echoed into the prompt.
 * @returns {Promise<{classification: string, learnedSignals: Array, negativeSignals: Array}>}
 *   `classification` is 'article', 'listing' or 'other'.
 * @throws {Error} When the HTTP response is not ok; the error carries `status`.
 */
async function requestLlmClassification(url, sanitizedHtml, heuristic) {
  const authorization = `Bearer ${String(config.openRouter.apiKey || '').trim()}`;

  const systemPrompt = 'Classify pages for a news crawler. Return strict JSON only with keys classification, confidence, learnedSignals, and negativeSignals. classification must be ARTICLE, LISTING, or OTHER. learnedSignals and negativeSignals must be arrays of objects with keys type and value. Only use reusable, site-level structural signals. Allowed types: meta_og_type, meta_has_publish_time, jsonld_type, has_tag, url_pattern, path_segment, meta_presence, meta_value_pattern, selector_present, class_token_present, attr_presence, link_density_bucket, paragraph_count_bucket, headline_container_pattern, byline_signal, time_signal, body_container_signal, listing_container_signal, pagination_signal, url_prefix_pattern, canonical_pattern, shallow_text_signal, repeated_card_signal, nav_density_bucket, utility_path_signal, commercial_signal, media_signal.';

  const userPrompt = [
    'ARTICLE = a single news story page.',
    'LISTING = homepage, topic page, category page, archive, feature hub, or page containing many article links.',
    'OTHER = anything else.',
    'learnedSignals should explain why this page belongs to its classification.',
    'negativeSignals should capture strong anti-article clues when this page is LISTING or OTHER.',
    'Return compact JSON only. Keep learnedSignals and negativeSignals short. Max 3 entries in each array.',
    'Never include exact titles, exact names, full article text, random hashes, or one-off values.',
    `HEURISTIC_ARTICLE_SCORE: ${heuristic.articleScore}`,
    `HEURISTIC_LISTING_SCORE: ${heuristic.listingScore}`,
    `HEURISTIC_SHOULD_ASK: ${heuristic.shouldAskLlm ? 'yes' : 'no'}`,
    sanitizedHtml,
  ].join('\n\n');

  const requestBody = {
    model: 'openai/gpt-4.1-mini',
    messages: [
      { role: 'system', content: systemPrompt },
      { role: 'user', content: userPrompt },
    ],
    temperature: 0,
    max_tokens: 220,
    response_format: { type: 'json_object' },
  };

  const response = await fetch('https://openrouter.ai/api/v1/chat/completions', {
    method: 'POST',
    headers: {
      Authorization: authorization,
      'Content-Type': 'application/json',
    },
    body: JSON.stringify(requestBody),
  });

  if (!response.ok) {
    // Prefer the provider's own error message when the error body is JSON.
    let message = `crawler classification failed with ${response.status}`;

    try {
      const errorBody = await response.json();
      const providerMessage = errorBody && errorBody.error && errorBody.error.message;
      if (providerMessage) {
        message = providerMessage;
      }
    } catch (error) {
      console.error('failed to parse crawler classification error response:', error);
    }

    throw Object.assign(new Error(message), { status: response.status });
  }

  const completion = await response.json();
  const replyText = String(completion?.choices?.[0]?.message?.content || '').trim();
  const reply = parseJsonLoose(replyText);

  // Anything other than ARTICLE / LISTING collapses to 'other'.
  const label = String(reply.classification || '').trim().toUpperCase();
  let classification = 'other';
  if (label === 'ARTICLE') {
    classification = 'article';
  } else if (label === 'LISTING') {
    classification = 'listing';
  }

  // Keep only object entries whose type is in POSITIVE_RULE_TYPES and whose
  // normalized value is non-empty; cap the list at eight entries.
  const parseSignals = (entries) => {
    if (!Array.isArray(entries)) {
      return [];
    }

    return entries
      .filter((entry) => entry && typeof entry === 'object')
      .map((entry) => ({
        ruleType: String(entry.type || '').trim(),
        ruleValue: normalizeRuleValue(entry.value || ''),
      }))
      .filter((signal) => POSITIVE_RULE_TYPES.has(signal.ruleType) && signal.ruleValue)
      .slice(0, 8);
  };

  return {
    classification,
    learnedSignals: parseSignals(reply.learnedSignals),
    negativeSignals: parseSignals(reply.negativeSignals),
  };
}
|
||||||
|
|
||||||
|
/**
 * Classify a page, consulting progressively more expensive sources:
 * per-URL cache, learned URL patterns, learned rules, and finally the LLM.
 *
 * LLM results are written back to the URL cache, the pattern table (when a
 * pattern could be built) and the rule table.
 *
 * @param {object} page
 * @param {string} page.siteName - Site key used for pattern and rule lookups.
 * @param {string} page.url - Absolute page URL.
 * @param {string} page.html - Page HTML.
 * @param {*} page.meta - Meta lookup passed through to the signal builders.
 * @param {*} page.jsonLdArticle - JSON-LD article object, if one was found.
 * @param {*} page.heuristic - Heuristic scoring result.
 * @param {Array} page.links - Links extracted from the page.
 * @param {number} page.minPatternHits - Threshold passed to the pattern and rule queries.
 * @returns {Promise<{classification: (string|null), source: string, learnedSignals: Array, negativeSignals: Array}>}
 *   `source` is 'cache', 'pattern', 'rule', 'disabled' or 'llm'.
 */
async function classifyPageWithLlm({ siteName, url, html, meta, jsonLdArticle, heuristic, links, minPatternHits }) {
  // Every non-LLM outcome has the same shape with empty signal lists.
  const withoutSignals = (classification, source) => ({
    classification,
    source,
    learnedSignals: [],
    negativeSignals: [],
  });

  const cacheHit = selectCachedClassification.get(url);
  if (cacheHit) {
    return withoutSignals(cacheHit.classification, 'cache');
  }

  const pattern = buildUrlPattern(url);
  if (pattern) {
    const pagePath = new URL(url).pathname || '/';

    for (const candidate of selectPatternsForSite.all(siteName, minPatternHits)) {
      if (patternToRegex(candidate.pattern).test(pagePath)) {
        upsertCachedClassification.run(url, siteName, candidate.classification, candidate.pattern);
        return withoutSignals(candidate.classification, 'pattern');
      }
    }
  }

  const ruleSignals = buildRuleSignals(url, meta, html, jsonLdArticle, links, heuristic);
  const siteRules = selectRulesForSite.all(siteName, minPatternHits);
  const ruleHit = siteRules.find((rule) => matchRule(rule, ruleSignals));
  if (ruleHit) {
    upsertCachedClassification.run(url, siteName, ruleHit.classification, pattern);
    return withoutSignals(ruleHit.classification, 'rule');
  }

  // Without an API key the LLM step is skipped and nothing is cached.
  const apiKey = String(config.openRouter?.apiKey || '').trim();
  if (!apiKey) {
    return withoutSignals(null, 'disabled');
  }

  const prompt = sanitizeForLlm(url, html, meta, jsonLdArticle, links, heuristic, ruleSignals);
  const verdict = await requestLlmClassification(url, prompt, heuristic);

  upsertCachedClassification.run(url, siteName, verdict.classification, pattern);

  if (pattern) {
    upsertPattern.run(siteName, pattern, verdict.classification);
  }

  const allSignals = [...verdict.learnedSignals, ...verdict.negativeSignals];
  for (const { ruleType, ruleValue } of allSignals) {
    upsertRule.run(siteName, ruleType, ruleValue, verdict.classification);
  }

  console.log(`[crawler-llm] ${siteName} ${verdict.classification.toUpperCase()} ${url}`);

  return {
    classification: verdict.classification,
    source: 'llm',
    learnedSignals: verdict.learnedSignals,
    negativeSignals: verdict.negativeSignals,
  };
}
|
||||||
|
|
||||||
|
// Public API of the crawler classifier module.
module.exports = { classifyPageWithLlm, buildUrlPattern };
|
||||||
|
|
@ -1,48 +1,258 @@
|
||||||
|
const path = require('path');
|
||||||
|
const db = require('../db');
|
||||||
const config = require('../config');
|
const config = require('../config');
|
||||||
const { fetchJson } = require('../http');
|
const { fetchJson } = require('../http');
|
||||||
|
const { getBackfillSources } = require('./sourceCatalog');
|
||||||
|
|
||||||
async function fetchGdeltArticles() {
|
// Records a finished (source, window) pair; INSERT OR IGNORE keeps a repeat
// insert of an existing row from raising.
const INSERT_COMPLETED_WINDOW_SQL = `
INSERT OR IGNORE INTO gdelt_backfill_windows (
source_id,
window_start,
window_end,
completed_at
) VALUES (?, ?, ?, ?)
`;
const insertCompletedWindow = db.prepare(INSERT_COMPLETED_WINDOW_SQL);
|
||||||
|
// Looks up whether a (source, window_start, window_end) row already exists.
const FIND_COMPLETED_WINDOW_SQL = `
SELECT 1
FROM gdelt_backfill_windows
WHERE source_id = ?
AND window_start = ?
AND window_end = ?
`;
const findCompletedWindow = db.prepare(FIND_COMPLETED_WINDOW_SQL);
|
||||||
|
|
||||||
for (const query of config.gdelt?.queries || []) {
|
/**
 * Resolve after roughly `ms` milliseconds.
 *
 * @param {number} ms - Delay handed to setTimeout.
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
|
||||||
|
|
||||||
|
/**
 * Format a Date as a compact UTC timestamp, `YYYYMMDDHHMMSS`.
 *
 * @param {Date} value
 * @returns {string}
 */
function formatWindowDate(value) {
  // "2024-01-02T03:04:05" -> "20240102030405": drop the separators in one pass.
  const isoSeconds = value.toISOString().slice(0, 19);
  return isoSeconds.replace(/[-:T]/g, '');
}
|
||||||
|
|
||||||
|
/**
 * Build the list of backfill windows, newest first.
 *
 * Window length comes from `config.gdelt.windowDays` (default 7) and the
 * number of windows from `config.gdelt.lookbackWeeks` (default 52); both are
 * clamped to at least 1.
 *
 * @returns {Array<{start: Date, end: Date, startKey: string, endKey: string}>}
 */
function buildWeeklyWindows() {
  const days = Math.max(1, Number(config.gdelt?.windowDays) || 7);
  const windowCount = Math.max(1, Number(config.gdelt?.lookbackWeeks) || 52);
  const spanMs = days * 24 * 60 * 60 * 1000;

  // Snap to a multiple of the window length since the epoch so the same
  // window keys come out on every run.
  const latestBoundary = Math.floor(Date.now() / spanMs) * spanMs;

  const windows = [];
  let step = 0;
  while (step < windowCount) {
    const endMs = latestBoundary - step * spanMs;
    const start = new Date(endMs - spanMs);
    const end = new Date(endMs);

    windows.push({
      start,
      end,
      startKey: formatWindowDate(start),
      endKey: formatWindowDate(end),
    });
    step += 1;
  }

  return windows;
}
|
||||||
|
|
||||||
|
/**
 * Build the GDELT query string for one source from its website list.
 *
 * @param {{website: string[]}} source
 * @returns {string} `domain:x` for a single site, otherwise a parenthesised
 *   `OR` of the domain clauses.
 */
function buildSourceQuery(source) {
  const domainClauses = source.website.map((site) => `domain:${site}`);
  const [onlyClause] = domainClauses;

  return domainClauses.length === 1
    ? onlyClause
    : `(${domainClauses.join(' OR ')})`;
}
|
||||||
|
|
||||||
|
/**
 * Report whether a backfill window is already recorded for a source.
 *
 * @param {*} sourceId - Source identifier stored in `source_id`.
 * @param {{startKey: string, endKey: string}} window
 * @returns {boolean}
 */
function isWindowCompleted(sourceId, window) {
  const { startKey, endKey } = window;
  const existingRow = findCompletedWindow.get(sourceId, startKey, endKey);
  return Boolean(existingRow);
}
|
||||||
|
|
||||||
|
/**
 * Record a backfill window as completed for a source, stamped with the
 * current time in ISO-8601 form.
 *
 * @param {*} sourceId - Source identifier stored in `source_id`.
 * @param {{startKey: string, endKey: string}} window
 * @returns {void}
 */
function markWindowCompleted(sourceId, window) {
  const completedAt = new Date().toISOString();
  const { startKey, endKey } = window;
  insertCompletedWindow.run(sourceId, startKey, endKey, completedAt);
}
|
||||||
|
|
||||||
|
/**
 * Fetch one backfill window for one source from the GDELT DOC API.
 *
 * @param {{website: string[], label: string}} source
 * @param {{startKey: string, endKey: string}} window
 * @returns {Promise<Array<{title: string, description: *, url: string, source: string, pubDate: *}>>}
 *   Items without a title or URL are dropped.
 */
async function fetchWindow(source, window) {
  // maxRecords is clamped to the 1..250 range, defaulting to 100.
  const recordCap = Math.max(1, Math.min(Number(config.gdelt?.maxRecords) || 100, 250));

  const params = new URLSearchParams();
  params.set('query', buildSourceQuery(source));
  params.set('mode', config.gdelt?.mode || 'ArtList');
  params.set('maxrecords', String(recordCap));
  params.set('format', config.gdelt?.format || 'json');
  params.set('startdatetime', window.startKey);
  params.set('enddatetime', window.endKey);

  const data = await fetchJson(`https://api.gdeltproject.org/api/v2/doc/doc?${params.toString()}`);

  const articles = [];
  for (const item of data.articles || []) {
    const title = String(item.title || '').trim();
    const url = String(item.url || '').trim();

    if (title && url) {
      articles.push({
        title,
        description: item.domain || source.label,
        url,
        source: 'gdelt',
        pubDate: item.seendate || null,
      });
    }
  }

  return articles;
}
|
||||||
} catch (error) {
|
|
||||||
if (error && error.status === 429) {
|
/**
 * Create a BigQuery client for GDELT queries.
 *
 * The key file comes from `config.gdelt.bigQueryKeyFile` when set, otherwise
 * `gdelt-credentials.json` two directories above this module. The project id
 * defaults to 'duriin'.
 *
 * @returns {object} A `BigQuery` instance from `@google-cloud/bigquery`.
 */
function getBigQueryClient() {
  // Required lazily, inside the function, as in the original code.
  const { BigQuery } = require('@google-cloud/bigquery');

  const configuredKeyFile = config.gdelt?.bigQueryKeyFile;
  let keyFilename;
  if (configuredKeyFile) {
    keyFilename = path.resolve(config.gdelt.bigQueryKeyFile);
  } else {
    keyFilename = path.resolve(__dirname, '..', '..', 'gdelt-credentials.json');
  }

  const projectId = config.gdelt?.bigQueryProject || 'duriin';
  return new BigQuery({ projectId, keyFilename });
}
|
||||||
|
|
||||||
|
/**
 * Fetch one backfill window for one source from the public GDELT GKG table
 * in BigQuery.
 *
 * Domains and window bounds are sent as named query parameters instead of
 * being spliced into the SQL text, so a quote or other SQL syntax in a
 * configured domain cannot change the statement. Domains are lower-cased
 * because they are compared against `LOWER(DocumentIdentifier)`.
 *
 * @param {{id: *, label: string, website: string[]}} source
 * @param {{startKey: string, endKey: string}} window - Keys are the numeric
 *   `YYYYMMDDHHMMSS` strings compared against the table's `DATE` column.
 * @param {{query: Function}} bigquery - BigQuery client.
 * @returns {Promise<Array<{title: string, description: *, url: string, source: string, pubDate: *}>>}
 *   The GKG rows carry no headline, so `title` is the URL.
 * @throws {Error} When the source has no usable domain or a window key is not
 *   a safe integer; query errors from the client propagate unchanged.
 */
async function fetchWindowBigQuery(source, window, bigquery) {
  const maxRecords = Math.max(1, Math.min(Number(config.gdelt?.maxRecords) || 100, 1000));

  const domains = (Array.isArray(source.website) ? source.website : [])
    .map((domain) => String(domain || '').trim().toLowerCase())
    .filter(Boolean);

  // An empty list would otherwise produce the invalid clause "AND ()".
  if (domains.length === 0) {
    throw new Error(`GDELT BigQuery: no domains configured for ${source.id}`);
  }

  const windowStart = Number(window.startKey);
  const windowEnd = Number(window.endKey);
  if (!Number.isSafeInteger(windowStart) || !Number.isSafeInteger(windowEnd)) {
    throw new Error(`GDELT BigQuery: invalid window ${window.startKey}-${window.endKey}`);
  }

  const params = { windowStart, windowEnd };
  const domainClauses = domains.map((domain, index) => {
    // NOTE: "%" and "_" inside a domain still act as LIKE wildcards.
    params[`domain${index}`] = `%${domain}%`;
    return `LOWER(DocumentIdentifier) LIKE @domain${index}`;
  }).join(' OR ');

  // maxRecords is a clamped number, so interpolating it is safe.
  const query = `
    SELECT
      DocumentIdentifier AS url,
      SourceCommonName AS domain,
      CAST(DATE AS STRING) AS seendate
    FROM \`gdelt-bq.gdeltv2.gkg\`
    WHERE DATE >= @windowStart
      AND DATE < @windowEnd
      AND (${domainClauses})
    LIMIT ${maxRecords}
  `;

  console.log(`GDELT BigQuery: querying ${source.id} ${window.startKey}-${window.endKey}`);
  const [rows] = await bigquery.query({ query, location: 'US', params });
  console.log(`GDELT BigQuery: ${rows.length} rows for ${source.id} ${window.startKey}-${window.endKey}`);

  return rows.map((row) => {
    const url = String(row.url || '').trim();
    if (!url) {
      return null;
    }

    return {
      title: url,
      description: row.domain || source.label,
      url,
      source: 'gdelt',
      pubDate: row.seendate || null,
    };
  }).filter(Boolean);
}
|
||||||
|
|
||||||
|
/**
 * Backfill GDELT articles through BigQuery, one window per source at a time.
 *
 * A window is marked completed only after its articles have been handed
 * over: if `onWindow` rejects, the window stays pending and is retried on a
 * later run rather than being skipped forever.
 *
 * @param {Function} [onWindow] - Optional async callback receiving each
 *   non-empty batch of window articles. Batches passed to the callback are
 *   not included in the returned array.
 * @returns {Promise<Array>} Articles that were not passed to `onWindow`.
 */
async function fetchGdeltArticlesBigQuery(onWindow) {
  const windows = buildWeeklyWindows();
  // 0 (the default) means "no per-source cap".
  const maxWindowsPerRun = Number(config.gdelt?.maxWindowsPerRun) || 0;
  const requestDelayMs = Math.max(0, Number(config.gdelt?.requestDelayMs) || 0);
  const bigquery = getBigQueryClient();
  const allArticles = [];

  for (const source of getBackfillSources()) {
    let windowsFetched = 0;

    for (const window of windows) {
      if (maxWindowsPerRun > 0 && windowsFetched >= maxWindowsPerRun) {
        break;
      }

      if (isWindowCompleted(source.id, window)) {
        continue;
      }

      try {
        const windowArticles = await fetchWindowBigQuery(source, window, bigquery);

        if (onWindow && windowArticles.length > 0) {
          await onWindow(windowArticles);
        } else {
          allArticles.push(...windowArticles);
        }

        // Only now is it safe to record the window as done.
        markWindowCompleted(source.id, window);
        windowsFetched += 1;
      } catch (error) {
        console.error(`Failed to fetch GDELT BigQuery window for ${source.id} ${window.startKey}-${window.endKey}`, error);
      }

      if (requestDelayMs > 0) {
        await sleep(requestDelayMs);
      }
    }
  }

  return allArticles;
}
|
||||||
|
|
||||||
|
/**
 * Backfill GDELT articles through the DOC API, one window per source at a
 * time, pausing between requests.
 *
 * A 429 response stops work on the current source for this run; any other
 * failure is logged and the next window is tried.
 *
 * @returns {Promise<Array>} All articles fetched during this run.
 */
async function fetchGdeltArticles() {
  const collected = [];
  const allWindows = buildWeeklyWindows();
  const pauseMs = Math.max(0, Number(config.gdelt?.requestDelayMs) || 5500);
  // 0 (the default) means "no per-source cap".
  const windowsPerRunCap = Number(config.gdelt?.maxWindowsPerRun) || 0;

  for (const source of getBackfillSources()) {
    let fetchedCount = 0;

    for (const window of allWindows) {
      const capReached = windowsPerRunCap > 0 && fetchedCount >= windowsPerRunCap;
      if (capReached) {
        break;
      }

      if (isWindowCompleted(source.id, window)) {
        continue;
      }

      let rateLimited = false;
      try {
        const batch = await fetchWindow(source, window);
        collected.push(...batch);
        markWindowCompleted(source.id, window);
        fetchedCount += 1;
      } catch (error) {
        rateLimited = Boolean(error && error.status === 429);
        if (rateLimited) {
          console.warn(`GDELT window rate-limited for ${source.id} ${window.startKey}-${window.endKey}`);
        } else {
          console.error(`Failed to fetch GDELT window for ${source.id} ${window.startKey}-${window.endKey}`, error);
        }
      }

      // The pause applies after every attempt, including a rate-limited one.
      if (pauseMs > 0) {
        await sleep(pauseMs);
      }

      if (rateLimited) {
        break;
      }
    }
  }

  return collected;
}
|
||||||
|
|
||||||
|
/**
 * Report whether any backfill source still has a window that is not yet
 * recorded as completed.
 *
 * @returns {boolean}
 */
function hasPendingWindows() {
  const windows = buildWeeklyWindows();

  for (const source of getBackfillSources()) {
    const hasGap = windows.some((window) => !isWindowCompleted(source.id, window));
    if (hasGap) {
      return true;
    }
  }

  return false;
}
|
||||||
|
|
||||||
|
/**
 * Dispatch to the configured GDELT backend.
 *
 * `config.gdelt.source` equal to 'bigquery' (case-insensitive) selects the
 * BigQuery backfill; anything else, including unset, uses the DOC API.
 *
 * @param {Function} [onWindow] - Forwarded to the BigQuery backend only.
 * @returns {Promise<Array>}
 */
function fetchGdeltArticlesRouted(onWindow) {
  const backend = String(config.gdelt?.source || 'api').toLowerCase();

  return backend === 'bigquery'
    ? fetchGdeltArticlesBigQuery(onWindow)
    : fetchGdeltArticles();
}
|
||||||
|
|
||||||
module.exports = {
|
// Public API: `fetchGdeltArticles` is the backend-routing entry point.
module.exports = { fetchGdeltArticles: fetchGdeltArticlesRouted, hasPendingWindows };
|
||||||
|
|
|
||||||
236
src/sources/googleNews.js
Normal file
236
src/sources/googleNews.js
Normal file
|
|
@ -0,0 +1,236 @@
|
||||||
|
const Parser = require('rss-parser');
|
||||||
|
const config = require('../config');
|
||||||
|
const { fetchWithPolicy } = require('../http');
|
||||||
|
|
||||||
|
// Shared rss-parser instance used to parse Google News feed XML.
const parser = new Parser({
  headers: {
    'User-Agent': 'Mozilla/5.0',
    Accept: 'application/rss+xml, application/xml, text/xml;q=0.9, */*;q=0.8',
  },
  timeout: 10000,
});
|
||||||
|
|
||||||
|
// Query-parameter names that are dropped during URL normalization: any
// `utm_*` key plus a fixed list of exact names, all matched case-insensitively.
const TRACKING_PARAM_PATTERNS = [
  /^utm_/i,
  ...['fbclid', 'gclid', 'mkt_tok', 'mc_cid', 'mc_eid', 'ref', 'ref_src', 's', 'cmpid']
    .map((name) => new RegExp(`^${name}$`, 'i')),
];
|
||||||
|
// Batch size passed to mapWithConcurrency when resolving feed item URLs.
const RESOLVE_CONCURRENCY = 6;
|
||||||
|
|
||||||
|
/**
 * Decide whether a query-parameter name matches a tracking pattern.
 *
 * @param {string} key - Query-parameter name.
 * @returns {boolean} True when any TRACKING_PARAM_PATTERNS entry matches.
 */
function shouldDropParam(key) {
  for (const pattern of TRACKING_PARAM_PATTERNS) {
    if (pattern.test(key)) {
      return true;
    }
  }

  return false;
}
|
||||||
|
|
||||||
|
/**
 * Canonicalize an article URL for de-duplication.
 *
 * Strips the fragment and credentials, removes tracking parameters, sorts
 * the remaining query parameters by key and trims trailing slashes from
 * non-root paths.
 *
 * @param {string} rawUrl
 * @returns {string|null} The normalized URL, or null when the input is not a
 *   parseable http(s) URL.
 */
function normalizeArticleUrl(rawUrl) {
  try {
    const url = new URL(rawUrl);

    if (url.protocol !== 'http:' && url.protocol !== 'https:') {
      return null;
    }

    url.hash = '';
    url.username = '';
    url.password = '';

    const keptParams = [];
    for (const pair of url.searchParams.entries()) {
      if (!shouldDropParam(pair[0])) {
        keptParams.push(pair);
      }
    }
    keptParams.sort((first, second) => first[0].localeCompare(second[0]));

    // Rebuild the query string from scratch in sorted order.
    url.search = '';
    keptParams.forEach(([key, value]) => {
      url.searchParams.append(key, value);
    });

    if (url.pathname !== '/') {
      const trimmedPath = url.pathname.replace(/\/+$/, '');
      url.pathname = trimmedPath || '/';
    }

    return url.toString();
  } catch {
    return null;
  }
}
|
||||||
|
|
||||||
|
/**
 * Read and normalize the `googleNews` section of the config.
 *
 * @returns {{language: string, country: string, topics: string[], queries: string[]}}
 *   `language` is lower-cased (default 'en'), `country` upper-cased (default
 *   'US'); `topics` are upper-cased, `queries` trimmed, and both are
 *   de-duplicated with empty entries removed.
 */
function getGoogleNewsConfig() {
  const settings = config.googleNews || {};

  // Stringify, clean, drop empties and de-duplicate while keeping first-seen order.
  const uniqueCleaned = (values, clean) => {
    const cleaned = (values || []).map((value) => clean(String(value || ''))).filter(Boolean);
    return [...new Set(cleaned)];
  };

  const language = String(settings.language || 'en').trim().toLowerCase() || 'en';
  const country = String(settings.country || 'US').trim().toUpperCase() || 'US';
  const topics = uniqueCleaned(settings.topics, (text) => text.trim().toUpperCase());
  const queries = uniqueCleaned(settings.queries, (text) => text.trim());

  return { language, country, topics, queries };
}
|
||||||
|
|
||||||
|
/**
 * Build the list of Google News RSS feeds to poll: one per configured topic
 * followed by one per configured search query.
 *
 * @returns {Array<{label: string, url: string}>}
 */
function buildFeedDefinitions() {
  const { language, country, topics, queries } = getGoogleNewsConfig();

  // Locale suffix shared by every feed URL.
  const hl = encodeURIComponent(language);
  const gl = encodeURIComponent(country);
  const ceid = encodeURIComponent(`${country}:${language}`);
  const locale = `hl=${hl}&gl=${gl}&ceid=${ceid}`;

  const topicFeeds = topics.map((topic) => ({
    label: `topic:${topic}`,
    url: `https://news.google.com/rss/headlines/section/topic/${encodeURIComponent(topic)}?${locale}`,
  }));

  const queryFeeds = queries.map((query) => ({
    label: `query:${query}`,
    url: `https://news.google.com/rss/search?q=${encodeURIComponent(query)}&${locale}`,
  }));

  return topicFeeds.concat(queryFeeds);
}
|
||||||
|
|
||||||
|
/**
 * Download a feed through `fetchWithPolicy` and parse its XML.
 *
 * @param {string} feedUrl
 * @returns {Promise<object>} Whatever `parser.parseString` resolves to.
 * @throws {Error} With a `status` property when the response is not ok.
 */
async function parseFeed(feedUrl) {
  const requestOptions = {
    timeout: 10000,
    retries: 1,
    headers: {
      Accept: 'application/rss+xml, application/xml, text/xml;q=0.9, */*;q=0.8',
    },
  };

  const response = await fetchWithPolicy(feedUrl, requestOptions);

  if (response.ok) {
    const xml = await response.text();
    return parser.parseString(xml);
  }

  throw Object.assign(new Error(`Status code ${response.status}`), { status: response.status });
}
|
||||||
|
|
||||||
|
// Named entities understood by decodeHtmlEntities.
const HTML_NAMED_ENTITIES = Object.freeze({
  quot: '"',
  apos: "'",
  amp: '&',
  lt: '<',
  gt: '>',
  nbsp: ' ',
});

/**
 * Decode numeric (`&#39;`, `&#x27;`) and a small set of named HTML entities
 * (`&quot; &apos; &amp; &lt; &gt; &nbsp;`).
 *
 * Decoding happens in a single pass, so text produced by one replacement is
 * never decoded again (`&amp;lt;` becomes `&lt;`, not `<`). Numeric
 * references outside the Unicode range are left untouched instead of
 * throwing.
 *
 * @param {*} value - Text to decode; nullish and other falsy values yield ''.
 * @returns {string}
 */
function decodeHtmlEntities(value) {
  return String(value || '').replace(
    /&(?:#[xX]([0-9a-fA-F]+)|#(\d+)|(quot|apos|amp|lt|gt|nbsp));/g,
    (entity, hex, dec, name) => {
      if (name !== undefined) {
        return HTML_NAMED_ENTITIES[name];
      }

      const codePoint = hex !== undefined
        ? Number.parseInt(hex, 16)
        : Number.parseInt(dec, 10);

      // String.fromCodePoint throws a RangeError above U+10FFFF.
      if (!Number.isInteger(codePoint) || codePoint > 0x10ffff) {
        return entity;
      }

      return String.fromCodePoint(codePoint);
    },
  );
}
|
||||||
|
|
||||||
|
/**
 * Pull the `data-p` attribute of the first `<c-wiz>` element out of a page
 * and entity-decode it.
 *
 * @param {string} html
 * @returns {string} The decoded attribute value, or the decoded empty string
 *   when no such attribute is found.
 */
function extractDataP(html) {
  const found = html.match(/<c-wiz\b[^>]*\bdata-p\s*=\s*(?:"([^"]*)"|'([^']*)')/i);

  if (!found) {
    return decodeHtmlEntities('');
  }

  // Group 1 is the double-quoted form, group 2 the single-quoted form.
  const [, doubleQuoted, singleQuoted] = found;
  return decodeHtmlEntities(doubleQuoted || singleQuoted);
}
|
||||||
|
|
||||||
|
/**
 * Build the form body for the Google News `batchexecute` request from a
 * page's decoded `data-p` value.
 *
 * @param {string} dataP - Decoded `data-p` attribute text.
 * @returns {URLSearchParams} Form data with a single `f.req` field.
 * @throws {SyntaxError} When the rewritten `data-p` text is not valid JSON.
 */
function buildBatchExecutePayload(dataP) {
  // The attribute starts with the marker "%.@."; swapping it for an array
  // opener with the request name turns the text into parseable JSON.
  const requestArray = JSON.parse(dataP.replace('%.@.', '["garturlreq",'));

  // Keep everything except the last six entries, then re-append the last two.
  const head = requestArray.slice(0, -6);
  const tail = requestArray.slice(-2);
  const innerRequest = JSON.stringify([...head, ...tail]);

  const envelope = [[['Fbv4je', innerRequest, 'null', 'generic']]];
  return new URLSearchParams({ 'f.req': JSON.stringify(envelope) });
}
|
||||||
|
|
||||||
|
/**
 * Extract the resolved article URL from a `batchexecute` response body.
 *
 * @param {string} responseText - Raw response text, optionally prefixed with
 *   the `)]}'` guard.
 * @returns {string|null} The normalized URL (as returned by normalizeArticleUrl).
 * @throws {Error} When the body does not have the expected JSON shape.
 */
function extractResolvedUrl(responseText) {
  const body = String(responseText || '').trim();
  const guard = ")]}'";
  const jsonText = body.startsWith(guard) ? body.slice(4).trim() : body;

  // The first envelope's third slot holds a JSON string whose second
  // element is the target URL.
  const envelopes = JSON.parse(jsonText);
  const resultPayload = JSON.parse(envelopes[0][2]);
  const targetUrl = resultPayload[1];

  return normalizeArticleUrl(targetUrl);
}
|
||||||
|
|
||||||
|
/**
 * Resolve a Google News item link to the publisher's article URL.
 *
 * Fetches the linked page, reads its `data-p` attribute and asks the
 * `batchexecute` endpoint for the target. Any failure is logged and the
 * normalized input URL is returned instead.
 *
 * @param {string} rawUrl
 * @returns {Promise<string|null>} Null only when the input cannot be normalized.
 */
async function resolveArticleUrl(rawUrl) {
  const fallbackUrl = normalizeArticleUrl(rawUrl);
  if (!fallbackUrl) {
    return null;
  }

  try {
    const pageResponse = await fetchWithPolicy(fallbackUrl, {
      timeout: 10000,
      retries: 1,
    });
    const dataP = extractDataP(await pageResponse.text());

    // No data-p attribute: nothing to resolve, keep the input URL.
    if (!dataP) {
      return fallbackUrl;
    }

    const batchRequest = {
      method: 'POST',
      body: buildBatchExecutePayload(dataP),
      timeout: 10000,
      retries: 1,
      headers: {
        'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8',
        Origin: 'https://news.google.com',
        Referer: fallbackUrl,
      },
    };
    const batchResponse = await fetchWithPolicy('https://news.google.com/_/DotsSplashUi/data/batchexecute', batchRequest);
    const targetUrl = extractResolvedUrl(await batchResponse.text());

    return targetUrl || fallbackUrl;
  } catch (error) {
    console.error(`Failed to resolve Google News URL: ${fallbackUrl}`, error);
    return fallbackUrl;
  }
}
|
||||||
|
|
||||||
|
/**
 * Map `items` through an async `mapper`, running at most `concurrency`
 * calls at a time (items are processed in consecutive batches).
 *
 * @param {Iterable} items - Values to map; nullish is treated as empty.
 * @param {number} concurrency - Batch size. Values below 1, fractions and
 *   non-numbers are coerced to a whole number of at least 1, so the loop
 *   always makes progress.
 * @param {function(*, number): *} mapper - Called with each item and its
 *   index in `items`.
 * @returns {Promise<Array>} Results in input order.
 */
async function mapWithConcurrency(items, concurrency, mapper) {
  const list = Array.from(items || []);
  // A step of 0 or NaN would never advance `offset`.
  const batchSize = Math.max(1, Math.floor(Number(concurrency)) || 1);
  const results = [];

  for (let offset = 0; offset < list.length; offset += batchSize) {
    const batch = list.slice(offset, offset + batchSize);
    const mapped = await Promise.all(
      batch.map((item, position) => mapper(item, offset + position)),
    );
    results.push(...mapped);
  }

  return results;
}
|
||||||
|
|
||||||
|
/**
 * Fetch every configured Google News feed and return its items as article
 * records with resolved, de-duplicated URLs.
 *
 * Feeds are processed one after another; a failing feed is logged and
 * skipped.
 *
 * @returns {Promise<Array<{title: string, description: *, url: string, source: string, pubDate: *}>>}
 */
async function fetchGoogleNewsArticles() {
  const collected = [];
  const seenUrls = new Set();

  // Turn one feed item into an article record, or null when it lacks a
  // title or link, cannot be resolved, or resolves to an already-seen URL.
  const toArticle = async (item) => {
    const title = String(item.title || '').trim();
    const link = String(item.link || item.guid || '').trim();
    if (!title || !link) {
      return null;
    }

    const resolvedUrl = await resolveArticleUrl(link);
    if (!resolvedUrl || seenUrls.has(resolvedUrl)) {
      return null;
    }
    seenUrls.add(resolvedUrl);

    return {
      title,
      description: item.contentSnippet || item.content || item.summary || null,
      url: resolvedUrl,
      source: 'googlenews',
      pubDate: item.isoDate || item.pubDate || null,
    };
  };

  for (const feed of buildFeedDefinitions()) {
    try {
      const parsed = await parseFeed(feed.url);
      const mapped = await mapWithConcurrency(parsed.items || [], RESOLVE_CONCURRENCY, (item) => toArticle(item));
      collected.push(...mapped.filter(Boolean));
    } catch (error) {
      console.error(`Failed to fetch Google News feed: ${feed.label}`, error);
    }
  }

  return collected;
}
|
||||||
|
|
||||||
|
// Public API of the Google News source module.
module.exports = { fetchGoogleNewsArticles };
|
||||||
|
|
@ -1,6 +1,12 @@
|
||||||
const config = require('../config');
|
const config = require('../config');
|
||||||
const { fetchWithPolicy } = require('../http');
|
const { fetchWithPolicy } = require('../http');
|
||||||
const { createBrowserSession, shouldUseBrowser } = require('./browserCrawler');
|
const { createBrowserSession, shouldUseBrowser } = require('./browserCrawler');
|
||||||
|
const { classifyPageWithLlm } = require('./crawlerClassifier');
|
||||||
|
const { getRssSources } = require('./sourceCatalog');
|
||||||
|
|
||||||
|
/**
 * Resolve after roughly `ms` milliseconds.
 *
 * @param {number} ms - Delay handed to setTimeout.
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
|
||||||
|
|
||||||
const TRACKING_PARAM_PATTERNS = [
|
const TRACKING_PARAM_PATTERNS = [
|
||||||
/^utm_/i,
|
/^utm_/i,
|
||||||
|
|
@ -22,7 +28,7 @@ const ARTICLE_DATE_PATH = /\/\d{4}\/\d{2}\/\d{2}(?:\/|$)|\/\d{4}\/\d{2}(?:\/|$)/
|
||||||
const ARTICLE_PATH_HINT = /(\/article\/|\/articles\/|\/news\/|\/story\/|\/stories\/)/i;
|
// Matches URLs containing an /article/, /articles/, /news/, /story/ or
// /stories/ segment (case-insensitive); scorePage tests it against the page URL.
const ARTICLE_PATH_HINT = /(\/article\/|\/articles\/|\/news\/|\/story\/|\/stories\/)/i;
|
||||||
const ARTICLE_PATH_STRONG_HINT = /\/\d{4}\/\d{2}\/\d{2}\//;
|
// Matches a /YYYY/MM/DD/ date segment followed by more path; scorePage tests
// it against the URL pathname.
const ARTICLE_PATH_STRONG_HINT = /\/\d{4}\/\d{2}\/\d{2}\//;
|
||||||
const LISTING_ARTICLE_FALSE_POSITIVE_PATH = /(\/category\/|\/tag\/|\/latest(?:\/|$)|\/topics?(?:\/|$)|\/sections?(?:\/|$))/i;
|
// Matches listing-style paths (/category/, /tag/, /latest, /topic(s),
// /section(s)), case-insensitive; scorePage tests it against the URL pathname.
const LISTING_ARTICLE_FALSE_POSITIVE_PATH = /(\/category\/|\/tag\/|\/latest(?:\/|$)|\/topics?(?:\/|$)|\/sections?(?:\/|$))/i;
|
||||||
const BLOCKED_PATH_HINT = /(\/search(?:\/|$)|\/login(?:\/|$)|\/account(?:\/|$)|\/video(?:\/|$)|\/videos(?:\/|$)|\/podcast(?:\/|$)|\/podcasts(?:\/|$)|\/live(?:\/|$))/i;
|
// Matches utility and non-article path segments: search, login, account,
// video(s), podcast(s), live, subscribe/subscription, newsletter(s), privacy,
// terms, about and contact. Each must end the path or be followed by "/".
// NOTE(review): usage is outside this view; presumably such URLs are skipped - confirm.
const BLOCKED_PATH_HINT = /(\/search(?:\/|$)|\/login(?:\/|$)|\/account(?:\/|$)|\/video(?:\/|$)|\/videos(?:\/|$)|\/podcast(?:\/|$)|\/podcasts(?:\/|$)|\/live(?:\/|$)|\/subscribe(?:\/|$)|\/subscription(?:\/|$)|\/newsletters?(?:\/|$)|\/privacy(?:\/|$)|\/terms(?:\/|$)|\/about(?:\/|$)|\/contact(?:\/|$))/i;
|
||||||
const EXPLORATION_PATH_HINT = /(\/page\/\d+(?:\/|$)|[?&]page=\d+|\/archive(?:s)?(?:\/|$)|\/latest(?:\/|$)|\/news(?:\/|$)|\/world(?:\/|$)|\/business(?:\/|$)|\/politics(?:\/|$)|\/technology(?:\/|$)|\/tech(?:\/|$)|\/markets(?:\/|$)|\/economy(?:\/|$)|\/topic(?:s)?(?:\/|$)|\/section(?:s)?(?:\/|$)|\/category(?:ies)?(?:\/|$)|\/tag(?:s)?(?:\/|$))/i;
|
// Matches pagination (/page/N, ?page=N), archive and section-style paths
// (latest, news, world, business, politics, technology/tech, markets, economy,
// topic(s), section(s), category/categories, tag(s)), case-insensitive.
// NOTE(review): usage is outside this view; presumably marks pages worth crawling for links - confirm.
const EXPLORATION_PATH_HINT = /(\/page\/\d+(?:\/|$)|[?&]page=\d+|\/archive(?:s)?(?:\/|$)|\/latest(?:\/|$)|\/news(?:\/|$)|\/world(?:\/|$)|\/business(?:\/|$)|\/politics(?:\/|$)|\/technology(?:\/|$)|\/tech(?:\/|$)|\/markets(?:\/|$)|\/economy(?:\/|$)|\/topic(?:s)?(?:\/|$)|\/section(?:s)?(?:\/|$)|\/category(?:ies)?(?:\/|$)|\/tag(?:s)?(?:\/|$))/i;
|
||||||
|
|
||||||
function decodeHtmlEntities(value) {
|
function decodeHtmlEntities(value) {
|
||||||
|
|
@ -149,9 +155,14 @@ function extractTimeDatetime(html) {
|
||||||
return match ? decodeHtmlEntities(match[2]).trim() : null;
|
return match ? decodeHtmlEntities(match[2]).trim() : null;
|
||||||
}
|
}
|
||||||
|
|
||||||
function extractParagraphTextLength(html) {
|
/**
 * Measure the text carried by the first twelve `<p>` elements of a page.
 *
 * @param {string} html
 * @returns {{textLength: number, substantialCount: number}} Total length of
 *   the non-empty normalized paragraphs, and how many of them are at least
 *   80 characters long.
 */
function extractParagraphStats(html) {
  const paragraphTags = html.match(/<p\b[^>]*>[\s\S]*?<\/p>/gi) || [];

  let textLength = 0;
  let substantialCount = 0;

  for (const tag of paragraphTags.slice(0, 12)) {
    const text = normalizeText(tag);

    // Paragraphs that normalize to nothing are ignored entirely.
    if (!text) {
      continue;
    }

    textLength += text.length;
    if (text.length >= 80) {
      substantialCount += 1;
    }
  }

  return { textLength, substantialCount };
}
|
||||||
|
|
||||||
function extractJsonLdBlocks(html) {
|
function extractJsonLdBlocks(html) {
|
||||||
|
|
@ -266,31 +277,43 @@ function scorePage(pageUrl, meta, html, jsonLdArticle, links) {
|
||||||
const hasArticlePathHint = ARTICLE_PATH_HINT.test(pageUrl);
|
const hasArticlePathHint = ARTICLE_PATH_HINT.test(pageUrl);
|
||||||
const hasStrongArticlePath = ARTICLE_PATH_STRONG_HINT.test(pathname);
|
const hasStrongArticlePath = ARTICLE_PATH_STRONG_HINT.test(pathname);
|
||||||
const hasListingFalsePositivePath = LISTING_ARTICLE_FALSE_POSITIVE_PATH.test(pathname);
|
const hasListingFalsePositivePath = LISTING_ARTICLE_FALSE_POSITIVE_PATH.test(pathname);
|
||||||
const paragraphTextLength = extractParagraphTextLength(html);
|
const { textLength: paragraphTextLength, substantialCount: substantialParagraphCount } = extractParagraphStats(html);
|
||||||
const headlineLinks = links.filter(({ text }) => text.length >= 25 && text.length <= 180).length;
|
const headlineLinks = links.filter(({ text }) => text.length >= 25 && text.length <= 180).length;
|
||||||
const h1 = extractH1(html);
|
const h1 = extractH1(html);
|
||||||
|
const titleTag = extractTitleTag(html);
|
||||||
const ogTitle = normalizeText(meta.get('og:title') || '');
|
const ogTitle = normalizeText(meta.get('og:title') || '');
|
||||||
|
const twitterTitle = normalizeText(meta.get('twitter:title') || '');
|
||||||
|
const titleSignalsLower = [h1, titleTag, ogTitle, twitterTitle].filter(Boolean).map((value) => value.toLowerCase());
|
||||||
|
const looksLikeSectionTitle = titleSignalsLower.some((value) => /^(category:|section:)/.test(value)
|
||||||
|
|| /(market[s]?|business|technology|tech|science|health|gaming|culture|news|world|economy)/.test(value) && value.length <= 80);
|
||||||
|
const looksLikeCommercialPage = titleSignalsLower.some((value) => /(subscribe|subscription|sign in|log in|newsletter|advertis)/.test(value));
|
||||||
const jsonLdHeadline = normalizeText(jsonLdArticle && jsonLdArticle.headline);
|
const jsonLdHeadline = normalizeText(jsonLdArticle && jsonLdArticle.headline);
|
||||||
const jsonLdMatchesPage = jsonLdHeadline
|
const titleSignals = [h1, ogTitle, twitterTitle, titleTag].filter(Boolean);
|
||||||
&& ((h1 && (h1.includes(jsonLdHeadline) || jsonLdHeadline.includes(h1)))
|
const matchingTitleSignals = titleSignals.filter((value) => jsonLdHeadline && (value.includes(jsonLdHeadline) || jsonLdHeadline.includes(value)));
|
||||||
|| (ogTitle && (ogTitle.includes(jsonLdHeadline) || jsonLdHeadline.includes(ogTitle))));
|
const hasJsonLdArticle = Boolean(jsonLdArticle && matchingTitleSignals.length > 0);
|
||||||
const hasJsonLdArticle = Boolean(jsonLdArticle && jsonLdMatchesPage);
|
|
||||||
const hasPublishTime = Boolean(meta.get('article:published_time') || meta.get('og:article:published_time') || extractTimeDatetime(html));
|
const hasPublishTime = Boolean(meta.get('article:published_time') || meta.get('og:article:published_time') || extractTimeDatetime(html));
|
||||||
const hasOgArticle = String(meta.get('og:type') || '').toLowerCase() === 'article';
|
const hasOgArticle = String(meta.get('og:type') || '').toLowerCase() === 'article';
|
||||||
|
const hasArticleTag = /<article\b/i.test(html);
|
||||||
|
const hasLongBody = paragraphTextLength >= 600 && substantialParagraphCount >= 3;
|
||||||
|
const hasArticleStructure = Boolean(h1) && (hasArticleTag || hasLongBody);
|
||||||
|
const hasMetadataArticleSignal = hasOgArticle || hasPublishTime || hasJsonLdArticle;
|
||||||
|
const hasUrlArticleSignal = hasArticleDatePath || hasStrongArticlePath || hasArticlePathHint;
|
||||||
|
const hasPrimaryArticleSignal = hasMetadataArticleSignal || hasUrlArticleSignal;
|
||||||
|
const hasSecondaryArticleSignal = hasArticlePathHint || hasArticleStructure;
|
||||||
|
|
||||||
if (hasJsonLdArticle) {
|
if (hasJsonLdArticle) {
|
||||||
articleScore += 4;
|
articleScore += 3;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (hasOgArticle && !hasListingFalsePositivePath) {
|
if (hasOgArticle && !hasListingFalsePositivePath) {
|
||||||
articleScore += 1;
|
articleScore += 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (hasPublishTime && !hasListingFalsePositivePath) {
|
if (hasPublishTime && !hasListingFalsePositivePath) {
|
||||||
articleScore += 1;
|
articleScore += 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (/<article\b/i.test(html)) {
|
if (hasArticleTag) {
|
||||||
articleScore += 1;
|
articleScore += 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -298,7 +321,7 @@ function scorePage(pageUrl, meta, html, jsonLdArticle, links) {
|
||||||
articleScore += 2;
|
articleScore += 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (h1 && paragraphTextLength >= 500) {
|
if (hasLongBody) {
|
||||||
articleScore += 2;
|
articleScore += 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -318,19 +341,41 @@ function scorePage(pageUrl, meta, html, jsonLdArticle, links) {
|
||||||
listingScore += 3;
|
listingScore += 3;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (articleScore > 0) {
|
if (looksLikeSectionTitle) {
|
||||||
listingScore -= 1;
|
listingScore += 3;
|
||||||
}
|
}
|
||||||
|
|
||||||
const hasArticleSignalsBeyondJsonLd = hasOgArticle || hasPublishTime || hasStrongArticlePath || hasArticlePathHint || paragraphTextLength >= 500;
|
if (looksLikeCommercialPage) {
|
||||||
const looksLikeListingPage = headlineLinks >= 15;
|
listingScore += 4;
|
||||||
const isArticleCandidate = !looksLikeListingPage
|
}
|
||||||
&& articleScore >= 5
|
|
||||||
&& articleScore > listingScore
|
|
||||||
&& hasArticleSignalsBeyondJsonLd
|
|
||||||
&& (!jsonLdArticle || hasJsonLdArticle || hasStrongArticlePath || hasArticlePathHint || paragraphTextLength >= 500);
|
|
||||||
|
|
||||||
return { articleScore, listingScore, isArticleCandidate };
|
if ((pathname === '/' || LISTING_PATH_HINT.test(pathname)) && headlineLinks >= 8 && links.length >= 20) {
|
||||||
|
listingScore += 4;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (headlineLinks >= 15 && !hasPrimaryArticleSignal && !hasLongBody) {
|
||||||
|
listingScore += 3;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!hasPrimaryArticleSignal && headlineLinks >= 12 && links.length >= 25) {
|
||||||
|
listingScore += 2;
|
||||||
|
}
|
||||||
|
|
||||||
|
const shouldAskLlm = !looksLikeCommercialPage && (
|
||||||
|
articleScore >= 2
|
||||||
|
|| hasPrimaryArticleSignal
|
||||||
|
|| hasSecondaryArticleSignal
|
||||||
|
|| (headlineLinks <= 10 && links.length <= 40)
|
||||||
|
);
|
||||||
|
|
||||||
|
return {
|
||||||
|
articleScore,
|
||||||
|
listingScore,
|
||||||
|
shouldAskLlm,
|
||||||
|
headlineLinks,
|
||||||
|
paragraphCount: substantialParagraphCount,
|
||||||
|
paragraphTextLength,
|
||||||
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
function shouldQueueLink(url) {
|
function shouldQueueLink(url) {
|
||||||
|
|
@ -458,16 +503,23 @@ function normalizeSite(site) {
|
||||||
const seeds = unique((site.seeds || [])
|
const seeds = unique((site.seeds || [])
|
||||||
.map((seed) => canonicalizeUrl(seed, seed, allowedHosts))
|
.map((seed) => canonicalizeUrl(seed, seed, allowedHosts))
|
||||||
.filter(Boolean));
|
.filter(Boolean));
|
||||||
|
const renderMode = String(site.renderMode || 'http').trim().toLowerCase() === 'browser' ? 'browser' : 'http';
|
||||||
|
const maxPages = normalizeLimit(site.maxPages, 15, 1, 500);
|
||||||
|
|
||||||
return {
|
return {
|
||||||
name: String(site.name || '').trim(),
|
name: String(site.name || '').trim(),
|
||||||
label: String(site.label || '').trim(),
|
label: String(site.label || '').trim(),
|
||||||
allowedHosts,
|
allowedHosts,
|
||||||
seeds,
|
seeds,
|
||||||
renderMode: String(site.renderMode || 'http').trim().toLowerCase() === 'browser' ? 'browser' : 'http',
|
renderMode,
|
||||||
maxPages: normalizeLimit(site.maxPages, 15, 1, 500),
|
maxPages,
|
||||||
maxDepth: normalizeLimit(site.maxDepth, 1, 0, 5),
|
maxDepth: normalizeLimit(site.maxDepth, 1, 0, 5),
|
||||||
pageConcurrency: normalizeLimit(site.pageConcurrency, String(site.renderMode || 'http').trim().toLowerCase() === 'browser' ? 3 : 4, 1, 12),
|
pageConcurrency: normalizeLimit(site.pageConcurrency, renderMode === 'browser' ? 2 : 4, 1, renderMode === 'browser' ? 4 : 12),
|
||||||
|
maxQueuedPages: normalizeLimit(site.maxQueuedPages, Math.min(maxPages * 3, 1000), maxPages, 2000),
|
||||||
|
memorySoftLimitMb: normalizeLimit(site.memorySoftLimitMb, 800, 128, 8192),
|
||||||
|
memoryHardLimitMb: normalizeLimit(site.memoryHardLimitMb, 1400, 256, 16384),
|
||||||
|
memoryThrottleDelayMs: normalizeLimit(site.memoryThrottleDelayMs, 1500, 100, 10000),
|
||||||
|
llmPatternMinHits: normalizeLimit(site.llmPatternMinHits, 3, 1, 20),
|
||||||
requestTimeout: Math.max(1000, Math.min(Number(site.requestTimeout) || 15000, 30000)),
|
requestTimeout: Math.max(1000, Math.min(Number(site.requestTimeout) || 15000, 30000)),
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
@ -483,7 +535,7 @@ function getConfiguredCrawlerSites() {
|
||||||
const explicitLabels = new Set(explicitSites.map((site) => site.label).filter(Boolean));
|
const explicitLabels = new Set(explicitSites.map((site) => site.label).filter(Boolean));
|
||||||
const derivedSites = [];
|
const derivedSites = [];
|
||||||
|
|
||||||
for (const feed of config.rssFeeds || []) {
|
for (const feed of getRssSources()) {
|
||||||
const label = String(feed.label || '').trim();
|
const label = String(feed.label || '').trim();
|
||||||
if (!label || disabledLabels.has(label) || explicitLabels.has(label)) {
|
if (!label || disabledLabels.has(label) || explicitLabels.has(label)) {
|
||||||
continue;
|
continue;
|
||||||
|
|
@ -491,7 +543,7 @@ function getConfiguredCrawlerSites() {
|
||||||
|
|
||||||
let hostname = '';
|
let hostname = '';
|
||||||
try {
|
try {
|
||||||
hostname = new URL(feed.url).hostname;
|
hostname = new URL(feed.feedUrl).hostname;
|
||||||
} catch {
|
} catch {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
@ -501,11 +553,16 @@ function getConfiguredCrawlerSites() {
|
||||||
label,
|
label,
|
||||||
name: override.name || `crawler_${slugifyLabel(label)}`,
|
name: override.name || `crawler_${slugifyLabel(label)}`,
|
||||||
allowedHosts: override.allowedHosts || buildAllowedHosts(hostname),
|
allowedHosts: override.allowedHosts || buildAllowedHosts(hostname),
|
||||||
seeds: override.seeds || buildDefaultSeeds(feed.url),
|
seeds: override.seeds || buildDefaultSeeds(feed.feedUrl),
|
||||||
renderMode: override.renderMode || defaults.renderMode,
|
renderMode: override.renderMode || defaults.renderMode,
|
||||||
maxPages: override.maxPages || defaults.maxPages,
|
maxPages: override.maxPages || defaults.maxPages,
|
||||||
maxDepth: override.maxDepth || defaults.maxDepth,
|
maxDepth: override.maxDepth || defaults.maxDepth,
|
||||||
pageConcurrency: override.pageConcurrency,
|
pageConcurrency: override.pageConcurrency,
|
||||||
|
maxQueuedPages: override.maxQueuedPages,
|
||||||
|
memorySoftLimitMb: override.memorySoftLimitMb || defaults.memorySoftLimitMb,
|
||||||
|
memoryHardLimitMb: override.memoryHardLimitMb || defaults.memoryHardLimitMb,
|
||||||
|
memoryThrottleDelayMs: override.memoryThrottleDelayMs || defaults.memoryThrottleDelayMs,
|
||||||
|
llmPatternMinHits: override.llmPatternMinHits || defaults.llmPatternMinHits,
|
||||||
requestTimeout: override.requestTimeout || defaults.requestTimeout,
|
requestTimeout: override.requestTimeout || defaults.requestTimeout,
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|
@ -555,6 +612,28 @@ async function crawlSite(site) {
|
||||||
const discoveredArticleUrls = new Set();
|
const discoveredArticleUrls = new Set();
|
||||||
const articles = [];
|
const articles = [];
|
||||||
|
|
||||||
|
function getResidentSetMb() {
|
||||||
|
return Math.round(process.memoryUsage().rss / (1024 * 1024));
|
||||||
|
}
|
||||||
|
|
||||||
|
async function throttleForMemory() {
|
||||||
|
let residentSetMb = getResidentSetMb();
|
||||||
|
|
||||||
|
while (residentSetMb >= normalizedSite.memorySoftLimitMb) {
|
||||||
|
if (residentSetMb >= normalizedSite.memoryHardLimitMb) {
|
||||||
|
console.error(`Crawler memory hard limit reached for ${normalizedSite.name}: ${residentSetMb}MB >= ${normalizedSite.memoryHardLimitMb}MB`);
|
||||||
|
queue.length = 0;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
console.warn(`Crawler memory throttle for ${normalizedSite.name}: ${residentSetMb}MB >= ${normalizedSite.memorySoftLimitMb}MB`);
|
||||||
|
await sleep(normalizedSite.memoryThrottleDelayMs);
|
||||||
|
residentSetMb = getResidentSetMb();
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
async function processPage(current) {
|
async function processPage(current) {
|
||||||
let html;
|
let html;
|
||||||
try {
|
try {
|
||||||
|
|
@ -575,7 +654,35 @@ async function crawlSite(site) {
|
||||||
? canonicalizeUrl(canonicalHref, current.url, normalizedSite.allowedHosts) || current.url
|
? canonicalizeUrl(canonicalHref, current.url, normalizedSite.allowedHosts) || current.url
|
||||||
: current.url;
|
: current.url;
|
||||||
const links = extractLinks(html, canonicalUrl, normalizedSite.allowedHosts);
|
const links = extractLinks(html, canonicalUrl, normalizedSite.allowedHosts);
|
||||||
const { listingScore, isArticleCandidate } = scorePage(canonicalUrl, meta, html, jsonLdArticle, links);
|
const heuristic = scorePage(canonicalUrl, meta, html, jsonLdArticle, links);
|
||||||
|
let isArticleCandidate = false;
|
||||||
|
let effectiveListingScore = heuristic.listingScore;
|
||||||
|
|
||||||
|
if (heuristic.shouldAskLlm) {
|
||||||
|
try {
|
||||||
|
const llmDecision = await classifyPageWithLlm({
|
||||||
|
siteName: normalizedSite.name,
|
||||||
|
url: canonicalUrl,
|
||||||
|
html,
|
||||||
|
meta,
|
||||||
|
jsonLdArticle,
|
||||||
|
heuristic,
|
||||||
|
links,
|
||||||
|
minPatternHits: normalizedSite.llmPatternMinHits,
|
||||||
|
});
|
||||||
|
|
||||||
|
if (llmDecision.classification === 'article') {
|
||||||
|
isArticleCandidate = true;
|
||||||
|
effectiveListingScore = Math.max(0, heuristic.listingScore - 2);
|
||||||
|
} else if (llmDecision.classification === 'listing') {
|
||||||
|
effectiveListingScore = Math.max(heuristic.listingScore, 3);
|
||||||
|
} else if (llmDecision.classification === 'other') {
|
||||||
|
effectiveListingScore = Math.max(0, heuristic.listingScore - 1);
|
||||||
|
}
|
||||||
|
} catch (error) {
|
||||||
|
console.error(`Crawler LLM classification failed for ${normalizedSite.name}: ${canonicalUrl}`, error);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (isArticleCandidate && !discoveredArticleUrls.has(canonicalUrl)) {
|
if (isArticleCandidate && !discoveredArticleUrls.has(canonicalUrl)) {
|
||||||
const title = normalizeText(selectTitle(meta, jsonLdArticle, html));
|
const title = normalizeText(selectTitle(meta, jsonLdArticle, html));
|
||||||
|
|
@ -587,16 +694,20 @@ async function crawlSite(site) {
|
||||||
url: canonicalUrl,
|
url: canonicalUrl,
|
||||||
source: normalizedSite.name,
|
source: normalizedSite.name,
|
||||||
pubDate: selectPubDate(meta, jsonLdArticle, html),
|
pubDate: selectPubDate(meta, jsonLdArticle, html),
|
||||||
isIndexPage: !isArticleCandidate,
|
isIndexPage: false,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (current.depth >= normalizedSite.maxDepth || !shouldContinueExploring(current, listingScore, links)) {
|
if (current.depth >= normalizedSite.maxDepth || !shouldContinueExploring(current, effectiveListingScore, links)) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (const link of links) {
|
for (const link of links) {
|
||||||
|
if (queuedUrls.size >= normalizedSite.maxQueuedPages) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
if (!shouldQueueLink(link.url) || visitedUrls.has(link.url) || queuedUrls.has(link.url)) {
|
if (!shouldQueueLink(link.url) || visitedUrls.has(link.url) || queuedUrls.has(link.url)) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
@ -608,6 +719,10 @@ async function crawlSite(site) {
|
||||||
|
|
||||||
try {
|
try {
|
||||||
while (queue.length && visitedUrls.size < normalizedSite.maxPages) {
|
while (queue.length && visitedUrls.size < normalizedSite.maxPages) {
|
||||||
|
if (!await throttleForMemory()) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
const batch = [];
|
const batch = [];
|
||||||
|
|
||||||
while (queue.length && batch.length < normalizedSite.pageConcurrency && visitedUrls.size + batch.length < normalizedSite.maxPages) {
|
while (queue.length && batch.length < normalizedSite.pageConcurrency && visitedUrls.size + batch.length < normalizedSite.maxPages) {
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,6 @@
|
||||||
const Parser = require('rss-parser');
|
const Parser = require('rss-parser');
|
||||||
const config = require('../config');
|
|
||||||
const { fetchWithPolicy } = require('../http');
|
const { fetchWithPolicy } = require('../http');
|
||||||
|
const { getRssSources, markFeedFailed } = require('./sourceCatalog');
|
||||||
|
|
||||||
const parser = new Parser({
|
const parser = new Parser({
|
||||||
timeout: 10000,
|
timeout: 10000,
|
||||||
|
|
@ -10,62 +10,11 @@ const parser = new Parser({
|
||||||
},
|
},
|
||||||
});
|
});
|
||||||
|
|
||||||
const blockedFeedDomains = [
|
|
||||||
'arabnews.com',
|
|
||||||
'arabianbusiness.com',
|
|
||||||
'business-standard.com',
|
|
||||||
'cityam.com',
|
|
||||||
'eleconomista.com.mx',
|
|
||||||
'eleconomista.es',
|
|
||||||
'moneycontrol.com',
|
|
||||||
'thisismoney.co.uk',
|
|
||||||
];
|
|
||||||
const invalidFeedLabels = new Set([
|
|
||||||
'ABC Business AU',
|
|
||||||
'Australian Fin Review',
|
|
||||||
'Business Daily Africa',
|
|
||||||
'BusinessLive SA',
|
|
||||||
'Caixin Global',
|
|
||||||
'Cinco Dias',
|
|
||||||
'El Comercio Peru',
|
|
||||||
'FD.nl',
|
|
||||||
'Gulf News Business',
|
|
||||||
'Il Sole 24 Ore',
|
|
||||||
'Infobae Economia AR',
|
|
||||||
'Japan Times Business',
|
|
||||||
'Korea JoongAng Daily',
|
|
||||||
'Les Echos',
|
|
||||||
'Live Mint',
|
|
||||||
'NZ Herald Business',
|
|
||||||
'Portafolio Colombia',
|
|
||||||
'The Star Malaysia',
|
|
||||||
'Xinhua Business',
|
|
||||||
]);
|
|
||||||
const malformedFeedLabels = new Set([
|
|
||||||
'BFM Business',
|
|
||||||
'Business Daily Africa',
|
|
||||||
'Nation News Barbados',
|
|
||||||
]);
|
|
||||||
const loggedBlockedFeeds = new Set();
|
|
||||||
const loggedInvalidFeeds = new Set();
|
|
||||||
const loggedUpstreamFeedSkips = new Set();
|
const loggedUpstreamFeedSkips = new Set();
|
||||||
|
|
||||||
function getHostname(url) {
|
function isMalformedXml(error) {
|
||||||
try {
|
const msg = String(error && error.message || '');
|
||||||
return new URL(url).hostname.toLowerCase();
|
return msg.includes('Invalid character in entity name') || msg.includes('Attribute without value') || msg.includes('Unquoted attribute value');
|
||||||
} catch {
|
|
||||||
return '';
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
function isBlockedFeed(feed) {
|
|
||||||
const hostname = getHostname(feed.url);
|
|
||||||
return blockedFeedDomains.some((domain) => hostname === domain || hostname.endsWith(`.${domain}`));
|
|
||||||
}
|
|
||||||
|
|
||||||
function isMalformedFeedError(error) {
|
|
||||||
const message = String(error && error.message || '');
|
|
||||||
return message.includes('Invalid character in entity name') || message.includes('Attribute without value');
|
|
||||||
}
|
}
|
||||||
|
|
||||||
function getErrorStatus(error) {
|
function getErrorStatus(error) {
|
||||||
|
|
@ -99,28 +48,16 @@ async function parseFeed(feedUrl) {
|
||||||
async function fetchRssArticles() {
|
async function fetchRssArticles() {
|
||||||
const articles = [];
|
const articles = [];
|
||||||
|
|
||||||
for (const feed of config.rssFeeds || []) {
|
for (const feed of getRssSources()) {
|
||||||
const label = feed.label || feed.url;
|
const label = feed.label || feed.id;
|
||||||
|
|
||||||
if (invalidFeedLabels.has(label)) {
|
for (const feedUrl of feed.feedUrls) {
|
||||||
if (!loggedInvalidFeeds.has(label)) {
|
if (feedUrl.startsWith('[FAILED] ')) {
|
||||||
loggedInvalidFeeds.add(label);
|
|
||||||
console.warn(`RSS feed skipped for invalid endpoint ${label}`);
|
|
||||||
}
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (isBlockedFeed(feed)) {
|
|
||||||
const hostname = getHostname(feed.url);
|
|
||||||
if (!loggedBlockedFeeds.has(hostname)) {
|
|
||||||
loggedBlockedFeeds.add(hostname);
|
|
||||||
console.warn(`RSS feed skipped for blocked domain ${hostname}`);
|
|
||||||
}
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const parsed = await parseFeed(feed.url);
|
const parsed = await parseFeed(feedUrl);
|
||||||
for (const item of parsed.items || []) {
|
for (const item of parsed.items || []) {
|
||||||
const title = String(item.title || '').trim();
|
const title = String(item.title || '').trim();
|
||||||
const url = String(item.link || item.guid || '').trim();
|
const url = String(item.link || item.guid || '').trim();
|
||||||
|
|
@ -138,25 +75,27 @@ async function fetchRssArticles() {
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
if (malformedFeedLabels.has(label) && isMalformedFeedError(error)) {
|
const status = getErrorStatus(error);
|
||||||
if (!loggedInvalidFeeds.has(label)) {
|
const isDnsFailure = error && error.cause && error.cause.code === 'ENOTFOUND';
|
||||||
loggedInvalidFeeds.add(label);
|
const permanent = status === 401 || status === 403 || status === 404 || isMalformedXml(error) || isDnsFailure;
|
||||||
console.warn(`RSS feed skipped for malformed XML ${label}`);
|
|
||||||
}
|
if (permanent) {
|
||||||
|
console.warn(`RSS feed permanently failed for ${label} (${feedUrl}) — marking in sources.json`);
|
||||||
|
markFeedFailed(feedUrl);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
const status = getErrorStatus(error);
|
if (status === 429) {
|
||||||
if (status === 401 || status === 403 || status === 404 || status === 429) {
|
const key = `${label}:${feedUrl}:429`;
|
||||||
const key = `${label}:${status}`;
|
|
||||||
if (!loggedUpstreamFeedSkips.has(key)) {
|
if (!loggedUpstreamFeedSkips.has(key)) {
|
||||||
loggedUpstreamFeedSkips.add(key);
|
loggedUpstreamFeedSkips.add(key);
|
||||||
console.warn(`RSS feed skipped for ${label}: upstream returned ${status}`);
|
console.warn(`RSS feed skipped for ${label}: rate limited`);
|
||||||
}
|
}
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
console.error(`Failed to fetch RSS feed: ${label}`, error);
|
console.error(`Failed to fetch RSS feed: ${label} (${feedUrl})`, error);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
133
src/sources/sourceCatalog.js
Normal file
133
src/sources/sourceCatalog.js
Normal file
|
|
@ -0,0 +1,133 @@
|
||||||
|
const fs = require('fs');
|
||||||
|
const path = require('path');
|
||||||
|
|
||||||
|
const catalogPath = path.join(__dirname, '..', '..', 'sources.json');
|
||||||
|
|
||||||
|
/**
 * Reduce a website value (bare host, host:port, or full URL) to a lowercase hostname.
 *
 * @param {*} value - Host or URL text; nullish/empty values are treated as "no host".
 * @returns {string|null} The lowercase hostname, or null when no host can be derived.
 */
function normalizeHostname(value) {
  const input = String(value || '').trim().toLowerCase();
  if (!input) {
    return null;
  }

  try {
    // Inputs without a scheme get a dummy https:// prefix so the URL parser can split them.
    const url = new URL(input.includes('://') ? input : `https://${input}`);
    return url.hostname.toLowerCase() || null;
  } catch {
    // The URL parser rejects some inputs that still carry a usable host (for
    // example an out-of-range port such as "example.com:99999"). Recover the
    // host by hand, and strip everything that is not part of the hostname so
    // the result never contains credentials, a port, a query, or a fragment.
    const authority = input.replace(/^[a-z][a-z0-9+.-]*:\/\//i, '').split(/[/?#]/)[0];
    const hostAndPort = authority.slice(authority.lastIndexOf('@') + 1);
    // Only treat a trailing ":digits" as a port when the rest is a bracketed
    // IPv6 literal or contains no other colon.
    const portMatch = /^(\[[^\]]*\]|[^:]+):\d*$/.exec(hostAndPort);
    const host = (portMatch ? portMatch[1] : hostAndPort).trim();
    return host || null;
  }
}
|
||||||
|
|
||||||
|
/**
 * Turn a single website value or a list of them into a de-duplicated list of hostnames.
 *
 * @param {*} value - One host/URL, an array of them, or null/undefined.
 * @returns {string[]} Unique hostnames in first-seen order; entries that do not
 *   normalize to a hostname are dropped.
 */
function normalizeWebsites(value) {
  let candidates;
  if (Array.isArray(value)) {
    candidates = value;
  } else if (value == null) {
    candidates = [];
  } else {
    candidates = [value];
  }

  const hostnames = new Set();
  for (const candidate of candidates) {
    const hostname = normalizeHostname(candidate);
    if (hostname) {
      hostnames.add(hostname);
    }
  }

  return [...hostnames];
}
|
||||||
|
|
||||||
|
/**
 * Validate one raw sources.json entry and convert it to the internal shape.
 *
 * @param {object} source - Raw entry; reads `id`, `label`, `feedUrl` (string or
 *   array of strings), `website` (string or array) and `backfill`.
 * @param {number} index - Position of the entry in sources.json, used in error messages.
 * @returns {{id: string, label: string, feedUrls: string[], website: string[], backfill: boolean}}
 * @throws {Error} When the entry is not an object, lacks `id` or `label`, has a
 *   feed URL that is not an absolute http(s) URL, or enables backfill without a website.
 */
function normalizeSource(source, index) {
  if (!source || typeof source !== 'object' || Array.isArray(source)) {
    throw new Error(`sources.json entry ${index} must be an object`);
  }

  const id = String(source.id || '').trim();
  const label = String(source.label || '').trim();
  const rawFeedUrls = Array.isArray(source.feedUrl)
    ? source.feedUrl
    : source.feedUrl == null ? [] : [source.feedUrl];
  // De-duplicate (as is done for websites) so a repeated URL is not fetched twice per cycle.
  const feedUrls = [...new Set(rawFeedUrls.map((u) => String(u).trim()).filter(Boolean))];
  const website = normalizeWebsites(source.website);
  const backfill = Boolean(source.backfill);

  if (!id) {
    throw new Error(`sources.json entry ${index} is missing id`);
  }

  if (!label) {
    throw new Error(`sources.json entry ${index} is missing label`);
  }

  for (const u of feedUrls) {
    // Entries already marked as failed are kept verbatim and never fetched.
    if (u.startsWith('[FAILED] ')) continue;

    let protocol = '';
    try {
      protocol = new URL(u).protocol;
    } catch {
      // Unparseable input is reported below as an invalid feedUrl.
      protocol = '';
    }

    // `new URL` alone is not enough: "host:3000/rss" parses with scheme "host:",
    // so require an explicit http(s) scheme.
    if (protocol !== 'http:' && protocol !== 'https:') {
      throw new Error(`sources.json entry ${index} has invalid feedUrl "${u}" for ${id}`);
    }
  }

  if (backfill && website.length === 0) {
    throw new Error(`sources.json entry ${index} has backfill enabled but no website for ${id}`);
  }

  return {
    id,
    label,
    feedUrls,
    website,
    backfill,
  };
}
|
||||||
|
|
||||||
|
/**
 * Read sources.json from `catalogPath`, normalize every entry, and reject duplicate ids.
 *
 * @returns {Array<object>} The normalized entries in file order.
 * @throws {Error} When the file cannot be read or parsed, is not an array, an
 *   entry fails normalization, or two entries share an id.
 */
function loadCatalog() {
  const fileText = fs.readFileSync(catalogPath, 'utf8');
  const entries = JSON.parse(fileText);

  if (!Array.isArray(entries)) {
    throw new Error('sources.json must contain an array');
  }

  // Normalize everything first so per-entry validation errors surface before
  // the duplicate-id check runs.
  const normalized = entries.map((entry, position) => normalizeSource(entry, position));

  const knownIds = new Set();
  normalized.forEach(({ id }) => {
    if (knownIds.has(id)) {
      throw new Error(`sources.json contains duplicate id ${id}`);
    }
    knownIds.add(id);
  });

  return normalized;
}
|
||||||
|
|
||||||
|
const sourceCatalog = loadCatalog();
|
||||||
|
|
||||||
|
/**
 * Return the catalog loaded at module start-up.
 *
 * @returns {Array<object>} The module-level catalog array itself (not a copy).
 */
function getSourceCatalog() {
  const catalog = sourceCatalog;
  return catalog;
}
|
||||||
|
|
||||||
|
/**
 * List the catalog entries that declare at least one feed URL.
 *
 * @returns {Array<object>} A new array holding the matching catalog entries.
 */
function getRssSources() {
  const withFeeds = [];
  for (const entry of sourceCatalog) {
    if (entry.feedUrls.length > 0) {
      withFeeds.push(entry);
    }
  }
  return withFeeds;
}
|
||||||
|
|
||||||
|
/**
 * List the catalog entries that have backfill enabled and at least one website host.
 *
 * @returns {Array<object>} A new array holding the matching catalog entries.
 */
function getBackfillSources() {
  const isBackfillable = ({ backfill, website }) => backfill && website.length > 0;
  return sourceCatalog.filter(isBackfillable);
}
|
||||||
|
|
||||||
|
/**
 * Mark a feed URL as permanently failed by prefixing it with "[FAILED] ".
 *
 * The mark is applied both to the in-memory catalog (so the running process
 * stops retrying the feed immediately) and to sources.json (so it persists
 * across restarts). Every occurrence of the URL is marked. Errors while
 * reading or writing sources.json are logged, not thrown.
 *
 * @param {string} feedUrl - The feed URL as handed out by the catalog (trimmed).
 * @returns {void}
 */
function markFeedFailed(feedUrl) {
  const failedPrefix = '[FAILED] ';
  const target = String(feedUrl == null ? '' : feedUrl).trim();
  if (!target || target.startsWith(failedPrefix)) {
    return;
  }

  // The loader trims feed URLs, so compare against the trimmed raw value;
  // non-string values can never match.
  const isTarget = (candidate) => typeof candidate === 'string' && candidate.trim() === target;
  const marked = `${failedPrefix}${target}`;

  // Replace in place: the array length is unchanged, so this is safe even
  // while a caller is iterating the same feedUrls array.
  for (const source of sourceCatalog) {
    if (!source || !Array.isArray(source.feedUrls)) {
      continue;
    }
    source.feedUrls.forEach((candidate, position) => {
      if (isTarget(candidate)) {
        source.feedUrls[position] = marked;
      }
    });
  }

  try {
    const raw = JSON.parse(fs.readFileSync(catalogPath, 'utf8'));
    if (!Array.isArray(raw)) {
      throw new Error('sources.json must contain an array');
    }

    let changed = false;

    for (const entry of raw) {
      if (!entry || typeof entry !== 'object') {
        continue;
      }

      if (Array.isArray(entry.feedUrl)) {
        // Keep the array shape the user wrote, even for a single element.
        entry.feedUrl.forEach((candidate, position) => {
          if (isTarget(candidate)) {
            entry.feedUrl[position] = marked;
            changed = true;
          }
        });
      } else if (isTarget(entry.feedUrl)) {
        entry.feedUrl = marked;
        changed = true;
      }
    }

    if (changed) {
      fs.writeFileSync(catalogPath, JSON.stringify(raw, null, 2) + '\n', 'utf8');
    }
  } catch (error) {
    console.error('Failed to mark feed as failed in sources.json:', error);
  }
}
|
||||||
|
|
||||||
|
module.exports = {
|
||||||
|
getSourceCatalog,
|
||||||
|
getRssSources,
|
||||||
|
getBackfillSources,
|
||||||
|
markFeedFailed,
|
||||||
|
};
|
||||||
Loading…
Add table
Reference in a new issue