diff --git a/README.md b/README.md index b5ec5c5..9417240 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # duriin_api -Node.js Fastify server that ingests news articles from RSS, SEC EDGAR 8-K filings, Alpha Vantage News Sentiment, Finnhub company news, GDELT, and configured publisher crawlers into a local SQLite archive. +Node.js Fastify server that ingests news articles from RSS, Google News RSS, SEC EDGAR 8-K filings, Alpha Vantage News Sentiment, Finnhub company news, and GDELT into a local SQLite archive. ## Setup @@ -8,7 +8,7 @@ Node.js Fastify server that ingests news articles from RSS, SEC EDGAR 8-K filing ```bash npm install ``` -2. Edit `config.json` with your API keys, tickers, RSS feeds, crawler settings, and schedules. +2. Edit `config.json` with your API keys, tickers, RSS feeds, Google News settings, and schedules. 3. Start the server: ```bash npm start @@ -303,7 +303,7 @@ Returns ingestion and archive summary information. - `image` stores the extracted main image as ultra-compressed base64 WebP. - `normalized_title` is stored for matching and indexing. -- `source` may be a shared source like `rss`, `gdelt`, `edgar`, `alphavantage`, or `finnhub`, or a crawler-derived source name for a configured publisher. +- `source` may be a shared source like `rss`, `googlenews`, `gdelt`, `edgar`, `alphavantage`, or `finnhub`. - `pub_date` is normalized to ISO-8601 when it can be parsed. - `ingested_at` is the insert timestamp set by the server. @@ -311,7 +311,7 @@ Returns ingestion and archive summary information. - SQLite archive file defaults to `./archive.sqlite`. - Deduplication is enforced on `url`; normalized titles are stored and indexed for matching but are not unique. -- `newsCrawler` reuses `rssFeeds` as the publisher catalog, derives one crawler source per feed label, and supports `disabledLabels` plus per-label `overrides` for seeds and allowed hosts. +- `googleNews` accepts `queries`, `topics`, `language`, and `country`, and resolves Google redirect URLs to publisher URLs before ingestion. - Article body extraction runs asynchronously after insertion, with scheduled retries for rows still missing content. - Embeddings are generated asynchronously with OpenRouter `perplexity/pplx-embed-v1-0.6b` and indexed in `sqlite-vec` for similarity search. - Topic search caches normalized query embeddings in SQLite and falls back to OpenRouter on cache miss. diff --git a/config.json b/config.json index 0fa2203..00b4553 100644 --- a/config.json +++ b/config.json @@ -21,664 +21,39 @@ "openRouter": { "apiKey": "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" }, - "rssFeeds": [ - { - "url": "https://www.aljazeera.com/xml/rss/all.xml", - "label": "Al Jazeera" - }, - { - "url": "https://feeds.bbci.co.uk/news/business/rss.xml", - "label": "BBC Business" - }, - { - "url": "https://feeds.businessinsider.com/custom/all", - "label": "Business Insider" - }, - { - "url": "https://feeds.bloomberg.com/markets/news.rss", - "label": "Bloomberg Markets" - }, - { - "url": "https://www.cnbc.com/id/100003114/device/rss/rss.html", - "label": "CNBC" - }, - { - "url": "https://feeds.a.dj.com/rss/RSSMarketsMain.xml", - "label": "Wall Street Journal" - }, - { - "url": "https://feeds.marketwatch.com/marketwatch/topstories/", - "label": "MarketWatch" - }, - { - "url": "https://finance.yahoo.com/news/rssindex", - "label": "Yahoo Finance" - }, - { - "url": "https://seekingalpha.com/feed.xml", - "label": "Seeking Alpha" - }, - { - "url": "https://www.ft.com/?format=rss", - "label": "Financial Times" - }, - { - "url": "https://www.economist.com/finance-and-economics/rss.xml", - "label": "The Economist" - }, - { - "url": "https://fortune.com/feed", - "label": "Fortune" - }, - { - "url": "https://www.forbes.com/business/feed/", - "label": "Forbes Business" - }, - { - "url": "https://www.inc.com/rss", - "label": "Inc Magazine" - }, - { - "url": "https://www.fastcompany.com/latest/rss", - "label": "Fast Company" - }, - { - "url": "https://www.entrepreneur.com/latest.rss", - "label": "Entrepreneur" - }, - { - "url": "https://api.axios.com/feed/", - "label": "Axios" - }, - { - "url": "https://www.wired.com/feed/category/business/latest/rss", - "label": "Wired Business" - }, - { - "url": "https://feeds.npr.org/1006/rss.xml", - "label": "NPR Business" - }, - { - "url": "https://www.federalreserve.gov/feeds/press_all.xml", - "label": "Federal Reserve" - }, - { - "url": "https://techcrunch.com/feed/", - "label": "TechCrunch" - }, - { - "url": "https://www.theverge.com/rss/index.xml", - "label": "The Verge" - }, - { - "url": "https://feeds.arstechnica.com/arstechnica/index", - "label": "Ars Technica" - }, - { - "url": "https://www.retaildive.com/feeds/news/", - "label": "Retail Dive" - }, - { - "url": "https://www.manufacturingdive.com/feeds/news/", - "label": "Manufacturing Dive" - }, - { - "url": "https://www.bankingdive.com/feeds/news/", - "label": "Banking Dive" - }, - { - "url": "https://financialpost.com/feed", - "label": "Financial Post CA" - }, - { - "url": "https://www.theglobeandmail.com/arc/outboundfeeds/rss/category/business/", - "label": "Globe and Mail" - }, - { - "url": "https://www.theguardian.com/uk/business/rss", - "label": "Guardian Business" - }, - { - "url": "https://feeds.skynews.com/feeds/rss/business.xml", - "label": "Sky News Business" - }, - { - "url": "https://www.thisismoney.co.uk/money/news/index.rss", - "label": "This Is Money" - }, - { - "url": "https://www.cityam.com/feed/", - "label": "City A.M." - }, - { - "url": "https://www.spiegel.de/wirtschaft/index.rss", - "label": "Spiegel Wirtschaft" - }, - { - "url": "https://www.handelsblatt.com/contentexport/feed/schlagzeilen", - "label": "Handelsblatt" - }, - { - "url": "https://www.faz.net/rss/aktuell/wirtschaft/", - "label": "FAZ Wirtschaft" - }, - { - "url": "https://www.welt.de/feeds/section/wirtschaft.rss", - "label": "Die Welt Wirtschaft" - }, - { - "url": "https://feeds.lesechos.fr/rss/rss_la_une.xml", - "label": "Les Echos" - }, - { - "url": "https://www.lemonde.fr/economie/rss_full.xml", - "label": "Le Monde Economie" - }, - { - "url": "https://bfmbusiness.bfmtv.com/rss/news-flux-rss/", - "label": "BFM Business" - }, - { - "url": "https://www.eleconomista.es/rss/rss-de-portada.php", - "label": "El Economista ES" - }, - { - "url": "https://e00-expansion.uecdn.es/rss/portada.xml", - "label": "Expansion ES" - }, - { - "url": "https://cincodias.elpais.com/rss/cincodias/ultima_hora_mercados.xml", - "label": "Cinco Dias" - }, - { - "url": "https://www.ilsole24ore.com/rss/economia--finanza.xml", - "label": "Il Sole 24 Ore" - }, - { - "url": "https://fd.nl/rss", - "label": "FD.nl" - }, - { - "url": "https://www.nzz.ch/wirtschaft.rss", - "label": "NZZ Wirtschaft" - }, - { - "url": "https://www.themoscowtimes.com/rss/news", - "label": "Moscow Times" - }, - { - "url": "https://rssexport.rbc.ru/rbcnews/news/30/full.rss", - "label": "RBC Russia" - }, - { - "url": "https://economictimes.indiatimes.com/rssfeedstopstories.cms", - "label": "Economic Times India" - }, - { - "url": "https://www.business-standard.com/rss/home_page_top_stories.rss", - "label": "Business Standard IN" - }, - { - "url": "https://www.livemint.com/rss/headlines", - "label": "Live Mint" - }, - { - "url": "https://www.moneycontrol.com/rss/MCtopnews.xml", - "label": "Moneycontrol" - }, - { - "url": "https://www.thehindubusinessline.com/feeder/default.rss", - "label": "Hindu Business Line" - }, - { - "url": "https://www.caixinglobal.com/rss/newsfeeds/", - "label": "Caixin Global" - }, - { - "url": "https://www.chinadaily.com.cn/rss/bizchina_rss.xml", - "label": "China Daily Business" - }, - { - "url": "https://english.news.cn/rss/business.xml", - "label": "Xinhua Business" - }, - { - "url": "https://www.scmp.com/rss/91/feed", - "label": "South China Morning Post" - }, - { - "url": "https://asia.nikkei.com/rss/feed/nar", - "label": "Nikkei Asia" - }, - { - "url": "https://www.japantimes.co.jp/feed/business/", - "label": "Japan Times Business" - }, - { - "url": "https://www.koreaherald.com/rss/010000000000.xml", - "label": "Korea Herald" - }, - { - "url": "https://koreajoongangdaily.joins.com/rss/", - "label": "Korea JoongAng Daily" - }, - { - "url": "https://www.businesstimes.com.sg/rss.xml", - "label": "Business Times SG" - }, - { - "url": "https://www.straitstimes.com/news/business/rss.xml", - "label": "Straits Times Business" - }, - { - "url": "https://www.channelnewsasia.com/rssfeeds/8395986", - "label": "Channel NewsAsia" - }, - { - "url": "https://www.bangkokpost.com/rss/data/business.xml", - "label": "Bangkok Post Business" - }, - { - "url": "https://www.thestar.com.my/rss/Business/Business-News", - "label": "The Star Malaysia" - }, - { - "url": "https://www.afr.com/rss", - "label": "Australian Fin Review" - }, - { - "url": "https://www.abc.net.au/news/feed/52278/rss.xml", - "label": "ABC Business AU" - }, - { - "url": "https://www.nzherald.co.nz/arc/outboundfeeds/rss/section/business/", - "label": "NZ Herald Business" - }, - { - "url": "https://www.arabianbusiness.com/rss.xml", - "label": "Arabian Business" - }, - { - "url": "https://gulfnews.com/rss/business", - "label": "Gulf News Business" - }, - { - "url": "https://www.arabnews.com/rss/front_page.xml", - "label": "Arab News" - }, - { - "url": "https://www.thenationalnews.com/arc/outboundfeeds/rss/?outputType=xml", - "label": "The National UAE" - }, - { - "url": "https://businessday.ng/feed/", - "label": "BusinessDay Nigeria" - }, - { - "url": "https://www.moneyweb.co.za/feed/", - "label": "Moneyweb SA" - }, - { - "url": "https://www.businesslive.co.za/rss/bd/", - "label": "BusinessLive SA" - }, - { - "url": "https://www.businessdailyafrica.com/rss/", - "label": "Business Daily Africa" - }, - { - "url": "https://www.vanguardngr.com/category/business/feed/", - "label": "Vanguard Business NG" - }, - { - "url": "https://feeds.folha.uol.com.br/mercado/rss091.xml", - "label": "Folha Mercado BR" - }, - { - "url": "https://g1.globo.com/dynamo/economia/rss2.xml", - "label": "G1 Economia BR" - }, - { - "url": "https://exame.com/feed/", - "label": "Exame BR" - }, - { - "url": "https://www.eleconomista.com.mx/rss/rss.html", - "label": "El Economista MX" - }, - { - "url": "https://expansion.mx/rss", - "label": "Expansion MX" - }, - { - "url": "https://www.lanacion.com.ar/arc/outboundfeeds/rss/category/economia/", - "label": "La Nacion AR" - }, - { - "url": "https://www.infobae.com/feeds/rss/economia/", - "label": "Infobae Economia AR" - }, - { - "url": "https://www.portafolio.co/rss/portafolio.xml", - "label": "Portafolio Colombia" - }, - { - "url": "https://elcomercio.pe/arc/outboundfeeds/rss/section/economia/", - "label": "El Comercio Peru" - }, - { - "url": "https://jamaica-gleaner.com/feed/business.xml", - "label": "Jamaica Gleaner" - }, - { - "url": "https://www.jamaicaobserver.com/app/business/", - "label": "Jamaica Observer" - }, - { - "url": "https://www.stabroeknews.com/feed/", - "label": "Stabroek News" - }, - { - "url": "https://nationnews.com/rss-feed/", - "label": "Nation News Barbados" - } - ], "gdelt": { - "queries": [ - "technology" - ], + "source": "bigquery", "mode": "ArtList", - "maxRecords": 50, - "format": "json" - }, - "newsCrawler": { - "maxPages": -1, - "maxDepth": 10, - "pageConcurrency": 4, - "requestTimeout": 15000, - "disabledLabels": [ - "Arab News", - "Arabian Business", - "Australian Fin Review", - "BFM Business", - "Business Daily Africa", - "Business Standard IN", - "BusinessLive SA", - "Caixin Global", - "Cinco Dias", - "City A.M.", - "El Comercio Peru", - "El Economista ES", - "El Economista MX", - "FD.nl", - "Gulf News Business", - "Il Sole 24 Ore", - "Infobae Economia AR", - "Japan Times Business", - "Korea JoongAng Daily", - "Les Echos", - "Live Mint", - "Moneycontrol", - "NZ Herald Business", - "Portafolio Colombia", - "Reuters", - "The Star Malaysia", - "This Is Money", - "Xinhua Business" - ], - "overrides": { - "Al Jazeera": { - "allowedHosts": [ - "www.aljazeera.com", - "aljazeera.com" - ], - "seeds": [ - "https://www.aljazeera.com/", - "https://www.aljazeera.com/economy/", - "https://www.aljazeera.com/tag/technology/" - ] - }, - "Ars Technica": { - "allowedHosts": [ - "arstechnica.com", - "www.arstechnica.com" - ], - "seeds": [ - "https://arstechnica.com/", - "https://arstechnica.com/tech-policy/", - "https://arstechnica.com/information-technology/" - ] - }, - "BBC Business": { - "allowedHosts": [ - "www.bbc.com", - "bbc.com" - ], - "seeds": [ - "https://www.bbc.com/news/business", - "https://www.bbc.com/news/technology" - ] - }, - "CNBC": { - "allowedHosts": [ - "www.cnbc.com", - "cnbc.com" - ], - "renderMode": "browser", - "seeds": [ - "https://www.cnbc.com/world/", - "https://www.cnbc.com/business/", - "https://www.cnbc.com/technology/" - ] - }, - "Guardian Business": { - "allowedHosts": [ - "www.theguardian.com", - "theguardian.com" - ], - "seeds": [ - "https://www.theguardian.com/", - "https://www.theguardian.com/business", - "https://www.theguardian.com/technology" - ] - }, - "Jamaica Gleaner": { - "allowedHosts": [ - "jamaica-gleaner.com", - "www.jamaica-gleaner.com" - ], - "seeds": [ - "https://jamaica-gleaner.com/", - "https://jamaica-gleaner.com/news", - "https://jamaica-gleaner.com/business" - ], - "requestTimeout": 25000 - }, - "Jamaica Observer": { - "allowedHosts": [ - "www.jamaicaobserver.com", - "jamaicaobserver.com" - ], - "seeds": [ - "https://www.jamaicaobserver.com/", - "https://www.jamaicaobserver.com/news/", - "https://www.jamaicaobserver.com/business/" - ] - }, - "Nation News Barbados": { - "allowedHosts": [ - "nationnews.com", - "www.nationnews.com" - ], - "seeds": [ - "https://nationnews.com/", - "https://nationnews.com/category/business/", - "https://nationnews.com/category/news/" - ] - }, - "NPR Business": { - "allowedHosts": [ - "www.npr.org", - "npr.org" - ], - "seeds": [ - "https://www.npr.org/sections/business/", - "https://www.npr.org/sections/technology/" - ] - }, - "The Verge": { - "allowedHosts": [ - "www.theverge.com", - "theverge.com" - ], - "seeds": [ - "https://www.theverge.com/tech", - "https://www.theverge.com/business", - "https://www.theverge.com/archives" - ] - }, - "TechCrunch": { - "allowedHosts": [ - "techcrunch.com", - "www.techcrunch.com" - ], - "seeds": [ - "https://techcrunch.com/", - "https://techcrunch.com/category/startups/", - "https://techcrunch.com/category/venture/" - ] - }, - "The Economist": { - "allowedHosts": [ - "www.economist.com", - "economist.com" - ], - "seeds": [ - "https://www.economist.com/finance-and-economics", - "https://www.economist.com/business", - "https://www.economist.com/science-and-technology" - ] - }, - "Federal Reserve": { - "allowedHosts": [ - "www.federalreserve.gov", - "federalreserve.gov" - ], - "seeds": [ - "https://www.federalreserve.gov/newsevents.htm", - "https://www.federalreserve.gov/monetarypolicy.htm" - ] - }, - "Fortune": { - "allowedHosts": [ - "fortune.com", - "www.fortune.com" - ], - "renderMode": "browser", - "seeds": [ - "https://fortune.com/", - "https://fortune.com/section/tech/", - "https://fortune.com/section/finance/" - ] - }, - "Forbes Business": { - "allowedHosts": [ - "www.forbes.com", - "forbes.com" - ], - "renderMode": "browser", - "seeds": [ - "https://www.forbes.com/business/", - "https://www.forbes.com/innovation/" - ] - }, - "Financial Times": { - "allowedHosts": [ - "www.ft.com", - "ft.com" - ], - "renderMode": "browser", - "seeds": [ - "https://www.ft.com/world/us", - "https://www.ft.com/technology" - ] - }, - "Nikkei Asia": { - "allowedHosts": [ - "asia.nikkei.com" - ], - "seeds": [ - "https://asia.nikkei.com/", - "https://asia.nikkei.com/Business", - "https://asia.nikkei.com/Technology" - ] - }, - "South China Morning Post": { - "allowedHosts": [ - "www.scmp.com", - "scmp.com" - ], - "seeds": [ - "https://www.scmp.com/", - "https://www.scmp.com/business", - "https://www.scmp.com/tech" - ] - }, - "Stabroek News": { - "allowedHosts": [ - "www.stabroeknews.com", - "stabroeknews.com" - ], - "seeds": [ - "https://www.stabroeknews.com/", - "https://www.stabroeknews.com/category/business/", - "https://www.stabroeknews.com/category/news/" - ] - }, - "Wall Street Journal": { - "allowedHosts": [ - "www.wsj.com", - "wsj.com" - ], - "seeds": [ - "https://www.wsj.com/news/business", - "https://www.wsj.com/tech" - ] - }, - "Wired Business": { - "allowedHosts": [ - "www.wired.com", - "wired.com" - ], - "renderMode": "browser", - "seeds": [ - "https://www.wired.com/category/business/", - "https://www.wired.com/category/security/" - ] - }, - "Yahoo Finance": { - "allowedHosts": [ - "finance.yahoo.com" - ], - "renderMode": "browser", - "seeds": [ - "https://finance.yahoo.com/", - "https://finance.yahoo.com/news/", - "https://finance.yahoo.com/topic/tech/" - ] - } - } + "maxRecords": 100, + "format": "json", + "windowDays": 7, + "lookbackWeeks": 312, + "requestDelayMs": 6500, + "maxWindowsPerRun": 4, + "bigQueryProject": "duriin", + "bigQueryKeyFile": "./gdelt-credentials.json" }, "scheduler": { - "newsCrawler": "0 * * * *", "rss": "5 * * * *", "gdelt": "10 * * * *", "edgar": "15 * * * *", "alphaVantage": "20 * * * *", - "finnhub": "25 * * * *" + "finnhub": "25 * * * *", + "googleNews": "0 * * * *" }, "contentBackfill": { "cron": "0 * * * *", "batchSize": -1 + }, + "googleNews": { + "queries": [ + "technology" + ], + "topics": [ + "BUSINESS", + "TECHNOLOGY" + ], + "language": "en", + "country": "US" } } diff --git a/gdelt-credentials.json b/gdelt-credentials.json new file mode 100644 index 0000000..79e0304 --- /dev/null +++ b/gdelt-credentials.json @@ -0,0 +1,13 @@ +{ + "type": "service_account", + "project_id": "duriin", + "private_key_id": "28c6050948e703e3443203ac1a8c10e2e3009793", + "private_key": "-----BEGIN PRIVATE KEY-----\nMIIEvgIBADANBgkqhkiG9w0BAQEFAASCBKgwggSkAgEAAoIBAQDQlDtHQASuKDOn\nUWejQA6lbbPGn5AV8Xc+mQ7G+JgmxqeVxmqmQU6Eh+Q/8aUHL2oMwbaVi1Z8Dfbl\nhjjKaTYuRB1N3MAr94hahxQBk7EvSLUtzoMv36aMtZNcvx1W57hKQtW6Qou6N6Y4\ncVlNOS+lj9Jl5WWr88zkDhS2FH5G0WvBp7fWtZFhSi2hganaZtXdYnvJ3LZ3x/FP\nbBdrthF5mRVkNm89ylHqCDa04ZZjqJ8RTr5XmNTlS4T6vCLBYNwHdO0xYTh2PxVo\nlWUUrIyoV6b1xn/H/ko04E/hcfG/jehhvxBUqUPJHb7/bt7DAQqNIqQRLPcjXWwH\n2leSlUTnAgMBAAECggEASgizWcDxaFfArd0JKjrsd++OZivw2rkQmFl/k0RdzTWp\n+lGpPUXk9sm9TK0a5IgB3nFLu5zvn6zdO+7+bWoW3ykyNZbrZy+/aFKV2VFxDNWD\n6bRpgC6kUUGKAtubMGOjWEiM0EYajoh+KX6iMfTgYqXACob4JaatzSzqUQ7JG51J\nzZnzsNXBTr21kKkxtfTAIrCXjmy5ogAJhYCNgeoqVd8ILhrYHluK8F8WCqk+BPnA\nfZ4vPTeTEvxsh0uYFmRY8wA3TGwy9Q40Lsg+oHEcs/XmUDJHXFryNryknUHYwsA7\nWqbUi26/SHPKRs0w6y17f+LxCn1vg6MxOG7M3LcTcQKBgQDrq8grI8EbBLo6gdY9\n8mjBklTnkvEpxmYDmjzVWX1XCdZIj4xJYyt1Y3PDbBEeEbjwqrlrTQgk+R0tC3ed\nW3jLEbioUfulnri8dWfeuAr9xhCJc8qSxDeLNfnJQc7rpUzyhyD5KhNI8GTJ1zwW\n2JzJGWaPAu5KoNAY2SA//sfRiwKBgQDikjBx1oeIVqFg3Lp47G734l1ikgn9LcP3\njSW3cjYg2XBIlM1LrRt39ljdRByvA0vo3dlN5cFZMknzFlWV8ymvjNghbeVfD2r0\nBcWOMJ0ZeFB3cK127GBN+iMJ9Y8xR6ZWg0d9SmBVUSwVrndS1u4kS6vfYRBSCVWb\nmZujR/TNlQKBgQDR/t69Of3O8nZyvdDGoCMiIR8QvgmwfL3YBe6g+T3LedN8EpUh\nq4FE95pmjvvtvEL8CFRyPVC9iVCrG6W5DJHk+OR+75Z5bKYWH9OvTHVWzc9ce1YN\nU1Re8niiEcasiT24ehoyi4BlpPdaNzSu8tM6Ci0tz6G/0+25xneLLp6kowKBgCuc\npjSTbd1Bh6jEdCRopmeSrBUYNVIFqC4TfkoUcvTZxfJCqk3B0YLC6ZIV1Uue39LA\nOV70NcZ8lp1zFCBcAQ8olkXBCKDGr/iuz7syAltvvFVxXAKDN3prBqmZGeoLd6o5\ndN5aHbbufATkY1WPx6E266uA3Ipd/5uG8t14MVgNAoGBAJsE7YhgTtMfn1eYJiDG\nRhM8YbJ5njFezrug1Fzhq3BeXLTclpQXUlQC/hNDfOsDBQS7bQlmXNkKaN5Pc0G6\ngRKzuZucKJGEMpce0ZaM5mN1j+wRnZUH096O286X/M35WONI3iYD3atqyiR6meUp\nTHSWlR6A3P5xGWVKrNOs04ck\n-----END PRIVATE KEY-----\n", + "client_email": "duriin-gdelt@duriin.iam.gserviceaccount.com", + "client_id": "101084071372544178772", + "auth_uri": "https://accounts.google.com/o/oauth2/auth", + "token_uri": "https://oauth2.googleapis.com/token", + "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs", + "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/duriin-gdelt%40duriin.iam.gserviceaccount.com", + "universe_domain": "googleapis.com" +} diff --git a/package-lock.json b/package-lock.json index 808f9eb..d7872bc 100644 --- a/package-lock.json +++ b/package-lock.json @@ -11,6 +11,7 @@ "dependencies": { "@extractus/article-extractor": "^8.0.18", "@fastify/cors": "^11.2.0", + "@google-cloud/bigquery": "^8.1.1", "better-sqlite3": "^12.4.1", "fastify": "^5.6.1", "node-cron": "^4.2.1", @@ -177,6 +178,104 @@ "ipaddr.js": "^2.1.0" } }, + "node_modules/@google-cloud/bigquery": { + "version": "8.1.1", + "resolved": "https://registry.npmjs.org/@google-cloud/bigquery/-/bigquery-8.1.1.tgz", + "integrity": "sha512-2GHlohfA/VJffTvibMazMsZi6jPRx8MmaMberyDTL8rnhVs/frKSXVVRtLU83uSAy2j/5SD4mOs4jMQgJPON2g==", + "license": "Apache-2.0", + "dependencies": { + "@google-cloud/common": "^6.0.0", + "@google-cloud/paginator": "^6.0.0", + "@google-cloud/precise-date": "^5.0.0", + "@google-cloud/promisify": "^5.0.0", + "arrify": "^3.0.0", + "big.js": "^6.2.2", + "duplexify": "^4.1.3", + "extend": "^3.0.2", + "stream-events": "^1.0.5", + "teeny-request": "^10.0.0" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/@google-cloud/common": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/@google-cloud/common/-/common-6.0.0.tgz", + "integrity": "sha512-IXh04DlkLMxWgYLIUYuHHKXKOUwPDzDgke1ykkkJPe48cGIS9kkL2U/o0pm4ankHLlvzLF/ma1eO86n/bkumIA==", + "license": "Apache-2.0", + "dependencies": { + "@google-cloud/projectify": "^4.0.0", + "@google-cloud/promisify": "^4.0.0", + "arrify": "^2.0.0", + "duplexify": "^4.1.3", + "extend": "^3.0.2", + "google-auth-library": "^10.0.0-rc.1", + "html-entities": "^2.5.2", + "retry-request": "^8.0.0", + "teeny-request": "^10.0.0" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/@google-cloud/common/node_modules/@google-cloud/promisify": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/@google-cloud/promisify/-/promisify-4.1.0.tgz", + "integrity": "sha512-G/FQx5cE/+DqBbOpA5jKsegGwdPniU6PuIEMt+qxWgFxvxuFOzVmp6zYchtYuwAWV5/8Dgs0yAmjvNZv3uXLQg==", + "license": "Apache-2.0", + "engines": { + "node": ">=18" + } + }, + "node_modules/@google-cloud/common/node_modules/arrify": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/arrify/-/arrify-2.0.1.tgz", + "integrity": "sha512-3duEwti880xqi4eAMN8AyR4a0ByT90zoYdLlevfrvU43vb0YZwZVfxOgxWrLXXXpyugL0hNZc9G6BiB5B3nUug==", + "license": "MIT", + "engines": { + "node": ">=8" + } + }, + "node_modules/@google-cloud/paginator": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/@google-cloud/paginator/-/paginator-6.0.0.tgz", + "integrity": "sha512-g5nmMnzC+94kBxOKkLGpK1ikvolTFCC3s2qtE4F+1EuArcJ7HHC23RDQVt3Ra3CqpUYZ+oXNKZ8n5Cn5yug8DA==", + "license": "Apache-2.0", + "dependencies": { + "extend": "^3.0.2" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/@google-cloud/precise-date": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/@google-cloud/precise-date/-/precise-date-5.0.0.tgz", + "integrity": "sha512-9h0Gvw92EvPdE8AK8AgZPbMnH5ftDyPtKm7/KUfcJVaPEPjwGDsJd1QV0H8esBDV4II41R/2lDWH1epBqIoKUw==", + "license": "Apache-2.0", + "engines": { + "node": ">=18" + } + }, + "node_modules/@google-cloud/projectify": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/@google-cloud/projectify/-/projectify-4.0.0.tgz", + "integrity": "sha512-MmaX6HeSvyPbWGwFq7mXdo0uQZLGBYCwziiLIGq5JVX+/bdI3SAq6bP98trV5eTWfLuvsMcIC1YJOF2vfteLFA==", + "license": "Apache-2.0", + "engines": { + "node": ">=14.0.0" + } + }, + "node_modules/@google-cloud/promisify": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/@google-cloud/promisify/-/promisify-5.0.0.tgz", + "integrity": "sha512-N8qS6dlORGHwk7WjGXKOSsLjIjNINCPicsOX6gyyLiYk7mq3MtII96NZ9N2ahwA2vnkLmZODOIH9rlNniYWvCQ==", + "license": "Apache-2.0", + "engines": { + "node": ">=18" + } + }, "node_modules/@img/colour": { "version": "1.1.0", "resolved": "https://registry.npmjs.org/@img/colour/-/colour-1.1.0.tgz", @@ -669,6 +768,15 @@ "integrity": "sha512-2BjRTZxTPvheOvGbBslFSYOUkr+SjPtOnrLP33f+VIWLzezQpZcqVg7ja3L4dBXmzzgwT+a029jRx5PCi3JuiA==", "license": "MIT" }, + "node_modules/agent-base": { + "version": "7.1.4", + "resolved": "https://registry.npmjs.org/agent-base/-/agent-base-7.1.4.tgz", + "integrity": "sha512-MnA+YT8fwfJPgBx3m60MNqakm30XOkyIoH1y6huTQvC0PwZG7ki8NacLBcrPbNoo8vEZy7Jpuk7+jMO+CUovTQ==", + "license": "MIT", + "engines": { + "node": ">= 14" + } + }, "node_modules/ajv": { "version": "8.18.0", "resolved": "https://registry.npmjs.org/ajv/-/ajv-8.18.0.tgz", @@ -702,6 +810,18 @@ } } }, + "node_modules/arrify": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/arrify/-/arrify-3.0.0.tgz", + "integrity": "sha512-tLkvA81vQG/XqE2mjDkGQHoOINtMHtysSnemrmoGe6PydDPMRbVugqyk4A6V/WDWEfm3l+0d8anA9r8cv/5Jaw==", + "license": "MIT", + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, "node_modules/atomic-sleep": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/atomic-sleep/-/atomic-sleep-1.0.0.tgz", @@ -765,6 +885,28 @@ "node": "20.x || 22.x || 23.x || 24.x || 25.x" } }, + "node_modules/big.js": { + "version": "6.2.2", + "resolved": "https://registry.npmjs.org/big.js/-/big.js-6.2.2.tgz", + "integrity": "sha512-y/ie+Faknx7sZA5MfGA2xKlu0GDv8RWrXGsmlteyJQ2lvoKv9GBK/fpRMc2qlSoBAgNxrixICFCBefIq8WCQpQ==", + "license": "MIT", + "engines": { + "node": "*" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/bigjs" + } + }, + "node_modules/bignumber.js": { + "version": "9.3.1", + "resolved": "https://registry.npmjs.org/bignumber.js/-/bignumber.js-9.3.1.tgz", + "integrity": "sha512-Ko0uX15oIUS7wJ3Rb30Fs6SkVbLmPBAKdlm7q9+ak9bbIeFf0MwuBsQV6z7+X768/cHsfg+WlysDWJcmthjsjQ==", + "license": "MIT", + "engines": { + "node": "*" + } + }, "node_modules/bindings": { "version": "1.5.0", "resolved": "https://registry.npmjs.org/bindings/-/bindings-1.5.0.tgz", @@ -815,6 +957,12 @@ "ieee754": "^1.1.13" } }, + "node_modules/buffer-equal-constant-time": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/buffer-equal-constant-time/-/buffer-equal-constant-time-1.0.1.tgz", + "integrity": "sha512-zRpUiDwd/xk6ADqPMATG8vc9VPrkck7T07OIx0gnjmJAnHnTVXNQG3vfvWNuiZIkwu9KrKdA1iJKfsfTVxE6NA==", + "license": "BSD-3-Clause" + }, "node_modules/chownr": { "version": "1.1.4", "resolved": "https://registry.npmjs.org/chownr/-/chownr-1.1.4.tgz", @@ -877,6 +1025,32 @@ "integrity": "sha512-iKuQcq+NdHqlAcwUY0o/HL69XQrUaQdMjmStJ8JFmUaiiQErlhrmuigkg/CU4E2J0IyUKUrMAgl36TvN67MqTw==", "license": "MIT" }, + "node_modules/data-uri-to-buffer": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/data-uri-to-buffer/-/data-uri-to-buffer-4.0.1.tgz", + "integrity": "sha512-0R9ikRb668HB7QDxT1vkpuUBtqc53YyAwMwGeUFKRojY/NWKvdZ+9UYtRfGmhqNbRkTSVpMbmyhXipFFv2cb/A==", + "license": "MIT", + "engines": { + "node": ">= 12" + } + }, + "node_modules/debug": { + "version": "4.4.3", + "resolved": "https://registry.npmjs.org/debug/-/debug-4.4.3.tgz", + "integrity": "sha512-RGwwWnwQvkVfavKVt22FGLw+xYSdzARwm0ru6DhTVA3umU5hZc28V3kO4stgYryrTlLpuvgI9GiijltAjNbcqA==", + "license": "MIT", + "dependencies": { + "ms": "^2.1.3" + }, + "engines": { + "node": ">=6.0" + }, + "peerDependenciesMeta": { + "supports-color": { + "optional": true + } + } + }, "node_modules/decompress-response": { "version": "6.0.0", "resolved": "https://registry.npmjs.org/decompress-response/-/decompress-response-6.0.0.tgz", @@ -983,6 +1157,27 @@ "url": "https://github.com/fb55/domutils?sponsor=1" } }, + "node_modules/duplexify": { + "version": "4.1.3", + "resolved": "https://registry.npmjs.org/duplexify/-/duplexify-4.1.3.tgz", + "integrity": "sha512-M3BmBhwJRZsSx38lZyhE53Csddgzl5R7xGJNk7CVddZD6CcmwMCH8J+7AprIrQKH7TonKxaCjcv27Qmf+sQ+oA==", + "license": "MIT", + "dependencies": { + "end-of-stream": "^1.4.1", + "inherits": "^2.0.3", + "readable-stream": "^3.1.1", + "stream-shift": "^1.0.2" + } + }, + "node_modules/ecdsa-sig-formatter": { + "version": "1.0.11", + "resolved": "https://registry.npmjs.org/ecdsa-sig-formatter/-/ecdsa-sig-formatter-1.0.11.tgz", + "integrity": "sha512-nagl3RYrbNv6kQkeJIpt6NJZy8twLB/2vtz6yN9Z4vRKHN4/QZJIEbqohALSgwKdnksuY3k5Addp5lg8sVoVcQ==", + "license": "Apache-2.0", + "dependencies": { + "safe-buffer": "^5.0.1" + } + }, "node_modules/end-of-stream": { "version": "1.4.5", "resolved": "https://registry.npmjs.org/end-of-stream/-/end-of-stream-1.4.5.tgz", @@ -1025,6 +1220,12 @@ "node": ">=6" } }, + "node_modules/extend": { + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/extend/-/extend-3.0.2.tgz", + "integrity": "sha512-fjquC59cD7CyW6urNXK0FBufkZcoiGG80wTuPujX590cB5Ttln20E2UB4S/WARVqhXffZl2LNgS+gQdPIIim/g==", + "license": "MIT" + }, "node_modules/fast-decode-uri-component": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/fast-decode-uri-component/-/fast-decode-uri-component-1.0.1.tgz", @@ -1144,6 +1345,29 @@ "reusify": "^1.0.4" } }, + "node_modules/fetch-blob": { + "version": "3.2.0", + "resolved": "https://registry.npmjs.org/fetch-blob/-/fetch-blob-3.2.0.tgz", + "integrity": "sha512-7yAQpD2UMJzLi1Dqv7qFYnPbaPx7ZfFK6PiIxQ4PfkGPyNyl2Ugx+a/umUonmKqjhM4DnfbMvdX6otXq83soQQ==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/jimmywarting" + }, + { + "type": "paypal", + "url": "https://paypal.me/jimmywarting" + } + ], + "license": "MIT", + "dependencies": { + "node-domexception": "^1.0.0", + "web-streams-polyfill": "^3.0.3" + }, + "engines": { + "node": "^12.20 || >= 14.13" + } + }, "node_modules/file-uri-to-path": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/file-uri-to-path/-/file-uri-to-path-1.0.0.tgz", @@ -1164,6 +1388,18 @@ "node": ">=20" } }, + "node_modules/formdata-polyfill": { + "version": "4.0.10", + "resolved": "https://registry.npmjs.org/formdata-polyfill/-/formdata-polyfill-4.0.10.tgz", + "integrity": "sha512-buewHzMvYL29jdeQTVILecSaZKnt/RJWjoZCF5OW60Z67/GmSLBkOFM7qh1PI3zFNtJbaZL5eQu1vLfazOwj4g==", + "license": "MIT", + "dependencies": { + "fetch-blob": "^3.1.2" + }, + "engines": { + "node": ">=12.20.0" + } + }, "node_modules/fs-constants": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/fs-constants/-/fs-constants-1.0.0.tgz", @@ -1184,12 +1420,100 @@ "node": "^8.16.0 || ^10.6.0 || >=11.0.0" } }, + "node_modules/gaxios": { + "version": "7.1.4", + "resolved": "https://registry.npmjs.org/gaxios/-/gaxios-7.1.4.tgz", + "integrity": "sha512-bTIgTsM2bWn3XklZISBTQX7ZSddGW+IO3bMdGaemHZ3tbqExMENHLx6kKZ/KlejgrMtj8q7wBItt51yegqalrA==", + "license": "Apache-2.0", + "dependencies": { + "extend": "^3.0.2", + "https-proxy-agent": "^7.0.1", + "node-fetch": "^3.3.2" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/gaxios/node_modules/node-fetch": { + "version": "3.3.2", + "resolved": "https://registry.npmjs.org/node-fetch/-/node-fetch-3.3.2.tgz", + "integrity": "sha512-dRB78srN/l6gqWulah9SrxeYnxeddIG30+GOqK/9OlLVyLg3HPnr6SqOWTWOXKRwC2eGYCkZ59NNuSgvSrpgOA==", + "license": "MIT", + "dependencies": { + "data-uri-to-buffer": "^4.0.0", + "fetch-blob": "^3.1.4", + "formdata-polyfill": "^4.0.10" + }, + "engines": { + "node": "^12.20.0 || ^14.13.1 || >=16.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/node-fetch" + } + }, + "node_modules/gcp-metadata": { + "version": "8.1.2", + "resolved": "https://registry.npmjs.org/gcp-metadata/-/gcp-metadata-8.1.2.tgz", + "integrity": "sha512-zV/5HKTfCeKWnxG0Dmrw51hEWFGfcF2xiXqcA3+J90WDuP0SvoiSO5ORvcBsifmx/FoIjgQN3oNOGaQ5PhLFkg==", + "license": "Apache-2.0", + "dependencies": { + "gaxios": "^7.0.0", + "google-logging-utils": "^1.0.0", + "json-bigint": "^1.0.0" + }, + "engines": { + "node": ">=18" + } + }, "node_modules/github-from-package": { "version": "0.0.0", "resolved": "https://registry.npmjs.org/github-from-package/-/github-from-package-0.0.0.tgz", "integrity": "sha512-SyHy3T1v2NUXn29OsWdxmK6RwHD+vkj3v8en8AOBZ1wBQ/hCAQ5bAQTD02kW4W9tUp/3Qh6J8r9EvntiyCmOOw==", "license": "MIT" }, + "node_modules/google-auth-library": { + "version": "10.6.2", + "resolved": "https://registry.npmjs.org/google-auth-library/-/google-auth-library-10.6.2.tgz", + "integrity": "sha512-e27Z6EThmVNNvtYASwQxose/G57rkRuaRbQyxM2bvYLLX/GqWZ5chWq2EBoUchJbCc57eC9ArzO5wMsEmWftCw==", + "license": "Apache-2.0", + "dependencies": { + "base64-js": "^1.3.0", + "ecdsa-sig-formatter": "^1.0.11", + "gaxios": "^7.1.4", + "gcp-metadata": "8.1.2", + "google-logging-utils": "1.1.3", + "jws": "^4.0.0" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/google-logging-utils": { + "version": "1.1.3", + "resolved": "https://registry.npmjs.org/google-logging-utils/-/google-logging-utils-1.1.3.tgz", + "integrity": "sha512-eAmLkjDjAFCVXg7A1unxHsLf961m6y17QFqXqAXGj/gVkKFrEICfStRfwUlGNfeCEjNRa32JEWOUTlYXPyyKvA==", + "license": "Apache-2.0", + "engines": { + "node": ">=14" + } + }, + "node_modules/html-entities": { + "version": "2.6.0", + "resolved": "https://registry.npmjs.org/html-entities/-/html-entities-2.6.0.tgz", + "integrity": "sha512-kig+rMn/QOVRvr7c86gQ8lWXq+Hkv6CbAH1hLu+RG338StTpE8Z0b44SDVaqVu7HGKf27frdmUYEs9hTUX/cLQ==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/mdevils" + }, + { + "type": "patreon", + "url": "https://patreon.com/mdevils" + } + ], + "license": "MIT" + }, "node_modules/html-escaper": { "version": "3.0.3", "resolved": "https://registry.npmjs.org/html-escaper/-/html-escaper-3.0.3.tgz", @@ -1227,6 +1551,32 @@ "url": "https://github.com/fb55/entities?sponsor=1" } }, + "node_modules/http-proxy-agent": { + "version": "7.0.2", + "resolved": "https://registry.npmjs.org/http-proxy-agent/-/http-proxy-agent-7.0.2.tgz", + "integrity": "sha512-T1gkAiYYDWYx3V5Bmyu7HcfcvL7mUrTWiM6yOfa3PIphViJ/gFPbvidQ+veqSOHci/PxBcDabeUNCzpOODJZig==", + "license": "MIT", + "dependencies": { + "agent-base": "^7.1.0", + "debug": "^4.3.4" + }, + "engines": { + "node": ">= 14" + } + }, + "node_modules/https-proxy-agent": { + "version": "7.0.6", + "resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-7.0.6.tgz", + "integrity": "sha512-vK9P5/iUfdl95AI+JVyUuIcVtd4ofvtrOr3HNtM2yxC9bnMbEdp3x01OhQNnjb8IJYi38VlTE3mBXwcfvywuSw==", + "license": "MIT", + "dependencies": { + "agent-base": "^7.1.2", + "debug": "4" + }, + "engines": { + "node": ">= 14" + } + }, "node_modules/ieee754": { "version": "1.2.1", "resolved": "https://registry.npmjs.org/ieee754/-/ieee754-1.2.1.tgz", @@ -1277,6 +1627,15 @@ "node": ">=0.10.0" } }, + "node_modules/json-bigint": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/json-bigint/-/json-bigint-1.0.0.tgz", + "integrity": "sha512-SiPv/8VpZuWbvLSMtTDU8hEfrZWg/mH/nV/b4o0CYbSxu1UIQPLdwKOCIyLQX+VIPO5vrLX3i8qtqFyhdPSUSQ==", + "license": "MIT", + "dependencies": { + "bignumber.js": "^9.0.0" + } + }, "node_modules/json-schema-ref-resolver": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/json-schema-ref-resolver/-/json-schema-ref-resolver-3.0.0.tgz", @@ -1302,6 +1661,27 @@ "integrity": "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug==", "license": "MIT" }, + "node_modules/jwa": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/jwa/-/jwa-2.0.1.tgz", + "integrity": "sha512-hRF04fqJIP8Abbkq5NKGN0Bbr3JxlQ+qhZufXVr0DvujKy93ZCbXZMHDL4EOtodSbCWxOqR8MS1tXA5hwqCXDg==", + "license": "MIT", + "dependencies": { + "buffer-equal-constant-time": "^1.0.1", + "ecdsa-sig-formatter": "1.0.11", + "safe-buffer": "^5.0.1" + } + }, + "node_modules/jws": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/jws/-/jws-4.0.1.tgz", + "integrity": "sha512-EKI/M/yqPncGUUh44xz0PxSidXFr/+r0pA70+gIYhjv+et7yxM+s29Y+VGDkovRofQem0fs7Uvf4+YmAdyRduA==", + "license": "MIT", + "dependencies": { + "jwa": "^2.0.1", + "safe-buffer": "^5.0.1" + } + }, "node_modules/light-my-request": { "version": "6.6.0", "resolved": "https://registry.npmjs.org/light-my-request/-/light-my-request-6.6.0.tgz", @@ -1390,6 +1770,12 @@ "integrity": "sha512-gKLcREMhtuZRwRAfqP3RFW+TK4JqApVBtOIftVgjuABpAtpxhPGaDcfvbhNvD0B8iD1oUr/txX35NjcaY6Ns/A==", "license": "MIT" }, + "node_modules/ms": { + "version": "2.1.3", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz", + "integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==", + "license": "MIT" + }, "node_modules/nanoid": { "version": "3.3.11", "resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.3.11.tgz", @@ -1435,6 +1821,26 @@ "node": ">=6.0.0" } }, + "node_modules/node-domexception": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/node-domexception/-/node-domexception-1.0.0.tgz", + "integrity": "sha512-/jKZoMpw0F8GRwl4/eLROPA3cfcXtLApP0QzLmUT/HuPCZWyB7IY9ZrMeKw2O/nFIqPQB3PVM9aYm0F312AXDQ==", + "deprecated": "Use your platform's native DOMException instead", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/jimmywarting" + }, + { + "type": "github", + "url": "https://paypal.me/jimmywarting" + } + ], + "license": "MIT", + "engines": { + "node": ">=10.5.0" + } + }, "node_modules/node-fetch": { "version": "2.7.0", "resolved": "https://registry.npmjs.org/node-fetch/-/node-fetch-2.7.0.tgz", @@ -1707,6 +2113,19 @@ "node": ">=10" } }, + "node_modules/retry-request": { + "version": "8.0.2", + "resolved": "https://registry.npmjs.org/retry-request/-/retry-request-8.0.2.tgz", + "integrity": "sha512-JzFPAfklk1kjR1w76f0QOIhoDkNkSqW8wYKT08n9yysTmZfB+RQ2QoXoTAeOi1HD9ZipTyTAZg3c4pM/jeqgSw==", + "license": "MIT", + "dependencies": { + "extend": "^3.0.2", + "teeny-request": "^10.0.0" + }, + "engines": { + "node": ">=18" + } + }, "node_modules/reusify": { "version": "1.1.0", "resolved": "https://registry.npmjs.org/reusify/-/reusify-1.1.0.tgz", @@ -2063,6 +2482,21 @@ "win32" ] }, + "node_modules/stream-events": { + "version": "1.0.5", + "resolved": "https://registry.npmjs.org/stream-events/-/stream-events-1.0.5.tgz", + "integrity": "sha512-E1GUzBSgvct8Jsb3v2X15pjzN1tYebtbLaMg+eBOUOAxgbLoSbT2NS91ckc5lJD1KfLjId+jXJRgo0qnV5Nerg==", + "license": "MIT", + "dependencies": { + "stubs": "^3.0.0" + } + }, + "node_modules/stream-shift": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/stream-shift/-/stream-shift-1.0.3.tgz", + "integrity": "sha512-76ORR0DO1o1hlKwTbi/DM3EXWGf3ZJYO8cXX5RJwnul2DEg2oyoZyjLNoQM8WsvZiFKCRfC1O0J7iCvie3RZmQ==", + "license": "MIT" + }, "node_modules/string_decoder": { "version": "1.3.0", "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.3.0.tgz", @@ -2081,6 +2515,12 @@ "node": ">=0.10.0" } }, + "node_modules/stubs": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/stubs/-/stubs-3.0.0.tgz", + "integrity": "sha512-PdHt7hHUJKxvTCgbKX9C1V/ftOcjJQgz8BZwNfV5c4B6dcGqlpelTbJ999jBGZ2jYiPAwcX5dP6oBwVlBlUbxw==", + "license": "MIT" + }, "node_modules/tar-fs": { "version": "2.1.4", "resolved": "https://registry.npmjs.org/tar-fs/-/tar-fs-2.1.4.tgz", @@ -2109,6 +2549,39 @@ "node": ">=6" } }, + "node_modules/teeny-request": { + "version": "10.1.2", + "resolved": "https://registry.npmjs.org/teeny-request/-/teeny-request-10.1.2.tgz", + "integrity": "sha512-Xj0ZAQ0CeuQn6UxCDPLbFRlgcSTUEyO3+wiepr2grjIjyL/lMMs1Z4OwXn8kLvn/V1OuaEP0UY7Na6UDNNsYrQ==", + "license": "Apache-2.0", + "dependencies": { + "http-proxy-agent": "^7.0.0", + "https-proxy-agent": "^7.0.1", + "node-fetch": "^3.3.2", + "stream-events": "^1.0.5" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/teeny-request/node_modules/node-fetch": { + "version": "3.3.2", + "resolved": "https://registry.npmjs.org/node-fetch/-/node-fetch-3.3.2.tgz", + "integrity": "sha512-dRB78srN/l6gqWulah9SrxeYnxeddIG30+GOqK/9OlLVyLg3HPnr6SqOWTWOXKRwC2eGYCkZ59NNuSgvSrpgOA==", + "license": "MIT", + "dependencies": { + "data-uri-to-buffer": "^4.0.0", + "fetch-blob": "^3.1.4", + "formdata-polyfill": "^4.0.10" + }, + "engines": { + "node": "^12.20.0 || ^14.13.1 || >=16.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/node-fetch" + } + }, "node_modules/thread-stream": { "version": "4.0.0", "resolved": "https://registry.npmjs.org/thread-stream/-/thread-stream-4.0.0.tgz", @@ -2167,6 +2640,15 @@ "integrity": "sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw==", "license": "MIT" }, + "node_modules/web-streams-polyfill": { + "version": "3.3.3", + "resolved": "https://registry.npmjs.org/web-streams-polyfill/-/web-streams-polyfill-3.3.3.tgz", + "integrity": "sha512-d2JWLCivmZYTSIoge9MsgFCZrt571BikcWGYkjC1khllbTeDlGqZ2D8vD8E/lJa8WGWbb7Plm8/XJYV7IJHZZw==", + "license": "MIT", + "engines": { + "node": ">= 8" + } + }, "node_modules/webidl-conversions": { "version": "3.0.1", "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-3.0.1.tgz", diff --git a/package.json b/package.json index 5ffc6e9..b71edbc 100644 --- a/package.json +++ b/package.json @@ -13,6 +13,7 @@ "dependencies": { "@extractus/article-extractor": "^8.0.18", "@fastify/cors": "^11.2.0", + "@google-cloud/bigquery": "^8.1.1", "better-sqlite3": "^12.4.1", "fastify": "^5.6.1", "node-cron": "^4.2.1", diff --git a/sources.json b/sources.json new file mode 100644 index 0000000..c9908f7 --- /dev/null +++ b/sources.json @@ -0,0 +1,648 @@ +[ + { + "id": "al_jazeera", + "label": "Al Jazeera", + "feedUrl": "https://www.aljazeera.com/xml/rss/all.xml", + "website": "aljazeera.com", + "backfill": true + }, + { + "id": "bbc", + "label": "BBC", + "feedUrl": "https://feeds.bbci.co.uk/news/business/rss.xml", + "website": [ + "bbc.com", + "bbc.co.uk" + ], + "backfill": true + }, + { + "id": "business_insider", + "label": "Business Insider", + "feedUrl": "https://feeds.businessinsider.com/custom/all", + "website": "businessinsider.com", + "backfill": true + }, + { + "id": "bloomberg_markets", + "label": "Bloomberg Markets", + "feedUrl": "https://feeds.bloomberg.com/markets/news.rss", + "website": "bloomberg.com", + "backfill": true + }, + { + "id": "cnbc", + "label": "CNBC", + "feedUrl": "https://www.cnbc.com/id/100003114/device/rss/rss.html", + "website": "cnbc.com", + "backfill": true + }, + { + "id": "wall_street_journal", + "label": "Wall Street Journal", + "feedUrl": "https://feeds.a.dj.com/rss/RSSMarketsMain.xml", + "website": "wsj.com", + "backfill": true + }, + { + "id": "marketwatch", + "label": "MarketWatch", + "feedUrl": "https://feeds.marketwatch.com/marketwatch/topstories/", + "website": "marketwatch.com", + "backfill": true + }, + { + "id": "yahoo_finance", + "label": "Yahoo Finance", + "feedUrl": "https://finance.yahoo.com/news/rssindex", + "website": [ + "finance.yahoo.com", + "yahoo.com" + ], + "backfill": true + }, + { + "id": "seeking_alpha", + "label": "Seeking Alpha", + "feedUrl": "https://seekingalpha.com/feed.xml", + "website": "seekingalpha.com", + "backfill": true + }, + { + "id": "financial_times", + "label": "Financial Times", + "feedUrl": "https://www.ft.com/?format=rss", + "website": "ft.com", + "backfill": true + }, + { + "id": "the_economist", + "label": "The Economist", + "feedUrl": "https://www.economist.com/finance-and-economics/rss.xml", + "website": "economist.com", + "backfill": true + }, + { + "id": "fortune", + "label": "Fortune", + "feedUrl": "https://fortune.com/feed", + "website": "fortune.com", + "backfill": true + }, + { + "id": "forbes_business", + "label": "Forbes Business", + "feedUrl": "https://www.forbes.com/business/feed/", + "website": "forbes.com", + "backfill": true + }, + { + "id": "inc_magazine", + "label": "Inc Magazine", + "feedUrl": "https://www.inc.com/rss", + "website": "inc.com", + "backfill": true + }, + { + "id": "fast_company", + "label": "Fast Company", + "feedUrl": "https://www.fastcompany.com/latest/rss", + "website": "fastcompany.com", + "backfill": true + }, + { + "id": "entrepreneur", + "label": "Entrepreneur", + "feedUrl": "https://www.entrepreneur.com/latest.rss", + "website": "entrepreneur.com", + "backfill": true + }, + { + "id": "axios", + "label": "Axios", + "feedUrl": "https://api.axios.com/feed/", + "website": "axios.com", + "backfill": true + }, + { + "id": "wired_business", + "label": "Wired Business", + "feedUrl": "https://www.wired.com/feed/category/business/latest/rss", + "website": "wired.com", + "backfill": true + }, + { + "id": "npr_business", + "label": "NPR Business", + "feedUrl": "https://feeds.npr.org/1006/rss.xml", + "website": "npr.org", + "backfill": true + }, + { + "id": "federal_reserve", + "label": "Federal Reserve", + "feedUrl": "https://www.federalreserve.gov/feeds/press_all.xml", + "website": "federalreserve.gov", + "backfill": true + }, + { + "id": "techcrunch", + "label": "TechCrunch", + "feedUrl": "https://techcrunch.com/feed/", + "website": "techcrunch.com", + "backfill": true + }, + { + "id": "the_verge", + "label": "The Verge", + "feedUrl": "https://www.theverge.com/rss/index.xml", + "website": "theverge.com", + "backfill": true + }, + { + "id": "ars_technica", + "label": "Ars Technica", + "feedUrl": "https://feeds.arstechnica.com/arstechnica/index", + "website": "arstechnica.com", + "backfill": true + }, + { + "id": "retail_dive", + "label": "Retail Dive", + "feedUrl": "https://www.retaildive.com/feeds/news/", + "website": "retaildive.com", + "backfill": true + }, + { + "id": "manufacturing_dive", + "label": "Manufacturing Dive", + "feedUrl": "https://www.manufacturingdive.com/feeds/news/", + "website": "manufacturingdive.com", + "backfill": true + }, + { + "id": "banking_dive", + "label": "Banking Dive", + "feedUrl": "https://www.bankingdive.com/feeds/news/", + "website": "bankingdive.com", + "backfill": true + }, + { + "id": "financial_post_ca", + "label": "Financial Post CA", + "feedUrl": "https://financialpost.com/feed", + "website": "financialpost.com", + "backfill": true + }, + { + "id": "globe_and_mail", + "label": "Globe and Mail", + "feedUrl": "https://www.theglobeandmail.com/arc/outboundfeeds/rss/category/business/", + "website": "theglobeandmail.com", + "backfill": true + }, + { + "id": "guardian_business", + "label": "Guardian Business", + "feedUrl": "https://www.theguardian.com/uk/business/rss", + "website": "theguardian.com", + "backfill": true + }, + { + "id": "sky_news_business", + "label": "Sky News Business", + "feedUrl": "https://feeds.skynews.com/feeds/rss/business.xml", + "website": "skynews.com", + "backfill": true + }, + { + "id": "this_is_money", + "label": "This Is Money", + "feedUrl": "[FAILED] https://www.thisismoney.co.uk/money/news/index.rss", + "website": "thisismoney.co.uk", + "backfill": true + }, + { + "id": "city_a_m", + "label": "City A.M.", + "feedUrl": "https://www.cityam.com/feed/", + "website": "cityam.com", + "backfill": true + }, + { + "id": "spiegel_wirtschaft", + "label": "Spiegel Wirtschaft", + "feedUrl": "https://www.spiegel.de/wirtschaft/index.rss", + "website": "spiegel.de", + "backfill": true + }, + { + "id": "handelsblatt", + "label": "Handelsblatt", + "feedUrl": "https://www.handelsblatt.com/contentexport/feed/schlagzeilen", + "website": "handelsblatt.com", + "backfill": true + }, + { + "id": "faz_wirtschaft", + "label": "FAZ Wirtschaft", + "feedUrl": "https://www.faz.net/rss/aktuell/wirtschaft/", + "website": "faz.net", + "backfill": true + }, + { + "id": "die_welt_wirtschaft", + "label": "Die Welt Wirtschaft", + "feedUrl": "https://www.welt.de/feeds/section/wirtschaft.rss", + "website": "welt.de", + "backfill": true + }, + { + "id": "les_echos", + "label": "Les Echos", + "feedUrl": "[FAILED] https://feeds.lesechos.fr/rss/rss_la_une.xml", + "website": "lesechos.fr", + "backfill": true + }, + { + "id": "le_monde_economie", + "label": "Le Monde Economie", + "feedUrl": "https://www.lemonde.fr/economie/rss_full.xml", + "website": "lemonde.fr", + "backfill": true + }, + { + "id": "bfm_business", + "label": "BFM Business", + "feedUrl": "[FAILED] https://bfmbusiness.bfmtv.com/rss/news-flux-rss/", + "website": "bfmbusiness.bfmtv.com", + "backfill": true + }, + { + "id": "el_economista_es", + "label": "El Economista ES", + "feedUrl": "[FAILED] https://www.eleconomista.es/rss/rss-de-portada.php", + "website": "eleconomista.es", + "backfill": true + }, + { + "id": "expansion_es", + "label": "Expansion ES", + "feedUrl": "https://e00-expansion.uecdn.es/rss/portada.xml", + "website": "expansion.com", + "backfill": true + }, + { + "id": "cinco_dias", + "label": "Cinco Dias", + "feedUrl": "[FAILED] https://cincodias.elpais.com/rss/cincodias/ultima_hora_mercados.xml", + "website": "cincodias.elpais.com", + "backfill": true + }, + { + "id": "il_sole_24_ore", + "label": "Il Sole 24 Ore", + "feedUrl": "[FAILED] https://www.ilsole24ore.com/rss/economia--finanza.xml", + "website": "ilsole24ore.com", + "backfill": true + }, + { + "id": "fd_nl", + "label": "FD.nl", + "feedUrl": "[FAILED] https://fd.nl/rss", + "website": "fd.nl", + "backfill": true + }, + { + "id": "nzz_wirtschaft", + "label": "NZZ Wirtschaft", + "feedUrl": "https://www.nzz.ch/wirtschaft.rss", + "website": "nzz.ch", + "backfill": true + }, + { + "id": "moscow_times", + "label": "Moscow Times", + "feedUrl": "https://www.themoscowtimes.com/rss/news", + "website": "themoscowtimes.com", + "backfill": true + }, + { + "id": "rbc_russia", + "label": "RBC Russia", + "feedUrl": "https://rssexport.rbc.ru/rbcnews/news/30/full.rss", + "website": "rbc.ru", + "backfill": true + }, + { + "id": "economic_times_india", + "label": "Economic Times India", + "feedUrl": "https://economictimes.indiatimes.com/rssfeedstopstories.cms", + "website": "economictimes.indiatimes.com", + "backfill": true + }, + { + "id": "business_standard_in", + "label": "Business Standard IN", + "feedUrl": "https://www.business-standard.com/rss/home_page_top_stories.rss", + "website": "business-standard.com", + "backfill": true + }, + { + "id": "live_mint", + "label": "Live Mint", + "feedUrl": "[FAILED] https://www.livemint.com/rss/headlines", + "website": "livemint.com", + "backfill": true + }, + { + "id": "moneycontrol", + "label": "Moneycontrol", + "feedUrl": "https://www.moneycontrol.com/rss/MCtopnews.xml", + "website": "moneycontrol.com", + "backfill": true + }, + { + "id": "hindu_business_line", + "label": "Hindu Business Line", + "feedUrl": "https://www.thehindubusinessline.com/feeder/default.rss", + "website": "thehindubusinessline.com", + "backfill": true + }, + { + "id": "caixin_global", + "label": "Caixin Global", + "feedUrl": "[FAILED] https://www.caixinglobal.com/rss/newsfeeds/", + "website": "caixinglobal.com", + "backfill": true + }, + { + "id": "china_daily_business", + "label": "China Daily Business", + "feedUrl": "https://www.chinadaily.com.cn/rss/bizchina_rss.xml", + "website": "chinadaily.com.cn", + "backfill": true + }, + { + "id": "xinhua_business", + "label": "Xinhua Business", + "feedUrl": "[FAILED] https://english.news.cn/rss/business.xml", + "website": "news.cn", + "backfill": true + }, + { + "id": "south_china_morning_post", + "label": "South China Morning Post", + "feedUrl": "https://www.scmp.com/rss/91/feed", + "website": "scmp.com", + "backfill": true + }, + { + "id": "nikkei_asia", + "label": "Nikkei Asia", + "feedUrl": "https://asia.nikkei.com/rss/feed/nar", + "website": "asia.nikkei.com", + "backfill": true + }, + { + "id": "japan_times_business", + "label": "Japan Times Business", + "feedUrl": "[FAILED] https://www.japantimes.co.jp/feed/business/", + "website": "japantimes.co.jp", + "backfill": true + }, + { + "id": "korea_herald", + "label": "Korea Herald", + "feedUrl": "https://www.koreaherald.com/rss/010000000000.xml", + "website": "koreaherald.com", + "backfill": true + }, + { + "id": "korea_joongang_daily", + "label": "Korea JoongAng Daily", + "feedUrl": "[FAILED] https://koreajoongangdaily.joins.com/rss/", + "website": "koreajoongangdaily.joins.com", + "backfill": true + }, + { + "id": "business_times_sg", + "label": "Business Times SG", + "feedUrl": "https://www.businesstimes.com.sg/rss.xml", + "website": "businesstimes.com.sg", + "backfill": true + }, + { + "id": "straits_times_business", + "label": "Straits Times Business", + "feedUrl": "https://www.straitstimes.com/news/business/rss.xml", + "website": "straitstimes.com", + "backfill": true + }, + { + "id": "channel_newsasia", + "label": "Channel NewsAsia", + "feedUrl": "https://www.channelnewsasia.com/rssfeeds/8395986", + "website": "channelnewsasia.com", + "backfill": true + }, + { + "id": "bangkok_post_business", + "label": "Bangkok Post Business", + "feedUrl": "https://www.bangkokpost.com/rss/data/business.xml", + "website": "bangkokpost.com", + "backfill": true + }, + { + "id": "the_star_malaysia", + "label": "The Star Malaysia", + "feedUrl": "[FAILED] https://www.thestar.com.my/rss/Business/Business-News", + "website": "thestar.com.my", + "backfill": true + }, + { + "id": "australian_fin_review", + "label": "Australian Fin Review", + "feedUrl": "[FAILED] https://www.afr.com/rss", + "website": "afr.com", + "backfill": true + }, + { + "id": "abc_business_au", + "label": "ABC Business AU", + "feedUrl": "[FAILED] https://www.abc.net.au/news/feed/52278/rss.xml", + "website": "abc.net.au", + "backfill": true + }, + { + "id": "nz_herald_business", + "label": "NZ Herald Business", + "feedUrl": "https://www.nzherald.co.nz/arc/outboundfeeds/rss/section/business/", + "website": "nzherald.co.nz", + "backfill": true + }, + { + "id": "arabian_business", + "label": "Arabian Business", + "feedUrl": "[FAILED] https://www.arabianbusiness.com/rss.xml", + "website": "arabianbusiness.com", + "backfill": true + }, + { + "id": "gulf_news_business", + "label": "Gulf News Business", + "feedUrl": "[FAILED] https://gulfnews.com/rss/business", + "website": "gulfnews.com", + "backfill": true + }, + { + "id": "arab_news", + "label": "Arab News", + "feedUrl": "[FAILED] https://www.arabnews.com/rss/front_page.xml", + "website": "arabnews.com", + "backfill": true + }, + { + "id": "the_national_uae", + "label": "The National UAE", + "feedUrl": "https://www.thenationalnews.com/arc/outboundfeeds/rss/?outputType=xml", + "website": "thenationalnews.com", + "backfill": true + }, + { + "id": "businessday_nigeria", + "label": "BusinessDay Nigeria", + "feedUrl": "https://businessday.ng/feed/", + "website": "businessday.ng", + "backfill": true + }, + { + "id": "moneyweb_sa", + "label": "Moneyweb SA", + "feedUrl": "https://www.moneyweb.co.za/feed/", + "website": "moneyweb.co.za", + "backfill": true + }, + { + "id": "businesslive_sa", + "label": "BusinessLive SA", + "feedUrl": "[FAILED] https://www.businesslive.co.za/rss/bd/", + "website": "businesslive.co.za", + "backfill": true + }, + { + "id": "business_daily_africa", + "label": "Business Daily Africa", + "feedUrl": "[FAILED] https://www.businessdailyafrica.com/rss/", + "website": "businessdailyafrica.com", + "backfill": true + }, + { + "id": "vanguard_business_ng", + "label": "Vanguard Business NG", + "feedUrl": "https://www.vanguardngr.com/category/business/feed/", + "website": "vanguardngr.com", + "backfill": true + }, + { + "id": "folha_mercado_br", + "label": "Folha Mercado BR", + "feedUrl": "https://feeds.folha.uol.com.br/mercado/rss091.xml", + "website": "folha.uol.com.br", + "backfill": true + }, + { + "id": "g1_economia_br", + "label": "G1 Economia BR", + "feedUrl": "https://g1.globo.com/dynamo/economia/rss2.xml", + "website": "g1.globo.com", + "backfill": true + }, + { + "id": "exame_br", + "label": "Exame BR", + "feedUrl": "https://exame.com/feed/", + "website": "exame.com", + "backfill": true + }, + { + "id": "el_economista_mx", + "label": "El Economista MX", + "feedUrl": "[FAILED] https://www.eleconomista.com.mx/rss/rss.html", + "website": "eleconomista.com.mx", + "backfill": true + }, + { + "id": "expansion_mx", + "label": "Expansion MX", + "feedUrl": "https://expansion.mx/rss", + "website": "expansion.mx", + "backfill": true + }, + { + "id": "la_nacion_ar", + "label": "La Nacion AR", + "feedUrl": "https://www.lanacion.com.ar/arc/outboundfeeds/rss/category/economia/", + "website": "lanacion.com.ar", + "backfill": true + }, + { + "id": "infobae_economia_ar", + "label": "Infobae Economia AR", + "feedUrl": "[FAILED] https://www.infobae.com/feeds/rss/economia/", + "website": "infobae.com", + "backfill": true + }, + { + "id": "portafolio_colombia", + "label": "Portafolio Colombia", + "feedUrl": "[FAILED] https://www.portafolio.co/rss/portafolio.xml", + "website": "portafolio.co", + "backfill": true + }, + { + "id": "el_comercio_peru", + "label": "El Comercio Peru", + "feedUrl": "[FAILED] https://elcomercio.pe/arc/outboundfeeds/rss/section/economia/", + "website": "elcomercio.pe", + "backfill": true + }, + { + "id": "jamaica_gleaner", + "label": "Jamaica Gleaner", + "feedUrl": "https://jamaica-gleaner.com/feed/business.xml", + "website": [ + "jamaica-gleaner.com", + "jamaicagleaner.com" + ], + "backfill": true + }, + { + "id": "jamaica_observer", + "label": "Jamaica Observer", + "feedUrl": "https://www.jamaicaobserver.com/app/business/", + "website": "jamaicaobserver.com", + "backfill": true + }, + { + "id": "stabroek_news", + "label": "Stabroek News", + "feedUrl": "[FAILED] https://www.stabroeknews.com/feed/", + "website": "stabroeknews.com", + "backfill": true + }, + { + "id": "nation_news_barbados", + "label": "Nation News Barbados", + "feedUrl": "[FAILED] https://nationnews.com/rss-feed/", + "website": "nationnews.com", + "backfill": true + }, + { + "id": "google_news", + "label": "Google News", + "feedUrl": "https://news.google.com/rss?hl=en-GB&gl=GB&ceid=GB:en", + "website": "news.google.com", + "backfill": false + } +] diff --git a/src/content.js b/src/content.js index ccb87de..1c47720 100644 --- a/src/content.js +++ b/src/content.js @@ -1,8 +1,9 @@ -const { extract } = require('@extractus/article-extractor'); +const { extractFromHtml } = require('@extractus/article-extractor'); const sharp = require('sharp'); const db = require('./db'); const { generateAndStoreEmbedding } = require('./embeddings'); const { fetchWithPolicy } = require('./http'); +const { getSharedBrowserSession } = require('./sources/browserCrawler'); const updateArticleAssets = db.prepare(` UPDATE articles @@ -40,32 +41,7 @@ const selectArticlesMissingContent = db.prepare(` LIMIT ? `); -const blockedContentDomains = [ - 'axios.com', - 'bizjournals.com', - 'fastcompany.com', - 'gurufocus.com', - 'investing.com', - 'rbc.ru', - 'stocktitan.net', -]; const loggedBlockedDomains = new Set(); -const articleFetchHeaders = { - 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36', - Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8', - 'Accept-Language': 'en-US,en;q=0.9', - 'Cache-Control': 'no-cache', - Pragma: 'no-cache', - 'Upgrade-Insecure-Requests': '1', - 'sec-ch-ua': '"Google Chrome";v="135", "Chromium";v="135", "Not.A/Brand";v="24"', - 'sec-ch-ua-mobile': '?0', - 'sec-ch-ua-platform': '"macOS"', - 'Sec-Fetch-Dest': 'document', - 'Sec-Fetch-Mode': 'navigate', - 'Sec-Fetch-Site': 'none', - 'Sec-Fetch-User': '?1', -}; - let contentBackfillRunning = false; function getHostname(url) { @@ -76,10 +52,6 @@ function getHostname(url) { } } -function isBlockedContentUrl(url) { - const hostname = getHostname(url); - return blockedContentDomains.some((domain) => hostname === domain || hostname.endsWith(`.${domain}`)); -} function getErrorStatus(error) { if (error && Number.isInteger(error.status)) { @@ -147,20 +119,9 @@ async function fetchCompressedImage(url) { async function fetchAndStoreContent(id, url) { try { - if (isBlockedContentUrl(url)) { - const hostname = getHostname(url); - if (hostname && !loggedBlockedDomains.has(hostname)) { - loggedBlockedDomains.add(hostname); - console.warn(`content extraction skipped for blocked domain ${hostname}`); - } - markArticleStatus(markContentSkipped, id, `blocked domain: ${hostname || 'unknown'}`); - return; - } - - const article = await extract(url, {}, { - headers: articleFetchHeaders, - signal: AbortSignal.timeout(20000), - }); + const browserSession = await getSharedBrowserSession({ requestTimeout: 20000, maxConcurrentPages: 2 }); + const html = await browserSession.fetchRenderedHtml(url, { timeout: 20000 }); + const article = await extractFromHtml(html, url); if (!article) { markArticleStatus(markContentSkipped, id, 'extractor returned no article'); return; diff --git a/src/db.js b/src/db.js index ebc8ae7..7f65b14 100644 --- a/src/db.js +++ b/src/db.js @@ -127,6 +127,49 @@ db.exec(` ); `); +db.exec(` + CREATE TABLE IF NOT EXISTS gdelt_backfill_windows ( + source_id TEXT NOT NULL, + window_start TEXT NOT NULL, + window_end TEXT NOT NULL, + completed_at TEXT NOT NULL DEFAULT (datetime('now')), + PRIMARY KEY (source_id, window_start, window_end) + ); +`); + +db.exec(` + CREATE TABLE IF NOT EXISTS crawler_page_classifications ( + url TEXT PRIMARY KEY, + site_name TEXT NOT NULL, + classification TEXT NOT NULL, + pattern TEXT, + classified_at TEXT NOT NULL DEFAULT (datetime('now')) + ); +`); + +db.exec(` + CREATE TABLE IF NOT EXISTS crawler_url_patterns ( + site_name TEXT NOT NULL, + pattern TEXT NOT NULL, + classification TEXT NOT NULL, + hit_count INTEGER NOT NULL DEFAULT 1, + updated_at TEXT NOT NULL DEFAULT (datetime('now')), + PRIMARY KEY (site_name, pattern) + ); +`); + +db.exec(` + CREATE TABLE IF NOT EXISTS crawler_site_rules ( + site_name TEXT NOT NULL, + rule_type TEXT NOT NULL, + rule_value TEXT NOT NULL, + classification TEXT NOT NULL, + hit_count INTEGER NOT NULL DEFAULT 1, + updated_at TEXT NOT NULL DEFAULT (datetime('now')), + PRIMARY KEY (site_name, rule_type, rule_value) + ); +`); + for (const statement of [ 'ALTER TABLE articles ADD COLUMN image TEXT', 'ALTER TABLE articles ADD COLUMN content_status TEXT', diff --git a/src/scheduler.js b/src/scheduler.js index 2c39c9a..73d5f30 100644 --- a/src/scheduler.js +++ b/src/scheduler.js @@ -2,14 +2,18 @@ const cron = require('node-cron'); const config = require('./config'); const { ingestBatch } = require('./ingest'); const { fetchRssArticles } = require('./sources/rss'); -const { fetchGdeltArticles } = require('./sources/gdelt'); +const { fetchGdeltArticles, hasPendingWindows } = require('./sources/gdelt'); const { fetchEdgarArticles } = require('./sources/edgar'); const { fetchAlphaVantageArticles } = require('./sources/alphavantage'); const { fetchFinnhubArticles } = require('./sources/finnhub'); -const { crawlSite, getConfiguredCrawlerSites } = require('./sources/newsCrawler'); +const { fetchGoogleNewsArticles } = require('./sources/googleNews'); const { backfillMissingContent } = require('./content'); const { backfillMissingEmbeddings } = require('./embeddings'); +function sleep(ms) { + return new Promise((resolve) => setTimeout(resolve, ms)); +} + async function runSource(source, fetcher) { try { const articles = await fetcher(); @@ -20,16 +24,6 @@ async function runSource(source, fetcher) { } } -async function runCrawlerSources() { - const results = []; - - for (const site of getConfiguredCrawlerSites()) { - results.push(await runSource(site.name, () => crawlSite(site))); - } - - return results; -} - async function runAllIngestions() { const results = []; @@ -38,7 +32,7 @@ async function runAllIngestions() { results.push(await runSource('edgar', fetchEdgarArticles)); results.push(await runSource('alphavantage', fetchAlphaVantageArticles)); results.push(await runSource('finnhub', fetchFinnhubArticles)); - results.push(...await runCrawlerSources()); + results.push(await runSource('googlenews', fetchGoogleNewsArticles)); try { await backfillMissingContent(); @@ -60,8 +54,23 @@ function startScheduler() { await runSource('rss', fetchRssArticles); }; - const runGdelt = async () => { - await runSource('gdelt', fetchGdeltArticles); + const runGdeltLoop = async () => { + while (true) { + if (!hasPendingWindows()) { + await sleep(60 * 1000); + continue; + } + + const isBigQuery = String(config.gdelt?.source || 'api').toLowerCase() === 'bigquery'; + + if (isBigQuery) { + await fetchGdeltArticles(async (articles) => { + await ingestBatch('gdelt', articles); + }); + } else { + await runSource('gdelt', fetchGdeltArticles); + } + } }; const runEdgar = async () => { @@ -76,6 +85,10 @@ function startScheduler() { await runSource('finnhub', fetchFinnhubArticles); }; + const runGoogleNews = async () => { + await runSource('googlenews', fetchGoogleNewsArticles); + }; + const runContentMaintenance = async () => { try { await backfillMissingContent(); @@ -91,24 +104,19 @@ function startScheduler() { }; runRss(); - runGdelt(); + runGdeltLoop(); runEdgar(); runAlphaVantage(); runFinnhub(); + // runGoogleNews(); runContentMaintenance(); cron.schedule(config.scheduler.rss, runRss); - cron.schedule(config.scheduler.gdelt, runGdelt); cron.schedule(config.scheduler.edgar, runEdgar); cron.schedule(config.scheduler.alphaVantage, runAlphaVantage); cron.schedule(config.scheduler.finnhub, runFinnhub); - if (config.scheduler.newsCrawler) { - runCrawlerSources(); - cron.schedule(config.scheduler.newsCrawler, runCrawlerSources); - } - - cron.schedule(config.contentBackfill.cron, runContentMaintenance); +cron.schedule(config.contentBackfill.cron, runContentMaintenance); } module.exports = { diff --git a/src/sources/browserCrawler.js b/src/sources/browserCrawler.js index 6eab8e6..5fb058e 100644 --- a/src/sources/browserCrawler.js +++ b/src/sources/browserCrawler.js @@ -1,43 +1,131 @@ const { chromium } = require('playwright'); const BROWSER_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36'; -let browserPromise = null; +const MAX_RENDERED_HTML_LENGTH = 1_500_000; +const DEFAULT_REQUEST_TIMEOUT = 20000; +const CONSENT_BUTTON_SELECTORS = [ + 'button[name="agree"]', + 'input[name="agree"]', + 'button:has-text("Accept all")', + 'button:has-text("Accept All")', + 'button:has-text("Accept")', + 'button:has-text("Accept cookies")', + 'button:has-text("I agree")', + 'button:has-text("Agree")', + 'button:has-text("Consent")', + '[data-action="agree"]', + '[data-testid="consent-accept"]', + '[data-testid="accept-button"]', +]; -async function getBrowser() { - if (!browserPromise) { - browserPromise = chromium.launch({ - headless: false, - }); - } +let sharedBrowserSessionPromise = null; +let sharedBrowserShutdownInstalled = false; - return browserPromise; +function normalizeTimeout(value) { + return Math.max(1000, Math.min(Number(value) || DEFAULT_REQUEST_TIMEOUT, 30000)); } -async function waitForUsefulDom(page, site) { +async function waitForUsefulDom(page, requestTimeout) { + const timeout = Math.min(normalizeTimeout(requestTimeout), 5000); + try { - await page.waitForLoadState('networkidle', { timeout: Math.min(site.requestTimeout, 5000) }); + await page.waitForLoadState('networkidle', { timeout }); } catch { } try { - await page.waitForFunction(() => document.querySelectorAll('a[href]').length > 20, { - timeout: Math.min(site.requestTimeout, 5000), - }); + await page.waitForFunction(() => document.querySelectorAll('a[href]').length > 20, { timeout }); } catch { } } -async function createBrowserSession(site) { - const browser = await getBrowser(); +async function acceptConsentIfPresent(page) { + let url; + try { + url = new URL(page.url()); + } catch { + return; + } + + const hostname = url.hostname.toLowerCase(); + const pathname = url.pathname.toLowerCase(); + const shouldCheckConsent = hostname.includes('consent.') + || hostname.includes('yahoo.com') + || /cookie|consent|privacy/.test(hostname) + || /cookie|consent|privacy/.test(pathname); + + if (!shouldCheckConsent) { + return; + } + + for (const selector of CONSENT_BUTTON_SELECTORS) { + const locator = page.locator(selector).first(); + + try { + if (await locator.isVisible({ timeout: 750 })) { + await locator.click({ timeout: 2000 }); + try { + await page.waitForLoadState('domcontentloaded', { timeout: 5000 }); + } catch { + } + return; + } + } catch { + } + } +} + +async function buildBrowserSession(options = {}) { + const requestTimeout = normalizeTimeout(options.requestTimeout); + const maxConcurrentPages = Math.max(1, Math.min(Number(options.maxConcurrentPages) || 2, 8)); + const browser = await chromium.launch({ + headless: true, + args: [ + '--disable-blink-features=AutomationControlled', + '--no-sandbox', + '--disable-dev-shm-usage', + ], + }); const context = await browser.newContext({ userAgent: BROWSER_USER_AGENT, viewport: { width: 1440, height: 1200 }, javaScriptEnabled: true, + extraHTTPHeaders: { + 'Accept-Language': 'en-US,en;q=0.9', + }, }); + await context.addInitScript(() => { + Object.defineProperty(navigator, 'webdriver', { get: () => undefined }); + Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] }); + Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3] }); + }); + const waiters = []; + let activePages = 0; + let closed = false; + + async function acquirePageSlot() { + if (activePages < maxConcurrentPages) { + activePages += 1; + return; + } + + await new Promise((resolve) => { + waiters.push(resolve); + }); + activePages += 1; + } + + function releasePageSlot() { + activePages = Math.max(0, activePages - 1); + const next = waiters.shift(); + if (next) { + next(); + } + } + await context.route('**/*', async (route) => { - const request = route.request(); - const resourceType = request.resourceType(); + const resourceType = route.request().resourceType(); if (['image', 'media', 'font'].includes(resourceType)) { await route.abort(); @@ -49,16 +137,23 @@ async function createBrowserSession(site) { return { async fetchRenderedHtml(url, options = {}) { + if (closed) { + throw new Error('browser session is closed'); + } + + await acquirePageSlot(); const page = await context.newPage(); + const timeout = normalizeTimeout(options.timeout || requestTimeout); try { await page.goto(url, { waitUntil: 'domcontentloaded', - timeout: site.requestTimeout, + timeout, }); - await waitForUsefulDom(page, site); - const html = await page.content(); + await acceptConsentIfPresent(page); + await waitForUsefulDom(page, timeout); + const html = (await page.content()).slice(0, MAX_RENDERED_HTML_LENGTH); if (options.includeDebug) { return { @@ -71,20 +166,73 @@ async function createBrowserSession(site) { return html; } finally { - await page.close(); + try { + await page.close(); + } finally { + releasePageSlot(); + } } }, async close() { + if (closed) { + return; + } + + closed = true; await context.close(); + await browser.close(); }, }; } -function shouldUseBrowser(site) { - return site.renderMode === 'browser'; +function installSharedBrowserShutdown() { + if (sharedBrowserShutdownInstalled) { + return; + } + + sharedBrowserShutdownInstalled = true; + const shutdown = () => { + if (!sharedBrowserSessionPromise) { + return; + } + + const sessionPromise = sharedBrowserSessionPromise; + sharedBrowserSessionPromise = null; + sessionPromise + .then((session) => session.close()) + .catch((error) => console.error('shared browser shutdown failed:', error)); + }; + + process.once('beforeExit', shutdown); + process.once('SIGINT', shutdown); + process.once('SIGTERM', shutdown); +} + +async function createBrowserSession(site = {}) { + return buildBrowserSession({ + requestTimeout: site.requestTimeout, + maxConcurrentPages: site.pageConcurrency, + }); +} + +async function getSharedBrowserSession(options = {}) { + if (!sharedBrowserSessionPromise) { + installSharedBrowserShutdown(); + sharedBrowserSessionPromise = buildBrowserSession({ + requestTimeout: options.requestTimeout, + maxConcurrentPages: options.maxConcurrentPages || 2, + }); + } + + return sharedBrowserSessionPromise; +} + +function shouldUseBrowser() { + return true; } module.exports = { createBrowserSession, + getSharedBrowserSession, shouldUseBrowser, }; diff --git a/src/sources/crawlerClassifier.js b/src/sources/crawlerClassifier.js new file mode 100644 index 0000000..25e15e8 --- /dev/null +++ b/src/sources/crawlerClassifier.js @@ -0,0 +1,804 @@ +const db = require('../db'); +const config = require('../config'); + +const POSITIVE_RULE_TYPES = new Set([ + 'meta_og_type', + 'meta_has_publish_time', + 'jsonld_type', + 'has_tag', + 'url_pattern', + 'path_segment', + 'meta_presence', + 'meta_value_pattern', + 'selector_present', + 'class_token_present', + 'attr_presence', + 'link_density_bucket', + 'paragraph_count_bucket', + 'headline_container_pattern', + 'byline_signal', + 'time_signal', + 'body_container_signal', + 'listing_container_signal', + 'pagination_signal', + 'url_prefix_pattern', + 'canonical_pattern', + 'shallow_text_signal', + 'repeated_card_signal', + 'nav_density_bucket', + 'utility_path_signal', + 'commercial_signal', + 'media_signal', +]); + +const selectCachedClassification = db.prepare(` + SELECT classification + FROM crawler_page_classifications + WHERE url = ? +`); +const upsertCachedClassification = db.prepare(` + INSERT INTO crawler_page_classifications (url, site_name, classification, pattern) + VALUES (?, ?, ?, ?) + ON CONFLICT(url) DO UPDATE SET + site_name = excluded.site_name, + classification = excluded.classification, + pattern = excluded.pattern, + classified_at = datetime('now') +`); +const selectPatternsForSite = db.prepare(` + SELECT pattern, classification, hit_count + FROM crawler_url_patterns + WHERE site_name = ? + AND hit_count >= ? + ORDER BY hit_count DESC, pattern ASC +`); +const upsertPattern = db.prepare(` + INSERT INTO crawler_url_patterns (site_name, pattern, classification, hit_count) + VALUES (?, ?, ?, 1) + ON CONFLICT(site_name, pattern) DO UPDATE SET + classification = excluded.classification, + hit_count = CASE + WHEN crawler_url_patterns.classification = excluded.classification THEN crawler_url_patterns.hit_count + 1 + ELSE 1 + END, + updated_at = datetime('now') +`); +const selectRulesForSite = db.prepare(` + SELECT rule_type, rule_value, classification, hit_count + FROM crawler_site_rules + WHERE site_name = ? + AND hit_count >= ? + ORDER BY hit_count DESC, rule_type ASC, rule_value ASC +`); +const upsertRule = db.prepare(` + INSERT INTO crawler_site_rules (site_name, rule_type, rule_value, classification, hit_count) + VALUES (?, ?, ?, ?, 1) + ON CONFLICT(site_name, rule_type, rule_value) DO UPDATE SET + classification = excluded.classification, + hit_count = CASE + WHEN crawler_site_rules.classification = excluded.classification THEN crawler_site_rules.hit_count + 1 + ELSE 1 + END, + updated_at = datetime('now') +`); + +function normalizePathSegment(segment) { + if (/^\d{4}$/.test(segment)) { + return '{year}'; + } + + if (/^\d{2}$/.test(segment)) { + return '{num2}'; + } + + if (/^\d+$/.test(segment)) { + return '{id}'; + } + + if (/^[a-f0-9]{8,}$/i.test(segment)) { + return '{hex}'; + } + + return String(segment || '').toLowerCase(); +} + +function buildUrlPattern(url) { + try { + const parsed = new URL(url); + const normalizedSegments = parsed.pathname + .split('/') + .filter(Boolean) + .map(normalizePathSegment); + + return `/${normalizedSegments.join('/')}` || '/'; + } catch { + return null; + } +} + +function patternToRegex(pattern) { + const escaped = String(pattern || '') + .replace(/[.*+?^${}()|[\]\\]/g, '\\$&') + .replace(/\{year\}/g, '\\d{4}') + .replace(/\{num2\}/g, '\\d{2}') + .replace(/\{id\}/g, '\\d+') + .replace(/\{hex\}/g, '[a-f0-9]+'); + + return new RegExp(`^${escaped}$`, 'i'); +} + +function sanitizeText(value, maxLength = 200) { + return String(value || '') + .replace(/<[^>]*>/g, ' ') + .replace(/\s+/g, ' ') + .trim() + .slice(0, maxLength); +} + +function normalizeRuleValue(value) { + return sanitizeText(String(value || '').toLowerCase(), 160); +} + +function pushSignal(signals, ruleType, ruleValue) { + const normalizedValue = normalizeRuleValue(ruleValue); + if (!ruleType || !normalizedValue) { + return; + } + + signals.push({ ruleType, ruleValue: normalizedValue }); +} + +function uniqueSignals(signals) { + const seen = new Set(); + const unique = []; + + for (const signal of signals) { + const key = `${signal.ruleType}:${signal.ruleValue}`; + if (seen.has(key)) { + continue; + } + + seen.add(key); + unique.push(signal); + } + + return unique; +} + +function extractJsonObjectString(value) { + const text = String(value || '').trim(); + const start = text.indexOf('{'); + if (start === -1) { + return ''; + } + + let depth = 0; + let inString = false; + let escape = false; + + for (let index = start; index < text.length; index += 1) { + const char = text[index]; + + if (escape) { + escape = false; + continue; + } + + if (char === '\\') { + escape = true; + continue; + } + + if (char === '"') { + inString = !inString; + continue; + } + + if (inString) { + continue; + } + + if (char === '{') { + depth += 1; + continue; + } + + if (char === '}') { + depth -= 1; + if (depth === 0) { + return text.slice(start, index + 1); + } + } + } + + return text.slice(start); +} + +function repairJsonString(value) { + let repaired = String(value || '').trim(); + if (!repaired) { + return ''; + } + + repaired = repaired + .replace(/^```(?:json)?\s*/i, '') + .replace(/\s*```$/i, '') + .trim(); + + repaired = extractJsonObjectString(repaired); + if (!repaired) { + return ''; + } + + repaired = repaired + .replace(/[\u0000-\u001f]+/g, ' ') + .replace(/,\s*([}\]])/g, '$1') + .replace(/:\s*undefined\b/g, ': null') + .replace(/:\s*NaN\b/g, ': null') + .replace(/:\s*Infinity\b/g, ': null'); + + const openCurly = (repaired.match(/\{/g) || []).length; + const closeCurly = (repaired.match(/\}/g) || []).length; + const openSquare = (repaired.match(/\[/g) || []).length; + const closeSquare = (repaired.match(/\]/g) || []).length; + + if (closeSquare < openSquare) { + repaired += ']'.repeat(openSquare - closeSquare); + } + + if (closeCurly < openCurly) { + repaired += '}'.repeat(openCurly - closeCurly); + } + + return repaired; +} + +function parseJsonLoose(value) { + const direct = String(value || '').trim(); + if (!direct) { + return {}; + } + + try { + return JSON.parse(direct); + } catch { + } + + const repaired = repairJsonString(direct); + if (!repaired) { + return {}; + } + + try { + return JSON.parse(repaired); + } catch (error) { + console.error('failed to parse crawler classification payload:', error, direct); + return {}; + } +} + +function extractClassTokens(html) { + const attrs = html.match(/\bclass\s*=\s*(["'])(.*?)\1/gi) || []; + const tokens = []; + + for (const attr of attrs) { + const match = attr.match(/\bclass\s*=\s*(["'])(.*?)\1/i); + const raw = match ? match[2] : ''; + for (const token of raw.split(/\s+/)) { + const normalized = String(token || '').trim().toLowerCase(); + if (!normalized || normalized.length < 3 || normalized.length > 40) { + continue; + } + if (!/[a-z]/.test(normalized)) { + continue; + } + if (/^(jsx-\d+|sc-[a-z0-9]+|css-[a-z0-9]+|_[a-z0-9]+|[a-f0-9]{10,})$/i.test(normalized)) { + continue; + } + tokens.push(normalized); + for (const part of normalized.split(/[_-]+/)) { + if (part.length >= 4 && part.length <= 24 && /[a-z]/.test(part) && !/^\d+$/.test(part)) { + tokens.push(part); + } + } + } + } + + return uniqueSignals(tokens.map((token) => ({ ruleType: 'class_token_present', ruleValue: token }))).map((entry) => entry.ruleValue); +} + +function extractTagSummary(html) { + const tags = new Set(); + const regex = /<([a-z0-9:-]+)\b/gi; + let match; + + while ((match = regex.exec(html)) !== null && tags.size < 50) { + tags.add(String(match[1] || '').toLowerCase()); + } + + return [...tags]; +} + +function extractAttributeValues(html, attrName) { + const regex = new RegExp(`\\b${attrName}\\s*=\\s*(["'])(.*?)\\1`, 'gi'); + const values = []; + let match; + + while ((match = regex.exec(html)) !== null) { + values.push(String(match[2] || '').trim()); + } + + return values; +} + +function detectLinkDensityBucket(links, paragraphTextLength) { + if (!paragraphTextLength) { + return links.length >= 15 ? 'high' : 'medium'; + } + + const ratio = (links.length * 1000) / Math.max(paragraphTextLength, 1); + if (ratio >= 18 || links.length >= 60) { + return 'high'; + } + if (ratio >= 8 || links.length >= 25) { + return 'medium'; + } + return 'low'; +} + +function detectParagraphBucket(paragraphCount) { + if (paragraphCount === 0) { + return '0'; + } + if (paragraphCount <= 2) { + return '1-2'; + } + if (paragraphCount <= 7) { + return '3-7'; + } + return '8+'; +} + +function detectHeadlineContainerPattern(html, headlineLinks) { + const h1Count = (html.match(/