add Docker configuration and news crawler implementation
This commit is contained in:
parent
7724fafbdc
commit
c91e4ddb60
8 changed files with 915 additions and 2 deletions
10
.dockerignore
Normal file
10
.dockerignore
Normal file
|
|
@ -0,0 +1,10 @@
|
||||||
|
node_modules
|
||||||
|
npm-debug.log
|
||||||
|
Dockerfile*
|
||||||
|
docker-compose*.yml
|
||||||
|
.git
|
||||||
|
.gitignore
|
||||||
|
archive.sqlite
|
||||||
|
archive.sqlite-shm
|
||||||
|
archive.sqlite-wal
|
||||||
|
data
|
||||||
2
.gitignore
vendored
2
.gitignore
vendored
|
|
@ -2,7 +2,7 @@ node_modules/
|
||||||
.env
|
.env
|
||||||
.env.*
|
.env.*
|
||||||
|
|
||||||
config.json
|
#config.json
|
||||||
|
|
||||||
*.sqlite
|
*.sqlite
|
||||||
*.sqlite-shm
|
*.sqlite-shm
|
||||||
|
|
|
||||||
17
Dockerfile
Normal file
17
Dockerfile
Normal file
|
|
@ -0,0 +1,17 @@
|
||||||
|
FROM node:22-bookworm-slim
|
||||||
|
|
||||||
|
ENV NODE_ENV=production
|
||||||
|
|
||||||
|
WORKDIR /app
|
||||||
|
|
||||||
|
COPY package.json package-lock.json ./
|
||||||
|
RUN npm ci --omit=dev \
|
||||||
|
&& npm cache clean --force \
|
||||||
|
&& mkdir -p /data \
|
||||||
|
&& ln -s /data/archive.sqlite /app/archive.sqlite
|
||||||
|
|
||||||
|
COPY . .
|
||||||
|
|
||||||
|
EXPOSE 3001
|
||||||
|
|
||||||
|
CMD ["npm", "start"]
|
||||||
|
|
@ -25,7 +25,8 @@ Node.js Fastify server that ingests news articles from RSS, SEC EDGAR 8-K filing
|
||||||
## Notes
|
## Notes
|
||||||
|
|
||||||
- SQLite archive file defaults to `./archive.sqlite`.
|
- SQLite archive file defaults to `./archive.sqlite`.
|
||||||
- Deduplication is enforced on `url` and normalized title.
|
- Deduplication is enforced on `url`; normalized titles are stored and indexed for matching but are not unique.
|
||||||
|
- `newsCrawler.sites` can be configured with same-site seed pages for bounded HTML crawling and historical article discovery.
|
||||||
- Article body extraction runs asynchronously after insertion, with hourly retries for rows still missing content.
|
- Article body extraction runs asynchronously after insertion, with hourly retries for rows still missing content.
|
||||||
- Main article images are stored as ultra-compressed base64 WebP.
|
- Main article images are stored as ultra-compressed base64 WebP.
|
||||||
- Embeddings are generated asynchronously with OpenRouter `perplexity/pplx-embed-v1-0.6b` and indexed in `sqlite-vec` for similarity search.
|
- Embeddings are generated asynchronously with OpenRouter `perplexity/pplx-embed-v1-0.6b` and indexed in `sqlite-vec` for similarity search.
|
||||||
|
|
|
||||||
406
config.json
Normal file
406
config.json
Normal file
|
|
@ -0,0 +1,406 @@
|
||||||
|
{
|
||||||
|
"server": {
|
||||||
|
"port": 3001,
|
||||||
|
"host": "0.0.0.0"
|
||||||
|
},
|
||||||
|
"database": {
|
||||||
|
"path": "./archive.sqlite"
|
||||||
|
},
|
||||||
|
"sec": {
|
||||||
|
"userAgent": "Augor benjamin.watt@imbenji.net",
|
||||||
|
"tickers": []
|
||||||
|
},
|
||||||
|
"alphaVantage": {
|
||||||
|
"apiKey": "KJ68ZQEW0PF524UA",
|
||||||
|
"tickers": []
|
||||||
|
},
|
||||||
|
"finnhub": {
|
||||||
|
"apiKey": "d7gg0h1r01qmqj4573sgd7gg0h1r01qmqj4573t0",
|
||||||
|
"tickers": []
|
||||||
|
},
|
||||||
|
"openRouter": {
|
||||||
|
"apiKey": "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"
|
||||||
|
},
|
||||||
|
"rssFeeds": [
|
||||||
|
{
|
||||||
|
"url": "https://www.aljazeera.com/xml/rss/all.xml",
|
||||||
|
"label": "Al Jazeera"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://feeds.bbci.co.uk/news/business/rss.xml",
|
||||||
|
"label": "BBC Business"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://feeds.businessinsider.com/custom/all",
|
||||||
|
"label": "Business Insider"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://feeds.bloomberg.com/markets/news.rss",
|
||||||
|
"label": "Bloomberg Markets"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://www.cnbc.com/id/100003114/device/rss/rss.html",
|
||||||
|
"label": "CNBC"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://feeds.a.dj.com/rss/RSSMarketsMain.xml",
|
||||||
|
"label": "Wall Street Journal"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://feeds.marketwatch.com/marketwatch/topstories/",
|
||||||
|
"label": "MarketWatch"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://finance.yahoo.com/news/rssindex",
|
||||||
|
"label": "Yahoo Finance"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://seekingalpha.com/feed.xml",
|
||||||
|
"label": "Seeking Alpha"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://www.ft.com/?format=rss",
|
||||||
|
"label": "Financial Times"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://www.economist.com/finance-and-economics/rss.xml",
|
||||||
|
"label": "The Economist"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://fortune.com/feed",
|
||||||
|
"label": "Fortune"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://www.forbes.com/business/feed/",
|
||||||
|
"label": "Forbes Business"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://www.inc.com/rss",
|
||||||
|
"label": "Inc Magazine"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://www.fastcompany.com/latest/rss",
|
||||||
|
"label": "Fast Company"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://www.entrepreneur.com/latest.rss",
|
||||||
|
"label": "Entrepreneur"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://api.axios.com/feed/",
|
||||||
|
"label": "Axios"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://www.wired.com/feed/category/business/latest/rss",
|
||||||
|
"label": "Wired Business"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://feeds.npr.org/1006/rss.xml",
|
||||||
|
"label": "NPR Business"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://www.federalreserve.gov/feeds/press_all.xml",
|
||||||
|
"label": "Federal Reserve"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://techcrunch.com/feed/",
|
||||||
|
"label": "TechCrunch"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://www.theverge.com/rss/index.xml",
|
||||||
|
"label": "The Verge"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://feeds.arstechnica.com/arstechnica/index",
|
||||||
|
"label": "Ars Technica"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://www.retaildive.com/feeds/news/",
|
||||||
|
"label": "Retail Dive"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://www.manufacturingdive.com/feeds/news/",
|
||||||
|
"label": "Manufacturing Dive"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://www.bankingdive.com/feeds/news/",
|
||||||
|
"label": "Banking Dive"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://financialpost.com/feed",
|
||||||
|
"label": "Financial Post CA"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://www.theglobeandmail.com/arc/outboundfeeds/rss/category/business/",
|
||||||
|
"label": "Globe and Mail"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://www.theguardian.com/uk/business/rss",
|
||||||
|
"label": "Guardian Business"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://feeds.skynews.com/feeds/rss/business.xml",
|
||||||
|
"label": "Sky News Business"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://www.thisismoney.co.uk/money/news/index.rss",
|
||||||
|
"label": "This Is Money"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://www.cityam.com/feed/",
|
||||||
|
"label": "City A.M."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://www.spiegel.de/wirtschaft/index.rss",
|
||||||
|
"label": "Spiegel Wirtschaft"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://www.handelsblatt.com/contentexport/feed/schlagzeilen",
|
||||||
|
"label": "Handelsblatt"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://www.faz.net/rss/aktuell/wirtschaft/",
|
||||||
|
"label": "FAZ Wirtschaft"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://www.welt.de/feeds/section/wirtschaft.rss",
|
||||||
|
"label": "Die Welt Wirtschaft"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://feeds.lesechos.fr/rss/rss_la_une.xml",
|
||||||
|
"label": "Les Echos"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://www.lemonde.fr/economie/rss_full.xml",
|
||||||
|
"label": "Le Monde Economie"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://bfmbusiness.bfmtv.com/rss/news-flux-rss/",
|
||||||
|
"label": "BFM Business"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://www.eleconomista.es/rss/rss-de-portada.php",
|
||||||
|
"label": "El Economista ES"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://e00-expansion.uecdn.es/rss/portada.xml",
|
||||||
|
"label": "Expansion ES"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://cincodias.elpais.com/rss/cincodias/ultima_hora_mercados.xml",
|
||||||
|
"label": "Cinco Dias"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://www.ilsole24ore.com/rss/economia--finanza.xml",
|
||||||
|
"label": "Il Sole 24 Ore"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://fd.nl/rss",
|
||||||
|
"label": "FD.nl"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://www.nzz.ch/wirtschaft.rss",
|
||||||
|
"label": "NZZ Wirtschaft"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://www.themoscowtimes.com/rss/news",
|
||||||
|
"label": "Moscow Times"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://rssexport.rbc.ru/rbcnews/news/30/full.rss",
|
||||||
|
"label": "RBC Russia"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://economictimes.indiatimes.com/rssfeedstopstories.cms",
|
||||||
|
"label": "Economic Times India"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://www.business-standard.com/rss/home_page_top_stories.rss",
|
||||||
|
"label": "Business Standard IN"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://www.livemint.com/rss/headlines",
|
||||||
|
"label": "Live Mint"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://www.moneycontrol.com/rss/MCtopnews.xml",
|
||||||
|
"label": "Moneycontrol"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://www.thehindubusinessline.com/feeder/default.rss",
|
||||||
|
"label": "Hindu Business Line"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://www.caixinglobal.com/rss/newsfeeds/",
|
||||||
|
"label": "Caixin Global"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://www.chinadaily.com.cn/rss/bizchina_rss.xml",
|
||||||
|
"label": "China Daily Business"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://english.news.cn/rss/business.xml",
|
||||||
|
"label": "Xinhua Business"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://www.scmp.com/rss/91/feed",
|
||||||
|
"label": "South China Morning Post"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://asia.nikkei.com/rss/feed/nar",
|
||||||
|
"label": "Nikkei Asia"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://www.japantimes.co.jp/feed/business/",
|
||||||
|
"label": "Japan Times Business"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://www.koreaherald.com/rss/010000000000.xml",
|
||||||
|
"label": "Korea Herald"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://koreajoongangdaily.joins.com/rss/",
|
||||||
|
"label": "Korea JoongAng Daily"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://www.businesstimes.com.sg/rss.xml",
|
||||||
|
"label": "Business Times SG"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://www.straitstimes.com/news/business/rss.xml",
|
||||||
|
"label": "Straits Times Business"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://www.channelnewsasia.com/rssfeeds/8395986",
|
||||||
|
"label": "Channel NewsAsia"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://www.bangkokpost.com/rss/data/business.xml",
|
||||||
|
"label": "Bangkok Post Business"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://www.thestar.com.my/rss/Business/Business-News",
|
||||||
|
"label": "The Star Malaysia"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://www.afr.com/rss",
|
||||||
|
"label": "Australian Fin Review"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://www.abc.net.au/news/feed/52278/rss.xml",
|
||||||
|
"label": "ABC Business AU"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://www.nzherald.co.nz/arc/outboundfeeds/rss/section/business/",
|
||||||
|
"label": "NZ Herald Business"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://www.arabianbusiness.com/rss.xml",
|
||||||
|
"label": "Arabian Business"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://gulfnews.com/rss/business",
|
||||||
|
"label": "Gulf News Business"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://www.arabnews.com/rss/front_page.xml",
|
||||||
|
"label": "Arab News"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://www.thenationalnews.com/arc/outboundfeeds/rss/?outputType=xml",
|
||||||
|
"label": "The National UAE"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://businessday.ng/feed/",
|
||||||
|
"label": "BusinessDay Nigeria"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://www.moneyweb.co.za/feed/",
|
||||||
|
"label": "Moneyweb SA"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://www.businesslive.co.za/rss/bd/",
|
||||||
|
"label": "BusinessLive SA"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://www.businessdailyafrica.com/rss/",
|
||||||
|
"label": "Business Daily Africa"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://www.vanguardngr.com/category/business/feed/",
|
||||||
|
"label": "Vanguard Business NG"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://feeds.folha.uol.com.br/mercado/rss091.xml",
|
||||||
|
"label": "Folha Mercado BR"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://g1.globo.com/dynamo/economia/rss2.xml",
|
||||||
|
"label": "G1 Economia BR"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://exame.com/feed/",
|
||||||
|
"label": "Exame BR"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://www.eleconomista.com.mx/rss/rss.html",
|
||||||
|
"label": "El Economista MX"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://expansion.mx/rss",
|
||||||
|
"label": "Expansion MX"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://www.lanacion.com.ar/arc/outboundfeeds/rss/category/economia/",
|
||||||
|
"label": "La Nacion AR"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://www.infobae.com/feeds/rss/economia/",
|
||||||
|
"label": "Infobae Economia AR"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://www.portafolio.co/rss/portafolio.xml",
|
||||||
|
"label": "Portafolio Colombia"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url": "https://elcomercio.pe/arc/outboundfeeds/rss/section/economia/",
|
||||||
|
"label": "El Comercio Peru"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"gdelt": {
|
||||||
|
"queries": [
|
||||||
|
"technology"
|
||||||
|
],
|
||||||
|
"mode": "ArtList",
|
||||||
|
"maxRecords": 50,
|
||||||
|
"format": "json"
|
||||||
|
},
|
||||||
|
"newsCrawler": {
|
||||||
|
"sites": [
|
||||||
|
{
|
||||||
|
"name": "crawler_reuters",
|
||||||
|
"allowedHosts": [
|
||||||
|
"www.reuters.com",
|
||||||
|
"reuters.com"
|
||||||
|
],
|
||||||
|
"seeds": [
|
||||||
|
"https://www.reuters.com/world/",
|
||||||
|
"https://www.reuters.com/business/",
|
||||||
|
"https://www.reuters.com/markets/",
|
||||||
|
"https://www.reuters.com/technology/"
|
||||||
|
],
|
||||||
|
"maxPages": 100,
|
||||||
|
"maxDepth": 2,
|
||||||
|
"requestTimeout": 15000
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"scheduler": {
|
||||||
|
"rss": "0 */6 * * *",
|
||||||
|
"gdelt": "0 */6 * * *",
|
||||||
|
"edgar": "15 0 * * *",
|
||||||
|
"alphaVantage": "30 0 * * *",
|
||||||
|
"finnhub": "45 0 * * *",
|
||||||
|
"newsCrawler": "15 */12 * * *"
|
||||||
|
}
|
||||||
|
}
|
||||||
16
docker-compose.yml
Normal file
16
docker-compose.yml
Normal file
|
|
@ -0,0 +1,16 @@
|
||||||
|
services:
|
||||||
|
api:
|
||||||
|
build:
|
||||||
|
context: .
|
||||||
|
volumes:
|
||||||
|
- ./config.json:/app/config.json:ro
|
||||||
|
- ./data:/data
|
||||||
|
environment:
|
||||||
|
NODE_ENV: production
|
||||||
|
restart: unless-stopped
|
||||||
|
networks:
|
||||||
|
- nginx_proxy_manager_default
|
||||||
|
|
||||||
|
networks:
|
||||||
|
nginx_proxy_manager_default:
|
||||||
|
external: true
|
||||||
|
|
@ -6,6 +6,7 @@ const { fetchGdeltArticles } = require('./sources/gdelt');
|
||||||
const { fetchEdgarArticles } = require('./sources/edgar');
|
const { fetchEdgarArticles } = require('./sources/edgar');
|
||||||
const { fetchAlphaVantageArticles } = require('./sources/alphavantage');
|
const { fetchAlphaVantageArticles } = require('./sources/alphavantage');
|
||||||
const { fetchFinnhubArticles } = require('./sources/finnhub');
|
const { fetchFinnhubArticles } = require('./sources/finnhub');
|
||||||
|
const { fetchCrawlerArticles } = require('./sources/newsCrawler');
|
||||||
const { backfillMissingContent } = require('./content');
|
const { backfillMissingContent } = require('./content');
|
||||||
const { backfillMissingEmbeddings } = require('./embeddings');
|
const { backfillMissingEmbeddings } = require('./embeddings');
|
||||||
|
|
||||||
|
|
@ -27,6 +28,7 @@ async function runAllIngestions() {
|
||||||
results.push(await runSource('edgar', fetchEdgarArticles));
|
results.push(await runSource('edgar', fetchEdgarArticles));
|
||||||
results.push(await runSource('alphavantage', fetchAlphaVantageArticles));
|
results.push(await runSource('alphavantage', fetchAlphaVantageArticles));
|
||||||
results.push(await runSource('finnhub', fetchFinnhubArticles));
|
results.push(await runSource('finnhub', fetchFinnhubArticles));
|
||||||
|
results.push(await runSource('news_crawler', fetchCrawlerArticles));
|
||||||
|
|
||||||
try {
|
try {
|
||||||
await backfillMissingContent();
|
await backfillMissingContent();
|
||||||
|
|
@ -64,6 +66,12 @@ function startScheduler() {
|
||||||
await runSource('finnhub', fetchFinnhubArticles);
|
await runSource('finnhub', fetchFinnhubArticles);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
if (config.scheduler.newsCrawler) {
|
||||||
|
cron.schedule(config.scheduler.newsCrawler, async () => {
|
||||||
|
await runSource('news_crawler', fetchCrawlerArticles);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
cron.schedule('0 * * * *', async () => {
|
cron.schedule('0 * * * *', async () => {
|
||||||
try {
|
try {
|
||||||
await backfillMissingContent();
|
await backfillMissingContent();
|
||||||
|
|
|
||||||
455
src/sources/newsCrawler.js
Normal file
455
src/sources/newsCrawler.js
Normal file
|
|
@ -0,0 +1,455 @@
|
||||||
|
const config = require('../config');
|
||||||
|
const { fetchWithPolicy } = require('../http');
|
||||||
|
|
||||||
|
// Query-parameter names that are removed when a URL is canonicalized
// (see shouldDropParam / canonicalizeUrl).
const TRACKING_PARAM_PATTERNS = [
  /^utm_/i,
  /^fbclid$/i,
  /^gclid$/i,
  /^mkt_tok$/i,
  /^mc_cid$/i,
  /^mc_eid$/i,
  /^ref$/i,
  /^ref_src$/i,
  /^s$/i,
  /^cmpid$/i,
  /^guccounter$/i,
  /^guce_referrer$/i,
  /^guce_referrer_sig$/i,
];

// Path fragments that add to a page's listing score in scorePage.
const LISTING_PATH_HINT = /(archive|archives|latest|topic|topics|section|sections|category|categories|news|world|business|politics|technology|tech|markets|economy|page|tag|tags)/i;

// URL shapes that add to a page's article score in scorePage:
// a /YYYY/MM/DD or /YYYY/MM path segment, or an article-like path prefix.
const ARTICLE_DATE_PATH = /\/\d{4}\/\d{2}\/\d{2}(?:\/|$)|\/\d{4}\/\d{2}(?:\/|$)/;
const ARTICLE_PATH_HINT = /(\/article\/|\/articles\/|\/news\/|\/story\/|\/stories\/)/i;

// Path prefixes that shouldQueueLink refuses to enqueue.
const BLOCKED_PATH_HINT = /(\/search(?:\/|$)|\/login(?:\/|$)|\/account(?:\/|$)|\/video(?:\/|$)|\/videos(?:\/|$)|\/podcast(?:\/|$)|\/podcasts(?:\/|$)|\/live(?:\/|$))/i;

// User-Agent header value sent by fetchHtml.
const USER_AGENT = 'duriin_api crawler/1.0';
|
||||||
|
|
||||||
|
/**
 * Decodes the HTML character references the crawler handles: numeric
 * references (`&#NN;`, `&#xHH;`) and the named references `&quot;`, `&apos;`,
 * `&amp;`, `&lt;`, `&gt;` and `&nbsp;` (the latter becomes a plain space).
 *
 * Decoding happens in a single pass, so already-escaped text such as
 * `&amp;lt;` yields `&lt;` rather than being decoded twice.
 *
 * @param {*} value - Text to decode; falsy values are treated as an empty string.
 * @returns {string} The decoded text.
 */
function decodeHtmlEntities(value) {
  const namedEntities = {
    quot: '"',
    apos: "'",
    amp: '&',
    lt: '<',
    gt: '>',
    nbsp: ' ',
  };
  const maxCodePoint = 0x10ffff;

  // String.fromCodePoint throws a RangeError for values above U+10FFFF, so
  // out-of-range references from untrusted HTML are left untouched instead.
  const decodeNumeric = (digits, radix, reference) => {
    const codePoint = Number.parseInt(digits, radix);
    return Number.isInteger(codePoint) && codePoint <= maxCodePoint
      ? String.fromCodePoint(codePoint)
      : reference;
  };

  return String(value || '').replace(
    /&(?:#[xX]([0-9a-fA-F]+)|#(\d+)|(quot|apos|amp|lt|gt|nbsp));/g,
    (reference, hex, dec, name) => {
      if (hex !== undefined) {
        return decodeNumeric(hex, 16, reference);
      }
      if (dec !== undefined) {
        return decodeNumeric(dec, 10, reference);
      }
      return namedEntities[name];
    },
  );
}
|
||||||
|
|
||||||
|
/**
 * Replaces every `<...>` tag with a space, decodes HTML entities in what is
 * left, then collapses whitespace runs and trims the result.
 *
 * @param {*} value - Markup or text; falsy values are treated as empty.
 * @returns {string} Plain text with single spaces between words.
 */
function stripTags(value) {
  const withoutMarkup = String(value || '').replace(/<[^>]*>/g, ' ');
  const decoded = decodeHtmlEntities(withoutMarkup);
  return decoded.replace(/\s+/g, ' ').trim();
}
|
||||||
|
|
||||||
|
/**
 * Produces tag-free, whitespace-normalized text by running the value through
 * stripTags and collapsing/trimming whitespace once more.
 *
 * @param {*} value - Markup or text to normalize.
 * @returns {string} Normalized plain text.
 */
function normalizeText(value) {
  const plain = stripTags(value);
  const collapsed = plain.replace(/\s+/g, ' ');
  return collapsed.trim();
}
|
||||||
|
|
||||||
|
/**
 * Checks whether a hostname equals one of the allowed hosts or is a
 * subdomain of one. The comparison is case-insensitive.
 *
 * @param {*} hostname - Hostname to test.
 * @param {Array<*>} allowedHosts - Hosts that are acceptable.
 * @returns {boolean} True when the hostname matches any allowed host.
 */
function isAllowedHost(hostname, allowedHosts) {
  const host = String(hostname || '').toLowerCase();

  const matches = (allowedHost) => {
    const allowed = String(allowedHost || '').toLowerCase();
    if (host === allowed) {
      return true;
    }
    return host.endsWith(`.${allowed}`);
  };

  return allowedHosts.some(matches);
}
|
||||||
|
|
||||||
|
/**
 * Tells whether a query-parameter name matches one of
 * TRACKING_PARAM_PATTERNS.
 *
 * @param {string} key - Query-parameter name.
 * @returns {boolean} True when the parameter should be removed.
 */
function shouldDropParam(key) {
  for (const pattern of TRACKING_PARAM_PATTERNS) {
    if (pattern.test(key)) {
      return true;
    }
  }
  return false;
}
|
||||||
|
|
||||||
|
/**
 * Resolves a URL against a base and reduces it to a canonical string:
 * http(s) only, host must be allowed, fragment and credentials removed,
 * tracking parameters dropped, remaining parameters sorted by key, and
 * trailing slashes stripped from non-root paths.
 *
 * @param {string} rawUrl - Absolute or relative URL.
 * @param {string} baseUrl - Base used to resolve relative URLs.
 * @param {string[]} allowedHosts - Hosts accepted by isAllowedHost.
 * @returns {string|null} The canonical URL, or null when the URL is
 *   unparsable, not http(s), or outside the allowed hosts.
 */
function canonicalizeUrl(rawUrl, baseUrl, allowedHosts) {
  try {
    const parsed = new URL(rawUrl, baseUrl);

    const isWebProtocol = parsed.protocol === 'http:' || parsed.protocol === 'https:';
    if (!isWebProtocol || !isAllowedHost(parsed.hostname, allowedHosts)) {
      return null;
    }

    parsed.hash = '';
    parsed.username = '';
    parsed.password = '';

    const keptParams = [];
    for (const [key, value] of parsed.searchParams) {
      if (!shouldDropParam(key)) {
        keptParams.push([key, value]);
      }
    }
    keptParams.sort((a, b) => a[0].localeCompare(b[0]));

    // Rebuild the query string from scratch in sorted order.
    parsed.search = '';
    keptParams.forEach(([key, value]) => {
      parsed.searchParams.append(key, value);
    });

    if (parsed.pathname !== '/') {
      parsed.pathname = parsed.pathname.replace(/\/+$/, '') || '/';
    }

    return parsed.toString();
  } catch {
    return null;
  }
}
|
||||||
|
|
||||||
|
/**
 * Reads a quoted attribute value out of a single HTML tag string.
 *
 * The attribute name must start at the beginning of the string or follow
 * whitespace, a quote, or a slash, so asking for `href` does not pick up
 * `data-href`. Values may span several lines. The value is entity-decoded
 * and trimmed.
 *
 * @param {string} tag - Source text of one tag, e.g. `<meta name="x" content="y">`.
 * @param {string} name - Attribute name (matched case-insensitively).
 * @returns {string} The decoded value, or '' when the attribute is absent.
 */
function extractAttribute(tag, name) {
  // Escape the name so regex metacharacters in it are matched literally.
  const escapedName = String(name).replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  const pattern = new RegExp(
    `(?:^|[\\s"'/])${escapedName}\\s*=\\s*(["'])([\\s\\S]*?)\\1`,
    'i',
  );
  const match = String(tag || '').match(pattern);
  return match ? decodeHtmlEntities(match[2]).trim() : '';
}
|
||||||
|
|
||||||
|
/**
 * Collects `<meta>` tags into a Map keyed by the lower-cased `property`
 * attribute (or `name` when `property` is missing), with the `content`
 * attribute as value. Tags lacking a key or content are ignored; a later
 * tag with the same key overwrites an earlier one.
 *
 * @param {string} html - Page markup.
 * @returns {Map<string, string>} Meta key to content.
 */
function extractMetaMap(html) {
  const entries = new Map();

  for (const [tag] of html.matchAll(/<meta\b[^>]*>/gi)) {
    const key = extractAttribute(tag, 'property') || extractAttribute(tag, 'name');
    const content = extractAttribute(tag, 'content');

    if (key && content) {
      entries.set(key.toLowerCase(), content);
    }
  }

  return entries;
}
|
||||||
|
|
||||||
|
/**
 * Finds the first `<link>` tag whose `rel` token list contains `canonical`
 * and that has a non-empty `href`.
 *
 * @param {string} html - Page markup.
 * @returns {string|null} The raw href value, or null when none is found.
 */
function extractCanonicalHref(html) {
  for (const [tag] of html.matchAll(/<link\b[^>]*>/gi)) {
    const relTokens = extractAttribute(tag, 'rel').toLowerCase().split(/\s+/);
    if (!relTokens.includes('canonical')) {
      continue;
    }

    const href = extractAttribute(tag, 'href');
    if (href) {
      return href;
    }
  }

  return null;
}
|
||||||
|
|
||||||
|
/**
 * Returns the normalized text of the first `<title>` element.
 *
 * @param {string} html - Page markup.
 * @returns {string|null} The title text, or null when no `<title>` exists.
 */
function extractTitleTag(html) {
  const found = html.match(/<title[^>]*>([\s\S]*?)<\/title>/i);
  if (!found) {
    return null;
  }
  return normalizeText(found[1]);
}
|
||||||
|
|
||||||
|
/**
 * Returns the normalized text of the first `<h1>` element.
 *
 * @param {string} html - Page markup.
 * @returns {string|null} The heading text, or null when no `<h1>` exists.
 */
function extractH1(html) {
  const found = html.match(/<h1\b[^>]*>([\s\S]*?)<\/h1>/i);
  if (!found) {
    return null;
  }
  return normalizeText(found[1]);
}
|
||||||
|
|
||||||
|
/**
 * Returns the entity-decoded, trimmed `datetime` attribute of the first
 * `<time>` tag that carries one.
 *
 * @param {string} html - Page markup.
 * @returns {string|null} The attribute value, or null when absent.
 */
function extractTimeDatetime(html) {
  const found = html.match(/<time\b[^>]*datetime\s*=\s*(["'])(.*?)\1/i);
  if (!found) {
    return null;
  }
  return decodeHtmlEntities(found[2]).trim();
}
|
||||||
|
|
||||||
|
/**
 * Sums the normalized text length of the first ten `<p>` elements.
 *
 * @param {string} html - Page markup.
 * @returns {number} Total character count (0 when there are no paragraphs).
 */
function extractParagraphTextLength(html) {
  const paragraphs = html.match(/<p\b[^>]*>[\s\S]*?<\/p>/gi) || [];
  let total = 0;

  for (const paragraph of paragraphs.slice(0, 10)) {
    total += normalizeText(paragraph).length;
  }

  return total;
}
|
||||||
|
|
||||||
|
/**
 * Parses every `<script type="application/ld+json">` element in the markup.
 * Empty scripts and scripts whose content is not valid JSON are skipped.
 *
 * @param {string} html - Page markup.
 * @returns {Array<*>} Parsed JSON-LD values in document order.
 */
function extractJsonLdBlocks(html) {
  const scriptPattern = /<script\b[^>]*type\s*=\s*(["'])application\/ld\+json\1[^>]*>([\s\S]*?)<\/script>/gi;
  const parsedBlocks = [];

  for (const found of String(html).matchAll(scriptPattern)) {
    const payload = String(found[2] || '').trim();
    if (payload) {
      try {
        parsedBlocks.push(JSON.parse(payload));
      } catch {
        // Malformed JSON-LD is ignored; the remaining blocks are still read.
      }
    }
  }

  return parsedBlocks;
}
|
||||||
|
|
||||||
|
/**
 * Depth-first, pre-order traversal of a parsed JSON value. `visit` is
 * called for every non-array object; arrays are descended into without
 * being visited themselves, and primitives are ignored.
 *
 * @param {*} value - Parsed JSON value.
 * @param {function(Object): void} visit - Callback invoked per object.
 * @returns {void}
 */
function walkJson(value, visit) {
  const isContainer = Boolean(value) && typeof value === 'object';
  if (!isContainer) {
    return;
  }

  const isList = Array.isArray(value);
  if (!isList) {
    visit(value);
  }

  const children = isList ? value : Object.values(value);
  children.forEach((child) => {
    walkJson(child, visit);
  });
}
|
||||||
|
|
||||||
|
/**
 * Tells whether a JSON-LD `@type` value denotes an article. A value matches
 * when it is `Article` or `NewsArticle` (case-insensitive); for arrays,
 * any matching entry (nested arrays included) is enough.
 *
 * @param {*} type - The `@type` value.
 * @returns {boolean} True for article types.
 */
function isArticleType(type) {
  if (!Array.isArray(type)) {
    const normalized = String(type || '').toLowerCase();
    return normalized === 'article' || normalized === 'newsarticle';
  }

  for (const entry of type) {
    if (isArticleType(entry)) {
      return true;
    }
  }
  return false;
}
|
||||||
|
|
||||||
|
/**
 * Returns the first JSON-LD object in the page whose `@type` is an article
 * type, searching blocks in document order and each block depth-first.
 *
 * @param {string} html - Page markup.
 * @returns {Object|null} The matching JSON-LD object, or null.
 */
function extractArticleJsonLd(html) {
  for (const block of extractJsonLdBlocks(html)) {
    let found = null;

    walkJson(block, (node) => {
      // Keep only the first match within this block.
      if (found === null && isArticleType(node['@type'])) {
        found = node;
      }
    });

    if (found) {
      return found;
    }
  }

  return null;
}
|
||||||
|
|
||||||
|
/**
 * Collects the distinct, canonicalized link targets of `<a href>` elements
 * together with their normalized anchor text.
 *
 * The href value is entity-decoded before it is resolved, because attribute
 * values in HTML source carry character references (for example `&amp;`
 * between query parameters). The `href` attribute must be preceded by
 * whitespace so attributes such as `data-href` are not mistaken for it.
 * Links that canonicalizeUrl rejects are skipped; for duplicate URLs only
 * the first occurrence (and its text) is kept.
 *
 * @param {string} html - Page markup.
 * @param {string} pageUrl - Base URL used to resolve relative hrefs.
 * @param {string[]} allowedHosts - Hosts accepted by canonicalizeUrl.
 * @returns {Array<{url: string, text: string}>} Links in document order.
 */
function extractLinks(html, pageUrl, allowedHosts) {
  const links = [];
  const seen = new Set();
  const anchorPattern = /<a\b[^>]*\shref\s*=\s*(["'])(.*?)\1[^>]*>([\s\S]*?)<\/a>/gi;

  for (const match of String(html).matchAll(anchorPattern)) {
    const href = decodeHtmlEntities(match[2]).trim();
    const url = canonicalizeUrl(href, pageUrl, allowedHosts);
    if (!url || seen.has(url)) {
      continue;
    }

    seen.add(url);
    links.push({ url, text: normalizeText(match[3]) });
  }

  return links;
}
|
||||||
|
|
||||||
|
/**
 * Picks the page title from the first non-blank candidate, in this order:
 * `og:title`, `twitter:title`, JSON-LD `headline`, first `<h1>`, `<title>`.
 *
 * @param {Map<string, string>} meta - Meta map from extractMetaMap.
 * @param {Object|null} jsonLdArticle - Article JSON-LD object, if any.
 * @param {string} html - Page markup.
 * @returns {*} The chosen candidate as found, or null when all are blank.
 */
function selectTitle(meta, jsonLdArticle, html) {
  const candidates = [
    meta.get('og:title'),
    meta.get('twitter:title'),
    jsonLdArticle && jsonLdArticle.headline,
    extractH1(html),
    extractTitleTag(html),
  ];

  for (const candidate of candidates) {
    if (String(candidate || '').trim()) {
      return candidate;
    }
  }

  return null;
}
|
||||||
|
|
||||||
|
/**
 * Picks the page description from the first non-blank candidate, in this
 * order: `og:description`, `description`, JSON-LD `description`.
 *
 * @param {Map<string, string>} meta - Meta map from extractMetaMap.
 * @param {Object|null} jsonLdArticle - Article JSON-LD object, if any.
 * @returns {*} The chosen candidate as found, or null when all are blank.
 */
function selectDescription(meta, jsonLdArticle) {
  const candidates = [
    meta.get('og:description'),
    meta.get('description'),
    jsonLdArticle && jsonLdArticle.description,
  ];

  for (const candidate of candidates) {
    if (String(candidate || '').trim()) {
      return candidate;
    }
  }

  return null;
}
|
||||||
|
|
||||||
|
/**
 * Picks the publication date from the first non-blank candidate, in this
 * order: JSON-LD `datePublished`, `article:published_time`,
 * `og:article:published_time`, first `<time datetime>`.
 *
 * @param {Map<string, string>} meta - Meta map from extractMetaMap.
 * @param {Object|null} jsonLdArticle - Article JSON-LD object, if any.
 * @param {string} html - Page markup.
 * @returns {*} The chosen candidate as found (not parsed), or null.
 */
function selectPubDate(meta, jsonLdArticle, html) {
  const candidates = [
    jsonLdArticle && jsonLdArticle.datePublished,
    meta.get('article:published_time'),
    meta.get('og:article:published_time'),
    extractTimeDatetime(html),
  ];

  for (const candidate of candidates) {
    if (String(candidate || '').trim()) {
      return candidate;
    }
  }

  return null;
}
|
||||||
|
|
||||||
|
/**
 * Scores a fetched page on two axes: how much it looks like an article and
 * how much it looks like a listing of links.
 *
 * Article signals: article JSON-LD (+3), `og:type` of `article` (+2), a
 * published-time meta tag or `<time datetime>` (+2), an `<article>` element
 * (+1), a date- or article-like URL (+1), an `<h1>` plus at least 500
 * characters of paragraph text (+1).
 *
 * Listing signals: 20 or more links (+2), 8 or more links whose text is
 * 25-180 characters long (+2), a listing-like path (+1). Any positive
 * article score subtracts 2 from the listing score.
 *
 * @param {string} pageUrl - Canonical URL of the page.
 * @param {Map<string, string>} meta - Meta map from extractMetaMap.
 * @param {string} html - Page markup.
 * @param {Object|null} jsonLdArticle - Article JSON-LD object, if any.
 * @param {Array<{url: string, text: string}>} links - Links from extractLinks.
 * @returns {{articleScore: number, listingScore: number}} Both scores.
 */
function scorePage(pageUrl, meta, html, jsonLdArticle, links) {
  const sumWeights = (signals) => signals.reduce(
    (sum, [hit, weight]) => (hit ? sum + weight : sum),
    0,
  );

  const headlineLinkCount = links
    .filter(({ text }) => text.length >= 25 && text.length <= 180)
    .length;

  const hasPublishedTime = Boolean(
    meta.get('article:published_time')
      || meta.get('og:article:published_time')
      || extractTimeDatetime(html),
  );

  const articleScore = sumWeights([
    [Boolean(jsonLdArticle), 3],
    [String(meta.get('og:type') || '').toLowerCase() === 'article', 2],
    [hasPublishedTime, 2],
    [/<article\b/i.test(html), 1],
    [ARTICLE_DATE_PATH.test(pageUrl) || ARTICLE_PATH_HINT.test(pageUrl), 1],
    [Boolean(extractH1(html)) && extractParagraphTextLength(html) >= 500, 1],
  ]);

  const listingScore = sumWeights([
    [links.length >= 20, 2],
    [headlineLinkCount >= 8, 2],
    [LISTING_PATH_HINT.test(new URL(pageUrl).pathname), 1],
    // Pages that show any article signal are less likely to be listings.
    [articleScore > 0, -2],
  ]);

  return { articleScore, listingScore };
}
|
||||||
|
|
||||||
|
/**
 * Decides whether a discovered URL may be added to the crawl queue. URLs
 * are rejected when their path matches BLOCKED_PATH_HINT or ends in one of
 * the listed image, document, archive, feed or media file extensions.
 *
 * @param {string} url - Absolute URL.
 * @returns {boolean} True when the link may be queued.
 */
function shouldQueueLink(url) {
  const path = new URL(url).pathname.toLowerCase();

  const isBlocked = BLOCKED_PATH_HINT.test(path);
  if (isBlocked) {
    return false;
  }

  const isAsset = /\.(?:jpg|jpeg|png|gif|webp|svg|pdf|zip|xml|mp4|mp3|avi|mov|wmv|m4v)$/i.test(path);
  return !isAsset;
}
|
||||||
|
|
||||||
|
/**
 * Validates and clamps one `newsCrawler.sites` entry.
 *
 * - `allowedHosts`: lower-cased, de-duplicated, empty entries removed.
 * - `seeds`: canonicalized against the allowed hosts, de-duplicated,
 *   rejected seeds removed.
 * - `maxPages`: default 100, clamped to 1..500.
 * - `maxDepth`: default 2, clamped to 0..5. An explicit 0 is honoured
 *   (only the seed pages are fetched); the default applies only when the
 *   value is missing or not numeric.
 * - `requestTimeout`: default 15000, clamped to 1000..30000.
 *
 * @param {Object} site - Raw site configuration.
 * @returns {{name: string, allowedHosts: string[], seeds: string[],
 *   maxPages: number, maxDepth: number, requestTimeout: number}}
 *   The normalized configuration.
 */
function normalizeSite(site) {
  const allowedHosts = [...new Set((site.allowedHosts || [])
    .map((host) => String(host || '').toLowerCase())
    .filter(Boolean))];
  const seeds = [...new Set((site.seeds || [])
    .map((seed) => canonicalizeUrl(seed, seed, allowedHosts))
    .filter(Boolean))];

  // `Number(x) || 2` would turn a configured depth of 0 into 2, so the
  // default is applied only to missing, blank, or non-numeric values.
  const parseDepth = (raw) => {
    const isNumberLike = typeof raw === 'number' || (typeof raw === 'string' && raw.trim() !== '');
    if (!isNumberLike) {
      return 2;
    }
    const parsed = Number(raw);
    return Number.isNaN(parsed) ? 2 : parsed;
  };

  return {
    name: String(site.name || '').trim(),
    allowedHosts,
    seeds,
    maxPages: Math.max(1, Math.min(Number(site.maxPages) || 100, 500)),
    maxDepth: Math.max(0, Math.min(parseDepth(site.maxDepth), 5)),
    requestTimeout: Math.max(1000, Math.min(Number(site.requestTimeout) || 15000, 30000)),
  };
}
|
||||||
|
|
||||||
|
/**
 * Fetches a URL through fetchWithPolicy and returns its body when the
 * response is successful and declares an HTML or XHTML content type.
 *
 * @param {string} url - URL to fetch.
 * @param {number} timeout - Timeout value passed through to fetchWithPolicy.
 * @returns {Promise<string|null>} The response text, or null for non-OK or
 *   non-HTML responses.
 */
async function fetchHtml(url, timeout) {
  const requestOptions = {
    timeout,
    retries: 1,
    headers: {
      Accept: 'text/html,application/xhtml+xml;q=0.9,*/*;q=0.8',
      'User-Agent': USER_AGENT,
    },
  };

  const response = await fetchWithPolicy(url, requestOptions);
  if (!response.ok) {
    return null;
  }

  const contentType = String(response.headers.get('content-type') || '').toLowerCase();
  const isHtml = ['text/html', 'application/xhtml+xml'].some((type) => contentType.includes(type));

  return isHtml ? response.text() : null;
}
|
||||||
|
|
||||||
|
/**
 * Crawls one configured site breadth-first, starting from its seed URLs,
 * and returns the articles it recognizes.
 *
 * Each dequeued URL is fetched once. A page is recorded as an article when
 * its article score is at least 3, it has a non-empty title, and its
 * canonical URL has not been recorded yet. Links are followed only from
 * pages below the depth limit whose listing score is at least 2. The loop
 * stops when the queue is empty or `maxPages` URLs have been visited
 * (failed fetches count as visited).
 *
 * @param {Object} site - Raw site configuration (see normalizeSite).
 * @returns {Promise<Array<{title: string, description: (string|null),
 *   url: string, source: string, pubDate: *}>>} Discovered articles; empty
 *   when the site has no name, allowed hosts, or usable seeds.
 */
async function crawlSite(site) {
  const {
    name,
    allowedHosts,
    seeds,
    maxPages,
    maxDepth,
    requestTimeout,
  } = normalizeSite(site);

  if (!name || allowedHosts.length === 0 || seeds.length === 0) {
    return [];
  }

  const pending = seeds.map((url) => ({ url, depth: 0 }));
  const enqueued = new Set(seeds);
  const visited = new Set();
  const articleUrls = new Set();
  const articles = [];

  while (pending.length > 0 && visited.size < maxPages) {
    const { url: pageUrl, depth } = pending.shift();

    if (visited.has(pageUrl)) {
      continue;
    }
    visited.add(pageUrl);

    let html;
    try {
      html = await fetchHtml(pageUrl, requestTimeout);
    } catch (error) {
      console.error(`Crawler fetch failed for ${name}: ${pageUrl}`, error);
      continue;
    }

    if (!html) {
      continue;
    }

    const meta = extractMetaMap(html);
    const jsonLdArticle = extractArticleJsonLd(html);

    // Prefer the page's declared canonical URL when it is usable.
    const canonicalHref = extractCanonicalHref(html);
    let canonicalUrl = pageUrl;
    if (canonicalHref) {
      canonicalUrl = canonicalizeUrl(canonicalHref, pageUrl, allowedHosts) || pageUrl;
    }

    const links = extractLinks(html, canonicalUrl, allowedHosts);
    const { articleScore, listingScore } = scorePage(canonicalUrl, meta, html, jsonLdArticle, links);

    const isNewArticle = articleScore >= 3 && !articleUrls.has(canonicalUrl);
    if (isNewArticle) {
      const title = normalizeText(selectTitle(meta, jsonLdArticle, html));
      if (title) {
        articleUrls.add(canonicalUrl);
        articles.push({
          title,
          description: normalizeText(selectDescription(meta, jsonLdArticle)) || null,
          url: canonicalUrl,
          source: name,
          pubDate: selectPubDate(meta, jsonLdArticle, html),
        });
      }
    }

    const mayExpand = depth < maxDepth && listingScore >= 2;
    if (!mayExpand) {
      continue;
    }

    for (const { url: linkUrl } of links) {
      const alreadyKnown = visited.has(linkUrl) || enqueued.has(linkUrl);
      if (shouldQueueLink(linkUrl) && !alreadyKnown) {
        enqueued.add(linkUrl);
        pending.push({ url: linkUrl, depth: depth + 1 });
      }
    }
  }

  return articles;
}
|
||||||
|
|
||||||
|
/**
 * Crawls every site listed under `config.newsCrawler.sites`, one after
 * another, and returns all discovered articles. A failure in one site is
 * logged and does not stop the remaining sites.
 *
 * @returns {Promise<Array<Object>>} Articles from all sites, in site order.
 */
async function fetchCrawlerArticles() {
  const collected = [];
  const sites = config.newsCrawler?.sites || [];

  for (const site of sites) {
    try {
      const found = await crawlSite(site);
      collected.push(...found);
    } catch (error) {
      console.error(`Crawler failed for ${site && site.name ? site.name : 'unknown_site'}`, error);
    }
  }

  return collected;
}
|
||||||
|
|
||||||
|
module.exports = {
|
||||||
|
fetchCrawlerArticles,
|
||||||
|
canonicalizeUrl,
|
||||||
|
};
|
||||||
Loading…
Add table
Reference in a new issue