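// SQLite bootstrap for the archive database: opens the connection, loads the
// sqlite-vec extension, creates the schema if it is missing, and exports the
// open database handle.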
const path = require('path');
const Database = require('better-sqlite3');
const sqliteVec = require('sqlite-vec');
const config = require('./config');

// The database location comes from config; any falsy value (missing or empty
// string) falls back to ./archive.sqlite. Relative paths are resolved against
// the parent of this file's directory; an absolute path is used unchanged.
const configuredDbPath = config.database.path || './archive.sqlite';
const dbPath = path.resolve(__dirname, '..', configuredDbPath);

// Open (or create) the database file and register the sqlite-vec extension on
// this connection before any DDL that uses the vec0 module runs.
const db = new Database(dbPath);
sqliteVec.load(db);

// Use write-ahead logging as the journal mode.
db.pragma('journal_mode = WAL');
// articles: one row per ingested article. `url` is UNIQUE; `ingested_at`
// defaults to SQLite's datetime('now') (UTC). `event_id` references events(id);
// SQLite accepts the reference even though `events` is created later in this
// file. NOTE(review): this module does not set `PRAGMA foreign_keys`, so the
// reference is only enforced if it is enabled elsewhere — confirm.
db.exec(`
  CREATE TABLE IF NOT EXISTS articles (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    title TEXT NOT NULL,
    description TEXT,
    content TEXT,
    image TEXT,
    content_status TEXT,
    content_error TEXT,
    content_attempted_at TEXT,
    content_attempt_count INTEGER NOT NULL DEFAULT 0,
    content_retry_after TEXT,
    is_index_page INTEGER NOT NULL DEFAULT 0,
    has_embedding INTEGER NOT NULL DEFAULT 0,
    url TEXT NOT NULL UNIQUE,
    normalized_title TEXT NOT NULL,
    source TEXT NOT NULL,
    pub_date TEXT,
    pub_date_effective TEXT,
    language TEXT,
    event_id INTEGER REFERENCES events(id),
    ingested_at TEXT NOT NULL DEFAULT (datetime('now'))
  );
`);
// Secondary indexes on `articles`, all created in a single exec() call.
// Every index is single-column; only the pub_date_effective index is declared
// in descending order.
db.exec(`
  CREATE INDEX IF NOT EXISTS idx_articles_source ON articles(source);
  CREATE INDEX IF NOT EXISTS idx_articles_pub_date ON articles(pub_date);
  CREATE INDEX IF NOT EXISTS idx_articles_ingested_at ON articles(ingested_at);
  CREATE INDEX IF NOT EXISTS idx_articles_normalized_title ON articles(normalized_title);
  CREATE INDEX IF NOT EXISTS idx_articles_event_id ON articles(event_id);
  CREATE INDEX IF NOT EXISTS idx_articles_has_embedding ON articles(has_embedding);
  CREATE INDEX IF NOT EXISTS idx_articles_pub_date_effective ON articles(pub_date_effective DESC);
`);
// article_embedding_store: embedding blobs keyed by (article_id, model), so an
// article can hold one stored embedding per model. No foreign key to
// `articles` is declared here.
db.exec(`
  CREATE TABLE IF NOT EXISTS article_embedding_store (
    article_id INTEGER NOT NULL,
    model TEXT NOT NULL,
    embedding BLOB NOT NULL,
    embedded_at TEXT NOT NULL DEFAULT (datetime('now')),
    PRIMARY KEY (article_id, model)
  );
`);
// article_embedding_meta: at most one row per article (article_id is the
// primary key) recording a model name and a timestamp.
// NOTE(review): presumably describes the vector held in `article_embeddings`
// — confirm against the code that writes both tables.
db.exec(`
  CREATE TABLE IF NOT EXISTS article_embedding_meta (
    article_id INTEGER PRIMARY KEY,
    model TEXT NOT NULL,
    embedded_at TEXT NOT NULL DEFAULT (datetime('now'))
  );
`);
// article_embeddings: vec0 virtual table (the module comes from the sqlite-vec
// extension loaded above) keyed by article_id, with a fixed 8192-dimension
// float vector column.
db.exec(`
  CREATE VIRTUAL TABLE IF NOT EXISTS article_embeddings USING vec0(
    article_id INTEGER PRIMARY KEY,
    embedding FLOAT[8192]
  );
`);
// query_embeddings: embedding blobs keyed by (query, model).
// NOTE(review): looks like a cache of embeddings for search query text —
// confirm with the callers that read/write it.
db.exec(`
  CREATE TABLE IF NOT EXISTS query_embeddings (
    query TEXT NOT NULL,
    model TEXT NOT NULL,
    embedding BLOB NOT NULL,
    created_at TEXT NOT NULL DEFAULT (datetime('now')),
    PRIMARY KEY (query, model)
  );
`);
// events: target of articles.event_id. Each row has an autoincrement id, a
// required title, and a creation timestamp defaulting to datetime('now').
db.exec(`
  CREATE TABLE IF NOT EXISTS events (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    title TEXT NOT NULL,
    created_at TEXT NOT NULL DEFAULT (datetime('now'))
  );
`);
// gdelt_backfill_windows: one row per (source_id, window_start, window_end)
// with a completed_at timestamp defaulting to datetime('now').
// NOTE(review): presumably marks backfill windows that have already been
// processed so they can be skipped — confirm against the backfill job.
db.exec(`
  CREATE TABLE IF NOT EXISTS gdelt_backfill_windows (
    source_id TEXT NOT NULL,
    window_start TEXT NOT NULL,
    window_end TEXT NOT NULL,
    completed_at TEXT NOT NULL DEFAULT (datetime('now')),
    PRIMARY KEY (source_id, window_start, window_end)
  );
`);
// crawler_page_classifications: one classification row per URL (url is the
// primary key), with the site name, an optional pattern, and a timestamp.
db.exec(`
  CREATE TABLE IF NOT EXISTS crawler_page_classifications (
    url TEXT PRIMARY KEY,
    site_name TEXT NOT NULL,
    classification TEXT NOT NULL,
    pattern TEXT,
    classified_at TEXT NOT NULL DEFAULT (datetime('now'))
  );
`);
// crawler_url_patterns: one row per (site_name, pattern) holding its
// classification, a hit counter that starts at 1, and an update timestamp.
db.exec(`
  CREATE TABLE IF NOT EXISTS crawler_url_patterns (
    site_name TEXT NOT NULL,
    pattern TEXT NOT NULL,
    classification TEXT NOT NULL,
    hit_count INTEGER NOT NULL DEFAULT 1,
    updated_at TEXT NOT NULL DEFAULT (datetime('now')),
    PRIMARY KEY (site_name, pattern)
  );
`);
// crawler_site_rules: one row per (site_name, rule_type, rule_value) holding
// its classification, a hit counter that starts at 1, and an update timestamp.
db.exec(`
  CREATE TABLE IF NOT EXISTS crawler_site_rules (
    site_name TEXT NOT NULL,
    rule_type TEXT NOT NULL,
    rule_value TEXT NOT NULL,
    classification TEXT NOT NULL,
    hit_count INTEGER NOT NULL DEFAULT 1,
    updated_at TEXT NOT NULL DEFAULT (datetime('now')),
    PRIMARY KEY (site_name, rule_type, rule_value)
  );
`);
// domain_fetch_policy: one row per domain. `policy` defaults to 'auto'; the
// four plain/browser failure and success counters default to 0; `expires_at`
// is optional.
db.exec(`
  CREATE TABLE IF NOT EXISTS domain_fetch_policy (
    domain TEXT PRIMARY KEY,
    policy TEXT NOT NULL DEFAULT 'auto',
    consecutive_plain_failures INTEGER NOT NULL DEFAULT 0,
    consecutive_browser_failures INTEGER NOT NULL DEFAULT 0,
    plain_success_count INTEGER NOT NULL DEFAULT 0,
    browser_success_count INTEGER NOT NULL DEFAULT 0,
    expires_at TEXT,
    updated_at TEXT NOT NULL DEFAULT (datetime('now'))
  );
`);
// Export the open connection itself; requiring this module runs all of the
// schema setup above as a side effect.
module.exports = db;