// Duriin-API/src/db.js
//
// 221 lines
// 5.6 KiB
// JavaScript

const path = require('path');
const Database = require('better-sqlite3');
const sqliteVec = require('sqlite-vec');
const config = require('./config');
// Resolve the database file against the parent of this file's directory.
// Falls back to ./archive.sqlite when config.database.path is unset or empty.
const dbPath = path.resolve(__dirname, '..', config.database.path || './archive.sqlite');
// Single better-sqlite3 connection; it is configured below and exported at the
// bottom of this module.
const db = new Database(dbPath);
// Load the sqlite-vec extension into this connection before any schema work;
// the vec0 virtual table created further down relies on it.
sqliteVec.load(db);
// Switch the journal to write-ahead logging.
db.pragma('journal_mode = WAL');
// Base schema for the articles table. IF NOT EXISTS leaves an already-existing
// table untouched, so databases created with an older column set are brought
// up to date by the ALTER TABLE statements later in this file.
db.exec(`
CREATE TABLE IF NOT EXISTS articles (
id INTEGER PRIMARY KEY AUTOINCREMENT,
title TEXT NOT NULL,
description TEXT,
content TEXT,
image TEXT,
content_status TEXT,
content_error TEXT,
content_attempted_at TEXT,
is_index_page INTEGER NOT NULL DEFAULT 0,
url TEXT NOT NULL UNIQUE,
normalized_title TEXT NOT NULL,
source TEXT NOT NULL,
pub_date TEXT,
ingested_at TEXT NOT NULL DEFAULT (datetime('now'))
);
`);
/**
 * Rebuilds the `articles` table when it still carries a UNIQUE constraint on
 * `normalized_title` alone.
 *
 * The rows are copied into a fresh table that has no such constraint, the old
 * table is dropped, and the new one is renamed into its place. When no
 * constraint-backed index on `normalized_title` is found, nothing is changed.
 *
 * Only columns that actually exist on the current table are copied. Columns
 * that were added to the schema later (`image`, `content_status`,
 * `content_error`, `content_attempted_at`, `is_index_page`) may be missing on
 * a legacy table; they then take the defaults declared on the new table
 * instead of making the copy fail with "no such column". When `is_index_page`
 * does exist, its values are preserved.
 *
 * The rebuild runs inside `db.transaction`, so an error part-way through is
 * rolled back instead of leaving an open transaction behind.
 *
 * Dropping the old table also removes its indexes, so secondary indexes must
 * be (re)created after this function has run.
 *
 * @returns {void}
 * @throws {Error} Any SQLite error raised while inspecting or rebuilding the table.
 */
function rebuildArticlesTableIfNeeded() {
  // PRAGMA arguments cannot be bound as parameters, so quote them as SQL string literals.
  const quoteLiteral = (value) => `'${String(value).replace(/'/g, "''")}'`;

  const indexes = db.prepare(`PRAGMA index_list('articles')`).all();
  const hasUniqueNormalizedTitleIndex = indexes.some((index) => {
    // origin 'u' marks an index SQLite created to enforce a UNIQUE constraint.
    if (index.origin !== 'u' || !index.name) {
      return false;
    }
    const columns = db.prepare(`PRAGMA index_info(${quoteLiteral(index.name)})`).all();
    return columns.length === 1 && columns[0].name === 'normalized_title';
  });
  if (!hasUniqueNormalizedTitleIndex) {
    return;
  }

  // Columns of the rebuilt table, in declaration order.
  const targetColumns = [
    'id',
    'title',
    'description',
    'content',
    'image',
    'content_status',
    'content_error',
    'content_attempted_at',
    'is_index_page',
    'url',
    'normalized_title',
    'source',
    'pub_date',
    'ingested_at',
  ];
  const existingColumns = new Set(
    db.prepare(`PRAGMA table_info('articles')`).all().map((column) => column.name)
  );
  // Names come from the fixed list above, so interpolating them into SQL is safe.
  const columnList = targetColumns.filter((name) => existingColumns.has(name)).join(', ');

  const rebuild = db.transaction(() => {
    db.exec(`
      CREATE TABLE articles_rebuild (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        title TEXT NOT NULL,
        description TEXT,
        content TEXT,
        image TEXT,
        content_status TEXT,
        content_error TEXT,
        content_attempted_at TEXT,
        is_index_page INTEGER NOT NULL DEFAULT 0,
        url TEXT NOT NULL UNIQUE,
        normalized_title TEXT NOT NULL,
        source TEXT NOT NULL,
        pub_date TEXT,
        ingested_at TEXT NOT NULL DEFAULT (datetime('now'))
      );
      INSERT INTO articles_rebuild (${columnList})
      SELECT ${columnList}
      FROM articles;
      DROP TABLE articles;
      ALTER TABLE articles_rebuild RENAME TO articles;
    `);
  });
  rebuild();
}
// Run the legacy-table rebuild first: it drops and recreates `articles`, so the
// indexes on that table are created only afterwards.
rebuildArticlesTableIfNeeded();

// Remaining schema objects, executed one batch at a time in the order listed.
// Every statement uses IF NOT EXISTS, so an already-initialised database is
// left as it is.
for (const schemaSql of [
  // Secondary (non-unique) indexes on articles.
  `
CREATE INDEX IF NOT EXISTS idx_articles_source ON articles(source);
CREATE INDEX IF NOT EXISTS idx_articles_pub_date ON articles(pub_date);
CREATE INDEX IF NOT EXISTS idx_articles_ingested_at ON articles(ingested_at);
CREATE INDEX IF NOT EXISTS idx_articles_normalized_title ON articles(normalized_title);
`,
  // vec0 virtual table: one 1024-float embedding per article_id.
  `
CREATE VIRTUAL TABLE IF NOT EXISTS article_embeddings USING vec0(
article_id INTEGER PRIMARY KEY,
embedding FLOAT[1024]
);
`,
  // Embedding blobs keyed by query text.
  `
CREATE TABLE IF NOT EXISTS query_embeddings (
query TEXT PRIMARY KEY,
embedding BLOB NOT NULL,
created_at TEXT NOT NULL DEFAULT (datetime('now'))
);
`,
  // Per-article record of the embedding model and when the embedding was made.
  `
CREATE TABLE IF NOT EXISTS article_embedding_meta (
article_id INTEGER PRIMARY KEY,
model TEXT NOT NULL,
embedded_at TEXT NOT NULL DEFAULT (datetime('now'))
);
`,
  // Completed backfill windows, keyed by (source_id, window_start, window_end).
  `
CREATE TABLE IF NOT EXISTS gdelt_backfill_windows (
source_id TEXT NOT NULL,
window_start TEXT NOT NULL,
window_end TEXT NOT NULL,
completed_at TEXT NOT NULL DEFAULT (datetime('now')),
PRIMARY KEY (source_id, window_start, window_end)
);
`,
  // One classification per crawled URL.
  `
CREATE TABLE IF NOT EXISTS crawler_page_classifications (
url TEXT PRIMARY KEY,
site_name TEXT NOT NULL,
classification TEXT NOT NULL,
pattern TEXT,
classified_at TEXT NOT NULL DEFAULT (datetime('now'))
);
`,
  // Classification and hit counter per (site_name, pattern).
  `
CREATE TABLE IF NOT EXISTS crawler_url_patterns (
site_name TEXT NOT NULL,
pattern TEXT NOT NULL,
classification TEXT NOT NULL,
hit_count INTEGER NOT NULL DEFAULT 1,
updated_at TEXT NOT NULL DEFAULT (datetime('now')),
PRIMARY KEY (site_name, pattern)
);
`,
  // Classification and hit counter per (site_name, rule_type, rule_value).
  `
CREATE TABLE IF NOT EXISTS crawler_site_rules (
site_name TEXT NOT NULL,
rule_type TEXT NOT NULL,
rule_value TEXT NOT NULL,
classification TEXT NOT NULL,
hit_count INTEGER NOT NULL DEFAULT 1,
updated_at TEXT NOT NULL DEFAULT (datetime('now')),
PRIMARY KEY (site_name, rule_type, rule_value)
);
`,
]) {
  db.exec(schemaSql);
}
// Additive column migrations for `articles` tables that predate these columns.
// Each statement is attempted on every start-up; SQLite's "duplicate column
// name" error means the column is already there and is skipped. Any other
// error is rethrown.
const articleColumnMigrations = [
  'ALTER TABLE articles ADD COLUMN image TEXT',
  'ALTER TABLE articles ADD COLUMN content_status TEXT',
  'ALTER TABLE articles ADD COLUMN content_error TEXT',
  'ALTER TABLE articles ADD COLUMN content_attempted_at TEXT',
  'ALTER TABLE articles ADD COLUMN is_index_page INTEGER NOT NULL DEFAULT 0'
];
articleColumnMigrations.forEach((ddl) => {
  try {
    db.exec(ddl);
  } catch (err) {
    const columnAlreadyExists = String(err.message).includes('duplicate column name');
    if (columnAlreadyExists) {
      return;
    }
    throw err;
  }
});
// Backfill: flag rows as index pages (is_index_page = 1) when the URL or the
// title matches one of the listing/archive patterns below. Runs on every
// module load and only touches rows whose flag is still 0.
// NOTE(review): '%archives%' is already covered by '%archive%', and
// '%/archives/%' by '%/archive%', so those two conditions are redundant.
// NOTE(review): the title conditions are plain substring matches, so any
// article whose title merely contains "category", "archive" or "latest news"
// is flagged as well; confirm that this breadth is intended.
db.exec(`
UPDATE articles
SET is_index_page = 1
WHERE is_index_page = 0
AND (
LOWER(url) LIKE '%/category/%'
OR LOWER(url) LIKE '%/categories/%'
OR LOWER(url) LIKE '%/tag/%'
OR LOWER(url) LIKE '%/tags/%'
OR LOWER(url) LIKE '%/topic/%'
OR LOWER(url) LIKE '%/topics/%'
OR LOWER(url) LIKE '%/section/%'
OR LOWER(url) LIKE '%/sections/%'
OR LOWER(url) LIKE '%/archive%'
OR LOWER(url) LIKE '%/archives/%'
OR LOWER(url) LIKE '%/authors/%'
OR LOWER(url) LIKE '%/search%'
OR LOWER(title) LIKE '%category%'
OR LOWER(title) LIKE '%archives%'
OR LOWER(title) LIKE '%archive%'
OR LOWER(title) LIKE '%latest news%'
)
`);
// Export the configured connection object as this module's value.
module.exports = db;