Enhance article processing: add per-article language support and order content/embedding selection by effective publication date
This commit is contained in:
+3
-3
@@ -65,8 +65,8 @@ const markContentPending = db.prepare(`
|
||||
const selectPartitionedArticlesMissingContent = db.prepare(`
|
||||
SELECT id, url, title, description
|
||||
FROM (
|
||||
SELECT id, url, title, description, source,
|
||||
ROW_NUMBER() OVER (PARTITION BY source ORDER BY ingested_at DESC, id DESC) AS rn
|
||||
SELECT id, url, title, description, source, pub_date_effective,
|
||||
ROW_NUMBER() OVER (PARTITION BY source ORDER BY pub_date_effective DESC, id DESC) AS rn
|
||||
FROM articles
|
||||
WHERE (content IS NULL OR TRIM(content) = '')
|
||||
AND (content_status IS NULL OR content_status = 'pending')
|
||||
@@ -74,7 +74,7 @@ const selectPartitionedArticlesMissingContent = db.prepare(`
|
||||
AND (id % ?) = ?
|
||||
)
|
||||
WHERE rn <= ?
|
||||
ORDER BY rn, source
|
||||
ORDER BY pub_date_effective DESC, rn, source
|
||||
`);
|
||||
|
||||
const selectAttemptCount = db.prepare(`
|
||||
|
||||
@@ -288,7 +288,8 @@ for (const statement of [
|
||||
'ALTER TABLE articles ADD COLUMN content_retry_after TEXT',
|
||||
'ALTER TABLE articles ADD COLUMN is_index_page INTEGER NOT NULL DEFAULT 0',
|
||||
'ALTER TABLE articles ADD COLUMN has_embedding INTEGER NOT NULL DEFAULT 0',
|
||||
'ALTER TABLE articles ADD COLUMN pub_date_effective TEXT'
|
||||
'ALTER TABLE articles ADD COLUMN pub_date_effective TEXT',
|
||||
'ALTER TABLE articles ADD COLUMN language TEXT'
|
||||
]) {
|
||||
try {
|
||||
db.exec(statement);
|
||||
@@ -312,6 +313,20 @@ db.exec(`
|
||||
WHERE pub_date_effective IS NULL
|
||||
`);
|
||||
|
||||
// Backfill the language column for existing rows using sources.json.
// Only rows whose language IS NULL are touched, and all updates run
// inside a single transaction.
{
  const sourceList = require('../sources.json');
  const setLanguageStmt = db.prepare(`UPDATE articles SET language = ? WHERE source = ? AND language IS NULL`);
  const runLanguageBackfill = db.transaction(() => {
    sourceList
      // Sources without a declared language are skipped.
      .filter((entry) => entry.language)
      .forEach((entry) => {
        setLanguageStmt.run(entry.language, entry.id);
      });
  });
  runLanguageBackfill();
}
|
||||
|
||||
db.exec(`
|
||||
CREATE INDEX IF NOT EXISTS idx_articles_has_embedding ON articles(has_embedding);
|
||||
CREATE INDEX IF NOT EXISTS idx_articles_pub_date_effective ON articles(pub_date_effective DESC);
|
||||
|
||||
+1
-1
@@ -73,7 +73,7 @@ const selectArticlesMissingEmbeddings = db.prepare(`
|
||||
SELECT 1 FROM article_embedding_store s
|
||||
WHERE s.article_id = a.id AND s.model = ?
|
||||
)
|
||||
ORDER BY a.ingested_at ASC, a.id ASC
|
||||
ORDER BY a.pub_date_effective DESC, a.id DESC
|
||||
LIMIT ?
|
||||
`);
|
||||
|
||||
|
||||
+10
-3
@@ -2,6 +2,10 @@ const db = require('./db');
|
||||
const { normalizeTitle } = require('./dedup');
|
||||
const { markSourceRun } = require('./state');
|
||||
|
||||
// Lookup table keyed by source id; each value is the full entry from sources.json.
const sourcesById = Object.fromEntries(
  require('../sources.json').map((entry) => {
    const key = entry.id;
    return [key, entry];
  })
);
|
||||
|
||||
const insertArticle = db.prepare(`
|
||||
INSERT INTO articles (
|
||||
title,
|
||||
@@ -13,8 +17,9 @@ const insertArticle = db.prepare(`
|
||||
source,
|
||||
pub_date,
|
||||
ingested_at,
|
||||
pub_date_effective
|
||||
) VALUES (?, ?, NULL, ?, ?, ?, ?, ?, ?, ?)
|
||||
pub_date_effective,
|
||||
language
|
||||
) VALUES (?, ?, NULL, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
`);
|
||||
const findByUrl = db.prepare('SELECT id FROM articles WHERE url = ?');
|
||||
const INDEX_PAGE_URL_HINT = /\/(category|categories|tag|tags|topic|topics|section|sections|archive|archives|authors|search)(?:\/|$)/i;
|
||||
@@ -81,6 +86,7 @@ function ingestArticle(article) {
|
||||
const isIndexPage = inferIsIndexPage(article, title, url);
|
||||
const pubDate = normalizePubDate(article.pubDate);
|
||||
const ingestedAt = new Date().toISOString();
|
||||
const language = (sourcesById[source] && sourcesById[source].language) || null;
|
||||
|
||||
try {
|
||||
const result = insertArticle.run(
|
||||
@@ -92,7 +98,8 @@ function ingestArticle(article) {
|
||||
source,
|
||||
pubDate,
|
||||
ingestedAt,
|
||||
pubDate || ingestedAt
|
||||
pubDate || ingestedAt,
|
||||
language
|
||||
);
|
||||
|
||||
// don't kick off the content fetch here — it used to be fire-and-forget which
|
||||
|
||||
Reference in New Issue
Block a user