// Revisione/server/utils/generateTTS.ts
//
// 418 lines
// 14 KiB
// TypeScript

import { mkdir, writeFile, access } from "fs/promises";
import { resolve } from "path";
import { askAI } from "./openrouter";
import { ttsLimiter } from "./limiter";
/**
 * System prompt sent as the `system` message when lesson text is rewritten
 * for speech (see its use in `humaniseTTSText`).
 *
 * It tells the model to keep the meaning intact, strip markdown, insert
 * square-bracket voice tags (pacing, non-verbal sounds, delivery style) and to
 * return nothing but the narration text.
 *
 * The prompt is a template literal, so literal backticks inside it are escaped.
 */
const NARRATION_SYSTEM_PROMPT = `You are a narration script editor for an AI voice actor reading educational lesson content. Your job is to prepare text so it sounds natural, warm, and engaging when spoken aloud.
## Content rules
- Do NOT change the meaning, facts, or structure. You are not rewriting the lesson.
- Fix anything that sounds robotic or awkward when spoken: strip markdown (asterisks, backticks, hashes, bullet dashes), spell out acronyms where helpful, rephrase code snippets or URLs into speakable language (e.g. "the fetch function" not "\`fetch()\`").
## Voice control tags
You have a rich set of square bracket tags to shape how the voice sounds. Use them tastefully — a well-placed tag is powerful, overuse kills it.
**Pacing**
[pause] — a natural breath beat, use at transitions or after key ideas
[long pause] — a longer held silence, use for emphasis or before something important
[short pause] — a very brief beat
**Non-verbal sounds** (use sparingly, one or two per lesson max)
[breath] — a natural inhale, good at the start of a new thought or after a long sentence
[sighs] — before a tricky concept, or when something is a bit of a pain
[laughs] — when something is genuinely ironic, surprising, or lightly funny
[chuckles] — softer than laughs, more conversational
[exhales] — a quiet breath out, good for winding down a dense section
[clears throat] — before jumping into something more formal or detailed
[gasp] — for something genuinely surprising
**Delivery style** (can be chained, effect lasts until next tag or end of sentence)
[curious] — lean in, raise intrigue
[excited] — energy up, good for "here's the cool part"
[whispers] — draw the listener in for an aside
[nervous] — for content where a student might feel anxious (e.g. exams)
[calm] — reassuring, slows things down
[sarcastic] — very sparingly, only when the tone clearly calls for it
## Placement guidance
- [pause] can go mid-sentence before a key term, or at the end of a sentence before shifting topic
- Emotional tags go BEFORE the text they should affect, and return to neutral naturally after a sentence or two
- Don't open with a tag — let the voice settle first
- Avoid back-to-back tags with no words between them
## Output
Return ONLY the modified narration text. No commentary, no labels, no quotes.`;
/**
 * Rewrites lesson text into a narration script for the TTS voice.
 *
 * Sends `text` to the model behind `askAI` together with
 * `NARRATION_SYSTEM_PROMPT`. The step is best-effort: when the request fails,
 * or the model answers with nothing but whitespace, the original text is
 * returned unchanged so that speech generation can still proceed.
 *
 * @param text - Raw lesson text.
 * @returns The trimmed narration script, or `text` itself as a fallback.
 */
async function humaniseTTSText(text: string): Promise<string> {
  try {
    const result = await askAI(
      [
        { role: "system", content: NARRATION_SYSTEM_PROMPT },
        { role: "user", content: text },
      ],
      { temperature: 0.5, maxTokens: 2048 }
    );
    const narration = result.text.trim();
    if (narration.length === 0) {
      // An empty script would be forwarded to the TTS provider as empty input.
      console.warn("[tts] humanise returned empty text, using raw text");
      return text;
    }
    return narration;
  } catch (err: unknown) {
    const reason = err instanceof Error ? err.message : String(err);
    console.error(`[tts] humanise failed, using raw text: ${reason}`);
    return text;
  }
}
/**
 * One caption segment of a generated narration together with the time span
 * in which it is spoken. In this file segments are built from groups of up to
 * three whitespace-separated words.
 */
export interface AudioChunk {
/** Words of the segment, joined with single spaces. */
text: string;
/** Start time in seconds, taken from the provider's per-character alignment. */
start: number;
/** End time in seconds, taken from the provider's per-character alignment. */
end: number;
}
/** Outcome of generating and storing one narration file. */
export interface TTSResult {
/** Path of the form `/api/audio/lessons/<lessonId>/<filename>` for the stored file. */
audioPath: string;
/** Caption segments; empty when the provider returns no alignment data (Fish Audio). */
audioChunks: AudioChunk[];
/** ElevenLabs `character-cost` header multiplied by 0.0003; 0 for Fish Audio. Presumably USD — confirm. */
cost: number;
}
/**
 * Synthesises `text` with the ElevenLabs `with-timestamps` endpoint and turns
 * the returned per-character alignment into caption chunks.
 *
 * HTTP 429 responses are retried up to five times with a delay that starts at
 * two seconds and doubles each time. Any other non-OK response, or a 429 on
 * the last attempt, is logged and yields `null`. Errors thrown by `fetch`
 * itself (network failure, 60 s timeout) are not caught here.
 *
 * Captions are built from whitespace-separated words of `text`, three per
 * chunk. Word offsets are mapped onto the alignment entries by accumulated
 * character length, which assumes the alignment characters cover `text` in
 * order — TODO confirm for input containing voice tags. When the response
 * carries no alignment, the audio is still returned with an empty chunk list.
 *
 * @param text - Text to speak.
 * @param apiKey - ElevenLabs API key, sent in the `xi-api-key` header.
 * @param voiceId - Voice used in the endpoint URL.
 * @returns Decoded MP3 audio, caption chunks and the cost
 *   (`character-cost` header × 0.0003), or `null` on an HTTP error.
 */
async function callElevenLabs(
  text: string,
  apiKey: string,
  voiceId: string
): Promise<{ audio: Buffer; chunks: AudioChunk[]; cost: number } | null> {
  const MAX_RETRIES = 5;
  const WORDS_PER_CHUNK = 3;
  const endpoint = `https://api.elevenlabs.io/v1/text-to-speech/${voiceId}/with-timestamps`;

  let res: Response | undefined;
  let delay = 2000;
  for (let attempt = 0; attempt <= MAX_RETRIES; attempt++) {
    const attemptRes = await fetch(endpoint, {
      method: "POST",
      headers: {
        "xi-api-key": apiKey,
        "Content-Type": "application/json",
      },
      body: JSON.stringify({
        text,
        model_id: "eleven_v3",
        output_format: "mp3_44100_128",
      }),
      signal: AbortSignal.timeout(60_000),
    });
    if (attemptRes.ok) {
      res = attemptRes;
      break;
    }

    const rateLimited = attemptRes.status === 429;
    if (rateLimited && attempt < MAX_RETRIES) {
      console.warn(`[tts] ElevenLabs 429 — retry ${attempt + 1}/${MAX_RETRIES} in ${delay}ms`);
      await new Promise<void>((done) => setTimeout(done, delay));
      delay *= 2;
      continue;
    }

    const errText = await attemptRes.text().catch(() => "");
    console.error(
      rateLimited
        ? `[tts] ElevenLabs failed after ${MAX_RETRIES} retries: ${attemptRes.status} ${errText}`
        : `[tts] ElevenLabs error ${attemptRes.status}: ${errText}`
    );
    return null;
  }
  // Every iteration either breaks with a response, retries or returns, so this
  // guard only narrows the type for the compiler.
  if (!res) return null;

  const charCost = Number.parseInt(res.headers.get("character-cost") ?? "0", 10);
  // A malformed header must not turn the cost into NaN.
  const cost = Number.isNaN(charCost) ? 0 : charCost * 0.0003;

  const data = (await res.json()) as {
    audio_base64: string;
    alignment: {
      characters: string[];
      character_start_times_seconds: number[];
      character_end_times_seconds: number[];
    } | null;
  };
  const audio = Buffer.from(data.audio_base64, "base64");

  const alignment = data.alignment;
  if (!alignment || !Array.isArray(alignment.characters)) {
    // Keep the audio even when no timing information came back.
    console.warn("[tts] ElevenLabs returned no alignment, captions skipped");
    return { audio, chunks: [], cost };
  }
  const characters = alignment.characters;
  const starts = alignment.character_start_times_seconds ?? [];
  const ends = alignment.character_end_times_seconds ?? [];

  // entryAtOffset[o] is the index of the alignment entry covering offset o of
  // `text`; an entry spanning several UTF-16 units fills several slots.
  const entryAtOffset: number[] = [];
  characters.forEach((c, ci) => {
    for (let k = 0; k < c.length; k++) entryAtOffset.push(ci);
  });
  // Offsets past the aligned range fall back to the last known time.
  const timeAt = (offset: number, times: number[]): number => {
    const ci = entryAtOffset[offset];
    return (ci === undefined ? times[times.length - 1] : times[ci]) ?? 0;
  };

  // Whitespace-delimited words with inclusive start/end offsets into `text`.
  const words = Array.from(text.matchAll(/\S+/g), (m) => {
    const charStart = m.index ?? 0;
    return { word: m[0], charStart, charEnd: charStart + m[0].length - 1 };
  });

  const chunks: AudioChunk[] = [];
  for (let wi = 0; wi < words.length; wi += WORDS_PER_CHUNK) {
    const group = words.slice(wi, wi + WORDS_PER_CHUNK);
    chunks.push({
      text: group.map((w) => w.word).join(" "),
      start: timeAt(group[0].charStart, starts),
      end: timeAt(group[group.length - 1].charEnd, ends),
    });
  }
  return { audio, chunks, cost };
}
/**
 * Synthesises `text` with the Fish Audio TTS endpoint.
 *
 * Fish Audio doesn't return alignment data, so the chunk list is always empty
 * and the cost is reported as 0.
 *
 * @param text - Text to speak.
 * @param apiKey - Fish Audio API key, sent as a bearer token.
 * @param voiceId - Sent as `reference_id` in the request body.
 * @returns The MP3 bytes with empty chunks, or `null` when the response is not OK.
 */
async function callFishAudio(
  text: string,
  apiKey: string,
  voiceId: string
): Promise<{ audio: Buffer; chunks: AudioChunk[]; cost: number } | null> {
  const payload = {
    text,
    reference_id: voiceId,
    format: "mp3",
    mp3_bitrate: 128,
    streaming: false,
    normalize: false,
    model: "s2",
  };
  const response = await fetch("https://api.fish.audio/v1/tts", {
    method: "POST",
    headers: {
      "Authorization": `Bearer ${apiKey}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify(payload),
    signal: AbortSignal.timeout(60_000),
  });

  if (response.ok) {
    const bytes = await response.arrayBuffer();
    return { audio: Buffer.from(bytes), chunks: [], cost: 0 };
  }

  // The error body is read best-effort purely for the log line.
  const errText = await response.text().catch(() => "");
  console.error(`[tts] Fish Audio error ${response.status}: ${errText}`);
  return null;
}
/**
 * Reads the configured TTS provider name from the runtime config.
 *
 * @returns The lower-cased `ttsProvider` value, or `"elevenlabs"` when the
 *   setting is absent.
 */
function getProvider() {
  const configured = useRuntimeConfig().ttsProvider as string | undefined;
  if (configured == null) {
    return "elevenlabs";
  }
  return configured.toLowerCase();
}
/**
 * Turns lesson text into speech with the configured provider.
 *
 * The provider's API key is checked first, so that a missing key does not
 * cost a humanising model call whose result would be thrown away. The text is
 * then rewritten by `humaniseTTSText` and the provider request is scheduled
 * through `ttsLimiter`.
 *
 * @param text - Raw lesson text.
 * @returns Audio, caption chunks and cost from the provider call, or `null`
 *   when no API key is configured or the provider call yields `null`.
 */
async function callTTS(
  text: string
): Promise<{ audio: Buffer; chunks: AudioChunk[]; cost: number } | null> {
  const config = useRuntimeConfig();
  const useFish = getProvider() === "fishaudio";

  const apiKey = (useFish ? config.fishAudioApiKey : config.elevenlabsApiKey) as string;
  if (!apiKey) return null;

  // The public runtime-config value takes precedence over the private one.
  const voiceId = (
    useFish
      ? config.public.fishAudioVoiceId || config.fishAudioVoiceId
      : config.public.elevenlabsVoiceId || config.elevenlabsVoiceId
  ) as string;

  const narration = await humaniseTTSText(text);

  console.log(
    `[tts] queued (${useFish ? "fish" : "elevenlabs"}) — active: ${ttsLimiter.active}, queued: ${ttsLimiter.queued}`
  );
  return ttsLimiter.run(() =>
    useFish
      ? callFishAudio(narration, apiKey, voiceId)
      : callElevenLabs(narration, apiKey, voiceId)
  );
}
/**
 * Generates the narration for one lesson step and stores it as
 * `data/audio/lessons/<lessonId>/step_<stepIndex>.mp3` under the current
 * working directory.
 *
 * @param text - Step text to narrate.
 * @param lessonId - Lesson identifier, used as the directory name.
 * @param stepIndex - Step number, used in the file name.
 * @returns The API path of the stored file with its caption chunks and cost,
 *   or `null` when synthesis yields nothing or any step throws (failures are
 *   logged, not rethrown).
 */
export async function generateStepTTS(
  text: string,
  lessonId: string,
  stepIndex: number
): Promise<TTSResult | null> {
  try {
    const result = await callTTS(text);
    if (!result) return null;

    const dir = resolve(process.cwd(), `data/audio/lessons/${lessonId}`);
    await mkdir(dir, { recursive: true });
    const filename = `step_${stepIndex}.mp3`;
    await writeFile(`${dir}/${filename}`, result.audio);

    const audioPath = `/api/audio/lessons/${lessonId}/${filename}`;
    // The separator keeps the lesson id and the chunk count from running together.
    console.log(
      `[tts] step ${stepIndex} for lesson ${lessonId} — ${result.chunks.length} chunks | $${result.cost.toFixed(4)}`
    );
    return { audioPath, audioChunks: result.chunks, cost: result.cost };
  } catch (err: unknown) {
    const reason = err instanceof Error ? err.message : String(err);
    console.error(`[tts] step ${stepIndex} for lesson ${lessonId} failed: ${reason}`);
    return null;
  }
}
/**
 * Generates the narration for a step's question and stores it as
 * `data/audio/lessons/<lessonId>/step_<stepIndex>_question.mp3` under the
 * current working directory.
 *
 * @param text - Question text to narrate.
 * @param lessonId - Lesson identifier, used as the directory name.
 * @param stepIndex - Step number, used in the file name.
 * @returns The API path of the stored file with caption chunks and cost, or
 *   `null` when synthesis yields nothing or anything throws (logged).
 */
export async function generateQuestionTTS(
  text: string,
  lessonId: string,
  stepIndex: number
): Promise<TTSResult | null> {
  const filename = `step_${stepIndex}_question.mp3`;
  try {
    const speech = await callTTS(text);
    if (!speech) {
      return null;
    }

    // Make sure the lesson's audio directory exists before writing into it.
    const dir = resolve(process.cwd(), `data/audio/lessons/${lessonId}`);
    await mkdir(dir, { recursive: true });
    await writeFile(`${dir}/${filename}`, speech.audio);

    return {
      audioPath: `/api/audio/lessons/${lessonId}/${filename}`,
      audioChunks: speech.chunks,
      cost: speech.cost,
    };
  } catch (err: any) {
    console.error(`[tts] question ${stepIndex} for lesson ${lessonId} failed: ${err?.message ?? err}`);
    return null;
  }
}
/**
 * Generates the narration for one answer option and stores it as
 * `data/audio/lessons/<lessonId>/step_<stepIndex>_option_<optionIndex>.mp3`
 * under the current working directory. Caption chunks are not returned for
 * options.
 *
 * @param text - Option text to narrate.
 * @param lessonId - Lesson identifier, used as the directory name.
 * @param stepIndex - Step number, used in the file name.
 * @param optionIndex - Option number, used in the file name.
 * @returns The API path of the stored file and its cost, or `null` when
 *   synthesis yields nothing or anything throws (logged).
 */
export async function generateOptionTTS(
  text: string,
  lessonId: string,
  stepIndex: number,
  optionIndex: number
): Promise<{ audioPath: string; cost: number } | null> {
  const filename = `step_${stepIndex}_option_${optionIndex}.mp3`;
  try {
    const speech = await callTTS(text);
    if (!speech) {
      return null;
    }

    const dir = resolve(process.cwd(), `data/audio/lessons/${lessonId}`);
    await mkdir(dir, { recursive: true });
    await writeFile(`${dir}/${filename}`, speech.audio);

    return {
      audioPath: `/api/audio/lessons/${lessonId}/${filename}`,
      cost: speech.cost,
    };
  } catch (err: any) {
    console.error(`[tts] option ${stepIndex}/${optionIndex} for lesson ${lessonId} failed: ${err?.message ?? err}`);
    return null;
  }
}
/**
 * Generates a plain audio clip with no timestamp data (used for label clips)
 * and writes it to `outPath`.
 *
 * The text is sent to the provider as-is. With Fish Audio the API key and
 * voice come from the runtime config (falling back to `voiceId` for the
 * voice), and `apiKey` and `opts` are ignored. The parent directory of
 * `outPath` is created when missing, so clips can be written to any location.
 *
 * @param text - Text to speak.
 * @param outPath - Destination file for the MP3 bytes.
 * @param apiKey - ElevenLabs API key (ElevenLabs only).
 * @param voiceId - ElevenLabs voice; fallback voice for Fish Audio.
 * @param opts - ElevenLabs-specific model and voice settings.
 * @returns The clip cost (`character-cost` header × 0.0003 for ElevenLabs,
 *   0 for Fish Audio), or `null` when the provider call or the write fails
 *   (failures are logged, not rethrown).
 */
export async function generateClip(
  text: string,
  outPath: string,
  apiKey: string,
  voiceId: string,
  opts?: { model?: string; voice_settings?: Record<string, number> }
): Promise<{ cost: number } | null> {
  const provider = getProvider();
  try {
    let buffer: Buffer;
    let cost = 0;

    if (provider === "fishaudio") {
      const config = useRuntimeConfig();
      const fishKey = config.fishAudioApiKey as string;
      if (!fishKey) {
        // Without this guard the request would be sent as "Bearer undefined".
        console.error(`[tts] Fish Audio clip skipped for ${outPath}: no API key configured`);
        return null;
      }
      const fishVoice =
        ((config.public.fishAudioVoiceId || config.fishAudioVoiceId) as string) || voiceId;
      const res = await fetch("https://api.fish.audio/v1/tts", {
        method: "POST",
        headers: {
          "Authorization": `Bearer ${fishKey}`,
          "Content-Type": "application/json",
        },
        body: JSON.stringify({
          text,
          reference_id: fishVoice,
          format: "mp3",
          mp3_bitrate: 128,
          streaming: false,
          normalize: false,
          model: "s2",
        }),
        signal: AbortSignal.timeout(60_000),
      });
      if (!res.ok) {
        const errText = await res.text().catch(() => "");
        console.error(`[tts] Fish Audio clip error ${res.status}: ${errText}`);
        return null;
      }
      buffer = Buffer.from(await res.arrayBuffer());
    } else {
      const res = await fetch(
        `https://api.elevenlabs.io/v1/text-to-speech/${voiceId}`,
        {
          method: "POST",
          headers: {
            "xi-api-key": apiKey,
            "Content-Type": "application/json",
          },
          body: JSON.stringify({
            text,
            model_id: opts?.model ?? "eleven_v3",
            output_format: "mp3_44100_128",
            ...(opts?.voice_settings ? { voice_settings: opts.voice_settings } : {}),
          }),
          signal: AbortSignal.timeout(60_000),
        }
      );
      if (!res.ok) {
        const errText = await res.text().catch(() => "");
        console.error(`[tts] clip error ${res.status}: ${errText}`);
        return null;
      }
      const charCost = Number.parseInt(res.headers.get("character-cost") ?? "0", 10);
      // A malformed header must not turn the cost into NaN.
      cost = Number.isNaN(charCost) ? 0 : charCost * 0.0003;
      buffer = Buffer.from(await res.arrayBuffer());
    }

    // Create the directory the clip is actually written to, not a fixed one.
    await mkdir(resolve(outPath, ".."), { recursive: true });
    await writeFile(outPath, buffer);
    return { cost };
  } catch (err: unknown) {
    const reason = err instanceof Error ? err.message : String(err);
    console.error(`[tts] clip failed for ${outPath}: ${reason}`);
    return null;
  }
}
/**
 * Generates a narration and stores it under a caller-chosen file name in
 * `data/audio/lessons/<lessonId>/` below the current working directory.
 *
 * @param text - Text to narrate.
 * @param lessonId - Lesson identifier, used as the directory name.
 * @param filename - File name inside the lesson directory; it is appended to
 *   the directory path as given.
 * @returns The API path of the stored file with caption chunks and cost, or
 *   `null` when synthesis yields nothing or anything throws (logged).
 */
export async function generateTTSToPath(
  text: string,
  lessonId: string,
  filename: string
): Promise<TTSResult | null> {
  try {
    const speech = await callTTS(text);
    if (!speech) {
      return null;
    }

    const dir = resolve(process.cwd(), `data/audio/lessons/${lessonId}`);
    await mkdir(dir, { recursive: true });
    await writeFile(`${dir}/${filename}`, speech.audio);

    return {
      audioPath: `/api/audio/lessons/${lessonId}/${filename}`,
      audioChunks: speech.chunks,
      cost: speech.cost,
    };
  } catch (err: any) {
    console.error(`[tts] ${filename} for lesson ${lessonId} failed: ${err?.message ?? err}`);
    return null;
  }
}
/**
 * Reports whether `path` can be accessed.
 *
 * @param path - File system path to probe.
 * @returns `true` when `access` succeeds, `false` when it rejects for any reason.
 */
export async function fileExists(path: string): Promise<boolean> {
  return access(path).then(
    () => true,
    () => false
  );
}