import { mkdir, writeFile, access } from "fs/promises";
import { dirname, resolve } from "path";
import { askAI } from "./openrouter";
import { ttsLimiter } from "./limiter";

/** System prompt for the LLM pass that rewrites lesson text into a speakable, tag-annotated script. */
const NARRATION_SYSTEM_PROMPT = `You are a narration script editor for an AI voice actor reading educational lesson content. Your job is to prepare text so it sounds natural, warm, and engaging when spoken aloud.

## Content rules
- Do NOT change the meaning, facts, or structure. You are not rewriting the lesson.
- Fix anything that sounds robotic or awkward when spoken: strip markdown (asterisks, backticks, hashes, bullet dashes), spell out acronyms where helpful, rephrase code snippets or URLs into speakable language (e.g. "the fetch function" not "\`fetch()\`").

## Voice control tags
You have a rich set of square bracket tags to shape how the voice sounds. Use them tastefully — a well-placed tag is powerful, overuse kills it.

**Pacing**
[pause] — a natural breath beat, use at transitions or after key ideas
[long pause] — a longer held silence, use for emphasis or before something important
[short pause] — a very brief beat

**Non-verbal sounds** (use sparingly, one or two per lesson max)
[breath] — a natural inhale, good at the start of a new thought or after a long sentence
[sighs] — before a tricky concept, or when something is a bit of a pain
[laughs] — when something is genuinely ironic, surprising, or lightly funny
[chuckles] — softer than laughs, more conversational
[exhales] — a quiet breath out, good for winding down a dense section
[clears throat] — before jumping into something more formal or detailed
[gasp] — for something genuinely surprising

**Delivery style** (can be chained, effect lasts until next tag or end of sentence)
[curious] — lean in, raise intrigue
[excited] — energy up, good for "here's the cool part"
[whispers] — draw the listener in for an aside
[nervous] — for content where a student might feel anxious (e.g. exams)
[calm] — reassuring, slows things down
[sarcastic] — very sparingly, only when the tone clearly calls for it

## Placement guidance
- [pause] can go mid-sentence before a key term, or at the end of a sentence before shifting topic
- Emotional tags go BEFORE the text they should affect, and return to neutral naturally after a sentence or two
- Don't open with a tag — let the voice settle first
- Avoid back-to-back tags with no words between them

## Output
Return ONLY the modified narration text. No commentary, no labels, no quotes.`;

/** Abort any single provider request that takes longer than this. */
const TTS_REQUEST_TIMEOUT_MS = 60_000;

/**
 * Multiplier applied to the ElevenLabs `character-cost` response header.
 * Presumably USD per character, since the step log prints the result with a `$` prefix — confirm.
 */
const ELEVENLABS_COST_PER_CHARACTER = 0.0003;

/**
 * Renders a caught value for log output.
 * Mirrors the `${err?.message ?? err}` interpolation used throughout this file,
 * but accepts `unknown` so catch clauses do not need `any`.
 */
function describeError(err: unknown): string {
  const message = (err as { message?: unknown } | null | undefined)?.message;
  return String(message ?? err);
}

/**
 * Runs the text through the narration-editor prompt so it reads naturally aloud.
 * Best-effort: on any failure, or if the model returns nothing, the raw text is used.
 *
 * @param text Lesson text to prepare for speech.
 * @returns The rewritten narration, or `text` unchanged as a fallback.
 */
async function humaniseTTSText(text: string): Promise<string> {
  try {
    const result = await askAI(
      [
        { role: "system", content: NARRATION_SYSTEM_PROMPT },
        { role: "user", content: text },
      ],
      { temperature: 0.5, maxTokens: 2048 }
    );
    const humanised = result.text.trim();
    if (!humanised) {
      // An empty completion would otherwise be sent to the TTS provider as empty input.
      console.error(`[tts] humanise returned empty text, using raw text`);
      return text;
    }
    return humanised;
  } catch (err: unknown) {
    console.error(`[tts] humanise failed, using raw text: ${describeError(err)}`);
    return text;
  }
}

/** A short run of words together with the start/end times taken from the provider's alignment data. */
export interface AudioChunk {
  text: string;
  start: number;
  end: number;
}

/** Result of generating and storing one lesson audio file. */
export interface TTSResult {
  audioPath: string;
  audioChunks: AudioChunk[];
  cost: number;
}

/** Raw provider output before it is written to disk. */
type SynthesisResult = { audio: Buffer; chunks: AudioChunk[]; cost: number };

/** Character-level timing arrays as read from the ElevenLabs `with-timestamps` response. */
interface ElevenLabsAlignment {
  characters: string[];
  character_start_times_seconds: number[];
  character_end_times_seconds: number[];
}

/** Converts the `character-cost` response header into a cost; missing or non-numeric headers count as 0. */
function elevenLabsCost(res: Response): number {
  const charCost = Number.parseInt(res.headers.get("character-cost") ?? "0", 10);
  return Number.isFinite(charCost) ? charCost * ELEVENLABS_COST_PER_CHARACTER : 0;
}

/**
 * Groups the whitespace-separated words of `text` into fixed-size chunks and
 * assigns each chunk a start/end time looked up in the alignment arrays.
 *
 * @param text The exact text that was sent for synthesis.
 * @param alignment Character timing data from the provider.
 * @param wordsPerChunk Number of words per chunk.
 */
function buildAudioChunks(
  text: string,
  alignment: ElevenLabsAlignment,
  wordsPerChunk = 3
): AudioChunk[] {
  const {
    characters,
    character_start_times_seconds: starts,
    character_end_times_seconds: ends,
  } = alignment;

  // Each word with the (inclusive) offsets of its first and last character in `text`.
  const words = Array.from(text.matchAll(/\S+/g), (match) => {
    const charStart = match.index ?? 0;
    return { word: match[0], charStart, charEnd: charStart + match[0].length - 1 };
  });

  // Alignment entries may be longer than one code unit, so walk them while
  // accumulating length until the entry covering `charIdx` is found.
  const timeForChar = (charIdx: number, side: "start" | "end"): number => {
    const times = side === "start" ? starts : ends;
    let accumulated = 0;
    for (let ci = 0; ci < characters.length; ci++) {
      const length = characters[ci]?.length ?? 0;
      if (accumulated + length > charIdx) return times[ci] ?? 0;
      accumulated += length;
    }
    // Offset lies past the aligned characters: fall back to the last known time.
    return times[times.length - 1] ?? 0;
  };

  const chunks: AudioChunk[] = [];
  for (let wi = 0; wi < words.length; wi += wordsPerChunk) {
    const group = words.slice(wi, wi + wordsPerChunk);
    const first = group[0];
    const last = group[group.length - 1];
    if (!first || !last) continue;
    chunks.push({
      text: group.map((w) => w.word).join(" "),
      start: timeForChar(first.charStart, "start"),
      end: timeForChar(last.charEnd, "end"),
    });
  }
  return chunks;
}

/**
 * Requests speech with timestamps from ElevenLabs.
 * HTTP 429 responses are retried up to five times, starting at a 2s delay and
 * doubling after each retry; any other non-OK status fails immediately.
 *
 * @returns Audio, word chunks and cost, or `null` when the request fails.
 *          Network errors and timeouts propagate to the caller.
 */
async function callElevenLabs(
  text: string,
  apiKey: string,
  voiceId: string
): Promise<{ audio: Buffer; chunks: AudioChunk[]; cost: number } | null> {
  const MAX_RETRIES = 5;
  let delay = 2000;

  for (let attempt = 0; attempt <= MAX_RETRIES; attempt++) {
    const res = await fetch(
      `https://api.elevenlabs.io/v1/text-to-speech/${voiceId}/with-timestamps`,
      {
        method: "POST",
        headers: {
          "xi-api-key": apiKey,
          "Content-Type": "application/json",
        },
        body: JSON.stringify({
          text,
          model_id: "eleven_v3",
          output_format: "mp3_44100_128",
        }),
        signal: AbortSignal.timeout(TTS_REQUEST_TIMEOUT_MS),
      }
    );

    if (res.ok) {
      const cost = elevenLabsCost(res);
      const data = (await res.json()) as {
        audio_base64: string;
        alignment?: ElevenLabsAlignment | null;
      };
      const audio = Buffer.from(data.audio_base64, "base64");
      // Keep the audio even when no alignment came back; callers then get no chunks.
      const chunks = data.alignment ? buildAudioChunks(text, data.alignment) : [];
      return { audio, chunks, cost };
    }

    if (res.status === 429 && attempt < MAX_RETRIES) {
      console.warn(`[tts] ElevenLabs 429 — retry ${attempt + 1}/${MAX_RETRIES} in ${delay}ms`);
      await new Promise<void>((r) => setTimeout(r, delay));
      delay *= 2;
      continue;
    }

    const errText = await res.text().catch(() => "");
    console.error(`[tts] ElevenLabs error ${res.status}: ${errText}`);
    return null;
  }

  // Not reachable: every iteration above either returns or continues within bounds.
  return null;
}

/**
 * Requests speech from Fish Audio.
 * Fish Audio doesn't return alignment data, so we return empty chunks (and a cost of 0).
 *
 * @param errorLabel Prefix used in the error log line, so other callers can keep their own wording.
 * @returns Audio with empty chunks, or `null` on a non-OK response.
 */
async function callFishAudio(
  text: string,
  apiKey: string,
  voiceId: string,
  errorLabel = "Fish Audio error"
): Promise<{ audio: Buffer; chunks: AudioChunk[]; cost: number } | null> {
  const res = await fetch("https://api.fish.audio/v1/tts", {
    method: "POST",
    headers: {
      "Authorization": `Bearer ${apiKey}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify({
      text,
      reference_id: voiceId,
      format: "mp3",
      mp3_bitrate: 128,
      streaming: false,
      normalize: false,
      model: "s2",
    }),
    signal: AbortSignal.timeout(TTS_REQUEST_TIMEOUT_MS),
  });

  if (!res.ok) {
    const errText = await res.text().catch(() => "");
    console.error(`[tts] ${errorLabel} ${res.status}: ${errText}`);
    return null;
  }

  const audio = Buffer.from(await res.arrayBuffer());
  return { audio, chunks: [], cost: 0 };
}

/** Reads the configured TTS provider name, lower-cased; defaults to "elevenlabs" when unset. */
function getProvider(): string {
  const config = useRuntimeConfig();
  return (config.ttsProvider as string | undefined)?.toLowerCase() ?? "elevenlabs";
}

/**
 * Humanises the text and synthesises it with the configured provider, going
 * through the shared limiter.
 *
 * @returns Provider output, or `null` when no API key is configured or the provider call fails.
 */
async function callTTS(
  text: string
): Promise<{ audio: Buffer; chunks: AudioChunk[]; cost: number } | null> {
  const config = useRuntimeConfig();
  const useFish = getProvider() === "fishaudio";

  const apiKey = (useFish ? config.fishAudioApiKey : config.elevenlabsApiKey) as string;
  const voiceId = (
    useFish
      ? config.public.fishAudioVoiceId || config.fishAudioVoiceId
      : config.public.elevenlabsVoiceId || config.elevenlabsVoiceId
  ) as string;

  // Check the key first so no LLM call is spent on audio that cannot be generated.
  if (!apiKey) return null;

  const narration = await humaniseTTSText(text);

  console.log(
    `[tts] queued (${useFish ? "fish" : "elevenlabs"}) — active: ${ttsLimiter.active}, queued: ${ttsLimiter.queued}`
  );
  return ttsLimiter.run(() =>
    useFish
      ? callFishAudio(narration, apiKey, voiceId)
      : callElevenLabs(narration, apiKey, voiceId)
  );
}

/**
 * Synthesises `text` and writes it to `data/audio/lessons/<lessonId>/<filename>`
 * under the current working directory. Errors are left to the caller so each
 * public entry point can log with its own wording.
 *
 * NOTE(review): `lessonId` and `filename` are interpolated into the path as-is —
 * confirm callers never pass user-controlled values.
 */
async function synthesiseLessonAudio(
  text: string,
  lessonId: string,
  filename: string
): Promise<TTSResult | null> {
  const result = await callTTS(text);
  if (!result) return null;

  const dir = resolve(process.cwd(), `data/audio/lessons/${lessonId}`);
  await mkdir(dir, { recursive: true });
  await writeFile(`${dir}/${filename}`, result.audio);

  return {
    audioPath: `/api/audio/lessons/${lessonId}/${filename}`,
    audioChunks: result.chunks,
    cost: result.cost,
  };
}

/**
 * Generates narration for a lesson step and stores it as `step_<stepIndex>.mp3`.
 *
 * @returns The API path, timing chunks and cost, or `null` on any failure (logged).
 */
export async function generateStepTTS(
  text: string,
  lessonId: string,
  stepIndex: number
): Promise<TTSResult | null> {
  try {
    const result = await synthesiseLessonAudio(text, lessonId, `step_${stepIndex}.mp3`);
    if (!result) return null;
    console.log(`[tts] step ${stepIndex} for lesson ${lessonId} — ${result.audioChunks.length} chunks | $${result.cost.toFixed(4)}`);
    return result;
  } catch (err: unknown) {
    console.error(`[tts] step ${stepIndex} for lesson ${lessonId} failed: ${describeError(err)}`);
    return null;
  }
}

/**
 * Generates narration for a step's question and stores it as `step_<stepIndex>_question.mp3`.
 *
 * @returns The API path, timing chunks and cost, or `null` on any failure (logged).
 */
export async function generateQuestionTTS(
  text: string,
  lessonId: string,
  stepIndex: number
): Promise<TTSResult | null> {
  try {
    return await synthesiseLessonAudio(text, lessonId, `step_${stepIndex}_question.mp3`);
  } catch (err: unknown) {
    console.error(`[tts] question ${stepIndex} for lesson ${lessonId} failed: ${describeError(err)}`);
    return null;
  }
}

/**
 * Generates narration for one answer option and stores it as
 * `step_<stepIndex>_option_<optionIndex>.mp3`. Timing chunks are not returned.
 *
 * @returns The API path and cost, or `null` on any failure (logged).
 */
export async function generateOptionTTS(
  text: string,
  lessonId: string,
  stepIndex: number,
  optionIndex: number
): Promise<{ audioPath: string; cost: number } | null> {
  try {
    const result = await synthesiseLessonAudio(
      text,
      lessonId,
      `step_${stepIndex}_option_${optionIndex}.mp3`
    );
    if (!result) return null;
    return { audioPath: result.audioPath, cost: result.cost };
  } catch (err: unknown) {
    console.error(`[tts] option ${stepIndex}/${optionIndex} for lesson ${lessonId} failed: ${describeError(err)}`);
    return null;
  }
}

/**
 * Generates a plain audio clip with no timestamp data (used for label clips)
 * and writes it to `outPath`. The text is sent as-is (no humanise pass) and the
 * request does not go through the limiter.
 *
 * @param apiKey ElevenLabs API key; ignored when the provider is Fish Audio (the key comes from runtime config).
 * @param voiceId ElevenLabs voice; for Fish Audio it is only a fallback when no voice is configured.
 * @param opts ElevenLabs-specific overrides; ignored when using Fish Audio.
 * @returns The cost of the clip (always 0 for Fish Audio), or `null` on any failure (logged).
 */
export async function generateClip(
  text: string,
  outPath: string,
  apiKey: string,
  voiceId: string,
  opts?: { model?: string; voice_settings?: Record<string, unknown> }
): Promise<{ cost: number } | null> {
  const provider = getProvider();
  try {
    let buffer: Buffer;
    let cost = 0;

    if (provider === "fishaudio") {
      const config = useRuntimeConfig();
      const fishKey = config.fishAudioApiKey as string;
      const fishVoice =
        ((config.public.fishAudioVoiceId || config.fishAudioVoiceId) as string) || voiceId;
      const clip = await callFishAudio(text, fishKey, fishVoice, "Fish Audio clip error");
      if (!clip) return null;
      buffer = clip.audio;
    } else {
      const res = await fetch(
        `https://api.elevenlabs.io/v1/text-to-speech/${voiceId}`,
        {
          method: "POST",
          headers: {
            "xi-api-key": apiKey,
            "Content-Type": "application/json",
          },
          body: JSON.stringify({
            text,
            model_id: opts?.model ?? "eleven_v3",
            output_format: "mp3_44100_128",
            ...(opts?.voice_settings ? { voice_settings: opts.voice_settings } : {}),
          }),
          signal: AbortSignal.timeout(TTS_REQUEST_TIMEOUT_MS),
        }
      );
      if (!res.ok) {
        const errText = await res.text().catch(() => "");
        console.error(`[tts] clip error ${res.status}: ${errText}`);
        return null;
      }
      cost = elevenLabsCost(res);
      buffer = Buffer.from(await res.arrayBuffer());
    }

    await mkdir(resolve(process.cwd(), "data/audio/labels"), { recursive: true });
    // outPath is caller-supplied and need not live in the labels directory,
    // so make sure its own parent exists before writing.
    await mkdir(dirname(outPath), { recursive: true });
    await writeFile(outPath, buffer);
    return { cost };
  } catch (err: unknown) {
    console.error(`[tts] clip failed for ${outPath}: ${describeError(err)}`);
    return null;
  }
}

/**
 * Generates narration and stores it under the lesson's audio directory using a caller-chosen filename.
 *
 * @returns The API path, timing chunks and cost, or `null` on any failure (logged).
 */
export async function generateTTSToPath(
  text: string,
  lessonId: string,
  filename: string
): Promise<TTSResult | null> {
  try {
    return await synthesiseLessonAudio(text, lessonId, filename);
  } catch (err: unknown) {
    console.error(`[tts] ${filename} for lesson ${lessonId} failed: ${describeError(err)}`);
    return null;
  }
}

/** Resolves to `true` when `path` is accessible, `false` when `access` rejects for any reason. */
export async function fileExists(path: string): Promise<boolean> {
  try {
    await access(path);
    return true;
  } catch {
    return false;
  }
}