|
|
|
@@ -1,17 +1,47 @@
|
|
|
|
|
import { mkdir, writeFile, access } from "fs/promises";
|
|
|
|
|
import { resolve } from "path";
|
|
|
|
|
import { askAI } from "./openrouter";
|
|
|
|
|
import { ttsLimiter } from "./limiter";
|
|
|
|
|
|
|
|
|
|
const NARRATION_SYSTEM_PROMPT = `You are a narration script editor for an AI voice actor. Your job is to take educational lesson text and prepare it to be read aloud naturally and engagingly.
|
|
|
|
|
const NARRATION_SYSTEM_PROMPT = `You are a narration script editor for an AI voice actor reading educational lesson content. Your job is to prepare text so it sounds natural, warm, and engaging when spoken aloud.
|
|
|
|
|
|
|
|
|
|
Rules:
|
|
|
|
|
- Do NOT change the meaning, facts, or structure of the content. You are not rewriting it.
|
|
|
|
|
- Fix anything that would sound awkward or robotic when spoken: remove markdown formatting (asterisks, backticks, hashes), spell out acronyms where helpful, rephrase code snippets or technical shorthand into speakable language.
|
|
|
|
|
- Add square bracket cues to give the voice character and pacing. These are the only ones you may use: [pause], [long pause], [sighs], [laughs], [clears throat], [hesitates].
|
|
|
|
|
- Use [pause] at natural breath points — after key ideas, before a new concept, or mid-sentence where a human would pause for effect. Don't overdo it; one every few sentences at most.
|
|
|
|
|
- Use [sighs] or [laughs] very sparingly — only where a human narrator genuinely would. A [sighs] before a tricky concept, a [laughs] when something is ironic or light. Maybe once or twice per lesson, if at all.
|
|
|
|
|
- Keep the tone warm, clear, and conversational — like a knowledgeable friend explaining something, not a textbook being read aloud.
|
|
|
|
|
- Return ONLY the modified narration text. No commentary, no explanation, no quotes around the output.`;
|
|
|
|
|
## Content rules
|
|
|
|
|
- Do NOT change the meaning, facts, or structure. You are not rewriting the lesson.
|
|
|
|
|
- Fix anything that sounds robotic or awkward when spoken: strip markdown (asterisks, backticks, hashes, bullet dashes), spell out acronyms where helpful, rephrase code snippets or URLs into speakable language (e.g. "the fetch function" not "\`fetch()\`").
|
|
|
|
|
|
|
|
|
|
## Voice control tags
|
|
|
|
|
You have a rich set of square bracket tags to shape how the voice sounds. Use them tastefully — a well-placed tag is powerful, overuse kills it.
|
|
|
|
|
|
|
|
|
|
**Pacing**
|
|
|
|
|
[pause] — a natural breath beat, use at transitions or after key ideas
|
|
|
|
|
[long pause] — a longer held silence, use for emphasis or before something important
|
|
|
|
|
[short pause] — a very brief beat
|
|
|
|
|
|
|
|
|
|
**Non-verbal sounds** (use sparingly, one or two per lesson max)
|
|
|
|
|
[breath] — a natural inhale, good at the start of a new thought or after a long sentence
|
|
|
|
|
[sighs] — before a tricky concept, or when something is a bit of a pain
|
|
|
|
|
[laughs] — when something is genuinely ironic, surprising, or lightly funny
|
|
|
|
|
[chuckles] — softer than laughs, more conversational
|
|
|
|
|
[exhales] — a quiet breath out, good for winding down a dense section
|
|
|
|
|
[clears throat] — before jumping into something more formal or detailed
|
|
|
|
|
[gasp] — for something genuinely surprising
|
|
|
|
|
|
|
|
|
|
**Delivery style** (can be chained, effect lasts until next tag or end of sentence)
|
|
|
|
|
[curious] — lean in, raise intrigue
|
|
|
|
|
[excited] — energy up, good for "here's the cool part"
|
|
|
|
|
[whispers] — draw the listener in for an aside
|
|
|
|
|
[nervous] — for content where a student might feel anxious (e.g. exams)
|
|
|
|
|
[calm] — reassuring, slows things down
|
|
|
|
|
[sarcastic] — very sparingly, only when the tone clearly calls for it
|
|
|
|
|
|
|
|
|
|
## Placement guidance
|
|
|
|
|
- [pause] can go mid-sentence before a key term, or at the end of a sentence before shifting topic
|
|
|
|
|
- Emotional tags go BEFORE the text they should affect, and return to neutral naturally after a sentence or two
|
|
|
|
|
- Don't open with a tag — let the voice settle first
|
|
|
|
|
- Avoid back-to-back tags with no words between them
|
|
|
|
|
|
|
|
|
|
## Output
|
|
|
|
|
Return ONLY the modified narration text. No commentary, no labels, no quotes.`;
|
|
|
|
|
|
|
|
|
|
async function humaniseTTSText(text: string): Promise<string> {
|
|
|
|
|
try {
|
|
|
|
@@ -47,26 +77,45 @@ async function callElevenLabs(
|
|
|
|
|
apiKey: string,
|
|
|
|
|
voiceId: string
|
|
|
|
|
): Promise<{ audio: Buffer; chunks: AudioChunk[]; cost: number } | null> {
|
|
|
|
|
const res = await fetch(
|
|
|
|
|
`https://api.elevenlabs.io/v1/text-to-speech/${voiceId}/with-timestamps`,
|
|
|
|
|
{
|
|
|
|
|
method: "POST",
|
|
|
|
|
headers: {
|
|
|
|
|
"xi-api-key": apiKey,
|
|
|
|
|
"Content-Type": "application/json",
|
|
|
|
|
},
|
|
|
|
|
body: JSON.stringify({
|
|
|
|
|
text,
|
|
|
|
|
model_id: "eleven_turbo_v2_5",
|
|
|
|
|
output_format: "mp3_44100_128",
|
|
|
|
|
}),
|
|
|
|
|
signal: AbortSignal.timeout(60_000),
|
|
|
|
|
const MAX_RETRIES = 5;
|
|
|
|
|
let delay = 2000;
|
|
|
|
|
|
|
|
|
|
let res!: Response;
|
|
|
|
|
for (let attempt = 0; attempt <= MAX_RETRIES; attempt++) {
|
|
|
|
|
res = await fetch(
|
|
|
|
|
`https://api.elevenlabs.io/v1/text-to-speech/${voiceId}/with-timestamps`,
|
|
|
|
|
{
|
|
|
|
|
method: "POST",
|
|
|
|
|
headers: {
|
|
|
|
|
"xi-api-key": apiKey,
|
|
|
|
|
"Content-Type": "application/json",
|
|
|
|
|
},
|
|
|
|
|
body: JSON.stringify({
|
|
|
|
|
text,
|
|
|
|
|
model_id: "eleven_v3",
|
|
|
|
|
output_format: "mp3_44100_128",
|
|
|
|
|
}),
|
|
|
|
|
signal: AbortSignal.timeout(60_000),
|
|
|
|
|
}
|
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
if (res.ok) break;
|
|
|
|
|
|
|
|
|
|
if (res.status === 429 && attempt < MAX_RETRIES) {
|
|
|
|
|
console.warn(`[tts] ElevenLabs 429 — retry ${attempt + 1}/${MAX_RETRIES} in ${delay}ms`);
|
|
|
|
|
await new Promise((r) => setTimeout(r, delay));
|
|
|
|
|
delay *= 2;
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
const errText = await res.text().catch(() => "");
|
|
|
|
|
console.error(`[tts] ElevenLabs error ${res.status}: ${errText}`);
|
|
|
|
|
return null;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (!res.ok) {
|
|
|
|
|
const errText = await res.text().catch(() => "");
|
|
|
|
|
console.error(`[tts] ElevenLabs error ${res.status}: ${errText}`);
|
|
|
|
|
console.error(`[tts] ElevenLabs failed after ${MAX_RETRIES} retries: ${res.status} ${errText}`);
|
|
|
|
|
return null;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
@@ -140,6 +189,8 @@ async function callFishAudio(
|
|
|
|
|
format: "mp3",
|
|
|
|
|
mp3_bitrate: 128,
|
|
|
|
|
streaming: false,
|
|
|
|
|
normalize: false,
|
|
|
|
|
model: "s2",
|
|
|
|
|
}),
|
|
|
|
|
signal: AbortSignal.timeout(60_000),
|
|
|
|
|
});
|
|
|
|
@@ -172,13 +223,15 @@ async function callTTS(
|
|
|
|
|
const apiKey = config.fishAudioApiKey as string;
|
|
|
|
|
const voiceId = (config.public.fishAudioVoiceId || config.fishAudioVoiceId) as string;
|
|
|
|
|
if (!apiKey) return null;
|
|
|
|
|
return callFishAudio(text, apiKey, voiceId);
|
|
|
|
|
console.log(`[tts] queued (fish) — active: ${ttsLimiter.active}, queued: ${ttsLimiter.queued}`);
|
|
|
|
|
return ttsLimiter.run(() => callFishAudio(text, apiKey, voiceId));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
const apiKey = config.elevenlabsApiKey as string;
|
|
|
|
|
const voiceId = (config.public.elevenlabsVoiceId || config.elevenlabsVoiceId) as string;
|
|
|
|
|
if (!apiKey) return null;
|
|
|
|
|
return callElevenLabs(text, apiKey, voiceId);
|
|
|
|
|
console.log(`[tts] queued (elevenlabs) — active: ${ttsLimiter.active}, queued: ${ttsLimiter.queued}`);
|
|
|
|
|
return ttsLimiter.run(() => callElevenLabs(text, apiKey, voiceId));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@@ -191,12 +244,12 @@ export async function generateStepTTS(
|
|
|
|
|
const result = await callTTS(text);
|
|
|
|
|
if (!result) return null;
|
|
|
|
|
|
|
|
|
|
const dir = resolve(process.cwd(), `public/audio/lessons/${lessonId}`);
|
|
|
|
|
const dir = resolve(process.cwd(), `data/audio/lessons/${lessonId}`);
|
|
|
|
|
await mkdir(dir, { recursive: true });
|
|
|
|
|
const filename = `step_${stepIndex}.mp3`;
|
|
|
|
|
await writeFile(`${dir}/${filename}`, result.audio);
|
|
|
|
|
|
|
|
|
|
const audioPath = `/audio/lessons/${lessonId}/${filename}`;
|
|
|
|
|
const audioPath = `/api/audio/lessons/${lessonId}/${filename}`;
|
|
|
|
|
console.log(`[tts] step ${stepIndex} for lesson ${lessonId} — ${result.chunks.length} chunks | $${result.cost.toFixed(4)}`);
|
|
|
|
|
return { audioPath, audioChunks: result.chunks, cost: result.cost };
|
|
|
|
|
} catch (err: any) {
|
|
|
|
@@ -214,12 +267,12 @@ export async function generateQuestionTTS(
|
|
|
|
|
const result = await callTTS(text);
|
|
|
|
|
if (!result) return null;
|
|
|
|
|
|
|
|
|
|
const dir = resolve(process.cwd(), `public/audio/lessons/${lessonId}`);
|
|
|
|
|
const dir = resolve(process.cwd(), `data/audio/lessons/${lessonId}`);
|
|
|
|
|
await mkdir(dir, { recursive: true });
|
|
|
|
|
const filename = `step_${stepIndex}_question.mp3`;
|
|
|
|
|
await writeFile(`${dir}/${filename}`, result.audio);
|
|
|
|
|
|
|
|
|
|
const audioPath = `/audio/lessons/${lessonId}/${filename}`;
|
|
|
|
|
const audioPath = `/api/audio/lessons/${lessonId}/${filename}`;
|
|
|
|
|
return { audioPath, audioChunks: result.chunks, cost: result.cost };
|
|
|
|
|
} catch (err: any) {
|
|
|
|
|
console.error(`[tts] question ${stepIndex} for lesson ${lessonId} failed: ${err?.message ?? err}`);
|
|
|
|
@@ -237,12 +290,12 @@ export async function generateOptionTTS(
|
|
|
|
|
const result = await callTTS(text);
|
|
|
|
|
if (!result) return null;
|
|
|
|
|
|
|
|
|
|
const dir = resolve(process.cwd(), `public/audio/lessons/${lessonId}`);
|
|
|
|
|
const dir = resolve(process.cwd(), `data/audio/lessons/${lessonId}`);
|
|
|
|
|
await mkdir(dir, { recursive: true });
|
|
|
|
|
const filename = `step_${stepIndex}_option_${optionIndex}.mp3`;
|
|
|
|
|
await writeFile(`${dir}/${filename}`, result.audio);
|
|
|
|
|
|
|
|
|
|
const audioPath = `/audio/lessons/${lessonId}/${filename}`;
|
|
|
|
|
const audioPath = `/api/audio/lessons/${lessonId}/${filename}`;
|
|
|
|
|
return { audioPath, cost: result.cost };
|
|
|
|
|
} catch (err: any) {
|
|
|
|
|
console.error(`[tts] option ${stepIndex}/${optionIndex} for lesson ${lessonId} failed: ${err?.message ?? err}`);
|
|
|
|
@@ -282,6 +335,8 @@ export async function generateClip(
|
|
|
|
|
format: "mp3",
|
|
|
|
|
mp3_bitrate: 128,
|
|
|
|
|
streaming: false,
|
|
|
|
|
normalize: false,
|
|
|
|
|
model: "s2",
|
|
|
|
|
}),
|
|
|
|
|
signal: AbortSignal.timeout(60_000),
|
|
|
|
|
});
|
|
|
|
@@ -304,7 +359,7 @@ export async function generateClip(
|
|
|
|
|
},
|
|
|
|
|
body: JSON.stringify({
|
|
|
|
|
text,
|
|
|
|
|
model_id: opts?.model ?? "eleven_turbo_v2_5",
|
|
|
|
|
model_id: opts?.model ?? "eleven_v3",
|
|
|
|
|
output_format: "mp3_44100_128",
|
|
|
|
|
...(opts?.voice_settings ? { voice_settings: opts.voice_settings } : {}),
|
|
|
|
|
}),
|
|
|
|
@@ -323,7 +378,7 @@ export async function generateClip(
|
|
|
|
|
buffer = Buffer.from(await res.arrayBuffer());
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
await mkdir(resolve(process.cwd(), "public/audio/labels"), { recursive: true });
|
|
|
|
|
await mkdir(resolve(process.cwd(), "data/audio/labels"), { recursive: true });
|
|
|
|
|
await writeFile(outPath, buffer);
|
|
|
|
|
return { cost };
|
|
|
|
|
} catch (err: any) {
|
|
|
|
@@ -341,11 +396,11 @@ export async function generateTTSToPath(
|
|
|
|
|
const result = await callTTS(text);
|
|
|
|
|
if (!result) return null;
|
|
|
|
|
|
|
|
|
|
const dir = resolve(process.cwd(), `public/audio/lessons/${lessonId}`);
|
|
|
|
|
const dir = resolve(process.cwd(), `data/audio/lessons/${lessonId}`);
|
|
|
|
|
await mkdir(dir, { recursive: true });
|
|
|
|
|
await writeFile(`${dir}/${filename}`, result.audio);
|
|
|
|
|
|
|
|
|
|
const audioPath = `/audio/lessons/${lessonId}/${filename}`;
|
|
|
|
|
const audioPath = `/api/audio/lessons/${lessonId}/${filename}`;
|
|
|
|
|
return { audioPath, audioChunks: result.chunks, cost: result.cost };
|
|
|
|
|
} catch (err: any) {
|
|
|
|
|
console.error(`[tts] ${filename} for lesson ${lessonId} failed: ${err?.message ?? err}`);
|
|
|
|
|