329 lines
9.6 KiB
TypeScript
329 lines
9.6 KiB
TypeScript
import { mkdir, writeFile, access } from "fs/promises";
|
|
import { resolve } from "path";
|
|
|
|
/**
 * A short run of spoken words together with the time window in which it is
 * heard in the generated audio.
 */
export interface AudioChunk {
  /** The words of this chunk, joined by single spaces. */
  text: string;

  /** Time at which the chunk begins, in seconds. */
  start: number;

  /** Time at which the chunk ends, in seconds. */
  end: number;
}
|
|
|
|
/**
 * Outcome of synthesising one piece of text and storing it as an audio file.
 */
export interface TTSResult {
  /** Path of the stored audio file, of the form `/audio/lessons/<lessonId>/<filename>`. */
  audioPath: string;

  /** Timed word groups for the audio; empty when the provider supplies no timing data. */
  audioChunks: AudioChunk[];

  /** Cost reported for the request (0 when the provider reports none). */
  cost: number;
}
|
|
|
|
|
|
/** Character-level timing data returned by the ElevenLabs `with-timestamps` endpoint. */
interface ElevenLabsAlignment {
  characters: string[];
  character_start_times_seconds: number[];
  character_end_times_seconds: number[];
}

/** A whitespace-delimited word plus the inclusive index range it occupies in the source text. */
interface WordSpan {
  word: string;
  charStart: number;
  charEnd: number;
}

/** Splits `text` on whitespace, remembering where every word sits in the original string. */
function splitIntoWordSpans(text: string): WordSpan[] {
  const spans: WordSpan[] = [];
  for (const match of text.matchAll(/\S+/g)) {
    const charStart = match.index ?? 0;
    spans.push({
      word: match[0],
      charStart,
      charEnd: charStart + match[0].length - 1,
    });
  }
  return spans;
}

/** Groups words `wordsPerChunk` at a time and attaches start/end times taken from `alignment`. */
function buildWordChunks(
  words: WordSpan[],
  alignment: ElevenLabsAlignment,
  wordsPerChunk = 3
): AudioChunk[] {
  const {
    characters,
    character_start_times_seconds: starts,
    character_end_times_seconds: ends,
  } = alignment;

  // entryForChar[k] is the alignment entry covering text index k. Building the
  // table once replaces a linear scan of `characters` for every lookup.
  // Assumes the alignment entries, concatenated, line up with the input text.
  const entryForChar: number[] = [];
  characters.forEach((piece, entry) => {
    for (let k = 0; k < piece.length; k++) entryForChar.push(entry);
  });

  const timeAt = (charIdx: number, times: number[]): number => {
    const entry = entryForChar[charIdx];
    // Indices beyond the aligned characters fall back to the last known time.
    const value = entry === undefined ? times[times.length - 1] : times[entry];
    return value ?? 0;
  };

  const chunks: AudioChunk[] = [];
  for (let wi = 0; wi < words.length; wi += wordsPerChunk) {
    const group = words.slice(wi, wi + wordsPerChunk);
    chunks.push({
      text: group.map((w) => w.word).join(" "),
      start: timeAt(group[0].charStart, starts),
      end: timeAt(group[group.length - 1].charEnd, ends),
    });
  }
  return chunks;
}

/**
 * Synthesises `text` with the ElevenLabs "with-timestamps" endpoint.
 *
 * @param text - Text to speak.
 * @param apiKey - Value sent in the `xi-api-key` header.
 * @param voiceId - ElevenLabs voice identifier, placed in the request URL.
 * @returns The decoded MP3 bytes, three-word timed chunks and the request
 *   cost, or `null` when the API answers with a non-2xx status or the reply
 *   contains no audio. When the reply carries no alignment data the audio is
 *   still returned, with an empty chunk list.
 * @throws Whatever `fetch` or JSON parsing throws; callers are expected to catch.
 */
async function callElevenLabs(
  text: string,
  apiKey: string,
  voiceId: string
): Promise<{ audio: Buffer; chunks: AudioChunk[]; cost: number } | null> {
  const res = await fetch(
    `https://api.elevenlabs.io/v1/text-to-speech/${voiceId}/with-timestamps`,
    {
      method: "POST",
      headers: {
        "xi-api-key": apiKey,
        "Content-Type": "application/json",
      },
      body: JSON.stringify({
        text,
        model_id: "eleven_turbo_v2_5",
        output_format: "mp3_44100_128",
      }),
    }
  );

  if (!res.ok) {
    const errText = await res.text().catch(() => "");
    console.error(`[tts] ElevenLabs error ${res.status}: ${errText}`);
    return null;
  }

  // A missing or malformed header must not turn the cost into NaN.
  const charCost = parseInt(res.headers.get("character-cost") ?? "0", 10);
  const cost = Number.isFinite(charCost) ? charCost * 0.0003 : 0;

  const data = (await res.json()) as {
    audio_base64?: string;
    alignment?: ElevenLabsAlignment | null;
  };

  if (typeof data.audio_base64 !== "string") {
    console.error("[tts] ElevenLabs response contained no audio");
    return null;
  }

  const audio = Buffer.from(data.audio_base64, "base64");

  // Keep the audio even when timing data is absent; only the chunks are lost.
  const chunks = data.alignment
    ? buildWordChunks(splitIntoWordSpans(text), data.alignment)
    : [];

  return { audio, chunks, cost };
}
|
|
|
|
|
|
/**
 * Synthesises `text` through the Fish Audio TTS endpoint.
 *
 * The response body is used as raw audio bytes and no alignment data is read
 * from it, so the returned `chunks` list is always empty and `cost` is
 * always 0.
 *
 * @param text - Text to speak.
 * @param apiKey - Bearer token sent in the `Authorization` header.
 * @param voiceId - Sent as `reference_id` in the request body.
 * @returns The MP3 bytes, or `null` when the API answers with a non-2xx status.
 */
async function callFishAudio(
  text: string,
  apiKey: string,
  voiceId: string
): Promise<{ audio: Buffer; chunks: AudioChunk[]; cost: number } | null> {
  const requestBody = JSON.stringify({
    text,
    reference_id: voiceId,
    format: "mp3",
    mp3_bitrate: 128,
    streaming: false,
  });
  const requestHeaders = {
    "Authorization": `Bearer ${apiKey}`,
    "Content-Type": "application/json",
  };

  const response = await fetch("https://api.fish.audio/v1/tts", {
    method: "POST",
    headers: requestHeaders,
    body: requestBody,
  });

  if (response.ok) {
    const bytes = await response.arrayBuffer();
    return { audio: Buffer.from(bytes), chunks: [], cost: 0 };
  }

  // Best effort: an unreadable error body is logged as an empty string.
  const detail = await response.text().catch(() => "");
  console.error(`[tts] Fish Audio error ${response.status}: ${detail}`);
  return null;
}
|
|
|
|
|
|
/**
 * Reads the configured TTS provider name from the runtime config.
 *
 * @returns The `ttsProvider` setting in lower case, or `"elevenlabs"` when the
 *   setting is absent.
 */
function getProvider(): string {
  const configured = useRuntimeConfig().ttsProvider as string | undefined;
  if (configured == null) {
    return "elevenlabs";
  }
  return configured.toLowerCase();
}
|
|
|
|
/**
 * Synthesises `text` with whichever provider the runtime config selects.
 *
 * Fish Audio is used when the provider is `"fishaudio"`; every other value
 * falls through to ElevenLabs. The API key and voice id are read from the
 * runtime config entries belonging to the chosen provider.
 *
 * @param text - Text to speak.
 * @returns The provider's result, or `null` when no API key is configured or
 *   the provider call itself returns `null`.
 */
async function callTTS(
  text: string
): Promise<{ audio: Buffer; chunks: AudioChunk[]; cost: number } | null> {
  const runtime = useRuntimeConfig();
  const useFish = getProvider() === "fishaudio";

  const key = (useFish ? runtime.fishAudioApiKey : runtime.elevenlabsApiKey) as string;
  const voice = (useFish ? runtime.fishAudioVoiceId : runtime.elevenlabsVoiceId) as string;

  // Without credentials there is nothing to call.
  if (!key) return null;

  return useFish
    ? callFishAudio(text, key, voice)
    : callElevenLabs(text, key, voice);
}
|
|
|
|
|
|
/**
 * Synthesises the narration for one lesson step and stores it as
 * `public/audio/lessons/<lessonId>/step_<stepIndex>.mp3` under the current
 * working directory.
 *
 * @param text - Narration text.
 * @param lessonId - Lesson identifier; used as the directory name.
 * @param stepIndex - Step number; used in the file name.
 * @returns Audio path, timed chunks and cost, or `null` when synthesis
 *   returns nothing or any step fails (failures are logged, never thrown).
 */
export async function generateStepTTS(
  text: string,
  lessonId: string,
  stepIndex: number
): Promise<TTSResult | null> {
  try {
    const synthesized = await callTTS(text);
    if (!synthesized) return null;

    const fileName = `step_${stepIndex}.mp3`;
    const lessonDir = resolve(process.cwd(), `public/audio/lessons/${lessonId}`);
    await mkdir(lessonDir, { recursive: true });
    await writeFile(`${lessonDir}/${fileName}`, synthesized.audio);

    console.log(`[tts] step ${stepIndex} for lesson ${lessonId} — ${synthesized.chunks.length} chunks | $${synthesized.cost.toFixed(4)}`);

    return {
      audioPath: `/audio/lessons/${lessonId}/${fileName}`,
      audioChunks: synthesized.chunks,
      cost: synthesized.cost,
    };
  } catch (err: unknown) {
    // Prefer the error's message when it has one; otherwise log the raw value.
    const reason = (err as { message?: unknown } | null | undefined)?.message ?? err;
    console.error(`[tts] step ${stepIndex} for lesson ${lessonId} failed: ${reason}`);
    return null;
  }
}
|
|
|
|
/**
 * Synthesises the question audio for one lesson step and stores it as
 * `public/audio/lessons/<lessonId>/step_<stepIndex>_question.mp3` under the
 * current working directory.
 *
 * @param text - Question text.
 * @param lessonId - Lesson identifier; used as the directory name.
 * @param stepIndex - Step number; used in the file name.
 * @returns Audio path, timed chunks and cost, or `null` when synthesis
 *   returns nothing or any step fails (failures are logged, never thrown).
 */
export async function generateQuestionTTS(
  text: string,
  lessonId: string,
  stepIndex: number
): Promise<TTSResult | null> {
  try {
    const synthesized = await callTTS(text);
    if (!synthesized) return null;

    const fileName = `step_${stepIndex}_question.mp3`;
    const lessonDir = resolve(process.cwd(), `public/audio/lessons/${lessonId}`);
    await mkdir(lessonDir, { recursive: true });
    await writeFile(`${lessonDir}/${fileName}`, synthesized.audio);

    return {
      audioPath: `/audio/lessons/${lessonId}/${fileName}`,
      audioChunks: synthesized.chunks,
      cost: synthesized.cost,
    };
  } catch (err: unknown) {
    // Prefer the error's message when it has one; otherwise log the raw value.
    const reason = (err as { message?: unknown } | null | undefined)?.message ?? err;
    console.error(`[tts] question ${stepIndex} for lesson ${lessonId} failed: ${reason}`);
    return null;
  }
}
|
|
|
|
/**
 * Synthesises the audio for one answer option and stores it as
 * `public/audio/lessons/<lessonId>/step_<stepIndex>_option_<optionIndex>.mp3`
 * under the current working directory.
 *
 * @param text - Option text.
 * @param lessonId - Lesson identifier; used as the directory name.
 * @param stepIndex - Step number; used in the file name.
 * @param optionIndex - Option number within the step; used in the file name.
 * @returns Audio path and cost (no timing chunks), or `null` when synthesis
 *   returns nothing or any step fails (failures are logged, never thrown).
 */
export async function generateOptionTTS(
  text: string,
  lessonId: string,
  stepIndex: number,
  optionIndex: number
): Promise<{ audioPath: string; cost: number } | null> {
  try {
    const synthesized = await callTTS(text);
    if (!synthesized) return null;

    const fileName = `step_${stepIndex}_option_${optionIndex}.mp3`;
    const lessonDir = resolve(process.cwd(), `public/audio/lessons/${lessonId}`);
    await mkdir(lessonDir, { recursive: true });
    await writeFile(`${lessonDir}/${fileName}`, synthesized.audio);

    return {
      audioPath: `/audio/lessons/${lessonId}/${fileName}`,
      cost: synthesized.cost,
    };
  } catch (err: unknown) {
    // Prefer the error's message when it has one; otherwise log the raw value.
    const reason = (err as { message?: unknown } | null | undefined)?.message ?? err;
    console.error(`[tts] option ${stepIndex}/${optionIndex} for lesson ${lessonId} failed: ${reason}`);
    return null;
  }
}
|
|
|
|
/**
 * Generates a plain audio clip with no timestamp data (used for label clips)
 * and writes it to `outPath`.
 *
 * With the Fish Audio provider the API key comes from the runtime config
 * (`apiKey` is not used) and the configured voice takes precedence over
 * `voiceId`; `opts` is ElevenLabs-specific and is ignored.
 *
 * @param text - Text to speak.
 * @param outPath - Destination file; its parent directory is created if missing.
 * @param apiKey - ElevenLabs API key.
 * @param voiceId - ElevenLabs voice id, or the fallback voice for Fish Audio.
 * @param opts - Optional ElevenLabs model override and voice settings.
 * @returns The request cost (always 0 for Fish Audio), or `null` on any
 *   failure. Failures are logged and never thrown.
 */
export async function generateClip(
  text: string,
  outPath: string,
  apiKey: string,
  voiceId: string,
  opts?: { model?: string; voice_settings?: Record<string, number> }
): Promise<{ cost: number } | null> {
  const provider = getProvider();

  try {
    let buffer: Buffer;
    let cost = 0;

    if (provider === "fishaudio") {
      const config = useRuntimeConfig();
      const fishKey = config.fishAudioApiKey as string;
      const fishVoice = (config.fishAudioVoiceId as string) || voiceId;

      // Do not send a request carrying "Bearer undefined".
      if (!fishKey) {
        console.error("[tts] Fish Audio clip error: no API key configured");
        return null;
      }

      const res = await fetch("https://api.fish.audio/v1/tts", {
        method: "POST",
        headers: {
          "Authorization": `Bearer ${fishKey}`,
          "Content-Type": "application/json",
        },
        body: JSON.stringify({
          text,
          reference_id: fishVoice,
          format: "mp3",
          mp3_bitrate: 128,
          streaming: false,
        }),
      });

      if (!res.ok) {
        const errText = await res.text().catch(() => "");
        console.error(`[tts] Fish Audio clip error ${res.status}: ${errText}`);
        return null;
      }

      buffer = Buffer.from(await res.arrayBuffer());
    } else {
      const res = await fetch(
        `https://api.elevenlabs.io/v1/text-to-speech/${voiceId}`,
        {
          method: "POST",
          headers: {
            "xi-api-key": apiKey,
            "Content-Type": "application/json",
          },
          body: JSON.stringify({
            text,
            model_id: opts?.model ?? "eleven_turbo_v2_5",
            output_format: "mp3_44100_128",
            ...(opts?.voice_settings ? { voice_settings: opts.voice_settings } : {}),
          }),
        }
      );

      if (!res.ok) {
        const errText = await res.text().catch(() => "");
        console.error(`[tts] clip error ${res.status}: ${errText}`);
        return null;
      }

      // A missing or malformed header must not turn the cost into NaN.
      const charCost = parseInt(res.headers.get("character-cost") ?? "0", 10);
      cost = Number.isFinite(charCost) ? charCost * 0.0003 : 0;
      buffer = Buffer.from(await res.arrayBuffer());
    }

    // Create the directory the clip is actually written to, rather than a
    // fixed folder that may not be the parent of `outPath`.
    await mkdir(resolve(outPath, ".."), { recursive: true });
    await writeFile(outPath, buffer);
    return { cost };
  } catch (err: unknown) {
    const reason = err instanceof Error ? err.message : String(err);
    console.error(`[tts] clip failed for ${outPath}: ${reason}`);
    return null;
  }
}
|
|
|
|
/**
 * Synthesises `text` and stores it under a caller-chosen file name inside
 * `public/audio/lessons/<lessonId>/`, relative to the current working
 * directory.
 *
 * @param text - Text to speak.
 * @param lessonId - Lesson identifier; used as the directory name.
 * @param filename - Name of the file to write inside the lesson directory.
 * @returns Audio path, timed chunks and cost, or `null` when synthesis
 *   returns nothing or any step fails (failures are logged, never thrown).
 */
export async function generateTTSToPath(
  text: string,
  lessonId: string,
  filename: string
): Promise<TTSResult | null> {
  try {
    const synthesized = await callTTS(text);
    if (!synthesized) return null;

    const lessonDir = resolve(process.cwd(), `public/audio/lessons/${lessonId}`);
    await mkdir(lessonDir, { recursive: true });
    await writeFile(`${lessonDir}/${filename}`, synthesized.audio);

    return {
      audioPath: `/audio/lessons/${lessonId}/${filename}`,
      audioChunks: synthesized.chunks,
      cost: synthesized.cost,
    };
  } catch (err: unknown) {
    // Prefer the error's message when it has one; otherwise log the raw value.
    const reason = (err as { message?: unknown } | null | undefined)?.message ?? err;
    console.error(`[tts] ${filename} for lesson ${lessonId} failed: ${reason}`);
    return null;
  }
}
|
|
|
|
/**
 * Reports whether `path` can be accessed.
 *
 * @param path - File-system path to probe.
 * @returns `true` when `access` succeeds, `false` when it fails for any reason.
 */
export async function fileExists(path: string): Promise<boolean> {
  let reachable = true;
  try {
    await access(path);
  } catch {
    reachable = false;
  }
  return reachable;
}
|