From 22beb245f228909aaf16aeb655eb2fb32e560c97 Mon Sep 17 00:00:00 2001 From: Nexus Dev Date: Sat, 4 Apr 2026 03:32:10 +0000 Subject: [PATCH] feat(39-01): sentence-buffered TTS streaming + multi-language synthesis - Export splitSentences() with title-abbreviation protection (Dr., Mr. etc.) - Add synthesizeSentenceStream() AsyncGenerator yielding per-sentence audio chunks - Add synthesizeMultiLang() synthesizing same text in N voices via Promise.all - Add POST /api/synthesize/stream SSE endpoint with base64 audio per sentence - Add POST /api/synthesize/multi-lang returning array of voiceId+audio pairs - Existing POST /api/synthesize unchanged (backward compatible) --- server/src/routes/voice.ts | 57 +++++++++++++ server/src/services/voice-pipeline.ts | 110 ++++++++++++++++++++------ 2 files changed, 142 insertions(+), 25 deletions(-) diff --git a/server/src/routes/voice.ts b/server/src/routes/voice.ts index bc53057c..4193efc5 100644 --- a/server/src/routes/voice.ts +++ b/server/src/routes/voice.ts @@ -43,5 +43,62 @@ export function voiceRoutes(): Router { res.send(audioBuffer); }); + // POST /api/synthesize/stream — sentence-buffered SSE streaming TTS + router.post("/synthesize/stream", async (req, res) => { + assertBoard(req); + const { text, voiceId } = req.body as { text?: string; voiceId?: string }; + if (!text || typeof text !== "string") { + res.status(400).json({ error: "text is required" }); + return; + } + + res.setHeader("Content-Type", "text/event-stream"); + res.setHeader("Cache-Control", "no-cache"); + res.setHeader("Connection", "keep-alive"); + res.flushHeaders(); + + try { + for await (const chunk of svc.synthesizeSentenceStream(text, voiceId)) { + const payload = JSON.stringify({ + index: chunk.index, + total: chunk.total, + audio: chunk.audio.toString("base64"), + }); + res.write(`data: ${payload}\n\n`); + } + res.write(`data: ${JSON.stringify({ done: true })}\n\n`); + } catch (err) { + const message = err instanceof Error ? err.message : "Synthesis failed"; + res.write(`data: ${JSON.stringify({ error: message })}\n\n`); + } finally { + res.end(); + } + }); + + // POST /api/synthesize/multi-lang — synthesize same text in multiple languages/voices + router.post("/synthesize/multi-lang", async (req, res) => { + assertBoard(req); + const { text, voiceIds } = req.body as { text?: string; voiceIds?: unknown }; + if (!text || typeof text !== "string") { + res.status(400).json({ error: "text is required" }); + return; + } + if (!Array.isArray(voiceIds) || voiceIds.length < 1 || voiceIds.length > 5) { + res.status(400).json({ error: "voiceIds must be an array with 1-5 entries" }); + return; + } + if (!voiceIds.every((v) => typeof v === "string")) { + res.status(400).json({ error: "voiceIds must be an array of strings" }); + return; + } + + const resultMap = await svc.synthesizeMultiLang(text, voiceIds as string[]); + const results = Array.from(resultMap.entries()).map(([voiceId, audio]) => ({ + voiceId, + audio: audio.toString("base64"), + })); + res.json({ results }); + }); + return router; } diff --git a/server/src/services/voice-pipeline.ts b/server/src/services/voice-pipeline.ts index 1e494a43..2e3d771f 100644 --- a/server/src/services/voice-pipeline.ts +++ b/server/src/services/voice-pipeline.ts @@ -24,6 +24,32 @@ function execFileAsync( }); } +/** + * Splits text into sentences, preserving title abbreviations like Dr., Mr., etc. + * Uses a lookbehind for sentence-ending punctuation followed by whitespace. + * Protects common title abbreviations (Dr., Mr., Mrs., etc.) from being split on. + * Acronyms like D.C. and U.S. that appear at sentence end will still trigger splits. + */ +export function splitSentences(text: string): string[] { + if (!text || !text.trim()) return []; + + const PLACEHOLDER = "\x00"; + + // Protect title abbreviations by replacing the trailing ". " with ".\x00" + const processed = text.replace( + /\b(Mr|Mrs|Ms|Dr|Prof|Sr|Jr|Rev|Gen|Col|Sgt|Cpl|Pvt|Lt|Cmdr|Capt|Gov|Rep|Sen)\.\s+/g, + (_, abbr) => `${abbr}.${PLACEHOLDER}` + ); + + // Split on sentence-ending punctuation followed by whitespace + const parts = processed.split(/(?<=[.!?])\s+/).filter((s) => s.length > 0); + + // Restore placeholders (replace with a space) + return parts + .map((s) => s.replace(new RegExp(PLACEHOLDER, "g"), " ").trim()) + .filter((s) => s.length > 0); +} + export function voicePipelineService() { if (!ffmpegPath) { throw new Error("ffmpeg-static binary not found on this platform"); @@ -124,35 +150,38 @@ export function voicePipelineService() { } } - async function synthesize(text: string, voiceId?: string): Promise { - const sentences = text.split(/(?<=[.!?])\s+/).filter((s) => s.length > 0); + async function synthesizeSentence(sentence: string, voiceId?: string): Promise { + return withTimeout( + new Promise((resolve, reject) => { + execFileCb( + "piper", + ["--model", voiceId || "en_US-lessac-medium", "--output-raw"], + { + timeout: 8000, + maxBuffer: 10 * 1024 * 1024, + // @ts-ignore - input option is valid for execFile + input: sentence, + }, + (err: Error | null, stdout: string | Buffer) => { + if (err) { + reject(err); + } else { + resolve(Buffer.isBuffer(stdout) ? stdout : Buffer.from(stdout as string)); + } + } + ); + }), + 8000 + ); + } + async function synthesize(text: string, voiceId?: string): Promise { + const sentences = splitSentences(text); const buffers: Buffer[] = []; for (const sentence of sentences) { try { - const audioData = await withTimeout( - new Promise((resolve, reject) => { - execFileCb( - "piper", - ["--model", voiceId || "en_US-lessac-medium", "--output-raw"], - { - timeout: 8000, - maxBuffer: 10 * 1024 * 1024, - // @ts-ignore - input option is valid for execFile - input: sentence, - }, - (err: Error | null, stdout: string | Buffer) => { - if (err) { - reject(err); - } else { - resolve(Buffer.isBuffer(stdout) ? stdout : Buffer.from(stdout as string)); - } - } - ); - }), - 8000 - ); + const audioData = await synthesizeSentence(sentence, voiceId); buffers.push(audioData); } catch (err) { const nodeErr = err as NodeJS.ErrnoException; @@ -166,6 +195,37 @@ export function voicePipelineService() { return Buffer.concat(buffers); } + async function* synthesizeSentenceStream( + text: string, + voiceId?: string + ): AsyncGenerator<{ index: number; total: number; audio: Buffer }> { + const sentences = splitSentences(text); + const total = sentences.length; + + for (let index = 0; index < sentences.length; index++) { + try { + const audio = await synthesizeSentence(sentences[index], voiceId); + yield { index, total, audio }; + } catch (err) { + const nodeErr = err as NodeJS.ErrnoException; + if (nodeErr.code === "ENOENT") { + throw new Error("Piper TTS not available. Install piper for voice output."); + } + throw err; + } + } + } + + async function synthesizeMultiLang(text: string, voiceIds: string[]): Promise> { + const results = await Promise.all( + voiceIds.map(async (voiceId) => { + const audio = await synthesize(text, voiceId); + return [voiceId, audio] as [string, Buffer]; + }) + ); + return new Map(results); + } + function formatForVoice(text: string): string { if (!text) return ""; @@ -212,5 +272,5 @@ export function voicePipelineService() { return result.trim(); } - return { transcribe, synthesize, formatForVoice, transcodeToWav16k }; + return { transcribe, synthesize, synthesizeSentenceStream, synthesizeMultiLang, formatForVoice, transcodeToWav16k }; }