feat(39-01): sentence-buffered TTS streaming + multi-language synthesis

- Export splitSentences() with title-abbreviation protection (Dr., Mr. etc.)
- Add synthesizeSentenceStream() AsyncGenerator yielding per-sentence audio chunks
- Add synthesizeMultiLang() synthesizing same text in N voices via Promise.all
- Add POST /api/synthesize/stream SSE endpoint with base64 audio per sentence
- Add POST /api/synthesize/multi-lang returning array of voiceId+audio pairs
- Existing POST /api/synthesize unchanged (backward compatible)
This commit is contained in:
Nexus Dev 2026-04-04 03:32:10 +00:00
parent e61f471a62
commit b95634c61a
2 changed files with 142 additions and 25 deletions

View file

@ -43,5 +43,62 @@ export function voiceRoutes(): Router {
res.send(audioBuffer);
});
// POST /api/synthesize/stream — sentence-buffered SSE streaming TTS.
//
// Request body: { text: string; voiceId?: string }.
// Response: a text/event-stream where each event's `data:` line is JSON:
//   { index, total, audio }  one per synthesized sentence (audio is base64)
//   { done: true }           after the last sentence
//   { error: message }       if synthesis throws part-way through
// A missing or non-string `text` is rejected with HTTP 400 before the stream starts.
router.post("/synthesize/stream", async (req, res) => {
  assertBoard(req);
  const { text, voiceId } = req.body as { text?: string; voiceId?: string };
  if (!text || typeof text !== "string") {
    res.status(400).json({ error: "text is required" });
    return;
  }

  res.setHeader("Content-Type", "text/event-stream");
  res.setHeader("Cache-Control", "no-cache");
  res.setHeader("Connection", "keep-alive");
  // Send the headers now so the client sees the stream open before the first sentence is ready.
  res.flushHeaders();

  // Track client disconnects so we stop synthesizing audio nobody will receive.
  // The response's "close" event is used because it fires when the underlying
  // connection is terminated before the response has completed.
  let clientGone = false;
  res.on("close", () => {
    clientGone = true;
  });

  try {
    for await (const chunk of svc.synthesizeSentenceStream(text, voiceId)) {
      // Leaving the loop ends the generator, so no further sentences are synthesized.
      if (clientGone) break;
      const payload = JSON.stringify({
        index: chunk.index,
        total: chunk.total,
        audio: chunk.audio.toString("base64"),
      });
      res.write(`data: ${payload}\n\n`);
    }
    if (!clientGone) {
      res.write(`data: ${JSON.stringify({ done: true })}\n\n`);
    }
  } catch (err) {
    // Headers are already sent, so the failure is reported in-band as an SSE event.
    const message = err instanceof Error ? err.message : "Synthesis failed";
    if (!clientGone) {
      res.write(`data: ${JSON.stringify({ error: message })}\n\n`);
    }
  } finally {
    res.end();
  }
});
// POST /api/synthesize/multi-lang — render one text with several voices.
//
// Request body: { text: string; voiceIds: string[] } with 1 to 5 voice ids.
// Response: { results: [{ voiceId, audio }] } where audio is base64-encoded.
// Invalid input is answered with HTTP 400 and an { error } body.
router.post("/synthesize/multi-lang", async (req, res) => {
  assertBoard(req);
  const { text, voiceIds } = req.body as { text?: string; voiceIds?: unknown };

  if (typeof text !== "string" || !text) {
    res.status(400).json({ error: "text is required" });
    return;
  }

  // Must be an array holding between one and five entries.
  if (!(Array.isArray(voiceIds) && voiceIds.length >= 1 && voiceIds.length <= 5)) {
    res.status(400).json({ error: "voiceIds must be an array with 1-5 entries" });
    return;
  }

  // Every entry has to be a string before it is handed to the service.
  if (voiceIds.some((candidate) => typeof candidate !== "string")) {
    res.status(400).json({ error: "voiceIds must be an array of strings" });
    return;
  }

  const audioByVoice = await svc.synthesizeMultiLang(text, voiceIds as string[]);
  const results: { voiceId: string; audio: string }[] = [];
  for (const [voiceId, audio] of audioByVoice.entries()) {
    results.push({ voiceId, audio: audio.toString("base64") });
  }
  res.json({ results });
});
return router;
}

View file

@ -24,6 +24,32 @@ function execFileAsync(
});
}
/**
 * Breaks a block of text into individual sentences.
 *
 * A sentence boundary is any run of whitespace that directly follows ".", "!"
 * or "?". The period of a listed title abbreviation (Mr., Dr., Prof., ...) is
 * not treated as a boundary: the whitespace after it is temporarily swapped
 * for a NUL placeholder and later restored as a single space. Dotted acronyms
 * such as "D.C." get no such protection and still end a sentence when they
 * are followed by whitespace.
 *
 * @param text - Text to split; empty or whitespace-only input yields `[]`.
 * @returns The trimmed, non-empty sentences in their original order.
 */
export function splitSentences(text: string): string[] {
  if (!text || text.trim() === "") {
    return [];
  }

  const marker = "\x00";
  const titlePattern =
    /\b(Mr|Mrs|Ms|Dr|Prof|Sr|Jr|Rev|Gen|Col|Sgt|Cpl|Pvt|Lt|Cmdr|Capt|Gov|Rep|Sen)\.\s+/g;

  // Glue each title to the word after it so the boundary split below skips it.
  const shielded = text.replace(titlePattern, (_match, title) => `${title}.${marker}`);

  const sentences: string[] = [];
  for (const piece of shielded.split(/(?<=[.!?])\s+/)) {
    // Turn every placeholder back into a space, then drop surrounding whitespace.
    const sentence = piece.split(marker).join(" ").trim();
    if (sentence.length > 0) {
      sentences.push(sentence);
    }
  }
  return sentences;
}
export function voicePipelineService() {
if (!ffmpegPath) {
throw new Error("ffmpeg-static binary not found on this platform");
@ -124,35 +150,38 @@ export function voicePipelineService() {
}
}
async function synthesize(text: string, voiceId?: string): Promise<Buffer> {
const sentences = text.split(/(?<=[.!?])\s+/).filter((s) => s.length > 0);
/**
 * Synthesizes a single sentence by running the `piper` CLI with `--output-raw`
 * and collecting everything it writes to stdout.
 *
 * The sentence is written to the child's stdin and stdin is then closed. The
 * asynchronous `execFile` API has no `input` option (only the synchronous
 * variants do), so passing the text through the options object never reaches
 * the process.
 * NOTE(review): this assumes `execFileCb` is Node's `child_process.execFile`
 * and returns the ChildProcess — confirm against the file's imports.
 *
 * @param sentence - Text to speak.
 * @param voiceId - Piper model name; falls back to "en_US-lessac-medium" when falsy.
 * @returns The bytes piper wrote to stdout.
 * @throws Whatever error `execFile` reports (e.g. code "ENOENT" when piper is
 *   not installed), or whatever `withTimeout` raises (presumably when the
 *   8000 ms budget is exceeded — confirm against its definition).
 */
async function synthesizeSentence(sentence: string, voiceId?: string): Promise<Buffer> {
  const timeoutMs = 8000;
  return withTimeout(
    new Promise<Buffer>((resolve, reject) => {
      const child = execFileCb(
        "piper",
        ["--model", voiceId || "en_US-lessac-medium", "--output-raw"],
        {
          timeout: timeoutMs,
          maxBuffer: 10 * 1024 * 1024,
          // stdout is binary audio; the default "utf8" encoding would decode it
          // to a string and corrupt the bytes.
          encoding: "buffer",
        },
        (err: Error | null, stdout: string | Buffer) => {
          if (err) {
            reject(err);
          } else {
            resolve(Buffer.isBuffer(stdout) ? stdout : Buffer.from(stdout));
          }
        }
      );
      // If the process fails to start or exits early, writing to stdin can emit
      // an error (e.g. EPIPE). Process failures are already reported through the
      // callback above, so swallow it here to avoid an unhandled stream error.
      child.stdin?.on("error", () => {});
      child.stdin?.end(sentence);
    }),
    timeoutMs
  );
}
async function synthesize(text: string, voiceId?: string): Promise<Buffer> {
const sentences = splitSentences(text);
const buffers: Buffer[] = [];
for (const sentence of sentences) {
try {
const audioData = await withTimeout(
new Promise<Buffer>((resolve, reject) => {
execFileCb(
"piper",
["--model", voiceId || "en_US-lessac-medium", "--output-raw"],
{
timeout: 8000,
maxBuffer: 10 * 1024 * 1024,
// @ts-ignore - input option is valid for execFile
input: sentence,
},
(err: Error | null, stdout: string | Buffer) => {
if (err) {
reject(err);
} else {
resolve(Buffer.isBuffer(stdout) ? stdout : Buffer.from(stdout as string));
}
}
);
}),
8000
);
const audioData = await synthesizeSentence(sentence, voiceId);
buffers.push(audioData);
} catch (err) {
const nodeErr = err as NodeJS.ErrnoException;
@ -166,6 +195,37 @@ export function voicePipelineService() {
return Buffer.concat(buffers);
}
/**
 * Lazily synthesizes `text` one sentence at a time.
 *
 * The text is split with `splitSentences`, and each sentence is synthesized
 * only when the consumer asks for the next value.
 *
 * @param text - Text to speak.
 * @param voiceId - Optional voice/model id forwarded to `synthesizeSentence`.
 * @yields `{ index, total, audio }` — the zero-based sentence position, the
 *   number of sentences, and the audio produced for that sentence.
 * @throws Error("Piper TTS not available. Install piper for voice output.")
 *   when the failure carries code "ENOENT"; any other error is rethrown as-is.
 */
async function* synthesizeSentenceStream(
  text: string,
  voiceId?: string
): AsyncGenerator<{ index: number; total: number; audio: Buffer }> {
  const parts = splitSentences(text);
  const total = parts.length;
  let index = 0;
  for (const part of parts) {
    try {
      yield { index, total, audio: await synthesizeSentence(part, voiceId) };
    } catch (err) {
      // Translate "binary not found" into an actionable message; pass everything else through.
      if ((err as NodeJS.ErrnoException).code !== "ENOENT") {
        throw err;
      }
      throw new Error("Piper TTS not available. Install piper for voice output.");
    }
    index += 1;
  }
}
/**
 * Synthesizes the same text once per requested voice, running the voices
 * concurrently via `Promise.all`.
 *
 * Duplicate ids are removed before any work starts. The returned Map can hold
 * only one entry per id anyway, so synthesizing a repeated id again would be
 * wasted work. Key order follows the first occurrence of each id.
 *
 * @param text - Text to speak.
 * @param voiceIds - Voice/model ids to synthesize with.
 * @returns Map from voice id to the audio produced by `synthesize`.
 * @throws Rejects with the first error raised by any voice's synthesis.
 */
async function synthesizeMultiLang(text: string, voiceIds: string[]): Promise<Map<string, Buffer>> {
  const uniqueVoiceIds = [...new Set(voiceIds)];
  const entries = await Promise.all(
    uniqueVoiceIds.map(
      async (voiceId): Promise<[string, Buffer]> => [voiceId, await synthesize(text, voiceId)]
    )
  );
  return new Map(entries);
}
function formatForVoice(text: string): string {
if (!text) return "";
@ -212,5 +272,5 @@ export function voicePipelineService() {
return result.trim();
}
return { transcribe, synthesize, formatForVoice, transcodeToWav16k };
return { transcribe, synthesize, synthesizeSentenceStream, synthesizeMultiLang, formatForVoice, transcodeToWav16k };
}