feat(39-01): sentence-buffered TTS streaming + multi-language synthesis
- Export splitSentences() with title-abbreviation protection (Dr., Mr., etc.)
- Add synthesizeSentenceStream() AsyncGenerator yielding per-sentence audio chunks
- Add synthesizeMultiLang() synthesizing the same text in N voices via Promise.all
- Add POST /api/synthesize/stream SSE endpoint with base64 audio per sentence
- Add POST /api/synthesize/multi-lang returning an array of voiceId+audio pairs
- Existing POST /api/synthesize unchanged (backward compatible)
This commit is contained in:
parent
6be251a9fb
commit
22beb245f2
2 changed files with 142 additions and 25 deletions
|
|
@ -43,5 +43,62 @@ export function voiceRoutes(): Router {
|
||||||
res.send(audioBuffer);
|
res.send(audioBuffer);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// POST /api/synthesize/stream — sentence-buffered SSE streaming TTS
//
// Request body: { text: string; voiceId?: string }
// Response: a text/event-stream where each event's `data:` line is JSON:
//   { index, total, audio }  one per synthesized chunk, audio base64-encoded
//   { done: true }           after the last chunk
//   { error: string }        if synthesis throws part-way through
// Validation failures are answered with a plain 400 JSON body before any
// SSE headers are sent.
router.post("/synthesize/stream", async (req, res) => {
  assertBoard(req);

  const { text, voiceId } = req.body as { text?: unknown; voiceId?: unknown };
  if (!text || typeof text !== "string") {
    res.status(400).json({ error: "text is required" });
    return;
  }
  // voiceId is optional, but when supplied it must be a string, matching the
  // validation the multi-lang route applies to its voiceIds.
  if (voiceId != null && typeof voiceId !== "string") {
    res.status(400).json({ error: "voiceId must be a string" });
    return;
  }
  const voice = typeof voiceId === "string" ? voiceId : undefined;

  res.setHeader("Content-Type", "text/event-stream");
  res.setHeader("Cache-Control", "no-cache");
  res.setHeader("Connection", "keep-alive");
  res.flushHeaders();

  // Once the client goes away there is no point synthesizing the remaining
  // sentences or writing to the response.
  let clientGone = false;
  res.on("close", () => {
    clientGone = true;
  });

  try {
    for await (const chunk of svc.synthesizeSentenceStream(text, voice)) {
      // Leaving the loop early also finishes the generator, so no further
      // sentences are synthesized.
      if (clientGone) break;
      const payload = JSON.stringify({
        index: chunk.index,
        total: chunk.total,
        audio: chunk.audio.toString("base64"),
      });
      res.write(`data: ${payload}\n\n`);
    }
    if (!clientGone) {
      res.write(`data: ${JSON.stringify({ done: true })}\n\n`);
    }
  } catch (err) {
    // Headers are already sent, so failures are reported in-band as an event.
    const message = err instanceof Error ? err.message : "Synthesis failed";
    if (!clientGone) {
      res.write(`data: ${JSON.stringify({ error: message })}\n\n`);
    }
  } finally {
    res.end();
  }
});
|
||||||
|
|
||||||
|
// POST /api/synthesize/multi-lang — render one text with several voices
//
// Request body: { text: string; voiceIds: string[] } with 1-5 voice IDs.
// Response: { results: Array<{ voiceId, audio }> } where audio is base64.
router.post("/synthesize/multi-lang", async (req, res) => {
  assertBoard(req);

  const { text, voiceIds } = req.body as { text?: string; voiceIds?: unknown };

  // Small helper so every validation failure is reported the same way.
  const badRequest = (error: string): void => {
    res.status(400).json({ error });
  };

  if (!text || typeof text !== "string") {
    badRequest("text is required");
    return;
  }
  if (!Array.isArray(voiceIds) || voiceIds.length < 1 || voiceIds.length > 5) {
    badRequest("voiceIds must be an array with 1-5 entries");
    return;
  }
  if (voiceIds.some((candidate) => typeof candidate !== "string")) {
    badRequest("voiceIds must be an array of strings");
    return;
  }

  const audioByVoice = await svc.synthesizeMultiLang(text, voiceIds as string[]);

  // Flatten the voiceId -> audio map into a JSON-friendly list.
  const results: { voiceId: string; audio: string }[] = [];
  for (const [voiceId, audio] of audioByVoice.entries()) {
    results.push({ voiceId, audio: audio.toString("base64") });
  }
  res.json({ results });
});
|
||||||
|
|
||||||
return router;
|
return router;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -24,6 +24,32 @@ function execFileAsync(
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Breaks text into sentences for per-sentence speech synthesis.
 *
 * A boundary is any run of whitespace that directly follows `.`, `!` or `?`.
 * Periods that belong to a known title abbreviation (Dr., Mr., Mrs., ...) are
 * shielded first so they do not end a sentence; the whitespace after such a
 * title is normalized to a single space. Other abbreviations and acronyms
 * (for example "D.C.") are not shielded and still produce a split.
 *
 * @param text - Text to split.
 * @returns Trimmed, non-empty sentences in their original order; an empty
 *   array when the input is empty or only whitespace.
 */
export function splitSentences(text: string): string[] {
  if (!text || !text.trim()) return [];

  // NUL stands in for the whitespace after a title while splitting. Any NUL
  // already present in the input is turned into a space by the restore step.
  const marker = "\x00";

  const shielded = text.replace(
    /\b(Mr|Mrs|Ms|Dr|Prof|Sr|Jr|Rev|Gen|Col|Sgt|Cpl|Pvt|Lt|Cmdr|Capt|Gov|Rep|Sen)\.\s+/g,
    (_match: string, title: string) => title + "." + marker
  );

  const sentences: string[] = [];
  for (const fragment of shielded.split(/(?<=[.!?])\s+/)) {
    if (fragment.length === 0) continue;
    // Put the shielded gaps back as ordinary spaces.
    const restored = fragment.split(marker).join(" ").trim();
    if (restored.length > 0) sentences.push(restored);
  }
  return sentences;
}
|
||||||
|
|
||||||
export function voicePipelineService() {
|
export function voicePipelineService() {
|
||||||
if (!ffmpegPath) {
|
if (!ffmpegPath) {
|
||||||
throw new Error("ffmpeg-static binary not found on this platform");
|
throw new Error("ffmpeg-static binary not found on this platform");
|
||||||
|
|
@ -124,35 +150,38 @@ export function voicePipelineService() {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async function synthesize(text: string, voiceId?: string): Promise<Buffer> {
|
/**
 * Synthesizes one sentence by running the `piper` CLI and collecting its raw
 * audio output.
 *
 * The sentence is written to piper's stdin. The asynchronous `execFile` has
 * no `input` option (only the synchronous variants do), so passing the text
 * through the options object never delivers it to the child.
 * NOTE(review): this assumes `execFileCb` is Node's `child_process.execFile`
 * — confirm against the import at the top of the file.
 *
 * @param sentence - Text to speak.
 * @param voiceId - Piper model passed to `--model`; falls back to
 *   "en_US-lessac-medium" when omitted or empty.
 *   NOTE(review): callers pass request-supplied values here; consider
 *   restricting them to a known set of installed models.
 * @returns The bytes piper wrote to stdout (`--output-raw`).
 * @throws Whatever error `execFile` reports (for example `ENOENT` when piper
 *   is not installed), or the rejection from `withTimeout`.
 */
async function synthesizeSentence(sentence: string, voiceId?: string): Promise<Buffer> {
  return withTimeout(
    new Promise<Buffer>((resolve, reject) => {
      const child = execFileCb(
        "piper",
        ["--model", voiceId || "en_US-lessac-medium", "--output-raw"],
        {
          timeout: 8000,
          maxBuffer: 10 * 1024 * 1024,
          // Raw audio must not be decoded as UTF-8 text; ask for a Buffer.
          encoding: "buffer",
        },
        (err: Error | null, stdout: string | Buffer) => {
          if (err) {
            reject(err);
          } else {
            resolve(Buffer.isBuffer(stdout) ? stdout : Buffer.from(stdout));
          }
        }
      );

      // If piper fails to start or exits early, the stdin write can fail.
      // The execFile callback above already reports that failure, so this
      // listener only prevents an unhandled 'error' event on the stream.
      child.stdin?.on("error", () => {
        /* reported through the execFile callback */
      });
      child.stdin?.end(sentence);
    }),
    8000
  );
}
|
||||||
|
|
||||||
|
async function synthesize(text: string, voiceId?: string): Promise<Buffer> {
|
||||||
|
const sentences = splitSentences(text);
|
||||||
const buffers: Buffer[] = [];
|
const buffers: Buffer[] = [];
|
||||||
|
|
||||||
for (const sentence of sentences) {
|
for (const sentence of sentences) {
|
||||||
try {
|
try {
|
||||||
const audioData = await withTimeout(
|
const audioData = await synthesizeSentence(sentence, voiceId);
|
||||||
new Promise<Buffer>((resolve, reject) => {
|
|
||||||
execFileCb(
|
|
||||||
"piper",
|
|
||||||
["--model", voiceId || "en_US-lessac-medium", "--output-raw"],
|
|
||||||
{
|
|
||||||
timeout: 8000,
|
|
||||||
maxBuffer: 10 * 1024 * 1024,
|
|
||||||
// @ts-ignore - input option is valid for execFile
|
|
||||||
input: sentence,
|
|
||||||
},
|
|
||||||
(err: Error | null, stdout: string | Buffer) => {
|
|
||||||
if (err) {
|
|
||||||
reject(err);
|
|
||||||
} else {
|
|
||||||
resolve(Buffer.isBuffer(stdout) ? stdout : Buffer.from(stdout as string));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
);
|
|
||||||
}),
|
|
||||||
8000
|
|
||||||
);
|
|
||||||
buffers.push(audioData);
|
buffers.push(audioData);
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
const nodeErr = err as NodeJS.ErrnoException;
|
const nodeErr = err as NodeJS.ErrnoException;
|
||||||
|
|
@ -166,6 +195,37 @@ export function voicePipelineService() {
|
||||||
return Buffer.concat(buffers);
|
return Buffer.concat(buffers);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Lazily synthesizes `text` one sentence at a time.
 *
 * The text is split with `splitSentences`, then each sentence is synthesized
 * in order and yielded as soon as its audio is ready, so a consumer can start
 * using early sentences before later ones exist.
 *
 * @param text - Full text to speak.
 * @param voiceId - Optional voice forwarded to `synthesizeSentence`.
 * @returns An async generator of `{ index, total, audio }`, where `index` is
 *   the zero-based sentence position and `total` the number of sentences.
 * @throws Error with an install hint when the failure has code `ENOENT`;
 *   any other error is rethrown unchanged.
 */
async function* synthesizeSentenceStream(
  text: string,
  voiceId?: string
): AsyncGenerator<{ index: number; total: number; audio: Buffer }> {
  const sentences = splitSentences(text);
  const total = sentences.length;

  for (const [index, sentence] of sentences.entries()) {
    try {
      yield { index, total, audio: await synthesizeSentence(sentence, voiceId) };
    } catch (cause) {
      // ENOENT is translated into an actionable message for the caller.
      if ((cause as NodeJS.ErrnoException).code === "ENOENT") {
        throw new Error("Piper TTS not available. Install piper for voice output.");
      }
      throw cause;
    }
  }
}
|
||||||
|
|
||||||
|
/**
 * Synthesizes the same text once per requested voice, all voices in parallel.
 *
 * Duplicate voice IDs are collapsed before synthesis: the returned Map can
 * hold only one entry per ID, so synthesizing a repeated ID again would only
 * spend time producing audio that is then discarded.
 *
 * @param text - Text to speak.
 * @param voiceIds - Voices to render; order of first occurrence is kept.
 * @returns Map from voice ID to the audio `synthesize` produced for it.
 * @throws The first rejection from `synthesize`; no partial result is
 *   returned in that case.
 */
async function synthesizeMultiLang(text: string, voiceIds: string[]): Promise<Map<string, Buffer>> {
  const uniqueVoiceIds = [...new Set(voiceIds)];
  const results = await Promise.all(
    uniqueVoiceIds.map(async (voiceId): Promise<[string, Buffer]> => {
      const audio = await synthesize(text, voiceId);
      return [voiceId, audio];
    })
  );
  return new Map(results);
}
|
||||||
|
|
||||||
function formatForVoice(text: string): string {
|
function formatForVoice(text: string): string {
|
||||||
if (!text) return "";
|
if (!text) return "";
|
||||||
|
|
||||||
|
|
@ -212,5 +272,5 @@ export function voicePipelineService() {
|
||||||
return result.trim();
|
return result.trim();
|
||||||
}
|
}
|
||||||
|
|
||||||
return { transcribe, synthesize, formatForVoice, transcodeToWav16k };
|
return { transcribe, synthesize, synthesizeSentenceStream, synthesizeMultiLang, formatForVoice, transcodeToWav16k };
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue