feat(39-01): sentence-buffered TTS streaming + multi-language synthesis
- Export splitSentences() with title-abbreviation protection (Dr., Mr., etc.)
- Add synthesizeSentenceStream() AsyncGenerator yielding per-sentence audio chunks
- Add synthesizeMultiLang() synthesizing the same text in N voices via Promise.all
- Add POST /api/synthesize/stream SSE endpoint with base64 audio per sentence
- Add POST /api/synthesize/multi-lang returning an array of voiceId+audio pairs
- Existing POST /api/synthesize unchanged (backward compatible)
This commit is contained in:
parent
6be251a9fb
commit
22beb245f2
2 changed files with 142 additions and 25 deletions
|
|
@ -43,5 +43,62 @@ export function voiceRoutes(): Router {
|
|||
res.send(audioBuffer);
|
||||
});
|
||||
|
||||
// POST /api/synthesize/stream — sentence-buffered SSE streaming TTS.
//
// Request body: { text: string, voiceId?: string }.
// Response: a text/event-stream whose `data:` payloads are JSON objects:
//   { index, total, audio }  one per synthesized sentence (audio is base64)
//   { done: true }           after the last sentence
//   { error: message }       if synthesis fails once the stream is open
// Validation failures are answered with a plain 400 JSON body before any
// SSE headers are sent.
router.post("/synthesize/stream", async (req, res) => {
  assertBoard(req);

  // Tolerate a missing body so the client gets a 400 instead of a TypeError.
  const { text, voiceId } = (req.body ?? {}) as { text?: unknown; voiceId?: unknown };
  if (!text || typeof text !== "string") {
    res.status(400).json({ error: "text is required" });
    return;
  }
  // voiceId is client-supplied and is forwarded to the synthesizer; reject
  // anything that is not a string (null/undefined still mean "use default").
  if (voiceId != null && typeof voiceId !== "string") {
    res.status(400).json({ error: "voiceId must be a string" });
    return;
  }
  const voice = typeof voiceId === "string" ? voiceId : undefined;

  res.setHeader("Content-Type", "text/event-stream");
  res.setHeader("Cache-Control", "no-cache");
  res.setHeader("Connection", "keep-alive");
  res.flushHeaders();

  // 'close' firing before we call res.end() means the client went away;
  // remember it so we stop synthesizing audio nobody will receive.
  let clientGone = false;
  res.on("close", () => {
    clientGone = true;
  });

  const sendEvent = (event: Record<string, unknown>): void => {
    res.write(`data: ${JSON.stringify(event)}\n\n`);
  };

  try {
    for await (const chunk of svc.synthesizeSentenceStream(text, voice)) {
      // Leaving the loop also finalizes the async generator, so no further
      // sentences are synthesized.
      if (clientGone) break;
      sendEvent({
        index: chunk.index,
        total: chunk.total,
        audio: chunk.audio.toString("base64"),
      });
    }
    if (!clientGone) sendEvent({ done: true });
  } catch (err) {
    // Headers are already sent, so the failure is reported in-band.
    const message = err instanceof Error ? err.message : "Synthesis failed";
    if (!clientGone) sendEvent({ error: message });
  } finally {
    res.end();
  }
});
|
||||
|
||||
// POST /api/synthesize/multi-lang — render one text with several voices.
//
// Request body: { text: string, voiceIds: string[] } with 1-5 voice ids.
// Response: { results: [{ voiceId, audio }] } where audio is base64-encoded.
// Invalid input is answered with a 400 and an { error } body.
router.post("/synthesize/multi-lang", async (req, res) => {
  assertBoard(req);

  const body = req.body as { text?: string; voiceIds?: unknown };
  const text = body.text;
  const voiceIds = body.voiceIds;

  const badRequest = (error: string): void => {
    res.status(400).json({ error });
  };

  if (!text || typeof text !== "string") {
    return badRequest("text is required");
  }
  if (!(Array.isArray(voiceIds) && voiceIds.length >= 1 && voiceIds.length <= 5)) {
    return badRequest("voiceIds must be an array with 1-5 entries");
  }
  if (voiceIds.some((candidate) => typeof candidate !== "string")) {
    return badRequest("voiceIds must be an array of strings");
  }

  const audioByVoice = await svc.synthesizeMultiLang(text, voiceIds as string[]);

  // Flatten the voiceId -> audio map into a JSON-friendly list.
  const results: { voiceId: string; audio: string }[] = [];
  for (const [voiceId, audio] of audioByVoice.entries()) {
    results.push({ voiceId, audio: audio.toString("base64") });
  }
  res.json({ results });
});
|
||||
|
||||
return router;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -24,6 +24,32 @@ function execFileAsync(
|
|||
});
|
||||
}
|
||||
|
||||
/**
 * Break a block of text into individual sentences.
 *
 * A sentence boundary is any run of whitespace that directly follows `.`,
 * `!` or `?`. Common title abbreviations (Dr., Mr., Mrs., Prof., ...) are
 * shielded first so the period after the title does not end the sentence;
 * the whitespace after such a title is normalized to a single space.
 * Other abbreviations and acronyms (D.C., U.S., ...) are not shielded and
 * still produce a split when followed by whitespace.
 *
 * @param text Text to split; empty or whitespace-only input yields `[]`.
 * @returns The trimmed, non-empty sentences in their original order.
 */
export function splitSentences(text: string): string[] {
  if (!text || text.trim().length === 0) return [];

  // NUL stands in for the whitespace after a title; `\s` never matches it,
  // so the boundary regex below cannot split there.
  const marker = "\x00";
  const titlePattern =
    /\b(Mr|Mrs|Ms|Dr|Prof|Sr|Jr|Rev|Gen|Col|Sgt|Cpl|Pvt|Lt|Cmdr|Capt|Gov|Rep|Sen)\.\s+/g;
  const shielded = text.replace(titlePattern, (_match, title: string) => title + "." + marker);

  const sentences: string[] = [];
  for (const piece of shielded.split(/(?<=[.!?])\s+/)) {
    // Turn every marker back into a plain space before trimming.
    const restored = piece.split(marker).join(" ").trim();
    if (restored.length > 0) {
      sentences.push(restored);
    }
  }
  return sentences;
}
|
||||
|
||||
export function voicePipelineService() {
|
||||
if (!ffmpegPath) {
|
||||
throw new Error("ffmpeg-static binary not found on this platform");
|
||||
|
|
@ -124,35 +150,38 @@ export function voicePipelineService() {
|
|||
}
|
||||
}
|
||||
|
||||
async function synthesize(text: string, voiceId?: string): Promise<Buffer> {
|
||||
const sentences = text.split(/(?<=[.!?])\s+/).filter((s) => s.length > 0);
|
||||
/**
 * Synthesize a single sentence by running the `piper` CLI with
 * `--output-raw` and collecting what it prints to stdout.
 *
 * The sentence is written to the child's stdin. The asynchronous `execFile`
 * has no `input` option (only the synchronous variants accept it), so
 * passing the text through the options object would leave piper with
 * nothing to read.
 *
 * NOTE(review): assumes `execFileCb` is Node's `child_process.execFile`
 * and that `withTimeout` rejects once the given number of milliseconds
 * elapses — confirm against the imports/helpers of this file.
 *
 * @param sentence Text to speak.
 * @param voiceId Value passed to piper's `--model`; when omitted or empty,
 *   "en_US-lessac-medium" is used.
 * @returns The bytes piper wrote to stdout.
 * @throws Rejects with the error reported by `execFile` (for example one
 *   whose `code` is "ENOENT" when the binary cannot be found) or with
 *   whatever `withTimeout` raises.
 */
async function synthesizeSentence(sentence: string, voiceId?: string): Promise<Buffer> {
  const timeoutMs = 8000;

  return withTimeout(
    new Promise<Buffer>((resolve, reject) => {
      const child = execFileCb(
        "piper",
        // `||` on purpose: an empty voiceId also falls back to the default.
        ["--model", voiceId || "en_US-lessac-medium", "--output-raw"],
        {
          timeout: timeoutMs,
          maxBuffer: 10 * 1024 * 1024,
          // Ask for raw bytes; the default utf8 decoding would corrupt
          // binary audio data.
          encoding: "buffer",
        },
        (err: Error | null, stdout: string | Buffer) => {
          if (err) {
            reject(err);
            return;
          }
          resolve(Buffer.isBuffer(stdout) ? stdout : Buffer.from(stdout));
        }
      );

      const stdin = child?.stdin;
      if (stdin) {
        // A failed spawn or an early exit surfaces as an 'error' on stdin
        // (e.g. EPIPE). The execFile callback already reports the failure,
        // so swallow it here to avoid an unhandled 'error' event.
        stdin.on("error", () => {});
        stdin.end(sentence);
      }
    }),
    timeoutMs
  );
}
|
||||
|
||||
async function synthesize(text: string, voiceId?: string): Promise<Buffer> {
|
||||
const sentences = splitSentences(text);
|
||||
const buffers: Buffer[] = [];
|
||||
|
||||
for (const sentence of sentences) {
|
||||
try {
|
||||
const audioData = await withTimeout(
|
||||
new Promise<Buffer>((resolve, reject) => {
|
||||
execFileCb(
|
||||
"piper",
|
||||
["--model", voiceId || "en_US-lessac-medium", "--output-raw"],
|
||||
{
|
||||
timeout: 8000,
|
||||
maxBuffer: 10 * 1024 * 1024,
|
||||
// @ts-ignore - input option is valid for execFile
|
||||
input: sentence,
|
||||
},
|
||||
(err: Error | null, stdout: string | Buffer) => {
|
||||
if (err) {
|
||||
reject(err);
|
||||
} else {
|
||||
resolve(Buffer.isBuffer(stdout) ? stdout : Buffer.from(stdout as string));
|
||||
}
|
||||
}
|
||||
);
|
||||
}),
|
||||
8000
|
||||
);
|
||||
const audioData = await synthesizeSentence(sentence, voiceId);
|
||||
buffers.push(audioData);
|
||||
} catch (err) {
|
||||
const nodeErr = err as NodeJS.ErrnoException;
|
||||
|
|
@ -166,6 +195,37 @@ export function voicePipelineService() {
|
|||
return Buffer.concat(buffers);
|
||||
}
|
||||
|
||||
/**
 * Lazily synthesize `text` one sentence at a time.
 *
 * The text is split with `splitSentences`; each sentence is synthesized
 * only when the consumer asks for the next value, so audio for the first
 * sentence is available before later ones have been processed.
 *
 * @param text Text to speak.
 * @param voiceId Optional voice forwarded to `synthesizeSentence`.
 * @yields `{ index, total, audio }` — the zero-based sentence position, the
 *   number of sentences, and that sentence's audio.
 * @throws An Error with an installation hint when synthesis fails with
 *   `code === "ENOENT"`; any other failure is rethrown unchanged.
 */
async function* synthesizeSentenceStream(
  text: string,
  voiceId?: string
): AsyncGenerator<{ index: number; total: number; audio: Buffer }> {
  const sentences = splitSentences(text);
  const total = sentences.length;

  for (const [index, sentence] of sentences.entries()) {
    try {
      yield { index, total, audio: await synthesizeSentence(sentence, voiceId) };
    } catch (err) {
      if ((err as NodeJS.ErrnoException).code === "ENOENT") {
        throw new Error("Piper TTS not available. Install piper for voice output.");
      }
      throw err;
    }
  }
}
|
||||
|
||||
/**
 * Synthesize the same text once per requested voice, running the voices
 * concurrently.
 *
 * Duplicate voice ids are collapsed before synthesis: the returned Map can
 * hold only one entry per id, so synthesizing a repeated id again would
 * just be discarded work.
 *
 * @param text Text to speak.
 * @param voiceIds Voices to render; order of first occurrence is kept.
 * @returns Map from voice id to the audio returned by `synthesize`.
 * @throws Rejects with the first error raised by any `synthesize` call.
 */
async function synthesizeMultiLang(text: string, voiceIds: string[]): Promise<Map<string, Buffer>> {
  const uniqueVoiceIds = Array.from(new Set(voiceIds));

  const entries = await Promise.all(
    uniqueVoiceIds.map(
      async (voiceId): Promise<[string, Buffer]> => [voiceId, await synthesize(text, voiceId)]
    )
  );

  return new Map(entries);
}
|
||||
|
||||
function formatForVoice(text: string): string {
|
||||
if (!text) return "";
|
||||
|
||||
|
|
@ -212,5 +272,5 @@ export function voicePipelineService() {
|
|||
return result.trim();
|
||||
}
|
||||
|
||||
return { transcribe, synthesize, formatForVoice, transcodeToWav16k };
|
||||
return { transcribe, synthesize, synthesizeSentenceStream, synthesizeMultiLang, formatForVoice, transcodeToWav16k };
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue