feat(39-01): sentence-buffered TTS streaming + multi-language synthesis

- Export splitSentences() with title-abbreviation protection (Dr., Mr., etc.)
- Add synthesizeSentenceStream() AsyncGenerator yielding per-sentence audio chunks
- Add synthesizeMultiLang() synthesizing the same text in N voices via Promise.all
- Add POST /api/synthesize/stream SSE endpoint with base64 audio per sentence
- Add POST /api/synthesize/multi-lang returning an array of voiceId+audio pairs
- Existing POST /api/synthesize unchanged (backward compatible)
2026-04-04 03:32:10 +00:00 · 2026-04-04 03:32:10 +00:00 · b95634c61a
commit b95634c61a
parent e61f471a62
2 changed files with 142 additions and 25 deletions
--- a/server/src/routes/voice.ts
+++ b/server/src/routes/voice.ts
@ -43,5 +43,62 @@ export function voiceRoutes(): Router {
    res.send(audioBuffer);
  });

+  // POST /api/synthesize/stream — sentence-buffered SSE streaming TTS.
+  //
+  // Request body: { text: string, voiceId?: string }.
+  // Responds 400 with { error } when `text` is missing or not a string.
+  // Otherwise streams `text/event-stream`: one `data:` event per synthesized
+  // sentence carrying { index, total, audio } (audio is base64), followed by a
+  // final { done: true } event. A failure after the headers have been flushed
+  // cannot change the status code, so it is reported in-band as { error }.
+  router.post("/synthesize/stream", async (req, res) => {
+    assertBoard(req);
+    const { text, voiceId } = req.body as { text?: string; voiceId?: string };
+    if (!text || typeof text !== "string") {
+      res.status(400).json({ error: "text is required" });
+      return;
+    }
+
+    res.setHeader("Content-Type", "text/event-stream");
+    res.setHeader("Cache-Control", "no-cache");
+    res.setHeader("Connection", "keep-alive");
+    res.flushHeaders();
+
+    // Track client disconnects so we stop synthesizing audio nobody will
+    // receive. Leaving the `for await` loop early also finishes the generator,
+    // so the remaining sentences are never synthesized.
+    let clientClosed = false;
+    res.on("close", () => {
+      clientClosed = true;
+    });
+
+    try {
+      for await (const chunk of svc.synthesizeSentenceStream(text, voiceId)) {
+        if (clientClosed) break;
+        const payload = JSON.stringify({
+          index: chunk.index,
+          total: chunk.total,
+          audio: chunk.audio.toString("base64"),
+        });
+        res.write(`data: ${payload}\n\n`);
+      }
+      if (!clientClosed) {
+        res.write(`data: ${JSON.stringify({ done: true })}\n\n`);
+      }
+    } catch (err) {
+      const message = err instanceof Error ? err.message : "Synthesis failed";
+      if (!clientClosed) {
+        res.write(`data: ${JSON.stringify({ error: message })}\n\n`);
+      }
+    } finally {
+      res.end();
+    }
+  });
+
+  // POST /api/synthesize/multi-lang — render one text with several voices.
+  //
+  // Request body: { text: string, voiceIds: string[] } with 1-5 voice ids.
+  // Responds 400 with { error } on invalid input, otherwise
+  // { results: [{ voiceId, audio }] } where `audio` is base64-encoded.
+  router.post("/synthesize/multi-lang", async (req, res) => {
+    assertBoard(req);
+    const { text, voiceIds } = req.body as { text?: string; voiceIds?: unknown };
+
+    // Shared 400 responder for the validation guards below.
+    const reject = (error: string): void => {
+      res.status(400).json({ error });
+    };
+
+    if (!text || typeof text !== "string") {
+      return reject("text is required");
+    }
+    if (!Array.isArray(voiceIds) || voiceIds.length < 1 || voiceIds.length > 5) {
+      return reject("voiceIds must be an array with 1-5 entries");
+    }
+    if (voiceIds.some((v) => typeof v !== "string")) {
+      return reject("voiceIds must be an array of strings");
+    }
+
+    const audioByVoice = await svc.synthesizeMultiLang(text, voiceIds as string[]);
+    const results: { voiceId: string; audio: string }[] = [];
+    for (const [voiceId, audio] of audioByVoice.entries()) {
+      results.push({ voiceId, audio: audio.toString("base64") });
+    }
+    res.json({ results });
+  });
+
  return router;
 }
--- a/server/src/services/voice-pipeline.ts
+++ b/server/src/services/voice-pipeline.ts
@ -24,6 +24,32 @@ function execFileAsync(
  });
 }

+/**
+ * Breaks `text` into trimmed, non-empty sentences.
+ *
+ * A boundary is any run of whitespace that directly follows `.`, `!` or `?`.
+ * Whitespace after one of the listed title abbreviations (Dr., Mr., Mrs., ...)
+ * is not treated as a boundary; that run is collapsed to a single space in the
+ * returned sentence. Abbreviations that are not in the list (for example
+ * "U.S." or "D.C.") are not protected, so whitespace after them still splits.
+ *
+ * @param text Input text.
+ * @returns The sentences in order; `[]` for empty or whitespace-only input.
+ */
+export function splitSentences(text: string): string[] {
+  if (!text || !text.trim()) return [];
+
+  // NUL stands in for the whitespace after a title so the boundary regex,
+  // which only matches real whitespace, skips over it.
+  const marker = "\x00";
+  const shielded = text.replace(
+    /\b(Mr|Mrs|Ms|Dr|Prof|Sr|Jr|Rev|Gen|Col|Sgt|Cpl|Pvt|Lt|Cmdr|Capt|Gov|Rep|Sen)\.\s+/g,
+    (_match, title) => `${title}.${marker}`
+  );
+
+  const sentences: string[] = [];
+  for (const piece of shielded.split(/(?<=[.!?])\s+/)) {
+    if (piece.length === 0) continue;
+    // Turn every marker back into a single space, then drop edge whitespace.
+    const restored = piece.split(marker).join(" ").trim();
+    if (restored.length > 0) {
+      sentences.push(restored);
+    }
+  }
+  return sentences;
+}
+
 export function voicePipelineService() {
  if (!ffmpegPath) {
    throw new Error("ffmpeg-static binary not found on this platform");
@ -124,35 +150,38 @@ export function voicePipelineService() {
    }
  }

-  async function synthesize(text: string, voiceId?: string): Promise<Buffer> {
-    const sentences = text.split(/(?<=[.!?])\s+/).filter((s) => s.length > 0);
+  /**
+   * Synthesizes one sentence by running the `piper` CLI with `--output-raw`
+   * and returning whatever it writes to stdout.
+   *
+   * @param sentence Text written to piper's stdin.
+   * @param voiceId  Value passed to `--model`; "en_US-lessac-medium" is used
+   *                 when it is omitted or empty.
+   * @returns The bytes piper wrote to stdout.
+   * @throws Rejects with the execFile error (callers inspect `code === "ENOENT"`
+   *         to detect a missing binary) or with whatever `withTimeout` rejects
+   *         with — presumably a timeout after 8000 ms; confirm against that helper.
+   */
+  async function synthesizeSentence(sentence: string, voiceId?: string): Promise<Buffer> {
+    return withTimeout(
+      new Promise<Buffer>((resolve, reject) => {
+        const child = execFileCb(
+          "piper",
+          ["--model", voiceId || "en_US-lessac-medium", "--output-raw"],
+          {
+            timeout: 8000,
+            maxBuffer: 10 * 1024 * 1024,
+            // stdout is raw audio: request a Buffer so it is not decoded as
+            // UTF-8 text (the default), which would corrupt the samples.
+            encoding: "buffer",
+          },
+          (err: Error | null, stdout: string | Buffer) => {
+            if (err) {
+              reject(err);
+            } else {
+              resolve(Buffer.isBuffer(stdout) ? stdout : Buffer.from(stdout as string));
+            }
+          }
+        );
+
+        // NOTE(review): assumes execFileCb is node:child_process.execFile. Its
+        // asynchronous form has no `input` option (only the *Sync variants do),
+        // so the sentence has to be written to the child's stdin explicitly.
+        // Stream errors here (e.g. the process failed to start or exited early)
+        // are ignored because the execFile callback already reports the failure;
+        // without a listener they would surface as an unhandled 'error' event.
+        child.stdin?.on("error", () => undefined);
+        child.stdin?.end(sentence);
+      }),
+      8000
+    );
+  }

+  async function synthesize(text: string, voiceId?: string): Promise<Buffer> {
+    const sentences = splitSentences(text);
    const buffers: Buffer[] = [];

    for (const sentence of sentences) {
      try {
-        const audioData = await withTimeout(
-          new Promise<Buffer>((resolve, reject) => {
-            execFileCb(
-              "piper",
-              ["--model", voiceId || "en_US-lessac-medium", "--output-raw"],
-              {
-                timeout: 8000,
-                maxBuffer: 10 * 1024 * 1024,
-                // @ts-ignore - input option is valid for execFile
-                input: sentence,
-              },
-              (err: Error | null, stdout: string | Buffer) => {
-                if (err) {
-                  reject(err);
-                } else {
-                  resolve(Buffer.isBuffer(stdout) ? stdout : Buffer.from(stdout as string));
-                }
-              }
-            );
-          }),
-          8000
-        );
+        const audioData = await synthesizeSentence(sentence, voiceId);
        buffers.push(audioData);
      } catch (err) {
        const nodeErr = err as NodeJS.ErrnoException;
@ -166,6 +195,37 @@ export function voicePipelineService() {
    return Buffer.concat(buffers);
  }

+  /**
+   * Async generator that splits `text` with splitSentences() and yields the
+   * audio for each sentence, in order, as soon as that sentence is synthesized.
+   *
+   * @param text    Text to speak.
+   * @param voiceId Optional voice, forwarded to synthesizeSentence().
+   * @yields `{ index, total, audio }` — zero-based sentence index, the total
+   *         number of sentences, and that sentence's audio.
+   * @throws A "Piper TTS not available" error when the failure has
+   *         `code === "ENOENT"`; any other error is rethrown unchanged.
+   */
+  async function* synthesizeSentenceStream(
+    text: string,
+    voiceId?: string
+  ): AsyncGenerator<{ index: number; total: number; audio: Buffer }> {
+    const parts = splitSentences(text);
+    const total = parts.length;
+
+    for (const [index, part] of parts.entries()) {
+      try {
+        const audio = await synthesizeSentence(part, voiceId);
+        yield { index, total, audio };
+      } catch (err) {
+        const missingBinary = (err as NodeJS.ErrnoException).code === "ENOENT";
+        throw missingBinary
+          ? new Error("Piper TTS not available. Install piper for voice output.")
+          : err;
+      }
+    }
+  }
+
+  /**
+   * Synthesizes the same text once per requested voice, running the voices
+   * concurrently.
+   *
+   * @param text     Text to speak.
+   * @param voiceIds Voices to render; repeated ids are synthesized only once.
+   * @returns Map from voice id to the audio returned by synthesize(), keyed in
+   *          order of each id's first occurrence in `voiceIds`.
+   * @throws Rejects as soon as any single synthesis rejects (Promise.all).
+   */
+  async function synthesizeMultiLang(text: string, voiceIds: string[]): Promise<Map<string, Buffer>> {
+    // The result is a Map, so a repeated id can only ever produce one entry;
+    // de-duplicate up front instead of running a redundant synthesis for it.
+    const uniqueVoiceIds = [...new Set(voiceIds)];
+    const entries = await Promise.all(
+      uniqueVoiceIds.map(async (voiceId): Promise<[string, Buffer]> => {
+        const audio = await synthesize(text, voiceId);
+        return [voiceId, audio];
+      })
+    );
+    return new Map(entries);
+  }
+
  function formatForVoice(text: string): string {
    if (!text) return "";

@ -212,5 +272,5 @@ export function voicePipelineService() {
    return result.trim();
  }

-  return { transcribe, synthesize, formatForVoice, transcodeToWav16k };
+  return { transcribe, synthesize, synthesizeSentenceStream, synthesizeMultiLang, formatForVoice, transcodeToWav16k };
 }