From 22beb245f228909aaf16aeb655eb2fb32e560c97 Mon Sep 17 00:00:00 2001
From: Nexus Dev <nexus@local>
Date: Sat, 4 Apr 2026 03:32:10 +0000
Subject: [PATCH] feat(39-01): sentence-buffered TTS streaming + multi-language
 synthesis

- Export splitSentences() with title-abbreviation protection (Dr., Mr. etc.)
- Add synthesizeSentenceStream() AsyncGenerator yielding per-sentence audio chunks
- Add synthesizeMultiLang() synthesizing same text in N voices via Promise.all
- Add POST /api/synthesize/stream SSE endpoint with base64 audio per sentence
- Add POST /api/synthesize/multi-lang returning array of voiceId+audio pairs
- Existing POST /api/synthesize unchanged (backward compatible)
---
 server/src/routes/voice.ts            |  57 +++++++++++++
 server/src/services/voice-pipeline.ts | 110 ++++++++++++++++++++------
 2 files changed, 142 insertions(+), 25 deletions(-)

diff --git a/server/src/routes/voice.ts b/server/src/routes/voice.ts
index bc53057c..4193efc5 100644
--- a/server/src/routes/voice.ts
+++ b/server/src/routes/voice.ts
@@ -43,5 +43,62 @@ export function voiceRoutes(): Router {
     res.send(audioBuffer);
   });
 
+  // POST /api/synthesize/stream — SSE TTS: one { index, total, audio } event per sentence, then { done } or { error }
+  router.post("/synthesize/stream", async (req, res) => {
+    assertBoard(req);
+    const { text, voiceId } = req.body as { text?: string; voiceId?: string };
+    if (!text || typeof text !== "string") {
+      res.status(400).json({ error: "text is required" });
+      return;
+    }
+    if (voiceId != null && typeof voiceId !== "string") {
+      res.status(400).json({ error: "voiceId must be a string" });
+      return;
+    }
+    res.setHeader("Content-Type", "text/event-stream");
+    res.setHeader("Cache-Control", "no-cache");
+    res.setHeader("Connection", "keep-alive");
+    res.flushHeaders();
+    let clientGone = false; // flipped on disconnect so no further sentences are synthesized
+    res.on("close", () => { clientGone = true; });
+    const send = (event: object) => clientGone || res.write(`data: ${JSON.stringify(event)}\n\n`);
+    try {
+      for await (const { index, total, audio } of svc.synthesizeSentenceStream(text, voiceId)) {
+        if (clientGone) break; // leaving the loop ends the generator early
+        send({ index, total, audio: audio.toString("base64") });
+      }
+      send({ done: true });
+    } catch (err) {
+      send({ error: err instanceof Error ? err.message : "Synthesis failed" });
+    } finally {
+      res.end();
+    }
+  });
+
+  // POST /api/synthesize/multi-lang — render one text with up to five voices.
+  // Responds with { results: [{ voiceId, audio }] } where audio is base64-encoded.
+  router.post("/synthesize/multi-lang", async (req, res) => {
+    assertBoard(req);
+    const { text, voiceIds } = req.body as { text?: string; voiceIds?: unknown };
+    const badRequest = (error: string): void => {
+      res.status(400).json({ error });
+    };
+    if (typeof text !== "string" || text === "") {
+      return badRequest("text is required");
+    }
+    if (!(Array.isArray(voiceIds) && voiceIds.length >= 1 && voiceIds.length <= 5)) {
+      return badRequest("voiceIds must be an array with 1-5 entries");
+    }
+    if (voiceIds.some((id) => typeof id !== "string")) {
+      return badRequest("voiceIds must be an array of strings");
+    }
+    const audioByVoice = await svc.synthesizeMultiLang(text, voiceIds as string[]);
+    const results: { voiceId: string; audio: string }[] = [];
+    for (const [voiceId, audio] of audioByVoice) {
+      results.push({ voiceId, audio: audio.toString("base64") });
+    }
+    res.json({ results });
+  });
+
   return router;
 }
diff --git a/server/src/services/voice-pipeline.ts b/server/src/services/voice-pipeline.ts
index 1e494a43..2e3d771f 100644
--- a/server/src/services/voice-pipeline.ts
+++ b/server/src/services/voice-pipeline.ts
@@ -24,6 +24,32 @@ function execFileAsync(
   });
 }
 
+/**
+ * Break `text` into sentences at `.`, `!` or `?` followed by whitespace.
+ *
+ * A fixed list of title abbreviations (Dr., Mr., Mrs., ...) never ends a sentence;
+ * the whitespace after one is collapsed to a single space. Other dotted forms such
+ * as D.C. or U.S. still split when whitespace follows them.
+ *
+ * @param text - Raw input; empty or whitespace-only input yields `[]`.
+ * @returns Trimmed, non-empty sentences in their original order.
+ */
+export function splitSentences(text: string): string[] {
+  if (!text || text.trim() === "") {
+    return [];
+  }
+  const marker = "\x00"; // stands in for the whitespace after a protected title
+  const titles = /\b(Mr|Mrs|Ms|Dr|Prof|Sr|Jr|Rev|Gen|Col|Sgt|Cpl|Pvt|Lt|Cmdr|Capt|Gov|Rep|Sen)\.\s+/g;
+  const shielded = text.replace(titles, (_match, title) => title + "." + marker);
+  const sentences: string[] = [];
+  for (const piece of shielded.split(/(?<=[.!?])\s+/)) {
+    // Swap each marker back to a space before trimming.
+    const sentence = piece.split(marker).join(" ").trim();
+    if (sentence.length > 0) sentences.push(sentence);
+  }
+  return sentences;
+}
+
 export function voicePipelineService() {
   if (!ffmpegPath) {
     throw new Error("ffmpeg-static binary not found on this platform");
@@ -124,35 +150,38 @@ export function voicePipelineService() {
     }
   }
 
-  async function synthesize(text: string, voiceId?: string): Promise<Buffer> {
-    const sentences = text.split(/(?<=[.!?])\s+/).filter((s) => s.length > 0);
+  /**
+   * Runs the `piper` CLI on one sentence and resolves with its raw (`--output-raw`) audio.
+   * Assumes `execFileCb` is Node's `child_process.execFile` — confirm against the import.
+   */
+  async function synthesizeSentence(sentence: string, voiceId?: string): Promise<Buffer> {
+    return withTimeout(
+      new Promise<Buffer>((resolve, reject) => {
+        const child = execFileCb(
+          "piper",
+          ["--model", voiceId || "en_US-lessac-medium", "--output-raw"],
+          // encoding "buffer": the default utf8 decoding would corrupt binary audio.
+          { timeout: 8000, maxBuffer: 10 * 1024 * 1024, encoding: "buffer" },
+          (err: Error | null, stdout: string | Buffer) => {
+            if (err) reject(err);
+            else resolve(Buffer.isBuffer(stdout) ? stdout : Buffer.from(stdout));
+          }
+        );
+        // Async execFile ignores `input` (a *Sync-only option), so write the text to stdin.
+        child.stdin?.on("error", () => undefined); // e.g. EPIPE; process failures reach the callback
+        child.stdin?.end(sentence);
+      }),
+      8000
+    );
+  }
 
+  async function synthesize(text: string, voiceId?: string): Promise<Buffer> {
+    const sentences = splitSentences(text);
     const buffers: Buffer[] = [];
 
     for (const sentence of sentences) {
       try {
-        const audioData = await withTimeout(
-          new Promise<Buffer>((resolve, reject) => {
-            execFileCb(
-              "piper",
-              ["--model", voiceId || "en_US-lessac-medium", "--output-raw"],
-              {
-                timeout: 8000,
-                maxBuffer: 10 * 1024 * 1024,
-                // @ts-ignore - input option is valid for execFile
-                input: sentence,
-              },
-              (err: Error | null, stdout: string | Buffer) => {
-                if (err) {
-                  reject(err);
-                } else {
-                  resolve(Buffer.isBuffer(stdout) ? stdout : Buffer.from(stdout as string));
-                }
-              }
-            );
-          }),
-          8000
-        );
+        const audioData = await synthesizeSentence(sentence, voiceId);
         buffers.push(audioData);
       } catch (err) {
         const nodeErr = err as NodeJS.ErrnoException;
@@ -166,6 +195,37 @@ export function voicePipelineService() {
     return Buffer.concat(buffers);
   }
 
+  /**
+   * Lazily synthesizes `text` sentence by sentence, yielding each sentence's audio
+   * with its zero-based `index` and the `total` sentence count.
+   * A failure with code "ENOENT" is rethrown as a "Piper TTS not available" error.
+   */
+  async function* synthesizeSentenceStream(
+    text: string,
+    voiceId?: string
+  ): AsyncGenerator<{ index: number; total: number; audio: Buffer }> {
+    const sentences = splitSentences(text);
+    const total = sentences.length;
+    for (const [index, sentence] of sentences.entries()) {
+      try {
+        yield { index, total, audio: await synthesizeSentence(sentence, voiceId) };
+      } catch (err) {
+        const missingBinary = (err as NodeJS.ErrnoException).code === "ENOENT";
+        throw missingBinary ? new Error("Piper TTS not available. Install piper for voice output.") : err;
+      }
+    }
+  }
+
+  /** Synthesizes `text` once per distinct voice id, in parallel; resolves to a voiceId → audio map. */
+  async function synthesizeMultiLang(text: string, voiceIds: string[]): Promise<Map<string, Buffer>> {
+    // The Map keeps one entry per id, so drop repeats up front rather than synthesizing them again.
+    const distinctVoiceIds = Array.from(new Set(voiceIds));
+    const entries = await Promise.all(
+      distinctVoiceIds.map(async (voiceId): Promise<[string, Buffer]> => [voiceId, await synthesize(text, voiceId)])
+    );
+    return new Map(entries);
+  }
+
   function formatForVoice(text: string): string {
     if (!text) return "";
 
@@ -212,5 +272,5 @@ export function voicePipelineService() {
     return result.trim();
   }
 
-  return { transcribe, synthesize, formatForVoice, transcodeToWav16k };
+  return { transcribe, synthesize, synthesizeSentenceStream, synthesizeMultiLang, formatForVoice, transcodeToWav16k };
 }