feat(39-01): sentence-buffered TTS streaming + multi-language synthesis

- Export splitSentences() with title-abbreviation protection (Dr., Mr., etc.)
- Add synthesizeSentenceStream() AsyncGenerator yielding per-sentence audio chunks
- Add synthesizeMultiLang() synthesizing the same text in N voices via Promise.all
- Add POST /api/synthesize/stream SSE endpoint with base64 audio per sentence
- Add POST /api/synthesize/multi-lang returning an array of voiceId+audio pairs
- Existing POST /api/synthesize unchanged (backward compatible)
2026-04-04 03:32:10 +00:00 · 2026-04-04 03:32:10 +00:00 · 22beb245f2
commit 22beb245f2
parent 6be251a9fb
2 changed files with 142 additions and 25 deletions
--- a/server/src/routes/voice.ts
+++ b/server/src/routes/voice.ts
@ -43,5 +43,62 @@ export function voiceRoutes(): Router {
    res.send(audioBuffer);
  });
  // POST /api/synthesize/stream — sentence-buffered SSE streaming TTS
  //
  // Request body: { text: string, voiceId?: string }
  // Response: a text/event-stream where every event is a `data:` line holding JSON:
  //   { index, total, audio }  one per synthesized sentence (audio is base64)
  //   { done: true }           sent after the last sentence
  //   { error: message }       sent instead of `done` when synthesis fails mid-stream
  // Validation failures are answered with a plain 400 JSON body before the
  // stream is opened.
  router.post("/synthesize/stream", async (req, res) => {
    assertBoard(req);
    const { text, voiceId } = req.body as { text?: string; voiceId?: unknown };
    if (!text || typeof text !== "string") {
      res.status(400).json({ error: "text is required" });
      return;
    }
    // Reject non-string voice IDs up front (same rule the multi-lang route applies)
    // instead of letting them reach the TTS process arguments. A missing or null
    // voiceId still falls through to the service default.
    if (voiceId !== undefined && voiceId !== null && typeof voiceId !== "string") {
      res.status(400).json({ error: "voiceId must be a string" });
      return;
    }
    const voice = typeof voiceId === "string" ? voiceId : undefined;

    res.setHeader("Content-Type", "text/event-stream");
    res.setHeader("Cache-Control", "no-cache");
    res.setHeader("Connection", "keep-alive");
    res.flushHeaders();

    // Track premature disconnects so we stop pulling sentences from the generator
    // (and stop writing) once nobody is listening. Leaving the for-await loop early
    // finalizes the generator, so no further sentences are synthesized.
    let clientGone = false;
    res.on("close", () => {
      clientGone = true;
    });

    try {
      for await (const chunk of svc.synthesizeSentenceStream(text, voice)) {
        if (clientGone) break;
        const payload = JSON.stringify({
          index: chunk.index,
          total: chunk.total,
          audio: chunk.audio.toString("base64"),
        });
        res.write(`data: ${payload}\n\n`);
      }
      if (!clientGone) {
        res.write(`data: ${JSON.stringify({ done: true })}\n\n`);
      }
    } catch (err) {
      // Headers are already sent, so failures are reported in-band as an SSE event.
      if (!clientGone) {
        const message = err instanceof Error ? err.message : "Synthesis failed";
        res.write(`data: ${JSON.stringify({ error: message })}\n\n`);
      }
    } finally {
      res.end();
    }
  });
  // POST /api/synthesize/multi-lang — synthesize same text in multiple languages/voices
  //
  // Request body: { text: string, voiceIds: string[] } with 1 to 5 voice IDs.
  // Response: { results: [{ voiceId, audio }] } where audio is base64-encoded.
  // Invalid input is answered with a 400 and an { error } body.
  router.post("/synthesize/multi-lang", async (req, res) => {
    assertBoard(req);

    const { text, voiceIds: requestedVoices } = req.body as { text?: string; voiceIds?: unknown };

    // Small helper so each validation branch reads as a single line.
    const reject = (message: string): void => {
      res.status(400).json({ error: message });
    };

    if (typeof text !== "string" || text.length === 0) {
      return reject("text is required");
    }
    if (!(Array.isArray(requestedVoices) && requestedVoices.length >= 1 && requestedVoices.length <= 5)) {
      return reject("voiceIds must be an array with 1-5 entries");
    }
    if (requestedVoices.some((candidate) => typeof candidate !== "string")) {
      return reject("voiceIds must be an array of strings");
    }

    const audioByVoice = await svc.synthesizeMultiLang(text, requestedVoices as string[]);

    // Flatten the Map into a JSON-friendly list, encoding each buffer as base64.
    const results: { voiceId: string; audio: string }[] = [];
    for (const [voiceId, audio] of audioByVoice.entries()) {
      results.push({ voiceId, audio: audio.toString("base64") });
    }
    res.json({ results });
  });
  return router;
 }
--- a/server/src/services/voice-pipeline.ts
+++ b/server/src/services/voice-pipeline.ts
@ -24,6 +24,32 @@ function execFileAsync(
  });
 }
 /**
  * Breaks free-form text into trimmed, non-empty sentences.
  *
  * A sentence boundary is any run of whitespace that directly follows `.`, `!`
  * or `?`. Before splitting, a fixed list of title abbreviations (Mr., Mrs.,
  * Dr., Prof., ...) is shielded so the period ending the abbreviation is not
  * treated as a boundary; whatever whitespace followed the abbreviation comes
  * back as a single space. Other dotted forms such as "D.C." or "U.S." are not
  * shielded and split whenever whitespace follows them.
  *
  * @param text - Raw input text.
  * @returns The sentences in their original order; an empty array when the
  *   input is empty or whitespace-only.
  */
 export function splitSentences(text: string): string[] {
   if (!text || !text.trim()) return [];

   // NUL stands in for the whitespace after a shielded abbreviation; it is not
   // whitespace itself, so the boundary regex below cannot split on it.
   const NUL = "\x00";
   const shielded = text.replace(
     /\b(Mr|Mrs|Ms|Dr|Prof|Sr|Jr|Rev|Gen|Col|Sgt|Cpl|Pvt|Lt|Cmdr|Capt|Gov|Rep|Sen)\.\s+/g,
     `$1.${NUL}`
   );

   const sentences: string[] = [];
   for (const fragment of shielded.split(/(?<=[.!?])\s+/)) {
     // Turn every stand-in back into a space, then drop surrounding whitespace.
     const restored = fragment.split(NUL).join(" ").trim();
     if (restored.length > 0) {
       sentences.push(restored);
     }
   }
   return sentences;
 }
 export function voicePipelineService() {
  if (!ffmpegPath) {
    throw new Error("ffmpeg-static binary not found on this platform");
@ -124,35 +150,38 @@ export function voicePipelineService() {
    }
  }
-  async function synthesize(text: string, voiceId?: string): Promise<Buffer> {
+  async function synthesizeSentence(sentence: string, voiceId?: string): Promise<Buffer> {
-    const sentences = text.split(/(?<=[.!?])\s+/).filter((s) => s.length > 0);
+    return withTimeout(
      new Promise<Buffer>((resolve, reject) => {
        execFileCb(
          "piper",
          ["--model", voiceId || "en_US-lessac-medium", "--output-raw"],
          {
            timeout: 8000,
            maxBuffer: 10 * 1024 * 1024,
            // @ts-ignore - input option is valid for execFile
            input: sentence,
          },
          (err: Error | null, stdout: string | Buffer) => {
            if (err) {
              reject(err);
            } else {
              resolve(Buffer.isBuffer(stdout) ? stdout : Buffer.from(stdout as string));
            }
          }
        );
      }),
      8000
    );
  }
  async function synthesize(text: string, voiceId?: string): Promise<Buffer> {
    const sentences = splitSentences(text);
    const buffers: Buffer[] = [];
    for (const sentence of sentences) {
      try {
-        const audioData = await withTimeout(
+        const audioData = await synthesizeSentence(sentence, voiceId);
          new Promise<Buffer>((resolve, reject) => {
            execFileCb(
              "piper",
              ["--model", voiceId || "en_US-lessac-medium", "--output-raw"],
              {
                timeout: 8000,
                maxBuffer: 10 * 1024 * 1024,
                // @ts-ignore - input option is valid for execFile
                input: sentence,
              },
              (err: Error | null, stdout: string | Buffer) => {
                if (err) {
                  reject(err);
                } else {
                  resolve(Buffer.isBuffer(stdout) ? stdout : Buffer.from(stdout as string));
                }
              }
            );
          }),
          8000
        );
        buffers.push(audioData);
      } catch (err) {
        const nodeErr = err as NodeJS.ErrnoException;
@ -166,6 +195,37 @@ export function voicePipelineService() {
    return Buffer.concat(buffers);
  }
  /**
   * Lazily synthesizes `text` one sentence at a time.
   *
   * The text is split with `splitSentences`; each sentence is synthesized only
   * when the consumer asks for the next value, so audio for early sentences is
   * available before later ones have been processed.
   *
   * @param text - Full text to speak.
   * @param voiceId - Optional voice, forwarded unchanged to `synthesizeSentence`.
   * @yields `{ index, total, audio }` — zero-based sentence position, the total
   *   sentence count, and that sentence's audio buffer.
   * @throws An Error with an install hint when the failure has
   *   `code === "ENOENT"`; any other failure is rethrown as-is.
   */
  async function* synthesizeSentenceStream(
    text: string,
    voiceId?: string
  ): AsyncGenerator<{ index: number; total: number; audio: Buffer }> {
    const sentences = splitSentences(text);
    const total = sentences.length;

    for (const [index, sentence] of sentences.entries()) {
      try {
        const audio = await synthesizeSentence(sentence, voiceId);
        yield { index, total, audio };
      } catch (err) {
        const binaryMissing = (err as NodeJS.ErrnoException).code === "ENOENT";
        throw binaryMissing
          ? new Error("Piper TTS not available. Install piper for voice output.")
          : err;
      }
    }
  }
  /**
   * Synthesizes the same text once per requested voice, running the voices
   * concurrently.
   *
   * Repeated voice IDs are collapsed before any work starts: the result is keyed
   * by voice ID, so a repeated ID could only ever contribute one entry and
   * synthesizing it again would be wasted work.
   *
   * @param text - Text to speak in every voice.
   * @param voiceIds - Voice IDs to synthesize with; duplicates are ignored.
   * @returns A Map from voice ID to the audio produced by `synthesize`, in
   *   first-occurrence order of `voiceIds`.
   * @throws Rejects as soon as any single voice fails (Promise.all semantics).
   */
  async function synthesizeMultiLang(text: string, voiceIds: string[]): Promise<Map<string, Buffer>> {
    // A Set keeps first-occurrence order, which is also the key order the Map
    // would have had with duplicates present.
    const uniqueVoiceIds = Array.from(new Set(voiceIds));
    const results = await Promise.all(
      uniqueVoiceIds.map(async (voiceId): Promise<[string, Buffer]> => {
        const audio = await synthesize(text, voiceId);
        return [voiceId, audio];
      })
    );
    return new Map(results);
  }
  function formatForVoice(text: string): string {
    if (!text) return "";
@ -212,5 +272,5 @@ export function voicePipelineService() {
    return result.trim();
  }
-  return { transcribe, synthesize, formatForVoice, transcodeToWav16k };
+  return { transcribe, synthesize, synthesizeSentenceStream, synthesizeMultiLang, formatForVoice, transcodeToWav16k };
 }