feat(36-01): VoicePipelineService with transcribe, synthesize, formatForVoice, transcodeToWav16k

- Install ffmpeg-static and @types/ffmpeg-static - Create voice-pipeline.ts with voicePipelineService factory function - transcodeToWav16k: pipes audio through ffmpeg at 16kHz mono WAV - transcribe: whisper-cpp cascade with --language auto, falls back to openai-whisper - synthesize: piper TTS with sentence chunking and 8s timeout via Promise.race - formatForVoice: extracts SPOKEN marker or strips markdown as fallback - Unit tests with mocked child_process (12 tests all passing)
2026-04-04 01:28:55 +00:00 · 2026-04-04 01:28:55 +00:00 · 346b42dd73
commit 346b42dd73
parent f7153db301
4 changed files with 1322 additions and 0 deletions
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
--- a/server/package.json
+++ b/server/package.json
@ -68,6 +68,7 @@
    "drizzle-orm": "^0.38.4",
    "embedded-postgres": "^18.1.0-beta.16",
    "express": "^5.1.0",
+    "ffmpeg-static": "^5.3.0",
    "hermes-paperclip-adapter": "^0.2.0",
    "jsdom": "^28.1.0",
    "multer": "^2.0.2",
@ -84,6 +85,7 @@
  "devDependencies": {
    "@types/express": "^5.0.0",
    "@types/express-serve-static-core": "^5.0.0",
+    "@types/ffmpeg-static": "^5.1.0",
    "@types/jsdom": "^28.0.0",
    "@types/multer": "^2.0.0",
    "@types/node": "^24.6.0",
--- a/server/src/tests/36-voice-pipeline.test.ts
+++ b/server/src/tests/36-voice-pipeline.test.ts
@ -0,0 +1,259 @@
+import { describe, it, expect, vi, beforeEach, afterEach } from "vitest";
+import { EventEmitter } from "node:events";
+
+// Mock ffmpeg-static BEFORE any imports — returns "/mock/ffmpeg" by default
+vi.mock("ffmpeg-static", () => ({ default: "/mock/ffmpeg" }));
+
+// Mock child_process
+vi.mock("node:child_process", () => ({
+  execFile: vi.fn(),
+  spawn: vi.fn(),
+}));
+
+// Mock fs/promises for temp file operations
+vi.mock("node:fs/promises", () => ({
+  writeFile: vi.fn().mockResolvedValue(undefined),
+  unlink: vi.fn().mockResolvedValue(undefined),
+  readFile: vi.fn().mockResolvedValue(Buffer.from("wav-audio-data")),
+}));
+
+// Import the service and mocks once at the top level (ffmpeg-static mock is in effect)
+import { voicePipelineService } from "../services/voice-pipeline.js";
+import { execFile as execFileCb, spawn } from "node:child_process";
+
+const execFileMock = vi.mocked(execFileCb);
+const spawnMock = vi.mocked(spawn);
+
+describe("voicePipelineService", () => {
+  beforeEach(() => {
+    vi.clearAllMocks();
+  });
+
+  describe("factory", () => {
+    it("throws when ffmpegPath is null", async () => {
+      // This test uses a separate module load with overridden mock
+      vi.doMock("ffmpeg-static", () => ({ default: null }));
+      const { voicePipelineService: svcFactory } = await import(
+        "../services/voice-pipeline.js?null-test"
+      );
+      expect(() => svcFactory()).toThrow("ffmpeg-static binary not found");
+      vi.doUnmock("ffmpeg-static");
+    });
+  });
+
+  describe("transcodeToWav16k", () => {
+    it("spawns ffmpeg with correct args for webm input", async () => {
+      const fakeProcess = new EventEmitter() as any;
+      fakeProcess.stdin = { write: vi.fn(), end: vi.fn() };
+      fakeProcess.stdout = new EventEmitter();
+      fakeProcess.stderr = new EventEmitter();
+      spawnMock.mockReturnValue(fakeProcess);
+
+      const svc = voicePipelineService();
+      const inputBuffer = Buffer.from("fake-webm-data");
+      const promise = svc.transcodeToWav16k(inputBuffer, "webm");
+
+      // Emit stdout data then close with code 0
+      fakeProcess.stdout.emit("data", Buffer.from("wav-header"));
+      fakeProcess.emit("close", 0);
+
+      await promise;
+
+      expect(spawnMock).toHaveBeenCalledWith(
+        "/mock/ffmpeg",
+        ["-f", "webm", "-i", "pipe:0", "-ar", "16000", "-ac", "1", "-f", "wav", "pipe:1"],
+        { stdio: ["pipe", "pipe", "pipe"] }
+      );
+    });
+
+    it("rejects when ffmpeg exits non-zero", async () => {
+      const fakeProcess = new EventEmitter() as any;
+      fakeProcess.stdin = { write: vi.fn(), end: vi.fn() };
+      fakeProcess.stdout = new EventEmitter();
+      fakeProcess.stderr = new EventEmitter();
+      spawnMock.mockReturnValue(fakeProcess);
+
+      const svc = voicePipelineService();
+      const inputBuffer = Buffer.from("bad-audio");
+      const promise = svc.transcodeToWav16k(inputBuffer, "webm");
+
+      fakeProcess.emit("close", 1);
+
+      await expect(promise).rejects.toThrow();
+    });
+  });
+
+  describe("transcribe", () => {
+    function setupSpawnForTranscode() {
+      const fakeProcess = new EventEmitter() as any;
+      fakeProcess.stdin = { write: vi.fn(), end: vi.fn() };
+      fakeProcess.stdout = new EventEmitter();
+      fakeProcess.stderr = new EventEmitter();
+      spawnMock.mockReturnValue(fakeProcess);
+      return fakeProcess;
+    }
+
+    it("calls transcodeToWav16k then whisper-cpp with --language auto for webm input", async () => {
+      execFileMock.mockImplementation((_cmd: any, _args: any, _opts: any, callback: any) => {
+        callback(null, "Hello world", "");
+        return {} as any;
+      });
+
+      const svc = voicePipelineService();
+      const inputBuffer = Buffer.from("fake-webm");
+
+      // Set up spawn mock BEFORE calling transcribe
+      const fakeProcess = setupSpawnForTranscode();
+
+      // Start the transcribe (it will spawn ffmpeg internally)
+      const transcribePromise = svc.transcribe(inputBuffer, "webm");
+
+      // Give the spawn setup a tick to register, then emit close
+      await new Promise((r) => setTimeout(r, 0));
+      fakeProcess.stdout.emit("data", Buffer.from("wav-data"));
+      fakeProcess.emit("close", 0);
+
+      const result = await transcribePromise;
+
+      // Verify spawn was called (transcode happened)
+      expect(spawnMock).toHaveBeenCalled();
+
+      // Verify whisper-cpp was called with --language auto
+      expect(execFileMock).toHaveBeenCalledWith(
+        "whisper-cpp",
+        expect.arrayContaining(["--language", "auto"]),
+        expect.anything(),
+        expect.any(Function)
+      );
+
+      expect(result).toHaveProperty("text");
+    });
+
+    it("falls back to openai-whisper when whisper-cpp fails", async () => {
+      let callCount = 0;
+      execFileMock.mockImplementation((_cmd: any, _args: any, _opts: any, callback: any) => {
+        callCount++;
+        if (callCount === 1) {
+          // whisper-cpp fails
+          callback(new Error("whisper-cpp not found"), "", "");
+        } else {
+          // openai-whisper succeeds
+          callback(null, "Fallback transcript", "");
+        }
+        return {} as any;
+      });
+
+      const svc = voicePipelineService();
+      const inputBuffer = Buffer.from("fake-webm");
+      const fakeProcess = setupSpawnForTranscode();
+      const transcribePromise = svc.transcribe(inputBuffer, "webm");
+
+      await new Promise((r) => setTimeout(r, 0));
+      fakeProcess.stdout.emit("data", Buffer.from("wav-data"));
+      fakeProcess.emit("close", 0);
+
+      const result = await transcribePromise;
+
+      expect(execFileMock).toHaveBeenCalledTimes(2);
+      expect(result).toHaveProperty("text");
+    });
+
+    it("throws 503-style error when neither whisper binary is available", async () => {
+      execFileMock.mockImplementation((_cmd: any, _args: any, _opts: any, callback: any) => {
+        callback(new Error("command not found"), "", "");
+        return {} as any;
+      });
+
+      const svc = voicePipelineService();
+      const inputBuffer = Buffer.from("fake-webm");
+      const fakeProcess = setupSpawnForTranscode();
+      const transcribePromise = svc.transcribe(inputBuffer, "webm");
+
+      await new Promise((r) => setTimeout(r, 0));
+      fakeProcess.stdout.emit("data", Buffer.from("wav-data"));
+      fakeProcess.emit("close", 0);
+
+      await expect(transcribePromise).rejects.toThrow("Whisper not available");
+    });
+  });
+
+  describe("synthesize", () => {
+    it("splits text into sentences and calls piper execFile for each", async () => {
+      execFileMock.mockImplementation((_cmd: any, _args: any, _opts: any, callback: any) => {
+        callback(null, Buffer.from("wav-chunk"), "");
+        return {} as any;
+      });
+
+      const svc = voicePipelineService();
+      const result = await svc.synthesize("Hello world. How are you?");
+
+      // Should have called piper twice (2 sentences)
+      expect(execFileMock).toHaveBeenCalledTimes(2);
+      expect(execFileMock).toHaveBeenCalledWith(
+        "piper",
+        expect.anything(),
+        expect.anything(),
+        expect.any(Function)
+      );
+      expect(Buffer.isBuffer(result)).toBe(true);
+    });
+
+    it("throws when piper binary is not found", async () => {
+      const enoentErr = new Error("piper: command not found") as NodeJS.ErrnoException;
+      enoentErr.code = "ENOENT";
+      execFileMock.mockImplementation((_cmd: any, _args: any, _opts: any, callback: any) => {
+        callback(enoentErr, "", "");
+        return {} as any;
+      });
+
+      const svc = voicePipelineService();
+      await expect(svc.synthesize("Hello.")).rejects.toThrow("Piper TTS not available");
+    });
+
+    it("rejects when piper call times out after 8000ms", async () => {
+      vi.useFakeTimers();
+
+      // Never call the callback (simulates hang)
+      execFileMock.mockImplementation(() => ({} as any));
+
+      const svc = voicePipelineService();
+      const promise = svc.synthesize("Hello.");
+
+      // Advance timers to trigger timeout
+      vi.advanceTimersByTime(9000);
+
+      await expect(promise).rejects.toThrow(/[Tt]imed out/);
+
+      vi.useRealTimers();
+    });
+  });
+
+  describe("formatForVoice", () => {
+    it("extracts SPOKEN section when both SPOKEN: and DETAILED: markers are present", () => {
+      const svc = voicePipelineService();
+      const input = "SPOKEN: Hello there\n\nDETAILED: ## Hello\n**world**";
+      const result = svc.formatForVoice(input);
+      expect(result).toBe("Hello there");
+    });
+
+    it("strips markdown when SPOKEN marker is absent", () => {
+      const svc = voicePipelineService();
+      const input = "## Hello\n**world**\n- item\n```code```";
+      const result = svc.formatForVoice(input);
+      // Headings, bold, bullets, and code fences stripped
+      expect(result).not.toContain("##");
+      expect(result).not.toContain("**");
+      expect(result).not.toContain("- item");
+      expect(result).not.toContain("```");
+      expect(result).toContain("Hello");
+      expect(result).toContain("world");
+      expect(result).toContain("item");
+      expect(result).toContain("code");
+    });
+
+    it("returns empty string for empty input", () => {
+      const svc = voicePipelineService();
+      expect(svc.formatForVoice("")).toBe("");
+    });
+  });
+});
--- a/server/src/services/voice-pipeline.ts
+++ b/server/src/services/voice-pipeline.ts
@ -0,0 +1,215 @@
+import ffmpegPath from "ffmpeg-static";
+import { spawn, execFile as execFileCb } from "node:child_process";
+import { tmpdir } from "node:os";
+import path from "node:path";
+import { writeFile, unlink } from "node:fs/promises";
+
+/** Promisifies execFile, always resolving with { stdout, stderr } for consistent mocking. */
+function execFileAsync(
+  cmd: string,
+  args: string[],
+  opts: { timeout?: number; maxBuffer?: number; input?: string }
+): Promise<{ stdout: string; stderr: string }> {
+  return new Promise((resolve, reject) => {
+    execFileCb(cmd, args, opts as any, (err, stdout, stderr) => {
+      if (err) {
+        reject(err);
+      } else {
+        resolve({
+          stdout: Buffer.isBuffer(stdout) ? stdout.toString() : String(stdout ?? ""),
+          stderr: Buffer.isBuffer(stderr) ? stderr.toString() : String(stderr ?? ""),
+        });
+      }
+    });
+  });
+}
+
+export function voicePipelineService() {
+  if (!ffmpegPath) {
+    throw new Error("ffmpeg-static binary not found on this platform");
+  }
+
+
+  function withTimeout<T>(promise: Promise<T>, ms: number): Promise<T> {
+    return Promise.race([
+      promise,
+      new Promise<never>((_, reject) =>
+        setTimeout(() => reject(new Error(`Timed out after ${ms}ms`)), ms)
+      ),
+    ]);
+  }
+
+  async function transcodeToWav16k(inputBuffer: Buffer, inputFormat: string): Promise<Buffer> {
+    return new Promise<Buffer>((resolve, reject) => {
+      const ffmpeg = spawn(ffmpegPath, ["-f", inputFormat, "-i", "pipe:0", "-ar", "16000", "-ac", "1", "-f", "wav", "pipe:1"], {
+        stdio: ["pipe", "pipe", "pipe"],
+      });
+
+      const chunks: Buffer[] = [];
+
+      ffmpeg.stdout.on("data", (chunk: Buffer) => {
+        chunks.push(chunk);
+      });
+
+      ffmpeg.stderr.on("data", () => {
+        // Discard stderr to avoid blocking
+      });
+
+      ffmpeg.on("close", (code) => {
+        if (code === 0) {
+          resolve(Buffer.concat(chunks));
+        } else {
+          reject(new Error(`ffmpeg exited with code ${code}`));
+        }
+      });
+
+      ffmpeg.on("error", (err) => {
+        reject(err);
+      });
+
+      ffmpeg.stdin.write(inputBuffer);
+      ffmpeg.stdin.end();
+    });
+  }
+
+  async function transcribe(
+    buffer: Buffer,
+    format: "webm" | "ogg" | "wav"
+  ): Promise<{ text: string; language?: string }> {
+    const wavBuffer = format !== "wav" ? await transcodeToWav16k(buffer, format) : buffer;
+
+    const tmpPath = path.join(tmpdir(), `nexus-audio-${Date.now()}.wav`);
+
+    try {
+      await writeFile(tmpPath, wavBuffer);
+
+      // Try whisper-cpp first
+      try {
+        const { stdout } = await execFileAsync(
+          "whisper-cpp",
+          ["--model", "base.en", "--file", tmpPath, "--no-timestamps", "--output-txt", "--language", "auto"],
+          { timeout: 30000 }
+        );
+
+        // Parse language from output if present (e.g. "auto-detected language: en")
+        let language: string | undefined;
+        const langMatch = stdout.match(/auto-detected language[:\s]+([a-z]{2})/i);
+        if (langMatch) {
+          language = langMatch[1];
+        }
+
+        return { text: stdout.trim(), language };
+      } catch (_whisperCppErr) {
+        // Fall through to openai-whisper
+      }
+
+      // Try openai-whisper Python CLI as fallback
+      try {
+        const { stdout } = await execFileAsync(
+          "whisper",
+          [tmpPath, "--model", "base.en", "--output_format", "txt", "--output_dir", tmpdir()],
+          { timeout: 60000 }
+        );
+        return { text: stdout.trim() };
+      } catch (_whisperErr) {
+        // Both failed
+      }
+
+      throw new Error(
+        "Whisper not available. Install whisper-cpp or openai-whisper for voice input."
+      );
+    } finally {
+      unlink(tmpPath).catch(() => {});
+    }
+  }
+
+  async function synthesize(text: string, voiceId?: string): Promise<Buffer> {
+    const sentences = text.split(/(?<=[.!?])\s+/).filter((s) => s.length > 0);
+
+    const buffers: Buffer[] = [];
+
+    for (const sentence of sentences) {
+      try {
+        const audioData = await withTimeout(
+          new Promise<Buffer>((resolve, reject) => {
+            execFileCb(
+              "piper",
+              ["--model", voiceId || "en_US-lessac-medium", "--output-raw"],
+              {
+                timeout: 8000,
+                maxBuffer: 10 * 1024 * 1024,
+                // @ts-ignore - input option is valid for execFile
+                input: sentence,
+              },
+              (err, stdout) => {
+                if (err) {
+                  reject(err);
+                } else {
+                  resolve(Buffer.isBuffer(stdout) ? stdout : Buffer.from(stdout as string));
+                }
+              }
+            );
+          }),
+          8000
+        );
+        buffers.push(audioData);
+      } catch (err) {
+        const nodeErr = err as NodeJS.ErrnoException;
+        if (nodeErr.code === "ENOENT") {
+          throw new Error("Piper TTS not available. Install piper for voice output.");
+        }
+        throw err;
+      }
+    }
+
+    return Buffer.concat(buffers);
+  }
+
+  function formatForVoice(text: string): string {
+    if (!text) return "";
+
+    // Check for SPOKEN: marker
+    const spokenMatch = text.match(/SPOKEN:\s*([\s\S]*?)(?=\nDETAILED:|\n\n[A-Z]+:)/);
+    if (spokenMatch) {
+      return spokenMatch[1].trim();
+    }
+
+    // Strip markdown
+    let result = text;
+
+    // Remove triple backtick code fences (with optional language identifier followed by newline)
+    // Pattern: ```lang\n...content...\n``` → content
+    // Pattern: ```content``` (no newline) → content
+    result = result.replace(/```([a-z]*)\n?([\s\S]*?)```/g, (_match, lang, inner) => {
+      // If lang is present and followed by a newline, it's a language identifier; inner is the code
+      // If no newline (lang === content), preserve the lang as text
+      if (lang && !inner.trim()) {
+        // ``` followed by word then immediately ``` — the "word" is actually content
+        return lang;
+      }
+      return inner.trim();
+    });
+
+    // Remove inline backticks
+    result = result.replace(/`([^`]+)`/g, "$1");
+
+    // Remove heading markers (## Heading -> Heading)
+    result = result.replace(/^#{1,6}\s+/gm, "");
+
+    // Remove bold markers (**text** -> text)
+    result = result.replace(/\*\*([^*]+)\*\*/g, "$1");
+
+    // Remove italic markers (*text* -> text)
+    result = result.replace(/\*([^*]+)\*/g, "$1");
+
+    // Remove bullet point prefixes (- item or * item)
+    result = result.replace(/^[-*]\s+/gm, "");
+
+    // Collapse multiple blank lines into one
+    result = result.replace(/\n{3,}/g, "\n\n");
+
+    return result.trim();
+  }
+
+  return { transcribe, synthesize, formatForVoice, transcodeToWav16k };
+}