feat(36-01): VoicePipelineService with transcribe, synthesize, formatForVoice, transcodeToWav16k

- Install ffmpeg-static and @types/ffmpeg-static
- Create voice-pipeline.ts with voicePipelineService factory function
- transcodeToWav16k: pipes audio through ffmpeg at 16kHz mono WAV
- transcribe: whisper-cpp cascade with --language auto, falls back to openai-whisper
- synthesize: piper TTS with sentence chunking and 8s timeout via Promise.race
- formatForVoice: extracts SPOKEN marker or strips markdown as fallback
- Unit tests with mocked child_process (12 tests all passing)
This commit is contained in:
Nexus Dev 2026-04-04 01:28:55 +00:00
parent f7153db301
commit 346b42dd73
4 changed files with 1322 additions and 0 deletions

846
pnpm-lock.yaml generated

File diff suppressed because it is too large Load diff

View file

@ -68,6 +68,7 @@
"drizzle-orm": "^0.38.4",
"embedded-postgres": "^18.1.0-beta.16",
"express": "^5.1.0",
"ffmpeg-static": "^5.3.0",
"hermes-paperclip-adapter": "^0.2.0",
"jsdom": "^28.1.0",
"multer": "^2.0.2",
@ -84,6 +85,7 @@
"devDependencies": {
"@types/express": "^5.0.0",
"@types/express-serve-static-core": "^5.0.0",
"@types/ffmpeg-static": "^5.1.0",
"@types/jsdom": "^28.0.0",
"@types/multer": "^2.0.0",
"@types/node": "^24.6.0",

View file

@ -0,0 +1,259 @@
import { describe, it, expect, vi, beforeEach, afterEach } from "vitest";
import { EventEmitter } from "node:events";
// Mock ffmpeg-static BEFORE any imports — returns "/mock/ffmpeg" by default
vi.mock("ffmpeg-static", () => ({ default: "/mock/ffmpeg" }));
// Mock child_process
vi.mock("node:child_process", () => ({
execFile: vi.fn(),
spawn: vi.fn(),
}));
// Mock fs/promises for temp file operations
vi.mock("node:fs/promises", () => ({
writeFile: vi.fn().mockResolvedValue(undefined),
unlink: vi.fn().mockResolvedValue(undefined),
readFile: vi.fn().mockResolvedValue(Buffer.from("wav-audio-data")),
}));
// Import the service and mocks once at the top level (ffmpeg-static mock is in effect)
import { voicePipelineService } from "../services/voice-pipeline.js";
import { execFile as execFileCb, spawn } from "node:child_process";
const execFileMock = vi.mocked(execFileCb);
const spawnMock = vi.mocked(spawn);
describe("voicePipelineService", () => {
beforeEach(() => {
vi.clearAllMocks();
});
describe("factory", () => {
it("throws when ffmpegPath is null", async () => {
// This test uses a separate module load with overridden mock
vi.doMock("ffmpeg-static", () => ({ default: null }));
const { voicePipelineService: svcFactory } = await import(
"../services/voice-pipeline.js?null-test"
);
expect(() => svcFactory()).toThrow("ffmpeg-static binary not found");
vi.doUnmock("ffmpeg-static");
});
});
describe("transcodeToWav16k", () => {
it("spawns ffmpeg with correct args for webm input", async () => {
const fakeProcess = new EventEmitter() as any;
fakeProcess.stdin = { write: vi.fn(), end: vi.fn() };
fakeProcess.stdout = new EventEmitter();
fakeProcess.stderr = new EventEmitter();
spawnMock.mockReturnValue(fakeProcess);
const svc = voicePipelineService();
const inputBuffer = Buffer.from("fake-webm-data");
const promise = svc.transcodeToWav16k(inputBuffer, "webm");
// Emit stdout data then close with code 0
fakeProcess.stdout.emit("data", Buffer.from("wav-header"));
fakeProcess.emit("close", 0);
await promise;
expect(spawnMock).toHaveBeenCalledWith(
"/mock/ffmpeg",
["-f", "webm", "-i", "pipe:0", "-ar", "16000", "-ac", "1", "-f", "wav", "pipe:1"],
{ stdio: ["pipe", "pipe", "pipe"] }
);
});
it("rejects when ffmpeg exits non-zero", async () => {
const fakeProcess = new EventEmitter() as any;
fakeProcess.stdin = { write: vi.fn(), end: vi.fn() };
fakeProcess.stdout = new EventEmitter();
fakeProcess.stderr = new EventEmitter();
spawnMock.mockReturnValue(fakeProcess);
const svc = voicePipelineService();
const inputBuffer = Buffer.from("bad-audio");
const promise = svc.transcodeToWav16k(inputBuffer, "webm");
fakeProcess.emit("close", 1);
await expect(promise).rejects.toThrow();
});
});
describe("transcribe", () => {
function setupSpawnForTranscode() {
const fakeProcess = new EventEmitter() as any;
fakeProcess.stdin = { write: vi.fn(), end: vi.fn() };
fakeProcess.stdout = new EventEmitter();
fakeProcess.stderr = new EventEmitter();
spawnMock.mockReturnValue(fakeProcess);
return fakeProcess;
}
it("calls transcodeToWav16k then whisper-cpp with --language auto for webm input", async () => {
execFileMock.mockImplementation((_cmd: any, _args: any, _opts: any, callback: any) => {
callback(null, "Hello world", "");
return {} as any;
});
const svc = voicePipelineService();
const inputBuffer = Buffer.from("fake-webm");
// Set up spawn mock BEFORE calling transcribe
const fakeProcess = setupSpawnForTranscode();
// Start the transcribe (it will spawn ffmpeg internally)
const transcribePromise = svc.transcribe(inputBuffer, "webm");
// Give the spawn setup a tick to register, then emit close
await new Promise((r) => setTimeout(r, 0));
fakeProcess.stdout.emit("data", Buffer.from("wav-data"));
fakeProcess.emit("close", 0);
const result = await transcribePromise;
// Verify spawn was called (transcode happened)
expect(spawnMock).toHaveBeenCalled();
// Verify whisper-cpp was called with --language auto
expect(execFileMock).toHaveBeenCalledWith(
"whisper-cpp",
expect.arrayContaining(["--language", "auto"]),
expect.anything(),
expect.any(Function)
);
expect(result).toHaveProperty("text");
});
it("falls back to openai-whisper when whisper-cpp fails", async () => {
let callCount = 0;
execFileMock.mockImplementation((_cmd: any, _args: any, _opts: any, callback: any) => {
callCount++;
if (callCount === 1) {
// whisper-cpp fails
callback(new Error("whisper-cpp not found"), "", "");
} else {
// openai-whisper succeeds
callback(null, "Fallback transcript", "");
}
return {} as any;
});
const svc = voicePipelineService();
const inputBuffer = Buffer.from("fake-webm");
const fakeProcess = setupSpawnForTranscode();
const transcribePromise = svc.transcribe(inputBuffer, "webm");
await new Promise((r) => setTimeout(r, 0));
fakeProcess.stdout.emit("data", Buffer.from("wav-data"));
fakeProcess.emit("close", 0);
const result = await transcribePromise;
expect(execFileMock).toHaveBeenCalledTimes(2);
expect(result).toHaveProperty("text");
});
it("throws 503-style error when neither whisper binary is available", async () => {
execFileMock.mockImplementation((_cmd: any, _args: any, _opts: any, callback: any) => {
callback(new Error("command not found"), "", "");
return {} as any;
});
const svc = voicePipelineService();
const inputBuffer = Buffer.from("fake-webm");
const fakeProcess = setupSpawnForTranscode();
const transcribePromise = svc.transcribe(inputBuffer, "webm");
await new Promise((r) => setTimeout(r, 0));
fakeProcess.stdout.emit("data", Buffer.from("wav-data"));
fakeProcess.emit("close", 0);
await expect(transcribePromise).rejects.toThrow("Whisper not available");
});
});
describe("synthesize", () => {
it("splits text into sentences and calls piper execFile for each", async () => {
execFileMock.mockImplementation((_cmd: any, _args: any, _opts: any, callback: any) => {
callback(null, Buffer.from("wav-chunk"), "");
return {} as any;
});
const svc = voicePipelineService();
const result = await svc.synthesize("Hello world. How are you?");
// Should have called piper twice (2 sentences)
expect(execFileMock).toHaveBeenCalledTimes(2);
expect(execFileMock).toHaveBeenCalledWith(
"piper",
expect.anything(),
expect.anything(),
expect.any(Function)
);
expect(Buffer.isBuffer(result)).toBe(true);
});
it("throws when piper binary is not found", async () => {
const enoentErr = new Error("piper: command not found") as NodeJS.ErrnoException;
enoentErr.code = "ENOENT";
execFileMock.mockImplementation((_cmd: any, _args: any, _opts: any, callback: any) => {
callback(enoentErr, "", "");
return {} as any;
});
const svc = voicePipelineService();
await expect(svc.synthesize("Hello.")).rejects.toThrow("Piper TTS not available");
});
it("rejects when piper call times out after 8000ms", async () => {
vi.useFakeTimers();
// Never call the callback (simulates hang)
execFileMock.mockImplementation(() => ({} as any));
const svc = voicePipelineService();
const promise = svc.synthesize("Hello.");
// Advance timers to trigger timeout
vi.advanceTimersByTime(9000);
await expect(promise).rejects.toThrow(/[Tt]imed out/);
vi.useRealTimers();
});
});
describe("formatForVoice", () => {
it("extracts SPOKEN section when both SPOKEN: and DETAILED: markers are present", () => {
const svc = voicePipelineService();
const input = "SPOKEN: Hello there\n\nDETAILED: ## Hello\n**world**";
const result = svc.formatForVoice(input);
expect(result).toBe("Hello there");
});
it("strips markdown when SPOKEN marker is absent", () => {
const svc = voicePipelineService();
const input = "## Hello\n**world**\n- item\n```code```";
const result = svc.formatForVoice(input);
// Headings, bold, bullets, and code fences stripped
expect(result).not.toContain("##");
expect(result).not.toContain("**");
expect(result).not.toContain("- item");
expect(result).not.toContain("```");
expect(result).toContain("Hello");
expect(result).toContain("world");
expect(result).toContain("item");
expect(result).toContain("code");
});
it("returns empty string for empty input", () => {
const svc = voicePipelineService();
expect(svc.formatForVoice("")).toBe("");
});
});
});

View file

@ -0,0 +1,215 @@
import ffmpegPath from "ffmpeg-static";
import { spawn, execFile as execFileCb } from "node:child_process";
import { tmpdir } from "node:os";
import path from "node:path";
import { writeFile, unlink } from "node:fs/promises";
/** Promisifies execFile, always resolving with { stdout, stderr } for consistent mocking. */
function execFileAsync(
cmd: string,
args: string[],
opts: { timeout?: number; maxBuffer?: number; input?: string }
): Promise<{ stdout: string; stderr: string }> {
return new Promise((resolve, reject) => {
execFileCb(cmd, args, opts as any, (err, stdout, stderr) => {
if (err) {
reject(err);
} else {
resolve({
stdout: Buffer.isBuffer(stdout) ? stdout.toString() : String(stdout ?? ""),
stderr: Buffer.isBuffer(stderr) ? stderr.toString() : String(stderr ?? ""),
});
}
});
});
}
export function voicePipelineService() {
if (!ffmpegPath) {
throw new Error("ffmpeg-static binary not found on this platform");
}
function withTimeout<T>(promise: Promise<T>, ms: number): Promise<T> {
return Promise.race([
promise,
new Promise<never>((_, reject) =>
setTimeout(() => reject(new Error(`Timed out after ${ms}ms`)), ms)
),
]);
}
async function transcodeToWav16k(inputBuffer: Buffer, inputFormat: string): Promise<Buffer> {
return new Promise<Buffer>((resolve, reject) => {
const ffmpeg = spawn(ffmpegPath, ["-f", inputFormat, "-i", "pipe:0", "-ar", "16000", "-ac", "1", "-f", "wav", "pipe:1"], {
stdio: ["pipe", "pipe", "pipe"],
});
const chunks: Buffer[] = [];
ffmpeg.stdout.on("data", (chunk: Buffer) => {
chunks.push(chunk);
});
ffmpeg.stderr.on("data", () => {
// Discard stderr to avoid blocking
});
ffmpeg.on("close", (code) => {
if (code === 0) {
resolve(Buffer.concat(chunks));
} else {
reject(new Error(`ffmpeg exited with code ${code}`));
}
});
ffmpeg.on("error", (err) => {
reject(err);
});
ffmpeg.stdin.write(inputBuffer);
ffmpeg.stdin.end();
});
}
async function transcribe(
buffer: Buffer,
format: "webm" | "ogg" | "wav"
): Promise<{ text: string; language?: string }> {
const wavBuffer = format !== "wav" ? await transcodeToWav16k(buffer, format) : buffer;
const tmpPath = path.join(tmpdir(), `nexus-audio-${Date.now()}.wav`);
try {
await writeFile(tmpPath, wavBuffer);
// Try whisper-cpp first
try {
const { stdout } = await execFileAsync(
"whisper-cpp",
["--model", "base.en", "--file", tmpPath, "--no-timestamps", "--output-txt", "--language", "auto"],
{ timeout: 30000 }
);
// Parse language from output if present (e.g. "auto-detected language: en")
let language: string | undefined;
const langMatch = stdout.match(/auto-detected language[:\s]+([a-z]{2})/i);
if (langMatch) {
language = langMatch[1];
}
return { text: stdout.trim(), language };
} catch (_whisperCppErr) {
// Fall through to openai-whisper
}
// Try openai-whisper Python CLI as fallback
try {
const { stdout } = await execFileAsync(
"whisper",
[tmpPath, "--model", "base.en", "--output_format", "txt", "--output_dir", tmpdir()],
{ timeout: 60000 }
);
return { text: stdout.trim() };
} catch (_whisperErr) {
// Both failed
}
throw new Error(
"Whisper not available. Install whisper-cpp or openai-whisper for voice input."
);
} finally {
unlink(tmpPath).catch(() => {});
}
}
async function synthesize(text: string, voiceId?: string): Promise<Buffer> {
const sentences = text.split(/(?<=[.!?])\s+/).filter((s) => s.length > 0);
const buffers: Buffer[] = [];
for (const sentence of sentences) {
try {
const audioData = await withTimeout(
new Promise<Buffer>((resolve, reject) => {
execFileCb(
"piper",
["--model", voiceId || "en_US-lessac-medium", "--output-raw"],
{
timeout: 8000,
maxBuffer: 10 * 1024 * 1024,
// @ts-ignore - input option is valid for execFile
input: sentence,
},
(err, stdout) => {
if (err) {
reject(err);
} else {
resolve(Buffer.isBuffer(stdout) ? stdout : Buffer.from(stdout as string));
}
}
);
}),
8000
);
buffers.push(audioData);
} catch (err) {
const nodeErr = err as NodeJS.ErrnoException;
if (nodeErr.code === "ENOENT") {
throw new Error("Piper TTS not available. Install piper for voice output.");
}
throw err;
}
}
return Buffer.concat(buffers);
}
function formatForVoice(text: string): string {
if (!text) return "";
// Check for SPOKEN: marker
const spokenMatch = text.match(/SPOKEN:\s*([\s\S]*?)(?=\nDETAILED:|\n\n[A-Z]+:)/);
if (spokenMatch) {
return spokenMatch[1].trim();
}
// Strip markdown
let result = text;
// Remove triple backtick code fences (with optional language identifier followed by newline)
// Pattern: ```lang\n...content...\n``` → content
// Pattern: ```content``` (no newline) → content
result = result.replace(/```([a-z]*)\n?([\s\S]*?)```/g, (_match, lang, inner) => {
// If lang is present and followed by a newline, it's a language identifier; inner is the code
// If no newline (lang === content), preserve the lang as text
if (lang && !inner.trim()) {
// ``` followed by word then immediately ``` — the "word" is actually content
return lang;
}
return inner.trim();
});
// Remove inline backticks
result = result.replace(/`([^`]+)`/g, "$1");
// Remove heading markers (## Heading -> Heading)
result = result.replace(/^#{1,6}\s+/gm, "");
// Remove bold markers (**text** -> text)
result = result.replace(/\*\*([^*]+)\*\*/g, "$1");
// Remove italic markers (*text* -> text)
result = result.replace(/\*([^*]+)\*/g, "$1");
// Remove bullet point prefixes (- item or * item)
result = result.replace(/^[-*]\s+/gm, "");
// Collapse multiple blank lines into one
result = result.replace(/\n{3,}/g, "\n\n");
return result.trim();
}
return { transcribe, synthesize, formatForVoice, transcodeToWav16k };
}