feat(36-01): VoicePipelineService with transcribe, synthesize, formatForVoice, transcodeToWav16k
- Install ffmpeg-static and @types/ffmpeg-static - Create voice-pipeline.ts with voicePipelineService factory function - transcodeToWav16k: pipes audio through ffmpeg at 16kHz mono WAV - transcribe: whisper-cpp cascade with --language auto, falls back to openai-whisper - synthesize: piper TTS with sentence chunking and 8s timeout via Promise.race - formatForVoice: extracts SPOKEN marker or strips markdown as fallback - Unit tests with mocked child_process (12 tests all passing)
This commit is contained in:
parent
f7153db301
commit
346b42dd73
4 changed files with 1322 additions and 0 deletions
846
pnpm-lock.yaml
generated
846
pnpm-lock.yaml
generated
File diff suppressed because it is too large
Load diff
|
|
@ -68,6 +68,7 @@
|
|||
"drizzle-orm": "^0.38.4",
|
||||
"embedded-postgres": "^18.1.0-beta.16",
|
||||
"express": "^5.1.0",
|
||||
"ffmpeg-static": "^5.3.0",
|
||||
"hermes-paperclip-adapter": "^0.2.0",
|
||||
"jsdom": "^28.1.0",
|
||||
"multer": "^2.0.2",
|
||||
|
|
@ -84,6 +85,7 @@
|
|||
"devDependencies": {
|
||||
"@types/express": "^5.0.0",
|
||||
"@types/express-serve-static-core": "^5.0.0",
|
||||
"@types/ffmpeg-static": "^5.1.0",
|
||||
"@types/jsdom": "^28.0.0",
|
||||
"@types/multer": "^2.0.0",
|
||||
"@types/node": "^24.6.0",
|
||||
|
|
|
|||
259
server/src/__tests__/36-voice-pipeline.test.ts
Normal file
259
server/src/__tests__/36-voice-pipeline.test.ts
Normal file
|
|
@ -0,0 +1,259 @@
|
|||
import { describe, it, expect, vi, beforeEach, afterEach } from "vitest";
|
||||
import { EventEmitter } from "node:events";
|
||||
|
||||
// Mock ffmpeg-static BEFORE any imports — returns "/mock/ffmpeg" by default
|
||||
vi.mock("ffmpeg-static", () => ({ default: "/mock/ffmpeg" }));
|
||||
|
||||
// Mock child_process
|
||||
vi.mock("node:child_process", () => ({
|
||||
execFile: vi.fn(),
|
||||
spawn: vi.fn(),
|
||||
}));
|
||||
|
||||
// Mock fs/promises for temp file operations
|
||||
vi.mock("node:fs/promises", () => ({
|
||||
writeFile: vi.fn().mockResolvedValue(undefined),
|
||||
unlink: vi.fn().mockResolvedValue(undefined),
|
||||
readFile: vi.fn().mockResolvedValue(Buffer.from("wav-audio-data")),
|
||||
}));
|
||||
|
||||
// Import the service and mocks once at the top level (ffmpeg-static mock is in effect)
|
||||
import { voicePipelineService } from "../services/voice-pipeline.js";
|
||||
import { execFile as execFileCb, spawn } from "node:child_process";
|
||||
|
||||
const execFileMock = vi.mocked(execFileCb);
|
||||
const spawnMock = vi.mocked(spawn);
|
||||
|
||||
describe("voicePipelineService", () => {
|
||||
beforeEach(() => {
|
||||
vi.clearAllMocks();
|
||||
});
|
||||
|
||||
describe("factory", () => {
|
||||
it("throws when ffmpegPath is null", async () => {
|
||||
// This test uses a separate module load with overridden mock
|
||||
vi.doMock("ffmpeg-static", () => ({ default: null }));
|
||||
const { voicePipelineService: svcFactory } = await import(
|
||||
"../services/voice-pipeline.js?null-test"
|
||||
);
|
||||
expect(() => svcFactory()).toThrow("ffmpeg-static binary not found");
|
||||
vi.doUnmock("ffmpeg-static");
|
||||
});
|
||||
});
|
||||
|
||||
describe("transcodeToWav16k", () => {
|
||||
it("spawns ffmpeg with correct args for webm input", async () => {
|
||||
const fakeProcess = new EventEmitter() as any;
|
||||
fakeProcess.stdin = { write: vi.fn(), end: vi.fn() };
|
||||
fakeProcess.stdout = new EventEmitter();
|
||||
fakeProcess.stderr = new EventEmitter();
|
||||
spawnMock.mockReturnValue(fakeProcess);
|
||||
|
||||
const svc = voicePipelineService();
|
||||
const inputBuffer = Buffer.from("fake-webm-data");
|
||||
const promise = svc.transcodeToWav16k(inputBuffer, "webm");
|
||||
|
||||
// Emit stdout data then close with code 0
|
||||
fakeProcess.stdout.emit("data", Buffer.from("wav-header"));
|
||||
fakeProcess.emit("close", 0);
|
||||
|
||||
await promise;
|
||||
|
||||
expect(spawnMock).toHaveBeenCalledWith(
|
||||
"/mock/ffmpeg",
|
||||
["-f", "webm", "-i", "pipe:0", "-ar", "16000", "-ac", "1", "-f", "wav", "pipe:1"],
|
||||
{ stdio: ["pipe", "pipe", "pipe"] }
|
||||
);
|
||||
});
|
||||
|
||||
it("rejects when ffmpeg exits non-zero", async () => {
|
||||
const fakeProcess = new EventEmitter() as any;
|
||||
fakeProcess.stdin = { write: vi.fn(), end: vi.fn() };
|
||||
fakeProcess.stdout = new EventEmitter();
|
||||
fakeProcess.stderr = new EventEmitter();
|
||||
spawnMock.mockReturnValue(fakeProcess);
|
||||
|
||||
const svc = voicePipelineService();
|
||||
const inputBuffer = Buffer.from("bad-audio");
|
||||
const promise = svc.transcodeToWav16k(inputBuffer, "webm");
|
||||
|
||||
fakeProcess.emit("close", 1);
|
||||
|
||||
await expect(promise).rejects.toThrow();
|
||||
});
|
||||
});
|
||||
|
||||
describe("transcribe", () => {
|
||||
function setupSpawnForTranscode() {
|
||||
const fakeProcess = new EventEmitter() as any;
|
||||
fakeProcess.stdin = { write: vi.fn(), end: vi.fn() };
|
||||
fakeProcess.stdout = new EventEmitter();
|
||||
fakeProcess.stderr = new EventEmitter();
|
||||
spawnMock.mockReturnValue(fakeProcess);
|
||||
return fakeProcess;
|
||||
}
|
||||
|
||||
it("calls transcodeToWav16k then whisper-cpp with --language auto for webm input", async () => {
|
||||
execFileMock.mockImplementation((_cmd: any, _args: any, _opts: any, callback: any) => {
|
||||
callback(null, "Hello world", "");
|
||||
return {} as any;
|
||||
});
|
||||
|
||||
const svc = voicePipelineService();
|
||||
const inputBuffer = Buffer.from("fake-webm");
|
||||
|
||||
// Set up spawn mock BEFORE calling transcribe
|
||||
const fakeProcess = setupSpawnForTranscode();
|
||||
|
||||
// Start the transcribe (it will spawn ffmpeg internally)
|
||||
const transcribePromise = svc.transcribe(inputBuffer, "webm");
|
||||
|
||||
// Give the spawn setup a tick to register, then emit close
|
||||
await new Promise((r) => setTimeout(r, 0));
|
||||
fakeProcess.stdout.emit("data", Buffer.from("wav-data"));
|
||||
fakeProcess.emit("close", 0);
|
||||
|
||||
const result = await transcribePromise;
|
||||
|
||||
// Verify spawn was called (transcode happened)
|
||||
expect(spawnMock).toHaveBeenCalled();
|
||||
|
||||
// Verify whisper-cpp was called with --language auto
|
||||
expect(execFileMock).toHaveBeenCalledWith(
|
||||
"whisper-cpp",
|
||||
expect.arrayContaining(["--language", "auto"]),
|
||||
expect.anything(),
|
||||
expect.any(Function)
|
||||
);
|
||||
|
||||
expect(result).toHaveProperty("text");
|
||||
});
|
||||
|
||||
it("falls back to openai-whisper when whisper-cpp fails", async () => {
|
||||
let callCount = 0;
|
||||
execFileMock.mockImplementation((_cmd: any, _args: any, _opts: any, callback: any) => {
|
||||
callCount++;
|
||||
if (callCount === 1) {
|
||||
// whisper-cpp fails
|
||||
callback(new Error("whisper-cpp not found"), "", "");
|
||||
} else {
|
||||
// openai-whisper succeeds
|
||||
callback(null, "Fallback transcript", "");
|
||||
}
|
||||
return {} as any;
|
||||
});
|
||||
|
||||
const svc = voicePipelineService();
|
||||
const inputBuffer = Buffer.from("fake-webm");
|
||||
const fakeProcess = setupSpawnForTranscode();
|
||||
const transcribePromise = svc.transcribe(inputBuffer, "webm");
|
||||
|
||||
await new Promise((r) => setTimeout(r, 0));
|
||||
fakeProcess.stdout.emit("data", Buffer.from("wav-data"));
|
||||
fakeProcess.emit("close", 0);
|
||||
|
||||
const result = await transcribePromise;
|
||||
|
||||
expect(execFileMock).toHaveBeenCalledTimes(2);
|
||||
expect(result).toHaveProperty("text");
|
||||
});
|
||||
|
||||
it("throws 503-style error when neither whisper binary is available", async () => {
|
||||
execFileMock.mockImplementation((_cmd: any, _args: any, _opts: any, callback: any) => {
|
||||
callback(new Error("command not found"), "", "");
|
||||
return {} as any;
|
||||
});
|
||||
|
||||
const svc = voicePipelineService();
|
||||
const inputBuffer = Buffer.from("fake-webm");
|
||||
const fakeProcess = setupSpawnForTranscode();
|
||||
const transcribePromise = svc.transcribe(inputBuffer, "webm");
|
||||
|
||||
await new Promise((r) => setTimeout(r, 0));
|
||||
fakeProcess.stdout.emit("data", Buffer.from("wav-data"));
|
||||
fakeProcess.emit("close", 0);
|
||||
|
||||
await expect(transcribePromise).rejects.toThrow("Whisper not available");
|
||||
});
|
||||
});
|
||||
|
||||
describe("synthesize", () => {
|
||||
it("splits text into sentences and calls piper execFile for each", async () => {
|
||||
execFileMock.mockImplementation((_cmd: any, _args: any, _opts: any, callback: any) => {
|
||||
callback(null, Buffer.from("wav-chunk"), "");
|
||||
return {} as any;
|
||||
});
|
||||
|
||||
const svc = voicePipelineService();
|
||||
const result = await svc.synthesize("Hello world. How are you?");
|
||||
|
||||
// Should have called piper twice (2 sentences)
|
||||
expect(execFileMock).toHaveBeenCalledTimes(2);
|
||||
expect(execFileMock).toHaveBeenCalledWith(
|
||||
"piper",
|
||||
expect.anything(),
|
||||
expect.anything(),
|
||||
expect.any(Function)
|
||||
);
|
||||
expect(Buffer.isBuffer(result)).toBe(true);
|
||||
});
|
||||
|
||||
it("throws when piper binary is not found", async () => {
|
||||
const enoentErr = new Error("piper: command not found") as NodeJS.ErrnoException;
|
||||
enoentErr.code = "ENOENT";
|
||||
execFileMock.mockImplementation((_cmd: any, _args: any, _opts: any, callback: any) => {
|
||||
callback(enoentErr, "", "");
|
||||
return {} as any;
|
||||
});
|
||||
|
||||
const svc = voicePipelineService();
|
||||
await expect(svc.synthesize("Hello.")).rejects.toThrow("Piper TTS not available");
|
||||
});
|
||||
|
||||
it("rejects when piper call times out after 8000ms", async () => {
|
||||
vi.useFakeTimers();
|
||||
|
||||
// Never call the callback (simulates hang)
|
||||
execFileMock.mockImplementation(() => ({} as any));
|
||||
|
||||
const svc = voicePipelineService();
|
||||
const promise = svc.synthesize("Hello.");
|
||||
|
||||
// Advance timers to trigger timeout
|
||||
vi.advanceTimersByTime(9000);
|
||||
|
||||
await expect(promise).rejects.toThrow(/[Tt]imed out/);
|
||||
|
||||
vi.useRealTimers();
|
||||
});
|
||||
});
|
||||
|
||||
describe("formatForVoice", () => {
|
||||
it("extracts SPOKEN section when both SPOKEN: and DETAILED: markers are present", () => {
|
||||
const svc = voicePipelineService();
|
||||
const input = "SPOKEN: Hello there\n\nDETAILED: ## Hello\n**world**";
|
||||
const result = svc.formatForVoice(input);
|
||||
expect(result).toBe("Hello there");
|
||||
});
|
||||
|
||||
it("strips markdown when SPOKEN marker is absent", () => {
|
||||
const svc = voicePipelineService();
|
||||
const input = "## Hello\n**world**\n- item\n```code```";
|
||||
const result = svc.formatForVoice(input);
|
||||
// Headings, bold, bullets, and code fences stripped
|
||||
expect(result).not.toContain("##");
|
||||
expect(result).not.toContain("**");
|
||||
expect(result).not.toContain("- item");
|
||||
expect(result).not.toContain("```");
|
||||
expect(result).toContain("Hello");
|
||||
expect(result).toContain("world");
|
||||
expect(result).toContain("item");
|
||||
expect(result).toContain("code");
|
||||
});
|
||||
|
||||
it("returns empty string for empty input", () => {
|
||||
const svc = voicePipelineService();
|
||||
expect(svc.formatForVoice("")).toBe("");
|
||||
});
|
||||
});
|
||||
});
|
||||
215
server/src/services/voice-pipeline.ts
Normal file
215
server/src/services/voice-pipeline.ts
Normal file
|
|
@ -0,0 +1,215 @@
|
|||
import ffmpegPath from "ffmpeg-static";
|
||||
import { spawn, execFile as execFileCb } from "node:child_process";
|
||||
import { tmpdir } from "node:os";
|
||||
import path from "node:path";
|
||||
import { writeFile, unlink } from "node:fs/promises";
|
||||
|
||||
/** Promisifies execFile, always resolving with { stdout, stderr } for consistent mocking. */
|
||||
function execFileAsync(
|
||||
cmd: string,
|
||||
args: string[],
|
||||
opts: { timeout?: number; maxBuffer?: number; input?: string }
|
||||
): Promise<{ stdout: string; stderr: string }> {
|
||||
return new Promise((resolve, reject) => {
|
||||
execFileCb(cmd, args, opts as any, (err, stdout, stderr) => {
|
||||
if (err) {
|
||||
reject(err);
|
||||
} else {
|
||||
resolve({
|
||||
stdout: Buffer.isBuffer(stdout) ? stdout.toString() : String(stdout ?? ""),
|
||||
stderr: Buffer.isBuffer(stderr) ? stderr.toString() : String(stderr ?? ""),
|
||||
});
|
||||
}
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
export function voicePipelineService() {
|
||||
if (!ffmpegPath) {
|
||||
throw new Error("ffmpeg-static binary not found on this platform");
|
||||
}
|
||||
|
||||
|
||||
function withTimeout<T>(promise: Promise<T>, ms: number): Promise<T> {
|
||||
return Promise.race([
|
||||
promise,
|
||||
new Promise<never>((_, reject) =>
|
||||
setTimeout(() => reject(new Error(`Timed out after ${ms}ms`)), ms)
|
||||
),
|
||||
]);
|
||||
}
|
||||
|
||||
async function transcodeToWav16k(inputBuffer: Buffer, inputFormat: string): Promise<Buffer> {
|
||||
return new Promise<Buffer>((resolve, reject) => {
|
||||
const ffmpeg = spawn(ffmpegPath, ["-f", inputFormat, "-i", "pipe:0", "-ar", "16000", "-ac", "1", "-f", "wav", "pipe:1"], {
|
||||
stdio: ["pipe", "pipe", "pipe"],
|
||||
});
|
||||
|
||||
const chunks: Buffer[] = [];
|
||||
|
||||
ffmpeg.stdout.on("data", (chunk: Buffer) => {
|
||||
chunks.push(chunk);
|
||||
});
|
||||
|
||||
ffmpeg.stderr.on("data", () => {
|
||||
// Discard stderr to avoid blocking
|
||||
});
|
||||
|
||||
ffmpeg.on("close", (code) => {
|
||||
if (code === 0) {
|
||||
resolve(Buffer.concat(chunks));
|
||||
} else {
|
||||
reject(new Error(`ffmpeg exited with code ${code}`));
|
||||
}
|
||||
});
|
||||
|
||||
ffmpeg.on("error", (err) => {
|
||||
reject(err);
|
||||
});
|
||||
|
||||
ffmpeg.stdin.write(inputBuffer);
|
||||
ffmpeg.stdin.end();
|
||||
});
|
||||
}
|
||||
|
||||
async function transcribe(
|
||||
buffer: Buffer,
|
||||
format: "webm" | "ogg" | "wav"
|
||||
): Promise<{ text: string; language?: string }> {
|
||||
const wavBuffer = format !== "wav" ? await transcodeToWav16k(buffer, format) : buffer;
|
||||
|
||||
const tmpPath = path.join(tmpdir(), `nexus-audio-${Date.now()}.wav`);
|
||||
|
||||
try {
|
||||
await writeFile(tmpPath, wavBuffer);
|
||||
|
||||
// Try whisper-cpp first
|
||||
try {
|
||||
const { stdout } = await execFileAsync(
|
||||
"whisper-cpp",
|
||||
["--model", "base.en", "--file", tmpPath, "--no-timestamps", "--output-txt", "--language", "auto"],
|
||||
{ timeout: 30000 }
|
||||
);
|
||||
|
||||
// Parse language from output if present (e.g. "auto-detected language: en")
|
||||
let language: string | undefined;
|
||||
const langMatch = stdout.match(/auto-detected language[:\s]+([a-z]{2})/i);
|
||||
if (langMatch) {
|
||||
language = langMatch[1];
|
||||
}
|
||||
|
||||
return { text: stdout.trim(), language };
|
||||
} catch (_whisperCppErr) {
|
||||
// Fall through to openai-whisper
|
||||
}
|
||||
|
||||
// Try openai-whisper Python CLI as fallback
|
||||
try {
|
||||
const { stdout } = await execFileAsync(
|
||||
"whisper",
|
||||
[tmpPath, "--model", "base.en", "--output_format", "txt", "--output_dir", tmpdir()],
|
||||
{ timeout: 60000 }
|
||||
);
|
||||
return { text: stdout.trim() };
|
||||
} catch (_whisperErr) {
|
||||
// Both failed
|
||||
}
|
||||
|
||||
throw new Error(
|
||||
"Whisper not available. Install whisper-cpp or openai-whisper for voice input."
|
||||
);
|
||||
} finally {
|
||||
unlink(tmpPath).catch(() => {});
|
||||
}
|
||||
}
|
||||
|
||||
async function synthesize(text: string, voiceId?: string): Promise<Buffer> {
|
||||
const sentences = text.split(/(?<=[.!?])\s+/).filter((s) => s.length > 0);
|
||||
|
||||
const buffers: Buffer[] = [];
|
||||
|
||||
for (const sentence of sentences) {
|
||||
try {
|
||||
const audioData = await withTimeout(
|
||||
new Promise<Buffer>((resolve, reject) => {
|
||||
execFileCb(
|
||||
"piper",
|
||||
["--model", voiceId || "en_US-lessac-medium", "--output-raw"],
|
||||
{
|
||||
timeout: 8000,
|
||||
maxBuffer: 10 * 1024 * 1024,
|
||||
// @ts-ignore - input option is valid for execFile
|
||||
input: sentence,
|
||||
},
|
||||
(err, stdout) => {
|
||||
if (err) {
|
||||
reject(err);
|
||||
} else {
|
||||
resolve(Buffer.isBuffer(stdout) ? stdout : Buffer.from(stdout as string));
|
||||
}
|
||||
}
|
||||
);
|
||||
}),
|
||||
8000
|
||||
);
|
||||
buffers.push(audioData);
|
||||
} catch (err) {
|
||||
const nodeErr = err as NodeJS.ErrnoException;
|
||||
if (nodeErr.code === "ENOENT") {
|
||||
throw new Error("Piper TTS not available. Install piper for voice output.");
|
||||
}
|
||||
throw err;
|
||||
}
|
||||
}
|
||||
|
||||
return Buffer.concat(buffers);
|
||||
}
|
||||
|
||||
function formatForVoice(text: string): string {
|
||||
if (!text) return "";
|
||||
|
||||
// Check for SPOKEN: marker
|
||||
const spokenMatch = text.match(/SPOKEN:\s*([\s\S]*?)(?=\nDETAILED:|\n\n[A-Z]+:)/);
|
||||
if (spokenMatch) {
|
||||
return spokenMatch[1].trim();
|
||||
}
|
||||
|
||||
// Strip markdown
|
||||
let result = text;
|
||||
|
||||
// Remove triple backtick code fences (with optional language identifier followed by newline)
|
||||
// Pattern: ```lang\n...content...\n``` → content
|
||||
// Pattern: ```content``` (no newline) → content
|
||||
result = result.replace(/```([a-z]*)\n?([\s\S]*?)```/g, (_match, lang, inner) => {
|
||||
// If lang is present and followed by a newline, it's a language identifier; inner is the code
|
||||
// If no newline (lang === content), preserve the lang as text
|
||||
if (lang && !inner.trim()) {
|
||||
// ``` followed by word then immediately ``` — the "word" is actually content
|
||||
return lang;
|
||||
}
|
||||
return inner.trim();
|
||||
});
|
||||
|
||||
// Remove inline backticks
|
||||
result = result.replace(/`([^`]+)`/g, "$1");
|
||||
|
||||
// Remove heading markers (## Heading -> Heading)
|
||||
result = result.replace(/^#{1,6}\s+/gm, "");
|
||||
|
||||
// Remove bold markers (**text** -> text)
|
||||
result = result.replace(/\*\*([^*]+)\*\*/g, "$1");
|
||||
|
||||
// Remove italic markers (*text* -> text)
|
||||
result = result.replace(/\*([^*]+)\*/g, "$1");
|
||||
|
||||
// Remove bullet point prefixes (- item or * item)
|
||||
result = result.replace(/^[-*]\s+/gm, "");
|
||||
|
||||
// Collapse multiple blank lines into one
|
||||
result = result.replace(/\n{3,}/g, "\n\n");
|
||||
|
||||
return result.trim();
|
||||
}
|
||||
|
||||
return { transcribe, synthesize, formatForVoice, transcodeToWav16k };
|
||||
}
|
||||
Loading…
Add table
Reference in a new issue