import ffmpegPath from "ffmpeg-static";
import { spawn, execFile as execFileCb } from "node:child_process";
import { tmpdir } from "node:os";
import path from "node:path";
import { writeFile, unlink } from "node:fs/promises";

/**
 * Promise wrapper around `execFile` that always resolves with
 * `{ stdout, stderr }` as strings (kept as a single seam for mocking).
 *
 * @param cmd - Executable to run.
 * @param args - Arguments passed to the executable.
 * @param opts - `timeout` / `maxBuffer` are forwarded to `execFile`. `input`,
 *   when provided, is written to the child's stdin and stdin is then closed
 *   (the asynchronous `execFile` has no `input` option of its own).
 * @returns The captured stdout and stderr.
 * @throws Rejects with the error reported by `execFile` (spawn failure,
 *   non-zero exit, timeout, or maxBuffer overflow).
 */
function execFileAsync(
  cmd: string,
  args: string[],
  opts: { timeout?: number; maxBuffer?: number; input?: string }
): Promise<{ stdout: string; stderr: string }> {
  const { input, ...execOpts } = opts;
  const toText = (value: unknown): string =>
    Buffer.isBuffer(value) ? value.toString() : String(value ?? "");

  return new Promise((resolve, reject) => {
    const child = execFileCb(cmd, args, execOpts, (err, stdout, stderr) => {
      if (err) {
        reject(err);
        return;
      }
      resolve({ stdout: toText(stdout), stderr: toText(stderr) });
    });

    if (input !== undefined) {
      // A child that exits before reading everything raises EPIPE on stdin;
      // the execFile callback already reports the failure, so ignore it here.
      child.stdin?.on("error", () => {});
      child.stdin?.end(input);
    }
  });
}

/** Marker that temporarily stands in for the whitespace after a protected abbreviation. */
const SENTENCE_PLACEHOLDER = "\x00";

/** Title abbreviations (Dr., Mr., ...) followed by whitespace; these must not end a sentence. */
const TITLE_ABBREVIATION_RE =
  /\b(Mr|Mrs|Ms|Dr|Prof|Sr|Jr|Rev|Gen|Col|Sgt|Cpl|Pvt|Lt|Cmdr|Capt|Gov|Rep|Sen)\.\s+/g;

/** Whitespace that directly follows sentence-ending punctuation. */
const SENTENCE_BOUNDARY_RE = /(?<=[.!?])\s+/;

/** Matches every placeholder so it can be turned back into a space. */
const SENTENCE_PLACEHOLDER_RE = /\x00/g;

/**
 * Splits text into sentences.
 *
 * A split happens at whitespace that follows `.`, `!` or `?`. Common title
 * abbreviations (Dr., Mr., Mrs., ...) are protected from splitting; the
 * whitespace after them is normalised to a single space. Other dotted
 * abbreviations and acronyms (D.C., U.S.) are not protected, so they split
 * wherever they are followed by whitespace.
 *
 * @param text - Text to split.
 * @returns Trimmed, non-empty sentences; `[]` for empty or whitespace-only input.
 */
export function splitSentences(text: string): string[] {
  if (!text || !text.trim()) return [];

  // Swap the whitespace after a title abbreviation for the placeholder so the
  // boundary regex below does not see it.
  const shielded = text.replace(
    TITLE_ABBREVIATION_RE,
    (_match, abbr: string) => `${abbr}.${SENTENCE_PLACEHOLDER}`
  );

  return shielded
    .split(SENTENCE_BOUNDARY_RE)
    .map((part) => part.replace(SENTENCE_PLACEHOLDER_RE, " ").trim())
    .filter((part) => part.length > 0);
}

/**
 * Builds the voice pipeline: speech-to-text through a whisper CLI,
 * text-to-speech through the piper CLI, audio transcoding through the bundled
 * ffmpeg binary, and a helper that prepares text for being spoken.
 *
 * @returns An object exposing `transcribe`, `synthesize`,
 *   `synthesizeSentenceStream`, `synthesizeMultiLang`, `formatForVoice` and
 *   `transcodeToWav16k`.
 * @throws Error when `ffmpeg-static` provides no binary path.
 */
export function voicePipelineService() {
  if (!ffmpegPath) {
    throw new Error("ffmpeg-static binary not found on this platform");
  }
  // Cast retained from the original: the package typings are not visible here.
  const ffmpegBin = ffmpegPath as unknown as string;

  /**
   * Rejects with `Timed out after <ms>ms` if `promise` has not settled in time.
   * The timer is cleared once the race settles so it cannot keep the event
   * loop alive after the work is done.
   */
  function withTimeout<T>(promise: Promise<T>, ms: number): Promise<T> {
    let timer: ReturnType<typeof setTimeout> | undefined;
    const deadline = new Promise<never>((_, reject) => {
      timer = setTimeout(() => reject(new Error(`Timed out after ${ms}ms`)), ms);
    });
    return Promise.race([promise, deadline]).finally(() => clearTimeout(timer));
  }

  /**
   * Pipes `inputBuffer` through ffmpeg and returns 16 kHz mono WAV data.
   *
   * @param inputBuffer - Encoded audio.
   * @param inputFormat - Value passed to ffmpeg's `-f` input option.
   * @returns The WAV bytes collected from ffmpeg's stdout.
   * @throws Rejects when ffmpeg cannot be spawned or exits with a non-zero code.
   */
  async function transcodeToWav16k(inputBuffer: Buffer, inputFormat: string): Promise<Buffer> {
    return new Promise<Buffer>((resolve, reject) => {
      const ffmpeg = spawn(
        ffmpegBin,
        ["-f", inputFormat, "-i", "pipe:0", "-ar", "16000", "-ac", "1", "-f", "wav", "pipe:1"],
        { stdio: ["pipe", "pipe", "pipe"] }
      );

      const chunks: Buffer[] = [];
      ffmpeg.stdout.on("data", (chunk: Buffer) => {
        chunks.push(chunk);
      });

      // Drain stderr so a full pipe cannot stall ffmpeg.
      ffmpeg.stderr.on("data", () => {});

      ffmpeg.on("close", (code) => {
        if (code === 0) {
          resolve(Buffer.concat(chunks));
        } else {
          reject(new Error(`ffmpeg exited with code ${code}`));
        }
      });

      ffmpeg.on("error", (err) => {
        reject(err);
      });

      // If ffmpeg exits before consuming all input, the write fails with EPIPE.
      // Without a listener that becomes an unhandled 'error' event; the
      // 'close' handler above reports the actual outcome.
      ffmpeg.stdin.on("error", () => {});
      ffmpeg.stdin.write(inputBuffer);
      ffmpeg.stdin.end();
    });
  }

  /**
   * Transcribes audio by trying the `whisper-cpp` CLI first and the `whisper`
   * (openai-whisper) CLI second.
   *
   * @param buffer - Encoded audio.
   * @param format - Container of `buffer`; anything but `"wav"` is transcoded first.
   * @returns The trimmed CLI stdout as `text`, plus `language` when whisper-cpp
   *   printed an "auto-detected language" line.
   * @throws Error when both CLIs fail (for any reason), or when transcoding or
   *   writing the temporary file fails.
   */
  async function transcribe(
    buffer: Buffer,
    format: "webm" | "ogg" | "wav"
  ): Promise<{ text: string; language?: string }> {
    const wavBuffer = format !== "wav" ? await transcodeToWav16k(buffer, format) : buffer;

    // Timestamp alone collides for calls within the same millisecond, so add
    // the pid and a random suffix.
    const stem = `nexus-audio-${Date.now()}-${process.pid}-${Math.random().toString(36).slice(2)}`;
    const tmpPath = path.join(tmpdir(), `${stem}.wav`);

    try {
      await writeFile(tmpPath, wavBuffer);

      // Try whisper-cpp first
      try {
        const { stdout } = await execFileAsync(
          "whisper-cpp",
          ["--model", "base.en", "--file", tmpPath, "--no-timestamps", "--output-txt", "--language", "auto"],
          { timeout: 30000 }
        );

        // Parse language from output if present (e.g. "auto-detected language: en")
        const langMatch = stdout.match(/auto-detected language[:\s]+([a-z]{2})/i);
        return { text: stdout.trim(), language: langMatch ? langMatch[1] : undefined };
      } catch {
        // Fall through to openai-whisper
      }

      // Try openai-whisper Python CLI as fallback
      try {
        const { stdout } = await execFileAsync(
          "whisper",
          [tmpPath, "--model", "base.en", "--output_format", "txt", "--output_dir", tmpdir()],
          { timeout: 60000 }
        );
        return { text: stdout.trim() };
      } catch {
        // Both failed
      }

      throw new Error(
        "Whisper not available. Install whisper-cpp or openai-whisper for voice input."
      );
    } finally {
      // Best-effort cleanup of the audio file and of the transcript files the
      // CLIs are asked to write: whisper-cpp's --output-txt is expected to
      // produce "<input>.txt", openai-whisper "<stem>.txt" inside --output_dir.
      const leftovers = [tmpPath, `${tmpPath}.txt`, path.join(tmpdir(), `${stem}.txt`)];
      for (const file of leftovers) {
        void unlink(file).catch(() => {});
      }
    }
  }

  /**
   * Synthesizes one sentence with piper and returns the raw audio bytes.
   *
   * @param sentence - Text fed to piper on stdin.
   * @param voiceId - Piper model name; defaults to `en_US-lessac-medium`.
   * @throws Rejects with the `execFile` error, or a timeout error after 8 s.
   */
  async function synthesizeSentence(sentence: string, voiceId?: string): Promise<Buffer> {
    return withTimeout(
      new Promise<Buffer>((resolve, reject) => {
        const child = execFileCb(
          "piper",
          ["--model", voiceId || "en_US-lessac-medium", "--output-raw"],
          {
            timeout: 8000,
            maxBuffer: 10 * 1024 * 1024,
            // Raw PCM must not be decoded as UTF-8 text.
            encoding: "buffer",
          },
          (err: Error | null, stdout: string | Buffer) => {
            if (err) {
              reject(err);
            } else {
              resolve(Buffer.isBuffer(stdout) ? stdout : Buffer.from(stdout));
            }
          }
        );

        // The asynchronous execFile ignores an `input` option, so the text is
        // written to stdin explicitly. Optional chaining tolerates a stubbed
        // execFile that returns no child process.
        child?.stdin?.on("error", () => {});
        child?.stdin?.end(sentence);
      }),
      8000
    );
  }

  /** Maps a missing-binary (ENOENT) failure to a user-facing install hint; other errors pass through. */
  function toPiperError(err: unknown): unknown {
    const code = (err as NodeJS.ErrnoException | null | undefined)?.code;
    return code === "ENOENT"
      ? new Error("Piper TTS not available. Install piper for voice output.")
      : err;
  }

  /**
   * Synthesizes `text` sentence by sentence and returns the concatenated audio.
   *
   * @param text - Text to speak; empty text yields an empty buffer.
   * @param voiceId - Optional piper model name.
   * @throws Error with an install hint when piper is not installed; otherwise
   *   the underlying synthesis error.
   */
  async function synthesize(text: string, voiceId?: string): Promise<Buffer> {
    const buffers: Buffer[] = [];
    for (const sentence of splitSentences(text)) {
      try {
        buffers.push(await synthesizeSentence(sentence, voiceId));
      } catch (err) {
        throw toPiperError(err);
      }
    }
    return Buffer.concat(buffers);
  }

  /**
   * Yields the audio for each sentence as soon as it has been synthesized.
   *
   * @param text - Text to speak.
   * @param voiceId - Optional piper model name.
   * @returns An async generator of `{ index, total, audio }`, where `index` is
   *   the zero-based sentence position and `total` the sentence count.
   * @throws Error with an install hint when piper is not installed; otherwise
   *   the underlying synthesis error.
   */
  async function* synthesizeSentenceStream(
    text: string,
    voiceId?: string
  ): AsyncGenerator<{ index: number; total: number; audio: Buffer }> {
    const sentences = splitSentences(text);
    const total = sentences.length;

    for (const [index, sentence] of sentences.entries()) {
      let audio: Buffer;
      try {
        audio = await synthesizeSentence(sentence, voiceId);
      } catch (err) {
        throw toPiperError(err);
      }
      yield { index, total, audio };
    }
  }

  /**
   * Synthesizes the same text with several voices in parallel.
   *
   * @param text - Text to speak.
   * @param voiceIds - Piper model names; duplicates are synthesized only once.
   * @returns Map from voice id to its audio.
   * @throws Rejects with the first synthesis failure.
   */
  async function synthesizeMultiLang(text: string, voiceIds: string[]): Promise<Map<string, Buffer>> {
    // The resulting Map can hold one entry per id anyway, so skip repeat work.
    const uniqueVoiceIds = [...new Set(voiceIds)];
    const results = await Promise.all(
      uniqueVoiceIds.map(async (voiceId): Promise<[string, Buffer]> => {
        const audio = await synthesize(text, voiceId);
        return [voiceId, audio];
      })
    );
    return new Map(results);
  }

  /**
   * Prepares text for speech output.
   *
   * If the text contains a `SPOKEN:` section, only that section is returned.
   * It ends at a following `DETAILED:` line, at a blank line followed by
   * another upper-case `LABEL:`, or at the end of the text. Otherwise markdown
   * syntax (code fences, inline code, headings, bullets, bold, italic) is
   * stripped and runs of blank lines are collapsed.
   *
   * @param text - Raw text, possibly containing markdown.
   * @returns Text suitable for TTS; `""` for empty input.
   */
  function formatForVoice(text: string): string {
    if (!text) return "";

    // `$` lets a SPOKEN: section that runs to the end of the text match too.
    const spokenMatch = text.match(/SPOKEN:\s*([\s\S]*?)(?=\nDETAILED:|\n\n[A-Z]+:|$)/);
    if (spokenMatch) {
      return spokenMatch[1].trim();
    }

    let result = text;

    // Unwrap triple-backtick fences. A leading word counts as a language tag
    // only when a newline follows it, so "```hello world```" keeps both words.
    result = result.replace(/```(?:[\w+#-]*\n)?([\s\S]*?)```/g, (_match, inner: string) =>
      inner.trim()
    );

    // Remove inline backticks
    result = result.replace(/`([^`]+)`/g, "$1");

    // Remove heading markers (## Heading -> Heading)
    result = result.replace(/^#{1,6}\s+/gm, "");

    // Remove bullet prefixes (- item or * item) before emphasis handling, so
    // the "*" of a bullet is never paired up as an italic marker.
    result = result.replace(/^[-*]\s+/gm, "");

    // Remove bold markers (**text** -> text)
    result = result.replace(/\*\*([^*]+)\*\*/g, "$1");

    // Remove italic markers (*text* -> text); emphasis never spans lines.
    result = result.replace(/\*([^*\n]+)\*/g, "$1");

    // Collapse multiple blank lines into one
    result = result.replace(/\n{3,}/g, "\n\n");

    return result.trim();
  }

  return {
    transcribe,
    synthesize,
    synthesizeSentenceStream,
    synthesizeMultiLang,
    formatForVoice,
    transcodeToWav16k,
  };
}