From 3a90d7f5c77cbf5f4febb51bef88aa6190e81de0 Mon Sep 17 00:00:00 2001 From: Nexus Dev Date: Sat, 4 Apr 2026 01:16:50 +0000 Subject: [PATCH] =?UTF-8?q?docs(36):=20create=20phase=20plan=20=E2=80=94?= =?UTF-8?q?=203=20plans=20in=202=20waves?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .planning/ROADMAP.md | 9 +- .../36-01-PLAN.md | 191 ++++++++++ .../36-02-PLAN.md | 242 +++++++++++++ .../36-03-PLAN.md | 330 ++++++++++++++++++ 4 files changed, 770 insertions(+), 2 deletions(-) create mode 100644 .planning/phases/36-voice-pipeline-foundation/36-01-PLAN.md create mode 100644 .planning/phases/36-voice-pipeline-foundation/36-02-PLAN.md create mode 100644 .planning/phases/36-voice-pipeline-foundation/36-03-PLAN.md diff --git a/.planning/ROADMAP.md b/.planning/ROADMAP.md index 24d37f8e..d042e45f 100644 --- a/.planning/ROADMAP.md +++ b/.planning/ROADMAP.md @@ -123,7 +123,12 @@ Plans: 3. A WebM/Opus browser recording and an OGG/Opus Telegram voice note both produce identical Whisper transcription quality after ffmpeg transcodes each to WAV 16kHz mono 4. The `voiceMode` flag on a chat message survives from client request through Express route to message persistence — verifiable in the DB record 5. 
`nexus-settings.json` accepts `voiceMode: "text" | "voice_input" | "full_voice"` and `telegramToken` fields without breaking existing settings reads -**Plans**: TBD +**Plans**: 3 plans + +Plans: +- [ ] 36-01-PLAN.md — VoicePipelineService: ffmpeg transcoding, Whisper STT, Piper TTS, formatForVoice +- [ ] 36-02-PLAN.md — Schema extensions: voiceMode in shared validators/types + nexus-settings +- [ ] 36-03-PLAN.md — Voice routes, chat.ts voiceMode wiring, app.ts mount, old transcribe removal ### Phase 37: Web Chat Voice UI **Goal**: Users can speak to any agent in web chat — recording auto-stops on silence, a live waveform confirms the mic is active, responses play back automatically (toggleable), and voice mode is a first-class setting @@ -216,7 +221,7 @@ All 23 v1.6 requirements are mapped to exactly one phase. No orphans. | 33. Persistent Memory + Personal Assistant Mode | v1.5 | 3/3 | Complete | 2026-04-03 | | 34. Voice | v1.5 | 2/2 | Complete | 2026-04-03 | | 35. npx buildthis CLI | v1.5 | 1/1 | Complete | 2026-04-03 | -| 36. Voice Pipeline Foundation | v1.6 | 0/TBD | Not started | - | +| 36. Voice Pipeline Foundation | v1.6 | 0/3 | Planning | - | | 37. Web Chat Voice UI | v1.6 | 0/TBD | Not started | - | | 38. Telegram Bridge | v1.6 | 0/TBD | Not started | - | | 39. 
Voice Polish | v1.6 | 0/TBD | Not started | - | diff --git a/.planning/phases/36-voice-pipeline-foundation/36-01-PLAN.md b/.planning/phases/36-voice-pipeline-foundation/36-01-PLAN.md new file mode 100644 index 00000000..4988e582 --- /dev/null +++ b/.planning/phases/36-voice-pipeline-foundation/36-01-PLAN.md @@ -0,0 +1,191 @@ +--- +phase: 36-voice-pipeline-foundation +plan: 01 +type: execute +wave: 1 +depends_on: [] +files_modified: + - server/src/services/voice-pipeline.ts + - server/src/__tests__/36-voice-pipeline.test.ts + - server/package.json +autonomous: true +requirements: + - VPIPE-01 + - VPIPE-02 + - VPIPE-04 + - VPIPE-06 + +must_haves: + truths: + - "transcribe() accepts a Buffer and format string, returns { text, language? }" + - "synthesize() accepts text string and optional voiceId, returns a WAV Buffer" + - "transcodeToWav16k() converts any input format to WAV 16kHz mono via ffmpeg-static" + - "formatForVoice() strips markdown and extracts SPOKEN section when present" + - "formatForVoice() falls back to markdown stripping when SPOKEN marker is absent" + artifacts: + - path: "server/src/services/voice-pipeline.ts" + provides: "VoicePipelineService with transcribe, synthesize, formatForVoice, transcodeToWav16k" + exports: ["voicePipelineService"] + - path: "server/src/__tests__/36-voice-pipeline.test.ts" + provides: "Unit tests for voice pipeline service" + min_lines: 80 + key_links: + - from: "server/src/services/voice-pipeline.ts" + to: "ffmpeg-static" + via: "import ffmpegPath from ffmpeg-static" + pattern: "import.*ffmpeg-static" + - from: "server/src/services/voice-pipeline.ts" + to: "node:child_process" + via: "execFile and spawn for piper/ffmpeg" + pattern: "execFile|spawn" +--- + + +Create VoicePipelineService — the transport-agnostic voice service that all downstream consumers (voice routes, Telegram bridge) depend on. + +Purpose: This is the keystone service for the entire v1.6 milestone. 
It encapsulates STT (Whisper), TTS (Piper), audio transcoding (ffmpeg), and dual-output formatting behind a clean factory function API. + +Output: `server/src/services/voice-pipeline.ts` with `transcribe()`, `synthesize()`, `formatForVoice()` methods, plus unit tests. + + + +@$HOME/.claude/get-shit-done/workflows/execute-plan.md +@$HOME/.claude/get-shit-done/templates/summary.md + + + +@.planning/PROJECT.md +@.planning/ROADMAP.md +@.planning/STATE.md +@.planning/phases/36-voice-pipeline-foundation/36-RESEARCH.md + + + + +From server/src/services/nexus-settings.ts (factory function pattern): +```typescript +export function nexusSettingsService() { + async function get(): Promise { ... } + async function set(patch: Partial): Promise { ... } + return { get, set }; +} +``` + +From server/src/routes/chat-files.ts lines 297-386 (existing whisper cascade to extract and move): +```typescript +// Writes raw WebM to temp file, then tries whisper-cpp --model base.en --no-timestamps +// Falls back to openai-whisper Python CLI +// Uses promisify(execFileCb) pattern from node:child_process +``` + + + + + + + Task 1: Install ffmpeg-static and create VoicePipelineService with tests + + server/package.json + server/src/services/voice-pipeline.ts + server/src/__tests__/36-voice-pipeline.test.ts + + + server/src/services/nexus-settings.ts + server/src/routes/chat-files.ts + server/package.json + + + - transcodeToWav16k: spawns ffmpeg with args ["-f", inputFormat, "-i", "pipe:0", "-ar", "16000", "-ac", "1", "-f", "wav", "pipe:1"] and returns Buffer + - transcodeToWav16k: rejects with error when ffmpeg exits non-zero + - transcribe: calls transcodeToWav16k first (unless format is "wav"), then writes to temp file, runs whisper-cpp with --language auto flag, returns { text, language } + - transcribe: falls back to openai-whisper Python CLI when whisper-cpp fails + - transcribe: returns 503-style error when neither whisper binary is available + - synthesize: splits text into sentences 
using /(?<=[.!?])\s+/ regex, calls piper via execFile for each chunk, concatenates WAV buffers + - synthesize: wraps each piper call in Promise.race with 8000ms timeout + - synthesize: returns error when piper binary is not found + - formatForVoice: extracts text between "SPOKEN:" and "DETAILED:" markers when both present + - formatForVoice: strips markdown (headings ##, bold **, italic *, code fences ```, bullet points -/*) when SPOKEN marker is absent + - formatForVoice: handles empty string input returning empty string + - voicePipelineService factory: throws Error("ffmpeg-static binary not found") when ffmpegPath is null + + + 1. Install ffmpeg-static: + ```bash + cd /opt/nexus/server && pnpm add ffmpeg-static && pnpm add -D @types/ffmpeg-static + ``` + + 2. Create `server/src/__tests__/36-voice-pipeline.test.ts` with unit tests (RED phase): + - Mock `node:child_process` execFile and spawn + - Mock `ffmpeg-static` to return "/mock/ffmpeg" by default, null for the fail-fast test + - Test `transcodeToWav16k("webm")` verifies spawn is called with args ["-f", "webm", "-i", "pipe:0", "-ar", "16000", "-ac", "1", "-f", "wav", "pipe:1"] + - Test `transcribe(buffer, "webm")` verifies it calls transcodeToWav16k then whisper-cpp with `--language auto` + - Test `transcribe` whisper-cpp fallback to openai-whisper + - Test `synthesize("Hello world. How are you?")` verifies it splits into 2 sentences and calls piper execFile twice + - Test `synthesize` timeout rejects after 8000ms + - Test `formatForVoice("SPOKEN: Hello\n\nDETAILED: ## Hello\n**world**")` returns "Hello" + - Test `formatForVoice("## Hello\n**world**\n- item\n```code```")` returns "Hello\nworld\nitem\ncode" (markdown stripped) + - Test `formatForVoice("")` returns "" + - Test factory throws when ffmpegPath is null + + 3. 
Create `server/src/services/voice-pipeline.ts` (GREEN phase): + - Export `voicePipelineService()` factory function (matches nexus-settings pattern) + - Assert `ffmpegPath` is not null at construction time: `if (!ffmpegPath) throw new Error("ffmpeg-static binary not found on this platform");` + - `transcodeToWav16k(inputBuffer: Buffer, inputFormat: string): Promise<Buffer>` — uses `spawn(ffmpegPath, ["-f", inputFormat, "-i", "pipe:0", "-ar", "16000", "-ac", "1", "-f", "wav", "pipe:1"], { stdio: ["pipe", "pipe", "pipe"] })`; write inputBuffer to stdin, collect stdout chunks, resolve on close code 0, reject otherwise + - `withTimeout<T>(promise: Promise<T>, ms: number): Promise<T>` — `Promise.race([promise, new Promise<never>((_, reject) => setTimeout(() => reject(new Error("Timed out after " + ms + "ms")), ms))])` (build the message by concatenation or a template literal; `${ms}` inside a double-quoted string is not interpolated) + - `transcribe(buffer: Buffer, format: "webm" | "ogg" | "wav"): Promise<{ text: string; language?: string }>`: + 1. If format !== "wav", call transcodeToWav16k(buffer, format) + 2. Write WAV to temp file: `path.join(tmpdir(), "nexus-audio-" + Date.now() + ".wav")` + 3. Try whisper-cpp: `execFile("whisper-cpp", ["--model", "base.en", "--file", tmpPath, "--no-timestamps", "--output-txt", "--language", "auto"], { timeout: 30000 })` (note: `base.en` is an English-only model, so `--language auto` cannot detect other languages with it; language detection requires a multilingual model such as `base`) + 4. Parse language from whisper-cpp stdout if present; return `{ text: stdout.trim(), language }` + 5. On failure, try openai-whisper: `execFile("whisper", [tmpPath, "--model", "base.en", "--output_format", "txt", "--output_dir", tmpdir()], { timeout: 60000 })` + 6. If both fail, throw new Error("Whisper not available. Install whisper-cpp or openai-whisper for voice input.") + 7. Cleanup temp file in `finally` block via `unlink(tmpPath).catch(() => {})` + - `synthesize(text: string, voiceId?: string): Promise<Buffer>`: + 1. Split text into sentences: `text.split(/(?<=[.!?])\s+/).filter(s => s.length > 0)` + 2.
For each sentence, start piper with the promisified execFile: `const run = execFile("piper", ["--model", voiceId || "en_US-lessac-medium", "--output-raw"], { timeout: 8000, maxBuffer: 10 * 1024 * 1024, encoding: "buffer" })`, feed the text through stdin with `run.child.stdin?.end(sentence)` (async `execFile` has no `input` option — only the `*Sync` variants accept it), then `await withTimeout(run, 8000)`; `encoding: "buffer"` keeps stdout as a Buffer instead of a UTF-8-decoded string + 3. Concatenate all output buffers (`--output-raw` emits headerless PCM, so prepend one WAV header to the concatenated samples to return a valid WAV Buffer) + 4. On ENOENT (piper not found), throw new Error("Piper TTS not available. Install piper for voice output.") + - `formatForVoice(text: string): string`: + 1. If empty, return "" + 2. Check for `SPOKEN:` marker: `const spokenMatch = text.match(/SPOKEN:\s*([\s\S]*?)(?=\nDETAILED:|\n\n[A-Z]+:)/)` + 3. If match found, return `spokenMatch[1].trim()` + 4. Otherwise strip markdown: remove `# ` headings, `**` bold, `*` italic, triple backtick code fences (and lang identifier), `- ` and `* ` bullet prefixes, inline backticks + 5. Collapse multiple blank lines, trim + - Return `{ transcribe, synthesize, formatForVoice }` + + + cd /opt/nexus && pnpm --filter @paperclipai/server test --run src/__tests__/36-voice-pipeline.test.ts + + + - server/src/services/voice-pipeline.ts contains `export function voicePipelineService()` + - server/src/services/voice-pipeline.ts contains `import ffmpegPath from "ffmpeg-static"` + - server/src/services/voice-pipeline.ts contains `spawn(ffmpegPath` (not exec) + - server/src/services/voice-pipeline.ts contains `execFile("whisper-cpp"` (not exec) + - server/src/services/voice-pipeline.ts contains `"--language", "auto"` for VPIPE-01 language detection + - server/src/services/voice-pipeline.ts contains `"-ar", "16000", "-ac", "1"` for VPIPE-04 transcoding + - server/src/services/voice-pipeline.ts contains `Promise.race` for timeout wrapping + - server/src/services/voice-pipeline.ts contains `SPOKEN:` marker check in formatForVoice + - server/src/services/voice-pipeline.ts contains `return { transcribe, synthesize, formatForVoice }` + - server/src/__tests__/36-voice-pipeline.test.ts exits 0 + - server/package.json contains "ffmpeg-static" + + + VoicePipelineService exists with transcribe (whisper cascade + language
detection), synthesize (piper with sentence chunking + timeout), formatForVoice (SPOKEN extraction + markdown strip fallback), and transcodeToWav16k (ffmpeg pipe). All unit tests pass with mocked child_process. + + + + + + +- `pnpm --filter @paperclipai/server test --run src/__tests__/36-voice-pipeline.test.ts` exits 0 +- `grep -c "export function voicePipelineService" server/src/services/voice-pipeline.ts` returns 1 +- `grep "ffmpeg-static" server/package.json` shows dependency present + + + +VoicePipelineService is a working, tested factory function that downstream consumers (voice.ts routes in Plan 03, Telegram bridge in Phase 38) can import and call without additional setup. ffmpeg-static is installed. All unit tests pass. + + + +After completion, create `.planning/phases/36-voice-pipeline-foundation/36-01-SUMMARY.md` + diff --git a/.planning/phases/36-voice-pipeline-foundation/36-02-PLAN.md b/.planning/phases/36-voice-pipeline-foundation/36-02-PLAN.md new file mode 100644 index 00000000..16b6d65a --- /dev/null +++ b/.planning/phases/36-voice-pipeline-foundation/36-02-PLAN.md @@ -0,0 +1,242 @@ +--- +phase: 36-voice-pipeline-foundation +plan: 02 +type: execute +wave: 1 +depends_on: [] +files_modified: + - packages/shared/src/validators/chat.ts + - packages/shared/src/types/chat.ts + - server/src/services/nexus-settings.ts + - server/src/__tests__/36-voice-schema.test.ts +autonomous: true +requirements: + - VPIPE-05 + +must_haves: + truths: + - "createMessageSchema accepts voiceMode field with values text, voice_input, or full_voice" + - "createMessageSchema strips unknown fields but preserves voiceMode when present" + - "ChatMessage interface includes optional voiceMode field" + - "nexus-settings schema accepts voiceMode and telegramToken fields" + - "Existing nexus-settings.json files without voiceMode parse without error (defaults to text)" + artifacts: + - path: "packages/shared/src/validators/chat.ts" + provides: "voiceMode field on createMessageSchema" + 
contains: "voiceMode" + - path: "packages/shared/src/types/chat.ts" + provides: "voiceMode on ChatMessage interface" + contains: "voiceMode" + - path: "server/src/services/nexus-settings.ts" + provides: "voiceMode and telegramToken in settings schema" + contains: "voiceMode" + - path: "server/src/__tests__/36-voice-schema.test.ts" + provides: "Schema validation tests for voiceMode" + min_lines: 40 + key_links: + - from: "packages/shared/src/validators/chat.ts" + to: "server/src/routes/chat.ts" + via: "createMessageSchema.parse(req.body) preserves voiceMode" + pattern: "createMessageSchema" + - from: "server/src/services/nexus-settings.ts" + to: "nexus-settings.json" + via: "Zod .default() handles missing voiceMode in existing files" + pattern: 'voiceMode.*default.*"text"' +--- + + +Extend shared type definitions and settings schema with voiceMode support so the voice flag can propagate through the entire message pipeline. + +Purpose: VPIPE-05 requires the voiceMode flag to survive from client request through Express route to message persistence. This plan adds the schema and type foundations that Plan 03 wires together. + +Output: Updated `createMessageSchema`, `ChatMessage` interface, and `nexus-settings` schema with voiceMode and telegramToken fields, plus tests. 
+ + +@$HOME/.claude/get-shit-done/workflows/execute-plan.md +@$HOME/.claude/get-shit-done/templates/summary.md + + + +@.planning/PROJECT.md +@.planning/ROADMAP.md +@.planning/STATE.md +@.planning/phases/36-voice-pipeline-foundation/36-RESEARCH.md + + + + +From packages/shared/src/validators/chat.ts: +```typescript +export const createMessageSchema = z.object({ + role: z.enum(["user", "assistant", "system"]), + content: z.string().min(1).max(100_000), + agentId: z.string().uuid().optional(), + messageType: z.string().optional(), +}); +export type CreateMessage = z.infer<typeof createMessageSchema>; +``` + +From packages/shared/src/types/chat.ts: +```typescript +export interface ChatMessage { + id: string; + conversationId: string; + role: "user" | "assistant" | "system"; + content: string; + agentId: string | null; + messageType: string | null; + createdAt: string; + updatedAt: string | null; + files?: ChatFile[]; +} +``` + +From server/src/services/nexus-settings.ts: +```typescript +export const NEXUS_MODES = ["personal_ai", "project_builder", "both"] as const; +const nexusSettingsSchema = z.object({ + mode: z.enum(NEXUS_MODES).default("both"), + voiceEnabled: z.boolean().default(false), +}); +``` + + + + + + + Task 1: Extend shared validators and types with voiceMode field + + packages/shared/src/validators/chat.ts + packages/shared/src/types/chat.ts + server/src/__tests__/36-voice-schema.test.ts + + + packages/shared/src/validators/chat.ts + packages/shared/src/types/chat.ts + + + - createMessageSchema.parse({ role: "user", content: "hi", voiceMode: "full_voice" }) succeeds and output.voiceMode equals "full_voice" + - createMessageSchema.parse({ role: "user", content: "hi", voiceMode: "voice_input" }) succeeds + - createMessageSchema.parse({ role: "user", content: "hi", voiceMode: "text" }) succeeds + - createMessageSchema.parse({ role: "user", content: "hi" }) succeeds and output.voiceMode is undefined + - createMessageSchema.parse({ role: "user", content: "hi", voiceMode: "invalid" })
throws ZodError + - Existing createMessageSchema behavior unchanged (role, content, agentId, messageType all work as before) + + + 1. Create `server/src/__tests__/36-voice-schema.test.ts` (RED): + - Import `createMessageSchema` from `@paperclipai/shared/validators/chat` + - Test: parse with voiceMode "full_voice" returns voiceMode "full_voice" + - Test: parse with voiceMode "voice_input" returns voiceMode "voice_input" + - Test: parse with voiceMode "text" returns voiceMode "text" + - Test: parse without voiceMode returns undefined for voiceMode + - Test: parse with voiceMode "invalid" throws + - Test: existing fields (role, content, agentId, messageType) still parse correctly + + 2. Modify `packages/shared/src/validators/chat.ts` (GREEN): + - Add to createMessageSchema object: `voiceMode: z.enum(["text", "voice_input", "full_voice"]).optional(),` + - Add after existing exports: + ```typescript + export const VOICE_MODES = ["text", "voice_input", "full_voice"] as const; + export type VoiceMode = (typeof VOICE_MODES)[number]; + ``` + + 3. Modify `packages/shared/src/types/chat.ts` (GREEN): + - Add to ChatMessage interface after `messageType: string | null;`: + ```typescript + voiceMode?: "text" | "voice_input" | "full_voice" | null; + ``` + + + cd /opt/nexus && pnpm --filter @paperclipai/server test --run src/__tests__/36-voice-schema.test.ts + + + - packages/shared/src/validators/chat.ts contains `voiceMode: z.enum(["text", "voice_input", "full_voice"]).optional()` + - packages/shared/src/validators/chat.ts contains `export const VOICE_MODES` + - packages/shared/src/validators/chat.ts contains `export type VoiceMode` + - packages/shared/src/types/chat.ts contains `voiceMode?:` in ChatMessage interface + - server/src/__tests__/36-voice-schema.test.ts exits 0 + + createMessageSchema accepts optional voiceMode field with enum validation. ChatMessage interface includes voiceMode. VOICE_MODES constant and VoiceMode type are exported for reuse. 
+ + + + Task 2: Extend nexus-settings schema with voiceMode and telegramToken + + server/src/services/nexus-settings.ts + server/src/__tests__/36-voice-schema.test.ts + + + server/src/services/nexus-settings.ts + server/src/__tests__/36-voice-schema.test.ts + + + - nexusSettingsSchema parses { mode: "both" } and output.voiceMode equals "text" (default) + - nexusSettingsSchema parses { mode: "both", voiceMode: "full_voice" } and output.voiceMode equals "full_voice" + - nexusSettingsSchema parses { mode: "both", telegramToken: "123:ABC" } and output.telegramToken equals "123:ABC" + - nexusSettingsSchema parses { mode: "both" } and output.telegramToken is undefined + - Existing settings without voiceMode/telegramToken parse without error + + + 1. Add tests to `server/src/__tests__/36-voice-schema.test.ts` (RED): + - Import `nexusSettingsService` from `../services/nexus-settings.js` + - Note: since nexusSettingsService reads from disk, test the schema directly instead. Import the schema or test via the service's `get()` method with a mocked file system. + - Alternative approach: directly import and test the Zod schema. If the schema is not exported, add a named export `nexusSettingsSchema` for testing, or test through the service. + - Test: settings without voiceMode defaults to "text" + - Test: settings with voiceMode "full_voice" preserves the value + - Test: settings with telegramToken preserves the value + - Test: settings without telegramToken has telegramToken as undefined + + 2. 
Modify `server/src/services/nexus-settings.ts` (GREEN): + - Add after existing NEXUS_MODES: + ```typescript + export const VOICE_MODES = ["text", "voice_input", "full_voice"] as const; + export type VoiceMode = (typeof VOICE_MODES)[number]; + ``` + - Extend nexusSettingsSchema: + ```typescript + const nexusSettingsSchema = z.object({ + mode: z.enum(NEXUS_MODES).default("both"), + voiceEnabled: z.boolean().default(false), + voiceMode: z.enum(VOICE_MODES).default("text"), + telegramToken: z.string().optional(), + piperBinaryPath: z.string().optional(), + whisperBinaryPath: z.string().optional(), + }); + ``` + - Export the schema for testing: `export { nexusSettingsSchema };` + - The `NexusSettings` type (already `z.infer<typeof nexusSettingsSchema>`) auto-updates + - Update the fallback return in `get()` catch block from `{ mode: "both", voiceEnabled: false }` to `nexusSettingsSchema.parse({})` so it uses Zod defaults consistently + + + cd /opt/nexus && pnpm --filter @paperclipai/server test --run src/__tests__/36-voice-schema.test.ts + + + - server/src/services/nexus-settings.ts contains `voiceMode: z.enum(VOICE_MODES).default("text")` + - server/src/services/nexus-settings.ts contains `telegramToken: z.string().optional()` + - server/src/services/nexus-settings.ts contains `piperBinaryPath: z.string().optional()` + - server/src/services/nexus-settings.ts contains `whisperBinaryPath: z.string().optional()` + - server/src/services/nexus-settings.ts contains `export const VOICE_MODES` + - server/src/__tests__/36-voice-schema.test.ts exits 0 + + nexus-settings schema accepts voiceMode (defaulting to "text"), telegramToken, piperBinaryPath, and whisperBinaryPath. Existing nexus-settings.json files without these fields parse without error due to Zod defaults.
+ + + + + +- `pnpm --filter @paperclipai/server test --run src/__tests__/36-voice-schema.test.ts` exits 0 +- `grep "voiceMode" packages/shared/src/validators/chat.ts` shows the field +- `grep "voiceMode" packages/shared/src/types/chat.ts` shows the field +- `grep "voiceMode" server/src/services/nexus-settings.ts` shows the field +- `grep "telegramToken" server/src/services/nexus-settings.ts` shows the field + + + +The voiceMode field is accepted by createMessageSchema (shared package), present on the ChatMessage type, and configurable in nexus-settings with a safe "text" default. telegramToken is ready for Phase 38. All schema tests pass. + + + +After completion, create `.planning/phases/36-voice-pipeline-foundation/36-02-SUMMARY.md` + diff --git a/.planning/phases/36-voice-pipeline-foundation/36-03-PLAN.md b/.planning/phases/36-voice-pipeline-foundation/36-03-PLAN.md new file mode 100644 index 00000000..338d793c --- /dev/null +++ b/.planning/phases/36-voice-pipeline-foundation/36-03-PLAN.md @@ -0,0 +1,330 @@ +--- +phase: 36-voice-pipeline-foundation +plan: 03 +type: execute +wave: 2 +depends_on: ["36-01", "36-02"] +files_modified: + - server/src/routes/voice.ts + - server/src/routes/chat-files.ts + - server/src/routes/chat.ts + - server/src/app.ts + - server/src/__tests__/36-voice-routes.test.ts +autonomous: true +requirements: + - VPIPE-03 + - VPIPE-06 + +must_haves: + truths: + - "POST /api/transcribe accepts audio file upload and returns { text, language? 
}" + - "POST /api/synthesize accepts { text } body and returns audio/wav buffer" + - "voiceMode from request body is injected as dual-output system prompt in stream endpoint" + - "voiceMode is persisted to messageType column when message is saved" + - "Old /transcribe endpoint is removed from chat-files.ts" + - "Voice routes are mounted in app.ts" + artifacts: + - path: "server/src/routes/voice.ts" + provides: "POST /api/transcribe and POST /api/synthesize endpoints" + exports: ["voiceRoutes"] + - path: "server/src/__tests__/36-voice-routes.test.ts" + provides: "Integration tests for voice routes and voiceMode wiring" + min_lines: 60 + key_links: + - from: "server/src/routes/voice.ts" + to: "server/src/services/voice-pipeline.ts" + via: "voicePipelineService() import" + pattern: "voicePipelineService" + - from: "server/src/routes/chat.ts" + to: "packages/shared/src/validators/chat.ts" + via: "createMessageSchema preserves voiceMode on parse" + pattern: "voiceMode" + - from: "server/src/app.ts" + to: "server/src/routes/voice.ts" + via: "api.use(voiceRoutes())" + pattern: "voiceRoutes" +--- + + +Create voice HTTP routes (transcribe + synthesize), wire voiceMode through the chat stream endpoint with dual-output prompt injection, mount in app.ts, and remove the old transcribe endpoint from chat-files.ts. + +Purpose: VPIPE-03 requires the voice pipeline to be callable from any transport via HTTP. VPIPE-06 requires dual output (spoken prose + full markdown) triggered by voiceMode=full_voice in the stream endpoint. + +Output: `server/src/routes/voice.ts` with two endpoints, updated `chat.ts` with voiceMode wiring, cleaned `chat-files.ts`, updated `app.ts` mount. 
+ + +@$HOME/.claude/get-shit-done/workflows/execute-plan.md +@$HOME/.claude/get-shit-done/templates/summary.md + + + +@.planning/PROJECT.md +@.planning/ROADMAP.md +@.planning/STATE.md +@.planning/phases/36-voice-pipeline-foundation/36-RESEARCH.md +@.planning/phases/36-voice-pipeline-foundation/36-01-SUMMARY.md +@.planning/phases/36-voice-pipeline-foundation/36-02-SUMMARY.md + + + +From server/src/services/voice-pipeline.ts: +```typescript +export function voicePipelineService(): { + transcribe(buffer: Buffer, format: "webm" | "ogg" | "wav"): Promise<{ text: string; language?: string }>; + synthesize(text: string, voiceId?: string): Promise<Buffer>; + formatForVoice(text: string): string; +}; +``` + + +From packages/shared/src/validators/chat.ts: +```typescript +export const createMessageSchema = z.object({ + role: z.enum(["user", "assistant", "system"]), + content: z.string().min(1).max(100_000), + agentId: z.string().uuid().optional(), + messageType: z.string().optional(), + voiceMode: z.enum(["text", "voice_input", "full_voice"]).optional(), +}); +``` + + +From server/src/routes/authz.ts: +```typescript +export function assertBoard(req: Request) { + if (req.actor.type !== "board") throw forbidden("Board access required"); +} +``` + +From server/src/attachment-types.ts: +```typescript +export const MAX_ATTACHMENT_BYTES = ... +``` + +From server/src/app.ts (mount pattern, line ~164): +```typescript +api.use(chatFileRoutes(db, opts.storageService)); +api.use(nexusSettingsRoutes()); +``` + +From server/src/routes/chat.ts (stream endpoint, lines 91-194): +```typescript +router.post("/conversations/:id/stream", async (req, res) => { + assertBoard(req); + const { content, agentId } = req.body; + // ... builds messagesWithMemory array ... + // ... streams tokens ...
+ const message = await svc.addMessage(req.params.id!, { + role: "assistant", + content: fullContent.trim(), + agentId: agentId || undefined, + }); +}); +``` + +From server/src/routes/chat-files.ts (lines 297-386 to remove): +```typescript +// POST /transcribe — the old endpoint with inline audioUpload multer, runAudioUpload helper, +// and whisper-cpp/openai-whisper cascade. This entire block (lines 297-386) is replaced by voice.ts. +``` + + + + + + + Task 1: Create voice.ts routes and tests + + server/src/routes/voice.ts + server/src/__tests__/36-voice-routes.test.ts + + + server/src/routes/chat-files.ts + server/src/routes/authz.ts + server/src/attachment-types.ts + server/src/services/voice-pipeline.ts + + + - POST /transcribe with valid audio file returns 200 with { text: "...", language: "..." } + - POST /transcribe without audio field returns 400 with { error: "Missing audio field" } + - POST /synthesize with { text: "Hello" } returns 200 with Content-Type audio/wav + - POST /synthesize without text returns 400 with { error: "text is required" } + - Both endpoints call assertBoard(req) for auth + + + 1. Create `server/src/__tests__/36-voice-routes.test.ts` (RED): + - Mock `../services/voice-pipeline.js` to return a mock service object + - Mock `./authz.js` assertBoard to be a no-op + - Test POST /transcribe with a Buffer body returns { text, language } + - Test POST /transcribe without file returns 400 + - Test POST /synthesize with { text: "Hello" } returns audio/wav content-type + - Test POST /synthesize without text returns 400 + + 2. 
Create `server/src/routes/voice.ts` (GREEN): + ```typescript + import { Router } from "express"; + import multer from "multer"; + import { assertBoard } from "./authz.js"; + import { voicePipelineService } from "../services/voice-pipeline.js"; + import { MAX_ATTACHMENT_BYTES } from "../attachment-types.js"; + + export function voiceRoutes(): Router { + const router = Router(); + const svc = voicePipelineService(); + const audioUpload = multer({ + storage: multer.memoryStorage(), + limits: { fileSize: MAX_ATTACHMENT_BYTES, files: 1 }, + }); + + // POST /api/transcribe — transcribe uploaded audio via VoicePipelineService + router.post("/transcribe", async (req, res) => { + assertBoard(req); + await new Promise<void>((resolve, reject) => + audioUpload.single("audio")(req, res, (err) => (err ? reject(err) : resolve())) + ); + const file = (req as any).file as { buffer: Buffer; mimetype: string } | undefined; + if (!file) { + res.status(400).json({ error: "Missing audio field" }); + return; + } + const fmt = file.mimetype.includes("ogg") ? "ogg" + : file.mimetype.includes("wav") ?
"wav" + : "webm"; + const result = await svc.transcribe(file.buffer, fmt); + res.json(result); + }); + + // POST /api/synthesize — synthesize text to speech via VoicePipelineService + router.post("/synthesize", async (req, res) => { + assertBoard(req); + const { text, voiceId } = req.body as { text?: string; voiceId?: string }; + if (!text || typeof text !== "string") { + res.status(400).json({ error: "text is required" }); + return; + } + const audioBuffer = await svc.synthesize(text, voiceId); + res.setHeader("Content-Type", "audio/wav"); + res.send(audioBuffer); + }); + + return router; + } + ``` + + + cd /opt/nexus && pnpm --filter @paperclipai/server test --run src/__tests__/36-voice-routes.test.ts + + + - server/src/routes/voice.ts contains `export function voiceRoutes()` + - server/src/routes/voice.ts contains `router.post("/transcribe"` + - server/src/routes/voice.ts contains `router.post("/synthesize"` + - server/src/routes/voice.ts contains `import { voicePipelineService }` from voice-pipeline service + - server/src/routes/voice.ts contains `import { MAX_ATTACHMENT_BYTES }` from attachment-types + - server/src/routes/voice.ts contains `assertBoard(req)` on both routes + - server/src/routes/voice.ts contains `res.setHeader("Content-Type", "audio/wav")` + - server/src/__tests__/36-voice-routes.test.ts exits 0 + + Voice routes exist with POST /transcribe (audio upload -> VoicePipelineService.transcribe) and POST /synthesize (text body -> VoicePipelineService.synthesize -> WAV response). Both authenticated via assertBoard. + + + + Task 2: Wire voiceMode in chat.ts stream, mount voice routes, remove old transcribe + + server/src/routes/chat.ts + server/src/routes/chat-files.ts + server/src/app.ts + + + server/src/routes/chat.ts + server/src/routes/chat-files.ts + server/src/app.ts + server/src/routes/voice.ts + + + 1. 
Modify `server/src/routes/chat.ts` — inject voiceMode into stream endpoint: + - At line 93, change `const { content, agentId } = req.body;` to: + ```typescript + const { content, agentId, voiceMode } = req.body as { + content: string; agentId?: string; voiceMode?: "text" | "voice_input" | "full_voice"; + }; + ``` + - After building `messagesWithMemory` array (after line 140 where user message is pushed), before the SSE headers block (before line 142), add: + ```typescript + // Inject dual-output formatting prompt when voice mode is full_voice (VPIPE-06) + if (voiceMode === "full_voice") { + messagesWithMemory.push({ + role: "system", + content: [ + "Format your response with EXACTLY these two labeled sections:", + "", + "SPOKEN: [Natural speech prose only. No markdown. No bullet points. No code blocks. Max 2-3 sentences for spoken delivery.]", + "", + "DETAILED: [Your full response with all detail, code blocks, and markdown formatting.]", + ].join("\n"), + }); + } + ``` + - At line 167-171, update the `svc.addMessage()` call to include voiceMode in messageType: + ```typescript + const message = await svc.addMessage(req.params.id!, { + role: "assistant", + content: fullContent.trim(), + agentId: agentId || undefined, + messageType: voiceMode === "full_voice" ? "voice_full" + : voiceMode === "voice_input" ? "voice_input" + : undefined, + }); + ``` + + 2. Modify `server/src/routes/chat-files.ts` — remove old /transcribe endpoint: + - Delete lines 297-386: the `audioUpload` multer instance, `runAudioUpload` helper function, and the entire `router.post("/transcribe", ...)` handler + - The line `return router;` at line 388 should remain (it becomes the new end of the function) + - Remove the `multer` import ONLY if no other route in the file uses multer (check: the file upload routes at top of file use multer too, so keep the import) + + 3. 
Modify `server/src/app.ts` — mount voice routes: + - Add import at top with other route imports: + ```typescript + import { voiceRoutes } from "./routes/voice.js"; + ``` + - Add mount line after `api.use(nexusSettingsRoutes());` (after line 165): + ```typescript + api.use(voiceRoutes()); + ``` + + + cd /opt/nexus && pnpm --filter @paperclipai/server exec tsc --noEmit 2>&1 | head -30 + + + - server/src/routes/chat.ts contains `voiceMode` destructured from req.body + - server/src/routes/chat.ts contains `if (voiceMode === "full_voice")` + - server/src/routes/chat.ts contains `SPOKEN:` in the system prompt string + - server/src/routes/chat.ts contains `DETAILED:` in the system prompt string + - server/src/routes/chat.ts contains `messageType: voiceMode === "full_voice" ? "voice_full"` + - server/src/routes/chat-files.ts does NOT contain `router.post("/transcribe"` (old endpoint removed) + - server/src/app.ts contains `import { voiceRoutes }` from voice routes + - server/src/app.ts contains `api.use(voiceRoutes())` + - TypeScript compilation passes with no errors (`tsc --noEmit` exits 0) + + + voiceMode flows from client request body through the stream endpoint: (1) dual-output system prompt injected when full_voice, (2) voiceMode persisted to messageType column on assistant message save. Old /transcribe endpoint removed from chat-files.ts. Voice routes mounted in app.ts. TypeScript compiles clean. + + + + + + +- `pnpm --filter @paperclipai/server exec tsc --noEmit` exits 0 +- `pnpm --filter @paperclipai/server test --run src/__tests__/36-voice-routes.test.ts` exits 0 +- `grep -c "router.post(\"/transcribe\"" server/src/routes/chat-files.ts` returns 0 (old endpoint removed) +- `grep "voiceRoutes" server/src/app.ts` shows mount present +- `grep "voiceMode" server/src/routes/chat.ts` shows flag wired through stream endpoint + + + +Voice pipeline is fully callable via HTTP (POST /api/transcribe, POST /api/synthesize). 
The `voiceMode` flag propagates from the client request through the stream endpoint to message persistence (stored as `messageType` on the saved assistant message). The dual-output prompt is injected for `full_voice` mode. The old transcribe endpoint is removed from chat-files.ts. All routes are mounted and TypeScript compiles clean. + + + +After completion, create `.planning/phases/36-voice-pipeline-foundation/36-03-SUMMARY.md` +