From 1eaa6c4b3eda59208acacc0e459bb0506bcd2ca8 Mon Sep 17 00:00:00 2001 From: Nexus Dev Date: Sat, 4 Apr 2026 02:17:14 +0000 Subject: [PATCH] docs(37): create 4 plans in 3 waves for web chat voice UI --- .../phases/37-web-chat-voice-ui/37-01-PLAN.md | 297 ++++++++++++++ .../phases/37-web-chat-voice-ui/37-02-PLAN.md | 300 ++++++++++++++ .../phases/37-web-chat-voice-ui/37-03-PLAN.md | 286 +++++++++++++ .../phases/37-web-chat-voice-ui/37-04-PLAN.md | 377 ++++++++++++++++++ 4 files changed, 1260 insertions(+) create mode 100644 .planning/phases/37-web-chat-voice-ui/37-01-PLAN.md create mode 100644 .planning/phases/37-web-chat-voice-ui/37-02-PLAN.md create mode 100644 .planning/phases/37-web-chat-voice-ui/37-03-PLAN.md create mode 100644 .planning/phases/37-web-chat-voice-ui/37-04-PLAN.md diff --git a/.planning/phases/37-web-chat-voice-ui/37-01-PLAN.md b/.planning/phases/37-web-chat-voice-ui/37-01-PLAN.md new file mode 100644 index 00000000..ea2551b9 --- /dev/null +++ b/.planning/phases/37-web-chat-voice-ui/37-01-PLAN.md @@ -0,0 +1,297 @@ +--- +phase: 37-web-chat-voice-ui +plan: 01 +type: execute +wave: 1 +depends_on: [] +files_modified: + - server/src/services/nexus-settings.ts + - server/src/routes/nexus-settings.ts + - server/src/routes/voice.ts + - server/src/routes/chat.ts + - server/src/app.ts + - packages/shared/src/types/chat.ts + - packages/shared/src/validators/chat.ts + - ui/vite.config.ts + - ui/package.json + - ui/public/vad.worklet.bundle.min.js + - ui/public/silero_vad_legacy.onnx + - ui/public/silero_vad_v5.onnx +autonomous: true +requirements: + - WCHAT-01 + - WCHAT-02 + - WCHAT-04 + +must_haves: + truths: + - "POST /api/transcribe accepts audio upload and returns { text }" + - "POST /api/synthesize accepts { text } and returns audio/wav" + - "GET /api/nexus/settings returns voiceMode field" + - "PATCH /api/nexus/settings accepts voiceMode update" + - "Chat stream endpoint accepts voiceMode in request body" + - "SharedArrayBuffer is available in browser 
(COOP/COEP headers set)" + - "VAD ONNX model files are served from /vad.worklet.bundle.min.js, /silero_vad_legacy.onnx, /silero_vad_v5.onnx" + artifacts: + - path: "server/src/routes/voice.ts" + provides: "POST /api/transcribe and POST /api/synthesize" + exports: ["voiceRoutes"] + - path: "server/src/routes/nexus-settings.ts" + provides: "GET/PATCH /api/nexus/settings" + exports: ["nexusSettingsRoutes"] + - path: "server/src/services/nexus-settings.ts" + provides: "nexusSettingsService with voiceMode field" + exports: ["nexusSettingsService", "VoiceMode", "VOICE_MODES"] + - path: "ui/public/vad.worklet.bundle.min.js" + provides: "VAD AudioWorklet bundle" + - path: "ui/public/silero_vad_legacy.onnx" + provides: "Silero VAD legacy ONNX model" + key_links: + - from: "server/src/app.ts" + to: "server/src/routes/voice.ts" + via: "api.use(voiceRoutes())" + pattern: "voiceRoutes" + - from: "server/src/app.ts" + to: "server/src/routes/nexus-settings.ts" + via: "api.use(nexusSettingsRoutes())" + pattern: "nexusSettingsRoutes" + - from: "server/src/routes/chat.ts" + to: "voiceMode parameter" + via: "req.body.voiceMode in stream handler" + pattern: "voiceMode.*voice_input|voice_full" +--- + + +Establish all server-side prerequisites and browser infrastructure for voice I/O. + +Purpose: Phase 36 Tasks 2-3 (nexus-settings voiceMode schema, voice HTTP routes, voiceMode wiring in chat.ts) are not present on this branch. This plan cherry-picks or re-implements those deliverables, adds COOP/COEP headers for SharedArrayBuffer, installs @ricky0123/vad-react, copies VAD ONNX assets to ui/public/, and configures Vite dev server headers. 
+ +Output: Working server endpoints (transcribe, synthesize, nexus-settings), COOP/COEP isolation, VAD assets ready in ui/public/ + + + +@$HOME/.claude/get-shit-done/workflows/execute-plan.md +@$HOME/.claude/get-shit-done/templates/summary.md + + + +@.planning/phases/37-web-chat-voice-ui/37-RESEARCH.md + + + + +From server/src/services/voice-pipeline.ts (ALREADY on this branch): +```typescript +// voicePipelineService() exposes transcribe(buffer, format) and synthesize(text, voiceId?) +export function voicePipelineService(): { transcribe, synthesize, formatForVoice, transcodeToWav16k } +``` + +From server/src/app.ts (parent branch — route mounting pattern): +```typescript +// Routes are mounted on an `api` Router via api.use(...) +// Pattern: import { xyzRoutes } from "./routes/xyz.js"; then api.use(xyzRoutes()); +import { chatRoutes } from "./routes/chat.js"; +api.use(chatRoutes(db, storageService, config)); +``` + +From packages/shared/src/types/chat.ts (parent branch): +```typescript +export interface ChatMessage { + id: string; + conversationId: string; + role: "user" | "assistant" | "system"; + content: string; + messageType?: string | null; + // ... 
other fields +} +``` + +From packages/shared/src/validators/chat.ts (parent branch): +```typescript +export const createMessageSchema = z.object({ + content: z.string().min(1), + role: z.enum(["user", "assistant", "system"]).default("user"), + agentId: z.string().uuid().optional(), + // voiceMode NOT present on parent branch — must add +}); +``` + + + + + + + Task 1: Cherry-pick Phase 36 server deliverables and add COOP/COEP headers + + server/src/services/nexus-settings.ts, + server/src/routes/nexus-settings.ts, + server/src/routes/voice.ts, + server/src/routes/chat.ts, + server/src/app.ts, + packages/shared/src/types/chat.ts, + packages/shared/src/validators/chat.ts + + + server/src/services/nexus-settings.ts, + server/src/services/voice-pipeline.ts, + server/src/app.ts, + server/src/routes/chat.ts, + packages/shared/src/types/chat.ts, + packages/shared/src/validators/chat.ts + + +Cherry-pick or re-implement Phase 36 Tasks 2-3 deliverables. The commits on gsd/phase-36-voice-pipeline-foundation are: +- d0d7a23a (nexus-settings voiceMode schema extension) +- b964c0e4 (voiceMode in createMessageSchema + ChatMessage interface) +- 11508547 (voice HTTP routes) +- fd372eaf (voiceMode wiring in chat.ts + route mounting) + +Try cherry-picking these 4 commits in order: +```bash +git cherry-pick d0d7a23a b964c0e4 11508547 fd372eaf +``` + +If cherry-pick conflicts, re-implement manually: + +1. **server/src/services/nexus-settings.ts** — Add VOICE_MODES and VoiceMode type: + ```typescript + export const VOICE_MODES = ["text", "voice_input", "full_voice"] as const; + export type VoiceMode = (typeof VOICE_MODES)[number]; + ``` + Add `voiceMode: z.enum(VOICE_MODES).default("text")` to nexusSettingsSchema. + Add `telegramToken: z.string().optional()`, `piperBinaryPath: z.string().optional()`, `whisperBinaryPath: z.string().optional()`. + +2. 
**server/src/routes/nexus-settings.ts** — Create new file: + - GET /nexus/settings — returns nexusSettingsService().get() + - PATCH /nexus/settings — calls nexusSettingsService().set(req.body), returns updated + - Both routes call assertBoard(req) first + - Import Router from express, assertBoard from ./authz.js, nexusSettingsService from ../services/nexus-settings.js + +3. **server/src/routes/voice.ts** — Create new file: + - POST /transcribe — accepts multipart audio upload via multer memoryStorage, calls voicePipelineService().transcribe(buffer, format), returns { text } + - POST /synthesize — accepts JSON { text, voiceId? }, calls voicePipelineService().synthesize(text, voiceId), returns audio/wav buffer + - Both routes call assertBoard(req) + - Import multer, Router, assertBoard, voicePipelineService, MAX_ATTACHMENT_BYTES + +4. **packages/shared/src/types/chat.ts** — Add `voiceMode?: string | null;` to ChatMessage interface if not present. + +5. **packages/shared/src/validators/chat.ts** — Add `voiceMode: z.enum(["text", "voice_input", "full_voice"]).optional()` to createMessageSchema. + +6. **server/src/routes/chat.ts** — In the stream POST handler, destructure `voiceMode` from req.body alongside content and agentId. When voiceMode is "full_voice", call voicePipelineService().formatForVoice(aiContent) to produce SPOKEN/DETAILED format. Set messageType on stored message: "voice_full" if voiceMode==="full_voice", "voice_input" if voiceMode==="voice_input", else null. + +7. **server/src/app.ts** — Import and mount voiceRoutes and nexusSettingsRoutes: + ```typescript + import { nexusSettingsRoutes } from "./routes/nexus-settings.js"; + import { voiceRoutes } from "./routes/voice.js"; + // In the api router setup: + api.use(nexusSettingsRoutes()); + api.use(voiceRoutes()); + ``` + +8. 
**COOP/COEP headers** — In server/src/app.ts, add middleware BEFORE static file serving and vite dev middleware: + ```typescript + app.use((_req, res, next) => { + res.setHeader("Cross-Origin-Opener-Policy", "same-origin"); + res.setHeader("Cross-Origin-Embedder-Policy", "require-corp"); + next(); + }); + ``` + Place this before any `app.use(express.static(...))` or vite middleware attachment. + + + cd /opt/nexus/.claude/worktrees/agent-a009558f && grep -q "voiceRoutes" server/src/app.ts && grep -q "nexusSettingsRoutes" server/src/app.ts && grep -q "Cross-Origin-Opener-Policy" server/src/app.ts && grep -q "voiceMode" server/src/routes/chat.ts && grep -q "voice_full" server/src/routes/chat.ts && test -f server/src/routes/voice.ts && test -f server/src/routes/nexus-settings.ts && echo "PASS" || echo "FAIL" + + + - grep "voiceRoutes" server/src/app.ts returns match + - grep "nexusSettingsRoutes" server/src/app.ts returns match + - grep "Cross-Origin-Opener-Policy" server/src/app.ts returns "same-origin" + - grep "Cross-Origin-Embedder-Policy" server/src/app.ts returns "require-corp" + - grep "voiceMode" server/src/routes/chat.ts returns match + - grep "voice_full" server/src/routes/chat.ts returns match + - server/src/routes/voice.ts exists with POST /transcribe and POST /synthesize + - server/src/routes/nexus-settings.ts exists with GET and PATCH /nexus/settings + - grep "VOICE_MODES" server/src/services/nexus-settings.ts returns match + + Phase 36 server deliverables present on branch. COOP/COEP headers added. Voice routes mounted. Chat stream accepts voiceMode. + + + + Task 2: Install VAD library, copy ONNX assets, configure Vite COOP/COEP headers + + ui/package.json, + ui/public/vad.worklet.bundle.min.js, + ui/public/silero_vad_legacy.onnx, + ui/public/silero_vad_v5.onnx, + ui/vite.config.ts + + + ui/package.json, + ui/vite.config.ts + + +1. 
Install @ricky0123/vad-react in the ui package: + ```bash + pnpm add @ricky0123/vad-react --filter @paperclipai/ui + ``` + +2. Copy VAD assets from node_modules to ui/public/ for same-origin serving (avoids COEP blocking CDN): + ```bash + cp node_modules/@ricky0123/vad-web/dist/vad.worklet.bundle.min.js ui/public/ + cp node_modules/@ricky0123/vad-web/dist/silero_vad_legacy.onnx ui/public/ + cp node_modules/@ricky0123/vad-web/dist/silero_vad_v5.onnx ui/public/ + ``` + If vad-web is in ui/node_modules/@ricky0123/vad-web/dist/, use that path instead. + Verify all three files exist after copy. NOTE (confirm against the installed vad-web version): because the VAD hook sets `onnxWASMBasePath: "/"`, the onnxruntime-web runtime files (`ort-wasm*.wasm` and their companion `.mjs` files from node_modules/onnxruntime-web/dist/) presumably also have to be served from ui/public/; copy them as well if the VAD fails to load its WASM backend. + +3. Add a "copy-vad-assets" script to ui/package.json: + ```json + "copy-vad-assets": "cp node_modules/@ricky0123/vad-web/dist/vad.worklet.bundle.min.js public/ && cp node_modules/@ricky0123/vad-web/dist/silero_vad_legacy.onnx public/ && cp node_modules/@ricky0123/vad-web/dist/silero_vad_v5.onnx public/" + ``` + +4. Update ui/vite.config.ts — add COOP/COEP headers to dev server config: + ```typescript + server: { + port: 5173, + headers: { + "Cross-Origin-Opener-Policy": "same-origin", + "Cross-Origin-Embedder-Policy": "require-corp", + }, + proxy: { ... }, // keep existing proxy config + }, + ``` + This ensures SharedArrayBuffer works in Vite dev mode too. 
+ + + cd /opt/nexus/.claude/worktrees/agent-a009558f && test -f ui/public/vad.worklet.bundle.min.js && test -f ui/public/silero_vad_legacy.onnx && test -f ui/public/silero_vad_v5.onnx && grep -q "vad-react" ui/package.json && grep -q "Cross-Origin-Opener-Policy" ui/vite.config.ts && echo "PASS" || echo "FAIL" + + + - ui/public/vad.worklet.bundle.min.js exists (non-zero size) + - ui/public/silero_vad_legacy.onnx exists (non-zero size) + - ui/public/silero_vad_v5.onnx exists (non-zero size) + - grep "vad-react" ui/package.json returns match + - grep "Cross-Origin-Opener-Policy" ui/vite.config.ts returns "same-origin" + - grep "Cross-Origin-Embedder-Policy" ui/vite.config.ts returns "require-corp" + - grep "copy-vad-assets" ui/package.json returns match + + VAD library installed. ONNX model files and worklet bundle served from ui/public/. Vite dev server sends COOP/COEP headers. SharedArrayBuffer available in dev. + + + + + +- server/src/routes/voice.ts exists with transcribe and synthesize endpoints +- server/src/routes/nexus-settings.ts exists with GET/PATCH +- server/src/app.ts mounts both route sets and has COOP/COEP middleware +- server/src/routes/chat.ts handles voiceMode in stream handler +- ui/public/ has all 3 VAD asset files +- ui/vite.config.ts has COOP/COEP headers +- @ricky0123/vad-react in ui/package.json dependencies + + + +All Phase 36 server deliverables present. COOP/COEP headers set on both Express and Vite dev server. VAD assets served from same-origin. Foundation ready for frontend voice components. 
+ + + +After completion, create `.planning/phases/37-web-chat-voice-ui/37-01-SUMMARY.md` + diff --git a/.planning/phases/37-web-chat-voice-ui/37-02-PLAN.md b/.planning/phases/37-web-chat-voice-ui/37-02-PLAN.md new file mode 100644 index 00000000..6c4fb4bd --- /dev/null +++ b/.planning/phases/37-web-chat-voice-ui/37-02-PLAN.md @@ -0,0 +1,300 @@ +--- +phase: 37-web-chat-voice-ui +plan: 02 +type: execute +wave: 2 +depends_on: ["37-01"] +files_modified: + - ui/src/lib/encodeWav.ts + - ui/src/hooks/useVadRecorder.ts + - ui/src/hooks/useVoiceMode.ts + - ui/src/components/VoiceWaveform.tsx + - ui/src/components/VoiceMicButton.tsx +autonomous: true +requirements: + - WCHAT-01 + - WCHAT-02 + - WCHAT-03 + - WCHAT-05 + +must_haves: + truths: + - "VoiceMicButton renders three visual states: idle (Mic icon), recording (waveform + ring), processing (Loader2 spinner)" + - "Recording auto-stops on silence via VAD onSpeechEnd callback" + - "VoiceWaveform renders animated canvas bars during recording" + - "useVadRecorder converts Float32Array to WAV and POSTs to /api/transcribe" + - "useVoiceMode reads voiceMode from GET /api/nexus/settings and writes via PATCH" + artifacts: + - path: "ui/src/lib/encodeWav.ts" + provides: "Float32Array to WAV blob encoder" + exports: ["encodeWav"] + - path: "ui/src/hooks/useVadRecorder.ts" + provides: "VAD recording hook with auto-stop" + exports: ["useVadRecorder"] + - path: "ui/src/hooks/useVoiceMode.ts" + provides: "Voice mode state from nexus-settings" + exports: ["useVoiceMode"] + - path: "ui/src/components/VoiceWaveform.tsx" + provides: "Canvas amplitude visualization" + exports: ["VoiceWaveform"] + - path: "ui/src/components/VoiceMicButton.tsx" + provides: "VAD-powered mic button with three states" + exports: ["VoiceMicButton"] + key_links: + - from: "ui/src/components/VoiceMicButton.tsx" + to: "ui/src/hooks/useVadRecorder.ts" + via: "useVadRecorder() hook call" + pattern: "useVadRecorder" + - from: "ui/src/hooks/useVadRecorder.ts" + to: 
"ui/src/lib/encodeWav.ts" + via: "encodeWav(audio) in onSpeechEnd" + pattern: "encodeWav" + - from: "ui/src/hooks/useVadRecorder.ts" + to: "/api/transcribe" + via: "fetch POST with FormData" + pattern: "fetch.*api/transcribe" + - from: "ui/src/components/VoiceMicButton.tsx" + to: "ui/src/components/VoiceWaveform.tsx" + via: "VoiceWaveform rendered inside recording state" + pattern: " +Build the core voice recording components: WAV encoder, VAD recorder hook, voice mode hook, waveform visualization, and the VoiceMicButton that ties them together. + +Purpose: These are the foundational building blocks that replace VoiceRecordButton with VAD-powered auto-stop recording and real-time waveform visualization. + +Output: 5 new files — encodeWav utility, useVadRecorder hook, useVoiceMode hook, VoiceWaveform component, VoiceMicButton component + + + +@$HOME/.claude/get-shit-done/workflows/execute-plan.md +@$HOME/.claude/get-shit-done/templates/summary.md + + + +@.planning/phases/37-web-chat-voice-ui/37-RESEARCH.md +@.planning/phases/37-web-chat-voice-ui/37-01-SUMMARY.md + + + +```typescript +// @ricky0123/vad-react useMicVAD hook +import { useMicVAD } from "@ricky0123/vad-react"; +const vad = useMicVAD({ + startOnLoad: false, + baseAssetPath: "/", + onnxWASMBasePath: "/", + positiveSpeechThreshold: 0.8, + negativeSpeechThreshold: 0.65, + redemptionFrames: 8, + minSpeechFrames: 5, + onSpeechStart: () => void, + onSpeechEnd: (audio: Float32Array) => void, +}); +// Returns: { listening, loading, errored, userSpeaking, start, pause } +``` + + +```typescript +interface VoiceRecordButtonProps { + onTranscription: (text: string) => void; + disabled?: boolean; +} +``` + + +``` +GET /api/nexus/settings → { mode, voiceEnabled, voiceMode, ... 
} +PATCH /api/nexus/settings → accepts partial, returns updated +``` + + + + + + + Task 1: Create encodeWav utility and useVadRecorder + useVoiceMode hooks + + ui/src/lib/encodeWav.ts, + ui/src/hooks/useVadRecorder.ts, + ui/src/hooks/useVoiceMode.ts + + + ui/src/hooks/useStreamingChat.ts, + ui/src/api/chat.ts, + ui/src/components/VoiceRecordButton.tsx + + +1. **ui/src/lib/encodeWav.ts** — Create WAV encoder function: + ```typescript + export function encodeWav(samples: Float32Array, sampleRate = 16000): Blob + ``` + - Standard 44-byte WAV header (RIFF/WAVE/fmt/data chunks) + - PCM format (1), mono (1 channel), 16-bit depth + - Clamp samples to [-1, 1] range before int16 conversion + - Return Blob with type "audio/wav" + - Helper: `function writeString(view: DataView, offset: number, str: string)` + +2. **ui/src/hooks/useVadRecorder.ts** — Create VAD recording hook: + ```typescript + interface UseVadRecorderOptions { + onTranscript: (text: string) => void; + } + interface UseVadRecorderReturn { + state: "idle" | "recording" | "processing"; + start: () => void; + stop: () => void; + mediaStream: MediaStream | null; // exposed for VoiceWaveform AnalyserNode + } + export function useVadRecorder(opts: UseVadRecorderOptions): UseVadRecorderReturn + ``` + Implementation: + - Use `useMicVAD` from `@ricky0123/vad-react` with `startOnLoad: false` + - Set `baseAssetPath: "/"` and `onnxWASMBasePath: "/"` (serve from ui/public/) + - Set `positiveSpeechThreshold: 0.8`, `minSpeechFrames: 5` (300ms minimum to filter noise) + - In `onSpeechEnd(audio: Float32Array)`: + a. Call `vad.pause()` to stop listening + b. Set state to "processing" + c. Call `encodeWav(audio)` to get WAV blob + d. Create FormData, append blob as "audio" field with filename "recording.wav" + e. POST to `/api/transcribe` with `credentials: "include"` + f. Parse response as `{ text: string }` + g. If text is non-empty (length >= 2), call `opts.onTranscript(text.trim())` + h. 
Set state back to "idle" (do this in a `finally` block so a failed upload or transcription does not leave the button stuck in "processing") + - `start()`: calls `vad.start()`, sets state to "recording" + - `stop()`: calls `vad.pause()`, sets state to "idle" + - Expose `mediaStream` from `navigator.mediaDevices.getUserMedia({ audio: true })` — store in a ref. This is needed for VoiceWaveform AnalyserNode. + - NOTE: useMicVAD manages its own media stream internally, but VoiceWaveform needs a separate reference to the stream for the AnalyserNode. Request the stream in the `start()` function and store in a ref. Stop tracks in `stop()`, and also when `onSpeechEnd` auto-stops the recording, so the microphone is released on both paths. + +3. **ui/src/hooks/useVoiceMode.ts** — Create voice mode hook: + ```typescript + type VoiceMode = "text" | "voice_input" | "full_voice"; + interface UseVoiceModeReturn { + mode: VoiceMode; + setMode: (next: VoiceMode) => Promise<void>; + isLoading: boolean; + } + export function useVoiceMode(): UseVoiceModeReturn + ``` + Implementation: + - On mount, GET /api/nexus/settings with credentials: "include" + - Extract `voiceMode` from response, default to "text" + - `setMode(next)`: optimistically update local state, then PATCH /api/nexus/settings with `{ voiceMode: next }` + - Use useState for mode and isLoading + - Wrap fetch in try/catch; on error, revert to previous mode + + + cd /opt/nexus/.claude/worktrees/agent-a009558f && test -f ui/src/lib/encodeWav.ts && test -f ui/src/hooks/useVadRecorder.ts && test -f ui/src/hooks/useVoiceMode.ts && grep -q "encodeWav" ui/src/lib/encodeWav.ts && grep -q "useVadRecorder" ui/src/hooks/useVadRecorder.ts && grep -q "useVoiceMode" ui/src/hooks/useVoiceMode.ts && grep -q "useMicVAD" ui/src/hooks/useVadRecorder.ts && grep -q "api/transcribe" ui/src/hooks/useVadRecorder.ts && grep -q "api/nexus/settings" ui/src/hooks/useVoiceMode.ts && echo "PASS" || echo "FAIL" + + + - grep "export function encodeWav" ui/src/lib/encodeWav.ts returns match + - grep "export function useVadRecorder" ui/src/hooks/useVadRecorder.ts returns match + - grep "export function useVoiceMode" ui/src/hooks/useVoiceMode.ts returns match + - grep 
"useMicVAD" ui/src/hooks/useVadRecorder.ts returns match + - grep "startOnLoad.*false" ui/src/hooks/useVadRecorder.ts returns match + - grep "baseAssetPath" ui/src/hooks/useVadRecorder.ts returns match with "/" + - grep "api/transcribe" ui/src/hooks/useVadRecorder.ts returns match + - grep "api/nexus/settings" ui/src/hooks/useVoiceMode.ts returns match + - grep "encodeWav" ui/src/hooks/useVadRecorder.ts returns match (imports it) + - grep "RIFF" ui/src/lib/encodeWav.ts returns match (WAV header) + + encodeWav utility produces valid WAV blobs. useVadRecorder wraps useMicVAD with auto-stop + transcription. useVoiceMode reads/writes voiceMode from nexus-settings API. + + + + Task 2: Create VoiceWaveform canvas component and VoiceMicButton + + ui/src/components/VoiceWaveform.tsx, + ui/src/components/VoiceMicButton.tsx + + + ui/src/hooks/useVadRecorder.ts, + ui/src/lib/encodeWav.ts, + ui/src/components/VoiceRecordButton.tsx + + +1. **ui/src/components/VoiceWaveform.tsx** — Canvas-based amplitude visualization: + ```typescript + interface VoiceWaveformProps { + stream: MediaStream | null; + active: boolean; // controls animation loop + } + export function VoiceWaveform({ stream, active }: VoiceWaveformProps) + ``` + Implementation: + - Use a `<canvas>` element, width=80, height=32 (h-8 per UI spec), className="inline-block" + - On mount (when stream is truthy and active is true): + a. Create AudioContext (lazily — only create once, store in ref) + b. If AudioContext is suspended, call `audioCtx.resume()` + c. Create MediaStreamSource from stream + d. Create AnalyserNode with fftSize=64 (gives 32 frequency bins) + e. Connect source -> analyser + f. 
Start requestAnimationFrame loop: + - Call `analyser.getByteFrequencyData(dataArray)` into Uint8Array(32) + - Clear canvas + - Draw 20 bars, each bar width=2px, gap=2px, spread evenly across the available bins: bar i reads binIndex = Math.floor(i * dataArray.length / 20) (reading dataArray[i*2] would run past the end of the 32-element array once i >= 16) + - Bar height = (dataArray[binIndex] / 255) * canvasHeight, minimum 2px + - Bar color: use CSS variable --primary via getComputedStyle + g. Store animationFrame id in ref for cleanup + - On cleanup or when active becomes false: cancelAnimationFrame, disconnect source + - Do NOT close AudioContext on cleanup (reuse across start/stop cycles) + +2. **ui/src/components/VoiceMicButton.tsx** — VAD-powered mic button: + ```typescript + interface VoiceMicButtonProps { + onTranscript: (text: string) => void; + disabled?: boolean; + } + export function VoiceMicButton({ onTranscript, disabled }: VoiceMicButtonProps) + ``` + Implementation: + - Call `useVadRecorder({ onTranscript })` to get `{ state, start, stop, mediaStream }` + - Three visual states per UI spec: + a. **idle** (state === "idle"): Render Button with ghost variant, size="icon", h-8 w-8. Content: `<Mic className="h-4 w-4" />`. aria-label="Start voice input". onClick calls start(). + b. **recording** (state === "recording"): Render Button with ghost variant, size="icon", h-8 w-8, with `ring-2 ring-primary` classes. Content: `<VoiceWaveform stream={mediaStream} active />`. aria-label="Recording — speak now". onClick calls stop(). + c. **processing** (state === "processing"): Render Button disabled, ghost variant, size="icon", h-8 w-8. Content: `<Loader2 className="h-4 w-4 animate-spin" />`. aria-label="Transcribing...". 
+ - Import Mic, Loader2 from lucide-react + - Import Button from @/components/ui/button + - Import VoiceWaveform from ./VoiceWaveform + - Import useVadRecorder from ../hooks/useVadRecorder + - When disabled prop is true, render idle state with disabled attribute + + + cd /opt/nexus/.claude/worktrees/agent-a009558f && test -f ui/src/components/VoiceWaveform.tsx && test -f ui/src/components/VoiceMicButton.tsx && grep -q "VoiceWaveform" ui/src/components/VoiceWaveform.tsx && grep -q "VoiceMicButton" ui/src/components/VoiceMicButton.tsx && grep -q "canvas" ui/src/components/VoiceWaveform.tsx && grep -q "useVadRecorder" ui/src/components/VoiceMicButton.tsx && grep -q "Mic" ui/src/components/VoiceMicButton.tsx && grep -q "Loader2" ui/src/components/VoiceMicButton.tsx && grep -q "ring-2 ring-primary" ui/src/components/VoiceMicButton.tsx && echo "PASS" || echo "FAIL" + + + - grep "export function VoiceWaveform" ui/src/components/VoiceWaveform.tsx returns match + - grep "export function VoiceMicButton" ui/src/components/VoiceMicButton.tsx returns match + - grep "canvas" ui/src/components/VoiceWaveform.tsx returns match + - grep "AnalyserNode\|createAnalyser\|analyser" ui/src/components/VoiceWaveform.tsx returns match + - grep "requestAnimationFrame" ui/src/components/VoiceWaveform.tsx returns match + - grep "getByteFrequencyData" ui/src/components/VoiceWaveform.tsx returns match + - grep "useVadRecorder" ui/src/components/VoiceMicButton.tsx returns match + - grep 'aria-label="Start voice input"' ui/src/components/VoiceMicButton.tsx returns match + - grep 'aria-label="Recording' ui/src/components/VoiceMicButton.tsx returns match + - grep 'aria-label="Transcribing' ui/src/components/VoiceMicButton.tsx returns match + - grep "ring-2 ring-primary" ui/src/components/VoiceMicButton.tsx returns match + - grep "Loader2.*animate-spin" ui/src/components/VoiceMicButton.tsx returns match + + VoiceWaveform renders 20 animated bars from Web Audio API AnalyserNode on a 80x32 canvas. 
VoiceMicButton shows idle/recording/processing states with correct icons, aria-labels, and ring styling. + + + + + +- All 5 files exist and export their named functions +- useVadRecorder uses useMicVAD with startOnLoad: false and baseAssetPath: "/" +- VoiceMicButton has three distinct visual states with correct aria-labels +- VoiceWaveform uses canvas + AnalyserNode pattern +- encodeWav produces Blob with type audio/wav +- useVoiceMode reads/writes via /api/nexus/settings + + + +Core voice recording pipeline complete: user clicks mic -> VAD listens -> waveform animates -> silence detected -> audio encoded to WAV -> POSTed to /api/transcribe -> transcript returned. Voice mode readable/writable from nexus-settings. + + + +After completion, create `.planning/phases/37-web-chat-voice-ui/37-02-SUMMARY.md` + diff --git a/.planning/phases/37-web-chat-voice-ui/37-03-PLAN.md b/.planning/phases/37-web-chat-voice-ui/37-03-PLAN.md new file mode 100644 index 00000000..db6aa573 --- /dev/null +++ b/.planning/phases/37-web-chat-voice-ui/37-03-PLAN.md @@ -0,0 +1,286 @@ +--- +phase: 37-web-chat-voice-ui +plan: 03 +type: execute +wave: 2 +depends_on: ["37-01"] +files_modified: + - ui/src/components/ChatVoicePlayer.tsx + - ui/src/components/ChatVoiceBadge.tsx + - ui/src/components/VoiceModeToggle.tsx +autonomous: true +requirements: + - WCHAT-04 + - WCHAT-05 + - WCHAT-06 + +must_haves: + truths: + - "ChatVoicePlayer renders inline audio player with play/pause controls" + - "ChatVoicePlayer auto-plays when autoPlay setting is true" + - "ChatVoiceBadge shows 'Voice' badge on voice messages" + - "ChatVoiceBadge has collapsible full markdown section for voice_full messages" + - "VoiceModeToggle renders three pills: Text / Voice In / Full Voice" + - "VoiceModeToggle persists selection via useVoiceMode hook" + - "Auto-play preference stored in localStorage under nexus:voice:autoplay" + artifacts: + - path: "ui/src/components/ChatVoicePlayer.tsx" + provides: "Inline audio player for 
synthesized voice responses" + exports: ["ChatVoicePlayer"] + - path: "ui/src/components/ChatVoiceBadge.tsx" + provides: "Voice badge + collapsible markdown on agent messages" + exports: ["ChatVoiceBadge"] + - path: "ui/src/components/VoiceModeToggle.tsx" + provides: "Three-state pill toggle for voice mode" + exports: ["VoiceModeToggle"] + key_links: + - from: "ui/src/components/ChatVoicePlayer.tsx" + to: "/api/synthesize" + via: "fetch POST to get audio blob" + pattern: "fetch.*api/synthesize" + - from: "ui/src/components/ChatVoiceBadge.tsx" + to: "shadcn Collapsible" + via: "Collapsible/CollapsibleContent/CollapsibleTrigger" + pattern: "Collapsible" + - from: "ui/src/components/VoiceModeToggle.tsx" + to: "ui/src/hooks/useVoiceMode.ts" + via: "useVoiceMode() hook" + pattern: "useVoiceMode" +--- + + +Build the voice output and mode selection components: ChatVoicePlayer for inline audio playback, ChatVoiceBadge for voice message display, and VoiceModeToggle for switching between text/voice_input/full_voice modes. + +Purpose: These components handle the output side of voice I/O (playing synthesized responses, showing voice badges on messages) and the mode selector that controls the entire voice behavior. 
+ +Output: 3 new component files — ChatVoicePlayer, ChatVoiceBadge, VoiceModeToggle + + + +@$HOME/.claude/get-shit-done/workflows/execute-plan.md +@$HOME/.claude/get-shit-done/templates/summary.md + + + +@.planning/phases/37-web-chat-voice-ui/37-RESEARCH.md + + + +``` +POST /api/synthesize + Body: { text: string, voiceId?: string } + Response: audio/wav binary buffer +``` + + +```typescript +type VoiceMode = "text" | "voice_input" | "full_voice"; +export function useVoiceMode(): { + mode: VoiceMode; + setMode: (next: VoiceMode) => Promise<void>; + isLoading: boolean; +} +``` + + +``` +messageType: "voice_input" → user sent via voice, agent replied with text +messageType: "voice_full" → user sent via voice, agent replied with SPOKEN + DETAILED format +``` + + +``` +SPOKEN: <short text to be synthesized and played back> +DETAILED: <full markdown response shown in the collapsible section> +``` + + +```typescript +import { Badge } from "@/components/ui/badge"; +import { Collapsible, CollapsibleContent, CollapsibleTrigger } from "@/components/ui/collapsible"; +import { Button } from "@/components/ui/button"; +``` + + + + + + + Task 1: Create ChatVoicePlayer and ChatVoiceBadge components + + ui/src/components/ChatVoicePlayer.tsx, + ui/src/components/ChatVoiceBadge.tsx + + + ui/src/components/ChatMessage.tsx, + ui/src/components/ChatMarkdownMessage.tsx + + +1. 
**ui/src/components/ChatVoicePlayer.tsx** — Inline audio player for voice responses: + ```typescript + interface ChatVoicePlayerProps { + text: string; // The spoken text to synthesize + autoPlay?: boolean; // Whether to auto-play on mount + } + export function ChatVoicePlayer({ text, autoPlay = false }: ChatVoicePlayerProps) + ``` + Implementation: + - State: `status: "idle" | "loading" | "playing" | "paused"`, `audioUrl: string | null` + - On mount (or when text changes): POST /api/synthesize with `{ text }`, credentials: "include" + - Set status to "loading" + - Get response as blob: `const blob = await res.blob()` + - Create object URL: `const url = URL.createObjectURL(blob)` + - Store url in state, set status to "idle" + - Create `