diff --git a/.planning/phases/37-web-chat-voice-ui/37-01-PLAN.md b/.planning/phases/37-web-chat-voice-ui/37-01-PLAN.md new file mode 100644 index 00000000..ea2551b9 --- /dev/null +++ b/.planning/phases/37-web-chat-voice-ui/37-01-PLAN.md @@ -0,0 +1,297 @@ +--- +phase: 37-web-chat-voice-ui +plan: 01 +type: execute +wave: 1 +depends_on: [] +files_modified: + - server/src/services/nexus-settings.ts + - server/src/routes/nexus-settings.ts + - server/src/routes/voice.ts + - server/src/routes/chat.ts + - server/src/app.ts + - packages/shared/src/types/chat.ts + - packages/shared/src/validators/chat.ts + - ui/vite.config.ts + - ui/package.json + - ui/public/vad.worklet.bundle.min.js + - ui/public/silero_vad_legacy.onnx + - ui/public/silero_vad_v5.onnx +autonomous: true +requirements: + - WCHAT-01 + - WCHAT-02 + - WCHAT-04 + +must_haves: + truths: + - "POST /api/transcribe accepts audio upload and returns { text }" + - "POST /api/synthesize accepts { text } and returns audio/wav" + - "GET /api/nexus/settings returns voiceMode field" + - "PATCH /api/nexus/settings accepts voiceMode update" + - "Chat stream endpoint accepts voiceMode in request body" + - "SharedArrayBuffer is available in browser (COOP/COEP headers set)" + - "VAD ONNX model files are served from /vad.worklet.bundle.min.js, /silero_vad_legacy.onnx, /silero_vad_v5.onnx" + artifacts: + - path: "server/src/routes/voice.ts" + provides: "POST /api/transcribe and POST /api/synthesize" + exports: ["voiceRoutes"] + - path: "server/src/routes/nexus-settings.ts" + provides: "GET/PATCH /api/nexus/settings" + exports: ["nexusSettingsRoutes"] + - path: "server/src/services/nexus-settings.ts" + provides: "nexusSettingsService with voiceMode field" + exports: ["nexusSettingsService", "VoiceMode", "VOICE_MODES"] + - path: "ui/public/vad.worklet.bundle.min.js" + provides: "VAD AudioWorklet bundle" + - path: "ui/public/silero_vad_legacy.onnx" + provides: "Silero VAD legacy ONNX model" + key_links: + - from: "server/src/app.ts" 
+ to: "server/src/routes/voice.ts" + via: "api.use(voiceRoutes())" + pattern: "voiceRoutes" + - from: "server/src/app.ts" + to: "server/src/routes/nexus-settings.ts" + via: "api.use(nexusSettingsRoutes())" + pattern: "nexusSettingsRoutes" + - from: "server/src/routes/chat.ts" + to: "voiceMode parameter" + via: "req.body.voiceMode in stream handler" + pattern: "voiceMode.*voice_input|voice_full" +--- + + +Establish all server-side prerequisites and browser infrastructure for voice I/O. + +Purpose: Phase 36 Tasks 2-3 (nexus-settings voiceMode schema, voice HTTP routes, voiceMode wiring in chat.ts) are not present on this branch. This plan cherry-picks or re-implements those deliverables, adds COOP/COEP headers for SharedArrayBuffer, installs @ricky0123/vad-react, copies VAD ONNX assets to ui/public/, and configures Vite dev server headers. + +Output: Working server endpoints (transcribe, synthesize, nexus-settings), COOP/COEP isolation, VAD assets ready in ui/public/ + + + +@$HOME/.claude/get-shit-done/workflows/execute-plan.md +@$HOME/.claude/get-shit-done/templates/summary.md + + + +@.planning/phases/37-web-chat-voice-ui/37-RESEARCH.md + + + + +From server/src/services/voice-pipeline.ts (ALREADY on this branch): +```typescript +// voicePipelineService() exposes transcribe(buffer, format) and synthesize(text, voiceId?) +export function voicePipelineService(): { transcribe, synthesize, formatForVoice, transcodeToWav16k } +``` + +From server/src/app.ts (parent branch — route mounting pattern): +```typescript +// Routes are mounted on an `api` Router via api.use(...) 
+// Pattern: import { xyzRoutes } from "./routes/xyz.js"; then api.use(xyzRoutes()); +import { chatRoutes } from "./routes/chat.js"; +api.use(chatRoutes(db, storageService, config)); +``` + +From packages/shared/src/types/chat.ts (parent branch): +```typescript +export interface ChatMessage { + id: string; + conversationId: string; + role: "user" | "assistant" | "system"; + content: string; + messageType?: string | null; + // ... other fields +} +``` + +From packages/shared/src/validators/chat.ts (parent branch): +```typescript +export const createMessageSchema = z.object({ + content: z.string().min(1), + role: z.enum(["user", "assistant", "system"]).default("user"), + agentId: z.string().uuid().optional(), + // voiceMode NOT present on parent branch — must add +}); +``` + + + + + + + Task 1: Cherry-pick Phase 36 server deliverables and add COOP/COEP headers + + server/src/services/nexus-settings.ts, + server/src/routes/nexus-settings.ts, + server/src/routes/voice.ts, + server/src/routes/chat.ts, + server/src/app.ts, + packages/shared/src/types/chat.ts, + packages/shared/src/validators/chat.ts + + + server/src/services/nexus-settings.ts, + server/src/services/voice-pipeline.ts, + server/src/app.ts, + server/src/routes/chat.ts, + packages/shared/src/types/chat.ts, + packages/shared/src/validators/chat.ts + + +Cherry-pick or re-implement Phase 36 Tasks 2-3 deliverables. The commits on gsd/phase-36-voice-pipeline-foundation are: +- d0d7a23a (nexus-settings voiceMode schema extension) +- b964c0e4 (voiceMode in createMessageSchema + ChatMessage interface) +- 11508547 (voice HTTP routes) +- fd372eaf (voiceMode wiring in chat.ts + route mounting) + +Try cherry-picking these 4 commits in order: +```bash +git cherry-pick d0d7a23a b964c0e4 11508547 fd372eaf +``` + +If cherry-pick conflicts, re-implement manually: + +1. 
**server/src/services/nexus-settings.ts** — Add VOICE_MODES and VoiceMode type: + ```typescript + export const VOICE_MODES = ["text", "voice_input", "full_voice"] as const; + export type VoiceMode = (typeof VOICE_MODES)[number]; + ``` + Add `voiceMode: z.enum(VOICE_MODES).default("text")` to nexusSettingsSchema. + Add `telegramToken: z.string().optional()`, `piperBinaryPath: z.string().optional()`, `whisperBinaryPath: z.string().optional()`. + +2. **server/src/routes/nexus-settings.ts** — Create new file: + - GET /nexus/settings — returns nexusSettingsService().get() + - PATCH /nexus/settings — calls nexusSettingsService().set(req.body), returns updated + - Both routes call assertBoard(req) first + - Import Router from express, assertBoard from ./authz.js, nexusSettingsService from ../services/nexus-settings.js + +3. **server/src/routes/voice.ts** — Create new file: + - POST /transcribe — accepts multipart audio upload via multer memoryStorage, calls voicePipelineService().transcribe(buffer, format), returns { text } + - POST /synthesize — accepts JSON { text, voiceId? }, calls voicePipelineService().synthesize(text, voiceId), returns audio/wav buffer + - Both routes call assertBoard(req) + - Import multer, Router, assertBoard, voicePipelineService, MAX_ATTACHMENT_BYTES + +4. **packages/shared/src/types/chat.ts** — Add `voiceMode?: string | null;` to ChatMessage interface if not present. + +5. **packages/shared/src/validators/chat.ts** — Add `voiceMode: z.enum(["text", "voice_input", "full_voice"]).optional()` to createMessageSchema. + +6. **server/src/routes/chat.ts** — In the stream POST handler, destructure `voiceMode` from req.body alongside content and agentId. When voiceMode is "full_voice", call voicePipelineService().formatForVoice(aiContent) to produce SPOKEN/DETAILED format. Set messageType on stored message: "voice_full" if voiceMode==="full_voice", "voice_input" if voiceMode==="voice_input", else null. + +7. 
**server/src/app.ts** — Import and mount voiceRoutes and nexusSettingsRoutes: + ```typescript + import { nexusSettingsRoutes } from "./routes/nexus-settings.js"; + import { voiceRoutes } from "./routes/voice.js"; + // In the api router setup: + api.use(nexusSettingsRoutes()); + api.use(voiceRoutes()); + ``` + +8. **COOP/COEP headers** — In server/src/app.ts, add middleware BEFORE static file serving and vite dev middleware: + ```typescript + app.use((_req, res, next) => { + res.setHeader("Cross-Origin-Opener-Policy", "same-origin"); + res.setHeader("Cross-Origin-Embedder-Policy", "require-corp"); + next(); + }); + ``` + Place this before any `app.use(express.static(...))` or vite middleware attachment. + + + cd /opt/nexus/.claude/worktrees/agent-a009558f && grep -q "voiceRoutes" server/src/app.ts && grep -q "nexusSettingsRoutes" server/src/app.ts && grep -q "Cross-Origin-Opener-Policy" server/src/app.ts && grep -q "voiceMode" server/src/routes/chat.ts && grep -q "voice_full" server/src/routes/chat.ts && test -f server/src/routes/voice.ts && test -f server/src/routes/nexus-settings.ts && echo "PASS" || echo "FAIL" + + + - grep "voiceRoutes" server/src/app.ts returns match + - grep "nexusSettingsRoutes" server/src/app.ts returns match + - grep "Cross-Origin-Opener-Policy" server/src/app.ts returns "same-origin" + - grep "Cross-Origin-Embedder-Policy" server/src/app.ts returns "require-corp" + - grep "voiceMode" server/src/routes/chat.ts returns match + - grep "voice_full" server/src/routes/chat.ts returns match + - server/src/routes/voice.ts exists with POST /transcribe and POST /synthesize + - server/src/routes/nexus-settings.ts exists with GET and PATCH /nexus/settings + - grep "VOICE_MODES" server/src/services/nexus-settings.ts returns match + + Phase 36 server deliverables present on branch. COOP/COEP headers added. Voice routes mounted. Chat stream accepts voiceMode. 
+ + + + Task 2: Install VAD library, copy ONNX assets, configure Vite COOP/COEP headers + + ui/package.json, + ui/public/vad.worklet.bundle.min.js, + ui/public/silero_vad_legacy.onnx, + ui/public/silero_vad_v5.onnx, + ui/vite.config.ts + + + ui/package.json, + ui/vite.config.ts + + +1. Install @ricky0123/vad-react in the ui package: + ```bash + pnpm add @ricky0123/vad-react --filter @paperclipai/ui + ``` + +2. Copy VAD assets from node_modules to ui/public/ for same-origin serving (avoids COEP blocking CDN): + ```bash + cp node_modules/@ricky0123/vad-web/dist/vad.worklet.bundle.min.js ui/public/ + cp node_modules/@ricky0123/vad-web/dist/silero_vad_legacy.onnx ui/public/ + cp node_modules/@ricky0123/vad-web/dist/silero_vad_v5.onnx ui/public/ + ``` + If vad-web is in ui/node_modules/@ricky0123/vad-web/dist/, use that path instead. + Verify all three files exist after copy. + +3. Add a "copy-vad-assets" script to ui/package.json: + ```json + "copy-vad-assets": "cp node_modules/@ricky0123/vad-web/dist/vad.worklet.bundle.min.js public/ && cp node_modules/@ricky0123/vad-web/dist/silero_vad_legacy.onnx public/ && cp node_modules/@ricky0123/vad-web/dist/silero_vad_v5.onnx public/" + ``` + +4. Update ui/vite.config.ts — add COOP/COEP headers to dev server config: + ```typescript + server: { + port: 5173, + headers: { + "Cross-Origin-Opener-Policy": "same-origin", + "Cross-Origin-Embedder-Policy": "require-corp", + }, + proxy: { ... }, // keep existing proxy config + }, + ``` + This ensures SharedArrayBuffer works in Vite dev mode too. 
+ + + cd /opt/nexus/.claude/worktrees/agent-a009558f && test -f ui/public/vad.worklet.bundle.min.js && test -f ui/public/silero_vad_legacy.onnx && test -f ui/public/silero_vad_v5.onnx && grep -q "vad-react" ui/package.json && grep -q "Cross-Origin-Opener-Policy" ui/vite.config.ts && echo "PASS" || echo "FAIL" + + + - ui/public/vad.worklet.bundle.min.js exists (non-zero size) + - ui/public/silero_vad_legacy.onnx exists (non-zero size) + - ui/public/silero_vad_v5.onnx exists (non-zero size) + - grep "vad-react" ui/package.json returns match + - grep "Cross-Origin-Opener-Policy" ui/vite.config.ts returns "same-origin" + - grep "Cross-Origin-Embedder-Policy" ui/vite.config.ts returns "require-corp" + - grep "copy-vad-assets" ui/package.json returns match + + VAD library installed. ONNX model files and worklet bundle served from ui/public/. Vite dev server sends COOP/COEP headers. SharedArrayBuffer available in dev. + + + + + +- server/src/routes/voice.ts exists with transcribe and synthesize endpoints +- server/src/routes/nexus-settings.ts exists with GET/PATCH +- server/src/app.ts mounts both route sets and has COOP/COEP middleware +- server/src/routes/chat.ts handles voiceMode in stream handler +- ui/public/ has all 3 VAD asset files +- ui/vite.config.ts has COOP/COEP headers +- @ricky0123/vad-react in ui/package.json dependencies + + + +All Phase 36 server deliverables present. COOP/COEP headers set on both Express and Vite dev server. VAD assets served from same-origin. Foundation ready for frontend voice components. 
+ + +After completion, create `.planning/phases/37-web-chat-voice-ui/37-01-SUMMARY.md` + diff --git a/.planning/phases/37-web-chat-voice-ui/37-02-PLAN.md b/.planning/phases/37-web-chat-voice-ui/37-02-PLAN.md new file mode 100644 index 00000000..6c4fb4bd --- /dev/null +++ b/.planning/phases/37-web-chat-voice-ui/37-02-PLAN.md @@ -0,0 +1,300 @@ +--- +phase: 37-web-chat-voice-ui +plan: 02 +type: execute +wave: 2 +depends_on: ["37-01"] +files_modified: + - ui/src/lib/encodeWav.ts + - ui/src/hooks/useVadRecorder.ts + - ui/src/hooks/useVoiceMode.ts + - ui/src/components/VoiceWaveform.tsx + - ui/src/components/VoiceMicButton.tsx +autonomous: true +requirements: + - WCHAT-01 + - WCHAT-02 + - WCHAT-03 + - WCHAT-05 + +must_haves: + truths: + - "VoiceMicButton renders three visual states: idle (Mic icon), recording (waveform + ring), processing (Loader2 spinner)" + - "Recording auto-stops on silence via VAD onSpeechEnd callback" + - "VoiceWaveform renders animated canvas bars during recording" + - "useVadRecorder converts Float32Array to WAV and POSTs to /api/transcribe" + - "useVoiceMode reads voiceMode from GET /api/nexus/settings and writes via PATCH" + artifacts: + - path: "ui/src/lib/encodeWav.ts" + provides: "Float32Array to WAV blob encoder" + exports: ["encodeWav"] + - path: "ui/src/hooks/useVadRecorder.ts" + provides: "VAD recording hook with auto-stop" + exports: ["useVadRecorder"] + - path: "ui/src/hooks/useVoiceMode.ts" + provides: "Voice mode state from nexus-settings" + exports: ["useVoiceMode"] + - path: "ui/src/components/VoiceWaveform.tsx" + provides: "Canvas amplitude visualization" + exports: ["VoiceWaveform"] + - path: "ui/src/components/VoiceMicButton.tsx" + provides: "VAD-powered mic button with three states" + exports: ["VoiceMicButton"] + key_links: + - from: "ui/src/components/VoiceMicButton.tsx" + to: "ui/src/hooks/useVadRecorder.ts" + via: "useVadRecorder() hook call" + pattern: "useVadRecorder" + - from: "ui/src/hooks/useVadRecorder.ts" + to:
"ui/src/lib/encodeWav.ts" + via: "encodeWav(audio) in onSpeechEnd" + pattern: "encodeWav" + - from: "ui/src/hooks/useVadRecorder.ts" + to: "/api/transcribe" + via: "fetch POST with FormData" + pattern: "fetch.*api/transcribe" + - from: "ui/src/components/VoiceMicButton.tsx" + to: "ui/src/components/VoiceWaveform.tsx" + via: "VoiceWaveform rendered inside recording state" + pattern: "<VoiceWaveform" +--- + + +Build the core voice recording components: WAV encoder, VAD recorder hook, voice mode hook, waveform visualization, and the VoiceMicButton that ties them together. + +Purpose: These are the foundational building blocks that replace VoiceRecordButton with VAD-powered auto-stop recording and real-time waveform visualization. + +Output: 5 new files — encodeWav utility, useVadRecorder hook, useVoiceMode hook, VoiceWaveform component, VoiceMicButton component + + + +@$HOME/.claude/get-shit-done/workflows/execute-plan.md +@$HOME/.claude/get-shit-done/templates/summary.md + + + +@.planning/phases/37-web-chat-voice-ui/37-RESEARCH.md +@.planning/phases/37-web-chat-voice-ui/37-01-SUMMARY.md + + + +```typescript +// @ricky0123/vad-react useMicVAD hook +import { useMicVAD } from "@ricky0123/vad-react"; +const vad = useMicVAD({ + startOnLoad: false, + baseAssetPath: "/", + onnxWASMBasePath: "/", + positiveSpeechThreshold: 0.8, + negativeSpeechThreshold: 0.65, + redemptionFrames: 8, + minSpeechFrames: 5, + onSpeechStart: () => void, + onSpeechEnd: (audio: Float32Array) => void, +}); +// Returns: { listening, loading, errored, userSpeaking, start, pause } +``` + + +```typescript +interface VoiceRecordButtonProps { + onTranscription: (text: string) => void; + disabled?: boolean; +} +``` + + +``` +GET /api/nexus/settings → { mode, voiceEnabled, voiceMode, ...
} +PATCH /api/nexus/settings → accepts partial, returns updated +``` + + + + + + + Task 1: Create encodeWav utility and useVadRecorder + useVoiceMode hooks + + ui/src/lib/encodeWav.ts, + ui/src/hooks/useVadRecorder.ts, + ui/src/hooks/useVoiceMode.ts + + + ui/src/hooks/useStreamingChat.ts, + ui/src/api/chat.ts, + ui/src/components/VoiceRecordButton.tsx + + +1. **ui/src/lib/encodeWav.ts** — Create WAV encoder function: + ```typescript + export function encodeWav(samples: Float32Array, sampleRate = 16000): Blob + ``` + - Standard 44-byte WAV header (RIFF/WAVE/fmt/data chunks) + - PCM format (1), mono (1 channel), 16-bit depth + - Clamp samples to [-1, 1] range before int16 conversion + - Return Blob with type "audio/wav" + - Helper: `function writeString(view: DataView, offset: number, str: string)` + +2. **ui/src/hooks/useVadRecorder.ts** — Create VAD recording hook: + ```typescript + interface UseVadRecorderOptions { + onTranscript: (text: string) => void; + } + interface UseVadRecorderReturn { + state: "idle" | "recording" | "processing"; + start: () => void; + stop: () => void; + mediaStream: MediaStream | null; // exposed for VoiceWaveform AnalyserNode + } + export function useVadRecorder(opts: UseVadRecorderOptions): UseVadRecorderReturn + ``` + Implementation: + - Use `useMicVAD` from `@ricky0123/vad-react` with `startOnLoad: false` + - Set `baseAssetPath: "/"` and `onnxWASMBasePath: "/"` (serve from ui/public/) + - Set `positiveSpeechThreshold: 0.8`, `minSpeechFrames: 5` (300ms minimum to filter noise) + - In `onSpeechEnd(audio: Float32Array)`: + a. Call `vad.pause()` to stop listening + b. Set state to "processing" + c. Call `encodeWav(audio)` to get WAV blob + d. Create FormData, append blob as "audio" field with filename "recording.wav" + e. POST to `/api/transcribe` with `credentials: "include"` + f. Parse response as `{ text: string }` + g. If text is non-empty (length >= 2), call `opts.onTranscript(text.trim())` + h. 
Set state back to "idle" + - `start()`: calls `vad.start()`, sets state to "recording" + - `stop()`: calls `vad.pause()`, sets state to "idle" + - Expose `mediaStream` from `navigator.mediaDevices.getUserMedia({ audio: true })` — store in a ref. This is needed for VoiceWaveform AnalyserNode. + - NOTE: useMicVAD manages its own media stream internally, but VoiceWaveform needs a separate reference to the stream for the AnalyserNode. Request the stream in the `start()` function and store in a ref. Stop tracks in `stop()`, and also after the auto-stop in `onSpeechEnd`, so the microphone is released on both paths. + +3. **ui/src/hooks/useVoiceMode.ts** — Create voice mode hook: + ```typescript + type VoiceMode = "text" | "voice_input" | "full_voice"; + interface UseVoiceModeReturn { + mode: VoiceMode; + setMode: (next: VoiceMode) => Promise<void>; + isLoading: boolean; + } + export function useVoiceMode(): UseVoiceModeReturn + ``` + Implementation: + - On mount, GET /api/nexus/settings with credentials: "include" + - Extract `voiceMode` from response, default to "text" + - `setMode(next)`: optimistically update local state, then PATCH /api/nexus/settings with `{ voiceMode: next }` + - Use useState for mode and isLoading + - Wrap fetch in try/catch; on error, revert to previous mode + + + cd /opt/nexus/.claude/worktrees/agent-a009558f && test -f ui/src/lib/encodeWav.ts && test -f ui/src/hooks/useVadRecorder.ts && test -f ui/src/hooks/useVoiceMode.ts && grep -q "encodeWav" ui/src/lib/encodeWav.ts && grep -q "useVadRecorder" ui/src/hooks/useVadRecorder.ts && grep -q "useVoiceMode" ui/src/hooks/useVoiceMode.ts && grep -q "useMicVAD" ui/src/hooks/useVadRecorder.ts && grep -q "api/transcribe" ui/src/hooks/useVadRecorder.ts && grep -q "api/nexus/settings" ui/src/hooks/useVoiceMode.ts && echo "PASS" || echo "FAIL" + + + - grep "export function encodeWav" ui/src/lib/encodeWav.ts returns match + - grep "export function useVadRecorder" ui/src/hooks/useVadRecorder.ts returns match + - grep "export function useVoiceMode" ui/src/hooks/useVoiceMode.ts returns match + - grep
"useMicVAD" ui/src/hooks/useVadRecorder.ts returns match + - grep "startOnLoad.*false" ui/src/hooks/useVadRecorder.ts returns match + - grep "baseAssetPath" ui/src/hooks/useVadRecorder.ts returns match with "/" + - grep "api/transcribe" ui/src/hooks/useVadRecorder.ts returns match + - grep "api/nexus/settings" ui/src/hooks/useVoiceMode.ts returns match + - grep "encodeWav" ui/src/hooks/useVadRecorder.ts returns match (imports it) + - grep "RIFF" ui/src/lib/encodeWav.ts returns match (WAV header) + + encodeWav utility produces valid WAV blobs. useVadRecorder wraps useMicVAD with auto-stop + transcription. useVoiceMode reads/writes voiceMode from nexus-settings API. + + + + Task 2: Create VoiceWaveform canvas component and VoiceMicButton + + ui/src/components/VoiceWaveform.tsx, + ui/src/components/VoiceMicButton.tsx + + + ui/src/hooks/useVadRecorder.ts, + ui/src/lib/encodeWav.ts, + ui/src/components/VoiceRecordButton.tsx + + +1. **ui/src/components/VoiceWaveform.tsx** — Canvas-based amplitude visualization: + ```typescript + interface VoiceWaveformProps { + stream: MediaStream | null; + active: boolean; // controls animation loop + } + export function VoiceWaveform({ stream, active }: VoiceWaveformProps) + ``` + Implementation: + - Use a `<canvas>` element, width=80, height=32 (h-8 per UI spec), className="inline-block" + - On mount (when stream is truthy and active is true): + a. Create AudioContext (lazily — only create once, store in ref) + b. If AudioContext is suspended, call `audioCtx.resume()` + c. Create MediaStreamSource from stream + d. Create AnalyserNode with fftSize=64 (gives 32 frequency bins) + e. Connect source -> analyser + f.
Start requestAnimationFrame loop: + - Call `analyser.getByteFrequencyData(dataArray)` into Uint8Array(32) + - Clear canvas + - Draw 20 bars (skip every other bin for cleaner look): each bar width=2px, gap=2px + - Bar height = (dataArray[i*2] / 255) * canvasHeight, minimum 2px. NOTE: 20 bars at stride 2 reach index 38, past the end of the 32-entry array, so clamp the index with `Math.min(i*2, dataArray.length - 1)` (an out-of-range read yields undefined and a NaN bar height) + - Bar color: use CSS variable --primary via getComputedStyle + g. Store animationFrame id in ref for cleanup + - On cleanup or when active becomes false: cancelAnimationFrame, disconnect source + - Do NOT close AudioContext on cleanup (reuse across start/stop cycles) + +2. **ui/src/components/VoiceMicButton.tsx** — VAD-powered mic button: + ```typescript + interface VoiceMicButtonProps { + onTranscript: (text: string) => void; + disabled?: boolean; + } + export function VoiceMicButton({ onTranscript, disabled }: VoiceMicButtonProps) + ``` + Implementation: + - Call `useVadRecorder({ onTranscript })` to get `{ state, start, stop, mediaStream }` + - Three visual states per UI spec: + a. **idle** (state === "idle"): Render Button with ghost variant, size="icon", h-8 w-8. Content: `<Mic />`. aria-label="Start voice input". onClick calls start(). + b. **recording** (state === "recording"): Render Button with ghost variant, size="icon", h-8 w-8, with `ring-2 ring-primary` classes. Content: `<VoiceWaveform stream={mediaStream} active />`. aria-label="Recording — speak now". onClick calls stop(). + c. **processing** (state === "processing"): Render Button disabled, ghost variant, size="icon", h-8 w-8. Content: `<Loader2 className="animate-spin" />`. aria-label="Transcribing...".
+ - Import Mic, Loader2 from lucide-react + - Import Button from @/components/ui/button + - Import VoiceWaveform from ./VoiceWaveform + - Import useVadRecorder from ../hooks/useVadRecorder + - When disabled prop is true, render idle state with disabled attribute + + + cd /opt/nexus/.claude/worktrees/agent-a009558f && test -f ui/src/components/VoiceWaveform.tsx && test -f ui/src/components/VoiceMicButton.tsx && grep -q "VoiceWaveform" ui/src/components/VoiceWaveform.tsx && grep -q "VoiceMicButton" ui/src/components/VoiceMicButton.tsx && grep -q "canvas" ui/src/components/VoiceWaveform.tsx && grep -q "useVadRecorder" ui/src/components/VoiceMicButton.tsx && grep -q "Mic" ui/src/components/VoiceMicButton.tsx && grep -q "Loader2" ui/src/components/VoiceMicButton.tsx && grep -q "ring-2 ring-primary" ui/src/components/VoiceMicButton.tsx && echo "PASS" || echo "FAIL" + + + - grep "export function VoiceWaveform" ui/src/components/VoiceWaveform.tsx returns match + - grep "export function VoiceMicButton" ui/src/components/VoiceMicButton.tsx returns match + - grep "canvas" ui/src/components/VoiceWaveform.tsx returns match + - grep "AnalyserNode\|createAnalyser\|analyser" ui/src/components/VoiceWaveform.tsx returns match + - grep "requestAnimationFrame" ui/src/components/VoiceWaveform.tsx returns match + - grep "getByteFrequencyData" ui/src/components/VoiceWaveform.tsx returns match + - grep "useVadRecorder" ui/src/components/VoiceMicButton.tsx returns match + - grep 'aria-label="Start voice input"' ui/src/components/VoiceMicButton.tsx returns match + - grep 'aria-label="Recording' ui/src/components/VoiceMicButton.tsx returns match + - grep 'aria-label="Transcribing' ui/src/components/VoiceMicButton.tsx returns match + - grep "ring-2 ring-primary" ui/src/components/VoiceMicButton.tsx returns match + - grep "Loader2.*animate-spin" ui/src/components/VoiceMicButton.tsx returns match + + VoiceWaveform renders 20 animated bars from Web Audio API AnalyserNode on a 80x32 canvas. 
VoiceMicButton shows idle/recording/processing states with correct icons, aria-labels, and ring styling. + + + + + +- All 5 files exist and export their named functions +- useVadRecorder uses useMicVAD with startOnLoad: false and baseAssetPath: "/" +- VoiceMicButton has three distinct visual states with correct aria-labels +- VoiceWaveform uses canvas + AnalyserNode pattern +- encodeWav produces Blob with type audio/wav +- useVoiceMode reads/writes via /api/nexus/settings + + + +Core voice recording pipeline complete: user clicks mic -> VAD listens -> waveform animates -> silence detected -> audio encoded to WAV -> POSTed to /api/transcribe -> transcript returned. Voice mode readable/writable from nexus-settings. + + + +After completion, create `.planning/phases/37-web-chat-voice-ui/37-02-SUMMARY.md` + diff --git a/.planning/phases/37-web-chat-voice-ui/37-03-PLAN.md b/.planning/phases/37-web-chat-voice-ui/37-03-PLAN.md new file mode 100644 index 00000000..db6aa573 --- /dev/null +++ b/.planning/phases/37-web-chat-voice-ui/37-03-PLAN.md @@ -0,0 +1,286 @@ +--- +phase: 37-web-chat-voice-ui +plan: 03 +type: execute +wave: 2 +depends_on: ["37-01"] +files_modified: + - ui/src/components/ChatVoicePlayer.tsx + - ui/src/components/ChatVoiceBadge.tsx + - ui/src/components/VoiceModeToggle.tsx +autonomous: true +requirements: + - WCHAT-04 + - WCHAT-05 + - WCHAT-06 + +must_haves: + truths: + - "ChatVoicePlayer renders inline audio player with play/pause controls" + - "ChatVoicePlayer auto-plays when autoPlay setting is true" + - "ChatVoiceBadge shows 'Voice' badge on voice messages" + - "ChatVoiceBadge has collapsible full markdown section for voice_full messages" + - "VoiceModeToggle renders three pills: Text / Voice In / Full Voice" + - "VoiceModeToggle persists selection via useVoiceMode hook" + - "Auto-play preference stored in localStorage under nexus:voice:autoplay" + artifacts: + - path: "ui/src/components/ChatVoicePlayer.tsx" + provides: "Inline audio player for 
synthesized voice responses" + exports: ["ChatVoicePlayer"] + - path: "ui/src/components/ChatVoiceBadge.tsx" + provides: "Voice badge + collapsible markdown on agent messages" + exports: ["ChatVoiceBadge"] + - path: "ui/src/components/VoiceModeToggle.tsx" + provides: "Three-state pill toggle for voice mode" + exports: ["VoiceModeToggle"] + key_links: + - from: "ui/src/components/ChatVoicePlayer.tsx" + to: "/api/synthesize" + via: "fetch POST to get audio blob" + pattern: "fetch.*api/synthesize" + - from: "ui/src/components/ChatVoiceBadge.tsx" + to: "shadcn Collapsible" + via: "Collapsible/CollapsibleContent/CollapsibleTrigger" + pattern: "Collapsible" + - from: "ui/src/components/VoiceModeToggle.tsx" + to: "ui/src/hooks/useVoiceMode.ts" + via: "useVoiceMode() hook" + pattern: "useVoiceMode" +--- + + +Build the voice output and mode selection components: ChatVoicePlayer for inline audio playback, ChatVoiceBadge for voice message display, and VoiceModeToggle for switching between text/voice_input/full_voice modes. + +Purpose: These components handle the output side of voice I/O (playing synthesized responses, showing voice badges on messages) and the mode selector that controls the entire voice behavior. 
+ +Output: 3 new component files — ChatVoicePlayer, ChatVoiceBadge, VoiceModeToggle + + + +@$HOME/.claude/get-shit-done/workflows/execute-plan.md +@$HOME/.claude/get-shit-done/templates/summary.md + + + +@.planning/phases/37-web-chat-voice-ui/37-RESEARCH.md + + + +``` +POST /api/synthesize + Body: { text: string, voiceId?: string } + Response: audio/wav binary buffer +``` + + +```typescript +type VoiceMode = "text" | "voice_input" | "full_voice"; +export function useVoiceMode(): { + mode: VoiceMode; + setMode: (next: VoiceMode) => Promise<void>; + isLoading: boolean; +} +``` + + +``` +messageType: "voice_input" → user sent via voice, agent replied with text +messageType: "voice_full" → user sent via voice, agent replied with SPOKEN + DETAILED format +``` + + +``` +SPOKEN: +DETAILED: +``` + + +```typescript +import { Badge } from "@/components/ui/badge"; +import { Collapsible, CollapsibleContent, CollapsibleTrigger } from "@/components/ui/collapsible"; +import { Button } from "@/components/ui/button"; +``` + + + + + + + Task 1: Create ChatVoicePlayer and ChatVoiceBadge components + + ui/src/components/ChatVoicePlayer.tsx, + ui/src/components/ChatVoiceBadge.tsx + + + ui/src/components/ChatMessage.tsx, + ui/src/components/ChatMarkdownMessage.tsx + + +1.
**ui/src/components/ChatVoicePlayer.tsx** — Inline audio player for voice responses: + ```typescript + interface ChatVoicePlayerProps { + text: string; // The spoken text to synthesize + autoPlay?: boolean; // Whether to auto-play on mount + } + export function ChatVoicePlayer({ text, autoPlay = false }: ChatVoicePlayerProps) + ``` + Implementation: + - State: `status: "idle" | "loading" | "playing" | "paused"`, `audioUrl: string | null` + - On mount (or when text changes): POST /api/synthesize with `{ text }`, credentials: "include" + - Set status to "loading" + - Get response as blob: `const blob = await res.blob()` + - Create object URL: `const url = URL.createObjectURL(blob)` + - Store url in state, set status to "idle" + - Create `<audio>` element ref. Set src to audioUrl when available. + - If autoPlay is true AND audioUrl is set, call `audioRef.current.play()`, set status to "playing" + - Audio event listeners: + - `onEnded`: set status to "idle", revoke blob URL via `URL.revokeObjectURL(audioUrl)` (NOTE: revoking here leaves the idle-state play button pointing at a dead URL, so replay fails; if replay is wanted, revoke only on unmount or when text changes) + - `onPause`: set status to "paused" + - `onPlay`: set status to "playing" + - Render: + - loading: `<Loader2 />` spinner with "Loading audio..." text + - idle/paused: `<Button>` with `<Play />` icon. onClick: `audioRef.current.play()` + - playing: `<Button>` with `<Pause />` icon. onClick: `audioRef.current.pause()` + - Hidden `<audio>` element with aria-label="Voice response" + - Import Play, Pause, Loader2 from lucide-react + - Cleanup: revoke any blob URL on unmount + +2. **ui/src/components/ChatVoiceBadge.tsx** — Voice badge + collapsible markdown: + ```typescript + interface ChatVoiceBadgeProps { + content: string; + messageType: string; // "voice_input" | "voice_full" + autoPlayVoice?: boolean; + } + export function ChatVoiceBadge({ content, messageType, autoPlayVoice = false }: ChatVoiceBadgeProps) + ``` + Implementation: + - Parse content for SPOKEN/DETAILED sections: + ```typescript + const spokenMatch = content.match(/SPOKEN:\s*([\s\S]*?)(?=\nDETAILED:|$)/); + const spokenText = spokenMatch?.[1]?.trim() ??
content; + const detailedMatch = content.match(/DETAILED:\s*([\s\S]*)/); + ``` + - Render: + a. `Voice` + b. `{spokenText}` + c. If messageType === "voice_full": + - `` + - If detailedMatch exists, render shadcn Collapsible: + ``` + + + {open ? "Hide full response" : "Show full response"} + + + + + + ``` + - For voice_input messageType: just show badge + spoken text, no player, no collapsible + - Import ChatVoicePlayer from ./ChatVoicePlayer + - Import ChatMarkdownMessage from ./ChatMarkdownMessage (already exists in codebase) + - Import Badge from @/components/ui/badge + - Import Collapsible, CollapsibleContent, CollapsibleTrigger from @/components/ui/collapsible + + + cd /opt/nexus/.claude/worktrees/agent-a009558f && test -f ui/src/components/ChatVoicePlayer.tsx && test -f ui/src/components/ChatVoiceBadge.tsx && grep -q "export function ChatVoicePlayer" ui/src/components/ChatVoicePlayer.tsx && grep -q "export function ChatVoiceBadge" ui/src/components/ChatVoiceBadge.tsx && grep -q "api/synthesize" ui/src/components/ChatVoicePlayer.tsx && grep -q "URL.createObjectURL" ui/src/components/ChatVoicePlayer.tsx && grep -q "URL.revokeObjectURL" ui/src/components/ChatVoicePlayer.tsx && grep -q "Collapsible" ui/src/components/ChatVoiceBadge.tsx && grep -q "Show full response" ui/src/components/ChatVoiceBadge.tsx && grep -q "Badge" ui/src/components/ChatVoiceBadge.tsx && grep -q "SPOKEN:" ui/src/components/ChatVoiceBadge.tsx && echo "PASS" || echo "FAIL" + + + - grep "export function ChatVoicePlayer" ui/src/components/ChatVoicePlayer.tsx returns match + - grep "export function ChatVoiceBadge" ui/src/components/ChatVoiceBadge.tsx returns match + - grep "api/synthesize" ui/src/components/ChatVoicePlayer.tsx returns match + - grep "URL.createObjectURL" ui/src/components/ChatVoicePlayer.tsx returns match + - grep "URL.revokeObjectURL" ui/src/components/ChatVoicePlayer.tsx returns match + - grep "audio" ui/src/components/ChatVoicePlayer.tsx returns match (native audio element) + 
- grep "aria-label.*Voice response" ui/src/components/ChatVoicePlayer.tsx returns match + - grep "Collapsible" ui/src/components/ChatVoiceBadge.tsx returns match + - grep "Show full response" ui/src/components/ChatVoiceBadge.tsx returns match + - grep "Hide full response" ui/src/components/ChatVoiceBadge.tsx returns match + - grep "Badge.*Voice" ui/src/components/ChatVoiceBadge.tsx returns match + - grep "SPOKEN:" ui/src/components/ChatVoiceBadge.tsx returns match + - grep "ChatVoicePlayer" ui/src/components/ChatVoiceBadge.tsx returns match (imports it) + + ChatVoicePlayer synthesizes and plays audio with play/pause controls, auto-play support, and proper blob URL cleanup. ChatVoiceBadge shows Voice badge, spoken text, optional audio player, and collapsible full markdown for voice_full messages. + + + + Task 2: Create VoiceModeToggle three-pill component + + ui/src/components/VoiceModeToggle.tsx + + + ui/src/hooks/useVoiceMode.ts + + +**ui/src/components/VoiceModeToggle.tsx** — Three-state pill toggle: +```typescript +export function VoiceModeToggle() +``` +Implementation: +- Call `useVoiceMode()` to get `{ mode, setMode, isLoading }` +- Read auto-play preference from localStorage: `localStorage.getItem("nexus:voice:autoplay") === "true"` +- Provide `autoPlay` state + toggle in the component for WCHAT-06 (auto-play configurable) +- Render a ``: + - Three pill buttons, each a ``: + - "Text" → `setMode("text")` + - "Voice In" → `setMode("voice_input")` + - "Full Voice" → `setMode("full_voice")` + - Active pill: `bg-primary text-primary-foreground` classes + - Inactive pills: `bg-muted text-muted-foreground` classes + - All pills: `rounded-full px-3 py-1 text-xs font-medium transition-colors` + - Disabled when isLoading +- Below the pills (only when mode is "full_voice"), render auto-play toggle: + ``` + + { + setAutoPlay(e.target.checked); + localStorage.setItem("nexus:voice:autoplay", String(e.target.checked)); + }} + /> + Auto-play voice responses + + ``` +- Export 
autoPlay state for consumers: expose via a separate export or make VoiceModeToggle accept `onAutoPlayChange` callback. Better: just read localStorage directly in ChatVoiceBadge — keep it simple. +- The auto-play checkbox label text per UI spec: "Auto-play voice responses" + + + cd /opt/nexus/.claude/worktrees/agent-a009558f && test -f ui/src/components/VoiceModeToggle.tsx && grep -q "export function VoiceModeToggle" ui/src/components/VoiceModeToggle.tsx && grep -q "useVoiceMode" ui/src/components/VoiceModeToggle.tsx && grep -q "Voice In" ui/src/components/VoiceModeToggle.tsx && grep -q "Full Voice" ui/src/components/VoiceModeToggle.tsx && grep -q "Text" ui/src/components/VoiceModeToggle.tsx && grep -q "bg-primary" ui/src/components/VoiceModeToggle.tsx && grep -q 'role="group"' ui/src/components/VoiceModeToggle.tsx && grep -q "nexus:voice:autoplay" ui/src/components/VoiceModeToggle.tsx && grep -q "Auto-play voice responses" ui/src/components/VoiceModeToggle.tsx && echo "PASS" || echo "FAIL" + + + - grep "export function VoiceModeToggle" ui/src/components/VoiceModeToggle.tsx returns match + - grep "useVoiceMode" ui/src/components/VoiceModeToggle.tsx returns match + - grep "Text" ui/src/components/VoiceModeToggle.tsx returns match (first pill) + - grep "Voice In" ui/src/components/VoiceModeToggle.tsx returns match (second pill) + - grep "Full Voice" ui/src/components/VoiceModeToggle.tsx returns match (third pill) + - grep "bg-primary text-primary-foreground" ui/src/components/VoiceModeToggle.tsx returns match (active state) + - grep "bg-muted text-muted-foreground" ui/src/components/VoiceModeToggle.tsx returns match (inactive state) + - grep 'role="group"' ui/src/components/VoiceModeToggle.tsx returns match + - grep 'aria-label="Voice mode"' ui/src/components/VoiceModeToggle.tsx returns match + - grep "nexus:voice:autoplay" ui/src/components/VoiceModeToggle.tsx returns match (localStorage key) + - grep "Auto-play voice responses" ui/src/components/VoiceModeToggle.tsx 
returns match + + VoiceModeToggle renders three pills with active/inactive styling. Clicking a pill persists voiceMode to nexus-settings. Auto-play checkbox appears in full_voice mode and persists to localStorage. + + + + + +- ChatVoicePlayer POSTs to /api/synthesize and plays via native audio element +- ChatVoicePlayer revokes blob URLs on cleanup (no memory leaks) +- ChatVoiceBadge parses SPOKEN/DETAILED content format +- ChatVoiceBadge shows collapsible section only for voice_full +- VoiceModeToggle has three pills with correct labels and accessibility +- Auto-play preference persisted in localStorage under nexus:voice:autoplay + + + +All three output-side voice components complete: ChatVoicePlayer plays synthesized audio with controls, ChatVoiceBadge renders voice badges with collapsible detail, VoiceModeToggle switches between text/voice_input/full_voice with persistence. + + + +After completion, create `.planning/phases/37-web-chat-voice-ui/37-03-SUMMARY.md` + diff --git a/.planning/phases/37-web-chat-voice-ui/37-04-PLAN.md b/.planning/phases/37-web-chat-voice-ui/37-04-PLAN.md new file mode 100644 index 00000000..1e9675f3 --- /dev/null +++ b/.planning/phases/37-web-chat-voice-ui/37-04-PLAN.md @@ -0,0 +1,377 @@ +--- +phase: 37-web-chat-voice-ui +plan: 04 +type: execute +wave: 3 +depends_on: ["37-02", "37-03"] +files_modified: + - ui/src/components/ChatInput.tsx + - ui/src/components/ChatMessage.tsx + - ui/src/components/ChatPanel.tsx + - ui/src/hooks/useStreamingChat.ts + - ui/src/api/chat.ts +autonomous: false +requirements: + - WCHAT-01 + - WCHAT-02 + - WCHAT-03 + - WCHAT-04 + - WCHAT-05 + - WCHAT-06 + +must_haves: + truths: + - "ChatInput renders VoiceMicButton instead of VoiceRecordButton" + - "ChatInput shows VoiceModeToggle above the input form whenever enableVoiceInput is true (in every voice mode, including 'text', so the user can always switch modes)" + - "ChatMessage renders ChatVoiceBadge for voice_input and voice_full messageTypes" + - "ChatMessage renders ChatVoicePlayer for voice_full messages with auto-play from localStorage" + - 
"useStreamingChat.startStream accepts voiceMode parameter" + - "chatApi.postMessageAndStream sends voiceMode in request body" + - "ChatPanel passes voiceMode from useVoiceMode to startStream calls" + - "Full voice flow works end-to-end: mic -> VAD -> transcribe -> stream -> voice badge + audio" + artifacts: + - path: "ui/src/components/ChatInput.tsx" + provides: "Voice-enhanced chat input with VoiceMicButton + VoiceModeToggle" + contains: "VoiceMicButton" + - path: "ui/src/components/ChatMessage.tsx" + provides: "Voice-aware message rendering" + contains: "ChatVoiceBadge" + - path: "ui/src/hooks/useStreamingChat.ts" + provides: "Voice-mode-aware streaming" + contains: "voiceMode" + - path: "ui/src/api/chat.ts" + provides: "Voice mode in stream request" + contains: "voiceMode" + key_links: + - from: "ui/src/components/ChatPanel.tsx" + to: "ui/src/hooks/useVoiceMode.ts" + via: "useVoiceMode() hook call" + pattern: "useVoiceMode" + - from: "ui/src/components/ChatPanel.tsx" + to: "ui/src/hooks/useStreamingChat.ts" + via: "startStream(content, agentId, voiceMode)" + pattern: "startStream.*voiceMode" + - from: "ui/src/hooks/useStreamingChat.ts" + to: "ui/src/api/chat.ts" + via: "chatApi.postMessageAndStream with voiceMode" + pattern: "voiceMode" + - from: "ui/src/components/ChatInput.tsx" + to: "ui/src/components/VoiceMicButton.tsx" + via: "VoiceMicButton replaces VoiceRecordButton" + pattern: "VoiceMicButton" + - from: "ui/src/components/ChatMessage.tsx" + to: "ui/src/components/ChatVoiceBadge.tsx" + via: "ChatVoiceBadge for voice messageTypes" + pattern: "ChatVoiceBadge" +--- + + +Wire all voice components into the existing chat system: replace VoiceRecordButton with VoiceMicButton in ChatInput, add VoiceModeToggle, render ChatVoiceBadge in ChatMessage, and thread voiceMode through useStreamingChat and chatApi. + +Purpose: This is the integration plan that connects all Phase 37 components to the existing chat UI. 
Without this wiring, the components exist but aren't used. + +Output: 5 modified files connecting voice I/O to the chat system + + + +@$HOME/.claude/get-shit-done/workflows/execute-plan.md +@$HOME/.claude/get-shit-done/templates/summary.md + + + +@.planning/phases/37-web-chat-voice-ui/37-RESEARCH.md +@.planning/phases/37-web-chat-voice-ui/37-02-SUMMARY.md +@.planning/phases/37-web-chat-voice-ui/37-03-SUMMARY.md + + + +```typescript +interface VoiceMicButtonProps { + onTranscript: (text: string) => void; + disabled?: boolean; +} +export function VoiceMicButton({ onTranscript, disabled }: VoiceMicButtonProps) +``` + + +```typescript +export function VoiceModeToggle() +// Uses useVoiceMode() internally; renders three pills + auto-play checkbox +``` + + +```typescript +interface ChatVoiceBadgeProps { + content: string; + messageType: string; // "voice_input" | "voice_full" + autoPlayVoice?: boolean; +} +export function ChatVoiceBadge({ content, messageType, autoPlayVoice }: ChatVoiceBadgeProps) +``` + + +```typescript +type VoiceMode = "text" | "voice_input" | "full_voice"; +export function useVoiceMode(): { mode: VoiceMode; setMode: (v: VoiceMode) => Promise; isLoading: boolean } +``` + + +```typescript +interface ChatInputProps { + onSend: (content: string) => void; + isSubmitting?: boolean; + disabled?: boolean; + placeholder?: string; + agents?: Agent[]; + agentsLoading?: boolean; + onFilesPicked?: (files: File[]) => void; + pendingFiles?: PendingFile[]; + onRemoveFile?: (id: string) => void; + enableVoiceInput?: boolean; // Controls VoiceRecordButton visibility +} +``` + + +```typescript +export function useStreamingChat(conversationId: string | null) { + // startStream(userMessage: string, agentId?: string) — needs voiceMode param added + return { streamingContent, isStreaming, startStream, stop }; +} +``` + + +```typescript +async postMessageAndStream( + conversationId: string, + data: { content: string; agentId?: string }, // needs voiceMode added + callbacks: 
{ onToken, onDone, onError }, + signal?: AbortSignal, +): Promise +``` + + +```typescript +// handleSend calls startStream(content, resolvedAgentId) — needs voiceMode +``` + + + + + + + Task 1: Thread voiceMode through chatApi and useStreamingChat + + ui/src/api/chat.ts, + ui/src/hooks/useStreamingChat.ts + + + ui/src/api/chat.ts, + ui/src/hooks/useStreamingChat.ts + + +1. **ui/src/api/chat.ts** — Extend postMessageAndStream data parameter: + - Change the `data` parameter type from `{ content: string; agentId?: string }` to `{ content: string; agentId?: string; voiceMode?: string }` + - The body is already sent as `JSON.stringify(data)`, so voiceMode will be included automatically when present + - No other changes needed — the server's chat.ts stream handler already reads voiceMode from req.body (added in Plan 01) + +2. **ui/src/hooks/useStreamingChat.ts** — Extend startStream to accept voiceMode: + - Change `startStream` signature from `(userMessage: string, agentId?: string)` to `(userMessage: string, agentId?: string, voiceMode?: string)` + - Pass voiceMode through to chatApi.postMessageAndStream: + ```typescript + chatApi.postMessageAndStream( + conversationId, + { content: userMessage, agentId, voiceMode }, + { onToken, onDone, onError }, + abort.signal, + ); + ``` + - Do NOT add `voiceMode` to the useCallback dependency array: it is a parameter of the callback, not a value captured from the enclosing scope, so the existing dependencies stay unchanged + + + cd /opt/nexus/.claude/worktrees/agent-a009558f && grep -q "voiceMode" ui/src/api/chat.ts && grep -q "voiceMode" ui/src/hooks/useStreamingChat.ts && grep "postMessageAndStream" ui/src/api/chat.ts | grep -q "voiceMode" && echo "PASS" || echo "FAIL" + + + - grep "voiceMode" ui/src/api/chat.ts returns match in postMessageAndStream data type + - grep "voiceMode" ui/src/hooks/useStreamingChat.ts returns match in startStream signature + - grep "voiceMode" ui/src/hooks/useStreamingChat.ts returns match in postMessageAndStream call + + chatApi.postMessageAndStream sends voiceMode 
in request body. useStreamingChat.startStream accepts and forwards voiceMode parameter. + + + + Task 2: Wire VoiceMicButton + VoiceModeToggle into ChatInput, ChatVoiceBadge into ChatMessage, voiceMode into ChatPanel + + ui/src/components/ChatInput.tsx, + ui/src/components/ChatMessage.tsx, + ui/src/components/ChatPanel.tsx + + + ui/src/components/ChatInput.tsx, + ui/src/components/ChatMessage.tsx, + ui/src/components/ChatPanel.tsx, + ui/src/components/VoiceMicButton.tsx, + ui/src/components/VoiceModeToggle.tsx, + ui/src/components/ChatVoiceBadge.tsx, + ui/src/hooks/useVoiceMode.ts, + ui/src/hooks/useStreamingChat.ts + + +1. **ui/src/components/ChatInput.tsx** — Replace VoiceRecordButton with VoiceMicButton: + - Remove import of VoiceRecordButton: `import { VoiceRecordButton } from "./VoiceRecordButton";` + - Add import: `import { VoiceMicButton } from "./VoiceMicButton";` + - Add import: `import { VoiceModeToggle } from "./VoiceModeToggle";` + - In the JSX, find the VoiceRecordButton rendering block: + ```tsx + {enableVoiceInput && ( + + )} + ``` + Replace with: + ```tsx + {enableVoiceInput && ( + + )} + ``` + - Add VoiceModeToggle ABOVE the input form, inside the ChatInput component, after ChatFileDropZone opens but before the form: + ```tsx + + {enableVoiceInput && } + + ``` + This places the toggle above the input row so it doesn't crowd the send button area. + +2. **ui/src/components/ChatMessage.tsx** — Add ChatVoiceBadge for voice messages: + - Add imports: + ```typescript + import { ChatVoiceBadge } from "./ChatVoiceBadge"; + ``` + - In the messageType dispatch block (after the existing spec_card, handoff, task_created, status_update checks), add: + ```typescript + if (messageType === "voice_input" || messageType === "voice_full") { + const autoPlay = typeof window !== "undefined" + ? 
localStorage.getItem("nexus:voice:autoplay") === "true" + : false; + return ( + + {agentName && ( + + )} + + {isStreaming && } + onRetry(id) : undefined} + onBookmark={id && onBookmark ? () => onBookmark(id) : undefined} + isBookmarked={isBookmarked} + /> + + ); + } + ``` + - Place this BEFORE the general "fall through to default system message rendering" comment, but AFTER the status_update check + +3. **ui/src/components/ChatPanel.tsx** — Connect useVoiceMode and pass voiceMode to startStream: + - Add imports: + ```typescript + import { useVoiceMode } from "../hooks/useVoiceMode"; + ``` + - Inside the ChatPanel component, call the hook: + ```typescript + const { mode: voiceMode } = useVoiceMode(); + ``` + - Find ALL calls to `startStream(content, agentId)` (there are ~5 of them per the read_first scan). Add voiceMode as third argument: + ```typescript + startStream(content, resolvedAgentId ?? undefined, voiceMode); + ``` + - The five locations are approximately: + - In handleSend: `startStream(content, resolvedAgentId ?? undefined)` (two calls — online and offline branches) + - In handleEdit callback: `startStream(newContent, activeAgentId ?? undefined)` + - In handleRetry: `startStream(newContent, activeAgentId ?? undefined)` + - In retry from error: `startStream(lastUserContent, activeAgentId ?? undefined)` + - Update each to include `voiceMode` as the third argument + - Do NOT gate `enableVoiceInput` on voiceMode: keep passing `enableVoiceInput={true}` to ChatInput (or leave the existing expression in place if it already evaluates to true). The VoiceModeToggle handles mode selection independently. The mic button should always be visible when voice is available. + - Check how enableVoiceInput is currently set in ChatPanel. If it's hardcoded or conditional, ensure it stays true so VoiceMicButton renders. + + + cd /opt/nexus/.claude/worktrees/agent-a009558f && grep -q "VoiceMicButton" ui/src/components/ChatInput.tsx && grep -q "VoiceModeToggle" ui/src/components/ChatInput.tsx && ! 
grep -q "VoiceRecordButton" ui/src/components/ChatInput.tsx && grep -q "ChatVoiceBadge" ui/src/components/ChatMessage.tsx && grep -q "voice_input\|voice_full" ui/src/components/ChatMessage.tsx && grep -q "useVoiceMode" ui/src/components/ChatPanel.tsx && grep -q "voiceMode" ui/src/components/ChatPanel.tsx && echo "PASS" || echo "FAIL" + + + - grep "VoiceMicButton" ui/src/components/ChatInput.tsx returns match + - grep "VoiceModeToggle" ui/src/components/ChatInput.tsx returns match + - grep "VoiceRecordButton" ui/src/components/ChatInput.tsx returns NO match (replaced) + - grep "ChatVoiceBadge" ui/src/components/ChatMessage.tsx returns match + - grep "voice_input" ui/src/components/ChatMessage.tsx returns match + - grep "voice_full" ui/src/components/ChatMessage.tsx returns match + - grep "nexus:voice:autoplay" ui/src/components/ChatMessage.tsx returns match (reads localStorage) + - grep "useVoiceMode" ui/src/components/ChatPanel.tsx returns match + - grep "voiceMode" ui/src/components/ChatPanel.tsx appears in startStream calls + - grep "startStream.*voiceMode" ui/src/components/ChatPanel.tsx returns match + + ChatInput uses VoiceMicButton (VAD-powered) instead of VoiceRecordButton. VoiceModeToggle shown above input. ChatMessage renders ChatVoiceBadge for voice messages. ChatPanel passes voiceMode to all startStream calls. + + + + Task 3: Verify voice flow end-to-end + ui/src/components/ChatPanel.tsx + ui/src/components/ChatPanel.tsx + +Human verification of the complete voice I/O integration. No code changes in this task — all implementation was done in Tasks 1-2. This checkpoint confirms the full voice flow works visually and functionally in the browser. 
+ +What was built across all Phase 37 plans: +- VoiceMicButton with VAD auto-stop replacing VoiceRecordButton +- VoiceWaveform canvas animation during recording +- VoiceModeToggle (Text / Voice In / Full Voice) with nexus-settings persistence +- ChatVoiceBadge with collapsible full markdown for voice_full messages +- ChatVoicePlayer with play/pause and auto-play from localStorage +- voiceMode threaded through ChatPanel -> useStreamingChat -> chatApi -> server chat.ts + + + cd /opt/nexus/.claude/worktrees/agent-a009558f && grep -q "VoiceMicButton" ui/src/components/ChatInput.tsx && grep -q "ChatVoiceBadge" ui/src/components/ChatMessage.tsx && grep -q "voiceMode" ui/src/components/ChatPanel.tsx && echo "PASS" || echo "FAIL" + + + - VoiceModeToggle visible above chat input with three pills + - Mic button starts recording with waveform animation + - Recording auto-stops on silence detection + - Transcribed text populates input field + - Voice badge appears on agent responses in voice modes + - Audio player works for voice_full messages + - Auto-play toggle persists across page refresh + + End-to-end voice flow verified by human: recording, VAD auto-stop, transcription, voice mode toggle, voice badge, audio playback, and auto-play setting all working correctly. + + + + + +- VoiceRecordButton fully replaced by VoiceMicButton in ChatInput +- VoiceModeToggle renders above chat input +- ChatMessage dispatches voice_input and voice_full to ChatVoiceBadge +- voiceMode flows: ChatPanel -> useStreamingChat -> chatApi -> server chat.ts +- Auto-play reads from localStorage +- TypeScript compiles without errors: pnpm --filter @paperclipai/ui typecheck + + + +Complete voice I/O working in browser chat: VAD-powered recording with waveform, auto-stop on silence, voice mode toggle with persistence, voice badge on responses, inline audio player with auto-play setting. User can have a full voice conversation with their agent. 
+ + + +After completion, create `.planning/phases/37-web-chat-voice-ui/37-04-SUMMARY.md` +
{spokenText}