300 lines
14 KiB
Markdown
300 lines
14 KiB
Markdown
---
|
|
phase: 37-web-chat-voice-ui
|
|
plan: 02
|
|
type: execute
|
|
wave: 2
|
|
depends_on: ["37-01"]
|
|
files_modified:
|
|
- ui/src/lib/encodeWav.ts
|
|
- ui/src/hooks/useVadRecorder.ts
|
|
- ui/src/hooks/useVoiceMode.ts
|
|
- ui/src/components/VoiceWaveform.tsx
|
|
- ui/src/components/VoiceMicButton.tsx
|
|
autonomous: true
|
|
requirements:
|
|
- WCHAT-01
|
|
- WCHAT-02
|
|
- WCHAT-03
|
|
- WCHAT-05
|
|
|
|
must_haves:
|
|
truths:
|
|
- "VoiceMicButton renders three visual states: idle (Mic icon), recording (waveform + ring), processing (Loader2 spinner)"
|
|
- "Recording auto-stops on silence via VAD onSpeechEnd callback"
|
|
- "VoiceWaveform renders animated canvas bars during recording"
|
|
- "useVadRecorder converts Float32Array to WAV and POSTs to /api/transcribe"
|
|
- "useVoiceMode reads voiceMode from GET /api/nexus/settings and writes via PATCH"
|
|
artifacts:
|
|
- path: "ui/src/lib/encodeWav.ts"
|
|
provides: "Float32Array to WAV blob encoder"
|
|
exports: ["encodeWav"]
|
|
- path: "ui/src/hooks/useVadRecorder.ts"
|
|
provides: "VAD recording hook with auto-stop"
|
|
exports: ["useVadRecorder"]
|
|
- path: "ui/src/hooks/useVoiceMode.ts"
|
|
provides: "Voice mode state from nexus-settings"
|
|
exports: ["useVoiceMode"]
|
|
- path: "ui/src/components/VoiceWaveform.tsx"
|
|
provides: "Canvas amplitude visualization"
|
|
exports: ["VoiceWaveform"]
|
|
- path: "ui/src/components/VoiceMicButton.tsx"
|
|
provides: "VAD-powered mic button with three states"
|
|
exports: ["VoiceMicButton"]
|
|
key_links:
|
|
- from: "ui/src/components/VoiceMicButton.tsx"
|
|
to: "ui/src/hooks/useVadRecorder.ts"
|
|
via: "useVadRecorder() hook call"
|
|
pattern: "useVadRecorder"
|
|
- from: "ui/src/hooks/useVadRecorder.ts"
|
|
to: "ui/src/lib/encodeWav.ts"
|
|
via: "encodeWav(audio) in onSpeechEnd"
|
|
pattern: "encodeWav"
|
|
- from: "ui/src/hooks/useVadRecorder.ts"
|
|
to: "/api/transcribe"
|
|
via: "fetch POST with FormData"
|
|
pattern: "fetch.*api/transcribe"
|
|
- from: "ui/src/components/VoiceMicButton.tsx"
|
|
to: "ui/src/components/VoiceWaveform.tsx"
|
|
via: "VoiceWaveform rendered inside recording state"
|
|
pattern: "<VoiceWaveform"
|
|
---
|
|
|
|
<objective>
|
|
Build the core voice recording components: WAV encoder, VAD recorder hook, voice mode hook, waveform visualization, and the VoiceMicButton that ties them together.
|
|
|
|
Purpose: These are the foundational building blocks that replace VoiceRecordButton with VAD-powered auto-stop recording and real-time waveform visualization.
|
|
|
|
Output: 5 new files — encodeWav utility, useVadRecorder hook, useVoiceMode hook, VoiceWaveform component, VoiceMicButton component
|
|
</objective>
|
|
|
|
<execution_context>
|
|
@$HOME/.claude/get-shit-done/workflows/execute-plan.md
|
|
@$HOME/.claude/get-shit-done/templates/summary.md
|
|
</execution_context>
|
|
|
|
<context>
|
|
@.planning/phases/37-web-chat-voice-ui/37-RESEARCH.md
|
|
@.planning/phases/37-web-chat-voice-ui/37-01-SUMMARY.md
|
|
|
|
<interfaces>
|
|
<!-- From 37-RESEARCH.md — useMicVAD API -->
|
|
```typescript
|
|
// @ricky0123/vad-react useMicVAD hook
|
|
import { useMicVAD } from "@ricky0123/vad-react";
|
|
const vad = useMicVAD({
|
|
startOnLoad: false,
|
|
baseAssetPath: "/",
|
|
onnxWASMBasePath: "/",
|
|
positiveSpeechThreshold: 0.8,
|
|
negativeSpeechThreshold: 0.65,
|
|
redemptionFrames: 8,
|
|
minSpeechFrames: 5,
|
|
onSpeechStart: () => void,
|
|
onSpeechEnd: (audio: Float32Array) => void,
|
|
});
|
|
// Returns: { listening, loading, errored, userSpeaking, start, pause }
|
|
```
|
|
|
|
<!-- From existing VoiceRecordButton (parent branch) — replacement target -->
|
|
```typescript
|
|
interface VoiceRecordButtonProps {
|
|
onTranscription: (text: string) => void;
|
|
disabled?: boolean;
|
|
}
|
|
```
|
|
|
|
<!-- nexus-settings API (from Plan 01) -->
|
|
```
|
|
GET /api/nexus/settings → { mode, voiceEnabled, voiceMode, ... }
|
|
PATCH /api/nexus/settings → accepts partial, returns updated
|
|
```
|
|
</interfaces>
|
|
</context>
|
|
|
|
<tasks>
|
|
|
|
<task type="auto">
|
|
<name>Task 1: Create encodeWav utility and useVadRecorder + useVoiceMode hooks</name>
|
|
<files>
|
|
ui/src/lib/encodeWav.ts,
|
|
ui/src/hooks/useVadRecorder.ts,
|
|
ui/src/hooks/useVoiceMode.ts
|
|
</files>
|
|
<read_first>
|
|
ui/src/hooks/useStreamingChat.ts,
|
|
ui/src/api/chat.ts,
|
|
ui/src/components/VoiceRecordButton.tsx
|
|
</read_first>
|
|
<action>
|
|
1. **ui/src/lib/encodeWav.ts** — Create WAV encoder function:
|
|
```typescript
|
|
export function encodeWav(samples: Float32Array, sampleRate = 16000): Blob
|
|
```
|
|
- Standard 44-byte WAV header (RIFF/WAVE/fmt/data chunks)
|
|
- PCM format (1), mono (1 channel), 16-bit depth
|
|
- Clamp samples to [-1, 1] range before int16 conversion
|
|
- Return Blob with type "audio/wav"
|
|
- Helper: `function writeString(view: DataView, offset: number, str: string)`
|
|
|
|
2. **ui/src/hooks/useVadRecorder.ts** — Create VAD recording hook:
|
|
```typescript
|
|
interface UseVadRecorderOptions {
|
|
onTranscript: (text: string) => void;
|
|
}
|
|
interface UseVadRecorderReturn {
|
|
state: "idle" | "recording" | "processing";
|
|
start: () => void;
|
|
stop: () => void;
|
|
mediaStream: MediaStream | null; // exposed for VoiceWaveform AnalyserNode
|
|
}
|
|
export function useVadRecorder(opts: UseVadRecorderOptions): UseVadRecorderReturn
|
|
```
|
|
Implementation:
|
|
- Use `useMicVAD` from `@ricky0123/vad-react` with `startOnLoad: false`
|
|
- Set `baseAssetPath: "/"` and `onnxWASMBasePath: "/"` (serve from ui/public/)
|
|
- Set `positiveSpeechThreshold: 0.8`, `minSpeechFrames: 5` (300ms minimum to filter noise)
|
|
- In `onSpeechEnd(audio: Float32Array)`:
|
|
a. Call `vad.pause()` to stop listening
|
|
b. Set state to "processing"
|
|
c. Call `encodeWav(audio)` to get WAV blob
|
|
d. Create FormData, append blob as "audio" field with filename "recording.wav"
|
|
e. POST to `/api/transcribe` with `credentials: "include"`
|
|
f. Parse response as `{ text: string }`
|
|
g. Trim the text; if the trimmed text has length >= 2, call `opts.onTranscript` with the trimmed text
|
|
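h. Set state back to "idle" — do this in a `finally` block so a failed or rejected request does not leave the hook stuck in "processing"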
|
|
- `start()`: calls `vad.start()`, sets state to "recording"
|
|
- `stop()`: calls `vad.pause()`, sets state to "idle"
|
|
- Expose `mediaStream` from `navigator.mediaDevices.getUserMedia({ audio: true })` — store it in state (optionally mirrored in a ref for cleanup) so consumers re-render once the stream resolves. This is needed for VoiceWaveform AnalyserNode.
|
|
- NOTE: useMicVAD manages its own media stream internally, but VoiceWaveform needs a separate reference to the stream for the AnalyserNode. Request the stream in the `start()` function. Stop its tracks in `stop()` and also in `onSpeechEnd` (the auto-stop path); otherwise the microphone stays open after silence is detected.
|
|
|
|
3. **ui/src/hooks/useVoiceMode.ts** — Create voice mode hook:
|
|
```typescript
|
|
type VoiceMode = "text" | "voice_input" | "full_voice";
|
|
interface UseVoiceModeReturn {
|
|
mode: VoiceMode;
|
|
setMode: (next: VoiceMode) => Promise<void>;
|
|
isLoading: boolean;
|
|
}
|
|
export function useVoiceMode(): UseVoiceModeReturn
|
|
```
|
|
Implementation:
|
|
- On mount, GET /api/nexus/settings with credentials: "include"
|
|
- Extract `voiceMode` from response, default to "text"
|
|
- `setMode(next)`: optimistically update local state, then PATCH /api/nexus/settings with `{ voiceMode: next }`
|
|
- Use useState for mode and isLoading
|
|
- Wrap fetch in try/catch; on error — including a non-OK response, which `fetch` does not throw for, so check `res.ok` — revert to previous mode
|
|
</action>
|
|
<verify>
|
|
<automated>cd /opt/nexus/.claude/worktrees/agent-a009558f && test -f ui/src/lib/encodeWav.ts && test -f ui/src/hooks/useVadRecorder.ts && test -f ui/src/hooks/useVoiceMode.ts && grep -q "encodeWav" ui/src/lib/encodeWav.ts && grep -q "useVadRecorder" ui/src/hooks/useVadRecorder.ts && grep -q "useVoiceMode" ui/src/hooks/useVoiceMode.ts && grep -q "useMicVAD" ui/src/hooks/useVadRecorder.ts && grep -q "api/transcribe" ui/src/hooks/useVadRecorder.ts && grep -q "api/nexus/settings" ui/src/hooks/useVoiceMode.ts && echo "PASS" || echo "FAIL"</automated>
|
|
</verify>
|
|
<acceptance_criteria>
|
|
- grep "export function encodeWav" ui/src/lib/encodeWav.ts returns match
|
|
- grep "export function useVadRecorder" ui/src/hooks/useVadRecorder.ts returns match
|
|
- grep "export function useVoiceMode" ui/src/hooks/useVoiceMode.ts returns match
|
|
- grep "useMicVAD" ui/src/hooks/useVadRecorder.ts returns match
|
|
- grep "startOnLoad.*false" ui/src/hooks/useVadRecorder.ts returns match
|
|
- grep "baseAssetPath" ui/src/hooks/useVadRecorder.ts returns match with "/"
|
|
- grep "api/transcribe" ui/src/hooks/useVadRecorder.ts returns match
|
|
- grep "api/nexus/settings" ui/src/hooks/useVoiceMode.ts returns match
|
|
- grep "encodeWav" ui/src/hooks/useVadRecorder.ts returns match (imports it)
|
|
- grep "RIFF" ui/src/lib/encodeWav.ts returns match (WAV header)
|
|
</acceptance_criteria>
|
|
<done>encodeWav utility produces valid WAV blobs. useVadRecorder wraps useMicVAD with auto-stop + transcription. useVoiceMode reads/writes voiceMode from nexus-settings API.</done>
|
|
</task>
|
|
|
|
<task type="auto">
|
|
<name>Task 2: Create VoiceWaveform canvas component and VoiceMicButton</name>
|
|
<files>
|
|
ui/src/components/VoiceWaveform.tsx,
|
|
ui/src/components/VoiceMicButton.tsx
|
|
</files>
|
|
<read_first>
|
|
ui/src/hooks/useVadRecorder.ts,
|
|
ui/src/lib/encodeWav.ts,
|
|
ui/src/components/VoiceRecordButton.tsx
|
|
</read_first>
|
|
<action>
|
|
1. **ui/src/components/VoiceWaveform.tsx** — Canvas-based amplitude visualization:
|
|
```typescript
|
|
interface VoiceWaveformProps {
|
|
stream: MediaStream | null;
|
|
active: boolean; // controls animation loop
|
|
}
|
|
export function VoiceWaveform({ stream, active }: VoiceWaveformProps)
|
|
```
|
|
Implementation:
|
|
- Use a `<canvas>` element, width=80, height=32 (h-8 per UI spec), className="inline-block"
|
|
- On mount (when stream is truthy and active is true):
|
|
a. Create AudioContext (lazily — only create once, store in ref)
|
|
b. If AudioContext is suspended, call `audioCtx.resume()`
|
|
c. Create MediaStreamSource from stream
|
|
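d. Create AnalyserNode with fftSize=128 (gives 64 frequency bins — 20 bars at every other bin read up to index 38, which would be out of range with only 32 bins)
|
|
e. Connect source -> analyser
|
|
f. Start requestAnimationFrame loop:
|
|
- Call `analyser.getByteFrequencyData(dataArray)` into a Uint8Array(analyser.frequencyBinCount) (64 entries)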
|
|
- Clear canvas
|
|
- Draw 20 bars (skip every other bin for cleaner look): each bar width=2px, gap=2px
|
|
- Bar height = (dataArray[i*2] / 255) * canvasHeight, minimum 2px
|
|
- Bar color: use CSS variable --primary via getComputedStyle
|
|
g. Store animationFrame id in ref for cleanup
|
|
- On cleanup or when active becomes false: cancelAnimationFrame, disconnect source
|
|
- Do NOT close AudioContext on cleanup (reuse across start/stop cycles)
|
|
|
|
2. **ui/src/components/VoiceMicButton.tsx** — VAD-powered mic button:
|
|
```typescript
|
|
interface VoiceMicButtonProps {
|
|
onTranscript: (text: string) => void;
|
|
disabled?: boolean;
|
|
}
|
|
export function VoiceMicButton({ onTranscript, disabled }: VoiceMicButtonProps)
|
|
```
|
|
Implementation:
|
|
- Call `useVadRecorder({ onTranscript })` to get `{ state, start, stop, mediaStream }`
|
|
- Three visual states per UI spec:
|
|
a. **idle** (state === "idle"): Render Button with ghost variant, size="icon", h-8 w-8. Content: `<Mic className="h-4 w-4" />`. aria-label="Start voice input". onClick calls start().
|
|
b. **recording** (state === "recording"): Render Button with ghost variant, size="icon", h-8 w-8, with `ring-2 ring-primary` classes. Content: `<VoiceWaveform stream={mediaStream} active={true} />`. aria-label="Recording — speak now". onClick calls stop().
|
|
c. **processing** (state === "processing"): Render Button disabled, ghost variant, size="icon", h-8 w-8. Content: `<Loader2 className="h-4 w-4 animate-spin" />`. aria-label="Transcribing...".
|
|
- Import Mic, Loader2 from lucide-react
|
|
- Import Button from @/components/ui/button
|
|
- Import VoiceWaveform from ./VoiceWaveform
|
|
- Import useVadRecorder from ../hooks/useVadRecorder
|
|
- When disabled prop is true, render idle state with disabled attribute
|
|
</action>
|
|
<verify>
|
|
<automated>cd /opt/nexus/.claude/worktrees/agent-a009558f && test -f ui/src/components/VoiceWaveform.tsx && test -f ui/src/components/VoiceMicButton.tsx && grep -q "VoiceWaveform" ui/src/components/VoiceWaveform.tsx && grep -q "VoiceMicButton" ui/src/components/VoiceMicButton.tsx && grep -q "canvas" ui/src/components/VoiceWaveform.tsx && grep -q "useVadRecorder" ui/src/components/VoiceMicButton.tsx && grep -q "Mic" ui/src/components/VoiceMicButton.tsx && grep -q "Loader2" ui/src/components/VoiceMicButton.tsx && grep -q "ring-2 ring-primary" ui/src/components/VoiceMicButton.tsx && echo "PASS" || echo "FAIL"</automated>
|
|
</verify>
|
|
<acceptance_criteria>
|
|
- grep "export function VoiceWaveform" ui/src/components/VoiceWaveform.tsx returns match
|
|
- grep "export function VoiceMicButton" ui/src/components/VoiceMicButton.tsx returns match
|
|
- grep "canvas" ui/src/components/VoiceWaveform.tsx returns match
|
|
- grep "AnalyserNode\|createAnalyser\|analyser" ui/src/components/VoiceWaveform.tsx returns match
|
|
- grep "requestAnimationFrame" ui/src/components/VoiceWaveform.tsx returns match
|
|
- grep "getByteFrequencyData" ui/src/components/VoiceWaveform.tsx returns match
|
|
- grep "useVadRecorder" ui/src/components/VoiceMicButton.tsx returns match
|
|
- grep 'aria-label="Start voice input"' ui/src/components/VoiceMicButton.tsx returns match
|
|
- grep 'aria-label="Recording' ui/src/components/VoiceMicButton.tsx returns match
|
|
- grep 'aria-label="Transcribing' ui/src/components/VoiceMicButton.tsx returns match
|
|
- grep "ring-2 ring-primary" ui/src/components/VoiceMicButton.tsx returns match
|
|
- grep "Loader2.*animate-spin" ui/src/components/VoiceMicButton.tsx returns match
|
|
</acceptance_criteria>
|
|
<done>VoiceWaveform renders 20 animated bars from Web Audio API AnalyserNode on a 80x32 canvas. VoiceMicButton shows idle/recording/processing states with correct icons, aria-labels, and ring styling.</done>
|
|
</task>
|
|
|
|
</tasks>
|
|
|
|
<verification>
|
|
- All 5 files exist and export their named functions
|
|
- useVadRecorder uses useMicVAD with startOnLoad: false and baseAssetPath: "/"
|
|
- VoiceMicButton has three distinct visual states with correct aria-labels
|
|
- VoiceWaveform uses canvas + AnalyserNode pattern
|
|
- encodeWav produces Blob with type audio/wav
|
|
- useVoiceMode reads/writes via /api/nexus/settings
|
|
</verification>
|
|
|
|
<success_criteria>
|
|
Core voice recording pipeline complete: user clicks mic -> VAD listens -> waveform animates -> silence detected -> audio encoded to WAV -> POSTed to /api/transcribe -> transcript returned. Voice mode readable/writable from nexus-settings.
|
|
</success_criteria>
|
|
|
|
<output>
|
|
After completion, create `.planning/phases/37-web-chat-voice-ui/37-02-SUMMARY.md`
|
|
</output>
|