docs(37): create 4 plans in 3 waves for web chat voice UI
This commit is contained in:
parent
fdc956c6a6
commit
1eaa6c4b3e
4 changed files with 1260 additions and 0 deletions
297
.planning/phases/37-web-chat-voice-ui/37-01-PLAN.md
Normal file
297
.planning/phases/37-web-chat-voice-ui/37-01-PLAN.md
Normal file
|
|
@ -0,0 +1,297 @@
|
|||
---
|
||||
phase: 37-web-chat-voice-ui
|
||||
plan: 01
|
||||
type: execute
|
||||
wave: 1
|
||||
depends_on: []
|
||||
files_modified:
|
||||
- server/src/services/nexus-settings.ts
|
||||
- server/src/routes/nexus-settings.ts
|
||||
- server/src/routes/voice.ts
|
||||
- server/src/routes/chat.ts
|
||||
- server/src/app.ts
|
||||
- packages/shared/src/types/chat.ts
|
||||
- packages/shared/src/validators/chat.ts
|
||||
- ui/vite.config.ts
|
||||
- ui/package.json
|
||||
- ui/public/vad.worklet.bundle.min.js
|
||||
- ui/public/silero_vad_legacy.onnx
|
||||
- ui/public/silero_vad_v5.onnx
|
||||
autonomous: true
|
||||
requirements:
|
||||
- WCHAT-01
|
||||
- WCHAT-02
|
||||
- WCHAT-04
|
||||
|
||||
must_haves:
|
||||
truths:
|
||||
- "POST /api/transcribe accepts audio upload and returns { text }"
|
||||
- "POST /api/synthesize accepts { text } and returns audio/wav"
|
||||
- "GET /api/nexus/settings returns voiceMode field"
|
||||
- "PATCH /api/nexus/settings accepts voiceMode update"
|
||||
- "Chat stream endpoint accepts voiceMode in request body"
|
||||
- "SharedArrayBuffer is available in browser (COOP/COEP headers set)"
|
||||
- "VAD ONNX model files are served from /vad.worklet.bundle.min.js, /silero_vad_legacy.onnx, /silero_vad_v5.onnx"
|
||||
artifacts:
|
||||
- path: "server/src/routes/voice.ts"
|
||||
provides: "POST /api/transcribe and POST /api/synthesize"
|
||||
exports: ["voiceRoutes"]
|
||||
- path: "server/src/routes/nexus-settings.ts"
|
||||
provides: "GET/PATCH /api/nexus/settings"
|
||||
exports: ["nexusSettingsRoutes"]
|
||||
- path: "server/src/services/nexus-settings.ts"
|
||||
provides: "nexusSettingsService with voiceMode field"
|
||||
exports: ["nexusSettingsService", "VoiceMode", "VOICE_MODES"]
|
||||
- path: "ui/public/vad.worklet.bundle.min.js"
|
||||
provides: "VAD AudioWorklet bundle"
|
||||
- path: "ui/public/silero_vad_legacy.onnx"
|
||||
provides: "Silero VAD legacy ONNX model"
|
||||
key_links:
|
||||
- from: "server/src/app.ts"
|
||||
to: "server/src/routes/voice.ts"
|
||||
via: "api.use(voiceRoutes())"
|
||||
pattern: "voiceRoutes"
|
||||
- from: "server/src/app.ts"
|
||||
to: "server/src/routes/nexus-settings.ts"
|
||||
via: "api.use(nexusSettingsRoutes())"
|
||||
pattern: "nexusSettingsRoutes"
|
||||
- from: "server/src/routes/chat.ts"
|
||||
to: "voiceMode parameter"
|
||||
via: "req.body.voiceMode in stream handler"
|
||||
pattern: "voiceMode.*voice_input|voice_full"
|
||||
---
|
||||
|
||||
<objective>
|
||||
Establish all server-side prerequisites and browser infrastructure for voice I/O.
|
||||
|
||||
Purpose: Phase 36 Tasks 2-3 (nexus-settings voiceMode schema, voice HTTP routes, voiceMode wiring in chat.ts) are not present on this branch. This plan cherry-picks or re-implements those deliverables, adds COOP/COEP headers for SharedArrayBuffer, installs @ricky0123/vad-react, copies VAD ONNX assets to ui/public/, and configures Vite dev server headers.
|
||||
|
||||
Output: Working server endpoints (transcribe, synthesize, nexus-settings), COOP/COEP isolation, VAD assets ready in ui/public/
|
||||
</objective>
|
||||
|
||||
<execution_context>
|
||||
@$HOME/.claude/get-shit-done/workflows/execute-plan.md
|
||||
@$HOME/.claude/get-shit-done/templates/summary.md
|
||||
</execution_context>
|
||||
|
||||
<context>
|
||||
@.planning/phases/37-web-chat-voice-ui/37-RESEARCH.md
|
||||
|
||||
<interfaces>
|
||||
<!-- Phase 36 branch deliverables that must be present before Phase 37 UI work -->
|
||||
|
||||
From server/src/services/voice-pipeline.ts (ALREADY on this branch):
|
||||
```typescript
|
||||
// voicePipelineService() exposes transcribe(buffer, format) and synthesize(text, voiceId?)
|
||||
export function voicePipelineService(): { transcribe, synthesize, formatForVoice, transcodeToWav16k }
|
||||
```
|
||||
|
||||
From server/src/app.ts (parent branch — route mounting pattern):
|
||||
```typescript
|
||||
// Routes are mounted on an `api` Router via api.use(...)
|
||||
// Pattern: import { xyzRoutes } from "./routes/xyz.js"; then api.use(xyzRoutes());
|
||||
import { chatRoutes } from "./routes/chat.js";
|
||||
api.use(chatRoutes(db, storageService, config));
|
||||
```
|
||||
|
||||
From packages/shared/src/types/chat.ts (parent branch):
|
||||
```typescript
|
||||
export interface ChatMessage {
|
||||
id: string;
|
||||
conversationId: string;
|
||||
role: "user" | "assistant" | "system";
|
||||
content: string;
|
||||
messageType?: string | null;
|
||||
// ... other fields
|
||||
}
|
||||
```
|
||||
|
||||
From packages/shared/src/validators/chat.ts (parent branch):
|
||||
```typescript
|
||||
export const createMessageSchema = z.object({
|
||||
content: z.string().min(1),
|
||||
role: z.enum(["user", "assistant", "system"]).default("user"),
|
||||
agentId: z.string().uuid().optional(),
|
||||
// voiceMode NOT present on parent branch — must add
|
||||
});
|
||||
```
|
||||
</interfaces>
|
||||
</context>
|
||||
|
||||
<tasks>
|
||||
|
||||
<task type="auto">
|
||||
<name>Task 1: Cherry-pick Phase 36 server deliverables and add COOP/COEP headers</name>
|
||||
<files>
|
||||
server/src/services/nexus-settings.ts,
|
||||
server/src/routes/nexus-settings.ts,
|
||||
server/src/routes/voice.ts,
|
||||
server/src/routes/chat.ts,
|
||||
server/src/app.ts,
|
||||
packages/shared/src/types/chat.ts,
|
||||
packages/shared/src/validators/chat.ts
|
||||
</files>
|
||||
<read_first>
|
||||
server/src/services/nexus-settings.ts,
|
||||
server/src/services/voice-pipeline.ts,
|
||||
server/src/app.ts,
|
||||
server/src/routes/chat.ts,
|
||||
packages/shared/src/types/chat.ts,
|
||||
packages/shared/src/validators/chat.ts
|
||||
</read_first>
|
||||
<action>
|
||||
Cherry-pick or re-implement Phase 36 Tasks 2-3 deliverables. The commits on gsd/phase-36-voice-pipeline-foundation are:
|
||||
- d0d7a23a (nexus-settings voiceMode schema extension)
|
||||
- b964c0e4 (voiceMode in createMessageSchema + ChatMessage interface)
|
||||
- 11508547 (voice HTTP routes)
|
||||
- fd372eaf (voiceMode wiring in chat.ts + route mounting)
|
||||
|
||||
Try cherry-picking these 4 commits in order:
|
||||
```bash
|
||||
git cherry-pick d0d7a23a b964c0e4 11508547 fd372eaf
|
||||
```
|
||||
|
||||
If cherry-pick conflicts, re-implement manually:
|
||||
|
||||
1. **server/src/services/nexus-settings.ts** — Add VOICE_MODES and VoiceMode type:
|
||||
```typescript
|
||||
export const VOICE_MODES = ["text", "voice_input", "full_voice"] as const;
|
||||
export type VoiceMode = (typeof VOICE_MODES)[number];
|
||||
```
|
||||
Add `voiceMode: z.enum(VOICE_MODES).default("text")` to nexusSettingsSchema.
|
||||
Also add `telegramToken: z.string().optional()`, `piperBinaryPath: z.string().optional()`, and `whisperBinaryPath: z.string().optional()` to nexusSettingsSchema.
|
||||
|
||||
2. **server/src/routes/nexus-settings.ts** — Create new file:
|
||||
- GET /nexus/settings — returns nexusSettingsService().get()
|
||||
- PATCH /nexus/settings — calls nexusSettingsService().set(req.body), returns updated
|
||||
- Both routes call assertBoard(req) first
|
||||
- Import Router from express, assertBoard from ./authz.js, nexusSettingsService from ../services/nexus-settings.js
|
||||
|
||||
3. **server/src/routes/voice.ts** — Create new file:
|
||||
- POST /transcribe — accepts multipart audio upload via multer memoryStorage, calls voicePipelineService().transcribe(buffer, format), returns { text }
|
||||
- POST /synthesize — accepts JSON { text, voiceId? }, calls voicePipelineService().synthesize(text, voiceId), returns audio/wav buffer
|
||||
- Both routes call assertBoard(req)
|
||||
- Import multer, Router, assertBoard, voicePipelineService, MAX_ATTACHMENT_BYTES
|
||||
|
||||
4. **packages/shared/src/types/chat.ts** — Add `voiceMode?: string | null;` to ChatMessage interface if not present.
|
||||
|
||||
5. **packages/shared/src/validators/chat.ts** — Add `voiceMode: z.enum(["text", "voice_input", "full_voice"]).optional()` to createMessageSchema.
|
||||
|
||||
6. **server/src/routes/chat.ts** — In the stream POST handler, destructure `voiceMode` from req.body alongside content and agentId. When voiceMode is "full_voice", call voicePipelineService().formatForVoice(aiContent) to produce SPOKEN/DETAILED format. Set messageType on stored message: "voice_full" if voiceMode==="full_voice", "voice_input" if voiceMode==="voice_input", else null.
|
||||
|
||||
7. **server/src/app.ts** — Import and mount voiceRoutes and nexusSettingsRoutes:
|
||||
```typescript
|
||||
import { nexusSettingsRoutes } from "./routes/nexus-settings.js";
|
||||
import { voiceRoutes } from "./routes/voice.js";
|
||||
// In the api router setup:
|
||||
api.use(nexusSettingsRoutes());
|
||||
api.use(voiceRoutes());
|
||||
```
|
||||
|
||||
8. **COOP/COEP headers** — In server/src/app.ts, add middleware BEFORE static file serving and vite dev middleware:
|
||||
```typescript
|
||||
app.use((_req, res, next) => {
|
||||
res.setHeader("Cross-Origin-Opener-Policy", "same-origin");
|
||||
res.setHeader("Cross-Origin-Embedder-Policy", "require-corp");
|
||||
next();
|
||||
});
|
||||
```
|
||||
Place this before any `app.use(express.static(...))` or vite middleware attachment.
|
||||
</action>
|
||||
<verify>
|
||||
<automated>cd /opt/nexus/.claude/worktrees/agent-a009558f && grep -q "voiceRoutes" server/src/app.ts && grep -q "nexusSettingsRoutes" server/src/app.ts && grep -q "Cross-Origin-Opener-Policy" server/src/app.ts && grep -q "voiceMode" server/src/routes/chat.ts && grep -q "voice_full" server/src/routes/chat.ts && test -f server/src/routes/voice.ts && test -f server/src/routes/nexus-settings.ts && echo "PASS" || echo "FAIL"</automated>
|
||||
</verify>
|
||||
<acceptance_criteria>
|
||||
- grep "voiceRoutes" server/src/app.ts returns match
|
||||
- grep "nexusSettingsRoutes" server/src/app.ts returns match
|
||||
- grep "Cross-Origin-Opener-Policy" server/src/app.ts returns "same-origin"
|
||||
- grep "Cross-Origin-Embedder-Policy" server/src/app.ts returns "require-corp"
|
||||
- grep "voiceMode" server/src/routes/chat.ts returns match
|
||||
- grep "voice_full" server/src/routes/chat.ts returns match
|
||||
- server/src/routes/voice.ts exists with POST /transcribe and POST /synthesize
|
||||
- server/src/routes/nexus-settings.ts exists with GET and PATCH /nexus/settings
|
||||
- grep "VOICE_MODES" server/src/services/nexus-settings.ts returns match
|
||||
</acceptance_criteria>
|
||||
<done>Phase 36 server deliverables present on branch. COOP/COEP headers added. Voice routes mounted. Chat stream accepts voiceMode.</done>
|
||||
</task>
|
||||
|
||||
<task type="auto">
|
||||
<name>Task 2: Install VAD library, copy ONNX assets, configure Vite COOP/COEP headers</name>
|
||||
<files>
|
||||
ui/package.json,
|
||||
ui/public/vad.worklet.bundle.min.js,
|
||||
ui/public/silero_vad_legacy.onnx,
|
||||
ui/public/silero_vad_v5.onnx,
|
||||
ui/vite.config.ts
|
||||
</files>
|
||||
<read_first>
|
||||
ui/package.json,
|
||||
ui/vite.config.ts
|
||||
</read_first>
|
||||
<action>
|
||||
1. Install @ricky0123/vad-react in the ui package:
|
||||
```bash
|
||||
pnpm add @ricky0123/vad-react --filter @paperclipai/ui
|
||||
```
|
||||
|
||||
2. Copy VAD assets from node_modules to ui/public/ for same-origin serving (under COEP `require-corp`, assets loaded from a cross-origin CDN are blocked unless the CDN sends CORP/CORS headers):
|
||||
```bash
|
||||
cp node_modules/@ricky0123/vad-web/dist/vad.worklet.bundle.min.js ui/public/
|
||||
cp node_modules/@ricky0123/vad-web/dist/silero_vad_legacy.onnx ui/public/
|
||||
cp node_modules/@ricky0123/vad-web/dist/silero_vad_v5.onnx ui/public/
|
||||
```
|
||||
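If vad-web is in ui/node_modules/@ricky0123/vad-web/dist/, use that path instead. If neither path exists (pnpm does not hoist transitive dependencies by default), add `@ricky0123/vad-web` as a direct dependency of the ui package first, then copy from ui/node_modules/.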
|
||||
Verify all three files exist after copy.
|
||||
|
||||
3. Add a "copy-vad-assets" script to ui/package.json:
|
||||
```json
|
||||
"copy-vad-assets": "cp node_modules/@ricky0123/vad-web/dist/vad.worklet.bundle.min.js public/ && cp node_modules/@ricky0123/vad-web/dist/silero_vad_legacy.onnx public/ && cp node_modules/@ricky0123/vad-web/dist/silero_vad_v5.onnx public/"
|
||||
```
|
||||
|
||||
4. Update ui/vite.config.ts — add COOP/COEP headers to dev server config:
|
||||
```typescript
|
||||
server: {
|
||||
port: 5173,
|
||||
headers: {
|
||||
"Cross-Origin-Opener-Policy": "same-origin",
|
||||
"Cross-Origin-Embedder-Policy": "require-corp",
|
||||
},
|
||||
proxy: { ... }, // keep existing proxy config
|
||||
},
|
||||
```
|
||||
This ensures SharedArrayBuffer works in Vite dev mode too.
|
||||
</action>
|
||||
<verify>
|
||||
<automated>cd /opt/nexus/.claude/worktrees/agent-a009558f && test -f ui/public/vad.worklet.bundle.min.js && test -f ui/public/silero_vad_legacy.onnx && test -f ui/public/silero_vad_v5.onnx && grep -q "vad-react" ui/package.json && grep -q "Cross-Origin-Opener-Policy" ui/vite.config.ts && echo "PASS" || echo "FAIL"</automated>
|
||||
</verify>
|
||||
<acceptance_criteria>
|
||||
- ui/public/vad.worklet.bundle.min.js exists (non-zero size)
|
||||
- ui/public/silero_vad_legacy.onnx exists (non-zero size)
|
||||
- ui/public/silero_vad_v5.onnx exists (non-zero size)
|
||||
- grep "vad-react" ui/package.json returns match
|
||||
- grep "Cross-Origin-Opener-Policy" ui/vite.config.ts returns "same-origin"
|
||||
- grep "Cross-Origin-Embedder-Policy" ui/vite.config.ts returns "require-corp"
|
||||
- grep "copy-vad-assets" ui/package.json returns match
|
||||
</acceptance_criteria>
|
||||
<done>VAD library installed. ONNX model files and worklet bundle served from ui/public/. Vite dev server sends COOP/COEP headers. SharedArrayBuffer available in dev.</done>
|
||||
</task>
|
||||
|
||||
</tasks>
|
||||
|
||||
<verification>
|
||||
- server/src/routes/voice.ts exists with transcribe and synthesize endpoints
|
||||
- server/src/routes/nexus-settings.ts exists with GET/PATCH
|
||||
- server/src/app.ts mounts both route sets and has COOP/COEP middleware
|
||||
- server/src/routes/chat.ts handles voiceMode in stream handler
|
||||
- ui/public/ has all 3 VAD asset files
|
||||
- ui/vite.config.ts has COOP/COEP headers
|
||||
- @ricky0123/vad-react in ui/package.json dependencies
|
||||
</verification>
|
||||
|
||||
<success_criteria>
|
||||
All Phase 36 server deliverables present. COOP/COEP headers set on both Express and Vite dev server. VAD assets served from same-origin. Foundation ready for frontend voice components.
|
||||
</success_criteria>
|
||||
|
||||
<output>
|
||||
After completion, create `.planning/phases/37-web-chat-voice-ui/37-01-SUMMARY.md`
|
||||
</output>
|
||||
300
.planning/phases/37-web-chat-voice-ui/37-02-PLAN.md
Normal file
300
.planning/phases/37-web-chat-voice-ui/37-02-PLAN.md
Normal file
|
|
@ -0,0 +1,300 @@
|
|||
---
|
||||
phase: 37-web-chat-voice-ui
|
||||
plan: 02
|
||||
type: execute
|
||||
wave: 2
|
||||
depends_on: ["37-01"]
|
||||
files_modified:
|
||||
- ui/src/lib/encodeWav.ts
|
||||
- ui/src/hooks/useVadRecorder.ts
|
||||
- ui/src/hooks/useVoiceMode.ts
|
||||
- ui/src/components/VoiceWaveform.tsx
|
||||
- ui/src/components/VoiceMicButton.tsx
|
||||
autonomous: true
|
||||
requirements:
|
||||
- WCHAT-01
|
||||
- WCHAT-02
|
||||
- WCHAT-03
|
||||
- WCHAT-05
|
||||
|
||||
must_haves:
|
||||
truths:
|
||||
- "VoiceMicButton renders three visual states: idle (Mic icon), recording (waveform + ring), processing (Loader2 spinner)"
|
||||
- "Recording auto-stops on silence via VAD onSpeechEnd callback"
|
||||
- "VoiceWaveform renders animated canvas bars during recording"
|
||||
- "useVadRecorder converts Float32Array to WAV and POSTs to /api/transcribe"
|
||||
- "useVoiceMode reads voiceMode from GET /api/nexus/settings and writes via PATCH"
|
||||
artifacts:
|
||||
- path: "ui/src/lib/encodeWav.ts"
|
||||
provides: "Float32Array to WAV blob encoder"
|
||||
exports: ["encodeWav"]
|
||||
- path: "ui/src/hooks/useVadRecorder.ts"
|
||||
provides: "VAD recording hook with auto-stop"
|
||||
exports: ["useVadRecorder"]
|
||||
- path: "ui/src/hooks/useVoiceMode.ts"
|
||||
provides: "Voice mode state from nexus-settings"
|
||||
exports: ["useVoiceMode"]
|
||||
- path: "ui/src/components/VoiceWaveform.tsx"
|
||||
provides: "Canvas amplitude visualization"
|
||||
exports: ["VoiceWaveform"]
|
||||
- path: "ui/src/components/VoiceMicButton.tsx"
|
||||
provides: "VAD-powered mic button with three states"
|
||||
exports: ["VoiceMicButton"]
|
||||
key_links:
|
||||
- from: "ui/src/components/VoiceMicButton.tsx"
|
||||
to: "ui/src/hooks/useVadRecorder.ts"
|
||||
via: "useVadRecorder() hook call"
|
||||
pattern: "useVadRecorder"
|
||||
- from: "ui/src/hooks/useVadRecorder.ts"
|
||||
to: "ui/src/lib/encodeWav.ts"
|
||||
via: "encodeWav(audio) in onSpeechEnd"
|
||||
pattern: "encodeWav"
|
||||
- from: "ui/src/hooks/useVadRecorder.ts"
|
||||
to: "/api/transcribe"
|
||||
via: "fetch POST with FormData"
|
||||
pattern: "fetch.*api/transcribe"
|
||||
- from: "ui/src/components/VoiceMicButton.tsx"
|
||||
to: "ui/src/components/VoiceWaveform.tsx"
|
||||
via: "VoiceWaveform rendered inside recording state"
|
||||
pattern: "<VoiceWaveform"
|
||||
---
|
||||
|
||||
<objective>
|
||||
Build the core voice recording components: WAV encoder, VAD recorder hook, voice mode hook, waveform visualization, and the VoiceMicButton that ties them together.
|
||||
|
||||
Purpose: These are the foundational building blocks that replace VoiceRecordButton with VAD-powered auto-stop recording and real-time waveform visualization.
|
||||
|
||||
Output: 5 new files — encodeWav utility, useVadRecorder hook, useVoiceMode hook, VoiceWaveform component, VoiceMicButton component
|
||||
</objective>
|
||||
|
||||
<execution_context>
|
||||
@$HOME/.claude/get-shit-done/workflows/execute-plan.md
|
||||
@$HOME/.claude/get-shit-done/templates/summary.md
|
||||
</execution_context>
|
||||
|
||||
<context>
|
||||
@.planning/phases/37-web-chat-voice-ui/37-RESEARCH.md
|
||||
@.planning/phases/37-web-chat-voice-ui/37-01-SUMMARY.md
|
||||
|
||||
<interfaces>
|
||||
<!-- From 37-RESEARCH.md — useMicVAD API -->
|
||||
```typescript
|
||||
// @ricky0123/vad-react useMicVAD hook
|
||||
import { useMicVAD } from "@ricky0123/vad-react";
|
||||
const vad = useMicVAD({
|
||||
startOnLoad: false,
|
||||
baseAssetPath: "/",
|
||||
onnxWASMBasePath: "/",
|
||||
positiveSpeechThreshold: 0.8,
|
||||
negativeSpeechThreshold: 0.65,
|
||||
redemptionFrames: 8,
|
||||
minSpeechFrames: 5,
|
||||
onSpeechStart: () => void,
|
||||
onSpeechEnd: (audio: Float32Array) => void,
|
||||
});
|
||||
// Returns: { listening, loading, errored, userSpeaking, start, pause }
|
||||
```
|
||||
|
||||
<!-- From existing VoiceRecordButton (parent branch) — replacement target -->
|
||||
```typescript
|
||||
interface VoiceRecordButtonProps {
|
||||
onTranscription: (text: string) => void;
|
||||
disabled?: boolean;
|
||||
}
|
||||
```
|
||||
|
||||
<!-- nexus-settings API (from Plan 01) -->
|
||||
```
|
||||
GET /api/nexus/settings → { mode, voiceEnabled, voiceMode, ... }
|
||||
PATCH /api/nexus/settings → accepts partial, returns updated
|
||||
```
|
||||
</interfaces>
|
||||
</context>
|
||||
|
||||
<tasks>
|
||||
|
||||
<task type="auto">
|
||||
<name>Task 1: Create encodeWav utility and useVadRecorder + useVoiceMode hooks</name>
|
||||
<files>
|
||||
ui/src/lib/encodeWav.ts,
|
||||
ui/src/hooks/useVadRecorder.ts,
|
||||
ui/src/hooks/useVoiceMode.ts
|
||||
</files>
|
||||
<read_first>
|
||||
ui/src/hooks/useStreamingChat.ts,
|
||||
ui/src/api/chat.ts,
|
||||
ui/src/components/VoiceRecordButton.tsx
|
||||
</read_first>
|
||||
<action>
|
||||
1. **ui/src/lib/encodeWav.ts** — Create WAV encoder function:
|
||||
```typescript
|
||||
export function encodeWav(samples: Float32Array, sampleRate = 16000): Blob
|
||||
```
|
||||
- Standard 44-byte WAV header (RIFF/WAVE/fmt/data chunks)
|
||||
- PCM format (1), mono (1 channel), 16-bit depth
|
||||
- Clamp samples to [-1, 1] range before int16 conversion
|
||||
- Return Blob with type "audio/wav"
|
||||
- Helper: `function writeString(view: DataView, offset: number, str: string)`
|
||||
|
||||
2. **ui/src/hooks/useVadRecorder.ts** — Create VAD recording hook:
|
||||
```typescript
|
||||
interface UseVadRecorderOptions {
|
||||
onTranscript: (text: string) => void;
|
||||
}
|
||||
interface UseVadRecorderReturn {
|
||||
state: "idle" | "recording" | "processing";
|
||||
start: () => void;
|
||||
stop: () => void;
|
||||
mediaStream: MediaStream | null; // exposed for VoiceWaveform AnalyserNode
|
||||
}
|
||||
export function useVadRecorder(opts: UseVadRecorderOptions): UseVadRecorderReturn
|
||||
```
|
||||
Implementation:
|
||||
- Use `useMicVAD` from `@ricky0123/vad-react` with `startOnLoad: false`
|
||||
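- Set `baseAssetPath: "/"` and `onnxWASMBasePath: "/"` (serve from ui/public/). NOTE: `onnxWASMBasePath` is where the onnxruntime-web `.wasm` runtime files are loaded from; Plan 01 copies only the VAD worklet and the two Silero models, so confirm the onnxruntime-web `.wasm` files are also available at `/` (copy them from onnxruntime-web's dist folder if not).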
|
||||
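- Set `positiveSpeechThreshold: 0.8`, `negativeSpeechThreshold: 0.65`, `redemptionFrames: 8`, `minSpeechFrames: 5` (300ms minimum to filter noise) — the same values listed in the useMicVAD interface block above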
|
||||
- In `onSpeechEnd(audio: Float32Array)`:
|
||||
a. Call `vad.pause()` to stop listening
|
||||
b. Set state to "processing"
|
||||
c. Call `encodeWav(audio)` to get WAV blob
|
||||
d. Create FormData, append blob as "audio" field with filename "recording.wav"
|
||||
e. POST to `/api/transcribe` with `credentials: "include"`
|
||||
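f. If the response is OK (`res.ok` — fetch does not reject on HTTP error statuses), parse it as `{ text: string }`; otherwise treat it as a failed transcription and skip step g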
|
||||
g. Trim the text; if the trimmed text is at least 2 characters long, call `opts.onTranscript(trimmedText)`
|
||||
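h. Set state back to "idle" in a `finally` block, so a failed or rejected transcription request does not leave the button stuck in "processing"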
|
||||
- `start()`: calls `vad.start()`, sets state to "recording"
|
||||
- `stop()`: calls `vad.pause()`, sets state to "idle"
|
||||
- Expose `mediaStream` from `navigator.mediaDevices.getUserMedia({ audio: true })` — store in a ref. This is needed for VoiceWaveform AnalyserNode.
|
||||
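- NOTE: useMicVAD manages its own media stream internally, but VoiceWaveform needs a separate reference to the stream for the AnalyserNode. Request the stream in the `start()` function and store in a ref. Stop its tracks in `stop()` and also in `onSpeechEnd` (the VAD auto-stop path does not go through `stop()`), so the microphone is released in both cases.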
|
||||
|
||||
3. **ui/src/hooks/useVoiceMode.ts** — Create voice mode hook:
|
||||
```typescript
|
||||
type VoiceMode = "text" | "voice_input" | "full_voice";
|
||||
interface UseVoiceModeReturn {
|
||||
mode: VoiceMode;
|
||||
setMode: (next: VoiceMode) => Promise<void>;
|
||||
isLoading: boolean;
|
||||
}
|
||||
export function useVoiceMode(): UseVoiceModeReturn
|
||||
```
|
||||
Implementation:
|
||||
- On mount, GET /api/nexus/settings with credentials: "include"
|
||||
- Extract `voiceMode` from response, default to "text"
|
||||
- `setMode(next)`: optimistically update local state, then PATCH /api/nexus/settings with `{ voiceMode: next }`
|
||||
- Use useState for mode and isLoading
|
||||
- Wrap fetch in try/catch; on a thrown error or a non-OK response (`!res.ok` — fetch does not reject on HTTP error statuses), revert to previous mode
|
||||
</action>
|
||||
<verify>
|
||||
<automated>cd /opt/nexus/.claude/worktrees/agent-a009558f && test -f ui/src/lib/encodeWav.ts && test -f ui/src/hooks/useVadRecorder.ts && test -f ui/src/hooks/useVoiceMode.ts && grep -q "encodeWav" ui/src/lib/encodeWav.ts && grep -q "useVadRecorder" ui/src/hooks/useVadRecorder.ts && grep -q "useVoiceMode" ui/src/hooks/useVoiceMode.ts && grep -q "useMicVAD" ui/src/hooks/useVadRecorder.ts && grep -q "api/transcribe" ui/src/hooks/useVadRecorder.ts && grep -q "api/nexus/settings" ui/src/hooks/useVoiceMode.ts && echo "PASS" || echo "FAIL"</automated>
|
||||
</verify>
|
||||
<acceptance_criteria>
|
||||
- grep "export function encodeWav" ui/src/lib/encodeWav.ts returns match
|
||||
- grep "export function useVadRecorder" ui/src/hooks/useVadRecorder.ts returns match
|
||||
- grep "export function useVoiceMode" ui/src/hooks/useVoiceMode.ts returns match
|
||||
- grep "useMicVAD" ui/src/hooks/useVadRecorder.ts returns match
|
||||
- grep "startOnLoad.*false" ui/src/hooks/useVadRecorder.ts returns match
|
||||
- grep "baseAssetPath" ui/src/hooks/useVadRecorder.ts returns match with "/"
|
||||
- grep "api/transcribe" ui/src/hooks/useVadRecorder.ts returns match
|
||||
- grep "api/nexus/settings" ui/src/hooks/useVoiceMode.ts returns match
|
||||
- grep "encodeWav" ui/src/hooks/useVadRecorder.ts returns match (imports it)
|
||||
- grep "RIFF" ui/src/lib/encodeWav.ts returns match (WAV header)
|
||||
</acceptance_criteria>
|
||||
<done>encodeWav utility produces valid WAV blobs. useVadRecorder wraps useMicVAD with auto-stop + transcription. useVoiceMode reads/writes voiceMode from nexus-settings API.</done>
|
||||
</task>
|
||||
|
||||
<task type="auto">
|
||||
<name>Task 2: Create VoiceWaveform canvas component and VoiceMicButton</name>
|
||||
<files>
|
||||
ui/src/components/VoiceWaveform.tsx,
|
||||
ui/src/components/VoiceMicButton.tsx
|
||||
</files>
|
||||
<read_first>
|
||||
ui/src/hooks/useVadRecorder.ts,
|
||||
ui/src/lib/encodeWav.ts,
|
||||
ui/src/components/VoiceRecordButton.tsx
|
||||
</read_first>
|
||||
<action>
|
||||
1. **ui/src/components/VoiceWaveform.tsx** — Canvas-based amplitude visualization:
|
||||
```typescript
|
||||
interface VoiceWaveformProps {
|
||||
stream: MediaStream | null;
|
||||
active: boolean; // controls animation loop
|
||||
}
|
||||
export function VoiceWaveform({ stream, active }: VoiceWaveformProps)
|
||||
```
|
||||
Implementation:
|
||||
- Use a `<canvas>` element, width=80, height=32 (h-8 per UI spec), className="inline-block"
|
||||
- On mount (when stream is truthy and active is true):
|
||||
a. Create AudioContext (lazily — only create once, store in ref)
|
||||
b. If AudioContext is suspended, call `audioCtx.resume()`
|
||||
c. Create MediaStreamSource from stream
|
||||
d. Create AnalyserNode with fftSize=64 (gives 32 frequency bins)
|
||||
e. Connect source -> analyser
|
||||
f. Start requestAnimationFrame loop:
|
||||
- Call `analyser.getByteFrequencyData(dataArray)` into Uint8Array(32)
|
||||
- Clear canvas
|
||||
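- Draw 20 bars, sampling the 32 bins evenly so every bar reads a valid bin (a stride of 2 would index up to 38, past the last bin at 31): each bar width=2px, gap=2px
|
||||
- Bar height = (dataArray[Math.floor(i * 32 / 20)] / 255) * canvasHeight, minimum 2px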
|
||||
- Bar color: use CSS variable --primary via getComputedStyle
|
||||
g. Store animationFrame id in ref for cleanup
|
||||
- On cleanup or when active becomes false: cancelAnimationFrame, disconnect source
|
||||
- Do NOT close AudioContext on cleanup (reuse across start/stop cycles)
|
||||
|
||||
2. **ui/src/components/VoiceMicButton.tsx** — VAD-powered mic button:
|
||||
```typescript
|
||||
interface VoiceMicButtonProps {
|
||||
onTranscript: (text: string) => void;
|
||||
disabled?: boolean;
|
||||
}
|
||||
export function VoiceMicButton({ onTranscript, disabled }: VoiceMicButtonProps)
|
||||
```
|
||||
Implementation:
|
||||
- Call `useVadRecorder({ onTranscript })` to get `{ state, start, stop, mediaStream }`
|
||||
- Three visual states per UI spec:
|
||||
a. **idle** (state === "idle"): Render Button with ghost variant, size="icon", h-8 w-8. Content: `<Mic className="h-4 w-4" />`. aria-label="Start voice input". onClick calls start().
|
||||
b. **recording** (state === "recording"): Render Button with ghost variant, size="icon", h-8 w-8, with `ring-2 ring-primary` classes. Content: `<VoiceWaveform stream={mediaStream} active={true} />`. aria-label="Recording — speak now". onClick calls stop().
|
||||
c. **processing** (state === "processing"): Render Button disabled, ghost variant, size="icon", h-8 w-8. Content: `<Loader2 className="h-4 w-4 animate-spin" />`. aria-label="Transcribing...".
|
||||
- Import Mic, Loader2 from lucide-react
|
||||
- Import Button from @/components/ui/button
|
||||
- Import VoiceWaveform from ./VoiceWaveform
|
||||
- Import useVadRecorder from ../hooks/useVadRecorder
|
||||
- When disabled prop is true, render idle state with disabled attribute
|
||||
</action>
|
||||
<verify>
|
||||
<automated>cd /opt/nexus/.claude/worktrees/agent-a009558f && test -f ui/src/components/VoiceWaveform.tsx && test -f ui/src/components/VoiceMicButton.tsx && grep -q "VoiceWaveform" ui/src/components/VoiceWaveform.tsx && grep -q "VoiceMicButton" ui/src/components/VoiceMicButton.tsx && grep -q "canvas" ui/src/components/VoiceWaveform.tsx && grep -q "useVadRecorder" ui/src/components/VoiceMicButton.tsx && grep -q "Mic" ui/src/components/VoiceMicButton.tsx && grep -q "Loader2" ui/src/components/VoiceMicButton.tsx && grep -q "ring-2 ring-primary" ui/src/components/VoiceMicButton.tsx && echo "PASS" || echo "FAIL"</automated>
|
||||
</verify>
|
||||
<acceptance_criteria>
|
||||
- grep "export function VoiceWaveform" ui/src/components/VoiceWaveform.tsx returns match
|
||||
- grep "export function VoiceMicButton" ui/src/components/VoiceMicButton.tsx returns match
|
||||
- grep "canvas" ui/src/components/VoiceWaveform.tsx returns match
|
||||
- grep "AnalyserNode\|createAnalyser\|analyser" ui/src/components/VoiceWaveform.tsx returns match
|
||||
- grep "requestAnimationFrame" ui/src/components/VoiceWaveform.tsx returns match
|
||||
- grep "getByteFrequencyData" ui/src/components/VoiceWaveform.tsx returns match
|
||||
- grep "useVadRecorder" ui/src/components/VoiceMicButton.tsx returns match
|
||||
- grep 'aria-label="Start voice input"' ui/src/components/VoiceMicButton.tsx returns match
|
||||
- grep 'aria-label="Recording' ui/src/components/VoiceMicButton.tsx returns match
|
||||
- grep 'aria-label="Transcribing' ui/src/components/VoiceMicButton.tsx returns match
|
||||
- grep "ring-2 ring-primary" ui/src/components/VoiceMicButton.tsx returns match
|
||||
- grep "Loader2.*animate-spin" ui/src/components/VoiceMicButton.tsx returns match
|
||||
</acceptance_criteria>
|
||||
<done>VoiceWaveform renders 20 animated bars from Web Audio API AnalyserNode on an 80x32 canvas. VoiceMicButton shows idle/recording/processing states with correct icons, aria-labels, and ring styling.</done>
|
||||
</task>
|
||||
|
||||
</tasks>
|
||||
|
||||
<verification>
|
||||
- All 5 files exist and export their named functions
|
||||
- useVadRecorder uses useMicVAD with startOnLoad: false and baseAssetPath: "/"
|
||||
- VoiceMicButton has three distinct visual states with correct aria-labels
|
||||
- VoiceWaveform uses canvas + AnalyserNode pattern
|
||||
- encodeWav produces Blob with type audio/wav
|
||||
- useVoiceMode reads/writes via /api/nexus/settings
|
||||
</verification>
|
||||
|
||||
<success_criteria>
|
||||
Core voice recording pipeline complete: user clicks mic -> VAD listens -> waveform animates -> silence detected -> audio encoded to WAV -> POSTed to /api/transcribe -> transcript returned. Voice mode readable/writable from nexus-settings.
|
||||
</success_criteria>
|
||||
|
||||
<output>
|
||||
After completion, create `.planning/phases/37-web-chat-voice-ui/37-02-SUMMARY.md`
|
||||
</output>
|
||||
286
.planning/phases/37-web-chat-voice-ui/37-03-PLAN.md
Normal file
286
.planning/phases/37-web-chat-voice-ui/37-03-PLAN.md
Normal file
|
|
@ -0,0 +1,286 @@
|
|||
---
|
||||
phase: 37-web-chat-voice-ui
|
||||
plan: 03
|
||||
type: execute
|
||||
wave: 2
|
||||
depends_on: ["37-01"]
|
||||
files_modified:
|
||||
- ui/src/components/ChatVoicePlayer.tsx
|
||||
- ui/src/components/ChatVoiceBadge.tsx
|
||||
- ui/src/components/VoiceModeToggle.tsx
|
||||
autonomous: true
|
||||
requirements:
|
||||
- WCHAT-04
|
||||
- WCHAT-05
|
||||
- WCHAT-06
|
||||
|
||||
must_haves:
|
||||
truths:
|
||||
- "ChatVoicePlayer renders inline audio player with play/pause controls"
|
||||
- "ChatVoicePlayer auto-plays when autoPlay setting is true"
|
||||
- "ChatVoiceBadge shows 'Voice' badge on voice messages"
|
||||
- "ChatVoiceBadge has collapsible full markdown section for voice_full messages"
|
||||
- "VoiceModeToggle renders three pills: Text / Voice In / Full Voice"
|
||||
- "VoiceModeToggle persists selection via useVoiceMode hook"
|
||||
- "Auto-play preference stored in localStorage under nexus:voice:autoplay"
|
||||
artifacts:
|
||||
- path: "ui/src/components/ChatVoicePlayer.tsx"
|
||||
provides: "Inline audio player for synthesized voice responses"
|
||||
exports: ["ChatVoicePlayer"]
|
||||
- path: "ui/src/components/ChatVoiceBadge.tsx"
|
||||
provides: "Voice badge + collapsible markdown on agent messages"
|
||||
exports: ["ChatVoiceBadge"]
|
||||
- path: "ui/src/components/VoiceModeToggle.tsx"
|
||||
provides: "Three-state pill toggle for voice mode"
|
||||
exports: ["VoiceModeToggle"]
|
||||
key_links:
|
||||
- from: "ui/src/components/ChatVoicePlayer.tsx"
|
||||
to: "/api/synthesize"
|
||||
via: "fetch POST to get audio blob"
|
||||
pattern: "fetch.*api/synthesize"
|
||||
- from: "ui/src/components/ChatVoiceBadge.tsx"
|
||||
to: "shadcn Collapsible"
|
||||
via: "Collapsible/CollapsibleContent/CollapsibleTrigger"
|
||||
pattern: "Collapsible"
|
||||
- from: "ui/src/components/VoiceModeToggle.tsx"
|
||||
to: "ui/src/hooks/useVoiceMode.ts"
|
||||
via: "useVoiceMode() hook"
|
||||
pattern: "useVoiceMode"
|
||||
---
|
||||
|
||||
<objective>
|
||||
Build the voice output and mode selection components: ChatVoicePlayer for inline audio playback, ChatVoiceBadge for voice message display, and VoiceModeToggle for switching between text/voice_input/full_voice modes.
|
||||
|
||||
Purpose: These components handle the output side of voice I/O (playing synthesized responses, showing voice badges on messages) and the mode selector that controls the entire voice behavior.
|
||||
|
||||
Output: 3 new component files — ChatVoicePlayer, ChatVoiceBadge, VoiceModeToggle
|
||||
</objective>
|
||||
|
||||
<execution_context>
|
||||
@$HOME/.claude/get-shit-done/workflows/execute-plan.md
|
||||
@$HOME/.claude/get-shit-done/templates/summary.md
|
||||
</execution_context>
|
||||
|
||||
<context>
|
||||
@.planning/phases/37-web-chat-voice-ui/37-RESEARCH.md
|
||||
|
||||
<interfaces>
|
||||
<!-- From Plan 01 — synthesize endpoint -->
|
||||
```
|
||||
POST /api/synthesize
|
||||
Body: { text: string, voiceId?: string }
|
||||
Response: audio/wav binary buffer
|
||||
```
|
||||
|
||||
<!-- From useVoiceMode hook (Plan 02) -->
|
||||
```typescript
|
||||
type VoiceMode = "text" | "voice_input" | "full_voice";
|
||||
export function useVoiceMode(): {
|
||||
mode: VoiceMode;
|
||||
setMode: (next: VoiceMode) => Promise<void>;
|
||||
isLoading: boolean;
|
||||
}
|
||||
```
|
||||
|
||||
<!-- ChatMessage messageType values for voice -->
|
||||
```
|
||||
messageType: "voice_input" → user sent via voice, agent replied with text
|
||||
messageType: "voice_full" → user sent via voice, agent replied with SPOKEN + DETAILED format
|
||||
```
|
||||
|
||||
<!-- SPOKEN/DETAILED content format from formatForVoice -->
|
||||
```
|
||||
SPOKEN: <concise spoken version of the response>
|
||||
DETAILED: <full markdown response with code blocks etc>
|
||||
```
|
||||
|
||||
<!-- shadcn components already available -->
|
||||
```typescript
|
||||
import { Badge } from "@/components/ui/badge";
|
||||
import { Collapsible, CollapsibleContent, CollapsibleTrigger } from "@/components/ui/collapsible";
|
||||
import { Button } from "@/components/ui/button";
|
||||
```
|
||||
</interfaces>
|
||||
</context>
|
||||
|
||||
<tasks>
|
||||
|
||||
<task type="auto">
|
||||
<name>Task 1: Create ChatVoicePlayer and ChatVoiceBadge components</name>
|
||||
<files>
|
||||
ui/src/components/ChatVoicePlayer.tsx,
|
||||
ui/src/components/ChatVoiceBadge.tsx
|
||||
</files>
|
||||
<read_first>
|
||||
ui/src/components/ChatMessage.tsx,
|
||||
ui/src/components/ChatMarkdownMessage.tsx
|
||||
</read_first>
|
||||
<action>
|
||||
1. **ui/src/components/ChatVoicePlayer.tsx** — Inline audio player for voice responses:
|
||||
```typescript
|
||||
interface ChatVoicePlayerProps {
|
||||
text: string; // The spoken text to synthesize
|
||||
autoPlay?: boolean; // Whether to auto-play on mount
|
||||
}
|
||||
export function ChatVoicePlayer({ text, autoPlay = false }: ChatVoicePlayerProps)
|
||||
```
|
||||
Implementation:
|
||||
- State: `status: "idle" | "loading" | "playing" | "paused"`, `audioUrl: string | null`
|
||||
- On mount (or when text changes): POST /api/synthesize with `{ text }`, credentials: "include"
|
||||
- Set status to "loading"
|
||||
- Get response as blob: `const blob = await res.blob()`
|
||||
- Create object URL: `const url = URL.createObjectURL(blob)`
|
||||
- Store url in state, set status to "idle"
|
||||
- Create `<audio>` element ref. Set src to audioUrl when available.
|
||||
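- If autoPlay is true AND audioUrl is set, call `audioRef.current.play()` and, on success, set status to "playing". `play()` returns a promise that browsers reject when autoplay is blocked (no prior user gesture); catch the rejection and leave status as "idle" so the user can press Play manually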
|
||||
- Audio event listeners:
|
||||
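- `onEnded`: set status to "idle". Do not revoke the blob URL here, because the Play button must be able to replay it; revoke via `URL.revokeObjectURL(audioUrl)` only when the URL is replaced (text changes) or on unmount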
|
||||
- `onPause`: set status to "paused"
|
||||
- `onPlay`: set status to "playing"
|
||||
- Render:
|
||||
- loading: `<Loader2 className="h-3 w-3 animate-spin" />` with "Loading audio..." text
|
||||
- idle/paused: `<Button variant="ghost" size="sm">` with `<Play className="h-3 w-3" />` icon. onClick: `audioRef.current.play()`
|
||||
- playing: `<Button variant="ghost" size="sm">` with `<Pause className="h-3 w-3" />` icon. onClick: `audioRef.current.pause()`
|
||||
- Hidden `<audio ref={audioRef} />` element with aria-label="Voice response"
|
||||
- Import Play, Pause, Loader2 from lucide-react
|
||||
- Cleanup: revoke any blob URL on unmount
|
||||
|
||||
2. **ui/src/components/ChatVoiceBadge.tsx** — Voice badge + collapsible markdown:
|
||||
```typescript
|
||||
interface ChatVoiceBadgeProps {
|
||||
content: string;
|
||||
messageType: string; // "voice_input" | "voice_full"
|
||||
autoPlayVoice?: boolean;
|
||||
}
|
||||
export function ChatVoiceBadge({ content, messageType, autoPlayVoice = false }: ChatVoiceBadgeProps)
|
||||
```
|
||||
Implementation:
|
||||
- Parse content for SPOKEN/DETAILED sections:
|
||||
```typescript
|
||||
const spokenMatch = content.match(/SPOKEN:\s*([\s\S]*?)(?=\nDETAILED:|$)/);
|
||||
const spokenText = spokenMatch?.[1]?.trim() ?? content;
|
||||
const detailedMatch = content.match(/DETAILED:\s*([\s\S]*)/);
|
||||
```
|
||||
- Render:
|
||||
a. `<Badge variant="outline" className="text-xs mb-2">Voice</Badge>`
|
||||
b. `<p className="text-sm">{spokenText}</p>`
|
||||
c. If messageType === "voice_full":
|
||||
- `<ChatVoicePlayer text={spokenText} autoPlay={autoPlayVoice} />`
|
||||
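- If detailedMatch exists, render shadcn Collapsible as a controlled component. Declare `const [open, setOpen] = useState(false)` at the top of ChatVoiceBadge so the trigger label can reflect the open state:
|
||||
```
|
||||
<Collapsible open={open} onOpenChange={setOpen}>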
|
||||
<CollapsibleTrigger className="text-xs text-muted-foreground hover:text-foreground mt-1">
|
||||
{open ? "Hide full response" : "Show full response"}
|
||||
</CollapsibleTrigger>
|
||||
<CollapsibleContent>
|
||||
<ChatMarkdownMessage content={detailedMatch[1].trim()} />
|
||||
</CollapsibleContent>
|
||||
</Collapsible>
|
||||
```
|
||||
- For voice_input messageType: just show badge + spoken text, no player, no collapsible
|
||||
- Import ChatVoicePlayer from ./ChatVoicePlayer
|
||||
- Import ChatMarkdownMessage from ./ChatMarkdownMessage (already exists in codebase)
|
||||
- Import Badge from @/components/ui/badge
|
||||
- Import Collapsible, CollapsibleContent, CollapsibleTrigger from @/components/ui/collapsible
|
||||
</action>
|
||||
<verify>
|
||||
<automated>cd /opt/nexus/.claude/worktrees/agent-a009558f && test -f ui/src/components/ChatVoicePlayer.tsx && test -f ui/src/components/ChatVoiceBadge.tsx && grep -q "export function ChatVoicePlayer" ui/src/components/ChatVoicePlayer.tsx && grep -q "export function ChatVoiceBadge" ui/src/components/ChatVoiceBadge.tsx && grep -q "api/synthesize" ui/src/components/ChatVoicePlayer.tsx && grep -q "URL.createObjectURL" ui/src/components/ChatVoicePlayer.tsx && grep -q "URL.revokeObjectURL" ui/src/components/ChatVoicePlayer.tsx && grep -q "Collapsible" ui/src/components/ChatVoiceBadge.tsx && grep -q "Show full response" ui/src/components/ChatVoiceBadge.tsx && grep -q "Badge" ui/src/components/ChatVoiceBadge.tsx && grep -q "SPOKEN:" ui/src/components/ChatVoiceBadge.tsx && echo "PASS" || echo "FAIL"</automated>
|
||||
</verify>
|
||||
<acceptance_criteria>
|
||||
- grep "export function ChatVoicePlayer" ui/src/components/ChatVoicePlayer.tsx returns match
|
||||
- grep "export function ChatVoiceBadge" ui/src/components/ChatVoiceBadge.tsx returns match
|
||||
- grep "api/synthesize" ui/src/components/ChatVoicePlayer.tsx returns match
|
||||
- grep "URL.createObjectURL" ui/src/components/ChatVoicePlayer.tsx returns match
|
||||
- grep "URL.revokeObjectURL" ui/src/components/ChatVoicePlayer.tsx returns match
|
||||
- grep "audio" ui/src/components/ChatVoicePlayer.tsx returns match (native audio element)
|
||||
- grep "aria-label.*Voice response" ui/src/components/ChatVoicePlayer.tsx returns match
|
||||
- grep "Collapsible" ui/src/components/ChatVoiceBadge.tsx returns match
|
||||
- grep "Show full response" ui/src/components/ChatVoiceBadge.tsx returns match
|
||||
- grep "Hide full response" ui/src/components/ChatVoiceBadge.tsx returns match
|
||||
- grep "Badge.*Voice" ui/src/components/ChatVoiceBadge.tsx returns match
|
||||
- grep "SPOKEN:" ui/src/components/ChatVoiceBadge.tsx returns match
|
||||
- grep "ChatVoicePlayer" ui/src/components/ChatVoiceBadge.tsx returns match (imports it)
|
||||
</acceptance_criteria>
|
||||
<done>ChatVoicePlayer synthesizes and plays audio with play/pause controls, auto-play support, and proper blob URL cleanup. ChatVoiceBadge shows Voice badge, spoken text, optional audio player, and collapsible full markdown for voice_full messages.</done>
|
||||
</task>
|
||||
|
||||
<task type="auto">
|
||||
<name>Task 2: Create VoiceModeToggle three-pill component</name>
|
||||
<files>
|
||||
ui/src/components/VoiceModeToggle.tsx
|
||||
</files>
|
||||
<read_first>
|
||||
ui/src/hooks/useVoiceMode.ts
|
||||
</read_first>
|
||||
<action>
|
||||
**ui/src/components/VoiceModeToggle.tsx** — Three-state pill toggle:
|
||||
```typescript
|
||||
export function VoiceModeToggle()
|
||||
```
|
||||
Implementation:
|
||||
- Call `useVoiceMode()` to get `{ mode, setMode, isLoading }`
|
||||
- Read auto-play preference from localStorage: `localStorage.getItem("nexus:voice:autoplay") === "true"`
|
||||
- Provide `autoPlay` state + toggle in the component for WCHAT-06 (auto-play configurable)
|
||||
- Render a `<div role="group" aria-label="Voice mode" className="flex items-center gap-1">`:
|
||||
- Three pill buttons, each a `<button>`:
|
||||
- "Text" → `setMode("text")`
|
||||
- "Voice In" → `setMode("voice_input")`
|
||||
- "Full Voice" → `setMode("full_voice")`
|
||||
- Active pill: `bg-primary text-primary-foreground` classes
|
||||
- Inactive pills: `bg-muted text-muted-foreground` classes
|
||||
- All pills: `rounded-full px-3 py-1 text-xs font-medium transition-colors`
|
||||
- Disabled when isLoading
|
||||
- Below the pills (only when mode is "full_voice"), render auto-play toggle:
|
||||
```
|
||||
<label className="flex items-center gap-2 text-xs text-muted-foreground mt-1">
|
||||
<input
|
||||
type="checkbox"
|
||||
checked={autoPlay}
|
||||
onChange={(e) => {
|
||||
setAutoPlay(e.target.checked);
|
||||
localStorage.setItem("nexus:voice:autoplay", String(e.target.checked));
|
||||
}}
|
||||
/>
|
||||
Auto-play voice responses
|
||||
</label>
|
||||
```
|
||||
- Do not export autoPlay state from this component and do not add an `onAutoPlayChange` callback. Consumers read the preference directly from localStorage under `nexus:voice:autoplay` (in Plan 04, ChatMessage reads it and passes it to ChatVoiceBadge via the `autoPlayVoice` prop) — keep it simple.
|
||||
- The auto-play checkbox label text per UI spec: "Auto-play voice responses"
|
||||
</action>
|
||||
<verify>
|
||||
<automated>cd /opt/nexus/.claude/worktrees/agent-a009558f && test -f ui/src/components/VoiceModeToggle.tsx && grep -q "export function VoiceModeToggle" ui/src/components/VoiceModeToggle.tsx && grep -q "useVoiceMode" ui/src/components/VoiceModeToggle.tsx && grep -q "Voice In" ui/src/components/VoiceModeToggle.tsx && grep -q "Full Voice" ui/src/components/VoiceModeToggle.tsx && grep -q "Text" ui/src/components/VoiceModeToggle.tsx && grep -q "bg-primary" ui/src/components/VoiceModeToggle.tsx && grep -q 'role="group"' ui/src/components/VoiceModeToggle.tsx && grep -q "nexus:voice:autoplay" ui/src/components/VoiceModeToggle.tsx && grep -q "Auto-play voice responses" ui/src/components/VoiceModeToggle.tsx && echo "PASS" || echo "FAIL"</automated>
|
||||
</verify>
|
||||
<acceptance_criteria>
|
||||
- grep "export function VoiceModeToggle" ui/src/components/VoiceModeToggle.tsx returns match
|
||||
- grep "useVoiceMode" ui/src/components/VoiceModeToggle.tsx returns match
|
||||
- grep "Text" ui/src/components/VoiceModeToggle.tsx returns match (first pill)
|
||||
- grep "Voice In" ui/src/components/VoiceModeToggle.tsx returns match (second pill)
|
||||
- grep "Full Voice" ui/src/components/VoiceModeToggle.tsx returns match (third pill)
|
||||
- grep "bg-primary text-primary-foreground" ui/src/components/VoiceModeToggle.tsx returns match (active state)
|
||||
- grep "bg-muted text-muted-foreground" ui/src/components/VoiceModeToggle.tsx returns match (inactive state)
|
||||
- grep 'role="group"' ui/src/components/VoiceModeToggle.tsx returns match
|
||||
- grep 'aria-label="Voice mode"' ui/src/components/VoiceModeToggle.tsx returns match
|
||||
- grep "nexus:voice:autoplay" ui/src/components/VoiceModeToggle.tsx returns match (localStorage key)
|
||||
- grep "Auto-play voice responses" ui/src/components/VoiceModeToggle.tsx returns match
|
||||
</acceptance_criteria>
|
||||
<done>VoiceModeToggle renders three pills with active/inactive styling. Clicking a pill persists voiceMode to nexus-settings. Auto-play checkbox appears in full_voice mode and persists to localStorage.</done>
|
||||
</task>
|
||||
|
||||
</tasks>
|
||||
|
||||
<verification>
|
||||
- ChatVoicePlayer POSTs to /api/synthesize and plays via native audio element
|
||||
- ChatVoicePlayer revokes blob URLs on cleanup (no memory leaks)
|
||||
- ChatVoiceBadge parses SPOKEN/DETAILED content format
|
||||
- ChatVoiceBadge shows collapsible section only for voice_full
|
||||
- VoiceModeToggle has three pills with correct labels and accessibility
|
||||
- Auto-play preference persisted in localStorage under nexus:voice:autoplay
|
||||
</verification>
|
||||
|
||||
<success_criteria>
|
||||
All three output-side voice components complete: ChatVoicePlayer plays synthesized audio with controls, ChatVoiceBadge renders voice badges with collapsible detail, VoiceModeToggle switches between text/voice_input/full_voice with persistence.
|
||||
</success_criteria>
|
||||
|
||||
<output>
|
||||
After completion, create `.planning/phases/37-web-chat-voice-ui/37-03-SUMMARY.md`
|
||||
</output>
|
||||
377
.planning/phases/37-web-chat-voice-ui/37-04-PLAN.md
Normal file
377
.planning/phases/37-web-chat-voice-ui/37-04-PLAN.md
Normal file
|
|
@ -0,0 +1,377 @@
|
|||
---
|
||||
phase: 37-web-chat-voice-ui
|
||||
plan: 04
|
||||
type: execute
|
||||
wave: 3
|
||||
depends_on: ["37-02", "37-03"]
|
||||
files_modified:
|
||||
- ui/src/components/ChatInput.tsx
|
||||
- ui/src/components/ChatMessage.tsx
|
||||
- ui/src/components/ChatPanel.tsx
|
||||
- ui/src/hooks/useStreamingChat.ts
|
||||
- ui/src/api/chat.ts
|
||||
autonomous: false
|
||||
requirements:
|
||||
- WCHAT-01
|
||||
- WCHAT-02
|
||||
- WCHAT-03
|
||||
- WCHAT-04
|
||||
- WCHAT-05
|
||||
- WCHAT-06
|
||||
|
||||
must_haves:
|
||||
truths:
|
||||
- "ChatInput renders VoiceMicButton instead of VoiceRecordButton"
|
||||
- "ChatInput shows VoiceModeToggle whenever enableVoiceInput is true (including in 'text' mode, so the user can switch modes)"
|
||||
- "ChatMessage renders ChatVoiceBadge for voice_input and voice_full messageTypes"
|
||||
- "ChatMessage renders ChatVoicePlayer (via ChatVoiceBadge) for voice_full messages with auto-play from localStorage"
|
||||
- "useStreamingChat.startStream accepts voiceMode parameter"
|
||||
- "chatApi.postMessageAndStream sends voiceMode in request body"
|
||||
- "ChatPanel passes voiceMode from useVoiceMode to startStream calls"
|
||||
- "Full voice flow works end-to-end: mic -> VAD -> transcribe -> stream -> voice badge + audio"
|
||||
artifacts:
|
||||
- path: "ui/src/components/ChatInput.tsx"
|
||||
provides: "Voice-enhanced chat input with VoiceMicButton + VoiceModeToggle"
|
||||
contains: "VoiceMicButton"
|
||||
- path: "ui/src/components/ChatMessage.tsx"
|
||||
provides: "Voice-aware message rendering"
|
||||
contains: "ChatVoiceBadge"
|
||||
- path: "ui/src/hooks/useStreamingChat.ts"
|
||||
provides: "Voice-mode-aware streaming"
|
||||
contains: "voiceMode"
|
||||
- path: "ui/src/api/chat.ts"
|
||||
provides: "Voice mode in stream request"
|
||||
contains: "voiceMode"
|
||||
key_links:
|
||||
- from: "ui/src/components/ChatPanel.tsx"
|
||||
to: "ui/src/hooks/useVoiceMode.ts"
|
||||
via: "useVoiceMode() hook call"
|
||||
pattern: "useVoiceMode"
|
||||
- from: "ui/src/components/ChatPanel.tsx"
|
||||
to: "ui/src/hooks/useStreamingChat.ts"
|
||||
via: "startStream(content, agentId, voiceMode)"
|
||||
pattern: "startStream.*voiceMode"
|
||||
- from: "ui/src/hooks/useStreamingChat.ts"
|
||||
to: "ui/src/api/chat.ts"
|
||||
via: "chatApi.postMessageAndStream with voiceMode"
|
||||
pattern: "voiceMode"
|
||||
- from: "ui/src/components/ChatInput.tsx"
|
||||
to: "ui/src/components/VoiceMicButton.tsx"
|
||||
via: "VoiceMicButton replaces VoiceRecordButton"
|
||||
pattern: "VoiceMicButton"
|
||||
- from: "ui/src/components/ChatMessage.tsx"
|
||||
to: "ui/src/components/ChatVoiceBadge.tsx"
|
||||
via: "ChatVoiceBadge for voice messageTypes"
|
||||
pattern: "ChatVoiceBadge"
|
||||
---
|
||||
|
||||
<objective>
|
||||
Wire all voice components into the existing chat system: replace VoiceRecordButton with VoiceMicButton in ChatInput, add VoiceModeToggle, render ChatVoiceBadge in ChatMessage, and thread voiceMode through useStreamingChat and chatApi.
|
||||
|
||||
Purpose: This is the integration plan that connects all Phase 37 components to the existing chat UI. Without this wiring, the components exist but aren't used.
|
||||
|
||||
Output: 5 modified files connecting voice I/O to the chat system
|
||||
</objective>
|
||||
|
||||
<execution_context>
|
||||
@$HOME/.claude/get-shit-done/workflows/execute-plan.md
|
||||
@$HOME/.claude/get-shit-done/templates/summary.md
|
||||
</execution_context>
|
||||
|
||||
<context>
|
||||
@.planning/phases/37-web-chat-voice-ui/37-RESEARCH.md
|
||||
@.planning/phases/37-web-chat-voice-ui/37-02-SUMMARY.md
|
||||
@.planning/phases/37-web-chat-voice-ui/37-03-SUMMARY.md
|
||||
|
||||
<interfaces>
|
||||
<!-- VoiceMicButton (from Plan 02) -->
|
||||
```typescript
|
||||
interface VoiceMicButtonProps {
|
||||
onTranscript: (text: string) => void;
|
||||
disabled?: boolean;
|
||||
}
|
||||
export function VoiceMicButton({ onTranscript, disabled }: VoiceMicButtonProps)
|
||||
```
|
||||
|
||||
<!-- VoiceModeToggle (from Plan 03) -->
|
||||
```typescript
|
||||
export function VoiceModeToggle()
|
||||
// Uses useVoiceMode() internally; renders three pills + auto-play checkbox
|
||||
```
|
||||
|
||||
<!-- ChatVoiceBadge (from Plan 03) -->
|
||||
```typescript
|
||||
interface ChatVoiceBadgeProps {
|
||||
content: string;
|
||||
messageType: string; // "voice_input" | "voice_full"
|
||||
autoPlayVoice?: boolean;
|
||||
}
|
||||
export function ChatVoiceBadge({ content, messageType, autoPlayVoice }: ChatVoiceBadgeProps)
|
||||
```
|
||||
|
||||
<!-- useVoiceMode (from Plan 02) -->
|
||||
```typescript
|
||||
type VoiceMode = "text" | "voice_input" | "full_voice";
|
||||
export function useVoiceMode(): { mode: VoiceMode; setMode: (v: VoiceMode) => Promise<void>; isLoading: boolean }
|
||||
```
|
||||
|
||||
<!-- Current ChatInput props (parent branch) -->
|
||||
```typescript
|
||||
interface ChatInputProps {
|
||||
onSend: (content: string) => void;
|
||||
isSubmitting?: boolean;
|
||||
disabled?: boolean;
|
||||
placeholder?: string;
|
||||
agents?: Agent[];
|
||||
agentsLoading?: boolean;
|
||||
onFilesPicked?: (files: File[]) => void;
|
||||
pendingFiles?: PendingFile[];
|
||||
onRemoveFile?: (id: string) => void;
|
||||
enableVoiceInput?: boolean; // Controls VoiceRecordButton visibility
|
||||
}
|
||||
```
|
||||
|
||||
<!-- Current useStreamingChat (parent branch) -->
|
||||
```typescript
|
||||
export function useStreamingChat(conversationId: string | null) {
|
||||
// startStream(userMessage: string, agentId?: string) — needs voiceMode param added
|
||||
return { streamingContent, isStreaming, startStream, stop };
|
||||
}
|
||||
```
|
||||
|
||||
<!-- Current chatApi.postMessageAndStream (parent branch) -->
|
||||
```typescript
|
||||
async postMessageAndStream(
|
||||
conversationId: string,
|
||||
data: { content: string; agentId?: string }, // needs voiceMode added
|
||||
callbacks: { onToken, onDone, onError },
|
||||
signal?: AbortSignal,
|
||||
): Promise<void>
|
||||
```
|
||||
|
||||
<!-- Current ChatPanel send handler (parent branch) -->
|
||||
```typescript
|
||||
// handleSend calls startStream(content, resolvedAgentId) — needs voiceMode
|
||||
```
|
||||
</interfaces>
|
||||
</context>
|
||||
|
||||
<tasks>
|
||||
|
||||
<task type="auto">
|
||||
<name>Task 1: Thread voiceMode through chatApi and useStreamingChat</name>
|
||||
<files>
|
||||
ui/src/api/chat.ts,
|
||||
ui/src/hooks/useStreamingChat.ts
|
||||
</files>
|
||||
<read_first>
|
||||
ui/src/api/chat.ts,
|
||||
ui/src/hooks/useStreamingChat.ts
|
||||
</read_first>
|
||||
<action>
|
||||
1. **ui/src/api/chat.ts** — Extend postMessageAndStream data parameter:
|
||||
- Change the `data` parameter type from `{ content: string; agentId?: string }` to `{ content: string; agentId?: string; voiceMode?: string }`
|
||||
- The body is already sent as `JSON.stringify(data)`, so voiceMode will be included automatically when present
|
||||
- No other changes needed — the server's chat.ts stream handler already reads voiceMode from req.body (added in Plan 01)
|
||||
|
||||
2. **ui/src/hooks/useStreamingChat.ts** — Extend startStream to accept voiceMode:
|
||||
- Change `startStream` signature from `(userMessage: string, agentId?: string)` to `(userMessage: string, agentId?: string, voiceMode?: string)`
|
||||
- Pass voiceMode through to chatApi.postMessageAndStream:
|
||||
```typescript
|
||||
chatApi.postMessageAndStream(
|
||||
conversationId,
|
||||
{ content: userMessage, agentId, voiceMode },
|
||||
{ onToken, onDone, onError },
|
||||
abort.signal,
|
||||
);
|
||||
```
|
||||
- Do not add `voiceMode` to the useCallback dependency array: it is a parameter of startStream, not a value captured from the enclosing scope, so the existing dependencies remain correct
|
||||
</action>
|
||||
<verify>
|
||||
<automated>cd /opt/nexus/.claude/worktrees/agent-a009558f && grep -q "voiceMode" ui/src/api/chat.ts && grep -q "voiceMode" ui/src/hooks/useStreamingChat.ts && grep -A 5 "postMessageAndStream" ui/src/api/chat.ts | grep -q "voiceMode" && echo "PASS" || echo "FAIL"</automated>
|
||||
</verify>
|
||||
<acceptance_criteria>
|
||||
- grep "voiceMode" ui/src/api/chat.ts returns match in postMessageAndStream data type
|
||||
- grep "voiceMode" ui/src/hooks/useStreamingChat.ts returns match in startStream signature
|
||||
- grep "voiceMode" ui/src/hooks/useStreamingChat.ts returns match in postMessageAndStream call
|
||||
</acceptance_criteria>
|
||||
<done>chatApi.postMessageAndStream sends voiceMode in request body. useStreamingChat.startStream accepts and forwards voiceMode parameter.</done>
|
||||
</task>
|
||||
|
||||
<task type="auto">
|
||||
<name>Task 2: Wire VoiceMicButton + VoiceModeToggle into ChatInput, ChatVoiceBadge into ChatMessage, voiceMode into ChatPanel</name>
|
||||
<files>
|
||||
ui/src/components/ChatInput.tsx,
|
||||
ui/src/components/ChatMessage.tsx,
|
||||
ui/src/components/ChatPanel.tsx
|
||||
</files>
|
||||
<read_first>
|
||||
ui/src/components/ChatInput.tsx,
|
||||
ui/src/components/ChatMessage.tsx,
|
||||
ui/src/components/ChatPanel.tsx,
|
||||
ui/src/components/VoiceMicButton.tsx,
|
||||
ui/src/components/VoiceModeToggle.tsx,
|
||||
ui/src/components/ChatVoiceBadge.tsx,
|
||||
ui/src/hooks/useVoiceMode.ts,
|
||||
ui/src/hooks/useStreamingChat.ts
|
||||
</read_first>
|
||||
<action>
|
||||
1. **ui/src/components/ChatInput.tsx** — Replace VoiceRecordButton with VoiceMicButton:
|
||||
- Remove import of VoiceRecordButton: `import { VoiceRecordButton } from "./VoiceRecordButton";`
|
||||
- Add import: `import { VoiceMicButton } from "./VoiceMicButton";`
|
||||
- Add import: `import { VoiceModeToggle } from "./VoiceModeToggle";`
|
||||
- In the JSX, find the VoiceRecordButton rendering block:
|
||||
```tsx
|
||||
{enableVoiceInput && (
|
||||
<VoiceRecordButton
|
||||
onTranscription={handleTranscription}
|
||||
disabled={disabled}
|
||||
/>
|
||||
)}
|
||||
```
|
||||
Replace with:
|
||||
```tsx
|
||||
{enableVoiceInput && (
|
||||
<VoiceMicButton
|
||||
onTranscript={handleTranscription}
|
||||
disabled={disabled}
|
||||
/>
|
||||
)}
|
||||
```
|
||||
- Add VoiceModeToggle ABOVE the input form, inside the ChatInput component, after ChatFileDropZone opens but before the form:
|
||||
```tsx
|
||||
<ChatFileDropZone ...>
|
||||
{enableVoiceInput && <VoiceModeToggle />}
|
||||
<form ...>
|
||||
```
|
||||
This places the toggle above the input row so it doesn't crowd the send button area.
|
||||
|
||||
2. **ui/src/components/ChatMessage.tsx** — Add ChatVoiceBadge for voice messages:
|
||||
- Add imports:
|
||||
```typescript
|
||||
import { ChatVoiceBadge } from "./ChatVoiceBadge";
|
||||
```
|
||||
- In the messageType dispatch block (after the existing spec_card, handoff, task_created, status_update checks), add:
|
||||
```typescript
|
||||
if (messageType === "voice_input" || messageType === "voice_full") {
|
||||
const autoPlay = typeof window !== "undefined"
|
||||
? localStorage.getItem("nexus:voice:autoplay") === "true"
|
||||
: false;
|
||||
return (
|
||||
<div className="max-w-full group relative">
|
||||
{agentName && (
|
||||
<ChatMessageIdentityBar
|
||||
agentName={agentName}
|
||||
agentIcon={agentIcon}
|
||||
agentRole={agentRole}
|
||||
timestamp={timestamp}
|
||||
isStreaming={isStreaming}
|
||||
/>
|
||||
)}
|
||||
<ChatVoiceBadge
|
||||
content={content}
|
||||
messageType={messageType}
|
||||
autoPlayVoice={autoPlay}
|
||||
/>
|
||||
{isStreaming && <ChatStreamingCursor />}
|
||||
<ChatMessageActions
|
||||
role="assistant"
|
||||
isStreaming={isAnyStreaming}
|
||||
onRetry={id && onRetry ? () => onRetry(id) : undefined}
|
||||
onBookmark={id && onBookmark ? () => onBookmark(id) : undefined}
|
||||
isBookmarked={isBookmarked}
|
||||
/>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
```
|
||||
- Place this BEFORE the general "fall through to default system message rendering" comment, but AFTER the status_update check
|
||||
|
||||
3. **ui/src/components/ChatPanel.tsx** — Connect useVoiceMode and pass voiceMode to startStream:
|
||||
- Add imports:
|
||||
```typescript
|
||||
import { useVoiceMode } from "../hooks/useVoiceMode";
|
||||
```
|
||||
- Inside the ChatPanel component, call the hook:
|
||||
```typescript
|
||||
const { mode: voiceMode } = useVoiceMode();
|
||||
```
|
||||
- Find ALL calls to `startStream(content, agentId)` (there are ~5 of them per the read_first scan). Add voiceMode as third argument:
|
||||
```typescript
|
||||
startStream(content, resolvedAgentId ?? undefined, voiceMode);
|
||||
```
|
||||
- The five locations are approximately:
|
||||
- In handleSend: `startStream(content, resolvedAgentId ?? undefined)` (two calls — online and offline branches)
|
||||
- In handleEdit callback: `startStream(newContent, activeAgentId ?? undefined)`
|
||||
- In handleRetry: `startStream(newContent, activeAgentId ?? undefined)`
|
||||
- In retry from error: `startStream(lastUserContent, activeAgentId ?? undefined)`
|
||||
- Update each to include `voiceMode` as the third argument
|
||||
- Keep `enableVoiceInput={true}` on ChatInput (or however it is currently set) — do not derive it from `voiceMode`. The VoiceModeToggle handles mode selection independently, and the mic button should always be visible when voice is available.
|
||||
- Check how enableVoiceInput is currently set in ChatPanel. If it's hardcoded or conditional, ensure it stays true so VoiceMicButton renders.
|
||||
</action>
|
||||
<verify>
|
||||
<automated>cd /opt/nexus/.claude/worktrees/agent-a009558f && grep -q "VoiceMicButton" ui/src/components/ChatInput.tsx && grep -q "VoiceModeToggle" ui/src/components/ChatInput.tsx && ! grep -q "VoiceRecordButton" ui/src/components/ChatInput.tsx && grep -q "ChatVoiceBadge" ui/src/components/ChatMessage.tsx && grep -q "voice_input\|voice_full" ui/src/components/ChatMessage.tsx && grep -q "useVoiceMode" ui/src/components/ChatPanel.tsx && grep -q "voiceMode" ui/src/components/ChatPanel.tsx && echo "PASS" || echo "FAIL"</automated>
|
||||
</verify>
|
||||
<acceptance_criteria>
|
||||
- grep "VoiceMicButton" ui/src/components/ChatInput.tsx returns match
|
||||
- grep "VoiceModeToggle" ui/src/components/ChatInput.tsx returns match
|
||||
- grep "VoiceRecordButton" ui/src/components/ChatInput.tsx returns NO match (replaced)
|
||||
- grep "ChatVoiceBadge" ui/src/components/ChatMessage.tsx returns match
|
||||
- grep "voice_input" ui/src/components/ChatMessage.tsx returns match
|
||||
- grep "voice_full" ui/src/components/ChatMessage.tsx returns match
|
||||
- grep "nexus:voice:autoplay" ui/src/components/ChatMessage.tsx returns match (reads localStorage)
|
||||
- grep "useVoiceMode" ui/src/components/ChatPanel.tsx returns match
|
||||
- grep "voiceMode" ui/src/components/ChatPanel.tsx returns match (voiceMode is passed in the startStream calls)
|
||||
- grep "startStream.*voiceMode" ui/src/components/ChatPanel.tsx returns match
|
||||
</acceptance_criteria>
|
||||
<done>ChatInput uses VoiceMicButton (VAD-powered) instead of VoiceRecordButton. VoiceModeToggle shown above input. ChatMessage renders ChatVoiceBadge for voice messages. ChatPanel passes voiceMode to all startStream calls.</done>
|
||||
</task>
|
||||
|
||||
<task type="checkpoint:human-verify" gate="blocking">
|
||||
<name>Task 3: Verify voice flow end-to-end</name>
|
||||
<files>ui/src/components/ChatPanel.tsx</files>
|
||||
<read_first>ui/src/components/ChatPanel.tsx</read_first>
|
||||
<action>
|
||||
Human verification of the complete voice I/O integration. No code changes in this task — all implementation was done in Tasks 1-2. This checkpoint confirms the full voice flow works visually and functionally in the browser.
|
||||
|
||||
What was built across all Phase 37 plans:
|
||||
- VoiceMicButton with VAD auto-stop replacing VoiceRecordButton
|
||||
- VoiceWaveform canvas animation during recording
|
||||
- VoiceModeToggle (Text / Voice In / Full Voice) with nexus-settings persistence
|
||||
- ChatVoiceBadge with collapsible full markdown for voice_full messages
|
||||
- ChatVoicePlayer with play/pause and auto-play from localStorage
|
||||
- voiceMode threaded through ChatPanel -> useStreamingChat -> chatApi -> server chat.ts
|
||||
</action>
|
||||
<verify>
|
||||
<automated>cd /opt/nexus/.claude/worktrees/agent-a009558f && grep -q "VoiceMicButton" ui/src/components/ChatInput.tsx && grep -q "ChatVoiceBadge" ui/src/components/ChatMessage.tsx && grep -q "voiceMode" ui/src/components/ChatPanel.tsx && echo "PASS" || echo "FAIL"</automated>
|
||||
</verify>
|
||||
<acceptance_criteria>
|
||||
- VoiceModeToggle visible above chat input with three pills
|
||||
- Mic button starts recording with waveform animation
|
||||
- Recording auto-stops on silence detection
|
||||
- Transcribed text populates input field
|
||||
- Voice badge appears on agent responses in voice modes
|
||||
- Audio player works for voice_full messages
|
||||
- Auto-play toggle persists across page refresh
|
||||
</acceptance_criteria>
|
||||
<done>End-to-end voice flow verified by human: recording, VAD auto-stop, transcription, voice mode toggle, voice badge, audio playback, and auto-play setting all working correctly.</done>
|
||||
</task>
|
||||
|
||||
</tasks>
|
||||
|
||||
<verification>
|
||||
- VoiceRecordButton fully replaced by VoiceMicButton in ChatInput
|
||||
- VoiceModeToggle renders above chat input
|
||||
- ChatMessage dispatches voice_input and voice_full to ChatVoiceBadge
|
||||
- voiceMode flows: ChatPanel -> useStreamingChat -> chatApi -> server chat.ts
|
||||
- Auto-play reads from localStorage
|
||||
- TypeScript compiles without errors: pnpm --filter @paperclipai/ui typecheck
|
||||
</verification>
|
||||
|
||||
<success_criteria>
|
||||
Complete voice I/O working in browser chat: VAD-powered recording with waveform, auto-stop on silence, voice mode toggle with persistence, voice badge on responses, inline audio player with auto-play setting. User can have a full voice conversation with their agent.
|
||||
</success_criteria>
|
||||
|
||||
<output>
|
||||
After completion, create `.planning/phases/37-web-chat-voice-ui/37-04-SUMMARY.md`
|
||||
</output>
|
||||
Loading…
Add table
Reference in a new issue