diff --git a/.planning/ROADMAP.md b/.planning/ROADMAP.md index 8391cdf5..e66a1132 100644 --- a/.planning/ROADMAP.md +++ b/.planning/ROADMAP.md @@ -165,7 +165,11 @@ Plans: 2. A user can request the same agent response as audio in both English and Danish; both OGG files are generated and available for playback without a second agent call 3. On a fresh install, the onboarding hardware probe reports whether Whisper STT and Piper TTS are runnable on the detected hardware tier 4. The onboarding voice step activates (showing enable/skip options) only when the hardware probe confirms sufficient capability; on hardware below threshold it shows a capability note and skips to the next step -**Plans**: TBD +**Plans**: 2 plans + +Plans: +- [ ] 39-01-PLAN.md — Sentence-buffered TTS streaming + multi-language synthesis +- [ ] 39-02-PLAN.md — Onboarding voice hardware capability probe --- @@ -224,4 +228,4 @@ All 23 v1.6 requirements are mapped to exactly one phase. No orphans. | 36. Voice Pipeline Foundation | v1.6 | 2/3 | Complete | 2026-04-04 | | 37. Web Chat Voice UI | v1.6 | 3/4 | Complete | 2026-04-04 | | 38. Telegram Bridge | v1.6 | 3/3 | Complete | 2026-04-04 | -| 39. Voice Polish | v1.6 | 0/TBD | Not started | - | +| 39. 
Voice Polish | v1.6 | 0/2 | Not started | - | diff --git a/.planning/phases/39-voice-polish/39-01-PLAN.md b/.planning/phases/39-voice-polish/39-01-PLAN.md new file mode 100644 index 00000000..28d8fd0c --- /dev/null +++ b/.planning/phases/39-voice-polish/39-01-PLAN.md @@ -0,0 +1,238 @@ +--- +phase: 39-voice-polish +plan: 01 +type: execute +wave: 1 +depends_on: [] +files_modified: + - server/src/services/voice-pipeline.ts + - server/src/routes/voice.ts + - ui/src/components/ChatVoicePlayer.tsx + - server/src/__tests__/39-sentence-streaming.test.ts +autonomous: true +requirements: [VPIPE-07, VPIPE-08] + +must_haves: + truths: + - "First sentence audio begins playing before full response finishes synthesizing" + - "User can request same text synthesized in multiple languages simultaneously" + - "Existing single-language synthesize endpoint still works unchanged" + artifacts: + - path: "server/src/services/voice-pipeline.ts" + provides: "synthesizeSentenceStream generator + synthesizeMultiLang method" + contains: "synthesizeSentenceStream" + - path: "server/src/routes/voice.ts" + provides: "POST /api/synthesize/stream SSE endpoint + POST /api/synthesize/multi-lang" + contains: "synthesize/stream" + - path: "ui/src/components/ChatVoicePlayer.tsx" + provides: "Streaming audio playback via sentence-buffered fetch" + contains: "EventSource\\|ReadableStream\\|sentence" + - path: "server/src/__tests__/39-sentence-streaming.test.ts" + provides: "Tests for sentence splitting, multi-lang, and streaming" + contains: "describe.*sentence" + key_links: + - from: "ui/src/components/ChatVoicePlayer.tsx" + to: "/api/synthesize/stream" + via: "fetch POST with ReadableStream (SSE parsed manually)" + pattern: "synthesize/stream" + - from: "server/src/routes/voice.ts" + to: "server/src/services/voice-pipeline.ts" + via: "synthesizeSentenceStream generator" + pattern: "synthesizeSentenceStream" +--- + + +Sentence-buffered TTS streaming and multi-language synthesis. 
+ +Purpose: Voice responses begin playing before full synthesis completes (under 1s to first audio), and users can synthesize the same response in multiple languages without a second agent call. +Output: Streaming synthesize endpoint, multi-language endpoint, updated ChatVoicePlayer with progressive playback. + + + +@$HOME/.claude/get-shit-done/workflows/execute-plan.md +@$HOME/.claude/get-shit-done/templates/summary.md + + + +@.planning/ROADMAP.md +@.planning/REQUIREMENTS.md +@.planning/phases/39-voice-polish/39-CONTEXT.md + +@server/src/services/voice-pipeline.ts +@server/src/routes/voice.ts +@ui/src/components/ChatVoicePlayer.tsx + + + + +From server/src/services/voice-pipeline.ts: +```typescript +// synthesize already does sentence splitting internally: +// const sentences = text.split(/(?<=[.!?])\s+/).filter((s) => s.length > 0); +// Currently concatenates all sentence buffers before returning. +// Need to yield each sentence buffer as it completes. + +export function voicePipelineService(): { + transcribe(buffer: Buffer, format: "webm" | "ogg" | "wav"): Promise<{ text: string; language?: string }>; + synthesize(text: string, voiceId?: string): Promise<Buffer>; + formatForVoice(text: string): string; + transcodeToWav16k(inputBuffer: Buffer, inputFormat: string): Promise<Buffer>; +} +``` + +From server/src/routes/voice.ts: +```typescript +// POST /api/synthesize — takes { text, voiceId }, returns audio/wav buffer +// POST /api/transcribe — takes multipart audio, returns { text, language? 
} +``` + +From ui/src/components/ChatVoicePlayer.tsx: +```typescript +interface ChatVoicePlayerProps { + text: string; + autoPlay?: boolean; +} +// Currently fetches full audio blob from POST /api/synthesize, then plays +``` + + + + + + + Task 1: Sentence-buffered synthesis + multi-language TTS in voice pipeline and routes + server/src/services/voice-pipeline.ts, server/src/routes/voice.ts, server/src/__tests__/39-sentence-streaming.test.ts + + - server/src/services/voice-pipeline.ts (full file — understand synthesize internals) + - server/src/routes/voice.ts (full file — understand route patterns) + - server/src/routes/authz.ts (assertBoard pattern) + + + - Test: splitSentences("Hello world. How are you? I am fine.") returns ["Hello world.", "How are you?", "I am fine."] + - Test: splitSentences("Dr. Smith went to D.C. He liked it.") returns ["Dr. Smith went to D.C.", "He liked it."] (abbreviation-aware) + - Test: synthesizeSentenceStream yields Buffer chunks one per sentence + - Test: synthesizeMultiLang(text, ["en_US-lessac-medium", "da_DK-talesyntese-medium"]) returns Map with two Buffer entries + + + 1. In voice-pipeline.ts, extract sentence splitting into an exported `splitSentences(text: string): string[]` function. Start from the current regex: split on /(?<=[.!?])\s+/, filter empty. Note that the bare regex also splits after "Dr.", so add abbreviation handling (e.g. do not break after titles such as "Dr.") so that the abbreviation-aware test above passes. Keep existing synthesize() working by calling splitSentences internally. + + 2. Add `async *synthesizeSentenceStream(text: string, voiceId?: string): AsyncGenerator<{ index: number; total: number; audio: Buffer }>` method: + - Call splitSentences(text) to get sentences array + - For each sentence, call piper (same as current synthesize logic), yield { index, total: sentences.length, audio: audioBuffer } immediately + - This gives the consumer each sentence's audio as soon as it is ready + + 3. 
Add `async synthesizeMultiLang(text: string, voiceIds: string[]): Promise<Map<string, Buffer>>` method: + - For each voiceId, call existing synthesize(text, voiceId) in parallel via Promise.all + - Return Map<string, Buffer> keyed by voiceId + + 4. Update the return signature of voicePipelineService() to include the new methods. + + 5. In voice.ts, add streaming endpoint: + `POST /api/synthesize/stream` — accepts { text: string, voiceId?: string } + - assertBoard(req) + - Set headers: Content-Type: text/event-stream, Cache-Control: no-cache, Connection: keep-alive + - Iterate synthesizeSentenceStream, for each chunk: write SSE `data: { "index": N, "total": M, "audio": "<base64>" }\n\n` + - On completion: write `data: { "done": true }\n\n` then res.end() + + 6. In voice.ts, add multi-language endpoint: + `POST /api/synthesize/multi-lang` — accepts { text: string, voiceIds: string[] } + - assertBoard(req) + - Validate voiceIds is array with 1-5 entries + - Call synthesizeMultiLang, return JSON: { results: [{ voiceId, audio: base64 }] } + + 7. Write tests in 39-sentence-streaming.test.ts: + - Test splitSentences with basic and edge cases + - Test synthesizeSentenceStream yields correct number of chunks (mock piper execFile) + - Test synthesizeMultiLang returns correct number of entries (mock piper) + + + cd /opt/nexus && npx vitest run server/src/__tests__/39-sentence-streaming.test.ts --reporter=verbose 2>&1 | tail -30 + + + - grep -q "splitSentences" server/src/services/voice-pipeline.ts + - grep -q "synthesizeSentenceStream" server/src/services/voice-pipeline.ts + - grep -q "synthesizeMultiLang" server/src/services/voice-pipeline.ts + - grep -q "synthesize/stream" server/src/routes/voice.ts + - grep -q "synthesize/multi-lang" server/src/routes/voice.ts + - grep -q "text/event-stream" server/src/routes/voice.ts + - test -f server/src/__tests__/39-sentence-streaming.test.ts + + + - splitSentences exported and tested + - synthesizeSentenceStream yields per-sentence audio chunks via AsyncGenerator + - synthesizeMultiLang 
synthesizes same text in N languages in parallel + - POST /api/synthesize/stream sends SSE with base64 audio per sentence + - POST /api/synthesize/multi-lang returns array of { voiceId, audio } pairs + - Existing POST /api/synthesize unchanged (backward compatible) + - All tests pass + + + + + Task 2: ChatVoicePlayer sentence-buffered streaming playback + ui/src/components/ChatVoicePlayer.tsx + + - ui/src/components/ChatVoicePlayer.tsx (full file — current playback implementation) + - ui/src/components/ChatMessage.tsx (how ChatVoicePlayer is used) + + + 1. Refactor ChatVoicePlayer to support streaming playback mode: + - Add a `streaming` prop (default true) to ChatVoicePlayerProps + - When streaming=true, call POST /api/synthesize/stream with fetch; EventSource cannot be used here because it only supports GET + - Use fetch with { method: "POST", body, headers } and read response.body as a ReadableStream, parsing SSE lines manually + + 2. Streaming playback logic: + - Maintain a queue of audio chunks (base64-decoded from SSE data) + - On first chunk received: decode base64 to ArrayBuffer, create Blob with audio/wav type, create object URL, set as audio src, begin playback immediately — this satisfies the "under 1 second" requirement + - On subsequent chunks: queue them. When current audio `onEnded`, take the next chunk from the front of the queue, set as new src, play + - Show progress: "Playing 1/3..." in the UI + + 3. Sentence progress indicator: + - Display "Sentence N of M" text when streaming is active + - Show a small progress bar or dot indicator below the play button + + 4. Fallback: when streaming=false or if SSE connection fails, fall back to existing full-fetch behavior (current implementation) + + 5. Clean up: revoke all object URLs on unmount or when new text arrives + + 6. 
Keep the existing play/pause controls working for both modes + + + cd /opt/nexus && npx tsc --noEmit --project ui/tsconfig.json 2>&1 | tail -20 + + + - grep -q "synthesize/stream" ui/src/components/ChatVoicePlayer.tsx + - grep -q "ReadableStream\|getReader\|TextDecoder" ui/src/components/ChatVoicePlayer.tsx + - grep -q "queue\|Queue\|audioQueue" ui/src/components/ChatVoicePlayer.tsx + - grep -q "Sentence.*of\|sentence.*progress\|Playing.*of" ui/src/components/ChatVoicePlayer.tsx + + + - ChatVoicePlayer connects to /api/synthesize/stream via fetch POST + ReadableStream + - First sentence audio begins playing as soon as first SSE chunk arrives + - Subsequent sentences auto-play in sequence from queue + - Progress indicator shows current sentence position + - Falls back to full-fetch on stream error + - TypeScript compiles without errors + + + + + + +1. TypeScript compiles: `npx tsc --noEmit` in both server and ui +2. Tests pass: `npx vitest run server/src/__tests__/39-sentence-streaming.test.ts` +3. Existing synthesize endpoint still works: grep confirms original POST /api/synthesize route unchanged +4. SSE endpoint exists: grep confirms text/event-stream header in voice.ts +5. 
Multi-lang endpoint exists: grep confirms synthesize/multi-lang in voice.ts + + + +- VPIPE-07: First sentence plays while subsequent sentences still synthesizing (sentence-buffered SSE streaming) +- VPIPE-08: Single text can be synthesized in multiple languages via /api/synthesize/multi-lang +- Backward compatible: existing /api/synthesize POST unchanged +- All tests green + + + +After completion, create `.planning/phases/39-voice-polish/39-01-SUMMARY.md` + diff --git a/.planning/phases/39-voice-polish/39-02-PLAN.md b/.planning/phases/39-voice-polish/39-02-PLAN.md new file mode 100644 index 00000000..4e5ee1dd --- /dev/null +++ b/.planning/phases/39-voice-polish/39-02-PLAN.md @@ -0,0 +1,255 @@ +--- +phase: 39-voice-polish +plan: 02 +type: execute +wave: 1 +depends_on: [] +files_modified: + - server/src/services/hardware.ts + - server/src/routes/hardware.ts + - ui/src/components/onboarding/VoiceStep.tsx + - ui/src/components/NexusOnboardingWizard.tsx + - ui/src/hooks/useHardwareInfo.ts + - server/src/__tests__/39-voice-hardware-probe.test.ts +autonomous: true +requirements: [ONBRD-01, ONBRD-02] + +must_haves: + truths: + - "Onboarding hardware probe reports whether Whisper STT is runnable on detected hardware" + - "Onboarding hardware probe reports whether Piper TTS is runnable on detected hardware" + - "VoiceStep shows enable/skip when hardware is sufficient" + - "VoiceStep shows capability note and auto-skips or shows skip-only when hardware is insufficient" + artifacts: + - path: "server/src/services/hardware.ts" + provides: "voiceCapability probe in HardwareInfo" + contains: "whisperAvailable" + - path: "server/src/routes/hardware.ts" + provides: "voice capability data in /system/providers response" + contains: "voiceCapability" + - path: "ui/src/components/onboarding/VoiceStep.tsx" + provides: "Hardware-aware voice step with conditional enable/skip" + contains: "whisperAvailable\\|piperAvailable\\|voiceCapability" + - path: 
"server/src/__tests__/39-voice-hardware-probe.test.ts" + provides: "Tests for voice capability detection" + contains: "describe.*voice.*capability" + key_links: + - from: "ui/src/components/onboarding/VoiceStep.tsx" + to: "ui/src/hooks/useHardwareInfo.ts" + via: "voiceCapability prop from hardware info" + pattern: "voiceCapability" + - from: "ui/src/components/NexusOnboardingWizard.tsx" + to: "ui/src/components/onboarding/VoiceStep.tsx" + via: "passes hardware voiceCapability as prop" + pattern: "voiceCapability" +--- + + +Onboarding voice hardware detection — probe for Whisper STT and Piper TTS capability during onboarding and gate the voice enable step accordingly. + +Purpose: New installs detect whether the machine can run STT/TTS before offering voice features, preventing users from enabling voice on incapable hardware. +Output: Extended hardware probe with voice capability, updated VoiceStep with hardware-aware UI. + + + +@$HOME/.claude/get-shit-done/workflows/execute-plan.md +@$HOME/.claude/get-shit-done/templates/summary.md + + + +@.planning/ROADMAP.md +@.planning/REQUIREMENTS.md +@.planning/phases/39-voice-polish/39-CONTEXT.md + +@server/src/services/hardware.ts +@server/src/routes/hardware.ts +@ui/src/components/onboarding/VoiceStep.tsx +@ui/src/components/NexusOnboardingWizard.tsx +@ui/src/hooks/useHardwareInfo.ts + + +From server/src/services/hardware.ts: +```typescript +export type HardwareTier = "gpu" | "apple_silicon" | "cpu_only"; + +export interface HardwareInfo { + totalGb: number; + freeGb: number; + usableGb: number; + platform: NodeJS.Platform; + gpuName: string | null; + gpuVramGb: number | null; + unifiedMemory: boolean; + hardwareTier: HardwareTier; + cpuModel: string | null; +} + +export function hardwareService(): { detect(): Promise } +``` + +From ui/src/components/onboarding/VoiceStep.tsx: +```typescript +interface VoiceStepProps { + onEnable: () => void; + onSkip: () => void; +} +// Currently only checks for microphone via 
navigator.mediaDevices +// Does NOT check if Whisper/Piper binaries are available on server +``` + +From ui/src/hooks/useHardwareInfo.ts: +```typescript +// Hook that fetches GET /system/providers and returns HardwareInfo +// Used by NexusOnboardingWizard.tsx +``` + + + + + + + Task 1: Voice capability probe in hardware service and route + server/src/services/hardware.ts, server/src/routes/hardware.ts, server/src/__tests__/39-voice-hardware-probe.test.ts + + - server/src/services/hardware.ts (full file — understand detect() and HardwareInfo) + - server/src/routes/hardware.ts (full file — understand route patterns) + - server/src/services/voice-pipeline.ts (lines 76-125 — understand whisper/piper detection patterns) + + + - Test: detectVoiceCapability() returns { whisperAvailable: true, piperAvailable: true } when both binaries resolve via execFile --help + - Test: detectVoiceCapability() returns { whisperAvailable: false, piperAvailable: false } when both binaries throw ENOENT + - Test: detectVoiceCapability() returns { whisperAvailable: true, piperAvailable: false } when only whisper is found + - Test: Hardware tier "cpu_only" with < 4GB free RAM sets voiceTierSufficient to false + - Test: Hardware tier "apple_silicon" with >= 8GB RAM sets voiceTierSufficient to true + + + 1. In hardware.ts, add a `VoiceCapability` interface: + ```typescript + export interface VoiceCapability { + whisperAvailable: boolean; + piperAvailable: boolean; + voiceTierSufficient: boolean; // true if hardwareTier is "gpu" or "apple_silicon", OR "cpu_only" with >= 4GB free + } + ``` + + 2. Extend HardwareInfo interface with `voiceCapability: VoiceCapability`. + + 3. Add `async detectVoiceCapability(): Promise<VoiceCapability>` to hardwareService: + - Probe whisper-cpp: try `execFile("whisper-cpp", ["--help"])` with 2s timeout. If resolves → whisperAvailable=true. If ENOENT → try `execFile("whisper", ["--help"])` as fallback. Both fail → false. + - Probe piper: try `execFile("piper", ["--help"])` with 2s timeout. 
If resolves → piperAvailable=true. Catch → false. + - voiceTierSufficient: true if hardwareTier is "apple_silicon" or "gpu", OR if "cpu_only" with freeGb >= 4 + - Use execFile from node:child_process with promisify pattern (or the existing execFileAsync if extracted) + + 4. Call detectVoiceCapability() inside detect() AFTER the existing hardware detection, add result to HardwareInfo. Use a separate 3s timeout to avoid slowing down hardware detection if voice probes hang. + + 5. In hardware.ts route: no changes needed — it already returns the full HardwareInfo object from detect(), so voiceCapability will be included automatically. + + 6. Write tests in 39-voice-hardware-probe.test.ts: + - Mock execFile (child_process) to test whisper/piper detection + - Test voiceTierSufficient logic for each hardware tier + - Test that detectVoiceCapability timeout does not exceed 3s + + + cd /opt/nexus && npx vitest run server/src/__tests__/39-voice-hardware-probe.test.ts --reporter=verbose 2>&1 | tail -30 + + + - grep -q "VoiceCapability" server/src/services/hardware.ts + - grep -q "whisperAvailable" server/src/services/hardware.ts + - grep -q "piperAvailable" server/src/services/hardware.ts + - grep -q "voiceTierSufficient" server/src/services/hardware.ts + - grep -q "voiceCapability" server/src/services/hardware.ts + - test -f server/src/__tests__/39-voice-hardware-probe.test.ts + + + - HardwareInfo includes voiceCapability with whisperAvailable, piperAvailable, voiceTierSufficient + - Binary detection probes whisper-cpp/whisper and piper with 2s timeout each + - voiceTierSufficient is true for apple_silicon/gpu, or cpu_only with >= 4GB free RAM + - GET /system/providers response now includes voiceCapability object + - All tests pass + + + + + Task 2: VoiceStep hardware-aware UI with conditional enable/skip + ui/src/components/onboarding/VoiceStep.tsx, ui/src/components/NexusOnboardingWizard.tsx, ui/src/hooks/useHardwareInfo.ts + + - ui/src/components/onboarding/VoiceStep.tsx 
(full file) + - ui/src/components/NexusOnboardingWizard.tsx (full file — understand step 4 voice wiring) + - ui/src/hooks/useHardwareInfo.ts (full file — understand HardwareInfo type on client) + + + 1. Update useHardwareInfo.ts: ensure the TypeScript type for hardware info includes the new voiceCapability field. Add: + ```typescript + interface VoiceCapability { + whisperAvailable: boolean; + piperAvailable: boolean; + voiceTierSufficient: boolean; + } + ``` + Add `voiceCapability?: VoiceCapability` to the HardwareInfo type used in the hook. The "?" makes it backward-compatible if the server hasn't been updated yet. + + 2. Update VoiceStep props to accept voice capability: + ```typescript + interface VoiceStepProps { + onEnable: () => void; + onSkip: () => void; + voiceCapability?: { + whisperAvailable: boolean; + piperAvailable: boolean; + voiceTierSufficient: boolean; + }; + } + ``` + + 3. Update VoiceStep rendering logic: + - If voiceCapability is undefined (loading/missing): show current behavior (mic check only) + - If voiceCapability.voiceTierSufficient === false: show capability note ("Your hardware may not support voice features. Voice requires at least 4GB free RAM."), show Skip button only (no Enable), do NOT auto-skip — let user read the note + - If voiceCapability.whisperAvailable && voiceCapability.piperAvailable: show green checkmark next to STT and TTS labels ("Whisper detected", "Piper detected"), show Enable + Skip buttons + - If whisperAvailable but NOT piperAvailable: show checkmark for STT, warning for TTS ("Piper not found — install piper for voice output"), still allow Enable (voice input will work, output won't) + - If neither available but tier is sufficient: show note "Install whisper-cpp and piper for voice features", show Skip button, dim the Enable button but keep it clickable (user may install later) + + 4. 
In NexusOnboardingWizard.tsx, pass voiceCapability to VoiceStep: + - hardwareInfo already comes from useHardwareInfo hook + - Pass `voiceCapability={hardwareInfo?.voiceCapability}` to VoiceStep in step 4 + + 5. Keep existing microphone detection in VoiceStep — it checks client-side mic availability which is complementary to server-side binary detection. + + + cd /opt/nexus && npx tsc --noEmit --project ui/tsconfig.json 2>&1 | tail -20 + + + - grep -q "voiceCapability" ui/src/components/onboarding/VoiceStep.tsx + - grep -q "whisperAvailable" ui/src/components/onboarding/VoiceStep.tsx + - grep -q "piperAvailable" ui/src/components/onboarding/VoiceStep.tsx + - grep -q "voiceTierSufficient" ui/src/components/onboarding/VoiceStep.tsx + - grep -q "voiceCapability" ui/src/components/NexusOnboardingWizard.tsx + - grep -q "VoiceCapability" ui/src/hooks/useHardwareInfo.ts + + + - VoiceStep accepts voiceCapability prop and renders conditionally based on hardware detection + - Sufficient hardware + binaries present: shows enable/skip with green checkmarks + - Insufficient hardware: shows capability note and skip-only + - Missing binaries on sufficient hardware: shows install note with dimmed enable + - NexusOnboardingWizard passes voiceCapability from hardware probe to VoiceStep + - TypeScript compiles without errors + + + + + + +1. TypeScript compiles: `npx tsc --noEmit` for both server and ui +2. Tests pass: `npx vitest run server/src/__tests__/39-voice-hardware-probe.test.ts` +3. VoiceStep renders hardware-aware UI: grep confirms voiceCapability, whisperAvailable, piperAvailable in VoiceStep.tsx +4. 
Wizard wiring: grep confirms voiceCapability prop passed in NexusOnboardingWizard.tsx + + + +- ONBRD-01: Onboarding hardware probe reports Whisper STT and Piper TTS capability (whisperAvailable, piperAvailable, voiceTierSufficient in HardwareInfo) +- ONBRD-02: VoiceStep activates enable/skip when hardware is capable, shows capability note when below threshold +- Backward compatible: existing hardware endpoint still works, VoiceStep degrades gracefully if voiceCapability is undefined + + + +After completion, create `.planning/phases/39-voice-polish/39-02-SUMMARY.md` +