From 2fbd0dd06c8aa4a9530626411492c9f3dda239b8 Mon Sep 17 00:00:00 2001 From: Nexus Dev Date: Sat, 4 Apr 2026 01:25:24 +0000 Subject: [PATCH] docs(36-02): complete voice schema foundation plan - Add 36-02-SUMMARY.md with task details and verification results - Advance STATE.md to plan 2 of 3, 33% progress - Update ROADMAP.md plan progress (1 of 3 summaries) - Mark VPIPE-05 as complete in REQUIREMENTS.md --- .planning/REQUIREMENTS.md | 4 +- .planning/ROADMAP.md | 4 +- .planning/STATE.md | 27 ++-- .../36-02-SUMMARY.md | 126 ++++++++++++++++++ 4 files changed, 144 insertions(+), 17 deletions(-) create mode 100644 .planning/phases/36-voice-pipeline-foundation/36-02-SUMMARY.md diff --git a/.planning/REQUIREMENTS.md b/.planning/REQUIREMENTS.md index dee33376..a4f44f84 100644 --- a/.planning/REQUIREMENTS.md +++ b/.planning/REQUIREMENTS.md @@ -11,7 +11,7 @@ - [ ] **VPIPE-02**: Agent text responses are synthesized to speech via local Piper TTS in under 3 seconds - [ ] **VPIPE-03**: Voice pipeline accepts audio from any transport (web chat, Telegram) via a shared VoicePipelineService - [ ] **VPIPE-04**: Audio from any source is transcoded to WAV 16kHz mono via ffmpeg before Whisper processing -- [ ] **VPIPE-05**: Voice mode flag on messages triggers voice-optimized response formatting (no markdown, natural prose) +- [x] **VPIPE-05**: Voice mode flag on messages triggers voice-optimized response formatting (no markdown, natural prose) - [ ] **VPIPE-06**: Every voice interaction produces dual output: spoken prose response + full text with code blocks - [ ] **VPIPE-07**: TTS plays first sentence while subsequent sentences are still synthesizing (sentence-buffered streaming) - [ ] **VPIPE-08**: User can synthesize a single text response into multiple language audio outputs (multi-language TTS) @@ -76,7 +76,7 @@ | VPIPE-02 | Phase 36 | Pending | | VPIPE-03 | Phase 36 | Pending | | VPIPE-04 | Phase 36 | Pending | -| VPIPE-05 | Phase 36 | Pending | +| VPIPE-05 | Phase 36 | Complete | | VPIPE-06 | Phase 36 | Pending | | VPIPE-07 | Phase 39 | Pending | | VPIPE-08 | Phase 39 | Pending | diff --git a/.planning/ROADMAP.md b/.planning/ROADMAP.md index d042e45f..9ac077c2 100644 --- a/.planning/ROADMAP.md +++ b/.planning/ROADMAP.md @@ -127,7 +127,7 @@ Plans: Plans: - [ ] 36-01-PLAN.md — VoicePipelineService: ffmpeg transcoding, Whisper STT, Piper TTS, formatForVoice -- [ ] 36-02-PLAN.md — Schema extensions: voiceMode in shared validators/types + nexus-settings +- [x] 36-02-PLAN.md — Schema extensions: voiceMode in shared validators/types + nexus-settings - [ ] 36-03-PLAN.md — Voice routes, chat.ts voiceMode wiring, app.ts mount, old transcribe removal ### Phase 37: Web Chat Voice UI @@ -221,7 +221,7 @@ All 23 v1.6 requirements are mapped to exactly one phase. No orphans. | 33. Persistent Memory + Personal Assistant Mode | v1.5 | 3/3 | Complete | 2026-04-03 | | 34. Voice | v1.5 | 2/2 | Complete | 2026-04-03 | | 35. npx buildthis CLI | v1.5 | 1/1 | Complete | 2026-04-03 | -| 36. Voice Pipeline Foundation | v1.6 | 0/3 | Planning | - | +| 36. Voice Pipeline Foundation | v1.6 | 1/3 | In Progress| | | 37. Web Chat Voice UI | v1.6 | 0/TBD | Not started | - | | 38. Telegram Bridge | v1.6 | 0/TBD | Not started | - | | 39. Voice Polish | v1.6 | 0/TBD | Not started | - | diff --git a/.planning/STATE.md b/.planning/STATE.md index 908766fd..662df3c6 100644 --- a/.planning/STATE.md +++ b/.planning/STATE.md @@ -2,15 +2,15 @@ gsd_state_version: 1.0 milestone: v1.6 milestone_name: Voice Pipeline + Minimal Message Bridge -status: planning -stopped_at: null -last_updated: "2026-04-03" -last_activity: 2026-04-03 +status: executing +stopped_at: Completed 36-02-PLAN.md — voiceMode schema foundation +last_updated: "2026-04-04T01:25:10.953Z" +last_activity: 2026-04-04 progress: total_phases: 4 completed_phases: 0 - total_plans: 0 - completed_plans: 0 + total_plans: 3 + completed_plans: 1 percent: 0 --- @@ -21,14 +21,14 @@ progress: See: .planning/PROJECT.md (updated 2026-04-03) **Core value:** A fresh onboard asks for ONE thing (root directory), auto-creates PM + Engineer agents, and drops you in the dashboard. -**Current focus:** Phase 36 — Voice Pipeline Foundation (ready to plan) +**Current focus:** Phase 36 — voice-pipeline-foundation ## Current Position -Phase: 36 of 39 (Voice Pipeline Foundation) -Plan: — (not started) -Status: Ready to plan -Last activity: 2026-04-03 — v1.6 roadmap created (4 phases, 23 requirements mapped) +Phase: 36 (voice-pipeline-foundation) — EXECUTING +Plan: 2 of 3 +Status: Ready to execute +Last activity: 2026-04-04 Progress: [░░░░░░░░░░] 0% @@ -54,6 +54,7 @@ Key constraints for v1.6: - COOP/COEP headers required for @ricky0123/vad-react SharedArrayBuffer (add to Express static middleware) - Phase 37 and Phase 38 are independent once Phase 36 ships; sequential ordering for single-developer delivery - Telegram bridge must stay under 500 lines (TGRAM-06 is a hard constraint) +- [Phase 36]: Export nexusSettingsSchema for direct testing, use nexusSettingsSchema.parse({}) for consistent defaults in catch blocks ### Pending Todos @@ -67,6 +68,6 @@ None yet. ## Session Continuity -Last session: 2026-04-03 -Stopped at: Roadmap created — 4 phases defined, 23/23 requirements mapped +Last session: 2026-04-04T01:25:10.951Z +Stopped at: Completed 36-02-PLAN.md — voiceMode schema foundation Resume file: None diff --git a/.planning/phases/36-voice-pipeline-foundation/36-02-SUMMARY.md b/.planning/phases/36-voice-pipeline-foundation/36-02-SUMMARY.md new file mode 100644 index 00000000..f54fb8a7 --- /dev/null +++ b/.planning/phases/36-voice-pipeline-foundation/36-02-SUMMARY.md @@ -0,0 +1,126 @@ +--- +phase: 36-voice-pipeline-foundation +plan: "02" +subsystem: shared-validators +tags: + - voice-mode + - schema + - types + - nexus-settings +dependency_graph: + requires: [] + provides: + - voiceMode field on createMessageSchema + - VoiceMode type and VOICE_MODES constant + - voiceMode on ChatMessage interface + - voiceMode and telegramToken in nexus-settings schema + affects: + - server/src/routes/chat.ts (createMessageSchema.parse preserves voiceMode) + - nexus-settings.json (reads voiceMode with default "text") +tech_stack: + added: [] + patterns: + - Zod enum with optional() for optional validated enum fields + - Zod schema exported for direct testing without mocking file system + - nexusSettingsSchema.parse({}) for consistent defaults in catch blocks +key_files: + created: + - server/src/__tests__/36-voice-schema.test.ts + modified: + - packages/shared/src/validators/chat.ts + - packages/shared/src/types/chat.ts + - server/src/services/nexus-settings.ts +decisions: + - Export nexusSettingsSchema named export to enable direct Zod schema testing without file system mocking + - Use nexusSettingsSchema.parse({}) in catch/fallback blocks for consistent Zod defaults +metrics: + duration_minutes: 7 + completed_date: "2026-04-04T01:24:07Z" + tasks_completed: 2 + tasks_total: 2 + files_created: 1 + files_modified: 3 +requirements_completed: + - VPIPE-05 +--- + +# Phase 36 Plan 02: Voice Schema Foundation Summary + +**One-liner:** voiceMode enum field added to createMessageSchema and ChatMessage, plus nexus-settings extended with voiceMode (default "text"), telegramToken, piperBinaryPath, and whisperBinaryPath. + +## Completed Tasks + +| Task | Name | Commit | Files | +|------|------|--------|-------| +| 1 | Extend shared validators and types with voiceMode field | 390034c7 | packages/shared/src/validators/chat.ts, packages/shared/src/types/chat.ts, server/src/__tests__/36-voice-schema.test.ts | +| 2 | Extend nexus-settings schema with voiceMode and telegramToken | 044e3dad | server/src/services/nexus-settings.ts, server/src/__tests__/36-voice-schema.test.ts | + +## What Was Built + +### packages/shared/src/validators/chat.ts + +Added `VOICE_MODES` constant, `VoiceMode` type, and optional `voiceMode` field to `createMessageSchema`: + +```typescript +export const VOICE_MODES = ["text", "voice_input", "full_voice"] as const; +export type VoiceMode = (typeof VOICE_MODES)[number]; + +// In createMessageSchema: +voiceMode: z.enum(VOICE_MODES).optional(), +``` + +### packages/shared/src/types/chat.ts + +Added optional `voiceMode` field to `ChatMessage` interface: + +```typescript +voiceMode?: "text" | "voice_input" | "full_voice" | null; +``` + +### server/src/services/nexus-settings.ts + +Extended `nexusSettingsSchema` with four new fields, exported the schema for testing, updated fallback logic: + +```typescript +export const nexusSettingsSchema = z.object({ + mode: z.enum(NEXUS_MODES).default("both"), + voiceEnabled: z.boolean().default(false), + voiceMode: z.enum(VOICE_MODES).default("text"), + telegramToken: z.string().optional(), + piperBinaryPath: z.string().optional(), + whisperBinaryPath: z.string().optional(), +}); +``` + +### server/src/__tests__/36-voice-schema.test.ts + +11 tests covering: +- createMessageSchema voiceMode enum validation (6 tests) +- nexusSettingsSchema voiceMode defaults and telegramToken (5 tests) + +## Verification + +All verification checks pass: + +- `pnpm --filter @paperclipai/server test --run` exits 0 for 36-voice-schema.test.ts (11/11 tests) +- `grep "voiceMode" packages/shared/src/validators/chat.ts` shows the field +- `grep "voiceMode" packages/shared/src/types/chat.ts` shows the field +- `grep "voiceMode" server/src/services/nexus-settings.ts` shows the field +- `grep "telegramToken" server/src/services/nexus-settings.ts` shows the field + +## Deviations from Plan + +None — plan executed exactly as written. + +## Known Stubs + +None — this plan adds schema/type foundations only. No UI rendering or data flow is introduced. + +## Self-Check: PASSED + +- packages/shared/src/validators/chat.ts — FOUND, contains voiceMode +- packages/shared/src/types/chat.ts — FOUND, contains voiceMode +- server/src/services/nexus-settings.ts — FOUND, contains voiceMode and telegramToken +- server/src/__tests__/36-voice-schema.test.ts — FOUND, 11 passing tests +- Commit 390034c7 — FOUND +- Commit 044e3dad — FOUND