From 14ecbf00bbe217dfd6e1002d77ac44e62c77e325 Mon Sep 17 00:00:00 2001 From: Nexus Dev Date: Sat, 11 Apr 2026 13:22:48 +0000 Subject: [PATCH] feat(nexus): VoiceContext for phase 14 voice globalization Lifts MediaStream, recording state, transcription buffer, and queue for non-Assistant captures out of ChatInput's internal VoiceMicButton. The provider owns the POST /api/transcribe fetch (v1.6 pipeline, unchanged), exposes idle/listening/speaking state to the top-strip GlobalMicButton, and queues transcripts captured away from /assistant for PersonalAssistant to drain on mount. Per spec sections 4.2 (mic states), 5.5 (voice from non-Assistant modes), and 10.3 (voice as global affordance). Tests use manual createRoot + act with a mocked getUserMedia injector to stay deterministic in jsdom. Co-Authored-By: Claude Opus 4.6 (1M context) --- ui/src/context/VoiceContext.test.tsx | 192 ++++++++++++++++ ui/src/context/VoiceContext.tsx | 315 +++++++++++++++++++++++++++ 2 files changed, 507 insertions(+) create mode 100644 ui/src/context/VoiceContext.test.tsx create mode 100644 ui/src/context/VoiceContext.tsx diff --git a/ui/src/context/VoiceContext.test.tsx b/ui/src/context/VoiceContext.test.tsx new file mode 100644 index 00000000..579c6bdf --- /dev/null +++ b/ui/src/context/VoiceContext.test.tsx @@ -0,0 +1,192 @@ +// @vitest-environment jsdom + +import { act } from "react"; +import { createRoot } from "react-dom/client"; +import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; +import { VoiceProvider, useVoice } from "./VoiceContext"; + +// eslint-disable-next-line @typescript-eslint/no-explicit-any +(globalThis as any).IS_REACT_ACT_ENVIRONMENT = true; + +// Lightweight MediaStream stand-in for jsdom. 
// Only `getTracks()` is implemented because it is the sole member the
// provider calls on the stream directly (MediaRecorder is absent in jsdom).
function makeFakeStream(): MediaStream {
  const tracks = [{ stop: vi.fn() }];
  return {
    getTracks: () => tracks,
  } as unknown as MediaStream;
}

describe("VoiceContext", () => {
  type Voice = ReturnType<typeof useVoice>;

  /** Provider props the tests are allowed to override. */
  interface ProviderOverrides {
    getUserMedia?: (c: MediaStreamConstraints) => Promise<MediaStream>;
    transcribe?: (blob: Blob) => Promise<string>;
  }

  let container: HTMLDivElement;
  let root: ReturnType<typeof createRoot> | null = null;

  beforeEach(() => {
    container = document.createElement("div");
    document.body.appendChild(container);
    root = null;
  });

  afterEach(() => {
    if (root) {
      act(() => {
        root!.unmount();
      });
      root = null;
    }
    if (container.parentNode) container.remove();
  });

  /** Mounts `Consumer` under a VoiceProvider configured with `props`. */
  function renderWithProvider(
    Consumer: React.FC,
    props: ProviderOverrides = {},
  ) {
    root = createRoot(container);
    act(() => {
      root!.render(
        <VoiceProvider {...props}>
          <Consumer />
        </VoiceProvider>,
      );
    });
  }

  /**
   * Mounts a probe component that records the latest context value on every
   * render and returns a getter for it. The value lives on an object property
   * (not a `let`) so TypeScript does not narrow it to `null` at the call
   * sites, and a missing render fails loudly instead of via `!`.
   */
  function mountProbe(props: ProviderOverrides = {}): () => Voice {
    const latest: { current: Voice | null } = { current: null };
    const Probe = () => {
      latest.current = useVoice();
      return <div>{latest.current.state}</div>;
    };
    renderWithProvider(Probe, props);
    return () => {
      if (!latest.current) throw new Error("probe did not render");
      return latest.current;
    };
  }

  it("starts in idle state with an empty queue", () => {
    const voice = mountProbe();

    expect(voice().state).toBe("idle");
    expect(voice().queue).toEqual([]);
    expect(voice().hasQueuedVoice).toBe(false);
  });

  it("transitions idle → listening when startListening resolves", async () => {
    const fakeStream = makeFakeStream();
    const getUserMedia = vi.fn(async () => fakeStream);
    const voice = mountProbe({ getUserMedia });

    await act(async () => {
      await voice().startListening();
    });

    expect(getUserMedia).toHaveBeenCalledWith({ audio: true });
    expect(voice().state).toBe("listening");
    expect(voice().mediaStream).toBe(fakeStream);
  });

  it("handles getUserMedia rejection by staying idle", async () => {
    const getUserMedia = vi.fn(async (): Promise<MediaStream> => {
      throw new Error("NotAllowedError");
    });
    // The provider reports the denial through console.error; capture it so
    // the expected failure does not pollute the test output.
    const errorSpy = vi.spyOn(console, "error").mockImplementation(() => {});
    try {
      const voice = mountProbe({ getUserMedia });

      await act(async () => {
        await voice().startListening();
      });

      expect(voice().state).toBe("idle");
      expect(errorSpy).toHaveBeenCalled();
    } finally {
      errorSpy.mockRestore();
    }
  });

  it("enqueue adds transcripts to the queue and sets hasQueuedVoice", () => {
    const voice = mountProbe();

    act(() => {
      voice().enqueue("hello world");
    });

    expect(voice().queue).toEqual(["hello world"]);
    expect(voice().hasQueuedVoice).toBe(true);
  });

  it("drainQueue returns all entries and clears the queue", () => {
    const voice = mountProbe();

    act(() => {
      voice().enqueue("one");
      voice().enqueue("two");
    });
    expect(voice().queue.length).toBe(2);

    let drained: string[] = [];
    act(() => {
      drained = voice().drainQueue();
    });

    expect(drained).toEqual(["one", "two"]);
    expect(voice().queue).toEqual([]);
    expect(voice().hasQueuedVoice).toBe(false);
  });

  it("stopListening returns the context to idle and stops stream tracks", async () => {
    const fakeStream = makeFakeStream();
    const getUserMedia = vi.fn(async () => fakeStream);
    const transcribe = vi.fn(async () => "transcribed text");
    const voice = mountProbe({ getUserMedia, transcribe });

    await act(async () => {
      await voice().startListening();
    });
    expect(voice().state).toBe("listening");

    await act(async () => {
      await voice().stopListening();
    });

    expect(voice().state).toBe("idle");
    // getTracks() hands back the same array on every call, so this is the
    // very track object the provider was given.
    expect(fakeStream.getTracks()[0]!.stop).toHaveBeenCalled();
  });

  it("throws when useVoice is used outside a provider", () => {
    const Consumer = () => {
      useVoice();
      return null;
    };
    // React logs the render error before rethrowing; keep the output clean.
    const spy = vi.spyOn(console, "error").mockImplementation(() => {});
    root = createRoot(container);
    expect(() =>
      act(() => {
        root!.render(<Consumer />);
      }),
    ).toThrow(/VoiceProvider/);
    spy.mockRestore();
  });
});
diff --git a/ui/src/context/VoiceContext.tsx b/ui/src/context/VoiceContext.tsx
new file mode 100644
index 00000000..49fb4870
--- /dev/null
+++ b/ui/src/context/VoiceContext.tsx
@@ -0,0 +1,315 @@
import {
  createContext,
  useCallback,
  useContext,
  useMemo,
  useRef,
  useState,
  type ReactNode,
} from "react";

/**
 * VoiceContext — Phase 14 globalization of voice capture.
 *
 * Before Phase 14, the voice capture state (MediaStream, recorder, VAD
 * transcription pipeline) lived inside `ChatInput`'s `VoiceMicButton`.
 * Phase 14 lifts it up so the top-strip `GlobalMicButton` can drive voice
 * from any route, and speech captured while the user is away from
 * `/assistant` is queued for draining when they arrive there.
 *
 * The existing `/api/transcribe` endpoint (server-side Whisper pipeline
 * shipped in v1.6) is consumed unchanged — see `voice-pipeline.ts` on the
 * server. No new backend endpoint is introduced.
+ * + * Spec references: + * - §4.2 GlobalMicButton states (idle / listening / speaking) + * - §5.5 voice routing from non-Assistant modes → queue → Assistant inbox + * - §10.3 voice as global affordance + * - §10.4 single notification surface + */ + +export type VoiceState = "idle" | "listening" | "speaking"; + +export interface VoiceStartOptions { + /** + * When true, the transcript produced by this capture is NOT pushed onto + * the Assistant queue. Used by ChatInput's in-place mic which inserts + * the transcript directly into the textarea via `onTranscript`. + */ + inline?: boolean; + /** + * Optional callback invoked with the final trimmed transcript once + * transcription completes. Called even when `inline` is false, so + * consumers that want to observe speech alongside the queue can. + */ + onTranscript?: (text: string) => void; +} + +export interface VoiceContextValue { + state: VoiceState; + mediaStream: MediaStream | null; + transcript: string; + queue: string[]; + hasQueuedVoice: boolean; + /** + * Start the microphone. If permission is denied the state stays `idle` + * and an error is logged to the console (no toast — Layout owns toasts). + * + * When `options.inline` is true, the resulting transcript is NOT pushed + * onto the Assistant queue — the caller is expected to consume + * `transcript` directly (used by ChatInput's in-place mic). When + * `options.onTranscript` is provided, it is invoked with the final + * transcript alongside (or in place of) the queue push. + */ + startListening: (options?: VoiceStartOptions) => Promise; + /** Stop the current capture and flush the buffered transcript. */ + stopListening: () => Promise; + /** Cycle idle → listening → idle. Used by GlobalMicButton's click. */ + toggleListening: (options?: VoiceStartOptions) => Promise; + /** + * Push a transcript onto the queue. PersonalAssistant drains this on + * mount and sends each entry as a new user message through the existing + * chat streaming pipeline. 
+ */ + enqueue: (text: string) => void; + /** Drain and return every queued transcript, clearing the queue. */ + drainQueue: () => string[]; + /** Clear the transient transcript buffer (not the queue). */ + clearTranscript: () => void; +} + +const VoiceContext = createContext(undefined); + +interface VoiceProviderProps { + children: ReactNode; + /** + * Optional override so tests can avoid hitting a real network. When + * omitted the provider calls `fetch("/api/transcribe", ...)`. + */ + transcribe?: (audio: Blob) => Promise; + /** + * Optional injector for `navigator.mediaDevices.getUserMedia`, used by + * tests that need deterministic stream objects without polluting the + * global navigator. + */ + getUserMedia?: (constraints: MediaStreamConstraints) => Promise; + /** + * When true the provider swallows transient capture errors silently. + * Defaults to the standard behavior (console.error). + */ + silenceErrors?: boolean; +} + +async function defaultTranscribe(audio: Blob): Promise { + const formData = new FormData(); + formData.append("audio", audio, "recording.webm"); + const res = await fetch("/api/transcribe", { + method: "POST", + credentials: "include", + body: formData, + }); + if (!res.ok) { + throw new Error(`transcribe failed: ${res.status}`); + } + const data = (await res.json()) as { text?: string }; + return (data.text ?? 
"").trim(); +} + +export function VoiceProvider({ + children, + transcribe = defaultTranscribe, + getUserMedia, + silenceErrors = false, +}: VoiceProviderProps) { + const [state, setState] = useState("idle"); + const [transcript, setTranscript] = useState(""); + const [queue, setQueue] = useState([]); + const [mediaStream, setMediaStream] = useState(null); + + const recorderRef = useRef(null); + const chunksRef = useRef([]); + const streamRef = useRef(null); + const optionsRef = useRef({}); + // Mirror of `queue` maintained for synchronous reads — drainQueue() + // must return entries immediately so the caller can iterate without + // waiting for a React re-render. + const queueRef = useRef([]); + + const stopTracks = useCallback(() => { + if (streamRef.current) { + streamRef.current.getTracks().forEach((t) => { + try { + t.stop(); + } catch { + // ignore + } + }); + streamRef.current = null; + } + setMediaStream(null); + }, []); + + const enqueue = useCallback((text: string) => { + const trimmed = text.trim(); + if (!trimmed) return; + queueRef.current = [...queueRef.current, trimmed]; + setQueue(queueRef.current); + }, []); + + const drainQueue = useCallback((): string[] => { + const drained = queueRef.current; + queueRef.current = []; + setQueue([]); + return drained; + }, []); + + const clearTranscript = useCallback(() => { + setTranscript(""); + }, []); + + const startListening = useCallback(async ( + options: VoiceStartOptions = {}, + ): Promise => { + if (state !== "idle") return; + optionsRef.current = options; + const acquire = getUserMedia + ? getUserMedia({ audio: true }) + : navigator.mediaDevices?.getUserMedia?.({ audio: true }); + + if (!acquire) { + if (!silenceErrors) { + console.error("[VoiceContext] getUserMedia is not available"); + } + return; + } + + try { + const stream = await acquire; + streamRef.current = stream; + setMediaStream(stream); + chunksRef.current = []; + + // MediaRecorder may not exist in some test environments. 
We only + // instantiate it when available — the stream itself is still tracked + // so VoiceWaveform and other consumers can observe capture state. + if (typeof MediaRecorder !== "undefined") { + const recorder = new MediaRecorder(stream); + recorder.ondataavailable = (e) => { + if (e.data && e.data.size > 0) chunksRef.current.push(e.data); + }; + recorderRef.current = recorder; + recorder.start(); + } + + setState("listening"); + } catch (err) { + if (!silenceErrors) { + console.error("[VoiceContext] Microphone access denied:", err); + } + stopTracks(); + setState("idle"); + } + }, [state, getUserMedia, silenceErrors, stopTracks]); + + const stopListening = useCallback(async (): Promise => { + if (state !== "listening") { + stopTracks(); + return; + } + + setState("speaking"); + + const recorder = recorderRef.current; + recorderRef.current = null; + + const finalize = async () => { + try { + if (chunksRef.current.length > 0) { + const blob = new Blob(chunksRef.current, { + type: recorder?.mimeType || "audio/webm", + }); + chunksRef.current = []; + const text = await transcribe(blob); + if (text) { + setTranscript(text); + const opts = optionsRef.current; + opts.onTranscript?.(text); + if (!opts.inline) { + enqueue(text); + } + } + } + } catch (err) { + if (!silenceErrors) { + console.error("[VoiceContext] Transcription error:", err); + } + } finally { + stopTracks(); + setState("idle"); + } + }; + + if (recorder && recorder.state !== "inactive") { + await new Promise((resolve) => { + recorder.onstop = () => resolve(); + try { + recorder.stop(); + } catch { + resolve(); + } + }); + } + + await finalize(); + }, [state, transcribe, enqueue, silenceErrors, stopTracks]); + + const toggleListening = useCallback(async ( + options: VoiceStartOptions = {}, + ): Promise => { + if (state === "listening") { + await stopListening(); + return; + } + if (state === "idle") { + await startListening(options); + } + }, [state, startListening, stopListening]); + + const value = 
useMemo( + () => ({ + state, + mediaStream, + transcript, + queue, + hasQueuedVoice: queue.length > 0, + startListening, + stopListening, + toggleListening, + enqueue, + drainQueue, + clearTranscript, + }), + [ + state, + mediaStream, + transcript, + queue, + startListening, + stopListening, + toggleListening, + enqueue, + drainQueue, + clearTranscript, + ], + ); + + return {children}; +} + +export function useVoice(): VoiceContextValue { + const ctx = useContext(VoiceContext); + if (!ctx) { + throw new Error("useVoice must be used within a VoiceProvider"); + } + return ctx; +}