feat(nexus): VoiceContext for phase 14 voice globalization

Lifts MediaStream, recording state, transcription buffer, and queue for non-Assistant captures out of ChatInput's internal VoiceMicButton. The provider owns the POST /api/transcribe fetch (v1.6 pipeline, unchanged), exposes idle/listening/speaking state to the top-strip GlobalMicButton, and queues transcripts captured away from /assistant for PersonalAssistant to drain on mount. Per spec sections 4.2 (mic states), 5.5 (voice from non-Assistant modes), and 10.3 (voice as global affordance). Tests use manual createRoot + act with a mocked getUserMedia injector to stay deterministic in jsdom. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-11 13:22:48 +00:00 · 2026-04-11 13:22:48 +00:00 · 14ecbf00bb
commit 14ecbf00bb
parent 4623c8aea0
2 changed files with 507 additions and 0 deletions
--- a/ui/src/context/VoiceContext.test.tsx
+++ b/ui/src/context/VoiceContext.test.tsx
@ -0,0 +1,192 @@
+// @vitest-environment jsdom
+
+import { act } from "react";
+import { createRoot } from "react-dom/client";
+import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
+import { VoiceProvider, useVoice } from "./VoiceContext";
+
+// eslint-disable-next-line @typescript-eslint/no-explicit-any
+(globalThis as any).IS_REACT_ACT_ENVIRONMENT = true;
+
+// Minimal MediaStream double for jsdom: exposes only getTracks(), which
+// returns a single track whose stop() is a vitest spy.
+function makeFakeStream(): MediaStream {
+  const fakeTracks = [{ stop: vi.fn() }];
+  const stub = { getTracks: () => fakeTracks };
+  return stub as unknown as MediaStream;
+}
+
+describe("VoiceContext", () => {
+  let container: HTMLDivElement;
+  let root: ReturnType<typeof createRoot> | null = null;
+
+  // Fresh host element per test; the React root is created lazily by
+  // renderWithProvider (or directly by a test that needs no provider).
+  beforeEach(() => {
+    container = document.createElement("div");
+    document.body.appendChild(container);
+    root = null;
+  });
+
+  // Unmount inside act() so React flushes teardown before the next test.
+  afterEach(() => {
+    const mounted = root;
+    if (mounted) {
+      act(() => {
+        mounted.unmount();
+      });
+      root = null;
+    }
+    if (container.parentNode) container.remove();
+  });
+
+  /**
+   * Mounts `Consumer` under a VoiceProvider with console logging silenced.
+   * `props` lets a test inject a fake `getUserMedia` and/or `transcribe`.
+   */
+  function renderWithProvider(
+    Consumer: React.FC,
+    props: {
+      getUserMedia?: (c: MediaStreamConstraints) => Promise<MediaStream>;
+      transcribe?: (blob: Blob) => Promise<string>;
+    } = {},
+  ) {
+    root = createRoot(container);
+    act(() => {
+      root!.render(
+        <VoiceProvider {...props} silenceErrors>
+          <Consumer />
+        </VoiceProvider>,
+      );
+    });
+  }
+
+  it("starts in idle state with an empty queue", () => {
+    let captured: ReturnType<typeof useVoice> | null = null;
+    const Consumer = () => {
+      captured = useVoice();
+      return <div data-testid="state">{captured.state}</div>;
+    };
+
+    renderWithProvider(Consumer);
+    expect(captured!.state).toBe("idle");
+    expect(captured!.queue).toEqual([]);
+    expect(captured!.hasQueuedVoice).toBe(false);
+  });
+
+  it("transitions idle → listening when startListening resolves", async () => {
+    const fakeStream = makeFakeStream();
+    const getUserMedia = vi.fn(async () => fakeStream);
+
+    let ctxRef: ReturnType<typeof useVoice> | null = null;
+    const Consumer = () => {
+      ctxRef = useVoice();
+      return <div data-testid="state">{ctxRef.state}</div>;
+    };
+
+    renderWithProvider(Consumer, { getUserMedia });
+
+    await act(async () => {
+      await ctxRef!.startListening();
+    });
+
+    expect(getUserMedia).toHaveBeenCalledWith({ audio: true });
+    expect(ctxRef!.state).toBe("listening");
+    expect(ctxRef!.mediaStream).toBe(fakeStream);
+  });
+
+  it("handles getUserMedia rejection by staying idle", async () => {
+    const getUserMedia = vi.fn(async () => {
+      throw new Error("NotAllowedError");
+    });
+
+    let ctxRef: ReturnType<typeof useVoice> | null = null;
+    const Consumer = () => {
+      ctxRef = useVoice();
+      return <div>{ctxRef.state}</div>;
+    };
+
+    renderWithProvider(Consumer, { getUserMedia });
+    await act(async () => {
+      await ctxRef!.startListening();
+    });
+    expect(ctxRef!.state).toBe("idle");
+  });
+
+  it("enqueue adds transcripts to the queue and sets hasQueuedVoice", () => {
+    let ctxRef: ReturnType<typeof useVoice> | null = null;
+    const Consumer = () => {
+      ctxRef = useVoice();
+      return (
+        <div>
+          <span data-testid="count">{ctxRef.queue.length}</span>
+          <span data-testid="has">{ctxRef.hasQueuedVoice ? "y" : "n"}</span>
+        </div>
+      );
+    };
+
+    renderWithProvider(Consumer);
+    act(() => {
+      ctxRef!.enqueue("hello world");
+    });
+    expect(ctxRef!.queue).toEqual(["hello world"]);
+    expect(ctxRef!.hasQueuedVoice).toBe(true);
+  });
+
+  it("drainQueue returns all entries and clears the queue", () => {
+    let ctxRef: ReturnType<typeof useVoice> | null = null;
+    const Consumer = () => {
+      ctxRef = useVoice();
+      return <div>{ctxRef.queue.length}</div>;
+    };
+
+    renderWithProvider(Consumer);
+    act(() => {
+      ctxRef!.enqueue("one");
+      ctxRef!.enqueue("two");
+    });
+    expect(ctxRef!.queue.length).toBe(2);
+
+    let drained: string[] = [];
+    act(() => {
+      drained = ctxRef!.drainQueue();
+    });
+    expect(drained).toEqual(["one", "two"]);
+    expect(ctxRef!.queue).toEqual([]);
+    expect(ctxRef!.hasQueuedVoice).toBe(false);
+  });
+
+  it("stopListening returns the context to idle and stops stream tracks", async () => {
+    // Built inline (not via makeFakeStream) so the test keeps a handle on
+    // the track spy it asserts against.
+    const tracks = [{ stop: vi.fn() }];
+    const fakeStream = {
+      getTracks: () => tracks,
+    } as unknown as MediaStream;
+    const getUserMedia = vi.fn(async () => fakeStream);
+    const transcribe = vi.fn(async () => "transcribed text");
+
+    let ctxRef: ReturnType<typeof useVoice> | null = null;
+    const Consumer = () => {
+      ctxRef = useVoice();
+      return <div>{ctxRef.state}</div>;
+    };
+
+    renderWithProvider(Consumer, { getUserMedia, transcribe });
+    await act(async () => {
+      await ctxRef!.startListening();
+    });
+    expect(ctxRef!.state).toBe("listening");
+
+    await act(async () => {
+      await ctxRef!.stopListening();
+    });
+    expect(ctxRef!.state).toBe("idle");
+    expect(tracks[0]!.stop).toHaveBeenCalled();
+  });
+
+  it("throws when useVoice is used outside a provider", () => {
+    const Consumer = () => {
+      useVoice();
+      return null;
+    };
+    // React reports the render error through console.error; mute it for
+    // the duration of this test only.
+    const spy = vi.spyOn(console, "error").mockImplementation(() => {});
+    root = createRoot(container);
+    try {
+      expect(() =>
+        act(() => {
+          root!.render(<Consumer />);
+        }),
+      ).toThrow(/VoiceProvider/);
+    } finally {
+      // Restore even when the assertion fails so the muted console.error
+      // does not leak into later tests.
+      spy.mockRestore();
+    }
+  });
+});
--- a/ui/src/context/VoiceContext.tsx
+++ b/ui/src/context/VoiceContext.tsx
@ -0,0 +1,315 @@
+import {
+  createContext,
+  useCallback,
+  useContext,
+  useEffect,
+  useMemo,
+  useRef,
+  useState,
+  type ReactNode,
+} from "react";
+
+/**
+ * VoiceContext — Phase 14 globalization of voice capture.
+ *
+ * Before Phase 14, the voice capture state (MediaStream, recorder, VAD
+ * transcription pipeline) lived inside `ChatInput`'s `VoiceMicButton`.
+ * Phase 14 lifts it up so the top-strip `GlobalMicButton` can drive voice
+ * from any route, and speech captured while the user is away from
+ * `/assistant` is queued for draining when they arrive there.
+ *
+ * The existing `/api/transcribe` endpoint (server-side Whisper pipeline
+ * shipped in v1.6) is consumed unchanged — see `voice-pipeline.ts` on the
+ * server. No new backend endpoint is introduced.
+ *
+ * Spec references:
+ *   - §4.2 GlobalMicButton states (idle / listening / speaking)
+ *   - §5.5 voice routing from non-Assistant modes → queue → Assistant inbox
+ *   - §10.3 voice as global affordance
+ *   - §10.4 single notification surface
+ */
+
+/**
+ * Capture lifecycle exposed to consumers: `idle` (no capture in progress),
+ * `listening` (a microphone stream has been acquired), and `speaking` (set
+ * by `stopListening` while the stopped capture is finalized and transcribed).
+ */
+export type VoiceState = "idle" | "listening" | "speaking";
+
+/** Per-capture options accepted by `startListening` and `toggleListening`. */
+export interface VoiceStartOptions {
+  /**
+   * When true, the transcript produced by this capture is NOT pushed onto
+   * the Assistant queue. Used by ChatInput's in-place mic which inserts
+   * the transcript directly into the textarea via `onTranscript`.
+   */
+  inline?: boolean;
+  /**
+   * Optional callback invoked with the final transcript once
+   * transcription completes. Called even when `inline` is false, so
+   * consumers that want to observe speech alongside the queue can.
+   * Not invoked when transcription fails or yields an empty string.
+   */
+  onTranscript?: (text: string) => void;
+}
+
+/** Value exposed by `VoiceProvider` and read through `useVoice()`. */
+export interface VoiceContextValue {
+  /** Current capture state. */
+  state: VoiceState;
+  /** The acquired stream while a capture is active; `null` otherwise. */
+  mediaStream: MediaStream | null;
+  /**
+   * Most recent non-empty transcription result. Persists until replaced by
+   * a later result or reset with `clearTranscript`.
+   */
+  transcript: string;
+  /** Transcripts waiting to be drained, in the order they were enqueued. */
+  queue: string[];
+  /** True when `queue` contains at least one entry. */
+  hasQueuedVoice: boolean;
+  /**
+   * Start the microphone. If permission is denied the state stays `idle`
+   * and an error is logged to the console (no toast — Layout owns toasts).
+   * Does nothing unless the current state is `idle`.
+   *
+   * When `options.inline` is true, the resulting transcript is NOT pushed
+   * onto the Assistant queue — the caller is expected to consume it via
+   * `transcript` or `options.onTranscript` (used by ChatInput's in-place
+   * mic). When `options.onTranscript` is provided, it is invoked with the
+   * final transcript alongside (or in place of) the queue push.
+   */
+  startListening: (options?: VoiceStartOptions) => Promise<void>;
+  /**
+   * Stop the current capture, transcribe the recorded audio, and publish
+   * the result (transcript state, `onTranscript`, queue). When no capture
+   * is listening it only releases any held stream tracks.
+   */
+  stopListening: () => Promise<void>;
+  /**
+   * Cycle idle → listening → idle. Used by GlobalMicButton's click.
+   * `options` apply only when the call starts a capture; the call is a
+   * no-op while the state is `speaking`.
+   */
+  toggleListening: (options?: VoiceStartOptions) => Promise<void>;
+  /**
+   * Push a transcript onto the queue. PersonalAssistant drains this on
+   * mount and sends each entry as a new user message through the existing
+   * chat streaming pipeline. The text is trimmed; blank text is ignored.
+   */
+  enqueue: (text: string) => void;
+  /** Drain and return every queued transcript, clearing the queue. */
+  drainQueue: () => string[];
+  /** Clear the transient transcript buffer (not the queue). */
+  clearTranscript: () => void;
+}
+
+// Defaults to `undefined` so `useVoice` can detect use outside a provider.
+const VoiceContext = createContext<VoiceContextValue | undefined>(undefined);
+
+/** Props for `VoiceProvider`; everything except `children` is optional. */
+interface VoiceProviderProps {
+  /** Subtree that gains access to the voice context. */
+  children: ReactNode;
+  /**
+   * Optional override so tests can avoid hitting a real network. When
+   * omitted the provider calls `fetch("/api/transcribe", ...)`.
+   */
+  transcribe?: (audio: Blob) => Promise<string>;
+  /**
+   * Optional injector for `navigator.mediaDevices.getUserMedia`, used by
+   * tests that need deterministic stream objects without polluting the
+   * global navigator.
+   */
+  getUserMedia?: (constraints: MediaStreamConstraints) => Promise<MediaStream>;
+  /**
+   * When true the provider does not log capture problems (getUserMedia
+   * unavailable, microphone access denied, transcription failure) to
+   * `console.error`. Defaults to false, i.e. those problems are logged.
+   */
+  silenceErrors?: boolean;
+}
+
+/**
+ * Default transcription transport: uploads the recording as multipart form
+ * data to `/api/transcribe` (with credentials) and returns the trimmed
+ * `text` field of the JSON response.
+ *
+ * @param audio - Recorded audio, sent as the `audio` form field.
+ * @returns The trimmed transcript, or "" when the response has no `text`.
+ * @throws Error when the response status is not OK.
+ */
+async function defaultTranscribe(audio: Blob): Promise<string> {
+  const payload = new FormData();
+  payload.append("audio", audio, "recording.webm");
+
+  const response = await fetch("/api/transcribe", {
+    method: "POST",
+    credentials: "include",
+    body: payload,
+  });
+
+  if (response.ok) {
+    const { text } = (await response.json()) as { text?: string };
+    return (text ?? "").trim();
+  }
+  throw new Error(`transcribe failed: ${response.status}`);
+}
+
+/**
+ * Owns microphone capture, transcription, and the queue of transcripts
+ * awaiting the Assistant, and publishes them through `VoiceContext`.
+ *
+ * @param children - Subtree that can call `useVoice()`.
+ * @param transcribe - Turns recorded audio into text; defaults to a POST
+ *   to `/api/transcribe`.
+ * @param getUserMedia - Optional replacement for
+ *   `navigator.mediaDevices.getUserMedia` (used by tests).
+ * @param silenceErrors - When true, capture/transcription problems are not
+ *   logged to `console.error`.
+ */
+export function VoiceProvider({
+  children,
+  transcribe = defaultTranscribe,
+  getUserMedia,
+  silenceErrors = false,
+}: VoiceProviderProps) {
+  const [state, setState] = useState<VoiceState>("idle");
+  const [transcript, setTranscript] = useState("");
+  const [queue, setQueue] = useState<string[]>([]);
+  const [mediaStream, setMediaStream] = useState<MediaStream | null>(null);
+
+  const recorderRef = useRef<MediaRecorder | null>(null);
+  const chunksRef = useRef<Blob[]>([]);
+  const streamRef = useRef<MediaStream | null>(null);
+  const optionsRef = useRef<VoiceStartOptions>({});
+  // Mirror of `queue` maintained for synchronous reads — drainQueue()
+  // must return entries immediately so the caller can iterate without
+  // waiting for a React re-render.
+  const queueRef = useRef<string[]>([]);
+  // Synchronous mirror of `state`. The callbacks below guard on this
+  // instead of the render-time `state`, so a stale callback or a second
+  // call made before React re-renders still sees the real current state.
+  const stateRef = useRef<VoiceState>("idle");
+  // True while a getUserMedia request is pending. `state` is still "idle"
+  // during that window, so this is what stops a second start from
+  // acquiring (and then orphaning) another stream.
+  const startingRef = useRef(false);
+
+  // Keep the ref mirror and the rendered state in lockstep.
+  const updateState = useCallback((next: VoiceState) => {
+    stateRef.current = next;
+    setState(next);
+  }, []);
+
+  // Stop every track of the held stream without touching React state, so
+  // it is also safe to call from the unmount cleanup.
+  const releaseStream = useCallback(() => {
+    const held = streamRef.current;
+    if (!held) return;
+    streamRef.current = null;
+    held.getTracks().forEach((t) => {
+      try {
+        t.stop();
+      } catch {
+        // ignore — the track is already stopped or unusable
+      }
+    });
+  }, []);
+
+  const stopTracks = useCallback(() => {
+    releaseStream();
+    setMediaStream(null);
+  }, [releaseStream]);
+
+  // Release the microphone if the provider unmounts mid-capture; otherwise
+  // the recorder and its tracks would stay live with nothing left to stop
+  // them.
+  useEffect(() => {
+    return () => {
+      const recorder = recorderRef.current;
+      recorderRef.current = null;
+      if (recorder && recorder.state !== "inactive") {
+        try {
+          recorder.stop();
+        } catch {
+          // ignore — the recorder is already stopping
+        }
+      }
+      releaseStream();
+    };
+  }, [releaseStream]);
+
+  const enqueue = useCallback((text: string) => {
+    const trimmed = text.trim();
+    if (!trimmed) return;
+    queueRef.current = [...queueRef.current, trimmed];
+    setQueue(queueRef.current);
+  }, []);
+
+  const drainQueue = useCallback((): string[] => {
+    const drained = queueRef.current;
+    // Nothing queued: skip the state update so draining an empty queue
+    // (e.g. on every Assistant mount) does not force a re-render.
+    if (drained.length === 0) return [];
+    queueRef.current = [];
+    setQueue([]);
+    return drained;
+  }, []);
+
+  const clearTranscript = useCallback(() => {
+    setTranscript("");
+  }, []);
+
+  const startListening = useCallback(async (
+    options: VoiceStartOptions = {},
+  ): Promise<void> => {
+    if (stateRef.current !== "idle" || startingRef.current) return;
+    startingRef.current = true;
+    optionsRef.current = options;
+
+    try {
+      // Acquire inside the try so a synchronous throw from an injected
+      // getUserMedia is handled like a rejected promise.
+      const acquire = getUserMedia
+        ? getUserMedia({ audio: true })
+        : navigator.mediaDevices?.getUserMedia?.({ audio: true });
+
+      if (!acquire) {
+        if (!silenceErrors) {
+          console.error("[VoiceContext] getUserMedia is not available");
+        }
+        return;
+      }
+
+      const stream = await acquire;
+      streamRef.current = stream;
+      setMediaStream(stream);
+      chunksRef.current = [];
+
+      // MediaRecorder may not exist in some test environments. We only
+      // instantiate it when available — the stream itself is still tracked
+      // so VoiceWaveform and other consumers can observe capture state.
+      if (typeof MediaRecorder !== "undefined") {
+        const recorder = new MediaRecorder(stream);
+        recorder.ondataavailable = (e) => {
+          if (e.data && e.data.size > 0) chunksRef.current.push(e.data);
+        };
+        recorderRef.current = recorder;
+        recorder.start();
+      }
+
+      updateState("listening");
+    } catch (err) {
+      if (!silenceErrors) {
+        console.error("[VoiceContext] Microphone access denied:", err);
+      }
+      stopTracks();
+      updateState("idle");
+    } finally {
+      startingRef.current = false;
+    }
+  }, [getUserMedia, silenceErrors, stopTracks, updateState]);
+
+  const stopListening = useCallback(async (): Promise<void> => {
+    if (stateRef.current !== "listening") {
+      stopTracks();
+      return;
+    }
+
+    updateState("speaking");
+
+    const recorder = recorderRef.current;
+    recorderRef.current = null;
+
+    // Wait for the recorder to finish so its last chunk has been delivered
+    // to ondataavailable before the blob is assembled.
+    if (recorder && recorder.state !== "inactive") {
+      await new Promise<void>((resolve) => {
+        recorder.onstop = () => resolve();
+        try {
+          recorder.stop();
+        } catch {
+          resolve();
+        }
+      });
+    }
+
+    try {
+      if (chunksRef.current.length > 0) {
+        const blob = new Blob(chunksRef.current, {
+          type: recorder?.mimeType || "audio/webm",
+        });
+        chunksRef.current = [];
+        const text = await transcribe(blob);
+        if (text) {
+          setTranscript(text);
+          const opts = optionsRef.current;
+          // Isolate the consumer callback: if it throws, the transcript
+          // must still reach the queue, and the failure should not be
+          // reported as a transcription error.
+          try {
+            opts.onTranscript?.(text);
+          } catch (err) {
+            if (!silenceErrors) {
+              console.error("[VoiceContext] onTranscript callback error:", err);
+            }
+          }
+          if (!opts.inline) {
+            enqueue(text);
+          }
+        }
+      }
+    } catch (err) {
+      if (!silenceErrors) {
+        console.error("[VoiceContext] Transcription error:", err);
+      }
+    } finally {
+      stopTracks();
+      updateState("idle");
+    }
+  }, [transcribe, enqueue, silenceErrors, stopTracks, updateState]);
+
+  const toggleListening = useCallback(async (
+    options: VoiceStartOptions = {},
+  ): Promise<void> => {
+    const current = stateRef.current;
+    if (current === "listening") {
+      await stopListening();
+      return;
+    }
+    if (current === "idle") {
+      await startListening(options);
+    }
+  }, [startListening, stopListening]);
+
+  const value = useMemo<VoiceContextValue>(
+    () => ({
+      state,
+      mediaStream,
+      transcript,
+      queue,
+      hasQueuedVoice: queue.length > 0,
+      startListening,
+      stopListening,
+      toggleListening,
+      enqueue,
+      drainQueue,
+      clearTranscript,
+    }),
+    [
+      state,
+      mediaStream,
+      transcript,
+      queue,
+      startListening,
+      stopListening,
+      toggleListening,
+      enqueue,
+      drainQueue,
+      clearTranscript,
+    ],
+  );
+
+  return <VoiceContext.Provider value={value}>{children}</VoiceContext.Provider>;
+}
+
+/**
+ * Returns the value published by the nearest `VoiceProvider`.
+ *
+ * @throws Error when called outside a `VoiceProvider`.
+ */
+export function useVoice(): VoiceContextValue {
+  const value = useContext(VoiceContext);
+  if (value) return value;
+  throw new Error("useVoice must be used within a VoiceProvider");
+}