feat(nexus): VoiceContext for phase 14 voice globalization

Lifts MediaStream, recording state, transcription buffer, and queue for non-Assistant captures out of ChatInput's internal VoiceMicButton. The provider owns the POST /api/transcribe fetch (v1.6 pipeline, unchanged), exposes idle/listening/speaking state to the top-strip GlobalMicButton, and queues transcripts captured away from /assistant for PersonalAssistant to drain on mount. Per spec sections 4.2 (mic states), 5.5 (voice from non-Assistant modes), and 10.3 (voice as global affordance). Tests use manual createRoot + act with a mocked getUserMedia injector to stay deterministic in jsdom. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-11 13:22:48 +00:00 · 2026-04-11 13:22:48 +00:00 · 14ecbf00bb
commit 14ecbf00bb
parent 4623c8aea0
2 changed files with 507 additions and 0 deletions
--- a/ui/src/context/VoiceContext.test.tsx
+++ b/ui/src/context/VoiceContext.test.tsx
@ -0,0 +1,192 @@
 // @vitest-environment jsdom
 import { act } from "react";
 import { createRoot } from "react-dom/client";
 import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
 import { VoiceProvider, useVoice } from "./VoiceContext";
 // The suite drives React manually through createRoot + act() rather than a
 // testing library, so it sets the global flag React reads to know that it is
 // running in an act()-aware test environment.
 // eslint-disable-next-line @typescript-eslint/no-explicit-any
 (globalThis as any).IS_REACT_ACT_ENVIRONMENT = true;
 // Minimal MediaStream double for jsdom: it only implements getTracks(), which
 // hands back the same single-element array on every call so a test can later
 // inspect the track's `stop` spy.
 function makeFakeStream(): MediaStream {
  const stubTrack = { stop: vi.fn() };
  const trackList = [stubTrack];
  const fake = { getTracks: () => trackList };
  return fake as unknown as MediaStream;
 }
 // Behavioral suite for VoiceProvider / useVoice. Each test mounts the provider
 // into a fresh detached container with createRoot + act, captures the context
 // value through a tiny Consumer component, and asserts on that captured value.
 describe("VoiceContext", () => {
  let container: HTMLDivElement;
  let root: ReturnType<typeof createRoot> | null = null;
  beforeEach(() => {
    container = document.createElement("div");
    document.body.appendChild(container);
    root = null;
  });
  afterEach(() => {
    // Unmount inside act so React flushes its cleanup before the next test.
    if (root) {
      act(() => {
        root!.unmount();
      });
      root = null;
    }
    if (container.parentNode) container.remove();
  });
  /**
   * Mount `Consumer` under a VoiceProvider. `props` lets a test inject a fake
   * getUserMedia and/or transcribe implementation; `silenceErrors` is always
   * set so expected failures do not spam the test output.
   */
  function renderWithProvider(
    Consumer: React.FC,
    props: {
      getUserMedia?: (c: MediaStreamConstraints) => Promise<MediaStream>;
      transcribe?: (blob: Blob) => Promise<string>;
    } = {},
  ) {
    root = createRoot(container);
    act(() => {
      root!.render(
        <VoiceProvider {...props} silenceErrors>
          <Consumer />
        </VoiceProvider>,
      );
    });
  }
  it("starts in idle state with an empty queue", () => {
    let captured: ReturnType<typeof useVoice> | null = null;
    const Consumer = () => {
      captured = useVoice();
      return <div data-testid="state">{captured.state}</div>;
    };
    renderWithProvider(Consumer);
    expect(captured!.state).toBe("idle");
    expect(captured!.queue).toEqual([]);
    expect(captured!.hasQueuedVoice).toBe(false);
  });
  it("transitions idle → listening when startListening resolves", async () => {
    const fakeStream = makeFakeStream();
    const getUserMedia = vi.fn(async () => fakeStream);
    let ctxRef: ReturnType<typeof useVoice> | null = null;
    const Consumer = () => {
      ctxRef = useVoice();
      return <div data-testid="state">{ctxRef.state}</div>;
    };
    renderWithProvider(Consumer, { getUserMedia });
    await act(async () => {
      await ctxRef!.startListening();
    });
    expect(getUserMedia).toHaveBeenCalledWith({ audio: true });
    expect(ctxRef!.state).toBe("listening");
    expect(ctxRef!.mediaStream).toBe(fakeStream);
  });
  it("handles getUserMedia rejection by staying idle", async () => {
    const getUserMedia = vi.fn(async () => {
      throw new Error("NotAllowedError");
    });
    let ctxRef: ReturnType<typeof useVoice> | null = null;
    const Consumer = () => {
      ctxRef = useVoice();
      return <div>{ctxRef.state}</div>;
    };
    renderWithProvider(Consumer, { getUserMedia });
    await act(async () => {
      await ctxRef!.startListening();
    });
    expect(ctxRef!.state).toBe("idle");
  });
  it("enqueue adds transcripts to the queue and sets hasQueuedVoice", () => {
    let ctxRef: ReturnType<typeof useVoice> | null = null;
    const Consumer = () => {
      ctxRef = useVoice();
      return (
        <div>
          <span data-testid="count">{ctxRef.queue.length}</span>
          <span data-testid="has">{ctxRef.hasQueuedVoice ? "y" : "n"}</span>
        </div>
      );
    };
    renderWithProvider(Consumer);
    act(() => {
      ctxRef!.enqueue("hello world");
    });
    expect(ctxRef!.queue).toEqual(["hello world"]);
    expect(ctxRef!.hasQueuedVoice).toBe(true);
  });
  it("drainQueue returns all entries and clears the queue", () => {
    let ctxRef: ReturnType<typeof useVoice> | null = null;
    const Consumer = () => {
      ctxRef = useVoice();
      return <div>{ctxRef.queue.length}</div>;
    };
    renderWithProvider(Consumer);
    act(() => {
      ctxRef!.enqueue("one");
      ctxRef!.enqueue("two");
    });
    expect(ctxRef!.queue.length).toBe(2);
    let drained: string[] = [];
    act(() => {
      drained = ctxRef!.drainQueue();
    });
    expect(drained).toEqual(["one", "two"]);
    expect(ctxRef!.queue).toEqual([]);
    expect(ctxRef!.hasQueuedVoice).toBe(false);
  });
  it("stopListening returns the context to idle and stops stream tracks", async () => {
    // Built inline (not via makeFakeStream) so the test keeps a handle on the
    // track's `stop` spy.
    const tracks = [{ stop: vi.fn() }];
    const fakeStream = {
      getTracks: () => tracks,
    } as unknown as MediaStream;
    const getUserMedia = vi.fn(async () => fakeStream);
    const transcribe = vi.fn(async () => "transcribed text");
    let ctxRef: ReturnType<typeof useVoice> | null = null;
    const Consumer = () => {
      ctxRef = useVoice();
      return <div>{ctxRef.state}</div>;
    };
    renderWithProvider(Consumer, { getUserMedia, transcribe });
    await act(async () => {
      await ctxRef!.startListening();
    });
    expect(ctxRef!.state).toBe("listening");
    await act(async () => {
      await ctxRef!.stopListening();
    });
    expect(ctxRef!.state).toBe("idle");
    expect(tracks[0]!.stop).toHaveBeenCalled();
  });
  it("throws when useVoice is used outside a provider", () => {
    const Consumer = () => {
      useVoice();
      return null;
    };
    // The render error is also reported through console.error; mute it for
    // this test only. The restore lives in `finally` so a failing assertion
    // cannot leave console.error mocked for the tests that run afterwards.
    const spy = vi.spyOn(console, "error").mockImplementation(() => {});
    try {
      root = createRoot(container);
      expect(() =>
        act(() => {
          root!.render(<Consumer />);
        }),
      ).toThrow(/VoiceProvider/);
    } finally {
      spy.mockRestore();
    }
  });
 });
--- a/ui/src/context/VoiceContext.tsx
+++ b/ui/src/context/VoiceContext.tsx
@ -0,0 +1,315 @@
 import {
  createContext,
  useCallback,
  useContext,
  useEffect,
  useMemo,
  useRef,
  useState,
  type ReactNode,
 } from "react";
 /**
 * VoiceContext — Phase 14 globalization of voice capture.
 *
 * Before Phase 14, the voice capture state (MediaStream, recorder, VAD
 * transcription pipeline) lived inside `ChatInput`'s `VoiceMicButton`.
 * Phase 14 lifts it up so the top-strip `GlobalMicButton` can drive voice
 * from any route, and speech captured while the user is away from
 * `/assistant` is queued for draining when they arrive there.
 *
 * The existing `/api/transcribe` endpoint (server-side Whisper pipeline
 * shipped in v1.6) is consumed unchanged — see `voice-pipeline.ts` on the
 * server. No new backend endpoint is introduced.
 *
 * Spec references:
 *   - §4.2 GlobalMicButton states (idle / listening / speaking)
 *   - §5.5 voice routing from non-Assistant modes → queue → Assistant inbox
 *   - §10.3 voice as global affordance
 *   - §10.4 single notification surface
 */
 /**
 * Capture phase exposed to consumers. The provider starts in `idle`, moves to
 * `listening` once a microphone stream has been acquired, and reports
 * `speaking` while `stopListening` finalizes the recording and awaits the
 * transcription result, after which it returns to `idle`.
 */
 export type VoiceState = "idle" | "listening" | "speaking";
 /**
 * Per-capture options accepted by `startListening` / `toggleListening`. They
 * are remembered when the capture starts and consulted when it is finalized
 * in `stopListening`.
 */
 export interface VoiceStartOptions {
  /**
   * When true, the transcript produced by this capture is NOT pushed onto
   * the Assistant queue. Used by ChatInput's in-place mic which inserts
   * the transcript directly into the textarea via `onTranscript`.
   */
  inline?: boolean;
  /**
   * Optional callback invoked with the final trimmed transcript once
   * transcription completes. Called even when `inline` is false, so
   * consumers that want to observe speech alongside the queue can.
   * It is only invoked when the transcript is non-empty.
   */
  onTranscript?: (text: string) => void;
 }
 /** Shape of the value handed to consumers by `useVoice()`. */
 export interface VoiceContextValue {
  /** Current capture phase. */
  state: VoiceState;
  /** The stream being captured, or `null` when no capture is active. */
  mediaStream: MediaStream | null;
  /**
   * Most recent non-empty transcript produced by `stopListening`. Stays set
   * until the next transcript replaces it or `clearTranscript` is called.
   */
  transcript: string;
  /** Trimmed, non-empty transcripts waiting to be drained, oldest first. */
  queue: string[];
  /** Convenience flag: true whenever `queue` has at least one entry. */
  hasQueuedVoice: boolean;
  /**
   * Start the microphone. If permission is denied the state stays `idle`
   * and an error is logged to the console (no toast — Layout owns toasts).
   *
   * When `options.inline` is true, the resulting transcript is NOT pushed
   * onto the Assistant queue — the caller is expected to consume
   * `transcript` directly (used by ChatInput's in-place mic). When
   * `options.onTranscript` is provided, it is invoked with the final
   * transcript alongside (or in place of) the queue push.
   */
  startListening: (options?: VoiceStartOptions) => Promise<void>;
  /** Stop the current capture and flush the buffered transcript. */
  stopListening: () => Promise<void>;
  /** Cycle idle → listening → idle. Used by GlobalMicButton's click. */
  toggleListening: (options?: VoiceStartOptions) => Promise<void>;
  /**
   * Push a transcript onto the queue. PersonalAssistant drains this on
   * mount and sends each entry as a new user message through the existing
   * chat streaming pipeline.
   *
   * The text is trimmed first; empty or whitespace-only input is ignored.
   */
  enqueue: (text: string) => void;
  /** Drain and return every queued transcript, clearing the queue. */
  drainQueue: () => string[];
  /** Clear the transient transcript buffer (not the queue). */
  clearTranscript: () => void;
 }
 // The default is `undefined` (not a stub value) so that `useVoice` can detect
 // a missing <VoiceProvider> and throw instead of silently doing nothing.
 const VoiceContext = createContext<VoiceContextValue | undefined>(undefined);
 /** Props for `VoiceProvider`. Everything except `children` is optional. */
 interface VoiceProviderProps {
  /** Subtree that gains access to the voice context. */
  children: ReactNode;
  /**
   * Optional override so tests can avoid hitting a real network. When
   * omitted the provider calls `fetch("/api/transcribe", ...)`.
   */
  transcribe?: (audio: Blob) => Promise<string>;
  /**
   * Optional injector for `navigator.mediaDevices.getUserMedia`, used by
   * tests that need deterministic stream objects without polluting the
   * global navigator.
   */
  getUserMedia?: (constraints: MediaStreamConstraints) => Promise<MediaStream>;
  /**
   * When true the provider does not log capture or transcription failures.
   * Defaults to false, in which case they are reported via `console.error`.
   */
  silenceErrors?: boolean;
 }
 /**
 * Default transcription transport: uploads the recording to the existing
 * `/api/transcribe` endpoint as multipart form data and returns the trimmed
 * transcript.
 *
 * @param audio Recorded audio to transcribe; sent as the `audio` form field.
 * @returns The trimmed transcript, or `""` when the response carries no
 *   usable `text` string.
 * @throws Error when the endpoint answers with a non-2xx status. Network
 *   failures and unparseable JSON reject with the underlying error.
 */
 async function defaultTranscribe(audio: Blob): Promise<string> {
  const formData = new FormData();
  formData.append("audio", audio, "recording.webm");
  const res = await fetch("/api/transcribe", {
    method: "POST",
    credentials: "include",
    body: formData,
  });
  if (!res.ok) {
    throw new Error(`transcribe failed: ${res.status}`);
  }
  // The payload is external input: narrow it instead of trusting a cast, so a
  // `null` body or a non-string `text` yields "no transcript" rather than a
  // TypeError from `.text` / `.trim()`.
  const data: unknown = await res.json();
  if (typeof data !== "object" || data === null) {
    return "";
  }
  const text = (data as { text?: unknown }).text;
  return typeof text === "string" ? text.trim() : "";
 }
 /** Stop every track of `stream`, ignoring per-track failures. No-op for null. */
 function stopStreamTracks(stream: MediaStream | null): void {
  if (!stream) return;
  stream.getTracks().forEach((track) => {
    try {
      track.stop();
    } catch {
      // Best effort: a track that refuses to stop must not block the others.
    }
  });
 }
 /**
 * Owns microphone capture, transcription, and the queue of transcripts
 * captured for later delivery, and exposes them through `VoiceContext`.
 *
 * Lifecycle: `idle` → (`startListening`) → `listening` → (`stopListening`) →
 * `speaking` while the recording is transcribed → `idle`.
 *
 * The start/stop/toggle callbacks decide what to do from a ref that mirrors
 * `state` synchronously rather than from the `state` captured at render time.
 * That keeps them correct when called twice before React re-renders (e.g. a
 * double click) or through a callback reference obtained from an older render.
 *
 * @param children Subtree that can call `useVoice()`.
 * @param transcribe Turns the recorded blob into text; defaults to the
 *   `/api/transcribe` upload.
 * @param getUserMedia Optional replacement for
 *   `navigator.mediaDevices.getUserMedia`.
 * @param silenceErrors When true, capture/transcription failures are not
 *   logged to `console.error`.
 */
 export function VoiceProvider({
  children,
  transcribe = defaultTranscribe,
  getUserMedia,
  silenceErrors = false,
 }: VoiceProviderProps) {
  const [state, setState] = useState<VoiceState>("idle");
  const [transcript, setTranscript] = useState("");
  const [queue, setQueue] = useState<string[]>([]);
  const [mediaStream, setMediaStream] = useState<MediaStream | null>(null);
  const recorderRef = useRef<MediaRecorder | null>(null);
  const chunksRef = useRef<Blob[]>([]);
  const streamRef = useRef<MediaStream | null>(null);
  const optionsRef = useRef<VoiceStartOptions>({});
  // Mirror of `queue` maintained for synchronous reads — drainQueue()
  // must return entries immediately so the caller can iterate without
  // waiting for a React re-render.
  const queueRef = useRef<string[]>([]);
  // Mirror of `state`, updated in the same call as setState, so the guards
  // below always see the phase as of *now* instead of as of the last render.
  const stateRef = useRef<VoiceState>("idle");
  // True while a getUserMedia request is pending. `state` is still "idle"
  // during that window, so without this flag a second start would slip through
  // and the first stream would be orphaned with the microphone live.
  const startingRef = useRef(false);
  // False once the provider has unmounted; lets a late getUserMedia result be
  // released instead of adopted.
  const mountedRef = useRef(true);
  const transition = useCallback((next: VoiceState) => {
    stateRef.current = next;
    setState(next);
  }, []);
  const stopTracks = useCallback(() => {
    stopStreamTracks(streamRef.current);
    streamRef.current = null;
    setMediaStream(null);
  }, []);
  // Release the microphone when the provider goes away; otherwise an active
  // capture would keep the device open after unmount.
  useEffect(() => {
    mountedRef.current = true;
    return () => {
      mountedRef.current = false;
      const recorder = recorderRef.current;
      recorderRef.current = null;
      if (recorder && recorder.state !== "inactive") {
        try {
          recorder.stop();
        } catch {
          // Already stopping/stopped — nothing left to do.
        }
      }
      stopStreamTracks(streamRef.current);
      streamRef.current = null;
    };
  }, []);
  const enqueue = useCallback((text: string) => {
    const trimmed = text.trim();
    if (!trimmed) return;
    queueRef.current = [...queueRef.current, trimmed];
    setQueue(queueRef.current);
  }, []);
  const drainQueue = useCallback((): string[] => {
    const drained = queueRef.current;
    queueRef.current = [];
    setQueue([]);
    return drained;
  }, []);
  const clearTranscript = useCallback(() => {
    setTranscript("");
  }, []);
  const startListening = useCallback(async (
    options: VoiceStartOptions = {},
  ): Promise<void> => {
    // Ignore the call unless we are idle AND no other start is in flight.
    if (stateRef.current !== "idle" || startingRef.current) return;
    optionsRef.current = options;
    const acquire = getUserMedia
      ? getUserMedia({ audio: true })
      : navigator.mediaDevices?.getUserMedia?.({ audio: true });
    if (!acquire) {
      if (!silenceErrors) {
        console.error("[VoiceContext] getUserMedia is not available");
      }
      return;
    }
    startingRef.current = true;
    try {
      const stream = await acquire;
      if (!mountedRef.current) {
        // The provider unmounted while the permission prompt was open.
        stopStreamTracks(stream);
        return;
      }
      streamRef.current = stream;
      setMediaStream(stream);
      chunksRef.current = [];
      // MediaRecorder may not exist in some test environments. We only
      // instantiate it when available — the stream itself is still tracked
      // so VoiceWaveform and other consumers can observe capture state.
      if (typeof MediaRecorder !== "undefined") {
        const recorder = new MediaRecorder(stream);
        recorder.ondataavailable = (e) => {
          if (e.data && e.data.size > 0) chunksRef.current.push(e.data);
        };
        recorder.start();
        // Stored only after start() succeeded, so a throwing start() cannot
        // leave an unusable recorder behind for the next stopListening().
        recorderRef.current = recorder;
      }
      transition("listening");
    } catch (err) {
      if (!silenceErrors) {
        console.error("[VoiceContext] Microphone access denied:", err);
      }
      stopTracks();
      transition("idle");
    } finally {
      startingRef.current = false;
    }
  }, [getUserMedia, silenceErrors, stopTracks, transition]);
  const stopListening = useCallback(async (): Promise<void> => {
    if (stateRef.current !== "listening") {
      // Nothing to finalize; just make sure no stream is left open.
      stopTracks();
      return;
    }
    transition("speaking");
    const recorder = recorderRef.current;
    recorderRef.current = null;
    // Wait for the recorder to stop so its final chunk has been delivered
    // before the blob is assembled.
    if (recorder && recorder.state !== "inactive") {
      await new Promise<void>((resolve) => {
        recorder.onstop = () => resolve();
        try {
          recorder.stop();
        } catch {
          resolve();
        }
      });
    }
    try {
      if (chunksRef.current.length > 0) {
        const blob = new Blob(chunksRef.current, {
          type: recorder?.mimeType || "audio/webm",
        });
        chunksRef.current = [];
        const text = await transcribe(blob);
        if (text) {
          setTranscript(text);
          const opts = optionsRef.current;
          opts.onTranscript?.(text);
          if (!opts.inline) {
            enqueue(text);
          }
        }
      }
    } catch (err) {
      if (!silenceErrors) {
        console.error("[VoiceContext] Transcription error:", err);
      }
    } finally {
      // Always release the microphone and return to idle, even on failure.
      stopTracks();
      transition("idle");
    }
  }, [transcribe, enqueue, silenceErrors, stopTracks, transition]);
  const toggleListening = useCallback(async (
    options: VoiceStartOptions = {},
  ): Promise<void> => {
    const current = stateRef.current;
    if (current === "listening") {
      await stopListening();
      return;
    }
    if (current === "idle") {
      await startListening(options);
    }
    // "speaking": a transcription is in flight; the toggle is ignored.
  }, [startListening, stopListening]);
  const value = useMemo<VoiceContextValue>(
    () => ({
      state,
      mediaStream,
      transcript,
      queue,
      hasQueuedVoice: queue.length > 0,
      startListening,
      stopListening,
      toggleListening,
      enqueue,
      drainQueue,
      clearTranscript,
    }),
    [
      state,
      mediaStream,
      transcript,
      queue,
      startListening,
      stopListening,
      toggleListening,
      enqueue,
      drainQueue,
      clearTranscript,
    ],
  );
  return <VoiceContext.Provider value={value}>{children}</VoiceContext.Provider>;
 }
 /**
 * Read the voice context supplied by the nearest `VoiceProvider`.
 *
 * @returns The current context value.
 * @throws Error when no `VoiceProvider` is present above the caller.
 */
 export function useVoice(): VoiceContextValue {
  const voice = useContext(VoiceContext);
  if (voice) {
    return voice;
  }
  throw new Error("useVoice must be used within a VoiceProvider");
 }