feat(nexus): VoiceContext for phase 14 voice globalization

Lifts MediaStream, recording state, transcription buffer, and queue for
non-Assistant captures out of ChatInput's internal VoiceMicButton. The
provider owns the POST /api/transcribe fetch (v1.6 pipeline, unchanged),
exposes idle/listening/speaking state to the top-strip GlobalMicButton,
and queues transcripts captured away from /assistant for PersonalAssistant
to drain on mount.

Per spec sections 4.2 (mic states), 5.5 (voice from non-Assistant modes),
and 10.3 (voice as global affordance). Tests use manual createRoot + act
with a mocked getUserMedia injector to stay deterministic in jsdom.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Nexus Dev 2026-04-11 13:22:48 +00:00
parent 4623c8aea0
commit 14ecbf00bb
2 changed files with 507 additions and 0 deletions

View file

@ -0,0 +1,192 @@
// @vitest-environment jsdom
import { act } from "react";
import { createRoot } from "react-dom/client";
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
import { VoiceProvider, useVoice } from "./VoiceContext";
// Flag the global scope as an act()-aware test environment for React.
// Object.assign performs the same property write without an `any` cast.
Object.assign(globalThis, { IS_REACT_ACT_ENVIRONMENT: true });
// Lightweight MediaStream stand-in for jsdom.
/**
 * Builds a minimal MediaStream substitute exposing only `getTracks()`.
 *
 * Every call to `getTracks()` hands back the same single-element array, so
 * a test can inspect the `stop` mock of the track after the provider has
 * released the stream.
 */
function makeFakeStream(): MediaStream {
  const trackList = [{ stop: vi.fn() }];
  const standIn = {
    getTracks() {
      return trackList;
    },
  };
  return standIn as unknown as MediaStream;
}
describe("VoiceContext", () => {
  type VoiceValue = ReturnType<typeof useVoice>;
  type ProviderOverrides = {
    getUserMedia?: (c: MediaStreamConstraints) => Promise<MediaStream>;
    transcribe?: (blob: Blob) => Promise<string>;
  };

  let container: HTMLDivElement;
  let root: ReturnType<typeof createRoot> | null = null;

  beforeEach(() => {
    container = document.createElement("div");
    document.body.appendChild(container);
    root = null;
  });

  afterEach(() => {
    const mounted = root;
    root = null;
    if (mounted) {
      act(() => {
        mounted.unmount();
      });
    }
    if (container.parentNode) container.remove();
  });

  /** Renders `Consumer` inside a VoiceProvider with error logging silenced. */
  function renderWithProvider(
    Consumer: React.FC,
    props: ProviderOverrides = {},
  ) {
    const freshRoot = createRoot(container);
    root = freshRoot;
    act(() => {
      freshRoot.render(
        <VoiceProvider {...props} silenceErrors>
          <Consumer />
        </VoiceProvider>,
      );
    });
  }

  /**
   * Mounts a probe component under a VoiceProvider and returns a getter for
   * the most recently rendered context value.
   *
   * The value lives on a ref object rather than a closure-assigned `let`:
   * TypeScript narrows such a `let` to `null` at the use site, which turns
   * `captured!.x` into a property access on `never`.
   */
  function mountVoice(props: ProviderOverrides = {}): () => VoiceValue {
    const latest: { current: VoiceValue | null } = { current: null };
    const Probe = () => {
      const voice = useVoice();
      latest.current = voice;
      return <div data-testid="state">{voice.state}</div>;
    };
    renderWithProvider(Probe, props);
    return () => {
      if (latest.current === null) {
        throw new Error("probe was never rendered inside VoiceProvider");
      }
      return latest.current;
    };
  }

  it("starts in idle state with an empty queue", () => {
    const voice = mountVoice();
    expect(voice().state).toBe("idle");
    expect(voice().queue).toEqual([]);
    expect(voice().hasQueuedVoice).toBe(false);
  });

  it("transitions idle → listening when startListening resolves", async () => {
    const fakeStream = makeFakeStream();
    const getUserMedia = vi.fn(async () => fakeStream);
    const voice = mountVoice({ getUserMedia });
    await act(async () => {
      await voice().startListening();
    });
    expect(getUserMedia).toHaveBeenCalledWith({ audio: true });
    expect(voice().state).toBe("listening");
    expect(voice().mediaStream).toBe(fakeStream);
  });

  it("handles getUserMedia rejection by staying idle", async () => {
    const getUserMedia = vi.fn(async () => {
      throw new Error("NotAllowedError");
    });
    const voice = mountVoice({ getUserMedia });
    await act(async () => {
      await voice().startListening();
    });
    expect(voice().state).toBe("idle");
  });

  it("enqueue adds transcripts to the queue and sets hasQueuedVoice", () => {
    const voice = mountVoice();
    act(() => {
      voice().enqueue("hello world");
    });
    expect(voice().queue).toEqual(["hello world"]);
    expect(voice().hasQueuedVoice).toBe(true);
  });

  it("drainQueue returns all entries and clears the queue", () => {
    const voice = mountVoice();
    act(() => {
      voice().enqueue("one");
      voice().enqueue("two");
    });
    expect(voice().queue.length).toBe(2);
    let drained: string[] = [];
    act(() => {
      drained = voice().drainQueue();
    });
    expect(drained).toEqual(["one", "two"]);
    expect(voice().queue).toEqual([]);
    expect(voice().hasQueuedVoice).toBe(false);
  });

  it("stopListening returns the context to idle and stops stream tracks", async () => {
    const fakeStream = makeFakeStream();
    const [track] = fakeStream.getTracks();
    const getUserMedia = vi.fn(async () => fakeStream);
    const transcribe = vi.fn(async () => "transcribed text");
    const voice = mountVoice({ getUserMedia, transcribe });
    await act(async () => {
      await voice().startListening();
    });
    expect(voice().state).toBe("listening");
    await act(async () => {
      await voice().stopListening();
    });
    expect(voice().state).toBe("idle");
    expect(track?.stop).toHaveBeenCalled();
  });

  it("throws when useVoice is used outside a provider", () => {
    const Consumer = () => {
      useVoice();
      return null;
    };
    // React reports the render error through console.error; mute it for the
    // duration of the assertion and always restore it, even if the
    // expectation fails, so the mock cannot leak into later tests.
    const spy = vi.spyOn(console, "error").mockImplementation(() => {});
    try {
      const orphanRoot = createRoot(container);
      root = orphanRoot;
      expect(() =>
        act(() => {
          orphanRoot.render(<Consumer />);
        }),
      ).toThrow(/VoiceProvider/);
    } finally {
      spy.mockRestore();
    }
  });
});

View file

@ -0,0 +1,315 @@
import {
  createContext,
  useCallback,
  useContext,
  useEffect,
  useMemo,
  useRef,
  useState,
  type ReactNode,
} from "react";
/**
 * VoiceContext — Phase 14 globalization of voice capture.
*
* Before Phase 14, the voice capture state (MediaStream, recorder, VAD
* transcription pipeline) lived inside `ChatInput`'s `VoiceMicButton`.
* Phase 14 lifts it up so the top-strip `GlobalMicButton` can drive voice
* from any route, and speech captured while the user is away from
* `/assistant` is queued for draining when they arrive there.
*
 * The existing `/api/transcribe` endpoint (server-side Whisper pipeline
 * shipped in v1.6) is consumed unchanged — see `voice-pipeline.ts` on the
 * server. No new backend endpoint is introduced.
*
* Spec references:
* - §4.2 GlobalMicButton states (idle / listening / speaking)
 * - §5.5 voice routing from non-Assistant modes → queue → Assistant inbox
* - §10.3 voice as global affordance
* - §10.4 single notification surface
*/
/**
 * Capture lifecycle phase exposed by the voice context.
 *
 * - `idle`: no capture in progress (initial state).
 * - `listening`: a microphone stream has been acquired.
 * - `speaking`: capture was stopped and the recording is being finalized
 *   and transcribed.
 */
export type VoiceState =
  | "idle"
  | "listening"
  | "speaking";
/**
 * Per-capture options accepted by `startListening` / `toggleListening`.
 * The options given at start are remembered and applied when the capture
 * is later stopped and transcribed.
 */
export interface VoiceStartOptions {
/**
 * When true, the transcript produced by this capture is NOT pushed onto
 * the Assistant queue. Used by ChatInput's in-place mic which inserts
 * the transcript directly into the textarea via `onTranscript`.
 */
inline?: boolean;
/**
 * Optional callback invoked with the final trimmed transcript once
 * transcription completes. Called even when `inline` is false, so
 * consumers that want to observe speech alongside the queue can.
 * Not invoked when transcription fails or yields an empty string.
 */
onTranscript?: (text: string) => void;
}
/** Value exposed to consumers through `useVoice()`. */
export interface VoiceContextValue {
/** Current capture phase; starts as `idle`. */
state: VoiceState;
/** The active microphone stream, or null when no stream is held. */
mediaStream: MediaStream | null;
/** Most recent non-empty transcription result; "" until one arrives or after `clearTranscript`. */
transcript: string;
/** Transcripts waiting to be drained, oldest first. */
queue: string[];
/** True when `queue` contains at least one entry. */
hasQueuedVoice: boolean;
/**
 * Start the microphone. If permission is denied the state stays `idle`
 * and an error is logged to the console (no toast — Layout owns toasts).
 *
 * When `options.inline` is true, the resulting transcript is NOT pushed
 * onto the Assistant queue — the caller is expected to consume
 * `transcript` directly (used by ChatInput's in-place mic). When
 * `options.onTranscript` is provided, it is invoked with the final
 * transcript alongside (or in place of) the queue push.
 */
startListening: (options?: VoiceStartOptions) => Promise<void>;
/** Stop the current capture and flush the buffered transcript. */
stopListening: () => Promise<void>;
/** Cycle idle → listening → idle. Used by GlobalMicButton's click. */
toggleListening: (options?: VoiceStartOptions) => Promise<void>;
/**
 * Push a transcript onto the queue. PersonalAssistant drains this on
 * mount and sends each entry as a new user message through the existing
 * chat streaming pipeline. The text is trimmed first; empty or
 * whitespace-only input is ignored.
 */
enqueue: (text: string) => void;
/** Drain and return every queued transcript, clearing the queue. */
drainQueue: () => string[];
/** Clear the transient transcript buffer (not the queue). */
clearTranscript: () => void;
}
/**
 * React context carrying the voice API. The default value is `undefined`
 * so that `useVoice` can detect a missing `VoiceProvider` and throw.
 */
const VoiceContext = createContext<VoiceContextValue | undefined>(undefined);
/** Props accepted by `VoiceProvider`. */
interface VoiceProviderProps {
/** Subtree that gains access to the voice context. */
children: ReactNode;
/**
 * Optional override so tests can avoid hitting a real network. When
 * omitted the provider calls `fetch("/api/transcribe", ...)`.
 */
transcribe?: (audio: Blob) => Promise<string>;
/**
 * Optional injector for `navigator.mediaDevices.getUserMedia`, used by
 * tests that need deterministic stream objects without polluting the
 * global navigator.
 */
getUserMedia?: (constraints: MediaStreamConstraints) => Promise<MediaStream>;
/**
 * When true the provider swallows transient capture errors silently;
 * this also covers transcription failures.
 * Defaults to the standard behavior (console.error).
 */
silenceErrors?: boolean;
}
/**
 * Default transcription transport: uploads the recording to the existing
 * `/api/transcribe` endpoint and returns the trimmed transcript.
 *
 * The audio is sent as multipart form data under the `audio` field with
 * the filename `recording.webm`, including credentials.
 *
 * @param audio - Recorded audio to transcribe.
 * @returns The trimmed `text` field of the JSON response, or `""` when the
 *   response body has no string `text` field.
 * @throws Error when the response status is not OK, or when the request or
 *   JSON decoding itself fails.
 */
async function defaultTranscribe(audio: Blob): Promise<string> {
  const formData = new FormData();
  formData.append("audio", audio, "recording.webm");
  const res = await fetch("/api/transcribe", {
    method: "POST",
    credentials: "include",
    body: formData,
  });
  if (!res.ok) {
    throw new Error(`transcribe failed: ${res.status}`);
  }
  // The body is untrusted JSON: validate its shape instead of asserting it,
  // so a `null` body or a non-string `text` cannot blow up at `.trim()`.
  const data: unknown = await res.json();
  const text =
    typeof data === "object" && data !== null
      ? (data as { text?: unknown }).text
      : undefined;
  return typeof text === "string" ? text.trim() : "";
}
/** Stops every track of `stream`, ignoring failures from individual tracks. */
function stopStreamTracks(stream: MediaStream | null): void {
  if (!stream) return;
  stream.getTracks().forEach((track) => {
    try {
      track.stop();
    } catch {
      // Best effort: a track that cannot be stopped must not block the rest.
    }
  });
}

/**
 * Owns microphone capture, recording, transcription and the queue of
 * transcripts, and exposes them through `VoiceContext`.
 *
 * Re-entrancy is guarded with refs that are updated synchronously rather
 * than with the render-time `state` value. Two calls issued before React
 * re-renders (for example a double click) therefore cannot both acquire a
 * stream or both finalize the same capture.
 *
 * When the provider unmounts, any active recorder is stopped and the
 * microphone tracks are released so the device does not stay open.
 *
 * @param children - Subtree that can call `useVoice()`.
 * @param transcribe - Turns recorded audio into text; defaults to the
 *   `/api/transcribe` upload.
 * @param getUserMedia - Optional replacement for
 *   `navigator.mediaDevices.getUserMedia`.
 * @param silenceErrors - When true, capture and transcription errors are
 *   not logged to the console.
 */
export function VoiceProvider({
  children,
  transcribe = defaultTranscribe,
  getUserMedia,
  silenceErrors = false,
}: VoiceProviderProps) {
  const [state, setState] = useState<VoiceState>("idle");
  const [transcript, setTranscript] = useState("");
  const [queue, setQueue] = useState<string[]>([]);
  const [mediaStream, setMediaStream] = useState<MediaStream | null>(null);

  const recorderRef = useRef<MediaRecorder | null>(null);
  const chunksRef = useRef<Blob[]>([]);
  const streamRef = useRef<MediaStream | null>(null);
  const optionsRef = useRef<VoiceStartOptions>({});
  // Mirror of `queue` maintained for synchronous reads — drainQueue()
  // must return entries immediately so the caller can iterate without
  // waiting for a React re-render.
  const queueRef = useRef<string[]>([]);
  // Mirror of `state` for synchronous guards inside the async callbacks.
  const stateRef = useRef<VoiceState>("idle");
  // True while a getUserMedia request is in flight (state is still "idle").
  const acquiringRef = useRef(false);
  // False once the provider has unmounted.
  const mountedRef = useRef(true);

  const updateState = useCallback((next: VoiceState) => {
    stateRef.current = next;
    setState(next);
  }, []);

  const stopTracks = useCallback(() => {
    const held = streamRef.current;
    streamRef.current = null;
    stopStreamTracks(held);
    setMediaStream(null);
  }, []);

  // Release the recorder and microphone if the provider goes away
  // mid-capture. No React state is touched here: the component is unmounting.
  useEffect(() => {
    mountedRef.current = true;
    return () => {
      mountedRef.current = false;
      const recorder = recorderRef.current;
      recorderRef.current = null;
      if (recorder && recorder.state !== "inactive") {
        try {
          recorder.stop();
        } catch {
          // ignore — the recorder is being discarded anyway
        }
      }
      chunksRef.current = [];
      const held = streamRef.current;
      streamRef.current = null;
      stopStreamTracks(held);
    };
  }, []);

  const enqueue = useCallback((text: string) => {
    const trimmed = text.trim();
    if (!trimmed) return;
    queueRef.current = [...queueRef.current, trimmed];
    setQueue(queueRef.current);
  }, []);

  const drainQueue = useCallback((): string[] => {
    const drained = queueRef.current;
    queueRef.current = [];
    setQueue([]);
    return drained;
  }, []);

  const clearTranscript = useCallback(() => {
    setTranscript("");
  }, []);

  const startListening = useCallback(
    async (options: VoiceStartOptions = {}): Promise<void> => {
      // Ignore the call unless fully idle with no acquisition in flight.
      if (stateRef.current !== "idle" || acquiringRef.current) return;
      acquiringRef.current = true;
      optionsRef.current = options;
      try {
        // Inside the try so a synchronously-throwing injector is handled
        // the same way as a rejected permission prompt.
        const acquire = getUserMedia
          ? getUserMedia({ audio: true })
          : navigator.mediaDevices?.getUserMedia?.({ audio: true });
        if (!acquire) {
          if (!silenceErrors) {
            console.error("[VoiceContext] getUserMedia is not available");
          }
          return;
        }
        const stream = await acquire;
        if (!mountedRef.current) {
          // The provider unmounted while the prompt was open: do not keep
          // the microphone.
          stopStreamTracks(stream);
          return;
        }
        streamRef.current = stream;
        setMediaStream(stream);
        chunksRef.current = [];
        // MediaRecorder may not exist in some test environments. We only
        // instantiate it when available — the stream itself is still tracked
        // so VoiceWaveform and other consumers can observe capture state.
        if (typeof MediaRecorder !== "undefined") {
          const recorder = new MediaRecorder(stream);
          recorder.ondataavailable = (e) => {
            if (e.data && e.data.size > 0) chunksRef.current.push(e.data);
          };
          recorderRef.current = recorder;
          recorder.start();
        }
        updateState("listening");
      } catch (err) {
        if (!silenceErrors) {
          console.error("[VoiceContext] Microphone access denied:", err);
        }
        recorderRef.current = null;
        stopTracks();
        updateState("idle");
      } finally {
        acquiringRef.current = false;
      }
    },
    [getUserMedia, silenceErrors, stopTracks, updateState],
  );

  const stopListening = useCallback(async (): Promise<void> => {
    if (stateRef.current !== "listening") {
      stopTracks();
      return;
    }
    // Synchronous transition: a second stopListening() issued before the
    // re-render sees "speaking" and takes the early return above.
    updateState("speaking");
    const recorder = recorderRef.current;
    recorderRef.current = null;

    if (recorder && recorder.state !== "inactive") {
      // Wait for the recorder to stop so buffered chunks are delivered
      // before the blob is assembled.
      await new Promise<void>((resolve) => {
        recorder.onstop = () => resolve();
        try {
          recorder.stop();
        } catch {
          resolve();
        }
      });
    }

    try {
      if (chunksRef.current.length > 0) {
        const blob = new Blob(chunksRef.current, {
          type: recorder?.mimeType || "audio/webm",
        });
        chunksRef.current = [];
        const text = await transcribe(blob);
        if (text) {
          setTranscript(text);
          const opts = optionsRef.current;
          opts.onTranscript?.(text);
          if (!opts.inline) {
            enqueue(text);
          }
        }
      }
    } catch (err) {
      if (!silenceErrors) {
        console.error("[VoiceContext] Transcription error:", err);
      }
    } finally {
      stopTracks();
      updateState("idle");
    }
  }, [transcribe, enqueue, silenceErrors, stopTracks, updateState]);

  const toggleListening = useCallback(
    async (options: VoiceStartOptions = {}): Promise<void> => {
      if (stateRef.current === "listening") {
        await stopListening();
        return;
      }
      if (stateRef.current === "idle") {
        await startListening(options);
      }
    },
    [startListening, stopListening],
  );

  const value = useMemo<VoiceContextValue>(
    () => ({
      state,
      mediaStream,
      transcript,
      queue,
      hasQueuedVoice: queue.length > 0,
      startListening,
      stopListening,
      toggleListening,
      enqueue,
      drainQueue,
      clearTranscript,
    }),
    [
      state,
      mediaStream,
      transcript,
      queue,
      startListening,
      stopListening,
      toggleListening,
      enqueue,
      drainQueue,
      clearTranscript,
    ],
  );

  return <VoiceContext.Provider value={value}>{children}</VoiceContext.Provider>;
}
/**
 * Reads the voice API from the nearest `VoiceProvider`.
 *
 * @returns The current voice context value.
 * @throws Error when called from a component that is not rendered inside a
 *   `VoiceProvider`.
 */
export function useVoice(): VoiceContextValue {
  const voice = useContext(VoiceContext);
  if (voice === undefined) {
    throw new Error("useVoice must be used within a VoiceProvider");
  }
  return voice;
}