feat(nexus): VoiceContext for phase 14 voice globalization
Lifts MediaStream, recording state, transcription buffer, and queue for non-Assistant captures out of ChatInput's internal VoiceMicButton. The provider owns the POST /api/transcribe fetch (v1.6 pipeline, unchanged), exposes idle/listening/speaking state to the top-strip GlobalMicButton, and queues transcripts captured away from /assistant for PersonalAssistant to drain on mount. Per spec sections 4.2 (mic states), 5.5 (voice from non-Assistant modes), and 10.3 (voice as global affordance). Tests use manual createRoot + act with a mocked getUserMedia injector to stay deterministic in jsdom. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
4623c8aea0
commit
14ecbf00bb
2 changed files with 507 additions and 0 deletions
192
ui/src/context/VoiceContext.test.tsx
Normal file
192
ui/src/context/VoiceContext.test.tsx
Normal file
|
|
@ -0,0 +1,192 @@
|
|||
// @vitest-environment jsdom
|
||||
|
||||
import { act } from "react";
|
||||
import { createRoot } from "react-dom/client";
|
||||
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
|
||||
import { VoiceProvider, useVoice } from "./VoiceContext";
|
||||
|
||||
// Opt this test environment in to React's act() semantics: the tests below
// drive rendering through manual createRoot + act rather than a testing
// library, and React reads this global flag to decide whether act() is
// supported here (see React's act() documentation).
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
(globalThis as any).IS_REACT_ACT_ENVIRONMENT = true;
|
||||
|
||||
/**
 * Builds a minimal MediaStream double for jsdom.
 *
 * Only `getTracks()` is implemented. It always returns the same one-element
 * array whose single track exposes a `stop` spy, so a test can check whether
 * the stream's tracks were stopped.
 */
function makeFakeStream(): MediaStream {
  const stopSpy = vi.fn();
  const fakeTracks = [{ stop: stopSpy }];
  const stub = {
    getTracks() {
      return fakeTracks;
    },
  };
  return stub as unknown as MediaStream;
}
|
||||
|
||||
describe("VoiceContext", () => {
  type VoiceValue = ReturnType<typeof useVoice>;
  type ProviderOverrides = {
    getUserMedia?: (c: MediaStreamConstraints) => Promise<MediaStream>;
    transcribe?: (blob: Blob) => Promise<string>;
  };

  let container: HTMLDivElement;
  let root: ReturnType<typeof createRoot> | null = null;

  beforeEach(() => {
    container = document.createElement("div");
    document.body.appendChild(container);
    root = null;
  });

  afterEach(() => {
    const mounted = root;
    root = null;
    if (mounted) {
      act(() => {
        mounted.unmount();
      });
    }
    if (container.parentNode) container.remove();
  });

  /** Mounts `Consumer` inside a VoiceProvider with console logging silenced. */
  function renderWithProvider(Consumer: React.FC, props: ProviderOverrides = {}) {
    const mountedRoot = createRoot(container);
    root = mountedRoot;
    act(() => {
      mountedRoot.render(
        <VoiceProvider {...props} silenceErrors>
          <Consumer />
        </VoiceProvider>,
      );
    });
  }

  /**
   * Mounts a probe component that records the context value on every render
   * and returns a getter for the most recent one. Reading through the getter
   * (instead of a captured `let` plus `!`) guarantees each assertion sees the
   * value from the latest render and fails loudly if nothing rendered.
   */
  function mountProbe(props: ProviderOverrides = {}): () => VoiceValue {
    let latest: VoiceValue | undefined;
    const Probe: React.FC = () => {
      latest = useVoice();
      return null;
    };
    renderWithProvider(Probe, props);
    return () => {
      if (!latest) throw new Error("VoiceProvider did not render the probe");
      return latest;
    };
  }

  it("starts in idle state with an empty queue", () => {
    const voice = mountProbe();

    expect(voice().state).toBe("idle");
    expect(voice().queue).toEqual([]);
    expect(voice().hasQueuedVoice).toBe(false);
  });

  it("transitions idle → listening when startListening resolves", async () => {
    const fakeStream = makeFakeStream();
    const getUserMedia = vi.fn(async () => fakeStream);
    const voice = mountProbe({ getUserMedia });

    await act(async () => {
      await voice().startListening();
    });

    expect(getUserMedia).toHaveBeenCalledWith({ audio: true });
    expect(voice().state).toBe("listening");
    expect(voice().mediaStream).toBe(fakeStream);
  });

  it("handles getUserMedia rejection by staying idle", async () => {
    const getUserMedia = vi.fn(async () => {
      throw new Error("NotAllowedError");
    });
    const voice = mountProbe({ getUserMedia });

    await act(async () => {
      await voice().startListening();
    });

    expect(voice().state).toBe("idle");
  });

  it("enqueue adds transcripts to the queue and sets hasQueuedVoice", () => {
    const voice = mountProbe();

    act(() => {
      voice().enqueue("hello world");
    });

    expect(voice().queue).toEqual(["hello world"]);
    expect(voice().hasQueuedVoice).toBe(true);
  });

  it("drainQueue returns all entries and clears the queue", () => {
    const voice = mountProbe();

    act(() => {
      voice().enqueue("one");
      voice().enqueue("two");
    });
    expect(voice().queue.length).toBe(2);

    let drained: string[] = [];
    act(() => {
      drained = voice().drainQueue();
    });

    expect(drained).toEqual(["one", "two"]);
    expect(voice().queue).toEqual([]);
    expect(voice().hasQueuedVoice).toBe(false);
  });

  it("stopListening returns the context to idle and stops stream tracks", async () => {
    const fakeStream = makeFakeStream();
    const getUserMedia = vi.fn(async () => fakeStream);
    const transcribe = vi.fn(async () => "transcribed text");
    const voice = mountProbe({ getUserMedia, transcribe });

    await act(async () => {
      await voice().startListening();
    });
    expect(voice().state).toBe("listening");

    await act(async () => {
      await voice().stopListening();
    });

    expect(voice().state).toBe("idle");
    const tracks = fakeStream.getTracks();
    expect(tracks.length).toBeGreaterThan(0);
    for (const track of tracks) {
      expect(track.stop).toHaveBeenCalled();
    }
  });

  it("throws when useVoice is used outside a provider", () => {
    const Orphan = () => {
      useVoice();
      return null;
    };
    // React reports the render error through console.error; keep the test
    // output clean, but always restore the real console even when the
    // expectation below fails so later tests are not silently muted.
    const errorSpy = vi.spyOn(console, "error").mockImplementation(() => {});
    try {
      const orphanRoot = createRoot(container);
      root = orphanRoot;
      expect(() =>
        act(() => {
          orphanRoot.render(<Orphan />);
        }),
      ).toThrow(/VoiceProvider/);
    } finally {
      errorSpy.mockRestore();
    }
  });
});
|
||||
315
ui/src/context/VoiceContext.tsx
Normal file
315
ui/src/context/VoiceContext.tsx
Normal file
|
|
@ -0,0 +1,315 @@
|
|||
import {
|
||||
createContext,
|
||||
useCallback,
|
||||
useContext,
|
||||
useMemo,
|
||||
useRef,
|
||||
useState,
|
||||
type ReactNode,
|
||||
} from "react";
|
||||
|
||||
/**
|
||||
* VoiceContext — Phase 14 globalization of voice capture.
|
||||
*
|
||||
* Before Phase 14, the voice capture state (MediaStream, recorder, VAD
|
||||
* transcription pipeline) lived inside `ChatInput`'s `VoiceMicButton`.
|
||||
* Phase 14 lifts it up so the top-strip `GlobalMicButton` can drive voice
|
||||
* from any route, and speech captured while the user is away from
|
||||
* `/assistant` is queued for draining when they arrive there.
|
||||
*
|
||||
* The existing `/api/transcribe` endpoint (server-side Whisper pipeline
|
||||
* shipped in v1.6) is consumed unchanged — see `voice-pipeline.ts` on the
|
||||
* server. No new backend endpoint is introduced.
|
||||
*
|
||||
* Spec references:
|
||||
* - §4.2 GlobalMicButton states (idle / listening / speaking)
|
||||
* - §5.5 voice routing from non-Assistant modes → queue → Assistant inbox
|
||||
* - §10.3 voice as global affordance
|
||||
* - §10.4 single notification surface
|
||||
*/
|
||||
|
||||
/**
 * Phase of the voice capture lifecycle exposed to consumers.
 *
 * - `"idle"`: the initial value; also restored after a capture finishes or
 *   fails to start.
 * - `"listening"`: set once a microphone stream has been acquired.
 * - `"speaking"`: set by `stopListening` while the stopped capture is being
 *   finalized and transcribed.
 */
export type VoiceState = "idle" | "listening" | "speaking";
|
||||
|
||||
/** Per-capture options accepted by `startListening` / `toggleListening`. */
export interface VoiceStartOptions {
  /**
   * Set to true to keep this capture's transcript off the Assistant queue.
   * Intended for ChatInput's in-place mic, which receives the text through
   * `onTranscript` and inserts it into the textarea itself.
   */
  inline?: boolean;

  /**
   * Observer called with the transcript text once transcription has produced
   * a non-empty result. It runs whether or not `inline` is set, so a caller
   * can watch speech in addition to (or instead of) the queue push.
   */
  onTranscript?: (text: string) => void;
}
|
||||
|
||||
/** Everything `useVoice()` hands to consumers: capture state plus controls. */
export interface VoiceContextValue {
  /** Current capture phase. */
  state: VoiceState;

  /** The microphone stream being captured, or `null` when none is held. */
  mediaStream: MediaStream | null;

  /** Text of the most recent transcription; reset by `clearTranscript`. */
  transcript: string;

  /** Transcripts waiting to be drained, oldest first. */
  queue: string[];

  /** True whenever `queue` has at least one entry. */
  hasQueuedVoice: boolean;

  /**
   * Request the microphone and begin a capture. If access is denied the
   * state remains `idle` and the failure is logged with `console.error`
   * (no toast — Layout owns toasts).
   *
   * With `options.inline` set, the finished transcript is kept off the
   * Assistant queue and the caller consumes it directly (ChatInput's
   * in-place mic). With `options.onTranscript` set, that callback receives
   * the finished transcript in addition to, or instead of, the queue push.
   */
  startListening: (options?: VoiceStartOptions) => Promise<void>;

  /** End the current capture and flush what was recorded to transcription. */
  stopListening: () => Promise<void>;

  /**
   * Click handler semantics for GlobalMicButton: stops when listening,
   * starts when idle.
   */
  toggleListening: (options?: VoiceStartOptions) => Promise<void>;

  /**
   * Append a transcript to the queue. Per the design notes,
   * PersonalAssistant drains the queue on mount and sends each entry as a
   * new user message through the existing chat streaming pipeline.
   */
  enqueue: (text: string) => void;

  /** Return every queued transcript and leave the queue empty. */
  drainQueue: () => string[];

  /** Reset the transient `transcript` buffer; the queue is untouched. */
  clearTranscript: () => void;
}
|
||||
|
||||
// The default value is `undefined` (no provider mounted) so that `useVoice`
// can detect a missing <VoiceProvider> and throw instead of returning a
// placeholder value.
const VoiceContext = createContext<VoiceContextValue | undefined>(undefined);
|
||||
|
||||
/** Props for `VoiceProvider`; everything except `children` is a test seam. */
interface VoiceProviderProps {
  /** Subtree that gains access to the voice context. */
  children: ReactNode;

  /**
   * Replacement for the transcription step, letting tests stay off the
   * network. Left unset, the provider issues
   * `fetch("/api/transcribe", ...)` itself.
   */
  transcribe?: (audio: Blob) => Promise<string>;

  /**
   * Replacement for `navigator.mediaDevices.getUserMedia`, so tests can
   * supply deterministic stream objects without patching the global
   * navigator.
   */
  getUserMedia?: (constraints: MediaStreamConstraints) => Promise<MediaStream>;

  /**
   * Set to true to suppress the provider's `console.error` output for
   * capture and transcription failures. Unset, failures are logged.
   */
  silenceErrors?: boolean;
}
|
||||
|
||||
/**
 * Default transcription transport: uploads a recording to the existing
 * `POST /api/transcribe` endpoint as multipart form data and returns the
 * recognized text.
 *
 * @param audio Recorded audio, sent as the `audio` form field with the
 *   filename `recording.webm`.
 * @returns The trimmed `text` field of the JSON response, or `""` when the
 *   response does not carry a string `text`.
 * @throws Error `transcribe failed: <status>` when the response status is
 *   not ok. Network failures and invalid JSON propagate from `fetch` /
 *   `Response.json()`.
 */
async function defaultTranscribe(audio: Blob): Promise<string> {
  const formData = new FormData();
  formData.append("audio", audio, "recording.webm");

  const res = await fetch("/api/transcribe", {
    method: "POST",
    credentials: "include",
    body: formData,
  });
  if (!res.ok) {
    throw new Error(`transcribe failed: ${res.status}`);
  }

  // The response body is external input, so check its shape instead of
  // casting: a `null` body or a non-string `text` must yield an empty
  // transcript rather than a TypeError from `.trim()`.
  const data: unknown = await res.json();
  const text =
    typeof data === "object" && data !== null
      ? (data as { text?: unknown }).text
      : undefined;
  return typeof text === "string" ? text.trim() : "";
}
|
||||
|
||||
/**
 * Owns the app-wide voice capture: the microphone stream, the recorder, the
 * latest transcript, and the queue of transcripts waiting to be drained.
 *
 * Lifecycle: `idle` → (`startListening`) → `listening` → (`stopListening`)
 * → `speaking` while the recording is transcribed → `idle`.
 *
 * @param children Subtree that can call `useVoice()`.
 * @param transcribe Turns a recorded Blob into text; defaults to
 *   `defaultTranscribe`.
 * @param getUserMedia Optional microphone injector; defaults to
 *   `navigator.mediaDevices.getUserMedia`.
 * @param silenceErrors When true, failures are not written to
 *   `console.error`. Defaults to false.
 */
export function VoiceProvider({
  children,
  transcribe = defaultTranscribe,
  getUserMedia,
  silenceErrors = false,
}: VoiceProviderProps) {
  const [state, setState] = useState<VoiceState>("idle");
  const [transcript, setTranscript] = useState("");
  const [queue, setQueue] = useState<string[]>([]);
  const [mediaStream, setMediaStream] = useState<MediaStream | null>(null);

  const recorderRef = useRef<MediaRecorder | null>(null);
  const chunksRef = useRef<Blob[]>([]);
  const streamRef = useRef<MediaStream | null>(null);
  const optionsRef = useRef<VoiceStartOptions>({});
  // Mirror of `queue` for synchronous reads: drainQueue() must hand back the
  // entries immediately, without waiting for a React re-render.
  const queueRef = useRef<string[]>([]);
  // Mirror of `state` for synchronous reads. The callbacks below consult this
  // instead of the render-time `state`, so a second click that arrives before
  // React re-renders (or a stale callback reference) sees the real phase.
  const phaseRef = useRef<VoiceState>("idle");
  // True while a microphone request is in flight. `state` is still "idle"
  // for the whole permission prompt, so this is what blocks a duplicate
  // acquisition whose stream would otherwise be orphaned and never stopped.
  const acquiringRef = useRef(false);

  /** Updates the synchronous mirror and the rendered state together. */
  const setPhase = useCallback((next: VoiceState) => {
    phaseRef.current = next;
    setState(next);
  }, []);

  /** console.error unless the provider was asked to stay silent. */
  const logError = useCallback(
    (...args: unknown[]) => {
      if (!silenceErrors) console.error(...args);
    },
    [silenceErrors],
  );

  /** Stops every track of the held stream (if any) and clears the stream. */
  const stopTracks = useCallback(() => {
    const held = streamRef.current;
    if (held) {
      held.getTracks().forEach((track) => {
        try {
          track.stop();
        } catch {
          // Best effort: a track that refuses to stop must not block cleanup.
        }
      });
      streamRef.current = null;
    }
    setMediaStream(null);
  }, []);

  const enqueue = useCallback((text: string) => {
    const trimmed = text.trim();
    if (!trimmed) return;
    queueRef.current = [...queueRef.current, trimmed];
    setQueue(queueRef.current);
  }, []);

  const drainQueue = useCallback((): string[] => {
    const drained = queueRef.current;
    queueRef.current = [];
    setQueue([]);
    return drained;
  }, []);

  const clearTranscript = useCallback(() => {
    setTranscript("");
  }, []);

  const startListening = useCallback(
    async (options: VoiceStartOptions = {}): Promise<void> => {
      if (phaseRef.current !== "idle" || acquiringRef.current) return;
      acquiringRef.current = true;
      optionsRef.current = options;

      try {
        // Inside the try so that a synchronously throwing injector is handled
        // like a rejected one and cannot leave `acquiringRef` stuck.
        const pending = getUserMedia
          ? getUserMedia({ audio: true })
          : navigator.mediaDevices?.getUserMedia?.({ audio: true });

        if (!pending) {
          logError("[VoiceContext] getUserMedia is not available");
          return;
        }

        const stream = await pending;
        streamRef.current = stream;
        setMediaStream(stream);
        chunksRef.current = [];

        // MediaRecorder may not exist in some test environments. The stream
        // is tracked regardless so consumers can still observe the capture.
        if (typeof MediaRecorder !== "undefined") {
          const recorder = new MediaRecorder(stream);
          recorder.ondataavailable = (event) => {
            if (event.data && event.data.size > 0) {
              chunksRef.current.push(event.data);
            }
          };
          recorderRef.current = recorder;
          recorder.start();
        }

        setPhase("listening");
      } catch (err) {
        logError("[VoiceContext] Microphone access denied:", err);
        recorderRef.current = null;
        stopTracks();
        setPhase("idle");
      } finally {
        acquiringRef.current = false;
      }
    },
    [getUserMedia, logError, setPhase, stopTracks],
  );

  const stopListening = useCallback(async (): Promise<void> => {
    if (phaseRef.current !== "listening") {
      stopTracks();
      return;
    }

    setPhase("speaking");

    const recorder = recorderRef.current;
    recorderRef.current = null;

    // Wait for the recorder to finish so its final data chunk has been
    // delivered before the recording is assembled.
    if (recorder && recorder.state !== "inactive") {
      await new Promise<void>((resolve) => {
        recorder.onstop = () => resolve();
        try {
          recorder.stop();
        } catch {
          resolve();
        }
      });
    }

    try {
      if (chunksRef.current.length > 0) {
        const blob = new Blob(chunksRef.current, {
          type: recorder?.mimeType || "audio/webm",
        });
        chunksRef.current = [];
        const text = await transcribe(blob);
        if (text) {
          setTranscript(text);
          const { inline, onTranscript } = optionsRef.current;
          // A failing observer must not cost the user their transcript:
          // report it separately and still perform the queue push.
          try {
            onTranscript?.(text);
          } catch (err) {
            logError("[VoiceContext] onTranscript callback failed:", err);
          }
          if (!inline) {
            enqueue(text);
          }
        }
      }
    } catch (err) {
      logError("[VoiceContext] Transcription error:", err);
    } finally {
      stopTracks();
      setPhase("idle");
    }
  }, [transcribe, enqueue, logError, setPhase, stopTracks]);

  const toggleListening = useCallback(
    async (options: VoiceStartOptions = {}): Promise<void> => {
      const phase = phaseRef.current;
      if (phase === "listening") {
        await stopListening();
        return;
      }
      if (phase === "idle") {
        await startListening(options);
      }
    },
    [startListening, stopListening],
  );

  const value = useMemo<VoiceContextValue>(
    () => ({
      state,
      mediaStream,
      transcript,
      queue,
      hasQueuedVoice: queue.length > 0,
      startListening,
      stopListening,
      toggleListening,
      enqueue,
      drainQueue,
      clearTranscript,
    }),
    [
      state,
      mediaStream,
      transcript,
      queue,
      startListening,
      stopListening,
      toggleListening,
      enqueue,
      drainQueue,
      clearTranscript,
    ],
  );

  return <VoiceContext.Provider value={value}>{children}</VoiceContext.Provider>;
}
|
||||
|
||||
/**
 * Hook giving access to the voice context.
 *
 * @returns The value supplied by the nearest `VoiceProvider`.
 * @throws Error when no `VoiceProvider` is mounted above the caller.
 */
export function useVoice(): VoiceContextValue {
  const value = useContext(VoiceContext);
  if (value) {
    return value;
  }
  throw new Error("useVoice must be used within a VoiceProvider");
}
|
||||
Loading…
Add table
Reference in a new issue