feat(nexus): VoiceContext for phase 14 voice globalization
Lifts the MediaStream, recording state, transcription buffer, and the queue for non-Assistant captures out of ChatInput's internal VoiceMicButton. The provider owns the POST /api/transcribe fetch (v1.6 pipeline, unchanged), exposes idle/listening/speaking state to the top-strip GlobalMicButton, and queues transcripts captured away from /assistant for PersonalAssistant to drain on mount. Implements spec sections 4.2 (mic states), 5.5 (voice from non-Assistant modes), and 10.3 (voice as global affordance). Tests use manual createRoot + act with an injected getUserMedia mock to stay deterministic in jsdom. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
4623c8aea0
commit
14ecbf00bb
2 changed files with 507 additions and 0 deletions
192
ui/src/context/VoiceContext.test.tsx
Normal file
192
ui/src/context/VoiceContext.test.tsx
Normal file
|
|
@ -0,0 +1,192 @@
|
||||||
|
// @vitest-environment jsdom
|
||||||
|
|
||||||
|
import { act } from "react";
|
||||||
|
import { createRoot } from "react-dom/client";
|
||||||
|
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
|
||||||
|
import { VoiceProvider, useVoice } from "./VoiceContext";
|
||||||
|
|
||||||
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
(globalThis as any).IS_REACT_ACT_ENVIRONMENT = true; // opt in to React's act() environment; this suite drives rendering through manual createRoot + act
|
||||||
|
|
||||||
|
// Lightweight MediaStream stand-in for jsdom.
|
||||||
|
/**
 * Builds a minimal `MediaStream` stand-in for jsdom.
 *
 * Only `getTracks()` is provided. It returns the same single-element array on
 * every call, and that element's `stop` is a vitest mock, so a test can assert
 * that the provider stopped the track.
 */
function makeFakeStream(): MediaStream {
  const tracks = [{ stop: vi.fn() }];
  // The double cast is deliberate: the object is intentionally partial.
  return { getTracks: () => tracks } as unknown as MediaStream;
}
|
||||||
|
|
||||||
|
describe("VoiceContext", () => {
  type VoiceHandle = ReturnType<typeof useVoice>;
  type ProviderOverrides = {
    getUserMedia?: (c: MediaStreamConstraints) => Promise<MediaStream>;
    transcribe?: (blob: Blob) => Promise<string>;
  };

  let container: HTMLDivElement;
  let root: ReturnType<typeof createRoot> | null = null;

  beforeEach(() => {
    container = document.createElement("div");
    document.body.appendChild(container);
    root = null;
  });

  afterEach(() => {
    const mounted = root;
    root = null;
    if (mounted) {
      act(() => {
        mounted.unmount();
      });
    }
    if (container.parentNode) container.remove();
  });

  /** Mounts `Consumer` under a `VoiceProvider` with console logging silenced. */
  function renderWithProvider(Consumer: React.FC, props: ProviderOverrides = {}) {
    const mounted = createRoot(container);
    root = mounted;
    act(() => {
      mounted.render(
        <VoiceProvider {...props} silenceErrors>
          <Consumer />
        </VoiceProvider>,
      );
    });
  }

  /**
   * Mounts a consumer that records the context value on every render and
   * returns a handle whose `current` getter always yields the latest value.
   *
   * A getter is used instead of a `let` assigned inside the component: the
   * compiler does not see assignments made inside closures, so a
   * `let x: T | null = null` read back as `x!` is narrowed to `never`.
   */
  function renderProbe(props: ProviderOverrides = {}): { readonly current: VoiceHandle } {
    const latest: { value: VoiceHandle | null } = { value: null };
    const Probe = () => {
      const voice = useVoice();
      latest.value = voice;
      return <div data-testid="state">{voice.state}</div>;
    };
    renderWithProvider(Probe, props);
    return {
      get current(): VoiceHandle {
        if (!latest.value) throw new Error("probe has not rendered yet");
        return latest.value;
      },
    };
  }

  it("starts in idle state with an empty queue", () => {
    const voice = renderProbe();

    expect(voice.current.state).toBe("idle");
    expect(voice.current.queue).toEqual([]);
    expect(voice.current.hasQueuedVoice).toBe(false);
  });

  it("transitions idle → listening when startListening resolves", async () => {
    const fakeStream = makeFakeStream();
    const getUserMedia = vi.fn(async () => fakeStream);
    const voice = renderProbe({ getUserMedia });

    await act(async () => {
      await voice.current.startListening();
    });

    expect(getUserMedia).toHaveBeenCalledWith({ audio: true });
    expect(voice.current.state).toBe("listening");
    expect(voice.current.mediaStream).toBe(fakeStream);
  });

  it("handles getUserMedia rejection by staying idle", async () => {
    const getUserMedia = vi.fn(async () => {
      throw new Error("NotAllowedError");
    });
    const voice = renderProbe({ getUserMedia });

    await act(async () => {
      await voice.current.startListening();
    });

    expect(voice.current.state).toBe("idle");
  });

  it("enqueue adds transcripts to the queue and sets hasQueuedVoice", () => {
    const voice = renderProbe();

    act(() => {
      voice.current.enqueue("hello world");
    });

    expect(voice.current.queue).toEqual(["hello world"]);
    expect(voice.current.hasQueuedVoice).toBe(true);
  });

  it("drainQueue returns all entries and clears the queue", () => {
    const voice = renderProbe();

    act(() => {
      voice.current.enqueue("one");
      voice.current.enqueue("two");
    });
    expect(voice.current.queue.length).toBe(2);

    let drained: string[] = [];
    act(() => {
      drained = voice.current.drainQueue();
    });

    expect(drained).toEqual(["one", "two"]);
    expect(voice.current.queue).toEqual([]);
    expect(voice.current.hasQueuedVoice).toBe(false);
  });

  it("stopListening returns the context to idle and stops stream tracks", async () => {
    const stop = vi.fn();
    const fakeStream = { getTracks: () => [{ stop }] } as unknown as MediaStream;
    const getUserMedia = vi.fn(async () => fakeStream);
    const transcribe = vi.fn(async () => "transcribed text");
    const voice = renderProbe({ getUserMedia, transcribe });

    await act(async () => {
      await voice.current.startListening();
    });
    expect(voice.current.state).toBe("listening");

    await act(async () => {
      await voice.current.stopListening();
    });

    expect(voice.current.state).toBe("idle");
    expect(stop).toHaveBeenCalled();
  });

  it("throws when useVoice is used outside a provider", () => {
    const Consumer = () => {
      useVoice();
      return null;
    };
    // React reports the render error through console.error; keep the output quiet.
    const spy = vi.spyOn(console, "error").mockImplementation(() => {});
    try {
      const orphanRoot = createRoot(container);
      root = orphanRoot;
      expect(() =>
        act(() => {
          orphanRoot.render(<Consumer />);
        }),
      ).toThrow(/VoiceProvider/);
    } finally {
      // Restore even when the assertion fails so later tests see the real console.
      spy.mockRestore();
    }
  });
});
|
||||||
315
ui/src/context/VoiceContext.tsx
Normal file
315
ui/src/context/VoiceContext.tsx
Normal file
|
|
@ -0,0 +1,315 @@
|
||||||
|
import {
|
||||||
|
createContext,
|
||||||
|
useCallback,
|
||||||
|
useContext,
|
||||||
|
useMemo,
|
||||||
|
useRef,
|
||||||
|
useState,
|
||||||
|
type ReactNode,
|
||||||
|
} from "react";
|
||||||
|
|
||||||
|
/**
|
||||||
|
* VoiceContext — Phase 14 globalization of voice capture.
|
||||||
|
*
|
||||||
|
* Before Phase 14, the voice capture state (MediaStream, recorder, VAD
|
||||||
|
* transcription pipeline) lived inside `ChatInput`'s `VoiceMicButton`.
|
||||||
|
* Phase 14 lifts it up so the top-strip `GlobalMicButton` can drive voice
|
||||||
|
* from any route, and speech captured while the user is away from
|
||||||
|
* `/assistant` is queued for draining when they arrive there.
|
||||||
|
*
|
||||||
|
* The existing `/api/transcribe` endpoint (server-side Whisper pipeline
|
||||||
|
* shipped in v1.6) is consumed unchanged — see `voice-pipeline.ts` on the
|
||||||
|
* server. No new backend endpoint is introduced.
|
||||||
|
*
|
||||||
|
* Spec references:
|
||||||
|
* - §4.2 GlobalMicButton states (idle / listening / speaking)
|
||||||
|
* - §5.5 voice routing from non-Assistant modes → queue → Assistant inbox
|
||||||
|
* - §10.3 voice as global affordance
|
||||||
|
* - §10.4 single notification surface
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
 * Capture lifecycle exposed by the provider: it starts in `idle`, moves to
 * `listening` once a stream has been acquired, and sits in `speaking` while
 * `stopListening` flushes the recording through transcription before going
 * back to `idle`.
 */
export type VoiceState = "idle" | "listening" | "speaking";
|
||||||
|
|
||||||
|
/** Per-capture options accepted by `startListening` / `toggleListening`. */
export interface VoiceStartOptions {
  /**
   * When true, the transcript produced by this capture is NOT pushed onto
   * the Assistant queue. Used by ChatInput's in-place mic, which inserts the
   * transcript directly into the textarea via `onTranscript`.
   */
  inline?: boolean;
  /**
   * Optional callback invoked once transcription completes with a non-empty
   * result. It receives the text exactly as the transcriber returned it (the
   * default transcriber trims it). Called whether or not `inline` is set, so
   * consumers can observe speech alongside the queue.
   */
  onTranscript?: (text: string) => void;
}
|
||||||
|
|
||||||
|
/** Value published by `VoiceProvider` and returned from `useVoice()`. */
export interface VoiceContextValue {
  /** Current capture lifecycle state. */
  state: VoiceState;
  /** The stream being captured, or `null` when no capture is active. */
  mediaStream: MediaStream | null;
  /** Most recent non-empty transcript; reset with `clearTranscript`. */
  transcript: string;
  /** Transcripts waiting to be drained, oldest first. */
  queue: string[];
  /** Convenience flag: `queue.length > 0`. */
  hasQueuedVoice: boolean;
  /**
   * Start the microphone. Does nothing unless the state is `idle`. If
   * permission is denied (or capture cannot start) the state stays `idle`
   * and the error is logged to the console unless the provider was created
   * with `silenceErrors` (no toast — Layout owns toasts).
   *
   * When `options.inline` is true, the resulting transcript is NOT pushed
   * onto the Assistant queue — the caller consumes it through
   * `options.onTranscript` or the `transcript` field (used by ChatInput's
   * in-place mic). When `options.onTranscript` is provided, it is invoked
   * with the final transcript alongside (or in place of) the queue push.
   */
  startListening: (options?: VoiceStartOptions) => Promise<void>;
  /**
   * Stop the current capture, transcribe what was recorded, and release the
   * microphone. When no capture is in the `listening` state it only releases
   * any held stream tracks.
   */
  stopListening: () => Promise<void>;
  /**
   * Cycle idle → listening → idle. Used by GlobalMicButton's click. Does
   * nothing while the state is `speaking`.
   */
  toggleListening: (options?: VoiceStartOptions) => Promise<void>;
  /**
   * Push a transcript onto the queue (trimmed; blank input is ignored).
   * PersonalAssistant drains this on mount and sends each entry as a new
   * user message through the existing chat streaming pipeline.
   */
  enqueue: (text: string) => void;
  /** Drain and return every queued transcript, clearing the queue. */
  drainQueue: () => string[];
  /** Clear the transient transcript buffer (not the queue). */
  clearTranscript: () => void;
}
|
||||||
|
|
||||||
|
// Defaults to `undefined` so `useVoice` can detect a missing provider and throw.
const VoiceContext = createContext<VoiceContextValue | undefined>(undefined);
|
||||||
|
|
||||||
|
/** Props accepted by `VoiceProvider`. */
interface VoiceProviderProps {
  children: ReactNode;
  /**
   * Optional override so tests can avoid hitting a real network. When
   * omitted the provider calls `fetch("/api/transcribe", ...)`.
   */
  transcribe?: (audio: Blob) => Promise<string>;
  /**
   * Optional injector for `navigator.mediaDevices.getUserMedia`, used by
   * tests that need deterministic stream objects without polluting the
   * global navigator.
   */
  getUserMedia?: (constraints: MediaStreamConstraints) => Promise<MediaStream>;
  /**
   * When true the provider swallows capture and transcription errors
   * silently. Defaults to `false`, in which case they are reported with
   * `console.error`.
   */
  silenceErrors?: boolean;
}
|
||||||
|
|
||||||
|
/**
 * Default transcriber: uploads the recording to `POST /api/transcribe` as
 * multipart form data (field `audio`, filename `recording.webm`) with
 * credentials included, and returns the trimmed `text` field of the JSON
 * response.
 *
 * @param audio Recorded audio to transcribe.
 * @returns The trimmed transcript, or `""` when the response carries no
 *   string `text` field (missing, `null`, wrong type, or a non-object body).
 * @throws Error when the response status is not OK, or when the request or
 *   JSON parsing itself fails.
 */
async function defaultTranscribe(audio: Blob): Promise<string> {
  const formData = new FormData();
  formData.append("audio", audio, "recording.webm");

  const res = await fetch("/api/transcribe", {
    method: "POST",
    credentials: "include",
    body: formData,
  });
  if (!res.ok) {
    throw new Error(`transcribe failed: ${res.status}`);
  }

  // Narrow the payload instead of trusting a cast: a `null` body or a
  // non-string `text` would otherwise throw a TypeError from `.trim()`.
  const data: unknown = await res.json();
  const text =
    typeof data === "object" && data !== null
      ? (data as { text?: unknown }).text
      : undefined;
  return typeof text === "string" ? text.trim() : "";
}
|
||||||
|
|
||||||
|
/**
 * Owns microphone capture, transcription, and the queue of transcripts
 * captured away from the Assistant, and publishes them through `VoiceContext`.
 *
 * @param children Subtree that may call `useVoice()`.
 * @param transcribe Turns a recorded blob into text; defaults to the
 *   `/api/transcribe` fetch.
 * @param getUserMedia Optional replacement for
 *   `navigator.mediaDevices.getUserMedia`.
 * @param silenceErrors When true, capture/transcription errors are not logged.
 */
export function VoiceProvider({
  children,
  transcribe = defaultTranscribe,
  getUserMedia,
  silenceErrors = false,
}: VoiceProviderProps) {
  const [state, setState] = useState<VoiceState>("idle");
  const [transcript, setTranscript] = useState("");
  const [queue, setQueue] = useState<string[]>([]);
  const [mediaStream, setMediaStream] = useState<MediaStream | null>(null);

  const recorderRef = useRef<MediaRecorder | null>(null);
  const chunksRef = useRef<Blob[]>([]);
  const streamRef = useRef<MediaStream | null>(null);
  const optionsRef = useRef<VoiceStartOptions>({});
  // Mirror of `queue` maintained for synchronous reads — drainQueue()
  // must return entries immediately so the caller can iterate without
  // waiting for a React re-render.
  const queueRef = useRef<string[]>([]);
  // Mirror of `state` for the same reason: the start/stop guards must see the
  // current phase even when invoked twice before React re-renders, or through
  // a callback captured during an earlier render.
  const stateRef = useRef<VoiceState>("idle");
  // True while a getUserMedia request is in flight; `state` is still "idle"
  // during that window, so it cannot serve as the re-entrancy guard alone.
  const acquiringRef = useRef(false);

  /** Updates the synchronous mirror and the rendered state together. */
  const transition = useCallback((next: VoiceState) => {
    stateRef.current = next;
    setState(next);
  }, []);

  /** Stops every track of the held stream (if any) and clears `mediaStream`. */
  const stopTracks = useCallback(() => {
    const held = streamRef.current;
    if (held) {
      held.getTracks().forEach((track) => {
        try {
          track.stop();
        } catch {
          // Best effort: a track that fails to stop must not block cleanup.
        }
      });
      streamRef.current = null;
    }
    setMediaStream(null);
  }, []);

  const enqueue = useCallback((text: string) => {
    const trimmed = text.trim();
    if (!trimmed) return;
    queueRef.current = [...queueRef.current, trimmed];
    setQueue(queueRef.current);
  }, []);

  const drainQueue = useCallback((): string[] => {
    const drained = queueRef.current;
    queueRef.current = [];
    setQueue([]);
    return drained;
  }, []);

  const clearTranscript = useCallback(() => {
    setTranscript("");
  }, []);

  const startListening = useCallback(
    async (options: VoiceStartOptions = {}): Promise<void> => {
      // Guard on the refs, not on render-time `state`: otherwise two calls
      // issued before a re-render would both acquire a stream and the first
      // one would be overwritten in `streamRef` and never stopped.
      if (stateRef.current !== "idle" || acquiringRef.current) return;

      optionsRef.current = options;
      acquiringRef.current = true;
      try {
        const pending = getUserMedia
          ? getUserMedia({ audio: true })
          : navigator.mediaDevices?.getUserMedia?.({ audio: true });

        if (!pending) {
          if (!silenceErrors) {
            console.error("[VoiceContext] getUserMedia is not available");
          }
          return;
        }

        const stream = await pending;
        streamRef.current = stream;
        setMediaStream(stream);
        chunksRef.current = [];

        // MediaRecorder may not exist in some test environments. We only
        // instantiate it when available — the stream itself is still tracked
        // so VoiceWaveform and other consumers can observe capture state.
        if (typeof MediaRecorder !== "undefined") {
          const recorder = new MediaRecorder(stream);
          recorder.ondataavailable = (e) => {
            if (e.data && e.data.size > 0) chunksRef.current.push(e.data);
          };
          recorderRef.current = recorder;
          recorder.start();
        }

        transition("listening");
      } catch (err) {
        if (!silenceErrors) {
          console.error("[VoiceContext] Microphone access denied:", err);
        }
        // Drop a recorder that was created but failed to start.
        recorderRef.current = null;
        stopTracks();
        transition("idle");
      } finally {
        acquiringRef.current = false;
      }
    },
    [getUserMedia, silenceErrors, stopTracks, transition],
  );

  const stopListening = useCallback(async (): Promise<void> => {
    if (stateRef.current !== "listening") {
      stopTracks();
      return;
    }

    transition("speaking");

    const recorder = recorderRef.current;
    recorderRef.current = null;

    // Wait for the recorder to stop so its chunks are available before
    // the blob is assembled.
    if (recorder && recorder.state !== "inactive") {
      await new Promise<void>((resolve) => {
        recorder.onstop = () => resolve();
        try {
          recorder.stop();
        } catch {
          resolve();
        }
      });
    }

    try {
      if (chunksRef.current.length > 0) {
        const blob = new Blob(chunksRef.current, {
          type: recorder?.mimeType || "audio/webm",
        });
        chunksRef.current = [];
        const text = await transcribe(blob);
        if (text) {
          setTranscript(text);
          const { inline, onTranscript } = optionsRef.current;
          // Isolate the consumer callback: if it throws, the transcript must
          // still reach the queue.
          try {
            onTranscript?.(text);
          } catch (err) {
            if (!silenceErrors) {
              console.error("[VoiceContext] onTranscript callback failed:", err);
            }
          }
          if (!inline) {
            enqueue(text);
          }
        }
      }
    } catch (err) {
      if (!silenceErrors) {
        console.error("[VoiceContext] Transcription error:", err);
      }
    } finally {
      stopTracks();
      transition("idle");
    }
  }, [transcribe, enqueue, silenceErrors, stopTracks, transition]);

  const toggleListening = useCallback(
    async (options: VoiceStartOptions = {}): Promise<void> => {
      if (stateRef.current === "listening") {
        await stopListening();
        return;
      }
      if (stateRef.current === "idle") {
        await startListening(options);
      }
    },
    [startListening, stopListening],
  );

  const value = useMemo<VoiceContextValue>(
    () => ({
      state,
      mediaStream,
      transcript,
      queue,
      hasQueuedVoice: queue.length > 0,
      startListening,
      stopListening,
      toggleListening,
      enqueue,
      drainQueue,
      clearTranscript,
    }),
    [
      state,
      mediaStream,
      transcript,
      queue,
      startListening,
      stopListening,
      toggleListening,
      enqueue,
      drainQueue,
      clearTranscript,
    ],
  );

  return <VoiceContext.Provider value={value}>{children}</VoiceContext.Provider>;
}
|
||||||
|
|
||||||
|
/**
 * Reads the voice context published by the nearest `VoiceProvider`.
 *
 * @returns The current `VoiceContextValue`.
 * @throws Error when called outside a `VoiceProvider`.
 */
export function useVoice(): VoiceContextValue {
  const ctx = useContext(VoiceContext);
  if (!ctx) {
    throw new Error("useVoice must be used within a VoiceProvider");
  }
  return ctx;
}
|
||||||
Loading…
Add table
Reference in a new issue