diff --git a/ui/src/hooks/useVadRecorder.ts b/ui/src/hooks/useVadRecorder.ts
new file mode 100644
index 00000000..6debfb2c
--- /dev/null
+++ b/ui/src/hooks/useVadRecorder.ts
@@ -0,0 +1,164 @@
+import { useState, useRef, useCallback, useEffect } from "react";
+import { useMicVAD } from "@ricky0123/vad-react";
+import { encodeWav } from "../lib/encodeWav";
+
+/** Shortest trimmed transcript, in characters, that is passed to the caller. */
+const MIN_TRANSCRIPT_LENGTH = 2;
+
+type RecorderState = "idle" | "recording" | "processing";
+
+interface UseVadRecorderOptions {
+  /** Receives the trimmed transcript of each successfully transcribed utterance. */
+  onTranscript: (text: string) => void;
+}
+
+interface UseVadRecorderReturn {
+  state: RecorderState;
+  start: () => void;
+  stop: () => void;
+  /** Microphone stream opened by `start()`, or `null` once it has been released. */
+  mediaStream: MediaStream | null;
+}
+
+/** Stops every track of `stream`; a `null` stream is ignored. */
+function stopTracks(stream: MediaStream | null): void {
+  if (stream) {
+    stream.getTracks().forEach((track) => track.stop());
+  }
+}
+
+/**
+ * Records a single utterance with voice-activity detection and submits it to
+ * `/api/transcribe`.
+ *
+ * `start()` opens the microphone and starts the VAD. When the VAD reports the
+ * end of speech, the audio is WAV-encoded and POSTed, and a transcript of at
+ * least `MIN_TRANSCRIPT_LENGTH` characters is handed to `opts.onTranscript`.
+ * `stop()` pauses the VAD and releases the microphone; it does not cancel a
+ * transcription request that is already in flight.
+ *
+ * @param opts - Recorder callbacks.
+ * @returns The recorder state, the `start`/`stop` controls and the current
+ *   microphone stream.
+ */
+export function useVadRecorder(opts: UseVadRecorderOptions): UseVadRecorderReturn {
+  const [state, setState] = useState<RecorderState>("idle");
+  // The stream is kept in state so consumers re-render when it changes; the ref
+  // mirrors it for synchronous access inside callbacks and cleanup.
+  const [mediaStream, setMediaStream] = useState<MediaStream | null>(null);
+  const mediaStreamRef = useRef<MediaStream | null>(null);
+  const isMountedRef = useRef(false);
+  // "Latest value" refs let handleSpeechEnd stay stable without reading a stale
+  // `vad` object or `onTranscript` callback from an earlier render.
+  const vadRef = useRef<ReturnType<typeof useMicVAD> | null>(null);
+  const onTranscriptRef = useRef(opts.onTranscript);
+
+  const releaseStream = useCallback(() => {
+    stopTracks(mediaStreamRef.current);
+    mediaStreamRef.current = null;
+    setMediaStream(null);
+  }, []);
+
+  const handleSpeechEnd = useCallback(
+    async (audio: Float32Array) => {
+      // The utterance is complete: pause the VAD and release the waveform stream.
+      if (vadRef.current) {
+        vadRef.current.pause();
+      }
+      releaseStream();
+      setState("processing");
+
+      try {
+        const formData = new FormData();
+        formData.append("audio", encodeWav(audio), "recording.wav");
+
+        const res = await fetch("/api/transcribe", {
+          method: "POST",
+          credentials: "include",
+          body: formData,
+        });
+
+        if (!res.ok) {
+          console.error(`[useVadRecorder] Transcription failed with status ${res.status}`);
+          return;
+        }
+
+        const data = (await res.json()) as { text?: unknown } | null;
+        // Trim before the length check so whitespace-only results are dropped.
+        const text = data && typeof data.text === "string" ? data.text.trim() : "";
+        if (text.length >= MIN_TRANSCRIPT_LENGTH) {
+          onTranscriptRef.current(text);
+        }
+      } catch (err) {
+        console.error("[useVadRecorder] Transcription error:", err);
+      } finally {
+        // Only leave "processing": a start() issued meanwhile must stay "recording".
+        setState((current) => (current === "processing" ? "idle" : current));
+      }
+    },
+    [releaseStream],
+  );
+
+  const vad = useMicVAD({
+    startOnLoad: false,
+    baseAssetPath: "/",
+    onnxWASMBasePath: "/",
+    positiveSpeechThreshold: 0.8,
+    negativeSpeechThreshold: 0.65,
+    redemptionFrames: 8,
+    minSpeechFrames: 5,
+    onSpeechStart: () => {
+      // VAD detected start of speech — no action needed, state was set to "recording" in start()
+    },
+    onSpeechEnd: handleSpeechEnd,
+  });
+
+  // Refresh the "latest value" refs after every render.
+  useEffect(() => {
+    vadRef.current = vad;
+    onTranscriptRef.current = opts.onTranscript;
+  });
+
+  // Track mount status and free the microphone if the component unmounts mid-session.
+  useEffect(() => {
+    isMountedRef.current = true;
+    return () => {
+      isMountedRef.current = false;
+      stopTracks(mediaStreamRef.current);
+      mediaStreamRef.current = null;
+    };
+  }, []);
+
+  const start = useCallback(async () => {
+    let stream: MediaStream;
+    try {
+      // Request a separate stream reference for VoiceWaveform AnalyserNode
+      stream = await navigator.mediaDevices.getUserMedia({ audio: true });
+    } catch (err) {
+      console.error("[useVadRecorder] Microphone access denied:", err);
+      return;
+    }
+
+    if (!isMountedRef.current) {
+      // Unmounted while the request was pending: nothing would ever stop this stream.
+      stopTracks(stream);
+      return;
+    }
+
+    // Stop, rather than orphan, a stream left over from an earlier start().
+    stopTracks(mediaStreamRef.current);
+    mediaStreamRef.current = stream;
+    setMediaStream(stream);
+
+    vad.start();
+    setState("recording");
+  }, [vad]);
+
+  const stop = useCallback(() => {
+    vad.pause();
+    releaseStream();
+    setState("idle");
+  }, [vad, releaseStream]);
+
+  return { state, start, stop, mediaStream };
+}
diff --git a/ui/src/hooks/useVoiceMode.ts b/ui/src/hooks/useVoiceMode.ts
new file mode 100644
index 00000000..52d8f431
--- /dev/null
+++ b/ui/src/hooks/useVoiceMode.ts
@@ -0,0 +1,100 @@
+import { useState, useEffect, useRef, useCallback } from "react";
+
+type VoiceMode = "text" | "voice_input" | "full_voice";
+
+interface UseVoiceModeReturn {
+  mode: VoiceMode;
+  setMode: (next: VoiceMode) => Promise<void>;
+  isLoading: boolean;
+}
+
+/** Narrows an untrusted settings value to a known `VoiceMode`. */
+function isVoiceMode(value: unknown): value is VoiceMode {
+  return value === "text" || value === "voice_input" || value === "full_voice";
+}
+
+/**
+ * Reads and updates the `voiceMode` setting served by `/api/nexus/settings`.
+ *
+ * The mode starts as `"text"` and is replaced by the stored value once the
+ * initial GET succeeds. `setMode` applies the new mode optimistically, PATCHes
+ * it to the server, and rolls back if the request fails.
+ *
+ * @returns The current mode, its setter, and whether the initial load is
+ *   still pending.
+ */
+export function useVoiceMode(): UseVoiceModeReturn {
+  const [mode, setModeState] = useState<VoiceMode>("text");
+  const [isLoading, setIsLoading] = useState(true);
+  // Last mode the server is known to hold; this is the rollback target.
+  const confirmedModeRef = useRef<VoiceMode>("text");
+  // Counts setMode calls so that each call can tell whether it is the latest.
+  const requestCounterRef = useRef(0);
+
+  // Load current voiceMode from nexus-settings on mount
+  useEffect(() => {
+    let cancelled = false;
+
+    const load = async () => {
+      try {
+        const res = await fetch("/api/nexus/settings", {
+          credentials: "include",
+        });
+        if (!res.ok) {
+          return;
+        }
+
+        const data = (await res.json()) as { voiceMode?: unknown } | null;
+        const stored = data ? data.voiceMode : undefined;
+        // Re-check `cancelled` after the second await, and let a mode chosen while
+        // the load was pending win over the loaded value.
+        if (cancelled || requestCounterRef.current > 0 || !isVoiceMode(stored)) {
+          return;
+        }
+
+        confirmedModeRef.current = stored;
+        setModeState(stored);
+      } catch (err) {
+        console.error("[useVoiceMode] Failed to load settings:", err);
+      } finally {
+        if (!cancelled) {
+          setIsLoading(false);
+        }
+      }
+    };
+
+    void load();
+    return () => {
+      cancelled = true;
+    };
+  }, []);
+
+  const setMode = useCallback(async (next: VoiceMode): Promise<void> => {
+    requestCounterRef.current += 1;
+    const requestId = requestCounterRef.current;
+    // Optimistic update
+    setModeState(next);
+
+    try {
+      const res = await fetch("/api/nexus/settings", {
+        method: "PATCH",
+        credentials: "include",
+        headers: { "Content-Type": "application/json" },
+        body: JSON.stringify({ voiceMode: next }),
+      });
+
+      if (!res.ok) {
+        throw new Error(`PATCH /api/nexus/settings returned ${res.status}`);
+      }
+      confirmedModeRef.current = next;
+    } catch (err) {
+      console.error("[useVoiceMode] Failed to update voiceMode:", err);
+      // Revert on error, unless a newer setMode call has already replaced this value
+      if (requestCounterRef.current === requestId) {
+        setModeState(confirmedModeRef.current);
+      }
+    }
+  }, []);
+
+  return { mode, setMode, isLoading };
+}
diff --git a/ui/src/lib/encodeWav.ts b/ui/src/lib/encodeWav.ts
new file mode 100644
index 00000000..871f7210
--- /dev/null
+++ b/ui/src/lib/encodeWav.ts
@@ -0,0 +1,62 @@
+/** Size in bytes of the RIFF/WAVE header written by `encodeWav`. */
+const HEADER_LENGTH = 44;
+/** Bytes per sample for 16-bit PCM. */
+const BYTES_PER_SAMPLE = 2;
+
+/** Writes `str` at `offset`, one byte per character; used for the ASCII chunk tags. */
+function writeString(view: DataView, offset: number, str: string): void {
+  for (let i = 0; i < str.length; i++) {
+    view.setUint8(offset + i, str.charCodeAt(i));
+  }
+}
+
+/**
+ * Encodes a Float32Array of mono audio samples into a 16-bit PCM WAV Blob.
+ *
+ * WAV format: 44-byte header + PCM 16-bit samples
+ * - RIFF chunk: "RIFF", file size, "WAVE"
+ * - fmt chunk: PCM (1), mono (1), sample rate, 16-bit depth
+ * - data chunk: raw PCM samples
+ *
+ * @param samples - Mono samples; values outside [-1, 1] are clamped.
+ * @param sampleRate - Sample rate in Hz written to the header; defaults to 16000.
+ * @returns A Blob of type `audio/wav`.
+ */
+export function encodeWav(samples: Float32Array, sampleRate = 16000): Blob {
+  const numChannels = 1;
+  const bitsPerSample = BYTES_PER_SAMPLE * 8;
+  const blockAlign = numChannels * BYTES_PER_SAMPLE;
+  const byteRate = sampleRate * blockAlign;
+
+  const dataLength = samples.length * BYTES_PER_SAMPLE;
+  const buffer = new ArrayBuffer(HEADER_LENGTH + dataLength);
+  const view = new DataView(buffer);
+
+  // RIFF chunk
+  writeString(view, 0, "RIFF");
+  view.setUint32(4, HEADER_LENGTH - 8 + dataLength, true); // file size - 8
+  writeString(view, 8, "WAVE");
+
+  // fmt chunk
+  writeString(view, 12, "fmt ");
+  view.setUint32(16, 16, true); // chunk size (16 for PCM)
+  view.setUint16(20, 1, true); // PCM format
+  view.setUint16(22, numChannels, true);
+  view.setUint32(24, sampleRate, true);
+  view.setUint32(28, byteRate, true);
+  view.setUint16(32, blockAlign, true);
+  view.setUint16(34, bitsPerSample, true);
+
+  // data chunk
+  writeString(view, 36, "data");
+  view.setUint32(40, dataLength, true);
+
+  // Write PCM samples — clamp to [-1, 1] then convert to int16
+  samples.forEach((sample, i) => {
+    const clamped = Math.max(-1, Math.min(1, sample));
+    const int16 = clamped < 0 ? clamped * 0x8000 : clamped * 0x7fff;
+    view.setInt16(HEADER_LENGTH + i * BYTES_PER_SAMPLE, Math.round(int16), true);
+  });
+
+  return new Blob([buffer], { type: "audio/wav" });
+}