feat(37-02): encodeWav utility, useVadRecorder + useVoiceMode hooks
- encodeWav: 44-byte WAV header encoder (RIFF/WAVE/fmt/data), PCM mono 16-bit
- useVadRecorder: wraps useMicVAD with startOnLoad: false, auto-stops on speech end, POSTs to /api/transcribe
- useVoiceMode: reads/writes voiceMode via GET/PATCH /api/nexus/settings with optimistic update
This commit is contained in:
parent
16371f01f5
commit
0d0b17c8a0
3 changed files with 225 additions and 0 deletions
98
ui/src/hooks/useVadRecorder.ts
Normal file
98
ui/src/hooks/useVadRecorder.ts
Normal file
|
|
@ -0,0 +1,98 @@
|
|||
import { useState, useRef, useCallback } from "react";
|
||||
import { useMicVAD } from "@ricky0123/vad-react";
|
||||
import { encodeWav } from "../lib/encodeWav";
|
||||
|
||||
/** Options accepted by {@link useVadRecorder}. */
interface UseVadRecorderOptions {
  /** Called with the trimmed text of a successful transcription. */
  onTranscript: (text: string) => void;
}
|
||||
|
||||
/** Value returned by {@link useVadRecorder}. */
interface UseVadRecorderReturn {
  /**
   * Current phase of the recorder: "recording" after `start()` succeeds,
   * "processing" while the captured audio is being transcribed, "idle" otherwise.
   */
  state: "idle" | "recording" | "processing";
  /** Requests microphone access and starts voice-activity detection. */
  start: () => void;
  /** Pauses voice-activity detection and stops the microphone stream held by the hook. */
  stop: () => void;
  /** The microphone stream requested by `start()`, or `null` when none is held. */
  mediaStream: MediaStream | null;
}
|
||||
|
||||
/**
 * Records one utterance using voice-activity detection (VAD) and transcribes it.
 *
 * Flow: `start()` requests a microphone stream and starts the VAD; when the VAD
 * reports the end of speech the captured audio is WAV-encoded, POSTed to
 * `/api/transcribe`, and the trimmed text is handed to `opts.onTranscript`.
 *
 * @param opts - See {@link UseVadRecorderOptions}.
 * @returns The recorder state, `start`/`stop` controls and the microphone
 *   stream held by the hook (exposed for waveform rendering).
 */
export function useVadRecorder(opts: UseVadRecorderOptions): UseVadRecorderReturn {
  const [state, setState] = useState<UseVadRecorderReturn["state"]>("idle");
  const mediaStreamRef = useRef<MediaStream | null>(null);

  // Keep the latest callback in a ref so the speech-end handler always calls the
  // current `onTranscript` without having to be re-created.
  const onTranscriptRef = useRef(opts.onTranscript);
  onTranscriptRef.current = opts.onTranscript;

  // Holds the most recent object returned by useMicVAD (assigned below) so the
  // memoized callbacks never invoke start/pause captured from an outdated render.
  const vadRef = useRef<ReturnType<typeof useMicVAD> | null>(null);

  /** Stops every track of the stream held by the hook and forgets the stream. */
  const releaseStream = useCallback(() => {
    const stream = mediaStreamRef.current;
    if (stream) {
      stream.getTracks().forEach((track) => track.stop());
      mediaStreamRef.current = null;
    }
  }, []);

  const handleSpeechEnd = useCallback(
    async (audio: Float32Array) => {
      vadRef.current?.pause();
      // Recording is over: release the microphone instead of leaving it live
      // until the caller happens to call stop().
      releaseStream();
      setState("processing");

      try {
        const formData = new FormData();
        formData.append("audio", encodeWav(audio), "recording.wav");

        const res = await fetch("/api/transcribe", {
          method: "POST",
          credentials: "include",
          body: formData,
        });

        if (!res.ok) {
          console.error(`[useVadRecorder] /api/transcribe returned ${res.status}`);
          return;
        }

        // The response body is untrusted JSON: verify `text` is a string before use.
        const payload = (await res.json()) as { text?: unknown } | null;
        const rawText = payload?.text;
        const text = typeof rawText === "string" ? rawText.trim() : "";

        // Apply the minimum length to the trimmed text so whitespace-only
        // results are never delivered as an empty transcript.
        if (text.length >= 2) {
          onTranscriptRef.current(text);
        }
      } catch (err) {
        console.error("[useVadRecorder] Transcription error:", err);
      } finally {
        // Only leave "processing"; do not clobber a recording that was started
        // again while the transcription request was still in flight.
        setState((current) => (current === "processing" ? "idle" : current));
      }
    },
    [releaseStream],
  );

  const vad = useMicVAD({
    startOnLoad: false,
    baseAssetPath: "/",
    onnxWASMBasePath: "/",
    positiveSpeechThreshold: 0.8,
    negativeSpeechThreshold: 0.65,
    redemptionFrames: 8,
    minSpeechFrames: 5,
    onSpeechStart: () => {
      // VAD detected start of speech — no action needed, state was set to "recording" in start()
    },
    onSpeechEnd: handleSpeechEnd,
  });
  vadRef.current = vad;

  const start = useCallback(async () => {
    let stream: MediaStream;
    try {
      // Request a separate stream reference for VoiceWaveform AnalyserNode
      stream = await navigator.mediaDevices.getUserMedia({ audio: true });
    } catch (err) {
      console.error("[useVadRecorder] Microphone access denied:", err);
      return;
    }

    // Drop any stream left over from an earlier (or concurrent) start() so its
    // tracks are not leaked when the ref is overwritten.
    releaseStream();
    mediaStreamRef.current = stream;

    vadRef.current?.start();
    setState("recording");
  }, [releaseStream]);

  const stop = useCallback(() => {
    vadRef.current?.pause();
    releaseStream();
    setState("idle");
  }, [releaseStream]);

  return {
    state,
    start,
    stop,
    mediaStream: mediaStreamRef.current,
  };
}
|
||||
71
ui/src/hooks/useVoiceMode.ts
Normal file
71
ui/src/hooks/useVoiceMode.ts
Normal file
|
|
@ -0,0 +1,71 @@
|
|||
import { useState, useEffect } from "react";
|
||||
|
||||
/** Voice interaction mode stored in the `voiceMode` field of `/api/nexus/settings`. */
type VoiceMode = "text" | "voice_input" | "full_voice";
|
||||
|
||||
/** Value returned by {@link useVoiceMode}. */
interface UseVoiceModeReturn {
  /** The current voice mode; "text" until the stored setting has been loaded. */
  mode: VoiceMode;
  /** Applies `next` immediately and persists it with a PATCH request. */
  setMode: (next: VoiceMode) => Promise<void>;
  /** `true` until the initial settings request has finished. */
  isLoading: boolean;
}
|
||||
|
||||
/** Narrows an untrusted settings value to one of the known voice modes. */
function isVoiceMode(value: unknown): value is VoiceMode {
  return value === "text" || value === "voice_input" || value === "full_voice";
}

/**
 * Reads the user's voice mode from `GET /api/nexus/settings` on mount and
 * persists changes with `PATCH /api/nexus/settings`.
 *
 * @returns The current mode, a setter that updates optimistically and rolls
 *   back when the request fails, and a flag that is `true` until the initial
 *   load has finished.
 */
export function useVoiceMode(): UseVoiceModeReturn {
  const [mode, setModeState] = useState<VoiceMode>("text");
  const [isLoading, setIsLoading] = useState(true);

  // Load current voiceMode from nexus-settings on mount
  useEffect(() => {
    let cancelled = false;

    const load = async (): Promise<void> => {
      try {
        const res = await fetch("/api/nexus/settings", {
          credentials: "include",
        });

        if (!res.ok) {
          console.error(`[useVoiceMode] GET /api/nexus/settings returned ${res.status}`);
          return;
        }

        // Untrusted JSON: accept the value only if it is a known mode.
        const data = (await res.json()) as { voiceMode?: unknown } | null;
        const loaded = data?.voiceMode;

        // Check `cancelled` after the last await so nothing is set once the
        // component has unmounted while the body was still being parsed.
        if (!cancelled && isVoiceMode(loaded)) {
          setModeState(loaded);
        }
      } catch (err) {
        console.error("[useVoiceMode] Failed to load settings:", err);
      } finally {
        if (!cancelled) {
          setIsLoading(false);
        }
      }
    };

    // load() handles its own errors; `void` marks the promise as intentionally not awaited.
    void load();

    return () => {
      cancelled = true;
    };
  }, []);

  const setMode = async (next: VoiceMode): Promise<void> => {
    const previous = mode;
    // Optimistic update
    setModeState(next);

    try {
      const res = await fetch("/api/nexus/settings", {
        method: "PATCH",
        credentials: "include",
        headers: { "Content-Type": "application/json" },
        body: JSON.stringify({ voiceMode: next }),
      });

      if (!res.ok) {
        throw new Error(`PATCH /api/nexus/settings returned ${res.status}`);
      }
    } catch (err) {
      console.error("[useVoiceMode] Failed to update voiceMode:", err);
      // Revert on error — but only if this call's value is still the one shown,
      // so a failed request cannot undo a newer change made in the meantime.
      setModeState((current) => (current === next ? previous : current));
    }
  };

  return { mode, setMode, isLoading };
}
|
||||
56
ui/src/lib/encodeWav.ts
Normal file
56
ui/src/lib/encodeWav.ts
Normal file
|
|
@ -0,0 +1,56 @@
|
|||
/**
|
||||
* Encodes a Float32Array of audio samples (mono, 16kHz) into a WAV Blob.
|
||||
*
|
||||
* WAV format: 44-byte header + PCM 16-bit samples
|
||||
* - RIFF chunk: "RIFF", file size, "WAVE"
|
||||
* - fmt chunk: PCM (1), mono (1), 16kHz, 16-bit depth
|
||||
* - data chunk: raw PCM samples
|
||||
*/
|
||||
|
||||
/**
 * Writes `str` into `view` one byte per UTF-16 code unit, starting at `offset`.
 *
 * @param view - Destination view.
 * @param offset - Byte position of the first character.
 * @param str - Text to write; intended for ASCII chunk tags such as "RIFF".
 */
function writeString(view: DataView, offset: number, str: string): void {
  let position = offset;
  let index = 0;
  while (index < str.length) {
    view.setUint8(position, str.charCodeAt(index));
    position += 1;
    index += 1;
  }
}
|
||||
|
||||
/**
 * Packs mono floating-point samples into a 16-bit PCM WAV file.
 *
 * @param samples - Audio samples; values outside [-1, 1] are clamped.
 * @param sampleRate - Sampling rate written to the header (defaults to 16000).
 * @returns A Blob of type "audio/wav": a 44-byte header followed by the
 *   little-endian 16-bit samples.
 */
export function encodeWav(samples: Float32Array, sampleRate = 16000): Blob {
  const HEADER_BYTES = 44;
  const CHANNELS = 1;
  const BITS = 16;

  const bytesPerFrame = (CHANNELS * BITS) / 8;
  const bytesPerSecond = (sampleRate * CHANNELS * BITS) / 8;
  const pcmBytes = samples.length * 2; // two bytes per 16-bit sample

  const wav = new ArrayBuffer(HEADER_BYTES + pcmBytes);
  const out = new DataView(wav);

  // Chunk tags; the numeric fields written below do not overlap them.
  writeString(out, 0, "RIFF");
  writeString(out, 8, "WAVE");
  writeString(out, 12, "fmt ");
  writeString(out, 36, "data");

  // RIFF chunk size: everything after the first 8 bytes of the file.
  out.setUint32(4, 36 + pcmBytes, true);

  // fmt chunk
  out.setUint32(16, 16, true); // fmt chunk size (16 for PCM)
  out.setUint16(20, 1, true); // audio format: PCM
  out.setUint16(22, CHANNELS, true);
  out.setUint32(24, sampleRate, true);
  out.setUint32(28, bytesPerSecond, true);
  out.setUint16(32, bytesPerFrame, true);
  out.setUint16(34, BITS, true);

  // data chunk size
  out.setUint32(40, pcmBytes, true);

  // Clamp each sample to [-1, 1], then scale to the signed 16-bit range
  // (negative values by 0x8000, non-negative values by 0x7fff).
  samples.forEach((sample, index) => {
    const bounded = Math.min(1, Math.max(-1, sample));
    const scale = bounded < 0 ? 0x8000 : 0x7fff;
    out.setInt16(HEADER_BYTES + index * 2, Math.round(bounded * scale), true);
  });

  return new Blob([wav], { type: "audio/wav" });
}
|
||||
Loading…
Add table
Reference in a new issue