feat(37-02): encodeWav utility, useVadRecorder + useVoiceMode hooks
- encodeWav: 44-byte WAV header encoder (RIFF/WAVE/fmt/data), PCM mono 16-bit
- useVadRecorder: wraps useMicVAD with startOnLoad:false, auto-stop on speech end, POSTs to /api/transcribe
- useVoiceMode: reads/writes voiceMode from GET/PATCH /api/nexus/settings with optimistic update
This commit is contained in:
parent
16371f01f5
commit
0d0b17c8a0
3 changed files with 225 additions and 0 deletions
98
ui/src/hooks/useVadRecorder.ts
Normal file
98
ui/src/hooks/useVadRecorder.ts
Normal file
|
|
@ -0,0 +1,98 @@
|
||||||
|
import { useState, useRef, useCallback } from "react";
|
||||||
|
import { useMicVAD } from "@ricky0123/vad-react";
|
||||||
|
import { encodeWav } from "../lib/encodeWav";
|
||||||
|
|
||||||
|
/** Options accepted by the useVadRecorder hook. */
interface UseVadRecorderOptions {
|
||||||
|
/** Invoked with the trimmed transcript text returned by POST /api/transcribe. */
onTranscript: (text: string) => void;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Value returned by the useVadRecorder hook. */
interface UseVadRecorderReturn {
|
||||||
|
/**
 * Recorder phase: "idle" initially and after stop(), "recording" once start()
 * has succeeded, "processing" while captured speech is being transcribed.
 */
state: "idle" | "recording" | "processing";
|
||||||
|
/** Requests microphone access and starts voice-activity detection. */
start: () => void;
|
||||||
|
/** Pauses voice-activity detection, stops the microphone tracks and resets state to "idle". */
stop: () => void;
|
||||||
|
/** Separately requested microphone stream (intended for the waveform analyser), or null when none is held. */
mediaStream: MediaStream | null;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Voice-activity-driven recorder hook.
 *
 * `start()` opens the microphone and starts the VAD. When the VAD reports the
 * end of a speech segment the hook pauses detection, releases the microphone,
 * encodes the captured samples as WAV, POSTs them to `/api/transcribe` and
 * hands the trimmed transcript to `opts.onTranscript`.
 *
 * @param opts - Hook options; `onTranscript` receives each non-trivial transcript.
 * @returns The current recorder state, `start`/`stop` controls and the
 *          microphone stream exposed for waveform rendering.
 */
export function useVadRecorder(opts: UseVadRecorderOptions): UseVadRecorderReturn {
  const [state, setState] = useState<"idle" | "recording" | "processing">("idle");

  // The stream lives in a ref (so callbacks can stop its tracks without stale
  // closures) and is mirrored into state so consumers re-render when it changes.
  const mediaStreamRef = useRef<MediaStream | null>(null);
  const [mediaStream, setMediaStream] = useState<MediaStream | null>(null);

  // "Latest value" refs: the memoized callbacks below must always reach the
  // current VAD controls and the current onTranscript, not the ones captured
  // by the render in which the callback was created.
  const vadRef = useRef<ReturnType<typeof useMicVAD> | null>(null);
  const onTranscriptRef = useRef(opts.onTranscript);
  onTranscriptRef.current = opts.onTranscript;

  // Stops every track of the separately requested stream and forgets it.
  const releaseStream = useCallback(() => {
    const stream = mediaStreamRef.current;
    if (stream) {
      stream.getTracks().forEach((track) => track.stop());
      mediaStreamRef.current = null;
    }
    setMediaStream(null);
  }, []);

  const handleSpeechEnd = useCallback(
    async (audio: Float32Array) => {
      vadRef.current?.pause();
      // Recording is over for this utterance: do not keep the microphone open.
      releaseStream();
      setState("processing");

      try {
        const wavBlob = encodeWav(audio);
        const formData = new FormData();
        formData.append("audio", wavBlob, "recording.wav");

        const res = await fetch("/api/transcribe", {
          method: "POST",
          credentials: "include",
          body: formData,
        });

        if (!res.ok) {
          console.error(`[useVadRecorder] Transcription failed with status ${res.status}`);
          return;
        }

        const data = (await res.json()) as { text?: unknown };
        // Trim before the length check so whitespace-only results are dropped.
        const text = typeof data.text === "string" ? data.text.trim() : "";
        if (text.length >= 2) {
          onTranscriptRef.current(text);
        }
      } catch (err) {
        console.error("[useVadRecorder] Transcription error:", err);
      } finally {
        // Only leave "processing"; never clobber a recording restarted meanwhile.
        setState((current) => (current === "processing" ? "idle" : current));
      }
    },
    [releaseStream],
  );

  const vad = useMicVAD({
    startOnLoad: false,
    baseAssetPath: "/",
    onnxWASMBasePath: "/",
    positiveSpeechThreshold: 0.8,
    negativeSpeechThreshold: 0.65,
    redemptionFrames: 8,
    minSpeechFrames: 5,
    onSpeechStart: () => {
      // VAD detected start of speech — no action needed, state was set to "recording" in start()
    },
    onSpeechEnd: handleSpeechEnd,
  });
  vadRef.current = vad;

  const start = useCallback(async () => {
    let stream: MediaStream;
    try {
      // Request a separate stream reference for VoiceWaveform AnalyserNode
      stream = await navigator.mediaDevices.getUserMedia({ audio: true });
    } catch (err) {
      console.error("[useVadRecorder] Microphone access denied:", err);
      return;
    }

    // Drop any stream still held from an earlier session before replacing it.
    releaseStream();
    mediaStreamRef.current = stream;
    setMediaStream(stream);

    vadRef.current?.start();
    setState("recording");
  }, [releaseStream]);

  const stop = useCallback(() => {
    vadRef.current?.pause();
    releaseStream();
    setState("idle");
  }, [releaseStream]);

  return {
    state,
    start,
    stop,
    mediaStream,
  };
}
|
||||||
71
ui/src/hooks/useVoiceMode.ts
Normal file
71
ui/src/hooks/useVoiceMode.ts
Normal file
|
|
@ -0,0 +1,71 @@
|
||||||
|
import { useState, useEffect } from "react";
|
||||||
|
|
||||||
|
/** Voice interaction mode, matching the `voiceMode` value exchanged with /api/nexus/settings. */
type VoiceMode = "text" | "voice_input" | "full_voice";
|
||||||
|
|
||||||
|
/** Value returned by the useVoiceMode hook. */
interface UseVoiceModeReturn {
|
||||||
|
/** Current voice mode; starts as "text" until the stored setting has been loaded. */
mode: VoiceMode;
|
||||||
|
/** Applies the new mode optimistically and persists it via PATCH /api/nexus/settings. */
setMode: (next: VoiceMode) => Promise<void>;
|
||||||
|
/** True until the initial settings request has settled. */
isLoading: boolean;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Reads and writes the user's voice mode stored in the nexus settings.
 *
 * On mount the hook loads `voiceMode` from `GET /api/nexus/settings`.
 * `setMode` updates local state immediately and persists the change with
 * `PATCH /api/nexus/settings`, rolling back if the request fails.
 *
 * @returns The current mode, the `setMode` updater and an `isLoading` flag
 *          that is true until the initial load has settled.
 */
export function useVoiceMode(): UseVoiceModeReturn {
  const [mode, setModeState] = useState<VoiceMode>("text");
  const [isLoading, setIsLoading] = useState(true);

  // Load current voiceMode from nexus-settings on mount
  useEffect(() => {
    let cancelled = false;

    const load = async (): Promise<void> => {
      try {
        const res = await fetch("/api/nexus/settings", {
          credentials: "include",
        });
        if (!res.ok) {
          return;
        }

        const data = (await res.json()) as { voiceMode?: string };
        // Re-check after the second await: the effect may have been cleaned up
        // while the body was being parsed.
        if (cancelled) {
          return;
        }

        const raw = data.voiceMode;
        if (raw === "voice_input" || raw === "full_voice" || raw === "text") {
          setModeState(raw);
        }
      } catch (err) {
        console.error("[useVoiceMode] Failed to load settings:", err);
      } finally {
        if (!cancelled) {
          setIsLoading(false);
        }
      }
    };

    // load() handles its own errors; `void` marks the fire-and-forget call.
    void load();

    return () => {
      cancelled = true;
    };
  }, []);

  const setMode = async (next: VoiceMode): Promise<void> => {
    const previous = mode;
    // Optimistic update
    setModeState(next);

    try {
      const res = await fetch("/api/nexus/settings", {
        method: "PATCH",
        credentials: "include",
        headers: { "Content-Type": "application/json" },
        body: JSON.stringify({ voiceMode: next }),
      });

      if (!res.ok) {
        throw new Error(`PATCH /api/nexus/settings returned ${res.status}`);
      }
    } catch (err) {
      console.error("[useVoiceMode] Failed to update voiceMode:", err);
      // Revert on error, but only if our optimistic value is still displayed;
      // otherwise a newer setMode call has superseded this one.
      setModeState((current) => (current === next ? previous : current));
    }
  };

  return { mode, setMode, isLoading };
}
|
||||||
56
ui/src/lib/encodeWav.ts
Normal file
56
ui/src/lib/encodeWav.ts
Normal file
|
|
@ -0,0 +1,56 @@
|
||||||
|
/**
|
||||||
|
* Encodes a Float32Array of audio samples (mono, 16kHz) into a WAV Blob.
|
||||||
|
*
|
||||||
|
* WAV format: 44-byte header + PCM 16-bit samples
|
||||||
|
* - RIFF chunk: "RIFF", file size, "WAVE"
|
||||||
|
* - fmt chunk: PCM (1), mono (1), 16kHz, 16-bit depth
|
||||||
|
* - data chunk: raw PCM samples
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
 * Writes `str` into `view` one byte per UTF-16 code unit, starting at `offset`.
 *
 * Each code unit is stored with `setUint8`, so only its low 8 bits are kept;
 * the function is meant for ASCII chunk tags such as "RIFF" or "data".
 */
function writeString(view: DataView, offset: number, str: string): void {
  const codeUnits = Array.from({ length: str.length }, (_, index) => str.charCodeAt(index));
  codeUnits.forEach((code, index) => {
    view.setUint8(offset + index, code);
  });
}
|
||||||
|
|
||||||
|
/**
 * Encodes mono audio samples as a 16-bit PCM WAV file.
 *
 * Layout: a 44-byte header (RIFF chunk, "fmt " chunk, "data" chunk header)
 * followed by the samples as little-endian signed 16-bit integers.
 *
 * @param samples - Audio samples; values outside [-1, 1] are clamped.
 * @param sampleRate - Sample rate in Hz recorded in the header (default 16000).
 *                     A fractional value is floored so that the sample-rate and
 *                     byte-rate header fields stay consistent.
 * @returns A Blob of type "audio/wav" containing the header and PCM data.
 * @throws RangeError if `sampleRate` is not a finite number of at least 1.
 */
export function encodeWav(samples: Float32Array, sampleRate = 16000): Blob {
  if (!Number.isFinite(sampleRate) || sampleRate < 1) {
    throw new RangeError(`encodeWav: sampleRate must be a finite number >= 1, got ${sampleRate}`);
  }

  const HEADER_BYTES = 44;
  const CHANNELS = 1;
  const BITS_PER_SAMPLE = 16;
  const bytesPerSample = BITS_PER_SAMPLE / 8;

  const rate = Math.floor(sampleRate);
  const blockAlign = CHANNELS * bytesPerSample;
  const byteRate = rate * blockAlign;
  const dataLength = samples.length * bytesPerSample;

  const buffer = new ArrayBuffer(HEADER_BYTES + dataLength);
  const view = new DataView(buffer);

  // RIFF chunk
  writeString(view, 0, "RIFF");
  view.setUint32(4, HEADER_BYTES - 8 + dataLength, true); // file size - 8
  writeString(view, 8, "WAVE");

  // fmt chunk
  writeString(view, 12, "fmt ");
  view.setUint32(16, 16, true); // chunk size (16 for PCM)
  view.setUint16(20, 1, true); // PCM format
  view.setUint16(22, CHANNELS, true);
  view.setUint32(24, rate, true);
  view.setUint32(28, byteRate, true);
  view.setUint16(32, blockAlign, true);
  view.setUint16(34, BITS_PER_SAMPLE, true);

  // data chunk
  writeString(view, 36, "data");
  view.setUint32(40, dataLength, true);

  // PCM samples: clamp to [-1, 1], then scale asymmetrically so that -1 maps
  // to -32768 and +1 maps to 32767 (the full int16 range).
  samples.forEach((sample, index) => {
    const clamped = Math.max(-1, Math.min(1, sample));
    const scaled = clamped < 0 ? clamped * 0x8000 : clamped * 0x7fff;
    view.setInt16(HEADER_BYTES + index * bytesPerSample, Math.round(scaled), true);
  });

  return new Blob([buffer], { type: "audio/wav" });
}
|
||||||
Loading…
Add table
Reference in a new issue