feat(25-08): create VoiceRecordButton and server transcription endpoint
- Add VoiceRecordButton with MediaRecorder API and recording/transcribing/idle states
- Add POST /transcribe endpoint to chat-files.ts using execFileAsync (safe, no shell)
- Try whisper-cpp first, then fall back to the openai-whisper Python CLI
- Return 503 with a helpful message if whisper is not installed
This commit is contained in:
parent
8b0e3f052d
commit
64a90c284e
2 changed files with 201 additions and 0 deletions
|
|
@ -1,3 +1,4 @@
|
|||
import path from "node:path";
|
||||
import { Router, type Request, type Response } from "express";
|
||||
import multer from "multer";
|
||||
import type { Db } from "@paperclipai/db";
|
||||
|
|
@ -200,5 +201,96 @@ export function chatFileRoutes(db: Db, storage: StorageService) {
|
|||
res.json(updated);
|
||||
});
|
||||
|
||||
// POST /transcribe — Transcribe audio via local Whisper
|
||||
// Multer instance for the transcription endpoint: keeps the upload in memory,
// accepts a single file per request, and caps its size at MAX_ATTACHMENT_BYTES.
const audioUpload = multer({
  limits: { files: 1, fileSize: MAX_ATTACHMENT_BYTES },
  storage: multer.memoryStorage(),
});
|
||||
|
||||
/**
 * Runs the single-file middleware of `upload` for the "audio" form field and
 * waits until it reports completion.
 *
 * Resolves when the middleware calls back without an error; rejects with the
 * error value it reports otherwise.
 */
async function runAudioUpload(
  upload: ReturnType<typeof multer>,
  req: Request,
  res: Response,
) {
  const handleAudioField = upload.single("audio");
  await new Promise<void>((done, fail) => {
    handleAudioField(req, res, (uploadError: unknown) => {
      if (!uploadError) {
        done();
        return;
      }
      fail(uploadError);
    });
  });
}
|
||||
|
||||
// POST /transcribe — transcribe an uploaded audio clip with a local Whisper CLI.
//
// Expects multipart/form-data with one file in the "audio" field.
// Responses:
//   200 { text }   transcription produced by whisper-cpp or openai-whisper
//   400 { error }  multer rejected the upload, or the "audio" field is missing
//   503 { error }  neither CLI produced a transcription
router.post("/transcribe", async (req, res) => {
  assertBoard(req);

  try {
    await runAudioUpload(audioUpload, req, res);
  } catch (err) {
    if (err instanceof multer.MulterError) {
      res.status(400).json({ error: err.message });
      return;
    }
    throw err;
  }

  const file = (req as Request & { file?: { buffer: Buffer; mimetype: string } }).file;
  if (!file) {
    res.status(400).json({ error: "Missing audio field" });
    return;
  }

  const { mkdtemp, readFile, rm, writeFile } = await import("node:fs/promises");
  const { tmpdir } = await import("node:os");
  const { promisify } = await import("node:util");
  const { execFile: execFileCb } = await import("node:child_process");
  const execFileAsync = promisify(execFileCb);

  // Each request gets its own private scratch directory. This avoids name
  // collisions between concurrent requests (a timestamp-based file name is
  // neither unique nor unpredictable) and lets one recursive delete remove the
  // audio file together with every sidecar file the CLIs write next to it.
  const workDir = await mkdtemp(path.join(tmpdir(), "nexus-audio-"));
  const tmpPath = path.join(workDir, "recording.webm");

  try {
    await writeFile(tmpPath, file.buffer);

    try {
      // Preferred backend: whisper.cpp. The transcription is read from stdout;
      // the .txt file requested by --output-txt lands in workDir and is removed
      // with it.
      const { stdout } = await execFileAsync("whisper-cpp", [
        "--model", "base.en",
        "--file", tmpPath,
        "--no-timestamps",
        "--output-txt",
      ], { timeout: 30000 });
      res.json({ text: stdout.trim() });
      return;
    } catch {
      // whisper-cpp is missing, failed, or timed out: try the Python CLI below.
    }

    try {
      // Fallback backend: openai-whisper Python CLI, which writes
      // <output_dir>/<input basename>.txt.
      const { stdout } = await execFileAsync("whisper", [
        tmpPath,
        "--model", "base.en",
        "--output_format", "txt",
        "--output_dir", workDir,
      ], { timeout: 60000 });

      let text: string;
      try {
        text = await readFile(path.join(workDir, "recording.txt"), "utf-8");
      } catch {
        // No output file was produced: fall back to whatever the CLI printed.
        text = stdout;
      }
      res.json({ text: text.trim() });
    } catch {
      res.status(503).json({
        error: "Whisper not available. Install whisper-cpp or openai-whisper for voice input.",
      });
    }
  } finally {
    // Best-effort cleanup; a failure to delete must not mask the response.
    await rm(workDir, { recursive: true, force: true }).catch(() => {});
  }
});
|
||||
|
||||
return router;
|
||||
}
|
||||
|
|
|
|||
109
ui/src/components/VoiceRecordButton.tsx
Normal file
109
ui/src/components/VoiceRecordButton.tsx
Normal file
|
|
@ -0,0 +1,109 @@
|
|||
import { useState, useRef, useCallback } from "react";
|
||||
import { Mic, Square, Loader2 } from "lucide-react";
|
||||
import { Button } from "./ui/button";
|
||||
|
||||
/** Props accepted by VoiceRecordButton. */
interface VoiceRecordButtonProps {
  /** Forwarded to the idle microphone button's `disabled` prop. */
  disabled?: boolean;
  /** Called with the trimmed transcription when the server returns non-empty text. */
  onTranscription: (text: string) => void;
}
|
||||
|
||||
/**
 * Icon button that records microphone audio and turns it into text.
 *
 * States:
 *  - idle: microphone icon; a click requests microphone access and starts recording
 *  - recording: stop icon; a click stops the recorder, which uploads the audio
 *  - transcribing: disabled spinner while POST /api/transcribe is in flight
 *
 * `onTranscription` is called with the trimmed text only when the server
 * responds successfully with non-empty text. Failures (permission denied,
 * unsupported recorder, network or server error) leave the button idle and
 * call nothing.
 */
export function VoiceRecordButton({ onTranscription, disabled }: VoiceRecordButtonProps) {
  const [recording, setRecording] = useState(false);
  const [transcribing, setTranscribing] = useState(false);
  const mediaRecorderRef = useRef<MediaRecorder | null>(null);
  const chunksRef = useRef<Blob[]>([]);
  // True while a start attempt is in progress. The idle button stays clickable
  // during the permission prompt, so a second click must not open a second
  // stream that nothing would ever stop.
  const startingRef = useRef(false);

  const startRecording = useCallback(async () => {
    if (startingRef.current) return;
    startingRef.current = true;

    let stream: MediaStream | null = null;
    try {
      stream = await navigator.mediaDevices.getUserMedia({ audio: true });
      const activeStream = stream;
      const mediaRecorder = new MediaRecorder(activeStream, {
        mimeType: MediaRecorder.isTypeSupported("audio/webm;codecs=opus")
          ? "audio/webm;codecs=opus"
          : "audio/webm",
      });

      chunksRef.current = [];
      mediaRecorder.ondataavailable = (e) => {
        if (e.data.size > 0) chunksRef.current.push(e.data);
      };

      mediaRecorder.onstop = async () => {
        // Release the microphone as soon as recording ends.
        activeStream.getTracks().forEach((t) => t.stop());
        const blob = new Blob(chunksRef.current, { type: "audio/webm" });
        if (blob.size === 0) return;

        setTranscribing(true);
        try {
          const formData = new FormData();
          formData.append("audio", blob, "recording.webm");

          const res = await fetch("/api/transcribe", {
            method: "POST",
            credentials: "include",
            body: formData,
          });

          if (res.ok) {
            const data = (await res.json()) as { text: string };
            if (data.text?.trim()) {
              onTranscription(data.text.trim());
            }
          }
        } catch {
          // Network failure or unparsable response: there is no text to deliver.
          // Caught here because nothing awaits this event handler, so a
          // rejection would otherwise surface as an unhandled promise rejection.
        } finally {
          setTranscribing(false);
        }
      };

      mediaRecorderRef.current = mediaRecorder;
      mediaRecorder.start(250); // 250ms chunks
      setRecording(true);
    } catch {
      // Microphone permission denied or unavailable, or the recorder could not
      // be created/started. If a stream was already opened, stop its tracks so
      // the microphone does not stay active.
      stream?.getTracks().forEach((t) => t.stop());
      mediaRecorderRef.current = null;
    } finally {
      startingRef.current = false;
    }
  }, [onTranscription]);

  const stopRecording = useCallback(() => {
    const recorder = mediaRecorderRef.current;
    // Always drop the reference so a recorder that is no longer recording is
    // not kept around.
    mediaRecorderRef.current = null;
    if (recorder?.state === "recording") {
      recorder.stop();
    }
    setRecording(false);
  }, []);

  if (transcribing) {
    return (
      <Button variant="ghost" size="icon" className="h-8 w-8" disabled>
        <Loader2 className="h-4 w-4 animate-spin" />
      </Button>
    );
  }

  if (recording) {
    return (
      <Button
        variant="ghost"
        size="icon"
        className="h-8 w-8 text-destructive"
        onClick={stopRecording}
        aria-label="Stop recording"
        title="Stop recording"
      >
        <Square className="h-4 w-4" />
      </Button>
    );
  }

  return (
    <Button
      variant="ghost"
      size="icon"
      className="h-8 w-8"
      onClick={startRecording}
      disabled={disabled}
      aria-label="Voice input"
      title="Voice input"
    >
      <Mic className="h-4 w-4" />
    </Button>
  );
}
|
||||
Loading…
Add table
Reference in a new issue