diff --git a/server/src/routes/chat-files.ts b/server/src/routes/chat-files.ts
index 0a75e6cb..7b51c591 100644
--- a/server/src/routes/chat-files.ts
+++ b/server/src/routes/chat-files.ts
@@ -1,3 +1,4 @@
+import path from "node:path";
 import { Router, type Request, type Response } from "express";
 import multer from "multer";
 import type { Db } from "@paperclipai/db";
@@ -200,5 +201,107 @@ export function chatFileRoutes(db: Db, storage: StorageService) {
     res.json(updated);
   });
 
+  // POST /transcribe — Transcribe audio via local Whisper
+  const audioUpload = multer({
+    storage: multer.memoryStorage(),
+    limits: { fileSize: MAX_ATTACHMENT_BYTES, files: 1 },
+  });
+
+  // Adapts multer's callback-style middleware to a promise for the "audio" field.
+  async function runAudioUpload(
+    upload: ReturnType<typeof multer>,
+    req: Request,
+    res: Response,
+  ) {
+    await new Promise<void>((resolve, reject) => {
+      upload.single("audio")(req, res, (err: unknown) => {
+        if (err) reject(err);
+        else resolve();
+      });
+    });
+  }
+
+  // True when a child-process error means the executable is not installed.
+  function isMissingBinary(err: unknown): boolean {
+    return (err as { code?: unknown } | null)?.code === "ENOENT";
+  }
+
+  router.post("/transcribe", async (req, res) => {
+    assertBoard(req);
+
+    try {
+      await runAudioUpload(audioUpload, req, res);
+    } catch (err) {
+      if (err instanceof multer.MulterError) {
+        res.status(400).json({ error: err.message });
+        return;
+      }
+      throw err;
+    }
+
+    const file = (req as Request & { file?: { buffer: Buffer; mimetype: string } }).file;
+    if (!file) {
+      res.status(400).json({ error: "Missing audio field" });
+      return;
+    }
+
+    const { mkdtemp, readFile, rm, writeFile } = await import("node:fs/promises");
+    const { tmpdir } = await import("node:os");
+    const { promisify } = await import("node:util");
+    const { execFile: execFileCb } = await import("node:child_process");
+    const execFileAsync = promisify(execFileCb);
+
+    // A private scratch directory per request: concurrent uploads cannot collide
+    // on a timestamp-derived name, and every file the CLIs write (including the
+    // whisper.cpp --output-txt sidecar) is removed together with it.
+    const workDir = await mkdtemp(path.join(tmpdir(), "nexus-audio-"));
+    const audioPath = path.join(workDir, "recording.webm");
+
+    try {
+      await writeFile(audioPath, file.buffer);
+
+      let whisperCppMissing = false;
+      try {
+        // Try whisper.cpp first (prints the transcription to stdout with --no-timestamps)
+        const { stdout } = await execFileAsync("whisper-cpp", [
+          "--model", "base.en",
+          "--file", audioPath,
+          "--no-timestamps",
+          "--output-txt",
+        ], { timeout: 30000 });
+        res.json({ text: stdout.trim() });
+        return;
+      } catch (err) {
+        // Not installed or the run failed: fall through to the Python CLI.
+        whisperCppMissing = isMissingBinary(err);
+      }
+
+      try {
+        // Fallback: openai-whisper Python CLI
+        const { stdout } = await execFileAsync("whisper", [
+          audioPath,
+          "--model", "base.en",
+          "--output_format", "txt",
+          "--output_dir", workDir,
+        ], { timeout: 60000 });
+        // The CLI writes <input basename>.txt into --output_dir; fall back to stdout.
+        const text = await readFile(path.join(workDir, "recording.txt"), "utf-8")
+          .catch(() => stdout);
+        res.json({ text: text.trim() });
+      } catch (err) {
+        if (whisperCppMissing && isMissingBinary(err)) {
+          res.status(503).json({
+            error: "Whisper not available. Install whisper-cpp or openai-whisper for voice input.",
+          });
+        } else {
+          // A CLI is installed but failed (timeout, undecodable audio, ...).
+          res.status(500).json({ error: "Transcription failed" });
+        }
+      }
+    } finally {
+      await rm(workDir, { recursive: true, force: true }).catch(() => {});
+    }
+  });
+
   return router;
 }
diff --git a/ui/src/components/VoiceRecordButton.tsx b/ui/src/components/VoiceRecordButton.tsx
new file mode 100644
index 00000000..30582d96
--- /dev/null
+++ b/ui/src/components/VoiceRecordButton.tsx
@@ -0,0 +1,117 @@
+import { useState, useRef, useCallback } from "react";
+import { Mic, Square, Loader2 } from "lucide-react";
+import { Button } from "./ui/button";
+
+interface VoiceRecordButtonProps {
+  onTranscription: (text: string) => void;
+  disabled?: boolean;
+}
+
+/**
+ * Microphone button that records audio, posts it to /api/transcribe and passes
+ * the trimmed transcript to `onTranscription`.
+ */
+export function VoiceRecordButton({ onTranscription, disabled }: VoiceRecordButtonProps) {
+  const [recording, setRecording] = useState(false);
+  const [transcribing, setTranscribing] = useState(false);
+  const mediaRecorderRef = useRef<MediaRecorder | null>(null);
+  const chunksRef = useRef<Blob[]>([]);
+
+  const startRecording = useCallback(async () => {
+    let stream: MediaStream;
+    try {
+      stream = await navigator.mediaDevices.getUserMedia({ audio: true });
+    } catch {
+      // Microphone permission denied or unavailable
+      return;
+    }
+
+    try {
+      const mediaRecorder = new MediaRecorder(stream, {
+        mimeType: MediaRecorder.isTypeSupported("audio/webm;codecs=opus")
+          ? "audio/webm;codecs=opus"
+          : "audio/webm",
+      });
+
+      chunksRef.current = [];
+      mediaRecorder.ondataavailable = (e) => {
+        if (e.data.size > 0) chunksRef.current.push(e.data);
+      };
+
+      mediaRecorder.onstop = async () => {
+        stream.getTracks().forEach((t) => t.stop());
+        const blob = new Blob(chunksRef.current, { type: "audio/webm" });
+        if (blob.size === 0) return;
+
+        setTranscribing(true);
+        try {
+          const formData = new FormData();
+          formData.append("audio", blob, "recording.webm");
+
+          const res = await fetch("/api/transcribe", {
+            method: "POST",
+            credentials: "include",
+            body: formData,
+          });
+
+          if (res.ok) {
+            const data = (await res.json()) as { text?: string };
+            const text = data.text?.trim();
+            if (text) onTranscription(text);
+          }
+        } catch (err) {
+          // The browser invokes this handler and ignores its promise, so a failed
+          // request must be handled here or it becomes an unhandled rejection.
+          console.error("Voice transcription failed", err);
+        } finally {
+          setTranscribing(false);
+        }
+      };
+
+      mediaRecorderRef.current = mediaRecorder;
+      mediaRecorder.start(250); // 250ms chunks
+      setRecording(true);
+    } catch {
+      // The recorder could not be created or started (e.g. no WebM support):
+      // release the microphone that getUserMedia already acquired.
+      stream.getTracks().forEach((t) => t.stop());
+    }
+  }, [onTranscription]);
+
+  const stopRecording = useCallback(() => {
+    if (mediaRecorderRef.current?.state === "recording") {
+      mediaRecorderRef.current.stop();
+      mediaRecorderRef.current = null;
+    }
+    setRecording(false);
+  }, []);
+
+  // NOTE(review): the button markup was missing from the patch as received and is
+  // reconstructed below; confirm props and styling against the design system.
+  if (transcribing) {
+    return (
+      <Button type="button" disabled aria-label="Transcribing">
+        <Loader2 size={16} className="animate-spin" />
+      </Button>
+    );
+  }
+
+  if (recording) {
+    return (
+      <Button type="button" onClick={stopRecording} aria-label="Stop recording">
+        <Square size={16} />
+      </Button>
+    );
+  }
+
+  return (
+    <Button
+      type="button"
+      onClick={startRecording}
+      disabled={disabled}
+      aria-label="Record voice message"
+    >
+      <Mic size={16} />
+    </Button>
+  );
+}