feat(25-08): create VoiceRecordButton and server transcription endpoint

- Add VoiceRecordButton with MediaRecorder API and recording/transcribing/idle states
- Add POST /transcribe endpoint to chat-files.ts using execFileAsync (safe, no shell)
- Try whisper-cpp first, then fall back to the openai-whisper Python CLI
- Return 503 with a helpful message if Whisper is not installed
2026-04-01 23:57:58 +00:00 · 2026-04-01 23:57:58 +00:00 · 64a90c284e
commit 64a90c284e
parent 8b0e3f052d
2 changed files with 201 additions and 0 deletions
--- a/server/src/routes/chat-files.ts
+++ b/server/src/routes/chat-files.ts
@ -1,3 +1,4 @@
+import path from "node:path";
 import { Router, type Request, type Response } from "express";
 import multer from "multer";
 import type { Db } from "@paperclipai/db";
@ -200,5 +201,96 @@ export function chatFileRoutes(db: Db, storage: StorageService) {
    res.json(updated);
  });

+  // Multer instance used by POST /transcribe: accepts at most one file per
+  // request, keeps it in memory, and caps its size at MAX_ATTACHMENT_BYTES.
+  const audioUpload = multer({
+    limits: { files: 1, fileSize: MAX_ATTACHMENT_BYTES },
+    storage: multer.memoryStorage(),
+  });
+
+  /**
+   * Promise wrapper around multer's callback-style single-file middleware.
+   *
+   * Runs `upload.single("audio")` against the request/response pair, resolving
+   * when the middleware calls back without an error and rejecting with that
+   * error otherwise.
+   */
+  async function runAudioUpload(
+    upload: ReturnType<typeof multer>,
+    req: Request,
+    res: Response,
+  ) {
+    const parseAudioField = upload.single("audio");
+    await new Promise<void>((resolve, reject) => {
+      parseAudioField(req, res, (err: unknown) => {
+        if (!err) {
+          resolve();
+          return;
+        }
+        reject(err);
+      });
+    });
+  }
+
+  /**
+   * POST /transcribe — transcribes one uploaded audio clip with a locally
+   * installed Whisper CLI (whisper-cpp first, then the `whisper` command).
+   *
+   * Expects multipart form data with the recording in the "audio" field.
+   *
+   * Responses:
+   *   200 { text }   transcription succeeded
+   *   400 { error }  multer rejected the upload, or the audio field is missing/empty
+   *   500 { error }  a Whisper binary was found but transcription failed
+   *   503 { error }  neither whisper-cpp nor whisper could be launched
+   */
+  router.post("/transcribe", async (req, res) => {
+    assertBoard(req);
+
+    try {
+      await runAudioUpload(audioUpload, req, res);
+    } catch (err) {
+      if (err instanceof multer.MulterError) {
+        res.status(400).json({ error: err.message });
+        return;
+      }
+      throw err;
+    }
+
+    const file = (req as Request & { file?: { buffer: Buffer; mimetype: string } }).file;
+    if (!file) {
+      res.status(400).json({ error: "Missing audio field" });
+      return;
+    }
+    if (file.buffer.length === 0) {
+      // Nothing to transcribe; avoid spawning Whisper on an empty file.
+      res.status(400).json({ error: "Empty audio upload" });
+      return;
+    }
+
+    const { mkdtemp, readFile, rm, writeFile } = await import("node:fs/promises");
+    const { tmpdir } = await import("node:os");
+    const { promisify } = await import("node:util");
+    const { execFile: execFileCb } = await import("node:child_process");
+    const execFileAsync = promisify(execFileCb);
+
+    // execFile reports a binary that cannot be found with code "ENOENT";
+    // non-zero exits and timeouts carry other codes.
+    const isMissingBinary = (err: unknown): boolean =>
+      typeof err === "object" && err !== null && (err as { code?: unknown }).code === "ENOENT";
+
+    // Private per-request directory: concurrent requests cannot collide on a
+    // file name, and every file the Whisper CLIs write next to the input is
+    // removed together with the directory in the finally block.
+    const workDir = await mkdtemp(path.join(tmpdir(), "nexus-audio-"));
+    const audioPath = path.join(workDir, "audio.webm");
+
+    try {
+      await writeFile(audioPath, file.buffer);
+
+      let whisperCppMissing = false;
+      try {
+        // Try whisper.cpp first; its transcription is read from stdout.
+        // NOTE(review): "base.en" is passed as the model exactly as before —
+        // confirm the installed whisper-cpp build accepts a model name here.
+        const { stdout } = await execFileAsync("whisper-cpp", [
+          "--model", "base.en",
+          "--file", audioPath,
+          "--no-timestamps",
+          "--output-txt",
+        ], { timeout: 30000 });
+        res.json({ text: stdout.trim() });
+        return;
+      } catch (err) {
+        whisperCppMissing = isMissingBinary(err);
+      }
+
+      let fallbackStdout: string;
+      try {
+        // Fallback: openai-whisper Python CLI, writing its .txt into workDir.
+        const { stdout } = await execFileAsync("whisper", [
+          audioPath,
+          "--model", "base.en",
+          "--output_format", "txt",
+          "--output_dir", workDir,
+        ], { timeout: 60000 });
+        fallbackStdout = stdout;
+      } catch (err) {
+        if (whisperCppMissing && isMissingBinary(err)) {
+          res.status(503).json({
+            error: "Whisper not available. Install whisper-cpp or openai-whisper for voice input.",
+          });
+        } else {
+          // A binary exists but exited non-zero or timed out; do not tell the
+          // user to install something that is already installed.
+          res.status(500).json({ error: "Transcription failed." });
+        }
+        return;
+      }
+
+      // The whisper CLI names its output after the input file's stem.
+      let text: string;
+      try {
+        text = await readFile(path.join(workDir, "audio.txt"), "utf-8");
+      } catch {
+        // No transcript file was produced; fall back to whatever was printed.
+        text = fallbackStdout;
+      }
+      res.json({ text: text.trim() });
+    } finally {
+      // Best-effort cleanup of the upload and any transcript side files.
+      await rm(workDir, { recursive: true, force: true }).catch(() => {});
+    }
+  });
+
  return router;
 }
--- a/ui/src/components/VoiceRecordButton.tsx
+++ b/ui/src/components/VoiceRecordButton.tsx
@ -0,0 +1,109 @@
+import { useCallback, useEffect, useRef, useState } from "react";
+import { Mic, Square, Loader2 } from "lucide-react";
+import { Button } from "./ui/button";
+
+/** Props accepted by VoiceRecordButton. */
+interface VoiceRecordButtonProps {
+  /** When true, the idle microphone button is rendered disabled. */
+  disabled?: boolean;
+  /** Called with the trimmed, non-empty transcript returned by the server. */
+  onTranscription: (text: string) => void;
+}
+
+/**
+ * Microphone button that records audio with the MediaRecorder API, uploads the
+ * clip to POST /api/transcribe and passes the trimmed transcript to
+ * `onTranscription`.
+ *
+ * Renders one of three buttons depending on state: idle (mic), recording
+ * (stop square) or transcribing (disabled spinner).
+ */
+export function VoiceRecordButton({ onTranscription, disabled }: VoiceRecordButtonProps) {
+  const [recording, setRecording] = useState(false);
+  const [transcribing, setTranscribing] = useState(false);
+  const mediaRecorderRef = useRef<MediaRecorder | null>(null);
+  const chunksRef = useRef<Blob[]>([]);
+
+  // If the component unmounts mid-recording, stop the recorder and release the
+  // microphone. Handlers are detached first so no upload starts after unmount.
+  useEffect(() => {
+    return () => {
+      const recorder = mediaRecorderRef.current;
+      mediaRecorderRef.current = null;
+      if (!recorder || recorder.state === "inactive") return;
+      recorder.ondataavailable = null;
+      recorder.onstop = null;
+      recorder.stop();
+      recorder.stream.getTracks().forEach((t) => t.stop());
+    };
+  }, []);
+
+  const startRecording = useCallback(async () => {
+    let stream: MediaStream;
+    try {
+      stream = await navigator.mediaDevices.getUserMedia({ audio: true });
+    } catch {
+      // Microphone permission denied or unavailable
+      return;
+    }
+
+    const releaseMicrophone = () => stream.getTracks().forEach((t) => t.stop());
+
+    try {
+      const mediaRecorder = new MediaRecorder(stream, {
+        mimeType: MediaRecorder.isTypeSupported("audio/webm;codecs=opus")
+          ? "audio/webm;codecs=opus"
+          : "audio/webm",
+      });
+
+      chunksRef.current = [];
+      mediaRecorder.ondataavailable = (e) => {
+        if (e.data.size > 0) chunksRef.current.push(e.data);
+      };
+
+      mediaRecorder.onstop = async () => {
+        releaseMicrophone();
+        const blob = new Blob(chunksRef.current, { type: "audio/webm" });
+        if (blob.size === 0) return;
+
+        setTranscribing(true);
+        try {
+          const formData = new FormData();
+          formData.append("audio", blob, "recording.webm");
+
+          const res = await fetch("/api/transcribe", {
+            method: "POST",
+            credentials: "include",
+            body: formData,
+          });
+
+          if (!res.ok) {
+            console.warn(`Voice transcription request failed with status ${res.status}`);
+            return;
+          }
+
+          const data = (await res.json()) as { text?: string };
+          const text = data.text?.trim();
+          if (text) onTranscription(text);
+        } catch (err) {
+          // Network failure or malformed response: this handler is invoked by
+          // the browser, so a rejection here would otherwise go unhandled.
+          console.error("Voice transcription failed", err);
+        } finally {
+          setTranscribing(false);
+        }
+      };
+
+      mediaRecorderRef.current = mediaRecorder;
+      mediaRecorder.start(250); // 250ms chunks
+      setRecording(true);
+    } catch (err) {
+      // MediaRecorder could not be created or started (e.g. unsupported
+      // container). The stream is already live, so release it explicitly.
+      mediaRecorderRef.current = null;
+      releaseMicrophone();
+      console.warn("Could not start audio recording", err);
+    }
+  }, [onTranscription]);
+
+  const stopRecording = useCallback(() => {
+    if (mediaRecorderRef.current?.state === "recording") {
+      // Stopping fires onstop, which performs the upload.
+      mediaRecorderRef.current.stop();
+      mediaRecorderRef.current = null;
+    }
+    setRecording(false);
+  }, []);
+
+  if (transcribing) {
+    return (
+      <Button variant="ghost" size="icon" className="h-8 w-8" disabled>
+        <Loader2 className="h-4 w-4 animate-spin" />
+      </Button>
+    );
+  }
+
+  if (recording) {
+    return (
+      <Button
+        variant="ghost"
+        size="icon"
+        className="h-8 w-8 text-destructive"
+        onClick={stopRecording}
+        aria-label="Stop recording"
+        title="Stop recording"
+      >
+        <Square className="h-4 w-4" />
+      </Button>
+    );
+  }
+
+  return (
+    <Button
+      variant="ghost"
+      size="icon"
+      className="h-8 w-8"
+      onClick={startRecording}
+      disabled={disabled}
+      aria-label="Voice input"
+      title="Voice input"
+    >
+      <Mic className="h-4 w-4" />
+    </Button>
+  );
+}