375 lines
14 KiB
Markdown
375 lines
14 KiB
Markdown
---
|
|
phase: 25-file-system
|
|
plan: 08
|
|
type: execute
|
|
wave: 1
|
|
depends_on: ["25-02"]
|
|
files_modified:
|
|
- ui/src/components/VoiceRecordButton.tsx
|
|
- ui/src/components/ChatInput.tsx
|
|
- ui/src/components/ChatPanel.tsx
|
|
- server/src/routes/chat-files.ts
|
|
- .planning/REQUIREMENTS.md
|
|
autonomous: true
|
|
gap_closure: true
|
|
requirements: [INPUT-02, INPUT-03, INPUT-04]
|
|
|
|
must_haves:
|
|
truths:
|
|
- "User can hold a record button to capture voice audio"
|
|
- "Recorded audio is sent to the server for transcription"
|
|
- "Transcription preview appears in the chat input before sending"
|
|
artifacts:
|
|
- path: "ui/src/components/VoiceRecordButton.tsx"
|
|
provides: "Voice recording button with MediaRecorder API, preview, and confirm flow"
|
|
min_lines: 60
|
|
- path: "server/src/routes/chat-files.ts"
|
|
provides: "POST /transcribe endpoint for audio transcription"
|
|
key_links:
|
|
- from: "ui/src/components/ChatInput.tsx"
|
|
to: "ui/src/components/VoiceRecordButton.tsx"
|
|
via: "import and render in input toolbar"
|
|
pattern: "VoiceRecordButton"
|
|
- from: "ui/src/components/VoiceRecordButton.tsx"
|
|
to: "/api/transcribe"
|
|
via: "fetch POST with audio blob"
|
|
pattern: "fetch.*transcribe"
|
|
---
|
|
|
|
<objective>
|
|
Add voice input with transcription preview to the chat input.
|
|
|
|
Purpose: INPUT-04 requires voice input via Whisper when local AI is enabled: user can hold a record button, speak, see a transcription preview, and confirm to send. This plan creates a VoiceRecordButton component using the browser MediaRecorder API, a server transcription endpoint that calls a local Whisper process, and wires the transcription result into the chat input textarea.
|
|
|
|
Output: VoiceRecordButton component, server transcription endpoint, ChatInput integration
|
|
</objective>
|
|
|
|
<execution_context>
|
|
@$HOME/.claude/get-shit-done/workflows/execute-plan.md
|
|
@$HOME/.claude/get-shit-done/templates/summary.md
|
|
</execution_context>
|
|
|
|
<context>
|
|
@.planning/PROJECT.md
|
|
@.planning/ROADMAP.md
|
|
@.planning/STATE.md
|
|
@.planning/phases/25-file-system/25-02-SUMMARY.md
|
|
|
|
<interfaces>
|
|
From ui/src/components/ChatInput.tsx:
|
|
- Props include onSend(content: string), onFilesPicked, pendingFiles, onRemoveFile
|
|
- Contains Paperclip button for file upload in the button row
|
|
- Textarea with handleKeyDown and handlePaste
|
|
|
|
From server/src/routes/chat-files.ts:
|
|
- chatFileRoutes(db, storage) returns Express Router
|
|
- Uses multer for file upload, assertBoard for auth
|
|
</interfaces>
|
|
</context>
|
|
|
|
<tasks>
|
|
|
|
<task type="auto">
|
|
<name>Task 1: Create VoiceRecordButton and server transcription endpoint</name>
|
|
<files>ui/src/components/VoiceRecordButton.tsx, server/src/routes/chat-files.ts</files>
|
|
<read_first>
|
|
- ui/src/components/ChatInput.tsx
|
|
- server/src/routes/chat-files.ts
|
|
- server/src/attachment-types.ts
|
|
</read_first>
|
|
<action>
|
|
1. Create ui/src/components/VoiceRecordButton.tsx:
|
|
|
|
```typescript
|
|
import { useState, useRef, useCallback } from "react";
|
|
import { Mic, Square, Loader2 } from "lucide-react";
|
|
import { Button } from "./ui/button";
|
|
|
|
/** Props accepted by VoiceRecordButton. */
interface VoiceRecordButtonProps {
  /** Called with the trimmed transcript once the server returns non-empty text. */
  onTranscription: (text: string) => void;
  /** Disables the idle (microphone) button. Not applied to the stop or spinner states. */
  disabled?: boolean;
}

/**
 * Toolbar button that records microphone audio and has it transcribed.
 *
 * Three visual states:
 * - idle: microphone icon, click starts recording;
 * - recording: square icon, click stops recording and uploads the audio;
 * - transcribing: disabled spinner while the POST to /api/transcribe is pending.
 *
 * Failures (permission denied, unsupported recorder, network or server error)
 * are logged to the console and the button returns to the idle state; nothing
 * is thrown to the caller.
 */
export function VoiceRecordButton({ onTranscription, disabled }: VoiceRecordButtonProps) {
  const [recording, setRecording] = useState(false);
  const [transcribing, setTranscribing] = useState(false);
  const mediaRecorderRef = useRef<MediaRecorder | null>(null);
  const chunksRef = useRef<Blob[]>([]);

  const startRecording = useCallback(async () => {
    // Kept outside the try so the catch can release the microphone if anything
    // fails after permission was granted.
    let stream: MediaStream | null = null;

    try {
      stream = await navigator.mediaDevices.getUserMedia({ audio: true });
      const activeStream = stream;

      // Prefer webm/opus, then plain webm. If neither is supported, let the
      // browser pick its default container instead of forcing a type that
      // would make the MediaRecorder constructor throw.
      const mimeType = ["audio/webm;codecs=opus", "audio/webm"].find((candidate) =>
        MediaRecorder.isTypeSupported(candidate),
      );
      const mediaRecorder = new MediaRecorder(activeStream, mimeType ? { mimeType } : undefined);

      chunksRef.current = [];
      mediaRecorder.ondataavailable = (e) => {
        if (e.data.size > 0) chunksRef.current.push(e.data);
      };

      mediaRecorder.onstop = async () => {
        // Release the microphone as soon as recording ends.
        activeStream.getTracks().forEach((t) => t.stop());

        // Label the blob with the container actually recorded, without the
        // codec parameter (yields "audio/webm" whenever webm was selected).
        const blobType = mediaRecorder.mimeType.split(";")[0] || "audio/webm";
        const blob = new Blob(chunksRef.current, { type: blobType });
        chunksRef.current = [];
        if (blob.size === 0) return;

        setTranscribing(true);
        try {
          const formData = new FormData();
          formData.append("audio", blob, "recording.webm");

          const res = await fetch("/api/transcribe", {
            method: "POST",
            credentials: "include",
            body: formData,
          });

          if (!res.ok) {
            console.error(`Voice transcription failed with HTTP ${res.status}`);
            return;
          }

          // Validate the payload shape instead of trusting a type assertion.
          const payload: unknown = await res.json();
          const rawText = (payload as { text?: unknown } | null)?.text;
          const text = typeof rawText === "string" ? rawText.trim() : "";
          if (text) {
            onTranscription(text);
          }
        } catch (err) {
          // This runs inside an event handler, so an uncaught rejection would
          // surface only as an unhandled promise rejection.
          console.error("Voice transcription request failed", err);
        } finally {
          setTranscribing(false);
        }
      };

      mediaRecorderRef.current = mediaRecorder;
      mediaRecorder.start(250); // 250ms chunks
      setRecording(true);
    } catch (err) {
      // Microphone permission denied or unavailable, or the recorder could not
      // be created or started. Stop any tracks already acquired so the
      // browser's recording indicator does not stay on.
      stream?.getTracks().forEach((t) => t.stop());
      mediaRecorderRef.current = null;
      console.warn("Unable to start voice recording", err);
    }
  }, [onTranscription]);

  const stopRecording = useCallback(() => {
    if (mediaRecorderRef.current?.state === "recording") {
      // stop() fires the onstop handler above, which uploads the audio.
      mediaRecorderRef.current.stop();
      mediaRecorderRef.current = null;
    }
    setRecording(false);
  }, []);

  if (transcribing) {
    return (
      <Button variant="ghost" size="icon" className="h-8 w-8" disabled>
        <Loader2 className="h-4 w-4 animate-spin" />
      </Button>
    );
  }

  if (recording) {
    return (
      <Button
        variant="ghost"
        size="icon"
        className="h-8 w-8 text-destructive"
        onClick={stopRecording}
        aria-label="Stop recording"
        title="Stop recording"
      >
        <Square className="h-4 w-4" />
      </Button>
    );
  }

  return (
    <Button
      variant="ghost"
      size="icon"
      className="h-8 w-8"
      onClick={startRecording}
      disabled={disabled}
      aria-label="Voice input"
      title="Voice input"
    >
      <Mic className="h-4 w-4" />
    </Button>
  );
}
|
|
```
|
|
|
|
2. In server/src/routes/chat-files.ts, add a transcription endpoint. This endpoint receives audio via multer, writes it to a temp file, and runs a local Whisper CLI on it with `execFile` (no shell): it tries `whisper-cpp` (whisper.cpp) first and falls back to `whisper` (OpenAI's Python CLI). If neither is installed, return a 503 with a helpful message.
|
|
|
|
Add this route inside chatFileRoutes, after the file upload routes:
|
|
|
|
```typescript
|
|
// POST /transcribe -- Transcribe audio via local Whisper
//
// Accepts a single multipart upload, writes it into a private per-request temp
// directory and runs a local Whisper CLI on it via execFile (no shell).
// Responses:
//   200 { text }   trimmed transcript
//   400 { error }  multer rejected the upload, or no file was attached
//   500 { error }  a Whisper CLI was found but every attempt failed
//   503 { error }  neither whisper-cpp nor whisper is installed
//
// NOTE(review): the UI posts the recording under the form field "audio";
// confirm that fileUpload accepts that field name and audio/webm uploads.
router.post("/transcribe", async (req, res) => {
  assertBoard(req);

  try {
    await runSingleFileUpload(fileUpload, req, res);
  } catch (err) {
    if (err instanceof multer.MulterError) {
      res.status(400).json({ error: err.message });
      return;
    }
    throw err;
  }

  const file = (req as Request & { file?: { buffer: Buffer; mimetype: string } }).file;
  if (!file) {
    res.status(400).json({ error: "Missing audio field" });
    return;
  }

  const { mkdtemp, readFile, rm, writeFile } = await import("node:fs/promises");
  const { tmpdir } = await import("node:os");
  const { promisify } = await import("node:util");
  const { execFile: execFileCb } = await import("node:child_process");
  const execFileAsync = promisify(execFileCb);

  // execFile reports a missing executable as an error with code "ENOENT".
  const isMissingBinary = (err: unknown): boolean =>
    typeof err === "object" && err !== null && (err as { code?: unknown }).code === "ENOENT";

  // A unique directory per request: concurrent uploads cannot collide on a
  // timestamp-based name, and every file a CLI writes next to the input is
  // removed together with the directory.
  const workDir = await mkdtemp(path.join(tmpdir(), "nexus-audio-"));
  const tmpPath = path.join(workDir, "recording.webm");

  // whisper.cpp: the transcript is read from stdout.
  // NOTE(review): whisper.cpp usually takes a ggml model *path* for --model and
  // some builds only decode WAV input -- confirm against the deployed binary.
  // Any failure here falls through to the Python CLI.
  const runWhisperCpp = async (): Promise<string> => {
    const { stdout } = await execFileAsync("whisper-cpp", [
      "--model", "base.en",
      "--file", tmpPath,
      "--no-timestamps",
      "--output-txt",
    ], { timeout: 30000 });
    return stdout;
  };

  // openai-whisper Python CLI: writes <input basename>.txt into --output_dir.
  // If that file cannot be read, fall back to whatever it printed on stdout.
  const runOpenAiWhisper = async (): Promise<string> => {
    const { stdout } = await execFileAsync("whisper", [
      tmpPath,
      "--model", "base.en",
      "--output_format", "txt",
      "--output_dir", workDir,
    ], { timeout: 60000 });
    const txtPath = tmpPath.replace(/\.webm$/, ".txt");
    try {
      return await readFile(txtPath, "utf-8");
    } catch {
      return stdout;
    }
  };

  try {
    await writeFile(tmpPath, file.buffer);

    let transcript: string | undefined;
    // True once a CLI was found but failed (timeout, decode error, non-zero
    // exit), so that case is not reported as "not installed".
    let backendFailed = false;

    for (const [name, run] of [
      ["whisper-cpp", runWhisperCpp],
      ["whisper", runOpenAiWhisper],
    ] as const) {
      try {
        transcript = await run();
        break;
      } catch (err) {
        if (!isMissingBinary(err)) {
          backendFailed = true;
          console.warn(`[transcribe] ${name} failed`, err);
        }
      }
    }

    // Respond outside the per-backend try blocks so a failure while sending
    // the response can never trigger a second transcription or response.
    if (transcript !== undefined) {
      res.json({ text: transcript.trim() });
    } else if (backendFailed) {
      res.status(500).json({ error: "Transcription failed." });
    } else {
      res.status(503).json({
        error: "Whisper not available. Install whisper-cpp or openai-whisper for voice input.",
      });
    }
  } finally {
    // Best-effort cleanup of the audio file and any CLI output files.
    await rm(workDir, { recursive: true, force: true }).catch(() => {});
  }
});
|
|
```
|
|
|
|
Note: This uses execFileAsync (promisified execFile) -- NOT exec -- to avoid shell injection. The tmpPath is system-generated and safe.
|
|
</action>
|
|
<verify>
|
|
<automated>cd /opt/nexus && test -f ui/src/components/VoiceRecordButton.tsx && echo "VoiceRecordButton exists" && grep -n "transcribe" server/src/routes/chat-files.ts | head -5</automated>
|
|
</verify>
|
|
<acceptance_criteria>
|
|
- File ui/src/components/VoiceRecordButton.tsx exists
|
|
- Contains MediaRecorder API usage (navigator.mediaDevices.getUserMedia)
|
|
- Contains fetch("/api/transcribe") call
|
|
- Contains idle/recording/transcribing states with Mic/Square/Loader2 icons, respectively
|
|
- server/src/routes/chat-files.ts contains router.post("/transcribe") endpoint
|
|
- Transcription endpoint uses execFileAsync (safe, no shell) for whisper CLI
|
|
- Returns 503 with helpful message if whisper is not installed
|
|
</acceptance_criteria>
|
|
<done>VoiceRecordButton captures audio and sends to /api/transcribe; server transcription endpoint processes via local Whisper</done>
|
|
</task>
|
|
|
|
<task type="auto">
|
|
<name>Task 2: Wire VoiceRecordButton into ChatInput and update REQUIREMENTS.md</name>
|
|
<files>ui/src/components/ChatInput.tsx, .planning/REQUIREMENTS.md</files>
|
|
<read_first>
|
|
- ui/src/components/ChatInput.tsx
|
|
- ui/src/components/ChatPanel.tsx
|
|
- ui/src/components/VoiceRecordButton.tsx
|
|
- .planning/REQUIREMENTS.md
|
|
</read_first>
|
|
<action>
|
|
1. Update ui/src/components/ChatInput.tsx:
|
|
- Import VoiceRecordButton: `import { VoiceRecordButton } from "./VoiceRecordButton";`
|
|
- Add an optional prop `enableVoiceInput?: boolean` to ChatInput's props interface
|
|
- Add a handler that inserts transcription text into the textarea:
|
|
```typescript
|
|
// Appends a transcript to whatever is already in the textarea (separated by a
// single space) and moves focus back to it.
const handleTranscription = useCallback((text: string) => {
  const el = textareaRef.current;
  if (!el) return;

  const existing = el.value;
  const combined = existing ? `${existing} ${text}` : text;

  // Write through the prototype's native `value` setter, then dispatch a
  // bubbling "input" event, so that a controlled component's onChange runs.
  const valueDescriptor = Object.getOwnPropertyDescriptor(
    window.HTMLTextAreaElement.prototype,
    "value",
  );
  valueDescriptor?.set?.call(el, combined);
  el.dispatchEvent(new Event("input", { bubbles: true }));

  el.focus();
}, []);
|
|
```
|
|
Alternatively, if ChatInput uses a state variable for the input value, just update that state directly. Read ChatInput.tsx first to determine the correct approach.
|
|
- Render VoiceRecordButton in the button row (next to the Paperclip button), only when `enableVoiceInput` is true:
|
|
```tsx
|
|
{enableVoiceInput && (
|
|
<VoiceRecordButton
|
|
onTranscription={handleTranscription}
|
|
disabled={/* same disabled condition as send button if any */}
|
|
/>
|
|
)}
|
|
```
|
|
|
|
2. Update ui/src/components/ChatPanel.tsx:
|
|
- Pass `enableVoiceInput={true}` to the `<ChatInput>` component. This makes the voice button visible to users. (If a `localAIEnabled` flag or config exists, gate on that instead; otherwise pass `true` unconditionally — the server returns 503 gracefully if whisper is not installed.)
|
|
|
|
3. Update .planning/REQUIREMENTS.md:
|
|
- Change INPUT-04 from `- [ ] **INPUT-04**` to `- [x] **INPUT-04**`
|
|
- In Traceability table, change INPUT-04 from Pending to Complete
|
|
- Also change INPUT-02 and INPUT-03 from Pending to Complete if not already (they were implemented in Plan 25-02)
|
|
</action>
|
|
<verify>
|
|
<automated>cd /opt/nexus && grep -n "VoiceRecordButton\|enableVoiceInput\|handleTranscription" ui/src/components/ChatInput.tsx | head -5 && grep "INPUT-02\|INPUT-03\|INPUT-04" .planning/REQUIREMENTS.md | head -6</automated>
|
|
</verify>
|
|
<acceptance_criteria>
|
|
- ui/src/components/ChatInput.tsx imports VoiceRecordButton
|
|
- Contains enableVoiceInput prop in the props interface
|
|
- Contains handleTranscription callback that inserts text into textarea
|
|
- Renders VoiceRecordButton conditionally when enableVoiceInput is true
|
|
- ui/src/components/ChatPanel.tsx passes enableVoiceInput={true} to ChatInput
|
|
- .planning/REQUIREMENTS.md contains `- [x] **INPUT-02**`
|
|
- .planning/REQUIREMENTS.md contains `- [x] **INPUT-03**`
|
|
- .planning/REQUIREMENTS.md contains `- [x] **INPUT-04**`
|
|
</acceptance_criteria>
|
|
<done>Voice input button in chat input; transcription inserts into textarea; INPUT-02/03/04 marked Complete</done>
|
|
</task>
|
|
|
|
</tasks>
|
|
|
|
<verification>
|
|
- npx tsc --noEmit -p ui/tsconfig.json passes
|
|
- grep "VoiceRecordButton" ui/src/components/ChatInput.tsx matches
|
|
- grep "\[x\].*INPUT-04" .planning/REQUIREMENTS.md matches
|
|
</verification>
|
|
|
|
<success_criteria>
|
|
- VoiceRecordButton appears in ChatInput when voice input is enabled
|
|
- Recording captures audio, sends to /api/transcribe, and inserts result into textarea
|
|
- Server returns transcription via local Whisper (or 503 if not installed)
|
|
- INPUT-02, INPUT-03, INPUT-04 marked Complete in REQUIREMENTS.md
|
|
- TypeScript compiles without errors
|
|
</success_criteria>
|
|
|
|
<output>
|
|
After completion, create `.planning/phases/25-file-system/25-08-SUMMARY.md`
|
|
</output>
|