nexus/server/src/routes/voice.ts
Nexus Dev b95634c61a feat(39-01): sentence-buffered TTS streaming + multi-language synthesis
- Export splitSentences() with title-abbreviation protection (Dr., Mr. etc.)
- Add synthesizeSentenceStream() AsyncGenerator yielding per-sentence audio chunks
- Add synthesizeMultiLang() synthesizing same text in N voices via Promise.all
- Add POST /api/synthesize/stream SSE endpoint with base64 audio per sentence
- Add POST /api/synthesize/multi-lang returning array of voiceId+audio pairs
- Existing POST /api/synthesize unchanged (backward compatible)
2026-04-04 03:35:31 +00:00

104 lines
3.7 KiB
TypeScript

import { Router } from "express";
import multer from "multer";
import { assertBoard } from "./authz.js";
import { voicePipelineService } from "../services/voice-pipeline.js";
import { MAX_ATTACHMENT_BYTES } from "../attachment-types.js";
/**
 * Builds the Express router exposing the voice-pipeline endpoints.
 *
 * Registered routes (paths are relative to wherever this router is mounted):
 * - `POST /transcribe` — multipart upload (field `audio`) → JSON result of `svc.transcribe`.
 * - `POST /synthesize` — `{ text, voiceId? }` → audio sent with `Content-Type: audio/wav`.
 * - `POST /synthesize/stream` — `{ text, voiceId? }` → Server-Sent Events, one base64 audio
 *   event per chunk yielded by `svc.synthesizeSentenceStream`, then `{ done: true }`.
 * - `POST /synthesize/multi-lang` — `{ text, voiceIds }` (1-5 strings) →
 *   `{ results: [{ voiceId, audio }] }` with base64-encoded audio.
 *
 * Every handler calls `assertBoard(req)` before doing any work
 * (presumably an authorization guard that throws — confirm in `./authz.js`).
 *
 * @returns A router with the four voice routes registered.
 */
export function voiceRoutes(): Router {
  const router = Router();
  const svc = voicePipelineService();

  // Uploads are buffered in memory; a single file capped at MAX_ATTACHMENT_BYTES.
  const audioUpload = multer({
    storage: multer.memoryStorage(),
    limits: { fileSize: MAX_ATTACHMENT_BYTES, files: 1 },
  });

  // POST /api/transcribe — transcribe uploaded audio via VoicePipelineService
  router.post("/transcribe", async (req, res) => {
    assertBoard(req);

    // Adapt multer's callback-style middleware to a promise so it can be awaited;
    // a multer error rejects and therefore propagates out of this handler.
    await new Promise<void>((resolve, reject) =>
      audioUpload.single("audio")(req, res, (err) => (err ? reject(err) : resolve()))
    );

    // The parsed upload is read from `req.file` through `any`, as in the original code.
    const file = (req as any).file as { buffer: Buffer; mimetype: string } | undefined;
    if (!file) {
      res.status(400).json({ error: "Missing audio field" });
      return;
    }

    // Derive the format hint from the MIME type; anything that is neither
    // ogg nor wav is treated as webm.
    const fmt = file.mimetype.includes("ogg") ? "ogg"
      : file.mimetype.includes("wav") ? "wav"
      : "webm";

    const result = await svc.transcribe(file.buffer, fmt);
    res.json(result);
  });

  // POST /api/synthesize — synthesize text to speech via VoicePipelineService
  router.post("/synthesize", async (req, res) => {
    assertBoard(req);
    const { text, voiceId } = req.body as { text?: string; voiceId?: string };
    if (!text || typeof text !== "string") {
      res.status(400).json({ error: "text is required" });
      return;
    }
    const audioBuffer = await svc.synthesize(text, voiceId);
    res.setHeader("Content-Type", "audio/wav");
    res.send(audioBuffer);
  });

  // POST /api/synthesize/stream — sentence-buffered SSE streaming TTS
  router.post("/synthesize/stream", async (req, res) => {
    assertBoard(req);
    const { text, voiceId } = req.body as { text?: string; voiceId?: string };
    if (!text || typeof text !== "string") {
      res.status(400).json({ error: "text is required" });
      return;
    }

    // Validation is done; switch the response into SSE mode and send the
    // headers immediately so the client sees the stream open before the
    // first chunk is ready.
    res.setHeader("Content-Type", "text/event-stream");
    res.setHeader("Cache-Control", "no-cache");
    res.setHeader("Connection", "keep-alive");
    res.flushHeaders();

    // Track disconnects: once the response closes there is nobody left to
    // receive audio, so the loop below stops pulling further chunks instead
    // of synthesizing the rest of the text for a dead connection.
    let clientGone = false;
    res.on("close", () => {
      clientGone = true;
    });

    // Frames one SSE message: a single `data:` line terminated by a blank line.
    const sendEvent = (payload: unknown): void => {
      res.write(`data: ${JSON.stringify(payload)}\n\n`);
    };

    try {
      for await (const chunk of svc.synthesizeSentenceStream(text, voiceId)) {
        // Leaving the loop early also asks the async iterator to finish,
        // giving the generator a chance to stop its remaining work.
        if (clientGone) break;
        sendEvent({
          index: chunk.index,
          total: chunk.total,
          audio: chunk.audio.toString("base64"),
        });
      }
      if (!clientGone) sendEvent({ done: true });
    } catch (err) {
      // Headers are already sent, so failures are reported in-band as an event
      // rather than through an HTTP status code.
      if (!clientGone) {
        const message = err instanceof Error ? err.message : "Synthesis failed";
        sendEvent({ error: message });
      }
    } finally {
      res.end();
    }
  });

  // POST /api/synthesize/multi-lang — synthesize same text in multiple languages/voices
  router.post("/synthesize/multi-lang", async (req, res) => {
    assertBoard(req);
    const { text, voiceIds } = req.body as { text?: string; voiceIds?: unknown };
    if (!text || typeof text !== "string") {
      res.status(400).json({ error: "text is required" });
      return;
    }
    // Bound the fan-out: between one and five voices per request.
    if (!Array.isArray(voiceIds) || voiceIds.length < 1 || voiceIds.length > 5) {
      res.status(400).json({ error: "voiceIds must be an array with 1-5 entries" });
      return;
    }
    if (!voiceIds.every((v) => typeof v === "string")) {
      res.status(400).json({ error: "voiceIds must be an array of strings" });
      return;
    }

    // The service result is iterated via entries() as [voiceId, audio] pairs
    // (presumably a Map keyed by voice id — confirm in voice-pipeline.js).
    const resultMap = await svc.synthesizeMultiLang(text, voiceIds as string[]);
    const results = Array.from(resultMap.entries()).map(([voiceId, audio]) => ({
      voiceId,
      audio: audio.toString("base64"),
    }));
    res.json({ results });
  });

  return router;
}