Spaces:
Running
Running
Cleanup TTS API and add utterance route for simple one-party TTS
Browse files
api.js
CHANGED
|
@@ -76,7 +76,9 @@ async function runOpenAITTS(text, audioFilename, voiceId, ttsModel='tts-1') {
|
|
| 76 |
await fsp.writeFile(audioFilename, buffer);
|
| 77 |
}
|
| 78 |
|
| 79 |
-
|
|
|
|
|
|
|
| 80 |
const voiceLookupTable = {
|
| 81 |
DEFAULT: 'alloy',
|
| 82 |
ALICE: 'shimmer',
|
|
@@ -86,14 +88,64 @@ async function generateAudio(speakerName, content) {
|
|
| 86 |
MALE_GUEST: 'onyx',
|
| 87 |
FEMALE_GUEST: 'alloy',
|
| 88 |
};
|
| 89 |
-
|
| 90 |
-
|
|
|
|
| 91 |
const fileName = path.join(MEDIA_FOLDER, `${uuidv4()}.mp3`);
|
| 92 |
|
| 93 |
-
await runOpenAITTS(content, fileName, actualVoiceId,
|
| 94 |
return fileName;
|
| 95 |
}
|
| 96 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
function concatenateAudioFiles(audioFiles, outputFilePath) {
|
| 98 |
return new Promise((resolve, reject) => {
|
| 99 |
if (audioFiles.length === 1) {
|
|
@@ -134,64 +186,44 @@ function concatenateAudioFiles(audioFiles, outputFilePath) {
|
|
| 134 |
});
|
| 135 |
}
|
| 136 |
|
| 137 |
-
//
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
if (apiKey !== 'their_api_key') {
|
| 148 |
-
// Replace "their_api_key" with your actual method of managing API keys
|
| 149 |
-
res.status(401).send('Unauthorized');
|
| 150 |
-
return;
|
| 151 |
-
}
|
| 152 |
|
| 153 |
-
const script = req.query.payload;
|
| 154 |
-
if (!script) {
|
| 155 |
-
res.status(400).send('Bad Request: Missing payload');
|
| 156 |
-
return;
|
| 157 |
-
}
|
| 158 |
|
| 159 |
-
const hash = crypto.createHash('sha1');
|
| 160 |
-
hash.update(script);
|
| 161 |
-
const scriptHash = hash.digest('hex');
|
| 162 |
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
}
|
| 168 |
|
| 169 |
-
|
| 170 |
-
|
|
|
|
| 171 |
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
}
|
| 176 |
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
}
|
| 181 |
|
| 182 |
-
// Concatenate audio files into one using FFmpeg
|
| 183 |
-
const combinedAudioPath = path.join(MEDIA_FOLDER, `combined_${uuidv4()}.mp3`);
|
| 184 |
-
await concatenateAudioFiles(audioSegments, combinedAudioPath);
|
| 185 |
|
| 186 |
-
audioCache[scriptHash] = combinedAudioPath;
|
| 187 |
-
res.sendFile(path.resolve(combinedAudioPath), { headers: { 'Content-Type': 'audio/mpeg' } });
|
| 188 |
-
} catch (error) {
|
| 189 |
-
console.error('Error generating speech:', error);
|
| 190 |
-
res.status(500).send('Internal Server Error');
|
| 191 |
-
}
|
| 192 |
-
});
|
| 193 |
|
| 194 |
-
//
|
|
|
|
| 195 |
app.post('/api/generate/speech/stream', async (req, res) => {
|
| 196 |
try {
|
| 197 |
const apiKey = req.query.api_key || 'their_api_key';
|
|
|
|
| 76 |
await fsp.writeFile(audioFilename, buffer);
|
| 77 |
}
|
| 78 |
|
| 79 |
+
//this supports all openai voices with tts-1 and tts-1-hd models
|
| 80 |
+
//voice name can be in openai format or one of the aliases in voiceLookupTable below
|
| 81 |
+
async function generateAudio(speakerName, content, ttsModel="tts-1") {
|
| 82 |
const voiceLookupTable = {
|
| 83 |
DEFAULT: 'alloy',
|
| 84 |
ALICE: 'shimmer',
|
|
|
|
| 88 |
MALE_GUEST: 'onyx',
|
| 89 |
FEMALE_GUEST: 'alloy',
|
| 90 |
};
|
| 91 |
+
const openaiVoices = ['alloy', 'shimmer', 'echo', 'nova', 'fable', 'onyx']
|
| 92 |
+
|
| 93 |
+
const actualVoiceId = openaiVoices.indexOf(speakerName) > -1 ? speakerName : (voiceLookupTable[speakerName] || voiceLookupTable['DEFAULT']);
|
| 94 |
const fileName = path.join(MEDIA_FOLDER, `${uuidv4()}.mp3`);
|
| 95 |
|
| 96 |
+
await runOpenAITTS(content, fileName, actualVoiceId, ttsModel);
|
| 97 |
return fileName;
|
| 98 |
}
|
| 99 |
|
| 100 |
+
/**
 * Render a multi-speaker script to one MP3 file and send it on `res`.
 *
 * The script text is hashed (SHA-1) and the path of the generated file is stored in
 * `audioCache` under that hash, so a repeated script is answered from the earlier file.
 *
 * @param {string} script - Script text, split into segments by parseScript().
 *   When omitted, a two-line sample script is used.
 * @param {object} res - Express response; every outcome is written to it
 *   (the audio file, 400 for bad input, 500 when generation fails).
 * @returns {Promise<void>} Settles once a response has been issued (or the failure logged).
 */
async function generateSpeechFromScript(script="ALICE: Hello, world\n\nBOB: Hello, hamster", res) {
    try {
        // TODO: API key check - respond 401 'Unauthorized' when the caller's key is not accepted.

        if (!script) {
            res.status(400).send('Bad Request: Missing payload');
            return;
        }

        // req.query / req.body can deliver arrays or objects. Hashing those throws and
        // would surface as a 500, so reject them up front as a client error.
        if (typeof script !== 'string') {
            res.status(400).send('Bad Request: payload must be a string');
            return;
        }

        const scriptHash = crypto.createHash('sha1').update(script).digest('hex');
        const sendOptions = { headers: { 'Content-Type': 'audio/mpeg' } };

        const cachedPath = audioCache[scriptHash];
        if (cachedPath) {
            res.sendFile(path.resolve(cachedPath), sendOptions);
            return;
        }

        // Segments are synthesized one after another so the files stay in script order.
        const audioSegments = [];
        for (const segment of parseScript(script)) {
            const audioPath = await generateAudio(segment.speaker_name, segment.content);
            audioSegments.push(audioPath);
        }

        if (audioSegments.length === 0) {
            res.status(400).send('No audio generated');
            return;
        }

        // Concatenate audio files into one using FFmpeg
        const combinedAudioPath = path.join(MEDIA_FOLDER, `combined_${uuidv4()}.mp3`);
        await concatenateAudioFiles(audioSegments, combinedAudioPath);

        audioCache[scriptHash] = combinedAudioPath;
        res.sendFile(path.resolve(combinedAudioPath), sendOptions);
    } catch (error) {
        console.error('Error generating speech:', error);
        // Only answer if there is a response object and nothing has been sent yet;
        // otherwise this handler would itself throw and hide the logged error.
        if (res && !res.headersSent) {
            res.status(500).send('Internal Server Error');
        }
    }
}
|
| 148 |
+
|
| 149 |
function concatenateAudioFiles(audioFiles, outputFilePath) {
|
| 150 |
return new Promise((resolve, reject) => {
|
| 151 |
if (audioFiles.length === 1) {
|
|
|
|
| 186 |
});
|
| 187 |
}
|
| 188 |
|
| 189 |
+
// Payload should be in film-script style: speaker names in all caps, with a blank line between speakers
|
| 190 |
+
// ALICE: Hi bob,how are you?
|
| 191 |
+
//
|
| 192 |
+
// BOB: Shitty. One of my coworkers put my hamster in the microwave thinking it was his lunch
|
| 193 |
+
// This route is for multi-party TTS. For ordinary single-voice TTS, call /api/generate/utterance instead.
|
| 194 |
+
// Multi-party TTS over GET: the script is read from the `payload` query parameter.
// The path needs its leading slash, otherwise the route never matches a request URL.
app.get('/api/generate/speech', async (req, res) => {
    const {payload} = req.query;
    // generateSpeechFromScript writes the response itself, so it must be given `res`.
    await generateSpeechFromScript(payload, res);
});
|
| 198 |
|
| 199 |
+
// Multi-party TTS over POST: the script is read from the `payload` field of the request body.
// The path needs its leading slash, otherwise the route never matches a request URL.
app.post('/api/generate/speech', async (req, res) => {
    // req.body is undefined when no body parser handled the request; avoid throwing on destructure.
    const {payload} = req.body || {};
    // generateSpeechFromScript writes the response itself, so it must be given `res`.
    await generateSpeechFromScript(payload, res);
});
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 203 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 204 |
|
|
|
|
|
|
|
|
|
|
| 205 |
|
| 206 |
+
// This is normal TTS: specify voice, text, model. Voices are from openai, use those names or the aliases in lookup table
|
| 207 |
+
// Single-voice TTS over GET. Query parameters: `voice`, `text`, `model` (falls back to "tts-1").
// Responds with the generated MP3, 400 when `text` is missing, 500 when generation fails.
app.get('/api/generate/utterance', async (req, res) => {
    try {
        const {voice, text, model} = req.query;
        if (!text) {
            res.status(400).send('Bad Request: Missing text');
            return;
        }

        const outputFilename = await generateAudio(voice, text, model || "tts-1");

        // We want the browser to cache this response, because there's no reason to TTS
        // the same text-voice-model combination more than once
        res.sendFile(path.resolve(outputFilename), {
            headers: { 'Content-Type': 'audio/mpeg', 'Cache-Control': 'max-age=8640000' },
        });
    } catch (error) {
        console.error('Error generating utterance:', error);
        if (!res.headersSent) {
            res.status(500).send('Internal Server Error');
        }
    }
});
|
| 214 |
|
| 215 |
+
// Single-voice TTS over POST. Body fields: `voice`, `text`, `model` (falls back to "tts-1").
// Responds with the generated MP3, 400 when `text` is missing, 500 when generation fails.
app.post('/api/generate/utterance', async (req, res) => {
    try {
        // req.body is undefined when no body parser handled the request; avoid throwing on destructure.
        const {voice, text, model} = req.body || {};
        if (!text) {
            res.status(400).send('Bad Request: Missing text');
            return;
        }

        const outputFilename = await generateAudio(voice, text, model || "tts-1");

        // We want the browser to cache this response, because there's no reason to TTS
        // the same text-voice-model combination more than once
        res.sendFile(path.resolve(outputFilename), {
            headers: { 'Content-Type': 'audio/mpeg', 'Cache-Control': 'max-age=8640000' },
        });
    } catch (error) {
        console.error('Error generating utterance:', error);
        if (!res.headersSent) {
            res.status(500).send('Internal Server Error');
        }
    }
});
|
|
|
|
| 222 |
|
|
|
|
|
|
|
|
|
|
| 223 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 224 |
|
| 225 |
+
// This returns a stream of SSE (application/event-stream) similar to a streaming response from an LLM
|
| 226 |
+
// See example in public/client for how to consume the stream
|
| 227 |
app.post('/api/generate/speech/stream', async (req, res) => {
|
| 228 |
try {
|
| 229 |
const apiKey = req.query.api_key || 'their_api_key';
|