Open Zaniyar opened 3 months ago
/**
 * POST /text-to-speech-timestamps
 *
 * Forwards `req.body.text` to the ElevenLabs "stream with timestamps"
 * endpoint, accumulates the streamed audio and per-character alignment data,
 * and replies with a single JSON document:
 *
 *   {
 *     audio: <base64 string of all audio chunks concatenated>,
 *     characters: [...],
 *     character_start_times_seconds: [...],
 *     character_end_times_seconds: [...]
 *   }
 *
 * Responds 400 when `text` is missing or empty, and 500 when the upstream
 * request or stream fails.
 */
app.post('/text-to-speech-timestamps', async (req, res) => {
  const VOICE_ID = "21m00Tcm4TlvDq8ikWAM";
  // NOTE(review): placeholder value — a real key should come from
  // configuration / environment rather than being committed in source.
  const YOUR_XI_API_KEY = "XXX";
  const url = `https://api.elevenlabs.io/v1/text-to-speech/${VOICE_ID}/stream/with-timestamps`;

  // Validate before touching the network; also guards against `req.body`
  // being undefined when no JSON body parser is installed.
  const text = req.body && req.body.text;
  if (typeof text !== 'string' || text.length === 0) {
    res.status(400).send('Request body must include a non-empty "text" string');
    return;
  }

  const data = {
    text: text,
    model_id: "eleven_multilingual_v2",
    voice_settings: {
      stability: 0.5,
      similarity_boost: 0.75
    }
  };

  // Sends the generic failure response unless a response has already started.
  const fail = (error) => {
    console.error('Error:', error);
    if (!res.headersSent) {
      res.status(500).send('Failed to generate speech with timestamps');
    }
  };

  try {
    const response = await axios({
      method: 'post',
      url: url,
      headers: {
        'Content-Type': 'application/json',
        'xi-api-key': YOUR_XI_API_KEY
      },
      data: data,
      responseType: 'stream'
    });

    const upstream = response.data;
    // Let the stream decode UTF-8 itself so a multibyte character that is
    // split across two network chunks is not corrupted.
    upstream.setEncoding('utf8');

    // Collected separately and concatenated once at the end, instead of
    // re-copying the whole audio buffer on every chunk.
    const audioChunks = [];
    let characters = [];
    let characterStartTimesSeconds = [];
    let characterEndTimesSeconds = [];
    // Holds the trailing, not-yet-newline-terminated part of the stream.
    let pending = '';

    // Parses one line of the stream (presumably one JSON object per line —
    // that is the framing this handler relies on) and folds it into the
    // accumulators. Malformed lines are logged and skipped.
    const consumeLine = (line) => {
      if (line.trim() === '') {
        return;
      }
      let message;
      try {
        message = JSON.parse(line);
      } catch (e) {
        console.error('JSON parsing error:', e);
        return;
      }
      if (message === null || typeof message !== 'object') {
        return;
      }
      // Audio is optional per message; skipping it must not discard the
      // alignment data carried by the same message.
      if (typeof message.audio_base64 === 'string') {
        audioChunks.push(Buffer.from(message.audio_base64, 'base64'));
      }
      const alignment = message.alignment;
      if (alignment) {
        characters = characters.concat(alignment.characters || []);
        characterStartTimesSeconds = characterStartTimesSeconds.concat(
          alignment.character_start_times_seconds || []
        );
        characterEndTimesSeconds = characterEndTimesSeconds.concat(
          alignment.character_end_times_seconds || []
        );
      }
    };

    upstream.on('data', (chunk) => {
      pending += chunk;
      // A single chunk may contain several complete lines; handle each one
      // individually and keep only the unterminated tail for the next chunk.
      const lines = pending.split('\n');
      pending = lines.pop();
      lines.forEach(consumeLine);
    });

    upstream.on('end', () => {
      // The final message may arrive without a trailing newline.
      consumeLine(pending);
      pending = '';
      res.json({
        audio: Buffer.concat(audioChunks).toString('base64'),
        characters: characters,
        character_start_times_seconds: characterStartTimesSeconds,
        character_end_times_seconds: characterEndTimesSeconds
      });
    });

    // Without this listener a mid-stream failure would surface as an
    // unhandled 'error' event and the client would never get a response.
    upstream.on('error', fail);
  } catch (error) {
    fail(error);
  }
});
OK, with the HTTP request it's now working. Are there any WebSocket examples?
Just wanted to comment and thank you for providing a working example of timestamps in JS @Zaniyar
Thanks @Zaniyar for the example - Here's another example of using the Websocket endpoint to get audio with word alignment data: elevenlabs-websockets-demo.
How can I do the same with the WebSocket endpoint using this npm package? I need the fastest way to get the audio plus timestamps for words/phonemes.