How to get text to speech with timestamps?

Zaniyar commented 3 months ago

// Express route: POST /text-to-speech-timestamps.
// Passes req.body.text to client.textToSpeech.streamWithTimestamps and tries to relay
// everything the returned stream emits back to the HTTP caller as a single string.
// This is the snippet the question is about: per the author, the awaited call yields
// undefined, so the handler never gets as far as sending the collected data.
app.post('/text-to-speech-timestamps', async (req, res) => {
    try {
      // First argument is presumably the voice ID (same shape as VOICE_ID used in the
      // HTTP example further down the thread) — TODO confirm against the SDK docs.
      const audioStream = await client.textToSpeech.streamWithTimestamps("pMsXgVXv3BLzUgSXRplE", {
        text: req.body.text,
        optimize_streaming_latency: 0,
        output_format: 'pcm_22050',
        voice_settings: {
          stability: 0.1,
          similarity_boost: 0.3,
          style: 0.2
        }
      });
      // NOTE(review): the surrounding ** markers look like Markdown bold emphasis from
      // the post rather than JavaScript; they would need to be removed before running.
      // If audioStream really is undefined here, the audioStream.on(...) calls below
      // throw a TypeError, which the catch block turns into the 500 response.
      **console.log(audioStream); // audioStream is always undefined ...** 

      // Collecting chunks of JSON with timestamps and audio
      // (Buffer.concat below assumes every chunk is a Buffer/Uint8Array — TODO confirm
      // what the SDK stream actually emits.)
      const chunks = [];
      audioStream.on('data', (chunk) => {
        chunks.push(chunk);
      });

      // Once the stream finishes, join the chunks, decode them as text, log the result
      // and send it as the response body.
      // NOTE(review): no 'error' listener is registered on audioStream, so a failure
      // after streaming has started is not covered by the try/catch around this code.
      audioStream.on('end', () => {
        const fullResponse = Buffer.concat(chunks).toString();
        console.log(fullResponse)
        res.send(fullResponse);
      });

    } catch (error) {
      // Reached when the SDK call rejects or when setting up the listeners throws.
      console.error('Error generating speech with timestamps:', error);
      res.status(500).send('Failed to generate speech with timestamps');
    }
  });

How can I do the same with the WebSocket endpoint using this npm package? I need the fastest way to get the audio plus timestamps for words/phonemes.

Zaniyar commented 3 months ago

/**
 * Express route: POST /text-to-speech-timestamps.
 *
 * Sends req.body.text to the ElevenLabs "stream/with-timestamps" HTTP endpoint,
 * reads the response as a stream of newline-delimited JSON objects, and replies with
 * one JSON document containing:
 *   - audio: all decoded audio chunks concatenated and re-encoded as base64
 *   - characters / character_start_times_seconds / character_end_times_seconds:
 *     the alignment arrays of every streamed object, concatenated in arrival order
 *
 * Responds with HTTP 500 and a plain-text message when the upstream request fails or
 * the upstream stream errors before a response has been sent.
 */
app.post('/text-to-speech-timestamps', async (req, res) => {
    // Placeholders from the example: substitute your own voice ID and API key
    // (and keep the real key out of source control).
    const VOICE_ID = "21m00Tcm4TlvDq8ikWAM";
    const YOUR_XI_API_KEY = "XXX";
    const url = `https://api.elevenlabs.io/v1/text-to-speech/${VOICE_ID}/stream/with-timestamps`;

    const data = {
      text: req.body.text,
      model_id: "eleven_multilingual_v2",
      voice_settings: {
        stability: 0.5,
        similarity_boost: 0.75
      }
    };

    try {
      const response = await axios({
        method: 'post',
        url: url,
        headers: {
          'Content-Type': 'application/json',
          'xi-api-key': YOUR_XI_API_KEY
        },
        data: data,
        responseType: 'stream'
      });

      // Collect pieces in arrays and join once at the end; concatenating on every
      // chunk re-copies everything received so far.
      const audioChunks = [];
      const characters = [];
      const characterStartTimesSeconds = [];
      const characterEndTimesSeconds = [];
      // Holds the trailing, not-yet-newline-terminated part of the stream.
      let buffer = '';

      // Parses one line of the stream (one JSON object) and accumulates its payload.
      const consumeLine = (line) => {
        if (line.trim() === '') {
          return;
        }

        let responseDict;
        try {
          responseDict = JSON.parse(line);
        } catch (e) {
          console.error('JSON parsing error:', e);
          return;
        }

        // Some objects may carry alignment without audio (or vice versa); handle
        // each part independently so one missing field does not drop the other.
        if (responseDict.audio_base64) {
          audioChunks.push(Buffer.from(responseDict.audio_base64, 'base64'));
        }

        if (responseDict.alignment) {
          characters.push(...(responseDict.alignment.characters ?? []));
          characterStartTimesSeconds.push(...(responseDict.alignment.character_start_times_seconds ?? []));
          characterEndTimesSeconds.push(...(responseDict.alignment.character_end_times_seconds ?? []));
        }
      };

      // Let the stream decode UTF-8 itself so a multi-byte character that is split
      // across two network chunks is not corrupted.
      response.data.setEncoding('utf8');

      response.data.on('data', (chunk) => {
        buffer += chunk;

        // A single chunk can contain several complete JSON lines; parse each one
        // separately and keep only the unfinished tail for the next chunk.
        const lines = buffer.split('\n');
        buffer = lines.pop();
        lines.forEach(consumeLine);
      });

      response.data.on('end', () => {
        // The last object may arrive without a trailing newline.
        consumeLine(buffer);
        buffer = '';

        res.json({
          audio: Buffer.concat(audioChunks).toString('base64'),
          characters: characters,
          character_start_times_seconds: characterStartTimesSeconds,
          character_end_times_seconds: characterEndTimesSeconds
        });
      });

      // Without this listener a mid-stream failure would leave the client waiting
      // (or surface as an unhandled 'error' event).
      response.data.on('error', (error) => {
        console.error('Error:', error);
        if (!res.headersSent) {
          res.status(500).send('Failed to generate speech with timestamps');
        }
      });

    } catch (error) {
      console.error('Error:', error);
      res.status(500).send('Failed to generate speech with timestamps');
    }
  });

OK, it's now working with a plain HTTP request (see the code above). Are there any WebSocket examples?

rayfarer commented 2 months ago

Just wanted to comment and thank you for providing a working example of timestamps in JS @Zaniyar

louisjoecodes commented 1 month ago

Thanks, @Zaniyar, for the example. Here's another example that uses the WebSocket endpoint to get audio with word-alignment data: elevenlabs-websockets-demo.

elevenlabs / elevenlabs-js

How to get text to speech with timestamps? #83