elevenlabs / elevenlabs-js

The official JavaScript (Node) library for ElevenLabs Text to Speech.
https://elevenlabs.io
MIT License

Streaming input #4

Open vhmartinezm opened 10 months ago

vhmartinezm commented 10 months ago

Hey! It's nice to have a JS library, thanks! I'd like to know whether you plan to add streaming input, like the Python library has?

dsinghvi commented 10 months ago

@vhmartinezm definitely -- this should be released in the next couple weeks and I will follow up here when it is.

tday commented 8 months ago

hi @dsinghvi, any update on whether this is supported yet?

dsinghvi commented 8 months ago

Hey @tday -- ElevenLabs is currently putting together an AsyncAPI spec. I expect this to be released by April!

tday commented 8 months ago

For those looking for a temporary solution while we wait for the official SDK to be updated...

Here is an implementation based on the ElevenLabs docs.

This is a rough snippet, ripped out of my app and typed here in GitHub, so it may have some errors that you will need to clean up.

import * as stream from "stream";
import { WebSocket } from "ws";

// NOTE: TextToSpeechClient, getTextToSpeechClient, michaelVoiceID, and
// elevenLabsApiKey below are app-specific helpers/constants from my codebase;
// swap in your own client wrapper, voice ID, and API key.
export async function streamTextToSpeech(
  text: string,
  options?: { ttsClient?: TextToSpeechClient },
): Promise<stream.Readable> {
  const ttsClient = options?.ttsClient ?? getTextToSpeechClient();
  const data: ElevenLabsClient.GenerateAudioBulk = {
    model_id: "eleven_turbo_v2",
    text: text,
    voice_settings: {
      similarity_boost: 0.75,
      stability: 0.5,
      use_speaker_boost: true,
    },
  };

  // NOTE: using voice ID saves an API roundtrip to get voice name -> voice ID
  return ttsClient.generate({
    voice: michaelVoiceID,
    stream: true,
    ...data,
  });
}
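
As an aside, the function above returns a Node readable stream, so you can pipe it anywhere audio bytes should go. A minimal usage sketch (the file name is arbitrary, and this assumes an ESM module with top-level await):

import * as fs from "fs";

// Sketch: write the HTTP-streamed audio straight to disk.
const audio = await streamTextToSpeech("Hello from ElevenLabs!");
audio.pipe(fs.createWriteStream("tts-output.mp3"));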

interface AudioChunk {
  audio: string;
  isFinal: boolean;
  alignment: {
    char_start_times_ms: number[];
    chars_durations_ms: number[];
    chars: string[];
  };
  normalizedAlignment: {
    char_start_times_ms: number[];
    chars_durations_ms: number[];
    chars: string[];
  };
}
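
The alignment arrays give per-character timing, which is handy for captions or word highlighting. A small sketch of reading them (my own example, not from the docs):

// Sketch: log when each character starts, relative to the chunk's audio.
function logCharTimings(chunk: AudioChunk) {
  chunk.alignment.chars.forEach((ch, i) => {
    console.log(`"${ch}" starts at ${chunk.alignment.char_start_times_ms[i]}ms`);
  });
}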

export function inputStreamTextToSpeech(
  textStream: AsyncIterable<string>,
): AsyncGenerator<AudioChunk> {
  const model = "eleven_turbo_v2";
  const wsUrl = `wss://api.elevenlabs.io/v1/text-to-speech/${michaelVoiceID}/stream-input?model_id=${model}`;
  const socket = new WebSocket(wsUrl);

  socket.onopen = function () {
    const streamStart = {
      text: " ",
      voice_settings: {
        stability: 0.5,
        similarity_boost: 0.8,
      },
      xi_api_key: elevenLabsApiKey,
    };

    socket.send(JSON.stringify(streamStart));

    // send stream until done
    const streamComplete = (async () => {
      for await (const message of textStream) {
        const request = {
          text: message,
          try_trigger_generation: true,
        };
        socket.send(JSON.stringify(request));
      }
    })();

    streamComplete
      .then(() => {
        const endStream = {
          text: "",
        };

        socket.send(JSON.stringify(endStream));
      })
      .catch((e) => {
        // Throwing here would only produce an unhandled rejection;
        // surface the error and close the socket instead.
        console.error("Error streaming text to ElevenLabs:", e);
        socket.close();
      });
  };

  return (async function* audioStream() {
    let isDone = false;
    let socketError: Error | undefined;
    let chunks: AudioChunk[] = [];
    let resolve!: (value: unknown) => void;
    let waitForMessage = new Promise((r) => (resolve = r));

    socket.onmessage = function (event) {
      const audioChunk = JSON.parse(event.data as string) as AudioChunk;
      if (audioChunk.audio && audioChunk.alignment) {
        chunks.push(audioChunk);
        resolve(null);
        waitForMessage = new Promise((r) => (resolve = r));
      }
    };

    socket.onerror = function (event) {
      // Throwing from an event handler is swallowed; record the error and
      // unblock the loop so it can be rethrown to the consumer below.
      socketError = new Error(`WebSocket error: ${event.message}`);
      isDone = true;
      resolve(null);
    };

    // Handle socket closing: unblock the pending wait so the loop can exit.
    socket.onclose = function () {
      isDone = true;
      resolve(null);
    };

    while (!isDone) {
      await waitForMessage;
      yield* chunks;
      chunks = [];
    }

    if (socketError) {
      throw socketError;
    }
  })();
}

Call it like this:

import { OpenAI } from "openai";
import { ChatCompletionStream } from "openai/lib/ChatCompletionStream";
import type { ChatCompletionMessageParam } from "openai/resources/chat/completions";

export async function streamCompletion(
  systemPrompt: string,
  messages: ChatCompletionMessageParam[],
) {
  const client = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });
  return client.beta.chat.completions.stream({
    model: "gpt-4-0125-preview",
    messages: [{ role: "system", content: systemPrompt }, ...messages],
  });
}

export async function* llmMessageSource(
  llmStream: ChatCompletionStream,
): AsyncIterable<string> {
  for await (const chunk of llmStream) {
    const message = chunk.choices[0]?.delta?.content;
    if (message) {
      yield message;
    }
  }
}

async function main(systemPrompt: string, prompt: string) {
  const llmStream = await streamCompletion(systemPrompt, [
    { role: "user", content: prompt },
  ]);
  const llmMessageStream = llmMessageSource(llmStream);
  for await (const audio of inputStreamTextToSpeech(llmMessageStream)) {
    console.log(audio);
  }
}
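
If you want playable audio rather than logged chunks, each chunk's audio field can be base64-decoded and appended to a file. A sketch (the helper and file name are mine, not from the SDK):

import * as fs from "fs";

// Hypothetical helper: drain the chunks produced above into an MP3 file.
async function saveSpeech(llmMessageStream: AsyncIterable<string>) {
  const out = fs.createWriteStream("speech.mp3");
  for await (const chunk of inputStreamTextToSpeech(llmMessageStream)) {
    out.write(Buffer.from(chunk.audio, "base64")); // audio is base64-encoded
  }
  out.end();
}
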
VewMet commented 6 months ago

@dsinghvi everyone is waiting for this feature ⌛ 🚀

bharath-naik commented 6 months ago

Hey @dsinghvi, any update on this? I've been waiting for it for quite a long time. Thank you

bharath-naik commented 6 months ago

hello @dsinghvi. The streaming API was expected to be released by April, and we are nearing June. Our applications that need to go live are on hold while we wait for ElevenLabs to provide streaming input. What is so challenging about this for such a great AI voice company? I would request you to put this on priority; we have to go live.

jbaudanza commented 6 months ago

Here is another simple implementation based on node streams. I like this method because back-pressure is handled internally by the streams.

const WebSocket = require('ws');
const { createWebSocketStream } = WebSocket;
// Use the promise-based pipeline so it can be awaited below.
const { pipeline } = require('node:stream/promises');
const { Transform } = require('node:stream');
const fs = require('node:fs');

const modelId = "eleven_multilingual_v2";
const voiceId = "21m00Tcm4TlvDq8ikWAM";

const ws = new WebSocket(`wss://api.elevenlabs.io/v1/text-to-speech/${voiceId}/stream-input?model_id=${modelId}`);

const messageStream = createWebSocketStream(ws, { encoding: 'utf8' });

ws.on('open', () => {
  const payload = {
    "text": " ",
    "voice_settings": {
        "stability": 0.5,
        "similarity_boost": 0.5
    },
    "xi_api_key": process.env.XI_API_KEY,
  }
  ws.send(JSON.stringify(payload));

  // These must always end with a space. 
  ws.send(JSON.stringify({ "text": "Hello world. " }));
  ws.send(JSON.stringify({ "text": "This is some text. "}));

  // End of stream
  ws.send(JSON.stringify({ "text": "" }));
});

ws.on("error", (error) => {
  console.log(error);
});

const transform = new Transform({
  transform(chunk, encoding, callback) {
    if (encoding !== 'utf8') {
      callback(new Error("Expected utf8 encoding"));
      return;
    }

    const message = JSON.parse(chunk);
    if (message.audio) {
      const buffer = Buffer.from(message['audio'], 'base64')
      this.push(buffer);
    }

    callback(null);
  },

  decodeStrings: false,
});

async function run() {
  // Send output to a file, but this could also be a node http.ServerResponse object
  const output = fs.createWriteStream('output.mp3');

  try {
    await pipeline(messageStream, transform, output);
    console.log('finished');
  } catch (err) {
    console.error('Pipeline failed.', err);
  }
}

run();
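
Since the comment in run() notes that the output could also be an http.ServerResponse, here is a hedged sketch of that variant. The per-request WebSocket and the makeTransform() factory are my own additions (stream instances are single-use, so each request needs fresh ones):

const http = require('node:http');

http.createServer(async (req, res) => {
  // One WebSocket and one stream pair per request.
  const wsConn = new WebSocket(`wss://api.elevenlabs.io/v1/text-to-speech/${voiceId}/stream-input?model_id=${modelId}`);
  const messages = createWebSocketStream(wsConn, { encoding: 'utf8' });

  wsConn.on('open', () => {
    wsConn.send(JSON.stringify({
      "text": " ",
      "voice_settings": { "stability": 0.5, "similarity_boost": 0.5 },
      "xi_api_key": process.env.XI_API_KEY,
    }));
    wsConn.send(JSON.stringify({ "text": "Hello over HTTP. " }));
    wsConn.send(JSON.stringify({ "text": "" })); // end of stream
  });

  res.writeHead(200, { 'Content-Type': 'audio/mpeg' });
  try {
    // makeTransform() is a hypothetical factory returning a fresh copy of the
    // base64-decoding Transform defined above.
    await pipeline(messages, makeTransform(), res);
  } catch (err) {
    console.error('Pipeline failed.', err);
    res.end();
  }
}).listen(3000);
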
igoralentyev commented 5 months ago

Does anyone understand how to separate multiple WS requests? For example, I have 2+ calls from Twilio, and they provide a streamId, but I can't find a similar feature in the 11labs WS API. How do I handle this? Maybe some sort of threads?