elevenlabs / elevenlabs-js

The official JavaScript (Node) library for ElevenLabs Text to Speech.
https://elevenlabs.io
MIT License

End-to-end sample code for streaming text-to-speech #48

Open iwasrobbed opened 2 weeks ago

iwasrobbed commented 2 weeks ago

Issue

Understanding how to stream the generated audio and play it back isn't straightforward for devs who are new to working with audio streams.

Expected

Clear sample code showing an end-to-end example, or convenience methods within this JS lib that help people stream and play audio for both server- and client-side use cases.

Example code

For now, here is some sample code others can work off of (and hopefully improve upon).

Text-to-Speech (server-side)

import { ElevenLabsClient } from 'elevenlabs'
import { OptimizeStreamingLatency } from 'elevenlabs/api'
import { Readable } from 'stream'

const elevenlabs = new ElevenLabsClient({
  apiKey: process.env.ELEVENLABS_API_KEY
})

type TextToSpeechProps = {
  text: string
}

// Replace with your own voice IDs from the ElevenLabs dashboard
enum ElevenLabsVoice {
  MyVoice = 'abcd1234'
}

enum ElevenLabsModel {
  MultilingualV2 = 'eleven_multilingual_v2',
  TurboV2 = 'eleven_turbo_v2'
}

export async function textToSpeech({
  text
}: TextToSpeechProps): Promise<Readable> {
  const voiceId = ElevenLabsVoice.MyVoice
  const modelId = ElevenLabsModel.TurboV2

  // Request a streamed response; this returns a Node.js Readable of MP3 audio
  const audioStream = await elevenlabs.generate({
    stream: true,
    voice: voiceId,
    text: text,
    model_id: modelId,
    optimize_streaming_latency: OptimizeStreamingLatency.Three,
    voice_settings: {
      stability: 0.7,
      similarity_boost: 1.0,
      style: 0.5,
      use_speaker_boost: true
    }
  })
  return audioStream
}
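
If you only need the audio on the server (no browser in the loop), the returned Readable can be consumed directly with Node's stream utilities. A minimal sketch, assuming the stream yields MP3 bytes as above; the output path and import path are just examples, adjust them to wherever textToSpeech lives:

import { createWriteStream } from 'fs'
import { pipeline } from 'stream/promises'
import { textToSpeech } from './textToSpeech' // example path

async function saveSpeechToFile() {
  const audioStream = await textToSpeech({ text: 'Hello from the server!' })
  // pipeline handles backpressure and closes the file when the stream ends
  await pipeline(audioStream, createWriteStream('speech.mp3'))
}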

API route to wrap it (since this ElevenLabs lib currently works only in Node environments)

// /api/tts/route.ts
import { textToSpeech } from '@/lib/voice/elevenlabs/textToSpeech'
import { NextResponse } from 'next/server'

export async function POST(req: Request) {
  const { message } = await req.json()
  try {
    console.time('textToSpeech latency')
    const audioStream = await textToSpeech({ text: message })
    console.timeEnd('textToSpeech latency')

    return new Response(audioStream as unknown as BodyInit, {
      headers: { 'Content-Type': 'audio/mpeg' }
    })
  } catch (error: any) {
    console.error(error)
    return NextResponse.json(
      { error: error.message },
      { status: error.statusCode || 500 }
    )
  }
}
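
Side note: if the `as unknown as BodyInit` cast feels fragile, Node 17+ can convert the Node stream into a web ReadableStream that Response accepts natively (Readable.toWeb is still marked experimental in the Node docs). A sketch of the same return statement under that assumption:

import { Readable } from 'stream'

// Inside the try block, instead of the cast:
return new Response(Readable.toWeb(audioStream) as ReadableStream, {
  headers: { 'Content-Type': 'audio/mpeg' }
})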

Hook to call the API route

import { useCallback, useRef, useState } from 'react'
import { playAudioFromResponse } from '@/lib/audioPlayer/audioPlayer'

export const useTextToSpeech = () => {
  const audioRef = useRef<HTMLAudioElement | null>(null)
  const [audioLoaded, setAudioLoaded] = useState(false)
  const [isPlaying, setIsPlaying] = useState(false)
  const [isLoading, setIsLoading] = useState(false)

  const stop = useCallback(() => {
    if (audioRef.current) {
      audioRef.current.pause()
      audioRef.current.currentTime = 0
      setIsPlaying(false)
    }
  }, [])

  const speak = useCallback(async (text: string) => {
    setIsLoading(true)

    try {
      const response = await fetch('/api/tts', {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json'
        },
        body: JSON.stringify({ message: text })
      })

      if (!response.ok) {
        throw new Error('Network response was not ok')
      }

      playAudioFromResponse(
        response,
        audioRef,
        () => setAudioLoaded(true),
        isPlaying => setIsPlaying(isPlaying),
        isLoading => setIsLoading(isLoading)
      )
    } catch (error) {
      // Reset the loading state if the request fails, otherwise it stays stuck on true
      setIsLoading(false)
      throw error
    }
  }, [])

  return { speak, stop, audioLoaded, isPlaying, isLoading }
}
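
One gap in this hook: nothing stops playback if the consuming component unmounts mid-stream. A possible addition (untested sketch) inside useTextToSpeech, after the state declarations:

import { useEffect } from 'react'

useEffect(() => {
  return () => {
    // Pause any in-flight playback when the component goes away
    audioRef.current?.pause()
    audioRef.current = null
  }
}, [])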

Audio player to play the streamed response

export function playAudioFromResponse(
  response: Response,
  audioRef: React.MutableRefObject<HTMLAudioElement | null>,
  onAudioLoaded: () => void,
  onIsPlayingChange: (isPlaying: boolean) => void,
  onIsLoadingChange: (isLoading: boolean) => void
) {
  if (!MediaSource.isTypeSupported('audio/mpeg')) {
    throw new Error('Unsupported MIME type or codec: audio/mpeg')
  }

  // Use Media Source Extensions (MSE) to feed MP3 chunks into an <audio> element as they arrive
  const mediaSource = new MediaSource()
  const audio = new Audio()
  audio.src = URL.createObjectURL(mediaSource)
  audioRef.current = audio

  mediaSource.addEventListener('sourceopen', () => {
    // Once the MediaSource is ready, create a buffer and start pumping response chunks into it
    const sourceBuffer = mediaSource.addSourceBuffer('audio/mpeg')
    onAudioLoaded()
    readAudioChunks(response.body!.getReader(), sourceBuffer, mediaSource)
    onIsLoadingChange(false)
    onIsPlayingChange(true)
    audio.play()
  })

  audio.onended = () => {
    onIsPlayingChange(false)
    onIsLoadingChange(false)
  }

  audio.addEventListener('error', e => {
    console.error('Error playing audio', e)
  })
}

function readAudioChunks(
  reader: ReadableStreamDefaultReader<Uint8Array>,
  sourceBuffer: SourceBuffer,
  mediaSource: MediaSource
) {
  // Chunks are queued because appendBuffer() throws if called while the SourceBuffer is updating
  const queue: Uint8Array[] = []
  let streamFinished = false

  function processQueue() {
    if (sourceBuffer.updating) return
    if (queue.length > 0) {
      sourceBuffer.appendBuffer(queue.shift()!)
    } else if (streamFinished && mediaSource.readyState === 'open') {
      // Only end the stream once every queued chunk has been appended
      mediaSource.endOfStream()
    }
  }

  function push() {
    reader.read().then(({ done, value }) => {
      if (done) {
        streamFinished = true
        processQueue()
        return
      }
      queue.push(value!)
      processQueue()
      push()
    })
  }

  // Each time an append finishes, try to append the next queued chunk
  sourceBuffer.addEventListener('updateend', processQueue)

  push()
}
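
Note that MSE support for 'audio/mpeg' varies by browser (Safari has historically failed the isTypeSupported check). A simpler non-streaming fallback is to buffer the whole response into a Blob and play that; a sketch, with the function name made up for illustration:

async function playAudioFromBlob(
  response: Response,
  audioRef: React.MutableRefObject<HTMLAudioElement | null>
) {
  // Waits for the full download (no progressive playback), but needs no MSE support
  const blob = await response.blob()
  const audio = new Audio(URL.createObjectURL(blob))
  audioRef.current = audio
  await audio.play()
}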

React code to bring it all together

import Image from 'next/image'
import { MdPlayCircle, MdStopCircle } from 'react-icons/md'
import { ImSpinner8 } from 'react-icons/im'
import { useTextToSpeech } from '@/lib/hooks/use-text-to-speech'
import { cn } from '@/lib/utils'

export function EmptyScreen() {
  const { speak, stop, isLoading, isPlaying } = useTextToSpeech()

  const handlePlayAudio = () => {
    if (isPlaying) {
      stop()
    } else {
      speak(
        "Hello, I'm a virtual assistant! Welcome to our AI chatbot. Here, you can chat with me and get assistance with various tasks. What can I help you with?"
      )
    }
  }

  return (
    <div className="mx-auto max-w-2xl px-4">
      <div className="flex flex-col gap-2 rounded-lg border bg-background p-8">
        <Image
          src="/images/assistant.svg"
          alt="Virtual Assistant"
          width={700}
          height={392}
          className="mx-auto"
          priority
        />
        <h1 className="text-lg font-semibold">Hello, I&apos;m your Virtual Assistant!</h1>
        <p className="leading-normal text-muted-foreground">
          Welcome to our AI chatbot. Here, you can chat with me and get assistance with various tasks.
        </p>
        <button
          onClick={handlePlayAudio}
          disabled={isLoading}
          className={cn(
            'flex gap-2 items-center justify-center mt-4 px-4 py-2 bg-blue-500 text-white rounded',
            isLoading && 'opacity-50 cursor-not-allowed'
          )}
        >
        >
          {isLoading ? (
            <ImSpinner8 className="size-4 shrink-0 animate-spin" />
          ) : isPlaying ? (
            <MdStopCircle />
          ) : (
            <MdPlayCircle />
          )}
          {isLoading ? 'Loading...' : isPlaying ? 'Stop' : 'Play'}
        </button>
      </div>
    </div>
  )
}