deepgram / deepgram-js-sdk

Official JavaScript SDK for Deepgram's automated speech recognition APIs.
https://developers.deepgram.com
MIT License
155 stars 54 forks source link

Corrupted buffer on SpeakLiveClient and ListenLiveClient #331

Closed DecathectZero closed 1 month ago

DecathectZero commented 1 month ago

This doesn't work on my end: https://developers.deepgram.com/docs/streaming-text-to-speech

when we get a websocket message, it will call the handleMessage function. https://github.com/deepgram/deepgram-js-sdk/blob/main/src/packages/SpeakLiveClient.ts#L139-L162

When event.data itself is a Buffer, the SDK passes along event.data.buffer, which is an ArrayBufferLike — however this loses some crucial metadata carried by the Buffer view itself: BYTES_PER_ELEMENT, byteLength, byteOffset. https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Uint8Array

We shouldn't be passing the underlying ArrayBuffer in the event; we should pass the Buffer itself instead.

Perhaps it works for MP3 or some other files, but it definitely fails with linear16.

To reproduce:

const fs = require("fs");
const { createClient, LiveTTSEvents } = require("../../dist/main/index");

// Reproduction script: stream linear16 TTS audio from Deepgram, buffer it,
// and write it to disk as a playable WAV file once the server flushes.
const live = async () => {
  const text = "Hello, how can I help you today?";

  const deepgram = createClient(process.env.DEEPGRAM_API_KEY);

  // NOTE(review): `bit_rate` looks wrong for raw PCM — Deepgram's linear16
  // option is `sample_rate`; confirm against the streaming TTS docs. As
  // written, writeFile() below assumes 8000 Hz, which disagrees with the
  // 16000 requested here, so the output may play at the wrong speed.
  const dgConnection = deepgram.speak.live({
    encoding: "linear16",
    bit_rate: 16000,
  });

  // Accumulates raw PCM chunks as they arrive from the Audio event.
  let audioBuffer = Buffer.alloc(0);

  dgConnection.on(LiveTTSEvents.Open, () => {
    // Send text data for TTS synthesis.
    dgConnection.sendText(text);

    // Send a Flush message to the server after sending the text.
    dgConnection.flush();

    dgConnection.on(LiveTTSEvents.Audio, (data) => {
      console.log("Deepgram audio data received");
      // This is no longer the same as the original buffer (issue #331):
      // the SDK hands us a bare ArrayBuffer, dropping byteOffset/byteLength.
      const chunk = Buffer.from(data);
      audioBuffer = Buffer.concat([audioBuffer, chunk]);
    });

    dgConnection.on(LiveTTSEvents.Flushed, () => {
      console.log("Deepgram Flushed");
      // Write the buffered audio data to a file when the flush event arrives.
      writeFile();
    });
  });

  /**
   * Build the canonical 44-byte RIFF/WAVE header for a raw PCM payload.
   *
   * @param {number} bufferLength   Size of the PCM data in bytes (Subchunk2Size).
   * @param {number} sampleRate     Samples per second (e.g. 8000, 16000).
   * @param {number} numChannels    Channel count (1 = mono, 2 = stereo).
   * @param {number} bitsPerSample  Bits per sample (16 for linear16).
   * @param {number} [audioFormat=1] WAV audio format tag (1 = PCM).
   * @returns {Buffer} A 44-byte header to prepend to the PCM data.
   */
  function createWavHeader(bufferLength, sampleRate, numChannels, bitsPerSample, audioFormat = 1) {
    const header = Buffer.alloc(44);

    const byteRate = (sampleRate * numChannels * bitsPerSample) / 8;
    const blockAlign = (numChannels * bitsPerSample) / 8;

    // RIFF chunk descriptor
    header.write("RIFF", 0); // ChunkID: "RIFF"
    header.writeUInt32LE(36 + bufferLength, 4); // ChunkSize: 36 + Subchunk2Size
    header.write("WAVE", 8); // Format: "WAVE"

    // fmt subchunk
    header.write("fmt ", 12); // Subchunk1ID: "fmt "
    header.writeUInt32LE(16, 16); // Subchunk1Size: 16 for PCM
    header.writeUInt16LE(audioFormat, 20); // AudioFormat: 1 for PCM (linear16)
    header.writeUInt16LE(numChannels, 22); // NumChannels
    header.writeUInt32LE(sampleRate, 24); // SampleRate
    header.writeUInt32LE(byteRate, 28); // ByteRate
    header.writeUInt16LE(blockAlign, 32); // BlockAlign
    header.writeUInt16LE(bitsPerSample, 34); // BitsPerSample (16 for linear16)

    // data subchunk
    header.write("data", 36); // Subchunk2ID: "data"
    header.writeUInt32LE(bufferLength, 40); // Subchunk2Size

    return header;
  }

  // Prepend a WAV header to the accumulated PCM and persist it to disk.
  const writeFile = () => {
    if (audioBuffer.length > 0) {
      const sampleRate = 8000; // NOTE(review): see mismatch with connection options above.
      const numChannels = 1; // Mono
      const bitsPerSample = 16; // 16 bits for linear16 encoding
      const audioFormat = 1; // PCM format

      const wavHeader = createWavHeader(audioBuffer.length, sampleRate, numChannels, bitsPerSample, audioFormat);
      const wavBuffer = Buffer.concat([wavHeader, audioBuffer]);

      // BUG FIX: the original wrote the headerless `audioBuffer` (leaving
      // `wavBuffer` unused), producing a raw PCM file players can't open,
      // and logged the wrong filename ("output.mp3").
      fs.writeFile("output.wav", wavBuffer, (err) => {
        if (err) {
          console.error("Error writing audio file:", err);
        } else {
          console.log("Audio file saved as output.wav");
        }
      });
    }
  };
};

live();

This is just to generate the file, nothing wrong with createWavHeader - because I've also tried this with vonage's streaming API: https://developer.vonage.com/en/voice/voice-api/concepts/websockets#writing-audio-to-the-websocket and with Twilio's https://www.twilio.com/docs/voice/media-streams/websocket-messages#send-websocket-messages-to-twilio

If I don't use the SDK and use my own custom WS client, it works fine. Basically instead of:

    } else if (event.data instanceof ArrayBuffer) {
      this.handleBinaryMessage(event.data);
    } else if (Buffer.isBuffer(event.data)) {
      this.handleBinaryMessage(event.data.buffer);

it should be:

    } else if (event.data instanceof ArrayBuffer) {
      this.handleBinaryMessage(Buffer.from(event.data));
    } else if (Buffer.isBuffer(event.data)) {
      this.handleBinaryMessage(event.data);
taf2 commented 1 month ago

maybe 👍

        if (typeof event.data === "string") {
            try {
                const data = JSON.parse(event.data);
                this.handleTextMessage(data);
            }
            catch (error) {
                this.emit(LiveTTSEvents.Error, {
                    event,
                    message: "Unable to parse `data` as JSON.",
                    error,
                });
            }
        }
        else if (event.data instanceof ArrayBuffer) {
            this.handleBinaryMessage(event.data);
        }
        else if (Buffer.isBuffer(event.data)) {
            this.handleBinaryMessage(event.data.buffer);
        } else if (event.data instanceof Blob) {
            const reader = new FileReader();
            reader.onload = () => {
                this.handleBinaryMessage(reader.result);
            };
            reader.readAsArrayBuffer(event.data);
        }
        else {
            console.log("Received unknown data type", event.data);
            this.emit(LiveTTSEvents.Error, {
                event,
                message: "Received unknown data type.",
            });
        }
    }
taf2 commented 1 month ago

Looks like you can do the following to work around the bug for now handling the error event by checking if event data is a Blob and then converting the blob to a buffer and calling handleBinaryMessage as below:

this.deepgramText2Speech.on(LiveTTSEvents.Error, (error) => {
      const event = error.event;   
 // XXX: deepgramText2Speech client has a bug, so at least as of version 3.7.0 we need to hack the handleMessage method
      if (event.data instanceof Blob) {
        console.log("read the blob", event.data);
        event.data.arrayBuffer().then(arrayBuffer => {
          const buffer = Buffer.from(arrayBuffer);
          this.deepgramText2Speech.handleBinaryMessage(buffer);
        }).catch(error => {
          console.error("Failed to read blob", error);
        });
      } else {
        console.error(new Date(), "[deepgramText2Speech]: error:", error);
      }
    });
DecathectZero commented 1 month ago

Looks like you can do the following to work around the bug for now handling the error event by checking if event data is a Blob and then converting the blob to a buffer and calling handleBinaryMessage as below:

this.deepgramText2Speech.on(LiveTTSEvents.Error, (error) => {
      const event = error.event;   
 // XXX: deepgramText2Speech client has a bug, so at least as of version 3.7.0 we need to hack the handleMessage method
      if (event.data instanceof Blob) {
        console.log("read the blob", event.data);
        event.data.arrayBuffer().then(arrayBuffer => {
          const buffer = Buffer.from(arrayBuffer);
          this.deepgramText2Speech.handleBinaryMessage(buffer);
        }).catch(error => {
          console.error("Failed to read blob", error);
        });
      } else {
        console.error(new Date(), "[deepgramText2Speech]: error:", error);
      }
    });

this isn't on LiveTTSEvents.Error, but rather on LiveTTSEvents.Audio

taf2 commented 1 month ago

@DecathectZero correct, but the library internally does not handle the type correctly... so until a fixed npm package is published, listening on the Error event and adding the workaround logic to detect a Blob resolves the issue...