deepgram / deepgram-js-sdk

Official JavaScript SDK for Deepgram's automated speech recognition APIs.
https://developers.deepgram.com
MIT License

How to implement quick Speech-to-Speech system? #304

Closed · GoldenDragon0710 closed 1 week ago

GoldenDragon0710 commented 1 week ago

Hi, Deepgram developers. I love the Deepgram API; it is really quick. I already checked your live demo using OpenAI, and it's really amazing. So I tried to implement speech-to-speech with the Deepgram API and the OpenAI GPT-4o model, but there is a long delay before the AI audio response is generated.

This is my current code, but I am not sure what the issue is:

const getResponse = async () => {
    let transcript = "";
    let ai_response = "";
    try {
      function readAudioFile(file) {
        const reader = new FileReader();

        const headers = {
          Authorization: `Token ${process.env.DEEPGRAM_API_KEY}`,
          "Content-Type": "audio/wav",
        };

        reader.onload = async function (event) {
          const audioData = event.target.result;
          try {
            const response = await axios.post(
              "https://api.deepgram.com/v1/listen",
              audioData,
              { headers }
            );
            if (response.data) {
              transcript = response.data.results.channels[0].alternatives[0].transcript;
            }
          } catch (error) {
            console.error("Error while transcribing:", error);
          }

          // Get response from OpenAI GPT-4o
          ...

          // Generate audio from text and play
          const response = await fetch(
            "https://api.deepgram.com/v1/speak?model=aura-zeus-en",
            {
              method: "POST",
              headers: {
                Authorization: `Token ${process.env.DEEPGRAM_API_KEY}`,
                "Content-Type": "application/json",
              },
              body: JSON.stringify({ text: ai_response }),
            }
          );
          if (!response.ok) {
            throw new Error(`HTTP error! Status: ${response.status}`);
          }

          const audioContext = new (window.AudioContext || window.webkitAudioContext)();
          const source = audioContext.createBufferSource();
          // Fetch the audio data as an ArrayBuffer
          const arrayBuffer = await response.arrayBuffer();
          audioContext.decodeAudioData(
            arrayBuffer,
            (buffer) => {
              source.buffer = buffer;
              source.connect(audioContext.destination);
              source.start(0);
            },
            (e) => {
              console.error("Error decoding audio data:", e);
            }
          );
        };

        // Define what happens on error
        reader.onerror = function (event) {
          console.error(
            "File could not be read! Code " + event.target.error.code
          );
        };

        // Read the file as an ArrayBuffer (useful for binary files like audio)
        reader.readAsArrayBuffer(file);
      }

      const file = new File([recordedBlob.blob], "recording.wav", {
        type: "audio/wav",
      });
      readAudioFile(file);
    } catch (err) {
      // Note: errors thrown inside reader.onload run asynchronously,
      // after this try/catch has exited, so they will not be caught here.
      console.error(err);
    }
  };
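One likely source of the delay: the code above waits for a complete WAV file, uploads it, and only then starts transcription, so every stage runs strictly after the previous one finishes. A common way to cut latency is to stream microphone audio to Deepgram while it is still being recorded, so transcription overlaps recording. Below is a minimal browser-side sketch using this SDK's v3 live transcription API; it assumes @deepgram/sdk v3 and MediaRecorder support, and handleUtterance is a hypothetical callback where you would call GPT-4o and then TTS:

import { createClient, LiveTranscriptionEvents } from "@deepgram/sdk";

// In production the key should be minted/proxied by your backend;
// shipping process.env.DEEPGRAM_API_KEY to the browser exposes it.
const deepgram = createClient(process.env.DEEPGRAM_API_KEY);

async function startLiveTranscription(handleUtterance) {
  const connection = deepgram.listen.live({
    model: "nova-2",
    smart_format: true,
    interim_results: true,
  });

  connection.on(LiveTranscriptionEvents.Open, async () => {
    const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
    const recorder = new MediaRecorder(stream, { mimeType: "audio/webm" });

    // Stream small chunks as they are produced instead of one big file.
    recorder.addEventListener("dataavailable", (event) => {
      if (event.data.size > 0) connection.send(event.data);
    });
    recorder.start(250); // emit a chunk roughly every 250 ms
  });

  connection.on(LiveTranscriptionEvents.Transcript, (data) => {
    const text = data.channel.alternatives[0].transcript;
    // speech_final marks the end of an utterance, so the LLM call can
    // start almost as soon as the user stops talking.
    if (data.speech_final && text) {
      handleUtterance(text);
    }
  });
}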
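The synthesis side has the same shape of problem: the code downloads the entire /v1/speak response with arrayBuffer() before decoding, so playback cannot begin until TTS has fully finished. One way to start playing sooner is to append response chunks to a MediaSource as they arrive. This is a sketch, not a definitive implementation: it assumes the endpoint's default MP3 output and a browser where MediaSource supports "audio/mpeg" (e.g. Chrome), and speakStreaming is a hypothetical helper name:

async function speakStreaming(text) {
  const response = await fetch(
    "https://api.deepgram.com/v1/speak?model=aura-zeus-en",
    {
      method: "POST",
      headers: {
        Authorization: `Token ${process.env.DEEPGRAM_API_KEY}`,
        "Content-Type": "application/json",
      },
      body: JSON.stringify({ text }),
    }
  );
  if (!response.ok) {
    throw new Error(`HTTP error! Status: ${response.status}`);
  }

  const mediaSource = new MediaSource();
  const audio = new Audio(URL.createObjectURL(mediaSource));

  mediaSource.addEventListener("sourceopen", async () => {
    // Assumes MP3; check MediaSource.isTypeSupported("audio/mpeg") in practice.
    const sourceBuffer = mediaSource.addSourceBuffer("audio/mpeg");
    const reader = response.body.getReader();
    audio.play(); // may require a prior user gesture

    // appendBuffer throws while a previous append is still in flight.
    const waitForIdle = () =>
      new Promise((resolve) => {
        if (!sourceBuffer.updating) return resolve();
        sourceBuffer.addEventListener("updateend", resolve, { once: true });
      });

    while (true) {
      const { done, value } = await reader.read();
      if (done) break;
      await waitForIdle();
      sourceBuffer.appendBuffer(value); // playback starts on the first chunks
    }
    await waitForIdle();
    mediaSource.endOfStream();
  });
}

The same overlapping idea applies between GPT-4o and TTS: calling the chat API with stream: true and forwarding complete sentences to /v1/speak as they arrive lets synthesis begin before the model has finished writing its full response.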
jpvajda commented 1 week ago

@GoldenDragon0710 👋 Since this is a question, please post it in Deepgram GitHub Discussions or on Discord, and someone from our Community can try to help you.