SYSTRAN / faster-whisper

Faster Whisper transcription with CTranslate2

Issue decoding audio chunks properly for the faster-whisper transcribe function #836

Open · Akshay-akkay opened 1 month ago

I have an HTML frontend that sends audio chunks to a FastAPI backend over a websocket as bytes, but after decoding and transcribing, the result is always an empty string. Can somebody help me here?
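
For context on the payload format: `blobToBase64` in the frontend below resolves with `FileReader.readAsDataURL()`, so each websocket message carries a data-URL string (`data:audio/wav;base64,<payload>`), not raw audio bytes. A minimal sketch of turning such a string back into bytes on the Python side (this assumes that format; `data_url_to_bytes` is my own helper name, not a faster-whisper API):

```
import base64

def data_url_to_bytes(data_url: str) -> bytes:
    """Drop the 'data:<mime>;base64,' prefix and decode the payload."""
    header, _, payload = data_url.partition(",")
    if not header.endswith("base64"):
        raise ValueError("expected a base64 data URL")
    return base64.b64decode(payload)

# Round-trip check with a tiny fake payload:
raw = b"RIFF....WAVEfmt "
url = "data:audio/wav;base64," + base64.b64encode(raw).decode("ascii")
assert data_url_to_bytes(url) == raw
```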

HTML Frontend

```
<!DOCTYPE html>
<html>
<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>Realtime WebSocket Audio Streaming</title>
  <style>
    body {
      background-color: black;
      color: green;
    }

    button {
      background-color: green;
      color: white;
      border: none;
      padding: 10px 20px;
      font-size: 16px;
      cursor: pointer;
    }

    button:hover {
      background-color: #4CAF50;
    }

    #responseContainer {
      margin-top: 10px;
    }
  </style>
</head>

<body>
  <h1>Realtime WebSocket Audio Streaming</h1>
  <button id="startButton">Start Streaming</button>
  <button id="stopButton">Stop Streaming</button>
  <div id="responseContainer"></div>
  <script src="https://www.WebRTC-Experiment.com/RecordRTC.js"></script>
  <script>
    let ws = new WebSocket("ws://localhost:8000/ws");

    ws.onmessage = event => {
      let responseContainer = document.getElementById('responseContainer');
      responseContainer.innerHTML += `<p>${event.data}</p>`;
    };

    let audioChunks = [];

    let handleDataAvailable = (blob) => {
      // RecordRTC's ondataavailable receives the Blob directly, not an event
      if (blob.size > 0) {
        console.log('blob', blob);
        audioChunks.push(blob);
        blobToBase64(blob).then(b64 => {
          ws.send(JSON.stringify({
            "message": b64,
            "messageType": "audio"
          }));
        })
      }
    };

    function blobToBase64(blob) {
      return new Promise((resolve, _) => {
        const reader = new FileReader();
        // NB: reader.result is a data URL ("data:audio/wav;base64,..."), not raw base64
        reader.onloadend = () => resolve(reader.result);
        reader.readAsDataURL(blob);
      });
    }

    navigator.mediaDevices.getUserMedia({ audio: true })
      .then(stream => {
        let recorder = RecordRTC(stream, {
          type: 'audio',
          recorderType: StereoAudioRecorder,
          mimeType: 'audio/wav',
          timeSlice: 1000,
          desiredSampRate: 16000,
          numberOfAudioChannels: 1,
          ondataavailable: handleDataAvailable
        });

        document.getElementById('startButton').addEventListener('click', () => {
          recorder.startRecording();
          document.getElementById('startButton').style.backgroundColor = 'gray';
        });

        document.getElementById('stopButton').addEventListener('click', () => {
          recorder.stopRecording();
          ws.send(JSON.stringify({
            "message": "EOA",
            "messageType": "audio"
          }));

          // Save audio chunks to file and show the download link
          const blob = new Blob(audioChunks, { type: 'audio/wav' });
          const url = URL.createObjectURL(blob);
          const link = document.createElement('a');
          link.href = url;
          link.download = 'audio.wav';
          link.click();
        });
      });

    ws.onopen = () => {
      console.log('WebSocket connection opened');
    };

    ws.onclose = () => {
      console.log('WebSocket connection closed');
    };
  </script>
</body>

</html>
```
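
One more wrinkle I'm not certain about: with `timeSlice`, RecordRTC's `StereoAudioRecorder` seems to emit each chunk as a standalone WAV file, so concatenating the chunks byte-for-byte on the server would interleave 44-byte RIFF headers into the sample stream. A hedged sketch of stripping those headers before building the float32 array that faster-whisper's `transcribe` accepts (assumes plain 44-byte PCM headers with no extra RIFF chunks; `wav_chunks_to_pcm` is a hypothetical helper):

```
import numpy as np

WAV_HEADER_BYTES = 44  # canonical PCM WAV header size; assumes no extra RIFF chunks

def wav_chunks_to_pcm(chunks: list[bytes]) -> np.ndarray:
    """Strip the header from each standalone WAV chunk, concatenate the
    16-bit PCM samples, and scale them to float32 in [-1.0, 1.0]."""
    pcm = b"".join(chunk[WAV_HEADER_BYTES:] for chunk in chunks)
    return np.frombuffer(pcm, dtype=np.int16).astype(np.float32) / 32768.0
```

Since the recorder is configured for 16 kHz mono above, the resulting array matches the 16 kHz float32 input `model.transcribe` expects.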

FastAPI Backend

```
import logging
from fastapi import FastAPI, WebSocket
from fastapi.responses import HTMLResponse
from pydantic import BaseModel
from typing import Union
import re
import numpy as np

from faster_whisper import WhisperModel

model_size = "base.en"

app = FastAPI()

class ChatModel(BaseModel):
    messageType: str
    message: Union[str, bytes, None] = None
    history: list[str] = []

model = WhisperModel(model_size, device="cpu", compute_type="int8")
logger = logging.getLogger("faster_whisper")
logger.setLevel(logging.DEBUG)  # enable faster-whisper debug logging
# Use the static/index.html
html = open("static/index.html").read()

@app.get("/")
async def get():
    return HTMLResponse(html)

@app.websocket("/ws")
async def websocket_endpoint(websocket: WebSocket):
    await websocket.accept()
    history: list[str] = []
    audioBuffer: list[bytes] = []
    while True:
        data = await websocket.receive_json()
        # print(data)
        if isinstance(data, dict):  # every message arrives as a JSON object
            chat = ChatModel(
                history=history,
                message=data["message"],
                messageType=data["messageType"],
            )
            if chat.messageType == "text":
                history.append(chat.message)
                chat.history = history
                await websocket.send_text(
                    f"{chat.messageType}: {chat.message} \n {chat.history}"
                )

            elif chat.message == "EOA":
                # buf = bytes(chat.message.encode("utf-8"))

                # what if this takes so long that the client gives up? we should send
                # incremental results, or at least placeholder results, while we process
                audio_data = b"".join(audioBuffer)

                print(data)
                print(
                    f"audio buffer had {len(audioBuffer)} segments, final length {len(audio_data)}"
                )

                # Save to file
                with open("audio.wav", "wb") as f:
                    f.write(audio_data)

                audio_np = (
                    np.frombuffer(audio_data, dtype=np.int16).astype(np.float32)
                    / 32768.0
                )
                segments, info = model.transcribe(audio_np)
                print(f"Info: {info}")
                rr = []
                for segment in segments:
                    print(segment.text)
                    rr.append(segment.text)
                m = " ".join(rr)
                # print("recognition result before strip:", m)
                m = m.strip()
                # sometimes whisper puts in commentary like [soft music] and we strip that out:
                # print("recognition result before bracket re:", m)
                m = re.sub(r"\[.*\]", "", m)
                # print("recognition result after bracket re:", m)
                # was having problems with things like "He said hello" which became He said, "Hello"
                m = re.sub(r'"', '\\"', m)  # escape " as \" for the hand-built JSON reply below
                # print("recognition result after quoting:", m)
                # send json result to konele client:
                msg = f'{{"status": 0, "result": {{"hypotheses": [{{"transcript": "{m}"}}], "final": true}}}}'
                print("msg is", msg)
                await websocket.send_text(msg)
            else:
                audioBuffer.append(chat.message.encode("utf-8"))
                print(f"audio buf now has {len(audioBuffer)} segments")```