SYSTRAN / faster-whisper

Faster Whisper transcription with CTranslate2
MIT License

using faster_whisper with javascript frontend via websockets - different audio format? #671

Closed · ewagner70 closed this 3 months ago

ewagner70 commented 9 months ago

I would like to use the microphone from the web browser and send the audio chunks in real time to a Python backend. The data (Float32Arrays) are sent, but they seem to be different from what PyAudio produces.

Here is sample code for the web frontend:

<html>
<head>
  <script>

    let isTalking = false;
    let socket = null; // WebSocket connection, created in startTalking()

    function toggleTalk() {
      let button = document.getElementById("TalkButton");
      if (isTalking) {
          button.innerHTML = "Talk";
          stopTalking();
      } else {
          button.innerHTML = "Stop Talk";
          startTalking();
      }
      isTalking = !isTalking;
    }

    // Create an AudioContext object
    var audioContext=new AudioContext();

    // Create a ScriptProcessorNode object with a buffer size of 1024 and one input and one output channel
    var processor = audioContext.createScriptProcessor(1024, 1, 1);

    // Define a function that will be called when the processor has audio data available
    processor.onaudioprocess = function(event) {
      // Get the input audio data as a Float32Array
      var input = event.inputBuffer.getChannelData(0);

      // Send the output audio data as a binary message to the server
      socket.send(input.buffer);
    };

    // Define a function that will request the access to the microphone and start the audio capture
    function startTalking() {
      // Request the access to the microphone and create a MediaStream object
      navigator.mediaDevices.getUserMedia({audio: true})
          //.then(function(stream) {
          .then(stream => {
            // Create a WebSocket connection to the server
            socket = new WebSocket("ws://localhost:5000");

            // Resume AudioContext
            audioContext.resume();
            // Create a MediaStreamSource object from the stream
            var source = audioContext.createMediaStreamSource(stream);
            // Connect the source to the processor
            source.connect(processor);
            // Connect the processor to the destination
            processor.connect(audioContext.destination);
            // Display a message that the capture has started
            document.getElementById("status").innerHTML = "Capture started";
          })
          .catch(function(error) {
            // Display an error message if the request fails
            document.getElementById("status").innerHTML = "Capture failed: " + error.message;
          });
    }

    // Define a function that will stop the audio capture
    function stopTalking() {
      // Disconnect the processor from the source and the destination
      processor.disconnect();

      // Close websocket
      socket.close();

      // Display a message that the capture has stopped
      document.getElementById("status").innerHTML = "Capture stopped";
    }

  </script>
</head>
<body>
  <h2>MediaRecorder and WebSocket Demo</h2>
  <p id="status">Ready to capture</p>
  <button id="TalkButton" onclick="toggleTalk()">Talk</button>
</body>
</html>

Here is the corresponding Python backend for faster_whisper:

from faster_whisper import WhisperModel
import numpy as np
import sys
import asyncio
from websockets.server import serve

accumulated_transcription=''

async def echo(websocket):
    global accumulated_transcription
    global count
    # global model
    model_size='large-v1'
    model=WhisperModel(model_size, device='cuda', compute_type='float32', num_workers=4)
    count=0

    frames=[]
    print("Server started.")
    async for data in websocket:
        # samples=np.sin(np.arange(50000)/20).astype(np.float32)
        # print(type(data), len(data))
        frames.append(data)
        count+=1
        # print('count: ',count)
        if count>20:
            chunk_frames=np.frombuffer(b''.join(frames), dtype=np.float32)
            segments, info = model.transcribe(chunk_frames, 
                                    language='en',
                                    beam_size=3, 
                                    temperature=0,
                                    suppress_blank=True, 
                                    vad_filter=True,
                                    condition_on_previous_text=True,
                                    compression_ratio_threshold =1.8)
            results=''
            for segment in segments:
                results += segment.text
            sys.stdout.write("sst: "+results)
            sys.stdout.flush()
            accumulated_transcription += results
            count=0
            frames=[]
    # await websocket.send(message)

async def server():
    async with serve(echo, "localhost", 5000):
        await asyncio.Future()  # run forever

async def main():

    # model=WhisperModel(model_size, device='cpu', compute_type='int8')

    print('Starting Server...')
    # asyncio.run(server())
    server_task = asyncio.create_task(server())

    await asyncio.sleep(15)
    server_was_cancelled=server_task.cancel()
    print(f'server was cancelled: {server_was_cancelled}')

asyncio.run(main())

As a comparison, the following standalone Python code with PyAudio works as expected, so I don't really know why the data streamed from the JavaScript frontend isn't recognized properly (no errors are generated, by the way):

from faster_whisper import WhisperModel
import pyaudio
import numpy as np
import sys

def main():
    # model_size='small.en'
    # model_size='medium.en'
    model_size='large-v1'
    model=WhisperModel(model_size, device='cuda', compute_type='float32', num_workers=4)
    # model=WhisperModel(model_size, device='cpu', compute_type='int8')

    chunk=1024
    format=pyaudio.paFloat32
    # format=pyaudio.paInt16
    channels=1
    sample_rate=16384
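    # note: faster-whisper assumes 16000 Hz when given a raw NumPy array; 16384 is close but not exact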
    record_seconds=3

    p=pyaudio.PyAudio()

    mics=[]
    for i in range(p.get_device_count()):
        print(p.get_device_info_by_index(i)['name'])

    stream=p.open(format=format, 
                  channels=channels, 
                  rate=sample_rate, 
                  input=True, 
                  output=True, 
                  frames_per_buffer=chunk)
    accumulated_transcription=""
    print("Start Talking ...")
    try:
        while True:
            # samples=np.sin(np.arange(50000)/20).astype(np.float32)
            chunk_file='temp_chunk.wav'
            # with open(chunk_file, 'wb') as audio_file:
            #     audio_bytes=audio_file.write()
            #     audio_file=io.BytesIO(audio_bytes)
            frames=[]
            for _ in range(int(sample_rate/chunk*record_seconds)):
                data=stream.read(chunk)
                print(type(data), len(data))
                frames.append(data)
            chunk_frames=np.frombuffer(b''.join(frames), dtype=np.float32)
            segments, info = model.transcribe(chunk_frames, 
                                    language='en',
                                    beam_size=5, 
                                    temperature=0,
                                    suppress_blank=True, 
                                    vad_filter=True,
                                    condition_on_previous_text=True,
                                    compression_ratio_threshold =1.8)
            results=''
            for segment in segments:
                results += segment.text
            sys.stdout.write(results)
            sys.stdout.flush()
            accumulated_transcription += results
    except KeyboardInterrupt:
        print('Stopping ...')
        with open('log.txt', 'w') as log_file:
            log_file.write(accumulated_transcription)
    finally:
        print('LOG:' + accumulated_transcription)
        stream.stop_stream()
        stream.close()
        p.terminate()

if __name__=='__main__':
    main()
Purfview commented 9 months ago

Uneducated guess:


chunk_frames=np.frombuffer(b''.join(frames), dtype=np.float32) / 32768.0
ewagner70 commented 9 months ago

Uneducated guess:

chunk_frames=np.frombuffer(b''.join(frames), dtype=np.float32) / 32768.0

No, I would have been surprised if that worked ... did you try it on your own? Both code samples should work out of the box, with the second one producing OK results, but the JavaScript/Python-backend combination only recognizes "Bye", although that was never said.

Purfview commented 9 months ago

Try to save audio to a file and look at the differences.

...did you try it on your own?

Of course not. Uneducated guess - An arbitrary guess with no particular reasoning behind it.

ewagner70 commented 9 months ago

Try to save audio to a file and look at the differences.

That's tricky, as I use the microphone and all I get is a byte stream of float32 data. It is apparently hard to find a solution that streams audio from a web page to a Python backend via WebSockets (or anything else like SignalR, etc.). Either I'm blind/too stupid for Google searches, or there is literally not a single example (based on non-deprecated libs/functions) on the internet. I even tried ChatGPT, but also nothing ...

Spiritcow commented 9 months ago

Please let me know if you find a solution. I have the same problem.

phineas-pta commented 9 months ago

@ewagner70 the reason for dividing by 32768.0 is to normalize the array values to between -1 and 1

it's the convention for representing audio as a numerical array
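
For illustration, that division is only needed when the raw bytes are 16-bit integer PCM; a minimal sketch (assuming a hypothetical frames list holding int16 byte chunks):

import numpy as np

# 16-bit PCM samples span -32768..32767; dividing by 32768.0 maps them to roughly -1.0..1.0
pcm_int16 = np.frombuffer(b''.join(frames), dtype=np.int16)
pcm_float32 = pcm_int16.astype(np.float32) / 32768.0

The browser's getChannelData() already returns float32 values in [-1, 1], so this conversion does not apply to the data sent by the frontend above.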

ewagner70 commented 9 months ago

@ewagner70 the reason for dividing by 32768.0 is to normalize the array values to between -1 and 1

it's the convention for representing audio as a numerical array

the values in the Float32Array are already between -1 and +1, and they also arrive at the Python backend as such ... is it a different encoding (big/little endian, a PCM codec, or ... ???)
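
One way to sanity-check this on the backend is to interpret the same bytes under both byte orders and compare the value ranges; a rough sketch, assuming data is a single received WebSocket message:

import numpy as np

def inspect_chunk(data: bytes) -> None:
    le = np.frombuffer(data, dtype='<f4')  # little-endian float32 (what a Float32Array buffer normally is)
    be = np.frombuffer(data, dtype='>f4')  # big-endian float32, for comparison
    # well-formed audio should stay roughly within [-1.0, 1.0] under the correct interpretation
    print('LE range:', le.min(), le.max(), '| BE range:', be.min(), be.max())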

anbzerc commented 6 months ago

Hi @ewagner70, do you have any updates?

ewagner70 commented 6 months ago

Hi @ewagner70, do you have any updates?

@anbzerc : unfortunately no update, as I am at my wits' end ... even the faster_whisper people apparently don't know what the difference between the JavaScript and Python libs is ... it would be really helpful if someone could provide sample code where JavaScript picks up the audio chunks and transfers them to a Python backend for faster_whisper transcription ... the fact that apparently no one (?!) can resolve this is blocking many use cases where more than one person is supposed to use such a solution.

anbzerc commented 6 months ago

:sob: Thanks for the answer.

Spiritcow commented 6 months ago

@ewagner70 @anbzerc You can try to use WebRTC with aiortc on the Python backend. aiortc handles the conversion of raw audio packets to av.AudioFrame (from PyAV). With WebSockets you need to handle this conversion yourself, which means knowing about codecs, bitrate and so on.
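
For reference, the receiving side with aiortc could look roughly like this; a sketch only, assuming a recent aiortc/PyAV, where consume_audio and the ~3-second buffering threshold are made up for illustration:

import numpy as np
from av.audio.resampler import AudioResampler

# runs inside an aiortc "track" handler; track is the remote audio MediaStreamTrack
resampler = AudioResampler(format="s16", layout="mono", rate=16000)

async def consume_audio(track, model):
    buffered = []
    while True:
        frame = await track.recv()                # av.AudioFrame decoded by aiortc
        for f in resampler.resample(frame):       # mono 16 kHz int16 (resample() returns a list in recent PyAV)
            pcm = f.to_ndarray().flatten().astype(np.float32) / 32768.0
            buffered.append(pcm)
        if sum(len(b) for b in buffered) >= 16000 * 3:   # roughly 3 seconds of audio
            audio = np.concatenate(buffered)
            buffered.clear()
            segments, _ = model.transcribe(audio, language="en")
            print("".join(s.text for s in segments))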

ewagner70 commented 6 months ago

@ewagner70 @anbzerc You can try to use WebRTC with aiortc on the Python backend. aiortc handles the conversion of raw audio packets to av.AudioFrame (from PyAV). With WebSockets you need to handle this conversion yourself, which means knowing about codecs, bitrate and so on.

@Spiritcow : I am not struggling with the data transfer (that is working). I am struggling with the conversion - that's what I'm looking for, as the data format is not described anywhere ... do you have any pointer on that as well?

@anbzerc : Did you make any progress?

anbzerc commented 6 months ago

Not yet unfortunately :cry:

anbzerc commented 6 months ago

This example should be interesting: https://github.com/aiortc/aiortc/tree/main/examples/server (I'll try it as soon as possible)

ewagner70 commented 6 months ago

This example should be interesting: https://github.com/aiortc/aiortc/tree/main/examples/server (I'll try it as soon as possible)

@anbzerc : this example uses an ICE server ... if you manage to solve it without ICE, directly via WebSocket or similar, let us all know!

Spiritcow commented 6 months ago

@ewagner70 That's why I propose using WebRTC with aiortc: it does the conversion for you. You will need to learn about codecs and audio formats if you want to use plain sockets.

ldolegowski92 commented 6 months ago

@ewagner70 if you save the audio to a WAV file on disk and pass it to the model, the transcription is OK:

import numpy as np
import soundfile as sf

fname = r"C:\test.wav"
sig = np.frombuffer(b''.join(frames), dtype=np.float32)
sf.write(fname, sig, 16000, format="wav")
segments, info = model.transcribe(fname, language="en")
text = " ".join([segment.text.strip() for segment in segments])

can you try:

import io
import soundfile as sf

f = io.BytesIO()
sf.write(f, sig, 16000, format="wav")
f.seek(0)
segments, _ = model.transcribe(f, language="en")
text = " ".join([segment.text.strip() for segment in segments])
f.close()
ewagner70 commented 6 months ago

@ewagner70 if you save the audio to a WAV file on disk and pass it to the model, the transcription is OK:

import numpy as np
import soundfile as sf

fname = r"C:\test.wav"
sig = np.frombuffer(b''.join(frames), dtype=np.float32)
sf.write(fname, sig, 16000, format="wav")
segments, info = model.transcribe(fname, language="en")
text = " ".join([segment.text.strip() for segment in segments])

can you try:

import io
import soundfile as sf

f = io.BytesIO()
sf.write(f, sig, 16000, format="wav")
f.seek(0)
segments, _ = model.transcribe(f, language="en")
text = " ".join([segment.text.strip() for segment in segments])
f.close()

Dear @ldolegowski92, I know that saving as a WAV file works (the 44-byte WAV header being the "only" distinction between the raw audio chunks and a WAV file).

However, my intent was to use one of the two other input formats that transcribe accepts (BinaryIO or ndarray) and feed the received audio chunks to it directly ...
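
For what it's worth, the ndarray path only works if the array is already exactly what the model expects; a minimal sketch of that contract (raw_bytes and model stand in for the joined WebSocket messages and the loaded WhisperModel):

import numpy as np

# a NumPy array passed to transcribe() is taken as-is: mono float32 samples in [-1, 1] at 16000 Hz;
# a WAV file or BytesIO, by contrast, carries its sample rate in the header and gets decoded/resampled for you
audio = np.frombuffer(raw_bytes, dtype=np.float32)
# ... the browser's AudioContext rate (typically 44100 or 48000 Hz) would have to be resampled to 16000 Hz here ...
segments, info = model.transcribe(audio, language="en")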

oyang886 commented 3 months ago

Greetings,

My guess is that the data obtained from the microphone is 44.1 kHz or 48 kHz, while the model expects 16 kHz, so you get strange output. PyAudio presumably resamples the received data to 16 kHz, which is why that program runs smoothly.
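
If resampling in the browser is not an option, the same fix can also be done on the Python side before calling transcribe; a rough sketch using plain NumPy linear interpolation (resample_to_16k is made up for illustration, and the browser's actual AudioContext rate, assumed to be 48000 here, would have to be communicated or agreed upon):

import numpy as np

def resample_to_16k(chunk: np.ndarray, from_rate: int) -> np.ndarray:
    # crude linear-interpolation resample of mono float32 audio down to 16 kHz
    if from_rate == 16000:
        return chunk
    duration = len(chunk) / from_rate
    n_out = int(duration * 16000)
    x_old = np.linspace(0.0, duration, num=len(chunk), endpoint=False)
    x_new = np.linspace(0.0, duration, num=n_out, endpoint=False)
    return np.interp(x_new, x_old, chunk).astype(np.float32)

chunk_frames = np.frombuffer(b''.join(frames), dtype=np.float32)
audio_16k = resample_to_16k(chunk_frames, 48000)
segments, info = model.transcribe(audio_16k, language='en')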

ewagner70 commented 3 months ago

Greetings,

My guess is that the data obtained from the microphone is 44.1 kHz or 48 kHz, while the model expects 16 kHz, so you get strange output. PyAudio presumably resamples the received data to 16 kHz, which is why that program runs smoothly.

Thank you, @oyang886 - finally, after 6 months, someone has solved it. I merely had to adapt my HTML with the following downsampling code:

        // Create an AudioContext object
        var audioContext = new AudioContext();
        var fromSampleRate = audioContext.sampleRate;
        var toSampleRate = 16000;

        // Create a ScriptProcessorNode object with a buffer size of 4096 and one input and output channel
        var processor = audioContext.createScriptProcessor(4096, 1, 1);

        // Define a function that will be called when the processor has audio data available
        processor.onaudioprocess = function (event) {
            // Get the input audio data as a Float32Array
            var input = event.inputBuffer.getChannelData(0);

            // Send the output audio data as a binary message to the server
            socket.send(downsample(input, fromSampleRate, toSampleRate));
        };

and then simply add another function:

        function downsample(buffer, fromSampleRate, toSampleRate) {
            // buffer is a Float32Array
            var sampleRateRatio = Math.round(fromSampleRate / toSampleRate);
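            // note: rounding the ratio means non-integer ratios (e.g. 44100 -> 16000) are only approximated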
            var newLength = Math.round(buffer.length / sampleRateRatio);

            var result = new Float32Array(newLength);
            var offsetResult = 0;
            var offsetBuffer = 0;
            while (offsetResult < result.length) {
                var nextOffsetBuffer = Math.round((offsetResult + 1) * sampleRateRatio);
                var accum = 0, count = 0;
                for (var i = offsetBuffer; i < nextOffsetBuffer && i < buffer.length; i++) {
                    accum += buffer[i];
                    count++;
                }
                result[offsetResult] = accum / count;
                offsetResult++;
                offsetBuffer = nextOffsetBuffer;
            }
            return result;
        }
ewagner70 commented 3 months ago

can be closed now ... the documentation is really sub-par and not for the faint of heart ...