xenova / whisper-web

ML-powered speech recognition directly in your browser
https://hf.co/spaces/Xenova/whisper-web
MIT License
1.29k stars 152 forks

Vanilla JS example #30

Open flatsiedatsie opened 3 months ago

flatsiedatsie commented 3 months ago

It would be great if there was a minimal vanilla JS example of how to use this in a project. I don't use React, so it's very difficult to extract any understanding from the current npm-based build output.

xenova commented 3 months ago

Check out the whisper-tiny model card for example usage of the model: https://huggingface.co/Xenova/whisper-tiny.en

You'll need to implement the additional UI features and functionality (e.g., mic input) yourself, but there are resources online to assist you!

Example: Transcribe English.

// npm i @xenova/transformers
import { pipeline } from '@xenova/transformers';

let url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/jfk.wav';

// Create transcription pipeline
let transcriber = await pipeline('automatic-speech-recognition', 'Xenova/whisper-tiny.en');
let output = await transcriber(url);
// { text: " And so my fellow Americans ask not what your country can do for you, ask what you can do for your country." }

Example: Transcribe English w/ timestamps.

// npm i @xenova/transformers
import { pipeline } from '@xenova/transformers';

let url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/jfk.wav';

// Create transcription pipeline
let transcriber = await pipeline('automatic-speech-recognition', 'Xenova/whisper-tiny.en');
let output = await transcriber(url, { return_timestamps: true });
// {
//   text: " And so my fellow Americans ask not what your country can do for you, ask what you can do for your country."
//   chunks: [
//     { timestamp: [0, 8],  text: " And so my fellow Americans ask not what your country can do for you" }
//     { timestamp: [8, 11], text: " ask what you can do for your country." }
//   ]
// }

Example: Transcribe English w/ word-level timestamps.

// npm i @xenova/transformers
import { pipeline } from '@xenova/transformers';

let url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/jfk.wav';

// Create transcription pipeline
let transcriber = await pipeline('automatic-speech-recognition', 'Xenova/whisper-tiny.en');
let output = await transcriber(url, { return_timestamps: 'word' });
// {
//   "text": " And so my fellow Americans ask not what your country can do for you ask what you can do for your country.",
//   "chunks": [
//     { "text": " And", "timestamp": [0, 0.78] },
//     { "text": " so", "timestamp": [0.78, 1.06] },
//     { "text": " my", "timestamp": [1.06, 1.46] },
//     ...
//     { "text": " for", "timestamp": [9.72, 9.92] },
//     { "text": " your", "timestamp": [9.92, 10.22] },
//     { "text": " country.", "timestamp": [10.22, 13.5] }
//   ]
// }
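
For the mic input that's left as an exercise above, here is a rough, hedged sketch (not from the model card): it assumes getUserMedia/MediaRecorder for capture and an AudioContext created at 16 kHz, so that decodeAudioData resamples to the rate Whisper expects. recordAndTranscribe and its duration parameter are made-up names for illustration; error handling and permission prompts are left out.

// npm i @xenova/transformers
import { pipeline } from '@xenova/transformers';

// Rough sketch: record a few seconds from the microphone and transcribe it.
// Assumes a browser with getUserMedia, MediaRecorder and AudioContext support.
const SAMPLING_RATE = 16000; // Whisper models expect 16 kHz audio

async function recordAndTranscribe(durationMs = 5000) {
    // Capture microphone audio into Blob chunks
    const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
    const recorder = new MediaRecorder(stream);
    const blobs = [];
    recorder.ondataavailable = (e) => blobs.push(e.data);

    recorder.start();
    await new Promise((resolve) => setTimeout(resolve, durationMs));
    recorder.stop();
    await new Promise((resolve) => (recorder.onstop = resolve));
    stream.getTracks().forEach((track) => track.stop());

    // Decode at 16 kHz (decodeAudioData resamples to the context's rate) and take channel 0
    const audioContext = new AudioContext({ sampleRate: SAMPLING_RATE });
    const arrayBuffer = await new Blob(blobs).arrayBuffer();
    const decoded = await audioContext.decodeAudioData(arrayBuffer);
    const audio = decoded.getChannelData(0); // mono Float32Array

    // The pipeline accepts a Float32Array directly, in addition to URLs
    const transcriber = await pipeline('automatic-speech-recognition', 'Xenova/whisper-tiny.en');
    return await transcriber(audio);
}
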
flatsiedatsie commented 3 months ago

Thanks!

In the end I extracted the necessary part from the React version.

whisper_worker.js

import { pipeline, env } from './js/transformers.js';
console.log("WHISPER WEB WORKER EXISTS");

env.allowLocalModels = false;

addEventListener('message', async (event) => {
    console.log("WHISPER WEB WORKER: RECEIVED MESSAGE");
    console.log("WHISPER WEB WORKER: event.data: ", event.data);

    const message = event.data;
    let task = message.task;

    // Do some work...
    // TODO use message data
    try{

        let transcript = await transcribe(
            message.task.recorded_audio,
            message.model,
            message.multilingual,
            message.quantized,
            message.subtask,
            message.language,
        );
        console.log("WHISPER WEB WORKER: TRANSCRIPTION RESULT: ", transcript);
        if (transcript === null){
            console.error("WHISPER WEB WORKER: transcription was null");
        }
        if (typeof transcript === 'undefined'){
            console.error("WHISPER WEB WORKER: transcription was undefined??");
        }

        delete task.recorded_audio;
        task['transcript'] = transcript;
        // Send the result back to the main thread
        self.postMessage({
            task: task,
            status: "complete",
            //task: "automatic-speech-recognition",
            transcript: transcript,
        });

    }catch(e){
        console.error("ERROR: whisper worker: ", e);
    }

});

// Define model factories
// Ensures only one model is created of each type

class PipelineFactory {
    static task = null;
    static model = null;
    static quantized = null;
    static instance = null;

    constructor(tokenizer, model, quantized) {
        this.tokenizer = tokenizer;
        this.model = model;
        this.quantized = quantized;
    }

    static async getInstance(progress_callback = null) {
        if (this.instance === null) {
            this.instance = pipeline(this.task, this.model, {
                quantized: this.quantized,
                progress_callback,

                // For medium models, we need to load the `no_attentions` revision to avoid running out of memory
                revision: this.model.includes("/whisper-medium") ? "no_attentions" : "main"
            });
        }

        return this.instance;
    }
}

class AutomaticSpeechRecognitionPipelineFactory extends PipelineFactory {
    static task = "automatic-speech-recognition";
    static model = null;
    static quantized = null;
}

const transcribe = async (
    audio,
    model,
    multilingual,
    quantized,
    subtask,
    language,
) => {
    console.log("in transcribe. audio: ", audio);
    console.log("whisper web worker: in transcribe.  model,multilingual,quantized,subtask,language: ", model, multilingual, quantized, subtask, language);

    let output = null;

    try{
        const isDistilWhisper = model.startsWith("distil-whisper/");

        let modelName = model;
        if (!isDistilWhisper && !multilingual) {
            modelName += ".en"
        }

        const p = AutomaticSpeechRecognitionPipelineFactory;
        if (p.model !== modelName || p.quantized !== quantized) {
            // Invalidate model if different
            p.model = modelName;
            p.quantized = quantized;

            if (p.instance !== null) {
                (await p.getInstance()).dispose();
                p.instance = null;
            }
        }

        // Load transcriber model
        let transcriber = await p.getInstance((data) => {
            console.log("whisper web worker: posting something back: ", data);
            self.postMessage(data);
        });

        const time_precision =
            transcriber.processor.feature_extractor.config.chunk_length /
            transcriber.model.config.max_source_positions;

        // Storage for chunks to be processed. Initialise with an empty chunk.
        let chunks_to_process = [
            {
                tokens: [],
                finalised: false,
            },
        ];

        // TODO: Storage for fully-processed and merged chunks
        // let decoded_chunks = [];

        function chunk_callback(chunk) {
            console.log("in whisper chunk callback. chunk: ", chunk);
            let last = chunks_to_process[chunks_to_process.length - 1];

            // Overwrite last chunk with new info
            Object.assign(last, chunk);
            last.finalised = true;

            // Create an empty chunk after, if it is not the last chunk
            if (!chunk.is_last) {
                chunks_to_process.push({
                    tokens: [],
                    finalised: false,
                });
            }
        }

        // Inject custom callback function to handle merging of chunks
        function callback_function(item) {
            //console.log("whisper_worker: COMPLETE?  item: ", item);
            let last = chunks_to_process[chunks_to_process.length - 1];

            // Update tokens of last chunk
            last.tokens = [...item[0].output_token_ids];

            // Merge text chunks
            // TODO optimise so we don't have to decode all chunks every time
            let data = transcriber.tokenizer._decode_asr(chunks_to_process, {
                time_precision: time_precision,
                return_timestamps: true,
                force_full_sequences: false,
            });

            self.postMessage({
                status: "update",
                //task: "automatic-speech-recognition",
                data: data,
            });
        }

        // Actually run transcription
        output = await transcriber(audio, {

            // Greedy
            top_k: 0,
            do_sample: false,

            // Sliding window
            chunk_length_s: isDistilWhisper ? 20 : 30,
            stride_length_s: isDistilWhisper ? 3 : 5,

            // Language and task
            language: language,
            task: subtask,

            // Return timestamps
            return_timestamps: true,
            force_full_sequences: false,

            // Callback functions
            callback_function: callback_function, // after each generation step
            chunk_callback: chunk_callback, // after each chunk is processed
        })

        .catch((error) => {
            console.error("ERROR, actually running whisper failed: ", error);
            return null;
        });

        console.log("beyond WHISPER transcribe. output: ", output);

    }
    catch(e){
        console.error("Whisper worker: error in transcribe function: ", e);
    }

    return output;
};

I start the worker from a JS module with this code:


window.whisper_worker = null;
window.whisper_worker_busy = false;
let whisper_worker_error_count = 0;

//
//   WHISPER
//

function create_whisper_worker(){
    console.log("in create_whisper_worker");

    window.whisper_worker = null;
    window.whisper_worker = new Worker('./whisper_worker.js', {
      type: 'module'
    });

    console.log("whisper_module: window.whisper_worker: ", window.whisper_worker);

    window.whisper_worker.addEventListener('message', e => {
        //console.log("whisper_module: received message from whisper_worker: ", e.data);

        if(typeof e.data.status == 'string'){
            if(e.data.status == 'progress'){
                //console.log("whisper worker sent download percentage: ", e.data.progress);
                let whisper_progress_el = document.getElementById('download-progress-whisper');
                if(whisper_progress_el == null){
                    console.error("whisper (down)load progress element is missing");
                    add_chat_message("whisper",'download_progress#setting---');
                }
                else{
                    //console.log("updating whisper (down)load progress");
                    whisper_progress_el.value = e.data.progress / 100;
                }

            }
            else if(e.data.status == 'ready'){
                console.log("whisper worker sent ready message");
                window.whisper_worker_busy = false;
                add_chat_message("whisper",get_translation('Voice_recognition_has_loaded'));
                let whisper_progress_el = document.getElementById('download-progress-whisper');
                if(whisper_progress_el){
                    whisper_progress_el.classList.add('download-complete-chat-message');
                }
                else{
                    console.error("whisper became ready, but cannot find loading progress indicator element");
                }
            }
            else if(e.data.status == 'initiate'){
                console.log("whisper worker sent initiate message");
            }
            else if(e.data.status == 'download'){
                console.log("whisper worker sent download message");
                add_chat_message("whisper","(down)loading: " + e.data.file);
            }

            else if(e.data.status == 'update'){
                if(typeof e.data.data == 'object' && e.data.data != null && e.data.data.length){
                    set_chat_status(e.data.data[0],2);
                }

            }

            else if(e.data.status == 'complete'){
                window.whisper_worker_busy = false;
                console.log('GOT WHISPER COMPLETE.  e.data: ', e.data);
                console.log('GOT WHISPER COMPLETE.  e.data.transcript: ', e.data.transcript);
                console.log('GOT WHISPER COMPLETE.  e.data.task: ', e.data.task);

                if(e.data.transcript == null){
                    console.warn("whisper recognition failed. If this is the first run, that's normal.");
                    set_state(LISTENING);
                }
                else if(typeof e.data.transcript != 'undefined'){
                    console.log("whisper returned transcription text: ", e.data.transcript);

                    if(Array.isArray(e.data.transcript)){
                        console.log("typeof transcription is array");
                    }
                    else if(typeof e.data.transcript == 'object'){
                        if(typeof e.data.transcript.text == 'string'){
                            console.log("GOT TEXT: ", e.data.transcript.text);
                        }
                    }
                }
                else{
                    console.log("transcript was not in whisper e.data");
                }

                //add_chat_message("whisper","(down)loading: " + e.data.file);
            }
            else{
                console.log("whisper worker sent a content message");
                window.whisper_worker_busy = false;

                if(e.data.data == null){
                    console.warn("whisper recognition failed. If this is the first run, that's normal.");
                    set_state(LISTENING);
                }
            }
        }

        if(window.enable_microphone == false){
            console.log("whisper worker returned a transcript, but in the meantime enable_microphone was disabled. Throwing away the data.");
        }
        else{

            /*

            if(window.whisper_queue.length){
                console.log("whisper worker done, but there is more work to do. Sentences still in whisper_queue: ", window.whisper_queue.length);
                let next_sentence = window.whisper_queue[0][0] + window.whisper_queue[0][1]; // sentence plus punctuation mark
                window.whisper_queue.splice(0,1);

                whisper_worker.postMessage({'whisper_counter':window.whisper_counter,'sentence':next_sentence});
                window.whisper_counter++;
            }
            else{
                console.log("whisper worker was done, and there are no more sentences in the whisper queue. Worker is now idle.");
                window.whisper_worker_busy = false;
            }
            */
        }

    });

    window.whisper_worker.addEventListener('error', (error) => {
        console.error("ERROR: whisper_worker sent error. terminating!. Error was: ", error, error.message);
        whisper_worker_error_count++;

        window.whisper_worker.terminate();
        window.whisper_worker_busy = false;
        if(typeof error != 'undefined' && whisper_worker_error_count < 10){
            setTimeout(() => {
                console.log("attempting to restart whisper worker");
                create_whisper_worker();
            },1000);
        }
        else{
            console.error("whisper_worker errored out");
        }
    });
}

// create whisper worker
create_whisper_worker();

//
//  Send audio buffer to whisper worker
//
function do_whisper_web(task,language=null){
    console.log("in do_whisper_web. task: ", task);

    if(window.whisper_worker_busy){
        console.error("do_whisper_web was called while whisper worker was busy. Aborting.");
        return
    }

    if(typeof task.recorded_audio == 'undefined'){
        console.error("do_whisper_web: task did not contain recorded_audio. Aborting.");
        return
    }

    task.state = 'stt_in_progress';

    let multilingual = false;
    if(typeof language == 'string'){
        if(language != 'en'){
            multilingual = true;
        }
    }
    const quantized = false;
    const model = "Xenova/whisper-tiny";

    const subtask = null;

    console.log("do_whisper_web: sending audio to whisper worker: ", task.recorded_audio);

    window.whisper_worker.postMessage({
        task:task,
        model,
        multilingual,
        quantized,
        subtask: multilingual ? subtask : null,
        language:
            multilingual && language !== "auto" ? language : null,
    });

}
window.do_whisper_web = do_whisper_web;
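
For completeness, a hedged usage sketch of the other end: turning a recorded Blob into the Float32Array that the worker above reads from task.recorded_audio, and handing it to do_whisper_web. The recorded_blob argument and the send_recording_to_whisper name are placeholders, not part of the code above.

// Hedged usage sketch: build a task whose recorded_audio is a 16 kHz mono
// Float32Array (which the transformers.js pipeline accepts directly) and
// pass it to do_whisper_web. `recorded_blob` is whatever your MediaRecorder produced.
async function send_recording_to_whisper(recorded_blob){
    const audio_context = new AudioContext({ sampleRate: 16000 }); // Whisper expects 16 kHz
    const array_buffer = await recorded_blob.arrayBuffer();
    const decoded = await audio_context.decodeAudioData(array_buffer);

    const task = {
        recorded_audio: decoded.getChannelData(0), // mono Float32Array
    };
    window.do_whisper_web(task, 'en');
}
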
hadfield commented 2 months ago

I had a similar desire to use this in vanilla JavaScript form and had extracted some code from the React version to give it a try, but this version is much nicer. Thanks for posting it! A rough version of Whisper + vanilla JS is in this repo: https://github.com/vital-ai/vital-stt-js. I wanted to test this out in combination with a browser-based wake word detector, which I posted here: https://github.com/chat-ai-app/chat-ai-assistant-demo, with the web app posted here: https://demo-voice.chat.ai/