Gadersd / whisper-burn

A Rust implementation of OpenAI's Whisper model using the burn framework
MIT License

Trying the example I got an empty transcription #15

Closed DavidGOrtega closed 11 months ago

DavidGOrtega commented 11 months ago
whisper-burn % cargo run --release --bin transcribe tiny_en audio16k.wav en transcription.txt
warning: unused imports: `Bool`, `Float`, `Int`
 --> src/helper.rs:2:51
  |
2 |     activation::relu, backend::Backend, BasicOps, Bool, Element, Float, Int, Numeric, Tensor,
  |                                                   ^^^^           ^^^^^  ^^^
  |
  = note: `#[warn(unused_imports)]` on by default

warning: unused imports: `Bool`, `Int`, `activation::relu`
 --> src/model/load.rs:8:14
  |
8 |     tensor::{activation::relu, backend::Backend, Bool, Int, Tensor},
  |              ^^^^^^^^^^^^^^^^                    ^^^^  ^^^

warning: unused import: `Conv1dRecord`
  --> src/model/mod.rs:10:38
   |
10 |         conv::{Conv1d, Conv1dConfig, Conv1dRecord},
   |                                      ^^^^^^^^^^^^

warning: unused import: `Tokenizer`
 --> src/token.rs:4:30
  |
4 | use tokenizers::{AddedToken, Tokenizer};
  |                              ^^^^^^^^^

warning: unused import: `crate::helper::*`
 --> src/transcribe.rs:2:5
  |
2 | use crate::helper::*;
  |     ^^^^^^^^^^^^^^^^

warning: unused import: `num_traits::ToPrimitive`
 --> src/transcribe.rs:7:5
  |
7 | use num_traits::ToPrimitive;
  |     ^^^^^^^^^^^^^^^^^^^^^^^

warning: unused imports: `Float`, `Int`, `config::Config`, `self`
  --> src/transcribe.rs:12:5
   |
12 |     config::Config,
   |     ^^^^^^^^^^^^^^
...
16 |         backend::{self, Backend},
   |                   ^^^^
17 |         Data, Float, Int, Tensor,
   |               ^^^^^  ^^^

warning: unused import: `std::cmp::Ordering`
 --> src/beam.rs:1:5
  |
1 | use std::cmp::Ordering;
  |     ^^^^^^^^^^^^^^^^^^

warning: unused variable: `n_batch`
   --> src/model/mod.rs:132:14
    |
132 |         let [n_batch, seq_len] = x.dims();
    |              ^^^^^^^ help: if this is intentional, prefix it with an underscore: `_n_batch`
    |
    = note: `#[warn(unused_variables)]` on by default

warning: variable does not need to be mutable
  --> src/token.rs:15:13
   |
15 |         let mut tokenizer = tokenizers::Tokenizer::from_file("tokenizer.json")?;
   |             ----^^^^^^^^^
   |             |
   |             help: remove this `mut`
   |
   = note: `#[warn(unused_mut)]` on by default

warning: unused variable: `new_text`
  --> src/transcribe.rs:53:14
   |
53 |         let (new_text, new_tokens) =
   |              ^^^^^^^^ help: if this is intentional, prefix it with an underscore: `_new_text`

warning: unused variable: `n_ctx_max_decoder`
   --> src/transcribe.rs:159:9
    |
159 |     let n_ctx_max_decoder = whisper.decoder_ctx_size();
    |         ^^^^^^^^^^^^^^^^^ help: if this is intentional, prefix it with an underscore: `_n_ctx_max_decoder`

warning: unused variable: `n_channel`
   --> src/transcribe.rs:161:10
    |
161 |     let [n_channel, n_mel, n_ctx] = mels.dims();
    |          ^^^^^^^^^ help: if this is intentional, prefix it with an underscore: `_n_channel`

warning: unused variable: `first_timestamp_token`
   --> src/transcribe.rs:183:9
    |
183 |     let first_timestamp_token = bpe.special_token(SpecialToken::Timestamp(0.0)).unwrap();
    |         ^^^^^^^^^^^^^^^^^^^^^ help: if this is intentional, prefix it with an underscore: `_first_timestamp_token`

warning: unused variable: `initial_tokens`
   --> src/transcribe.rs:195:13
    |
195 |     let mut initial_tokens = if prev_nonspecial_tokens.len() > 0 {
    |             ^^^^^^^^^^^^^^ help: if this is intentional, prefix it with an underscore: `_initial_tokens`

warning: unused variable: `n_batch`
   --> src/transcribe.rs:263:14
    |
263 |         let [n_batch, n_token, n_dict] = log_probs.dims();
    |              ^^^^^^^ help: if this is intentional, prefix it with an underscore: `_n_batch`

warning: unused variable: `n_token`
   --> src/transcribe.rs:263:23
    |
263 |         let [n_batch, n_token, n_dict] = log_probs.dims();
    |                       ^^^^^^^ help: if this is intentional, prefix it with an underscore: `_n_token`

warning: unused variable: `n_dict`
   --> src/transcribe.rs:263:32
    |
263 |         let [n_batch, n_token, n_dict] = log_probs.dims();
    |                                ^^^^^^ help: if this is intentional, prefix it with an underscore: `_n_dict`

warning: variable does not need to be mutable
   --> src/transcribe.rs:195:9
    |
195 |     let mut initial_tokens = if prev_nonspecial_tokens.len() > 0 {
    |         ----^^^^^^^^^^^^^^
    |         |
    |         help: remove this `mut`

warning: unused variable: `end_node`
  --> src/beam.rs:74:17
   |
74 |             let end_node = continuations[end_node_index].clone();
   |                 ^^^^^^^^ help: if this is intentional, prefix it with an underscore: `_end_node`

warning: unused variable: `tok1`
  --> src/beam.rs:77:39
   |
77 | ..._unstable_by(|(tok1, log_prob1), (tok2, log_prob2)| log_prob1.partial_cmp(&log_prob2).unwrap());
   |                   ^^^^ help: if this is intentional, prefix it with an underscore: `_tok1`

warning: unused variable: `tok2`
  --> src/beam.rs:77:58
   |
77 | ..., log_prob1), (tok2, log_prob2)| log_prob1.partial_cmp(&log_prob2).unwrap());
   |                   ^^^^ help: if this is intentional, prefix it with an underscore: `_tok2`

warning: function `get_mel_filters` is never used
  --> src/audio.rs:58:4
   |
58 | fn get_mel_filters<B: Backend>(
   |    ^^^^^^^^^^^^^^^
   |
   = note: `#[warn(dead_code)]` on by default

warning: function `fft_frequencies` is never used
   --> src/audio.rs:145:4
    |
145 | fn fft_frequencies<B: Backend>(sample_rate: f64, n_fft: usize) -> Tensor<B, 1> {
    |    ^^^^^^^^^^^^^^^

warning: function `test_fft_frequencies` is never used
   --> src/audio.rs:159:4
    |
159 | fn test_fft_frequencies<B: Backend>() {
    |    ^^^^^^^^^^^^^^^^^^^^

warning: function `test_mel_frequencies` is never used
   --> src/audio.rs:166:4
    |
166 | fn test_mel_frequencies<B: Backend>(htk: bool) {
    |    ^^^^^^^^^^^^^^^^^^^^

warning: function `mel_frequencies` is never used
   --> src/audio.rs:174:4
    |
174 | fn mel_frequencies<B: Backend>(n_mels: usize, fmin: f64, fmax: f64, htk: bool) -> Tensor<B, 1> {
    |    ^^^^^^^^^^^^^^^

warning: function `construct_special_tokens` is never used
   --> src/token.rs:297:4
    |
297 | fn construct_special_tokens() -> Vec<AddedToken> {
    |    ^^^^^^^^^^^^^^^^^^^^^^^^

warning: field `log_prob` is never read
   --> src/transcribe.rs:145:5
    |
143 | struct BeamSearchToken {
    |        --------------- field in this struct
144 |     token: usize, 
145 |     log_prob: f64, 
    |     ^^^^^^^^
    |
    = note: `BeamSearchToken` has a derived impl for the trait `Clone`, but this is intentionally ignored during dead code analysis

warning: function `first_repetition_end` is never used
   --> src/transcribe.rs:370:4
    |
370 | fn first_repetition_end(tokens: &[usize], period: usize) -> usize {
    |    ^^^^^^^^^^^^^^^^^^^^

warning: function `repetition_period` is never used
   --> src/transcribe.rs:380:4
    |
380 | fn repetition_period(
    |    ^^^^^^^^^^^^^^^^^

warning: function `find_repeated_tokens_index` is never used
   --> src/transcribe.rs:404:4
    |
404 | fn find_repeated_tokens_index(
    |    ^^^^^^^^^^^^^^^^^^^^^^^^^^

warning: `whisper` (lib) generated 32 warnings (run `cargo fix --lib -p whisper` to apply 22 suggestions)
   Compiling whisper v0.1.0 (/Users/davidgortega/Documents/projects/kunzite/whisper-burn)
warning: unused import: `std::collections::HashMap`
 --> src/bin/transcribe/main.rs:1:5
  |
1 | use std::collections::HashMap;
  |     ^^^^^^^^^^^^^^^^^^^^^^^^^
  |
  = note: `#[warn(unused_imports)]` on by default

warning: unused import: `std::iter`
 --> src/bin/transcribe/main.rs:2:5
  |
2 | use std::iter;
  |     ^^^^^^^^^

warning: unused import: `whisper::helper::*`
 --> src/bin/transcribe/main.rs:4:5
  |
4 | use whisper::helper::*;
  |     ^^^^^^^^^^^^^^^^^^

warning: unused import: `token`
 --> src/bin/transcribe/main.rs:6:15
  |
6 | use whisper::{token, token::Language};
  |               ^^^^^

warning: unused imports: `Data`, `Float`, `Int`, `Tensor`, `self`, `self`
  --> src/bin/transcribe/main.rs:23:9
   |
23 |         self,
   |         ^^^^
24 |         backend::{self, Backend},
   |                   ^^^^
25 |         Data, Float, Int, Tensor,
   |         ^^^^  ^^^^^  ^^^  ^^^^^^

warning: unused import: `num_traits::ToPrimitive`
  --> src/bin/transcribe/main.rs:57:5
   |
57 | use num_traits::ToPrimitive;
   |     ^^^^^^^^^^^^^^^^^^^^^^^

warning: unused import: `whisper::audio::prep_audio`
  --> src/bin/transcribe/main.rs:58:5
   |
58 | use whisper::audio::prep_audio;
   |     ^^^^^^^^^^^^^^^^^^^^^^^^^^

warning: unused import: `SpecialToken`
  --> src/bin/transcribe/main.rs:59:37
   |
59 | use whisper::token::{Gpt2Tokenizer, SpecialToken};
   |                                     ^^^^^^^^^^^^

warning: unused variable: `duration`
  --> src/bin/transcribe/main.rs:35:9
   |
35 |     let duration = reader.duration() as usize;
   |         ^^^^^^^^ help: if this is intentional, prefix it with an underscore: `_duration`
   |
   = note: `#[warn(unused_variables)]` on by default

warning: unused variable: `bits_per_sample`
  --> src/bin/transcribe/main.rs:38:9
   |
38 |     let bits_per_sample = spec.bits_per_sample;
   |         ^^^^^^^^^^^^^^^ help: if this is intentional, prefix it with an underscore: `_bits_per_sample`

warning: variable does not need to be mutable
  --> src/bin/transcribe/main.rs:32:9
   |
32 |     let mut reader = hound::WavReader::open(filename)?;
   |         ----^^^^^^
   |         |
   |         help: remove this `mut`
   |
   = note: `#[warn(unused_mut)]` on by default

warning: unused variable: `tokens`
   --> src/bin/transcribe/main.rs:145:16
    |
145 |     let (text, tokens) = match waveform_to_text(&whisper, &bpe, lang, waveform, sample_rate) {
    |                ^^^^^^ help: if this is intentional, prefix it with an underscore: `_tokens`

warning: `whisper` (bin "transcribe") generated 12 warnings (run `cargo fix --bin "transcribe"` to apply 12 suggestions)
    Finished release [optimized] target(s) in 3.43s
warning: the following packages contain code that will be rejected by a future version of Rust: nom v1.2.4, nom v3.2.1
note: to see what the problems were, use the option `--future-incompat-report`, or run `cargo report future-incompatibilities --id 1`
     Running `target/release/transcribe tiny_en audio16k.wav en transcription.txt`
Loading waveform...
Loading model...
Depth: 0
Chunk 0: 

Transcription finished.

The transcription file is empty.

If I debug this line:

let (text, tokens) = match waveform_to_text(&whisper, &bpe, lang, waveform, sample_rate)

both text and tokens come back empty.
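
Roughly what I checked, as a sketch (this assumes waveform_to_text returns a Result over a (text, tokens) pair, which is what the match suggests):

    let (text, tokens) = waveform_to_text(&whisper, &bpe, lang, waveform, sample_rate)
        .expect("transcription failed"); // the call succeeds, no error is returned
    println!("text: {:?}", text);     // prints ""
    println!("tokens: {:?}", tokens); // prints []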

This is my file, generated by sox:

audio16k.wav.zip

DavidGOrtega commented 11 months ago

I tried another file and the depth is always 0:

Loading waveform...
Loading model...
Depth: 0
Chunk 0: 

Depth: 0
Chunk 1: 

Depth: 0
Chunk 2: 

Depth: 0
Chunk 3: 

Depth: 0
Chunk 4: 

Depth: 0
Depth: 1
Chunk 5: 

Depth: 0
Chunk 6: 

Depth: 0
Chunk 7: 

Depth: 0
Chunk 8: 

Depth: 0
Chunk 9: 

Depth: 0
Chunk 10: 

Depth: 0
Chunk 11: 

Depth: 0
Chunk 12: 

Depth: 0
Chunk 13: 

Depth: 0
Chunk 14: 

Depth: 0
Chunk 15: 

Transcription finished.
DavidGOrtega commented 11 months ago

Disclaimer: I changed the code to use the CPU since I do not have CUDA

 } else if #[cfg(feature = "torch-backend")] {
            type Backend = TchBackend<f32>;
            let device = TchDevice::Cuda(0);
        }

to

 } else if #[cfg(feature = "torch-backend")] {
            type Backend = TchBackend<f32>;
            let device = TchDevice::Cpu;
        }
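
For context, that snippet sits inside a cfg_if! block roughly like the following (the wgpu branch here is my guess at burn's wgpu types, not copied from the repo):

    cfg_if::cfg_if! {
        if #[cfg(feature = "wgpu-backend")] {
            // assumed names for burn's wgpu backend
            type Backend = WgpuBackend<AutoGraphicsApi, f32, i32>;
            let device = WgpuDevice::BestAvailable;
        } else if #[cfg(feature = "torch-backend")] {
            type Backend = TchBackend<f32>;
            let device = TchDevice::Cpu; // changed from TchDevice::Cuda(0)
        }
    }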

I also tried the wgpu backend with the same result:

cargo run --release --features wgpu-backend --bin transcribe tiny_en audio16k.wav en transcription.txt
DavidGOrtega commented 11 months ago

I have downloaded base_en, small_en and medium_en and they work; the issue only happens with tiny.

Gadersd commented 11 months ago

I'll check it out. It might be that the tokenizer I uploaded to my Hugging Face for tiny is incorrect.

Gadersd commented 11 months ago

I tested it and the issue is that the tiny models hallucinate very badly while the larger models are good enough to work correctly without as much hand-holding. I recommend using at least the small models until I can reduce the hallucinations of the tiny models.

DavidGOrtega commented 11 months ago

I tested it and the issue is that the tiny models hallucinate very badly while the larger models are good enough to work correctly

You mean your model conversion, right? The HF transformers model works fine with your audio; I have tested it.

DavidGOrtega commented 11 months ago

Interestingly enough, tiny works perfectly, as opposed to tiny_en:

     Running `target/release/transcribe tiny audio16k.wav en transcription.txt`
Loading waveform...
Loading model...
Depth: 0
Depth: 1
Depth: 2
Depth: 3
Depth: 4
Depth: 5
Depth: 6
Depth: 7
Depth: 8
Depth: 9
Depth: 10
Depth: 11
Depth: 12
Depth: 13
Depth: 14
Depth: 15
Depth: 16
Depth: 17
Depth: 18
Depth: 19
Depth: 20
Depth: 21
Depth: 22
Depth: 23
Depth: 24
Depth: 25
Depth: 26
Depth: 27
Depth: 28
Depth: 29
Depth: 30
Depth: 31
Depth: 32
Chunk 0:  Hello, I am the Whisper Machine Learning model. If you see this as text then I am working properly.

Transcription finished.
Gadersd commented 11 months ago

You mean your model conversion, right? The HF transformers model works fine with your audio; I have tested it.

The models should be equivalent. I think HF transformers uses a lot of heuristics to get the whisper models working consistently. Tiny en in particular seems to have a difficult time. In this case it is prematurely outputting an EOT (end of text) token before it gets to the words. The multilingual tiny seems to be a bit more robust.
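
As a rough illustration of why that yields an empty chunk (not the actual code in src/transcribe.rs), once EOT is the first sampled token the decoding loop ends with nothing collected:

    // Illustrative greedy loop: an EOT at depth 0 returns an empty token list.
    fn greedy_decode(mut next_token: impl FnMut(&[usize]) -> usize, eot: usize, max_len: usize) -> Vec<usize> {
        let mut tokens = Vec::new();
        while tokens.len() < max_len {
            let tok = next_token(&tokens);
            if tok == eot {
                break; // tiny_en hits this immediately, so the chunk stays empty
            }
            tokens.push(tok);
        }
        tokens
    }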

Gadersd commented 11 months ago

I just updated the project with yet another heuristic/hack: masking some special tokens. Tiny_en now works for me. Let me know if you encounter any issues.
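
Roughly, the heuristic pushes the log-probabilities of tokens that must not be sampled yet (for example EOT on the very first step) down to negative infinity before the beam step, along these lines (names are illustrative, not the exact code):

    // Sketch of special-token masking applied to a row of log-probs.
    fn mask_banned_tokens(log_probs: &mut [f32], banned: &[usize]) {
        for &tok in banned {
            if let Some(lp) = log_probs.get_mut(tok) {
                *lp = f32::NEG_INFINITY;
            }
        }
    }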

DavidGOrtega commented 11 months ago

It works 🥳 !! however...

it does not work with the wgpu backend:

cargo run --release --features wgpu-backend --bin transcribe tiny_en  audio16k.wav en transcription.txt

     Running `target/release/transcribe tiny_en audio16k.wav en transcription.txt`
Loading waveform...
Loading model...
Depth: 0
Chunk 0: 

Transcription finished.

why 🤔 ?

Gadersd commented 11 months ago

It works on WGPU for me. Are you sure that you updated to the latest version of this project? It should be impossible for the latest version to stop at a depth of 0. Also make sure that your tokenizer is the correct one for tiny_en. wget actually renames downloaded files when there is a naming conflict rather than replacing them.

DavidGOrtega commented 11 months ago

@Gadersd I updated the repo, but for some reason the build was still the same. I cleaned up the folder, regenerated everything, and both versions worked!