voidful / SpeechMix

Explore different ways to mix speech models (wav2vec2, HuBERT) and NLP models (BART, T5, GPT) together

Inference error #6

Closed egorsmkv closed 2 years ago

egorsmkv commented 2 years ago

I successfully trained a checkpoint, but I am getting an error during inference.

My code to do inference:

import torch
import re
from datasets import Audio
from datasets import load_dataset, load_metric
from speechmix import SpeechMixEED

eval_dataset = "mozilla-foundation/common_voice_9_0"
field = 'uk'
split_val = "test[:10]"
device = "cuda"

chars_to_ignore_regex = '— , ? . ! - \; \: \" “ % ‘ ” �'
chars_to_ignore_regex = (
    f'[{"".join(chars_to_ignore_regex)}]' if chars_to_ignore_regex is not None else None
)

seed = SpeechMixEED('wav2vec2', 'google/mt5-small')
seed.to(device)
seed.load_state_dict(torch.load("./wav2vec2_google/mt5-small_SpeechMixEED_base/checkpoint-11900/pytorch_model.bin"))
seed.eval()

ds = load_dataset(eval_dataset, field, split=split_val)
ds = ds.cast_column("audio", Audio(sampling_rate=16_000))

def map_to_array(batch):
    audio = batch["audio"]

    batch["speech"] = audio["array"]
    batch["sampling_rate"] = audio["sampling_rate"]
    batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower().replace("’", "'")

    return batch

ds = ds.map(map_to_array)

def map_to_pred(batch):
    outputs = seed.generate(torch.tensor([batch["speech"]], device=seed.device), max_length=250)
    decoded = seed.tokenizer.decode(outputs[0], skip_special_tokens=True)

    # label = None
    # while True:
    #     result = seed([torch.tensor(batch["speech"], device=seed.device)],decoder_input_ids=label)
    #     label = torch.tensor([[0]+result['logits'].tolist()[0]],device=seed.device)
    #     if label.tolist()[-1][-1] == seed.tokenizer.eos_token_id:
    #         break

    batch["predicted"] = decoded
    batch["target"] = batch["sentence"]

    return batch

result = ds.map(map_to_pred, batched=True, batch_size=3, remove_columns=list(ds.features.keys()))

wer = load_metric("wer.py")
cer = load_metric("cer.py")

predictions = [x.upper() for x in result["predicted"]]
references = [x.upper() for x in result["target"]]

print(f"WER: {wer.compute(predictions=predictions, references=references)}")
print(f"CER: {cer.compute(predictions=predictions, references=references)}")

Error after running the code:

(base) yehor@desktop:~/Work/YehorSmoliakov/wav2vec2-with-gpt/SpeechMix$ python recognize_mt5.py 
################################################################################
### WARNING, path does not exist: KALDI_ROOT=/mnt/matylda5/iveselyk/Tools/kaldi-trunk
###          (please add 'export KALDI_ROOT=<your_path>' in your $HOME/.profile)
###          (or run as: KALDI_ROOT=<your_path> python <your_script>.py)
################################################################################

[s3prl.downstream.experts] Warning: can not import s3prl.downstream.a2a-vc-vctk.expert: No module named 'resemblyzer'. Pass.
Using cache found in /home/yehor/.cache/torch/hub/s3prl_cache/3a990c945fbe378df95598eec534e91ba22a5d9eab0b2f88777a7a696d1344e9
for https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_small.pt
Before layer sharing num_speech_encoder_layers 12
After layer sharing  num_speech_encoder_layers 12 num_nlp_encoder_layers 8 share_layer_ratio 0 remove_layers 0
Traceback (most recent call last):
  File "recognize_mt5.py", line 20, in <module>
    seed.load_state_dict(torch.load("./wav2vec2_google/mt5-small_SpeechMixEED_base/checkpoint-11900/pytorch_model.bin"))
  File "/home/yehor/Tools/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1498, in load_state_dict
    self.__class__.__name__, "\n\t".join(error_msgs)))
RuntimeError: Error(s) in loading state_dict for SpeechMixEED:
    Missing key(s) in state_dict: "length_adapters.1.weight", "length_adapters.1.bias", "length_adapters.2.weight", "length_adapters.2.bias". 

So my question is: how do I fix the error Missing key(s) in state_dict: "length_adapters.1.weight", "length_adapters.1.bias", "length_adapters.2.weight", "length_adapters.2.bias"?

voidful commented 2 years ago

You can take a look at this script: https://github.com/voidful/SpeechMix/blob/main/eval.ipynb

It seems the architecture is different when you load your model back.
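
For example, a minimal sketch of loading a checkpoint back with matching constructor arguments; the down_scale=2 here is only an illustrative value, use whatever you passed during training:

import torch
from speechmix import SpeechMixEED

# The constructor arguments must reproduce the training-time architecture,
# otherwise load_state_dict() reports missing or unexpected keys.
seed = SpeechMixEED('wav2vec2', 'google/mt5-small', down_scale=2)
seed.load_state_dict(torch.load("path/to/pytorch_model.bin"))
seed.eval()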

egorsmkv commented 2 years ago

You can take a look at this script: https://github.com/voidful/SpeechMix/blob/main/eval.ipynb

It seems the architecture is different when you load your model back.

I trained the checkpoint with down_scale=2 via train.py. When I set it as seed = SpeechMixEED('wav2vec2', 'google/mt5-small', down_scale=2), the model loads correctly, but then I get an error with tensor sizes:

File "recognize_mt5.py", line 43, in map_to_pred
    result = seed([torch.tensor(batch["speech"], device=seed.device)], decoder_input_ids=label)
ValueError: expected sequence of length 127872 at dim 1 (got 77952)
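
This ValueError typically comes from calling torch.tensor on a ragged list: with batched=True, batch["speech"] holds several clips of different lengths, which cannot be stacked into one tensor directly. A minimal sketch of zero-padding such a batch first (the helper name and the padding strategy are assumptions, not part of SpeechMix):

import torch

def to_padded_batch(speech_list, device):
    # torch.tensor() on a list of different-length arrays raises
    # "ValueError: expected sequence of length ... (got ...)".
    # Pad every clip with zeros to the length of the longest one instead.
    clips = [torch.as_tensor(s, dtype=torch.float32) for s in speech_list]
    return torch.nn.utils.rnn.pad_sequence(clips, batch_first=True).to(device)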

My full code:

import torch
import re
from datasets import Audio
from datasets import load_dataset, load_metric
from speechmix import SpeechMixEED

eval_dataset = "mozilla-foundation/common_voice_9_0"
field = 'uk'
split_val = "test[:10]"
device = "cuda"

chars_to_ignore_regex = '— , ? . ! - \; \: \" “ % ‘ ” �'
chars_to_ignore_regex = (
    f'[{"".join(chars_to_ignore_regex)}]' if chars_to_ignore_regex is not None else None
)

seed = SpeechMixEED('wav2vec2', 'google/mt5-small', down_scale=2)
seed.to(device)
seed.load_state_dict(torch.load("./wav2vec2_google/mt5-small_SpeechMixEED_base/checkpoint-14000/pytorch_model.bin"))
seed.eval()

ds = load_dataset(eval_dataset, field, split=split_val)
ds = ds.cast_column("audio", Audio(sampling_rate=16_000))

def map_to_array(batch):
    audio = batch["audio"]

    batch["speech"] = audio["array"]
    batch["sampling_rate"] = audio["sampling_rate"]
    batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower().replace("’", "'")

    return batch

ds = ds.map(map_to_array)

def map_to_pred(batch):
    label = None
    while True:
        result = seed([torch.tensor(batch["speech"], device=seed.device)], decoder_input_ids=label)
        label = torch.tensor([[0] + result['logits'].tolist()[0]], device=seed.device)
        if label.tolist()[-1][-1] == seed.tokenizer.eos_token_id:
            break

    result = seed.tokenizer.batch_decode(label)

    batch["predicted"] = result[0]
    batch["target"] = batch["sentence"]

    return batch

result = ds.map(map_to_pred, batched=True, batch_size=3, remove_columns=list(ds.features.keys()))

wer = load_metric("wer.py")
cer = load_metric("cer.py")

predictions = [x.upper() for x in result["predicted"]]
references = [x.upper() for x in result["target"]]

print(f"WER: {wer.compute(predictions=predictions, references=references)}")
print(f"CER: {cer.compute(predictions=predictions, references=references)}")

egorsmkv commented 2 years ago

Also, I see in that notebook that you download this model: https://huggingface.co/voidful/speechmix_eed_fixed/resolve/main/pytorch_model.bin. The URL contains "speechmix_eed_fixed"; could you please explain why this one is SpeechMixEED Fixed?

Did you train a model using the --SpeechMixFixed argument?

egorsmkv commented 2 years ago

I have solved the problem.

Reproducing the inference without the map_to_pred function works. It seems the problem is in using the ds.map function.
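
For reference, a minimal sketch of the working approach as a plain loop over the dataset, assuming seed, ds, wer, and cer are set up as in the snippets above:

# Iterate over the dataset directly instead of going through ds.map,
# decoding one clip at a time so no ragged batches are built.
predictions, references = [], []
for sample in ds:
    speech = torch.tensor([sample["speech"]], device=seed.device)
    outputs = seed.generate(speech, max_length=250)
    predictions.append(seed.tokenizer.decode(outputs[0], skip_special_tokens=True).upper())
    references.append(sample["sentence"].upper())

print(f"WER: {wer.compute(predictions=predictions, references=references)}")
print(f"CER: {cer.compute(predictions=predictions, references=references)}")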