vasistalodagala / whisper-finetune

Fine-tune and evaluate Whisper models for Automatic Speech Recognition (ASR) on custom datasets or datasets from huggingface.

pytorch_model.bin not getting saved #13

Open 25Varun opened 8 months ago

25Varun commented 8 months ago

While fine-tuning the vasista22/whisper-kannada-small model on a customized dataset, after training all the other JSON and bin files are saved to the output directory, but pytorch_model.bin is not, and while saving it reports that some keys were missing [proj_out.weight]. I have no clue why. I run the whole thing on Google Colab (see the note after the code below).

import torch
import evaluate
from dataclasses import dataclass
from typing import Any, Dict, List, Union
from datasets import DatasetDict, Audio, load_from_disk, concatenate_datasets
from transformers.models.whisper.english_normalizer import BasicTextNormalizer
from transformers import (
    WhisperFeatureExtractor,
    WhisperTokenizer,
    WhisperProcessor,
    WhisperForConditionalGeneration,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)

model_name = 'vasista22/whisper-kannada-small'
language = 'Kannada'
sampling_rate = 16000
num_proc = 1
train_strategy = 'steps'
learning_rate = 1.75e-5 * 0.1
warmup = 20
train_batchsize = 16
eval_batchsize = 8

num_epochs = 20

num_steps = 50
resume_from_ckpt = None
output_dir = 'model_1'
train_datasets = ['/content/drive/MyDrive/Children/prepared_data']
eval_datasets = ['/content/drive/MyDrive/Children/prepared_data']

gradient_checkpointing = True
freeze_feature_encoder = False
freeze_encoder = False
do_normalize_eval = True
do_lower_case = False
do_remove_punctuation = False
normalizer = BasicTextNormalizer()

feature_extractor = WhisperFeatureExtractor.from_pretrained(model_name)
tokenizer = WhisperTokenizer.from_pretrained(model_name, language=language, task="transcribe")
processor = WhisperProcessor.from_pretrained(model_name, language=language, task="transcribe")
model = WhisperForConditionalGeneration.from_pretrained(model_name)

if model.config.decoder_start_token_id is None:
    raise ValueError("Make sure that config.decoder_start_token_id is correctly defined")

if freeze_feature_encoder:
    model.freeze_feature_encoder()

if freeze_encoder:
    model.freeze_encoder()
    model.model.encoder.gradient_checkpointing = False

model.config.forced_decoder_ids = None
model.config.suppress_tokens = []

if gradient_checkpointing:
    model.config.use_cache = False

def load_custom_dataset(split):
    ds = []
    if split == 'train':
        for dset in train_datasets:
            ds.append(load_from_disk(dset))
    if split == 'eval':
        for dset in eval_datasets:
            ds.append(load_from_disk(dset))

    ds_to_return = concatenate_datasets(ds)
    ds_to_return = ds_to_return.shuffle(seed=22)
    return ds_to_return

def prepare_dataset(batch):
    # load and (possibly) resample audio data to 16kHz
    audio = batch["audio"]

    # compute log-Mel input features from input audio array
    batch["input_features"] = processor.feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]
    # compute input length of audio sample in seconds
    batch["input_length"] = len(audio["array"]) / audio["sampling_rate"]

    # optional pre-processing steps
    transcription = batch["sentence"]
    if do_lower_case:
        transcription = transcription.lower()
    if do_remove_punctuation:
        transcription = normalizer(transcription).strip()

    # encode target text to label ids
    batch["labels"] = processor.tokenizer(transcription).input_ids
    return batch

max_label_length = model.config.max_length
min_input_length = 0.0
max_input_length = 30.0

def is_in_length_range(length, labels):
    return min_input_length < length < max_input_length and 0 < len(labels) < max_label_length

print('DATASET PREPARATION IN PROGRESS...')
raw_dataset = DatasetDict()
raw_dataset["train"] = load_custom_dataset('train')
raw_dataset["eval"] = load_custom_dataset('eval')

raw_dataset = raw_dataset.cast_column("audio", Audio(sampling_rate=sampling_rate))
raw_dataset = raw_dataset.map(prepare_dataset, num_proc=num_proc)

raw_dataset = raw_dataset.filter(
    is_in_length_range,
    input_columns=["input_length", "labels"],
    num_proc=num_proc,
)

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need different padding methods
        # first treat the audio inputs by simply returning torch tensors
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # get the tokenized label sequences
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        # pad the labels to max length
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # if bos token is appended in previous tokenization step,
        # cut bos token here as it's appended later anyways
        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch

data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)
print('DATASET PREPARATION COMPLETED')

metric = evaluate.load("wer") def compute_metrics(pred): pred_ids = pred.predictions label_ids = pred.label_ids

# replace -100 with the pad_token_id
label_ids[label_ids == -100] = processor.tokenizer.pad_token_id

# we do not want to group tokens when computing the metrics
pred_str = processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
label_str = processor.tokenizer.batch_decode(label_ids, skip_special_tokens=True)

if do_normalize_eval:
    pred_str = [normalizer(pred) for pred in pred_str]
    label_str = [normalizer(label) for label in label_str]

wer = 100 * metric.compute(predictions=pred_str, references=label_str)
return {"wer": wer}

############################### TRAINING ARGS AND TRAINING ############################

if train_strategy == 'epoch':
    training_args = Seq2SeqTrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=train_batchsize,
        gradient_accumulation_steps=1,
        learning_rate=learning_rate,
        warmup_steps=warmup,
        gradient_checkpointing=gradient_checkpointing,
        fp16=True,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        num_train_epochs=num_epochs,
        save_total_limit=10,
        per_device_eval_batch_size=eval_batchsize,
        predict_with_generate=True,
        generation_max_length=225,
        logging_steps=500,
        report_to=["tensorboard"],
        load_best_model_at_end=True,
        metric_for_best_model="wer",
        greater_is_better=False,
        optim="adamw_bnb_8bit",
        resume_from_checkpoint=resume_from_ckpt,
    )

elif train_strategy == 'steps':
    training_args = Seq2SeqTrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=train_batchsize,
        gradient_accumulation_steps=1,
        learning_rate=learning_rate,
        warmup_steps=warmup,
        gradient_checkpointing=gradient_checkpointing,
        fp16=True,
        evaluation_strategy="steps",
        eval_steps=50,
        save_strategy="steps",
        save_steps=50,
        max_steps=num_steps,
        save_total_limit=10,
        per_device_eval_batch_size=eval_batchsize,
        predict_with_generate=True,
        generation_max_length=225,
        logging_steps=500,
        report_to=["tensorboard"],
        load_best_model_at_end=True,
        metric_for_best_model="wer",
        greater_is_better=False,
        optim="adamw_bnb_8bit",
        resume_from_checkpoint=resume_from_ckpt,
    )

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=raw_dataset["train"],
    eval_dataset=raw_dataset["eval"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)

processor.save_pretrained(output_dir)
model.save_pretrained(output_dir)
print('TRAINING IN PROGRESS...')
trainer.train()
print('DONE TRAINING')

This was the code; please help.
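A note on the missing file: recent versions of transformers save weights as model.safetensors by default, so pytorch_model.bin may simply never be produced, and the "missing keys [proj_out.weight]" message is usually harmless, since Whisper ties proj_out to the decoder embedding and skips the duplicate tensor when saving. A minimal sketch of how to check this (the path mirrors the script above; the safe_serialization flag assumes a reasonably recent transformers version):

import os
from transformers import WhisperForConditionalGeneration

output_dir = 'model_1'  # same output_dir as in the script above

# List what was actually written; newer transformers write model.safetensors
# instead of pytorch_model.bin unless told otherwise.
print(os.listdir(output_dir))

# Reloading works from either format; proj_out.weight is re-tied to the
# decoder embedding automatically, so the "missing key" warning is expected.
reloaded = WhisperForConditionalGeneration.from_pretrained(output_dir)

# If a pytorch_model.bin file is specifically required, force the old format
# (assumption: the installed transformers version supports safe_serialization).
reloaded.save_pretrained(output_dir, safe_serialization=False)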

charan-455 commented 5 months ago

I am also facing the same issue: pytorch_model.bin is not being saved. Please help.

DMUGZT commented 5 months ago

I have also run into this problem, and there is a possible solution: make sure warmup_steps is lower than your num_epochs value. It worked for my model; since I changed the warmup steps, the problem has not come back :)
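For reference, a minimal sketch of what that change looks like with the 'steps' strategy from the script above (the numbers are only illustrative, not a recommended configuration):

from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir='model_1',
    max_steps=200,        # total optimisation steps
    warmup_steps=20,      # kept well below max_steps (or below the epoch count, per the suggestion above)
    per_device_train_batch_size=16,
    learning_rate=1.75e-6,
    fp16=True,
)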

benchrus commented 4 months ago

Sorry, I have the same issue. I followed your advice and set the warmup steps lower than num_epochs, but that does not fix it. Please help me.