Open · pskrunner14 opened this issue 4 years ago
I'm running into the same error here. My config: fairseq 1.0, GPU 2080 Ti, CUDA 10.2, Python 3.7, PyTorch 1.7, Ubuntu 18.04; fairseq installed from source, not using a Docker image. I'm using the same pretrained transformer and wav2vec 2.0 models as @pskrunner14.
File "fairseq/examples/speech_recognition/infer.py", line 471, in <module>
cli_main()
File "fairseq/examples/speech_recognition/infer.py", line 467, in cli_main
main(args)
File "fairseq/examples/speech_recognition/infer.py", line 327, in main
generator = build_generator(args)
File "fairseq/examples/speech_recognition/infer.py", line 320, in build_generator
return W2lFairseqLMDecoder(args, task.target_dictionary)
File "/root/github/wav2vecasr/fairseq/examples/speech_recognition/w2l_decoder.py", line 354, in __init__
model = task.build_model(lm_args)
File "/root/github/wav2vecasr/fairseq/fairseq/tasks/language_modeling.py", line 178, in build_model
model = super().build_model(args)
File "/root/github/wav2vecasr/fairseq/fairseq/tasks/fairseq_task.py", line 548, in build_model
model = models.build_model(args, self)
File "/root/github/wav2vecasr/fairseq/fairseq/models/__init__.py", line 84, in build_model
return model.build_model(cfg, task)
File "/root/github/wav2vecasr/fairseq/fairseq/models/transformer_lm.py", line 224, in build_model
args.quant_noise_pq_block_size,
File "/root/github/wav2vecasr/fairseq/fairseq/modules/adaptive_input.py", line 33, in __init__
), "cannot specify cutoff larger than vocab size"
Has there been any update on this issue? I'm hitting the assertion both in adaptive_input.py (https://github.com/pytorch/fairseq/blob/4e3895be1ccb59e36de85441cd049294cbad2d15/fairseq/modules/adaptive_input.py#L33) and in adaptive_softmax.py (https://github.com/pytorch/fairseq/blob/4e3895be1ccb59e36de85441cd049294cbad2d15/fairseq/modules/adaptive_softmax.py#L84). Commenting both assertions out then fails later when loading the checkpoint, because the checkpoint's adaptive softmax has different dimensions from the model the code builds (a sketch of why follows the traceback below):
Traceback (most recent call last):
File "external_lib/fairseq/examples/speech_recognition/infer.py", line 431, in <module>
cli_main()
File "external_lib/fairseq/examples/speech_recognition/infer.py", line 427, in cli_main
main(args)
File "external_lib/fairseq/examples/speech_recognition/infer.py", line 287, in main
generator = build_generator(args)
File "external_lib/fairseq/examples/speech_recognition/infer.py", line 278, in build_generator
return W2lFairseqLMDecoder(args, task.target_dictionary)
File "/home/prad/github/attempt/external_lib/fairseq/examples/speech_recognition/w2l_decoder.py", line 364, in __init__
model.load_state_dict(checkpoint["model"], strict=False)
File "/home/prad/github/attempt/external_lib/fairseq/fairseq/models/fairseq_model.py", line 115, in load_state_dict
return super().load_state_dict(new_state_dict, strict)
File "/home/prad/anaconda3/envs/asr/lib/python3.6/site-packages/torch/nn/modules/module.py", line 1052, in load_state_dict
self.__class__.__name__, "\n\t".join(error_msgs)))
RuntimeError: Error(s) in loading state_dict for TransformerLanguageModel:
size mismatch for decoder.adaptive_softmax.head.class_proj.weight: copying a param with shape torch.Size([2, 1280]) from checkpoint, the shape in current model is torch.Size([1, 1280]).
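For anyone trying to follow what is going on: below is a minimal sketch of the cutoff handling in adaptive_input.py / adaptive_softmax.py as I read it (my own simplification, not fairseq's verbatim code). It shows why the assertion fires when the LM is rebuilt against a 32-symbol letter dictionary, and why commenting the assertion out just moves the failure to the state_dict load above.

```python
# Minimal sketch (assumption: simplified from fairseq's adaptive_input.py /
# adaptive_softmax.py, not the verbatim code) of how the cutoff list interacts
# with the vocabulary size of the dictionary the LM is rebuilt with.

def resolve_cutoff(vocab_size: int, cutoff_str: str, skip_assert: bool = False):
    cutoff = [int(c) for c in cutoff_str.split(",")]
    if vocab_size > cutoff[-1]:
        # Normal case: the real (word-level) vocab is appended as the final
        # bucket, so '60000,160000' gives two tail clusters and the checkpoint
        # stores a [2, 1280] class_proj for the adaptive softmax head.
        cutoff = cutoff + [vocab_size]
    elif not skip_assert:
        # This is the branch that raises with the 32-symbol letter dictionary.
        assert vocab_size == cutoff[-1], "cannot specify cutoff larger than vocab size"
    # If the assertion is simply commented out, the vocab is never appended,
    # the rebuilt head has one fewer cluster, and loading the checkpoint then
    # fails with the [1, 1280] vs [2, 1280] size mismatch shown above.
    return cutoff

word_vocab_size = 200000  # hypothetical size of the LM's word-level dictionary
print(resolve_cutoff(word_vocab_size, "60000,160000"))       # [60000, 160000, 200000]
print(resolve_cutoff(32, "60000,160000", skip_assert=True))  # [60000, 160000] -> shape mismatch
```

So it looks like the assertion is a symptom of the LM being rebuilt against the wrong dictionary; patching it out can't work, because the checkpoint was trained with a word-level vocabulary.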
🐛 Bug
Getting an assertion error ("cannot specify cutoff larger than vocab size") when trying to run Wav2Vec 2.0 CTC inference with a Transformer LM. What do I need to change to get this working?
To Reproduce
Steps to reproduce the behavior:
Run cmd:
See error:
INFO:__main__:Namespace(all_gather_list_size=16384, beam=5, beam_size_token=100, beam_threshold=25.0, bf16=False, bpe=None, broadcast_buffers=False, bucket_cap_mb=25, checkpoint_suffix='', constraints=None, cpu=False, criterion='ctc', data='data/manifest/', data_buffer_size=10, dataset_impl=None, ddp_backend='c10d', decoding_format=None, device_id=0, distributed_backend='nccl', distributed_init_method=None, distributed_no_spawn=False, distributed_num_procs=None, distributed_port=-1, distributed_rank=0, distributed_world_size=1, distributed_wrapper='DDP', diverse_beam_groups=-1, diverse_beam_strength=0.5, diversity_rate=-1.0, dump_emissions=None, dump_features=None, empty_cache_freq=0, enable_padding=False, fast_stat_sync=False, find_unused_parameters=False, fix_batches_to_gpus=False, force_anneal=None, fp16=False, fp16_init_scale=128, fp16_no_flatten_grads=False, fp16_scale_tolerance=0.0, fp16_scale_window=None, gen_subset='test-other', iter_decode_eos_penalty=0.0, iter_decode_force_max_iter=False, iter_decode_max_iter=10, iter_decode_with_beam=1, iter_decode_with_external_reranker=False, kenlm_model='models/lm_librispeech_word_transformer.pt', kspmodel=None, labels='ltr', lenpen=1, lexicon='models/librispeech_lexicon.lst', lm_weight=2.0, load_emissions=None, localsgd_frequency=3, log_format=None, log_interval=100, lr_scheduler='fixed', lr_shrink=0.1, match_source_len=False, max_len_a=0, max_len_b=200, max_sample_size=None, max_sentences=None, max_tokens=4000000, memory_efficient_bf16=False, memory_efficient_fp16=False, min_len=1, min_loss_scale=0.0001, min_sample_size=None, model_overrides='{}', model_parallel_size=1, nbest=1, no_beamable_mm=False, no_early_stop=False, no_progress_bar=False, no_repeat_ngram_size=0, no_seed_provided=True, normalize=False, nprocs_per_node=8, num_shards=1, num_workers=1, optimizer=None, path='models/wav2vec_big_960h.pt', pipeline_balance=None, pipeline_checkpoint='never', pipeline_chunks=None, pipeline_devices=None, pipeline_model_parallel=False, prefix_size=0, print_alignment=False, print_step=False, profile=False, quantization_config_path=None, quiet=False, remove_bpe='letter', replace_unk=None, required_batch_size_multiple=8, required_seq_len_multiple=1, results_path='results/', retain_dropout=False, retain_dropout_modules=None, retain_iter_history=False, rnnt_decoding_type='greedy', rnnt_len_penalty=-0.5, sacrebleu=False, sample_rate=16000, sampling=False, sampling_topk=-1, sampling_topp=-1.0, score_reference=False, scoring='bleu', seed=1, shard_id=0, sil_weight=0.0, skip_invalid_size_inputs_valid_test=False, slowmo_algorithm='LocalSGD', slowmo_momentum=None, task='audio_pretraining', temperature=1.0, tensorboard_logdir='', threshold_loss_scale=None, tokenizer=None, tpu=False, unit_lm=False, unk_weight=-inf, unkpen=0, unnormalized=False, user_dir=None, w2l_decoder='fairseqlm', warmup_updates=0, wer_args=None, wfstlm=None, word_score=-1.0, zero_infinity=False, zero_sharding='none')
INFO:fairseq.data.audio.raw_audio_dataset:loaded 2939, skipped 0 samples
INFO:__main__:| data/manifest/ test-other 2939 examples
INFO:__main__:| decoding with criterion ctc
INFO:__main__:| loading model(s) from models/wav2vec_big_960h.pt
Namespace(activation_dropout=0.1, activation_fn='relu', adaptive_input=True, adaptive_input_cutoff='60000,160000', adaptive_input_factor=4.0, adaptive_softmax_cutoff='60000,160000', adaptive_softmax_dropout=0, adaptive_softmax_factor=4.0, add_bos_token=False, arch='transformer_lm_gbw', attention_dropout=0.1, best_checkpoint_metric='loss', bpe=None, bucket_cap_mb=25, char_embedder_highway_layers=2, character_embedding_dim=4, character_embeddings=False, character_filters='[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]', clip_norm=0.1, cpu=False, criterion='adaptive_loss', curriculum=0, data='models', dataset_impl=None, ddp_backend='no_c10d', decoder_attention_heads=16, decoder_embed_dim=1280, decoder_ffn_embed_dim=6144, decoder_input_dim=1280, decoder_layers=20, decoder_learned_pos=False, decoder_normalize_before=True, decoder_output_dim=1280, device_id=0, disable_validation=False, distributed_backend='nccl', distributed_init_method='tcp://learnfair1288:10075', distributed_no_spawn=False, distributed_port=10075, distributed_rank=0, distributed_world_size=128, dropout=0.1, empty_cache_freq=0, fast_stat_sync=False, find_unused_parameters=False, fix_batches_to_gpus=False, fixed_validation_seed=None, fp16=True, fp16_init_scale=128, fp16_scale_tolerance=0.0, fp16_scale_window=None, future_target=False, keep_interval_updates=1, keep_last_epochs=-1, lazy_load=False, log_format='json', log_interval=1000, lr=[0.05], lr_scheduler='inverse_sqrt', max_epoch=100, max_sentences=None, max_sentences_valid=None, max_target_positions=256, max_tokens=2048, max_tokens_valid=2048, max_update=0, maximize_best_checkpoint_metric=False, memory_efficient_fp16=False, min_loss_scale=0.0001, min_lr=-1, momentum=0.99, no_decoder_final_norm=True, no_epoch_checkpoints=True, no_last_checkpoints=False, no_progress_bar=False, no_save=False, no_save_optimizer_state=False, no_token_positional_embeddings=False, num_workers=1, optimizer='nag', optimizer_overrides='{}', output_dictionary_size=-1, past_target=False, raw_text=False, required_batch_size_multiple=8, reset_dataloader=False, reset_lr_scheduler=False, reset_meters=False, reset_optimizer=False, restore_file='checkpoint_last.pt', sample_break_mode='eos', save_dir='/checkpoint/antares/2019-11-02/v4_reduced_lr_trlmgb.invsqrt.wrm16000.int1e-07.nag.lr0.05.clp0.1.lyr20.hd16.drp0.1.adp=60000_160000.ad_inp.ad_f4.ad_sf4.inp=60000_160000.tie.ffn6144.at_d0.1.rl_d0.1.i1280.m1280.o1280.mxtk2048.tps256.seed1.bm=eos.ngpu128', save_interval=1, save_interval_updates=40000, seed=1, self_target=False, sentence_avg=False, share_decoder_input_output_embed=False, skip_invalid_size_inputs_valid_test=True, task='language_modeling', tbmf_wrapper=False, tensorboard_logdir='/checkpoint/antares/tensorboard_logs/2019-11-02/v4_reduced_lr_trlmgb.invsqrt.wrm16000.int1e-07.nag.lr0.05.clp0.1.lyr20.hd16.drp0.1.adp=60000_160000.ad_inp.ad_f4.ad_sf4.inp=60000_160000.tie.ffn6144.at_d0.1.rl_d0.1.i1280.m1280.o1280.mxtk2048.tps256.seed1.bm=eos.ngpu128', threshold_loss_scale=None, tie_adaptive_proj=False, tie_adaptive_weights=True, tokenizer=None, tokens_per_sample=256, train_subset='train', update_freq=[1], use_bmuf=False, user_dir=None, valid_subset='valid', validate_interval=1, warmup_init_lr=1e-07, warmup_updates=16000, weight_decay=0.0)
INFO:fairseq.tasks.language_modeling:dictionary: 32 types
Traceback (most recent call last):
File "examples/speech_recognition/infer.py", line 429, in <module>
cli_main()
File "examples/speech_recognition/infer.py", line 425, in cli_main
main(args)
File "examples/speech_recognition/infer.py", line 301, in main
generator = build_generator(args)
File "examples/speech_recognition/infer.py", line 297, in build_generator
return W2lFairseqLMDecoder(args, task.target_dictionary)
File "/root/fairseq/examples/speech_recognition/w2l_decoder.py", line 353, in init
model = task.build_model(lm_args)
File "/root/fairseq/fairseq/tasks/language_modeling.py", line 187, in build_model
model = super().build_model(args)
File "/root/fairseq/fairseq/tasks/fairseq_task.py", line 268, in build_model
model = models.build_model(args, self)
File "/root/fairseq/fairseq/models/init.py", line 50, in build_model
return ARCH_MODEL_REGISTRY[args.arch].build_model(args, task)
File "/root/fairseq/fairseq/models/transformer_lm.py", line 301, in build_model
args.quant_noise_pq_block_size,
File "/root/fairseq/fairseq/modules/adaptive_input.py", line 33, in init
-1], 'cannot specify cutoff larger than vocab size'
AssertionError: cannot specify cutoff larger than vocab size
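In case it helps to pin this down, here is a rough diagnostic sketch for checking what the LM checkpoint was trained with versus which dictionary fairseq rebuilds it against. The paths and the assumption that the checkpoint stores its hyperparameters under "args" (newer checkpoints use "cfg") are mine, so adjust for your setup:

```python
# Rough diagnostic sketch, not a fix: compare the adaptive cutoffs stored in the
# LM checkpoint with the dictionary the language_modeling task actually loads.
# Paths are placeholders taken from the Namespace above; older checkpoints keep
# a Namespace under "args", newer ones a config under "cfg".
import os
import torch
from fairseq.data import Dictionary

lm_checkpoint = "models/lm_librispeech_word_transformer.pt"  # the kenlm_model path above
ckpt = torch.load(lm_checkpoint, map_location="cpu")
print("checkpoint keys:", list(ckpt.keys()))

lm_args = ckpt.get("args")
if lm_args is not None:
    print("adaptive_input_cutoff:", getattr(lm_args, "adaptive_input_cutoff", None))
    print("adaptive_softmax_cutoff:", getattr(lm_args, "adaptive_softmax_cutoff", None))

# Shapes of the adaptive softmax head actually stored in the checkpoint:
for name, tensor in ckpt["model"].items():
    if "adaptive_softmax.head" in name:
        print(name, tuple(tensor.shape))

# Dictionary that will be used when the LM is rebuilt. Assumption: the task
# reads dict.txt from the directory next to the LM checkpoint; the log above
# reports only 32 types there, while the cutoffs expect a word-level vocabulary.
dict_path = os.path.join(os.path.dirname(lm_checkpoint), "dict.txt")
if os.path.exists(dict_path):
    print("dict size:", len(Dictionary.load(dict_path)))
```

If the dictionary that turns up is the letter-level one (32 types) while the cutoffs are 60000/160000, that mismatch would explain both the assertion and, with the assertion removed, the class_proj size mismatch reported earlier in the thread.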