AMOHYZ opened 3 years ago
This means some of your files are too short. You can set task.min_sample_size to something reasonable (e.g. 32000 = 2 s at a 16 kHz sample rate) to filter out examples that are too short, but then you also need to change the code to skip those examples when loading labels (since you get this error during finetuning), or change the code to pad those examples instead of skipping them.
I'll look into adding some guard rails.
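For illustration, a minimal padding sketch (not fairseq's actual dataset code; the helper name and the zero-padding policy are my own, and 32000 samples = 2 s at 16 kHz):

# A minimal sketch (not fairseq's dataset code) of padding raw waveforms that
# are shorter than a minimum length instead of skipping them.
import torch
import torch.nn.functional as F

def pad_to_min_size(wav: torch.Tensor, min_sample_size: int = 32000) -> torch.Tensor:
    """Right-pad a 1-D waveform with zeros so it has at least min_sample_size samples."""
    if wav.numel() < min_sample_size:
        wav = F.pad(wav, (0, min_sample_size - wav.numel()))
    return wav

print(pad_to_min_size(torch.zeros(500)).shape)  # torch.Size([32000])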
Hi, I'm having the same issue when training with hydra, but it works correctly with command-line arguments.
For this trial I'm using a training set with a single 30-second audio file repeated many times.
2020-12-03 20:31:50 | INFO | fairseq.trainer | begin training epoch 1
Traceback (most recent call last):
File "<string>", line 1, in <module>
File "/usr/lib/python3.8/multiprocessing/spawn.py", line 116, in spawn_main
exitcode = _main(fd, parent_sentinel)
File "/usr/lib/python3.8/multiprocessing/spawn.py", line 126, in _main
self = reduction.pickle.load(from_parent)
_pickle.UnpicklingError: pickle data was truncated
Traceback (most recent call last):
File "/home/aalvarez/Projects/fairseq/fairseq_cli/hydra_train.py", line 70, in <module>
cli_main()
File "/home/aalvarez/Projects/fairseq/fairseq_cli/hydra_train.py", line 66, in cli_main
hydra_main()
File "/home/aalvarez/.virtualenvs/wav2vec_training/lib/python3.8/site-packages/hydra/main.py", line 32, in decorated_main
_run_hydra(
File "/home/aalvarez/.virtualenvs/wav2vec_training/lib/python3.8/site-packages/hydra/_internal/utils.py", line 346, in _run_hydra
run_and_report(
File "/home/aalvarez/.virtualenvs/wav2vec_training/lib/python3.8/site-packages/hydra/_internal/utils.py", line 201, in run_and_report
raise ex
File "/home/aalvarez/.virtualenvs/wav2vec_training/lib/python3.8/site-packages/hydra/_internal/utils.py", line 198, in run_and_report
return func()
File "/home/aalvarez/.virtualenvs/wav2vec_training/lib/python3.8/site-packages/hydra/_internal/utils.py", line 347, in <lambda>
lambda: hydra.run(
File "/home/aalvarez/.virtualenvs/wav2vec_training/lib/python3.8/site-packages/hydra/_internal/hydra.py", line 107, in run
return run_job(
File "/home/aalvarez/.virtualenvs/wav2vec_training/lib/python3.8/site-packages/hydra/core/utils.py", line 125, in run_job
ret.return_value = task_function(task_cfg)
File "/home/aalvarez/Projects/fairseq/fairseq_cli/hydra_train.py", line 38, in hydra_main
distributed_utils.call_main(cfg, pre_main)
File "/home/aalvarez/Projects/fairseq/fairseq/distributed_utils.py", line 313, in call_main
torch.multiprocessing.spawn(
File "/home/aalvarez/.virtualenvs/wav2vec_training/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 199, in spawn
return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
File "/home/aalvarez/.virtualenvs/wav2vec_training/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 157, in start_processes
while not context.join():
File "/home/aalvarez/.virtualenvs/wav2vec_training/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 118, in join
raise Exception(msg)
Exception:
-- Process 0 terminated with the following error:
Traceback (most recent call last):
File "/home/aalvarez/.virtualenvs/wav2vec_training/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 19, in _wrap
fn(i, *args)
File "/home/aalvarez/Projects/fairseq/fairseq/distributed_utils.py", line 300, in distributed_main
main(cfg, **kwargs)
File "/home/aalvarez/Projects/fairseq/fairseq_cli/train.py", line 130, in main
valid_losses, should_stop = train(cfg, trainer, task, epoch_itr)
File "/usr/lib/python3.8/contextlib.py", line 75, in inner
return func(*args, **kwds)
File "/home/aalvarez/Projects/fairseq/fairseq_cli/train.py", line 219, in train
log_output = trainer.train_step(samples)
File "/usr/lib/python3.8/contextlib.py", line 75, in inner
return func(*args, **kwds)
File "/home/aalvarez/Projects/fairseq/fairseq/trainer.py", line 563, in train_step
raise e
File "/home/aalvarez/Projects/fairseq/fairseq/trainer.py", line 531, in train_step
loss, sample_size_i, logging_output = self.task.train_step(
File "/home/aalvarez/Projects/fairseq/fairseq/tasks/fairseq_task.py", line 428, in train_step
loss, sample_size, logging_output = criterion(model, sample)
File "/home/aalvarez/.virtualenvs/wav2vec_training/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/aalvarez/Projects/fairseq/fairseq/criterions/ctc.py", line 106, in forward
net_output = model(**sample["net_input"])
File "/home/aalvarez/.virtualenvs/wav2vec_training/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/aalvarez/Projects/fairseq/fairseq/legacy_distributed_data_parallel.py", line 83, in forward
return self.module(*inputs, **kwargs)
File "/home/aalvarez/.virtualenvs/wav2vec_training/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/aalvarez/Projects/fairseq/fairseq/models/wav2vec/wav2vec2_asr.py", line 160, in forward
x = self.w2v_encoder(**kwargs)
File "/home/aalvarez/.virtualenvs/wav2vec_training/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/aalvarez/Projects/fairseq/fairseq/models/wav2vec/wav2vec2_asr.py", line 339, in forward
x, padding_mask = self.w2v_model.extract_features(**w2v_args)
File "/home/aalvarez/Projects/fairseq/fairseq/models/wav2vec/wav2vec2.py", line 570, in extract_features
res = self.forward(source, padding_mask, mask=mask, features_only=True)
File "/home/aalvarez/Projects/fairseq/fairseq/models/wav2vec/wav2vec2.py", line 454, in forward
features = self.feature_extractor(source)
File "/home/aalvarez/.virtualenvs/wav2vec_training/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/aalvarez/Projects/fairseq/fairseq/models/wav2vec/wav2vec2.py", line 680, in forward
x = conv(x)
File "/home/aalvarez/.virtualenvs/wav2vec_training/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/aalvarez/.virtualenvs/wav2vec_training/lib/python3.8/site-packages/torch/nn/modules/container.py", line 117, in forward
input = module(input)
File "/home/aalvarez/.virtualenvs/wav2vec_training/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/aalvarez/.virtualenvs/wav2vec_training/lib/python3.8/site-packages/torch/nn/modules/conv.py", line 258, in forward
return F.conv1d(input, self.weight, self.bias, self.stride,
RuntimeError: Calculated padded input size per channel: (1). Kernel size: (10). Kernel size can't be greater than actual input size
Traceback (most recent call last):
File "<string>", line 1, in <module>
File "/usr/lib/python3.8/multiprocessing/spawn.py", line 116, in spawn_main
exitcode = _main(fd, parent_sentinel)
File "/usr/lib/python3.8/multiprocessing/spawn.py", line 126, in _main
self = reduction.pickle.load(from_parent)
_pickle.UnpicklingError: pickle data was truncated
/usr/lib/python3.8/multiprocessing/resource_tracker.py:216: UserWarning: resource_tracker: There appear to be 216 leaked semaphore objects to clean up at shutdown
warnings.warn('resource_tracker: There appear to be %d '
Hydra vs. the command line is just how you specify the configuration. Can you check the config that gets printed at the start of training for differences?
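For what it's worth, here is a small helper (not part of fairseq; the file names are hypothetical) for diffing two saved config printouts, assuming you copy each printed config into its own text file:

# A hedged helper for comparing the config dumps that fairseq prints at the
# start of training, to spot options that differ between the hydra and CLI runs.
import difflib
from pathlib import Path

def diff_configs(path_a: str, path_b: str) -> None:
    a = Path(path_a).read_text().splitlines()
    b = Path(path_b).read_text().splitlines()
    for line in difflib.unified_diff(a, b, fromfile=path_a, tofile=path_b, lineterm=""):
        print(line)

# diff_configs("hydra_run_config.txt", "cli_run_config.txt")  # hypothetical file names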
You were right, my apologies. I just checked, and max_sample_size was misconfigured.
Although I now have the same set of parameters in both cases, I still don't get the exact same configuration. I noticed that I need to set more parameters in hydra than through the command-line options.
For example, I had to set post_process and sentence_avg twice (in two different sections) to match the output parameters:
common_eval:
  post_process: letter
...
criterion:
  _name: ctc
  post_process: letter
  sentence_avg: True
...
optimization:
  sentence_avg: True
But I still don't have the same configuration. I think the most critical part is that I'm not getting the eval_wer_config automatically. Am I doing something wrong?
If you look at the default values for e.g. sentence_avg, you'll see that criterion actually "interpolates" (i.e. inherits) it from the optimization config by default (II("path.to.field") in code, or ${path.to.field} in YAML).
eval_wer_config is used only by infer.py (which has not yet been migrated to hydra), but I'm not sure what you mean by not getting it automatically.
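To illustrate the interpolation mechanism, here is a minimal sketch using OmegaConf's II; the dataclass names are simplified stand-ins rather than fairseq's exact config classes:

# A minimal sketch of how one config group can inherit a value from another
# via OmegaConf interpolation. Class and field names here are illustrative.
from dataclasses import dataclass, field
from omegaconf import II, OmegaConf

@dataclass
class OptimizationConfig:
    sentence_avg: bool = False

@dataclass
class CtcCriterionConfig:
    # II("optimization.sentence_avg") is the code form of ${optimization.sentence_avg}
    sentence_avg: bool = II("optimization.sentence_avg")

@dataclass
class RootConfig:
    optimization: OptimizationConfig = field(default_factory=OptimizationConfig)
    criterion: CtcCriterionConfig = field(default_factory=CtcCriterionConfig)

cfg = OmegaConf.structured(RootConfig)
cfg.optimization.sentence_avg = True
print(cfg.criterion.sentence_avg)  # True: inherited from optimization unless overridden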
Sorry for the late reply.
I mean that when launching the command-line training, many more parameters are printed, which is not the case when running the hydra training procedure. But from what you said, I guess I'm not configuring the correct group of parameters for them to cascade.
If eval_wer_config is only used by infer.py, then I guess that setting eval_wer=True also has no effect. If that's the case, how is the model evaluated at each validation step? Is it through the wer_args parameter inside criterion?
I ask because I found that after training the model, I obtained a fixed, steady WER value, which is weird.
During training, we compute raw WER (aka "viterbi") inside the CTC criterion. Optionally, if you provide wer_args, we can also evaluate using a KenLM language model.
If you see strange things happening with the WER computation, you might want to print out a few examples. For instance, I think by default post_process is set to "letter", which assumes your targets are letters with | as a word boundary.
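As a rough illustration of what the "letter" setting assumes (this mirrors the idea, not necessarily fairseq's exact post_process implementation):

# Targets are space-separated letters with "|" marking word boundaries;
# post-processing collapses them back into words for WER computation.
def letters_to_words(hypothesis: str) -> str:
    # "h e l l o | w o r l d |"  ->  "hello world"
    return hypothesis.replace(" ", "").replace("|", " ").strip()

print(letters_to_words("h e l l o | w o r l d |"))  # "hello world"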
Code sample
nohup python fairseq_cli/hydra_train.py task.data=/datadrive/ASR/training_data model.w2v_path=/datadrive/ASR/model/checkpoint_best.pt --config-path /home/rashwan/ASR/fairseq/examples/wav2vec/config/finetuning --config-name base_100h > /datadrive/ASR/CTC_Model/train.log
Configuration
# @package _group_
common:
  fp16: false
  log_format: json
  log_interval: 200
checkpoint:
  no_epoch_checkpoints: true
  best_checkpoint_metric: wer
  save_dir: /datadrive/ASR/CTC_Model
task:
  _name: audio_pretraining
  data: ???
  normalize: false
  labels: ltr
dataset:
  num_workers: 6
  max_tokens: 750000
  skip_invalid_size_inputs_valid_test: true
  valid_subset: valid
distributed_training:
  ddp_backend: no_c10d
  distributed_world_size: 4
criterion:
  _name: ctc
  zero_infinity: true
optimization:
  max_update: 80000
  lr: [0.00003]
  sentence_avg: true
  update_freq: [4]
optimizer:
  _name: adam
  adam_betas: (0.9,0.98)
  adam_eps: 1e-08
lr_scheduler:
  _name: tri_stage
  phase_ratio: [0.1, 0.4, 0.5]
  final_lr_scale: 0.05
model:
  _name: wav2vec_ctc
  w2v_path: ???
  apply_mask: true
  mask_prob: 0.65
  mask_channel_prob: 0.5
  mask_channel_length: 64
  layerdrop: 0.1
  activation_dropout: 0.1
  feature_grad_mult: 0.0
  freeze_finetune_updates: 0
Error
2020-11-26 11:25:00 | INFO | fairseq_cli.train | task: AudioPretrainingTask
2020-11-26 11:25:00 | INFO | fairseq_cli.train | model: Wav2VecCtc
2020-11-26 11:25:00 | INFO | fairseq_cli.train | criterion: CtcCriterion
2020-11-26 11:25:00 | INFO | fairseq_cli.train | num. model params: 94420159 (num. trained: 94420159)
2020-11-26 11:25:00 | INFO | fairseq.trainer | detected shared parameter: w2v_encoder.w2v_model.feature_extractor.conv_layers.0.0.bias <- w2v_encoder.w2v_model.feature_extractor.conv_layers.1.0.bias
2020-11-26 11:25:00 | INFO | fairseq.trainer | detected shared parameter: w2v_encoder.w2v_model.feature_extractor.conv_layers.0.0.bias <- w2v_encoder.w2v_model.feature_extractor.conv_layers.2.0.bias
2020-11-26 11:25:00 | INFO | fairseq.trainer | detected shared parameter: w2v_encoder.w2v_model.feature_extractor.conv_layers.0.0.bias <- w2v_encoder.w2v_model.feature_extractor.conv_layers.3.0.bias
2020-11-26 11:25:00 | INFO | fairseq.trainer | detected shared parameter: w2v_encoder.w2v_model.feature_extractor.conv_layers.0.0.bias <- w2v_encoder.w2v_model.feature_extractor.conv_layers.4.0.bias
2020-11-26 11:25:00 | INFO | fairseq.trainer | detected shared parameter: w2v_encoder.w2v_model.feature_extractor.conv_layers.0.0.bias <- w2v_encoder.w2v_model.feature_extractor.conv_layers.5.0.bias
2020-11-26 11:25:00 | INFO | fairseq.trainer | detected shared parameter: w2v_encoder.w2v_model.feature_extractor.conv_layers.0.0.bias <- w2v_encoder.w2v_model.feature_extractor.conv_layers.6.0.bias
2020-11-26 11:25:01 | INFO | fairseq.utils | CUDA enviroments for all 4 workers
2020-11-26 11:25:01 | INFO | fairseq.utils | rank 0: capabilities = 3.7 ; total memory = 11.173 GB ; name = Tesla K80
cli_main()
File "fairseq_cli/hydra_train.py", line 66, in cli_main
hydra_main()
File "/anaconda/envs/ASR/lib/python3.7/site-packages/hydra/main.py", line 37, in decorated_main
strict=strict,
File "/anaconda/envs/ASR/lib/python3.7/site-packages/hydra/_internal/utils.py", line 347, in _run_hydra
lambda: hydra.run(
File "/anaconda/envs/ASR/lib/python3.7/site-packages/hydra/_internal/utils.py", line 201, in run_and_report
raise ex
File "/anaconda/envs/ASR/lib/python3.7/site-packages/hydra/_internal/utils.py", line 198, in run_and_report
return func()
File "/anaconda/envs/ASR/lib/python3.7/site-packages/hydra/_internal/utils.py", line 350, in
overrides=args.overrides,
File "/anaconda/envs/ASR/lib/python3.7/site-packages/hydra/_internal/hydra.py", line 112, in run
configure_logging=with_log_configuration,
File "/anaconda/envs/ASR/lib/python3.7/site-packages/hydra/core/utils.py", line 125, in run_job
ret.return_value = task_function(task_cfg)
File "fairseq_cli/hydra_train.py", line 38, in hydra_main
distributed_utils.call_main(cfg, pre_main)
File "/home/rashwan/ASR/fairseq/fairseq/distributed_utils.py", line 318, in call_main
cfg.distributed_training.distributed_world_size,
File "/anaconda/envs/ASR/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 200, in spawn
return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
File "/anaconda/envs/ASR/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 158, in start_processes
while not context.join():
File "/anaconda/envs/ASR/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 119, in join
raise Exception(msg)
Exception:
2020-11-26 11:25:01 | INFO | fairseq.utils | rank 1: capabilities = 3.7 ; total memory = 11.173 GB ; name = Tesla K80
2020-11-26 11:25:01 | INFO | fairseq.utils | rank 2: capabilities = 3.7 ; total memory = 11.173 GB ; name = Tesla K80
2020-11-26 11:25:01 | INFO | fairseq.utils | rank 3: capabilities = 3.7 ; total memory = 11.173 GB ; name = Tesla K80
2020-11-26 11:25:01 | INFO | fairseq.utils | CUDA enviroments for all 4 workers
2020-11-26 11:25:01 | INFO | fairseq_cli.train | training on 4 devices (GPUs/TPUs)
2020-11-26 11:25:01 | INFO | fairseq_cli.train | max tokens per GPU = 750000 and batch size per GPU = None
2020-11-26 11:25:01 | INFO | fairseq.trainer | no existing checkpoint found /datadrive/ASR/CTC_Model/checkpoint_last.pt
2020-11-26 11:25:01 | INFO | fairseq.trainer | loading train data for epoch 1
2020-11-26 11:25:01 | INFO | fairseq.data.audio.raw_audio_dataset | loaded 460997, skipped 0 samples
2020-11-26 11:25:02 | INFO | fairseq.optim.adam | using FusedAdam
2020-11-26 11:25:02 | INFO | fairseq.trainer | begin training epoch 1
2020-11-26 11:40:34 | INFO | train_inner | {"epoch": 1, "update": 0.036, "loss": "2264.27", "ntokens": "6405.06", "nsentences": "59.175", "nll_loss": "20.919", "wps": "1401.2", "ups": "0.22", "wpb": "6405.1", "bsz": "59.2", "num_updates": "200", "lr": "1.0425e-06", "gnorm": "8029.41", "train_wall": "917", "wall": "934"}
Traceback (most recent call last):
File "fairseq_cli/hydra_train.py", line 70, in <module>
-- Process 0 terminated with the following error:
Traceback (most recent call last):
File "/anaconda/envs/ASR/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 20, in _wrap
fn(i, *args)
File "/home/rashwan/ASR/fairseq/fairseq/distributed_utils.py", line 300, in distributed_main
main(cfg, **kwargs)
File "/home/rashwan/ASR/fairseq/fairseq_cli/train.py", line 130, in main
valid_losses, should_stop = train(cfg, trainer, task, epoch_itr)
File "/anaconda/envs/ASR/lib/python3.7/contextlib.py", line 74, in inner
return func(*args, **kwds)
File "/home/rashwan/ASR/fairseq/fairseq_cli/train.py", line 219, in train
log_output = trainer.train_step(samples)
File "/anaconda/envs/ASR/lib/python3.7/contextlib.py", line 74, in inner
return func(*args, **kwds)
File "/home/rashwan/ASR/fairseq/fairseq/trainer.py", line 572, in train_step
raise e
File "/home/rashwan/ASR/fairseq/fairseq/trainer.py", line 546, in train_step
ignore_grad=is_dummy_batch,
File "/home/rashwan/ASR/fairseq/fairseq/tasks/fairseq_task.py", line 428, in train_step
loss, sample_size, logging_output = criterion(model, sample)
File "/anaconda/envs/ASR/lib/python3.7/site-packages/torch/nn/modules/module.py", line 722, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/rashwan/ASR/fairseq/fairseq/criterions/ctc.py", line 106, in forward
net_output = model(**sample["net_input"])
File "/anaconda/envs/ASR/lib/python3.7/site-packages/torch/nn/modules/module.py", line 722, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/rashwan/ASR/fairseq/fairseq/legacy_distributed_data_parallel.py", line 83, in forward
return self.module(*inputs, **kwargs)
File "/anaconda/envs/ASR/lib/python3.7/site-packages/torch/nn/modules/module.py", line 722, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/rashwan/ASR/fairseq/fairseq/models/wav2vec/wav2vec2_asr.py", line 160, in forward
x = self.w2v_encoder(**kwargs)
File "/anaconda/envs/ASR/lib/python3.7/site-packages/torch/nn/modules/module.py", line 722, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/rashwan/ASR/fairseq/fairseq/models/wav2vec/wav2vec2_asr.py", line 339, in forward
x, padding_mask = self.w2v_model.extract_features(**w2v_args)
File "/home/rashwan/ASR/fairseq/fairseq/models/wav2vec/wav2vec2.py", line 570, in extract_features
res = self.forward(source, padding_mask, mask=mask, features_only=True)
File "/home/rashwan/ASR/fairseq/fairseq/models/wav2vec/wav2vec2.py", line 454, in forward
features = self.feature_extractor(source)
File "/anaconda/envs/ASR/lib/python3.7/site-packages/torch/nn/modules/module.py", line 722, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/rashwan/ASR/fairseq/fairseq/models/wav2vec/wav2vec2.py", line 680, in forward
x = conv(x)
File "/anaconda/envs/ASR/lib/python3.7/site-packages/torch/nn/modules/module.py", line 722, in _call_impl
result = self.forward(*input, **kwargs)
File "/anaconda/envs/ASR/lib/python3.7/site-packages/torch/nn/modules/container.py", line 117, in forward
input = module(input)
File "/anaconda/envs/ASR/lib/python3.7/site-packages/torch/nn/modules/module.py", line 722, in _call_impl
result = self.forward(*input, **kwargs)
File "/anaconda/envs/ASR/lib/python3.7/site-packages/torch/nn/modules/conv.py", line 257, in forward
self.padding, self.dilation, self.groups)
RuntimeError: Calculated padded input size per channel: (1). Kernel size: (3). Kernel size can't be greater than actual input size
Environment
How you installed fairseq (pip, source): pip install . (from the source)
Additional context