Open farmer21cn opened 2 years ago
Are you using CUDA 11.1?
Yes, it's 11.1.1.
Could you switch to another version of cudatoolkit?
You can find more at
Guys, my feeling is it might be worthwhile to track this error down and find out exactly why it happens. It might be a bug in our code or a library we call, that only impacts certain Torch versions.
We may get a more informative stack trace by doing
export CUDA_LAUNCH_BLOCKING=1
before running.
It looks like CUDA only prints a warning when the assertion fails; the error status is not checked until the next kernel launch, which happens to be a k2 kernel.
2022-08-01 08:07:46,843 INFO [train.py:483] Training started 2022-08-01 08:07:46,843 INFO [train.py:484] {'exp_dir': PosixPath('tdnn/exp'), 'lang_dir': PosixPath('data/lang_phone'), 'lr': 0.01, 'feature_dim': 23, 'weight_decay': 1e-06, 'start_epoch': 0, 'best_train_loss': inf, 'best_valid_loss': inf, 'best_train_epoch': -1, 'best_valid_epoch': -1, 'batch_idx_train': 0, 'log_interval': 10, 'reset_interval': 20, 'valid_interval': 10, 'beam_size': 10, 'reduction': 'sum', 'use_double_scores': True, 'world_size': 1, 'master_port': 12354, 'tensorboard': True, 'num_epochs': 15, 'seed': 42, 'feature_dir': PosixPath('data/fbank'), 'max_duration': 30.0, 'bucketing_sampler': False, 'num_buckets': 10, 'concatenate_cuts': False, 'duration_factor': 1.0, 'gap': 1.0, 'on_the_fly_feats': False, 'shuffle': True, 'return_cuts': True, 'num_workers': 2, 'env_info': {'k2-version': '1.15.1', 'k2-build-type': 'Release', 'k2-with-cuda': True, 'k2-git-sha1': 'b173c11ba379e2da0056281fe6b2d56f081419be', 'k2-git-date': 'Mon Apr 18 16:10:45 2022', 'lhotse-version': '1.4.0', 'torch-version': '1.8.1', 'torch-cuda-available': True, 'torch-cuda-version': '11.1', 'python-version': '3.8', 'icefall-git-branch': 'master', 'icefall-git-sha1': '2f75236-clean', 'icefall-git-date': 'Fri Jul 29 16:40:06 2022', 'icefall-path': '/home/fyj/icefall', 'k2-path': '/home/fyj/anaconda3/envs/kaldi003/lib/python3.8/site-packages/k2/__init__.py', 'lhotse-path': '/home/fyj/anaconda3/envs/kaldi003/lib/python3.8/site-packages/lhotse/__init__.py', 'hostname': 'ai', 'IP address': '127.0.1.1'}} 2022-08-01 08:07:46,844 INFO [lexicon.py:176] Loading pre-compiled data/lang_phone/Linv.pt 2022-08-01 08:07:46,844 INFO [train.py:497] device: cuda:0 2022-08-01 08:07:48,636 INFO [asr_datamodule.py:146] About to get train cuts 2022-08-01 08:07:48,636 INFO [asr_datamodule.py:244] About to get train cuts 2022-08-01 08:07:48,636 INFO [asr_datamodule.py:149] About to create train dataset 2022-08-01 08:07:48,636 INFO [asr_datamodule.py:199] Using
SingleCutSampler. 2022-08-01 08:07:48,636 INFO [asr_datamodule.py:205] About to create train dataloader 2022-08-01 08:07:48,637 INFO [asr_datamodule.py:218] About to get test cuts 2022-08-01 08:07:48,637 INFO [asr_datamodule.py:252] About to get test cuts 2022-08-01 08:07:49,232 INFO [train.py:422] Epoch 0, batch 0, loss[loss=1.069, over 2392.00 frames; over 0.00 utterances.], tot_loss[loss=1.069, over 2392.00 frames; over 0.00 utterances.], batch size: 4 /opt/conda/conda-bld/pytorch_1616554793803/work/aten/src/ATen/native/cuda/IndexKernel.cu:142: operator(): block: [0,0,0], thread: [0,0,0] Assertion `index >= -sizes[i] && index < sizes[i] && "index out of bounds"` failed.
/opt/conda/conda-bld/pytorch_1616554793803/work/aten/src/ATen/native/cuda/IndexKernel.cu:142: operator(): block: [0,0,0], thread: [1,0,0] Assertion `index >= -sizes[i] && index < sizes[i] && "index out of bounds"` failed.
[F] /usr/share/miniconda/envs/k2/conda-bld/k2_1650347559347/work/k2/csrc/array.h:385:T k2::Array1<T>::operator[](int32_t) const [with T = int; int32_t = int] Check failed: ret == cudaSuccess (710 vs. 0) Error: device-side assert triggered.
[ Stack-Trace: ]
/home/fyj/anaconda3/envs/kaldi003/lib/python3.8/site-packages/libk2_log.so(k2::internal::GetStackTrace()+0x47) [0x7fa6d408db17]
/home/fyj/anaconda3/envs/kaldi003/lib/python3.8/site-packages/libk2context.so(k2::Array1<int>::operator[](int) const+0x882) [0x7fa6d43acce2]
/home/fyj/anaconda3/envs/kaldi003/lib/python3.8/site-packages/libk2context.so(k2::Renumbering::ComputeOld2New()+0x15e) [0x7fa6d43a610e]
/home/fyj/anaconda3/envs/kaldi003/lib/python3.8/site-packages/libk2context.so(k2::Renumbering::ComputeNew2Old()+0x800) [0x7fa6d43a7d40]
/home/fyj/anaconda3/envs/kaldi003/lib/python3.8/site-packages/libk2context.so(k2::SubsetRaggedShape(k2::RaggedShape&, k2::Renumbering&, int, k2::Array1<int>*)+0x368) [0x7fa6d459c888]
/home/fyj/anaconda3/envs/kaldi003/lib/python3.8/site-packages/_k2.cpython-38-x86_64-linux-gnu.so(+0x101d97) [0x7fa6dd1b0d97]
/home/fyj/anaconda3/envs/kaldi003/lib/python3.8/site-packages/_k2.cpython-38-x86_64-linux-gnu.so(+0x1023af) [0x7fa6dd1b13af]
/home/fyj/anaconda3/envs/kaldi003/lib/python3.8/site-packages/_k2.cpython-38-x86_64-linux-gnu.so(+0x154030) [0x7fa6dd203030]
/home/fyj/anaconda3/envs/kaldi003/lib/python3.8/site-packages/_k2.cpython-38-x86_64-linux-gnu.so(+0x13ffcb) [0x7fa6dd1eefcb]
/home/fyj/anaconda3/envs/kaldi003/lib/python3.8/site-packages/_k2.cpython-38-x86_64-linux-gnu.so(+0x2290e) [0x7fa6dd0d190e]
python3(PyCFunction_Call+0x6e) [0x561158922f8e]
python3(_PyObject_MakeTpCall+0x501) [0x56115890b651]
python3(+0x13bd0d) [0x561158922d0d]
python3(_PyEval_EvalFrameDefault+0x48dc) [0x561158906fec]
python3(_PyEval_EvalCodeWithName+0x2e1) [0x561158901451]
python3(_PyFunction_Vectorcall+0x18c) [0x5611589133fc]
python3(_PyEval_EvalFrameDefault+0x10ea) [0x5611589037fa]
python3(_PyEval_EvalCodeWithName+0x2e1) [0x561158901451]
python3(_PyFunction_Vectorcall+0x18c) [0x5611589133fc]
python3(_PyEval_EvalFrameDefault+0x48dc) [0x561158906fec]
python3(_PyFunction_Vectorcall+0xf6) [0x561158913366]
python3(_PyEval_EvalFrameDefault+0x67d) [0x561158902d8d]
python3(_PyEval_EvalCodeWithName+0x9f6) [0x561158901b66]
python3(_PyFunction_Vectorcall+0x18c) [0x5611589133fc]
python3(_PyEval_EvalFrameDefault+0x10ea) [0x5611589037fa]
python3(_PyEval_EvalCodeWithName+0x2e1) [0x561158901451]
python3(_PyFunction_Vectorcall+0x18c) [0x5611589133fc]
python3(_PyEval_EvalFrameDefault+0x10ea) [0x5611589037fa]
python3(_PyEval_EvalCodeWithName+0x2e1) [0x561158901451]
python3(_PyFunction_Vectorcall+0x18c) [0x5611589133fc]
python3(_PyEval_EvalFrameDefault+0x10ea) [0x5611589037fa]
python3(_PyFunction_Vectorcall+0xf6) [0x561158913366]
python3(_PyEval_EvalFrameDefault+0x38b) [0x561158902a9b]
python3(_PyEval_EvalCodeWithName+0x2e1) [0x561158901451]
python3(PyEval_EvalCodeEx+0x39) [0x5611589c21e9]
python3(PyEval_EvalCode+0x1b) [0x5611589c21ab]
python3(+0x1fbd93) [0x5611589e2d93]
python3(+0x1fad73) [0x5611589e1d73]
python3(+0x99f6a) [0x561158880f6a]
python3(PyRun_SimpleFileExFlags+0x364) [0x561158880a5e]
python3(+0x8d021) [0x561158874021]
python3(Py_BytesMain+0x39) [0x5611589b6619]
/lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xf3) [0x7fa80b698083]
python3(+0x1cf525) [0x5611589b6525]
Traceback (most recent call last): File "./tdnn/train.py", line 577, in <module>
main()
File "./tdnn/train.py", line 573, in main
run(rank=0, world_size=1, args=args)
File "./tdnn/train.py", line 538, in run
train_one_epoch(
File "./tdnn/train.py", line 406, in train_one_epoch
loss, loss_info = compute_loss(
File "./tdnn/train.py", line 302, in compute_loss
decoding_graph = graph_compiler.compile(texts)
File "/home/fyj/icefall/icefall/graph_compiler.py", line 78, in compile
fsa_with_self_loops = k2.remove_epsilon_and_add_self_loops(
File "/home/fyj/anaconda3/envs/kaldi003/lib/python3.8/site-packages/k2/fsa_algo.py", line 627, in remove_epsilon_and_add_self_loops
out_fsa = k2.utils.fsa_from_unary_function_ragged(
File "/home/fyj/anaconda3/envs/kaldi003/lib/python3.8/site-packages/k2/utils.py", line 521, in fsa_from_unary_function_ragged
setattr(dest, name, new_value.remove_values_eq(filler))
RuntimeError:
Some bad things happened. Please read the above error messages and stack
trace. If you are using Python, the following command may be helpful: