When I run `build/train`, this error occurs: `CUDNN_STATUS_NOT_SUPPORTED`.
Error log
```
I1021 09:29:31.667903 7461 W2lListFilesDataset.cpp:62] Total batches (i.e. iters): 2275
I1021 09:29:31.793767 7460 W2lListFilesDataset.cpp:141] 78752 files found.
I1021 09:29:31.794911 7460 Utils.cpp:102] Filtered 5968/78752 samples
I1021 09:29:31.795254 7460 W2lListFilesDataset.cpp:62] Total batches (i.e. iters): 2275
I1021 09:29:32.036325 7459 Train.cpp:813] Epoch 1 started!
terminate called after throwing an instance of 'std::runtime_error'
what(): CUDNN_STATUS_NOT_SUPPORTED
*** Aborted at 1603272576 (unix time) try "date -d @1603272576" if you are using GNU date ***
PC: @ 0x7f6bab0dcf47 gsignal
*** SIGABRT (@0x1d26) received by PID 7462 (TID 0x7f6beee28380) from PID 7462; stack trace: ***
@ 0x7f6be713e8a0 (unknown)
@ 0x7f6bab0dcf47 gsignal
@ 0x7f6bab0de8b1 abort
@ 0x7f6babad1957 (unknown)
@ 0x7f6babad7ae6 (unknown)
@ 0x7f6babad7b21 std::terminate()
@ 0x7f6babad7d54 __cxa_throw
@ 0x563503afc489 fl::TensorDescriptor::TensorDescriptor()
@ 0x563503afa7cc fl::conv2d()
@ 0x563503aa4bc1 fl::Conv2D::forward()
@ 0x563503ab89ee fl::UnaryModule::forward()
@ 0x563503aa2aba fl::Sequential::forward()
@ 0x5635037d14ac _ZZ4mainENKUlSt10shared_ptrIN2fl6ModuleEES_IN3w2l17SequenceCriterionEES_INS3_10W2lDatasetEES_INS0_19FirstOrderOptimizerEES9_ddblE3_clES2_S5_S7_S9_S9_ddbl
@ 0x5635037622ab main
@ 0x7f6bab0bfb97 __libc_start_main
@ 0x5635037c9eba _start
terminate called after throwing an instance of 'std::runtime_error'
what(): CUDNN_STATUS_NOT_SUPPORTED
*** Aborted at 1603272576 (unix time) try "date -d @1603272576" if you are using GNU date ***
PC: @ 0x7fd8ca441f47 gsignal
*** SIGABRT (@0x1d25) received by PID 7461 (TID 0x7fd90e18d380) from PID 7461; stack trace: ***
@ 0x7fd9064a38a0 (unknown)
@ 0x7fd8ca441f47 gsignal
@ 0x7fd8ca4438b1 abort
@ 0x7fd8cae36957 (unknown)
@ 0x7fd8cae3cae6 (unknown)
@ 0x7fd8cae3cb21 std::terminate()
@ 0x7fd8cae3cd54 __cxa_throw
@ 0x55ef87bf1489 fl::TensorDescriptor::TensorDescriptor()
@ 0x55ef87bef7cc fl::conv2d()
@ 0x55ef87b99bc1 fl::Conv2D::forward()
@ 0x55ef87bad9ee fl::UnaryModule::forward()
@ 0x55ef87b97aba fl::Sequential::forward()
@ 0x55ef878c64ac _ZZ4mainENKUlSt10shared_ptrIN2fl6ModuleEES_IN3w2l17SequenceCriterionEES_INS3_10W2lDatasetEES_INS0_19FirstOrderOptimizerEES9_ddblE3_clES2_S5_S7_S9_S9_ddbl
@ 0x55ef878572ab main
@ 0x7fd8ca424b97 __libc_start_main
@ 0x55ef878beeba _start
terminate called after throwing an instance of 'std::runtime_error'
what(): CUDNN_STATUS_NOT_SUPPORTED
*** Aborted at 1603272576 (unix time) try "date -d @1603272576" if you are using GNU date ***
PC: @ 0x7f9c2aa6af47 gsignal
*** SIGABRT (@0x1d23) received by PID 7459 (TID 0x7f9c6e7b6380) from PID 7459; stack trace: ***
@ 0x7f9c66acc8a0 (unknown)
@ 0x7f9c2aa6af47 gsignal
@ 0x7f9c2aa6c8b1 abort
@ 0x7f9c2b45f957 (unknown)
@ 0x7f9c2b465ae6 (unknown)
@ 0x7f9c2b465b21 std::terminate()
@ 0x7f9c2b465d54 __cxa_throw
@ 0x563d337c9489 fl::TensorDescriptor::TensorDescriptor()
@ 0x563d337c77cc fl::conv2d()
@ 0x563d33771bc1 fl::Conv2D::forward()
@ 0x563d337859ee fl::UnaryModule::forward()
@ 0x563d3376faba fl::Sequential::forward()
@ 0x563d3349e4ac _ZZ4mainENKUlSt10shared_ptrIN2fl6ModuleEES_IN3w2l17SequenceCriterionEES_INS3_10W2lDatasetEES_INS0_19FirstOrderOptimizerEES9_ddblE3_clES2_S5_S7_S9_S9_ddbl
@ 0x563d3342f2ab main
@ 0x7f9c2aa4db97 __libc_start_main
@ 0x563d33496eba _start
terminate called after throwing an instance of 'std::runtime_error'
what(): CUDNN_STATUS_NOT_SUPPORTED
*** Aborted at 1603272576 (unix time) try "date -d @1603272576" if you are using GNU date ***
PC: @ 0x7f687fd23f47 gsignal
*** SIGABRT (@0x1d24) received by PID 7460 (TID 0x7f68c3a6f380) from PID 7460; stack trace: ***
@ 0x7f68bbd858a0 (unknown)
@ 0x7f687fd23f47 gsignal
@ 0x7f687fd258b1 abort
@ 0x7f6880718957 (unknown)
@ 0x7f688071eae6 (unknown)
@ 0x7f688071eb21 std::terminate()
@ 0x7f688071ed54 __cxa_throw
@ 0x559c78bf0489 fl::TensorDescriptor::TensorDescriptor()
@ 0x559c78bee7cc fl::conv2d()
@ 0x559c78b98bc1 fl::Conv2D::forward()
@ 0x559c78bac9ee fl::UnaryModule::forward()
@ 0x559c78b96aba fl::Sequential::forward()
@ 0x559c788c54ac _ZZ4mainENKUlSt10shared_ptrIN2fl6ModuleEES_IN3w2l17SequenceCriterionEES_INS3_10W2lDatasetEES_INS0_19FirstOrderOptimizerEES9_ddblE3_clES2_S5_S7_S9_S9_ddbl
@ 0x559c788562ab main
@ 0x7f687fd06b97 __libc_start_main
@ 0x559c788bdeba _start
--------------------------------------------------------------------------
mpirun noticed that process rank 3 with PID 0 on node e0ebb0a63bf9 exited on signal 6 (Aborted).
--------------------------------------------------------------------------
```
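All four ranks die at the same point: fl::Conv2D::forward → fl::conv2d → fl::TensorDescriptor, so cuDNN is rejecting a tensor descriptor for some batch rather than failing to load. One possibility I want to rule out (my assumption, the log does not prove it) is an extremely long utterance getting through, since the flags below set --minisz but no --maxisz cap. A quick sketch to list the longest entries in a list file, assuming the standard wav2letter list format `<id> <audio_path> <duration_ms> <transcript>`:

```python
import sys

# Sketch: print the longest entries in a wav2letter list file.
def longest_samples(list_path, top_k=10):
    rows = []
    with open(list_path) as f:
        for line in f:
            parts = line.split(maxsplit=3)
            if len(parts) >= 3:
                rows.append((float(parts[2]), parts[0], parts[1]))
    rows.sort(reverse=True)
    return rows[:top_k]

if __name__ == "__main__":
    for dur_ms, sample_id, path in longest_samples(sys.argv[1]):
        print(f"{dur_ms:>10.0f} ms  {sample_id}  {path}")
```

Running it over train-kor.lst should show whether a handful of outliers dominate.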
Additional Context
cuDNN & CUDA
```
root@e0ebb0a63bf9:/home/dcshin/wav2letter/build# cat /usr/local/cuda/include/cudnn.h | grep CUDNN_MAJOR -A 2
cat: /usr/local/cuda/include/cudnn.h: No such file or directory
root@e0ebb0a63bf9:/home/dcshin/wav2letter/build# cat /usr/include/cudnn.h | grep CUDNN_MAJOR -A 2
#define CUDNN_MAJOR 7
#define CUDNN_MINOR 6
#define CUDNN_PATCHLEVEL 5
--
#define CUDNN_VERSION (CUDNN_MAJOR * 1000 + CUDNN_MINOR * 100 + CUDNN_PATCHLEVEL)
include "driver_types.h"
root@e0ebb0a63bf9:/home/dcshin/wav2letter/build# nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2018 NVIDIA Corporation
Built on Sat_Aug_25_21:08:01_CDT_2018
Cuda compilation tools, release 10.0, V10.0.130
root@e0ebb0a63bf9:/home/dcshin/wav2letter/build# which nvcc
/usr/local/cuda/bin/nvcc
```
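Plugging those defines into the formula above gives CUDNN_VERSION = 7 * 1000 + 6 * 100 + 5 = 7605. A quick way to cross-check which CUDA/cuDNN versions PyTorch itself loads (just a sanity check, not wav2letter's toolchain):

```python
import torch

print(torch.version.cuda)              # CUDA version PyTorch was built with
print(torch.backends.cudnn.version())  # cuDNN version PyTorch loads, e.g. 7605
```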
cuDNN is available in PyTorch
```python
root@e04c9ef4ea64:/home/dcshin/wav2letter# python
Python 3.6.9 |Anaconda, Inc.| (default, Jul 30 2019, 19:07:31)
[GCC 7.3.0] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> import torch
>>> torch.backends.cudnn.is_available()
True
```
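That said, `torch.backends.cudnn.is_available()` only confirms the library loads; it does not run a convolution. A minimal check that pushes a real conv through cuDNN (the shapes are arbitrary stand-ins I picked for an 80-filterbank input, not taken from the failing batch):

```python
import torch

# Exercise one convolution through cuDNN; shapes are arbitrary stand-ins.
x = torch.randn(8, 80, 1, 1500, device="cuda")            # N, C, H, W
conv = torch.nn.Conv2d(80, 256, kernel_size=(1, 7)).to("cuda")
with torch.backends.cudnn.flags(enabled=True, benchmark=False):
    y = conv(x)
torch.cuda.synchronize()
print("cuDNN conv OK:", tuple(y.shape))
```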
build & tests
```
100% tests passed, 0 tests failed out of 31
Total Test time (real) = 69.11 sec
```
cmake configure log
```
-- Checking for [mkl_gf_lp64 - mkl_gnu_thread - mkl_core - iomp5 - pthread - m]
-- Library mkl_gf_lp64: /opt/intel/mkl/lib/intel64/libmkl_gf_lp64.so
-- Library mkl_gnu_thread: /opt/intel/mkl/lib/intel64/libmkl_gnu_thread.so
-- Library mkl_core: /opt/intel/mkl/lib/intel64/libmkl_core.so
-- Library iomp5: /usr/lib/x86_64-linux-gnu/libiomp5.so
-- Library pthread: /usr/lib/x86_64-linux-gnu/libpthread.so
-- Library m: /usr/lib/x86_64-linux-gnu/libm.so
-- MKL library found
-- ArrayFire found (include: /usr/local/include, library: ArrayFire::afcuda)
-- Found glog (include: /usr/include, library: /usr/lib/x86_64-linux-gnu/libglog.so)
-- GLOG found
-- Found gflags (include: /usr/include, library: /usr/lib/x86_64-linux-gnu/libgflags.so)
-- GFLAGS found
-- OpenMP found
-- flashlight found (include: lib: flashlight::flashlight )
-- flashlight built in distributed mode.
-- flashlight built with contrib features.
-- CUDA found (library: /usr/local/cuda/lib64/libcudart_static.a;-pthread;dl;/usr/lib/x86_64-linux-gnu/librt.so include: /usr/local/cuda/include)
-- CUDA architecture flags: -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_70,code=compute_70 -gencode arch=compute_75,code=compute_75
-- CBLAS found (include: /opt/intel/mkl/include, library: /opt/intel/mkl/lib/intel64/libmkl_gf_lp64.so;/opt/intel/mkl/lib/intel64/libmkl_gnu_thread.so;/opt/intel/mkl/lib/intel64/libmkl_core.so;/usr/lib/x86_64-linux-gnu/libiomp5.so;/usr/lib/x86_64-linux-gnu/libpthread.so;/usr/lib/x86_64-linux-gnu/libm.so)
-- FFTW found
-- Looking for KenLM
-- Using kenlm library found in /root/kenlm/build/lib/libkenlm.a
-- Using kenlm utils library found in /root/kenlm/build/lib/libkenlm.a
-- kenlm lm/model.hh found in /root/kenlm/lm/model.hh
-- Found kenlm (include: /root/kenlm, library: /root/kenlm/build/lib/libkenlm.a;/root/kenlm/build/lib/libkenlm_util.a)
-- Adding warpctc:
-- warpctc: cuda found TRUE
-- warpctc: using CUDA 9.0 or above
-- warpctc: Building shared library with GPU support
-- Required SndFile dependency Ogg found.
-- Required SndFile dependency Vorbis found.
-- Required SndFile dependency VorbisEnc found.
-- Required SndFile dependency FLAC found.
-- Found libsndfile: (lib: /usr/local/lib/libsndfile.so include: /usr/local/include)
-- libsndfile found.
-- Found gflags (include: /usr/include, library: /usr/lib/x86_64-linux-gnu/libgflags.so)
-- GFLAGS found
-- Looking for KenLM
-- Using kenlm library found in /root/kenlm/build/lib/libkenlm.a
-- Using kenlm utils library found in /root/kenlm/build/lib/libkenlm.a
-- kenlm lm/model.hh found in /root/kenlm/lm/model.hh
-- Found kenlm (include: /root/kenlm, library: /root/kenlm/build/lib/libkenlm.a;/root/kenlm/build/lib/libkenlm_util.a)
-- Examples: add executable interactive_streaming_asr_example
-- Examples: add executable simple_streaming_asr_example
-- Examples: add executable multithreaded_streaming_asr_example
-- Tests: add executable inference_Conv1dTest
-- Tests: add executable inference_IdentityTest
-- Tests: add executable inference_LayerNormTest
-- Tests: add executable inference_LinearTest
-- Tests: add executable inference_LogMelFeatureTest
-- Tests: add executable inference_MemoryManagerTest
-- Tests: add executable inference_ReluTest
-- Tests: add executable inference_ResidualTest
-- Tests: add executable inference_TDSBlockTest
-- Building recipes.
-- Configuring done
-- Generating done
-- Build files have been written to: /home/dcshin/wav2letter/build
```
data path
```
--datadir=
--train=/home/dcshin/news_data/lists_syllable/train-kor.lst, /home/dcshin/news_data/lists_add_syllable/train-kor.lst
--valid=/home/dcshin/news_data/lists_syllable/dev-kor.lst, /home/dcshin/news_data/lists_add_syllable/dev-kor.lst
--test=/home/dcshin/news_data/lists_syllable/test-kor.lst, /home/dcshin/news_data/lists_add_syllable/test-kor.lst
--lexicon=/home/dcshin/wav2letter/experiments/results/subword_finetuning_transformer/am/librispeech-train+dev-unigram-20000-nbest10.lexicon
--tokensdir=/home/dcshin/wav2letter/experiments/results/subword_finetuning_transformer/am
--tokens=librispeech-train-all-unigram-20000.tokens
--rundir=/home/dcshin/wav2letter
--runname=experiments/logs/subword_finetuning_transformer_seq2seq_squash
--archdir=/home/dcshin/wav2letter/experiments/arch
--arch=am_transformer_s2s_librivox.arch
--input=flac
```
concurrency
```
--nthread=10 --enable_distributed=true --world_size=4
```
data shuffling
```
--noresample=true --seed=2
```
min input size / target size
```
--minisz=1000 --mintsz=3
```
Additional info
optimizer
```
--netoptim=adagrad --critoptim=adagrad --lr=0.03 --lrcrit=0.03 --lr_decay=200 --lr_decay_step=40 --adambeta1=0.95 --adambeta2=0.99 --momentum=0.0 --maxgradnorm=0.1
```
learning strategy
```
--warmup=64000 --saug_start_update=64000 --pctteacherforcing=95 --sampletarget=0.01
```
etc
```
--batchsize=8 --encoderdim=256 --target=ltr --memstepsize=5000000 --onorm=target --sqnorm=true
```
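My reading of --onorm=target with --sqnorm=true (an assumption, I have not verified it against the wav2letter source) is that each sample's loss is normalized by the square root of its target length, roughly:

```python
import math

# Assumed semantics of --onorm=target --sqnorm=true (not verified against
# the wav2letter source): divide the raw loss by sqrt(target length).
def normalized_loss(raw_loss, target_len):
    return raw_loss / math.sqrt(target_len)

print(normalized_loss(42.0, 49))  # 42 / sqrt(49) = 6.0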
decoder
```
--am_decoder_tr_dropout=0.1 --am_decoder_tr_layerdrop=0.1 --am_decoder_tr_layers=6
--criterion=transformer --eostoken=true --attention=keyvalue --maxdecoderoutputlen=120
--attnWindow=softPretrain --trainWithWindow=true --pretrainWindow=3 --softwstd=4
```
A method to prevent overfitting in Seq2Seq; refer to https://arxiv.org/pdf/1612.02695.pdf
```
--labelsmooth=0.05
```
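For reference, with eps = 0.05 one common formulation of label smoothing mixes the one-hot target with a uniform distribution over the K output tokens (my illustration, not wav2letter's code):

```python
import numpy as np

# Szegedy-style label smoothing: (1 - eps) * one_hot + eps * uniform(K).
def smooth_targets(y, K, eps=0.05):
    t = np.full(K, eps / K)
    t[y] += 1.0 - eps
    return t

print(smooth_targets(y=2, K=5))  # [0.01 0.01 0.96 0.01 0.01]
```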
data group
This seems to be how data groups are formed (maybe sorting by audio length?): binning using audio length and spiraling along reference length.
```
--dataorder=output_spiral --inputbinsize=25
```
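To answer my own question above, my understanding (a toy illustration, not wav2letter's actual implementation) is that utterances are bucketed by input length so each batch holds similarly-sized inputs, with --inputbinsize controlling the bin granularity (units assumed to be milliseconds):

```python
from collections import defaultdict

# Toy length-binning: group utterance indices into 25 ms bins of audio length.
def bin_by_length(durations_ms, bin_size_ms=25):
    bins = defaultdict(list)
    for idx, dur in enumerate(durations_ms):
        bins[dur // bin_size_ms].append(idx)
    return dict(sorted(bins.items()))

print(bin_by_length([990, 1010, 1020, 2400, 2410]))
# {39: [0], 40: [1, 2], 96: [3, 4]}
```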
Feature
```
--filterbanks=80
```
changed
```
--wordseparator=_ --usewordpiece=true --pcttraineval=10
```
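As I understand --usewordpiece=true with --wordseparator=_, the targets are word pieces and the separator marks word boundaries when pieces are glued back into words; a toy illustration (mine, not the wav2letter tokenizer):

```python
# Recombine word pieces into words using the "_" word-boundary marker.
pieces = ["_he", "llo", "_wor", "ld"]
text = "".join(pieces).replace("_", " ").strip()
print(text)  # "hello world"
```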
reportiters
```
--reportiters=0
```
Please refer to https://github.com/facebookresearch/wav2letter/issues/806
```
--rndv_filepath=
```