Open JosePeeterson opened 1 year ago
I was running the training code in a Jupyter notebook (xxx.ipynb). It should be run as a Python script (xxx.py) instead. I also updated all my packages to the latest versions, as listed below. This solved the problem.
# Conda environment specification; the environment is named "test-env".
name: test-env
# Channels from which the conda packages under `dependencies` are resolved.
channels:
- anaconda
- pytorch
- nvidia
- conda-forge
- defaults
# Conda packages, each pinned exactly as name=version=build_string.
dependencies:
- _libgcc_mutex=0.1=conda_forge
- _openmp_mutex=4.5=2_gnu
- alembic=1.10.2=pyhd8ed1ab_0
- asttokens=2.2.1=pyhd8ed1ab_0
- backcall=0.2.0=pyh9f0ad1d_0
- backports=1.0=pyhd8ed1ab_3
- backports.functools_lru_cache=1.6.4=pyhd8ed1ab_0
- blas=1.0=mkl
- bottleneck=1.3.7=py310h0a54255_0
- brotli=1.0.9=h166bdaf_8
- brotli-bin=1.0.9=h166bdaf_8
- brotlipy=0.7.0=py310h5764c6d_1005
- bzip2=1.0.8=h7f98852_4
- ca-certificates=2023.01.10=h06a4308_0
- certifi=2022.12.7=py310h06a4308_0
- cffi=1.15.1=py310h255011f_3
- charset-normalizer=2.1.1=pyhd8ed1ab_0
- cmaes=0.9.1=pyhd8ed1ab_0
- colorama=0.4.6=pyhd8ed1ab_0
- colorlog=6.7.0=py310hff52083_1
- comm=0.1.3=pyhd8ed1ab_0
- contourpy=1.0.7=py310hdf3cbec_0
- cryptography=38.0.4=py310h597c629_0
- cuda-cudart=11.7.99=0
- cuda-cupti=11.7.101=0
- cuda-libraries=11.7.1=0
- cuda-nvrtc=11.7.99=0
- cuda-nvtx=11.7.91=0
- cuda-runtime=11.7.1=0
- cycler=0.11.0=pyhd8ed1ab_0
- dbus=1.13.18=hb2f20db_0
- debugpy=1.6.6=py310heca2aa9_0
- decorator=5.1.1=pyhd8ed1ab_0
- executing=1.2.0=pyhd8ed1ab_0
- expat=2.5.0=h27087fc_0
- ffmpeg=4.3=hf484d3e_0
- fftw=3.3.10=nompi_hf0379b8_106
- filelock=3.10.7=pyhd8ed1ab_0
- fontconfig=2.14.1=h52c9d5c_1
- fonttools=4.39.3=py310h1fa729e_0
- freetype=2.12.1=hca18f0e_1
- fsspec=2023.3.0=pyhd8ed1ab_1
- giflib=5.2.1=h0b41bf4_3
- glib=2.69.1=he621ea3_2
- gmp=6.2.1=h58526e2_0
- gnutls=3.6.13=h85f3911_1
- greenlet=2.0.2=py310heca2aa9_0
- gst-plugins-base=1.14.1=h6a678d5_1
- gstreamer=1.14.1=h5eee18b_1
- icu=58.2=hf484d3e_1000
- idna=3.4=pyhd8ed1ab_0
- importlib-metadata=6.1.0=pyha770c72_0
- importlib_metadata=6.1.0=hd8ed1ab_0
- importlib_resources=5.12.0=pyhd8ed1ab_0
- intel-openmp=2021.4.0=h06a4308_3561
- ipykernel=6.22.0=pyh210e3f2_0
- ipython=8.11.0=pyh41d4057_0
- jedi=0.18.2=pyhd8ed1ab_0
- jinja2=3.1.2=pyhd8ed1ab_1
- joblib=1.2.0=pyhd8ed1ab_0
- jpeg=9e=h0b41bf4_3
- jupyter_client=8.1.0=pyhd8ed1ab_0
- jupyter_core=5.3.0=py310hff52083_0
- keyutils=1.6.1=h166bdaf_0
- kiwisolver=1.4.4=py310hbf28c38_1
- krb5=1.19.3=h3790be6_0
- lame=3.100=h166bdaf_1003
- lcms2=2.15=hfd0df8a_0
- ld_impl_linux-64=2.40=h41732ed_0
- lerc=4.0.0=h27087fc_0
- libbrotlicommon=1.0.9=h166bdaf_8
- libbrotlidec=1.0.9=h166bdaf_8
- libbrotlienc=1.0.9=h166bdaf_8
- libclang=10.0.1=default_hb85057a_2
- libcublas=11.10.3.66=0
- libcufft=10.7.2.124=h4fbf590_0
- libcufile=1.6.0.25=0
- libcurand=10.3.2.56=0
- libcusolver=11.4.0.1=0
- libcusparse=11.7.4.91=0
- libdeflate=1.17=h0b41bf4_0
- libedit=3.1.20191231=he28a2e2_2
- libevent=2.1.12=h8f2d780_0
- libffi=3.4.2=h7f98852_5
- libgcc-ng=12.2.0=h65d4601_19
- libgfortran-ng=12.2.0=h69a702a_19
- libgfortran5=12.2.0=h337968e_19
- libgomp=12.2.0=h65d4601_19
- libiconv=1.17=h166bdaf_0
- libllvm10=10.0.1=he513fc3_3
- libnpp=11.7.4.75=0
- libnvjpeg=11.8.0.2=0
- libpng=1.6.39=h753d276_0
- libpq=12.9=h16c4e8d_3
- libsodium=1.0.18=h36c2ea0_1
- libsqlite=3.40.0=h753d276_0
- libstdcxx-ng=12.2.0=h46fd767_19
- libtiff=4.5.0=h6adf6a1_2
- libuuid=1.41.5=h5eee18b_0
- libwebp=1.2.4=h1daa5a0_1
- libwebp-base=1.2.4=h5eee18b_1
- libxcb=1.15=h7f8727e_0
- libxkbcommon=1.0.1=hfa300c1_0
- libxml2=2.9.14=h74e7548_0
- libxslt=1.1.35=h4e12654_0
- libzlib=1.2.13=h166bdaf_4
- lightning-utilities=0.8.0=pyhd8ed1ab_0
- llvm-openmp=16.0.0=h417c0b6_0
- mako=1.2.4=pyhd8ed1ab_0
- markupsafe=2.1.2=py310h1fa729e_0
- matplotlib=3.5.3=py310h06a4308_0
- matplotlib-base=3.5.3=py310hf590b9c_0
- matplotlib-inline=0.1.6=pyhd8ed1ab_0
- mkl=2021.4.0=h06a4308_640
- mkl-service=2.4.0=py310ha2c4b55_0
- mkl_fft=1.3.1=py310hd6ae3a3_0
- mkl_random=1.2.2=py310h00e6091_0
- mpmath=1.3.0=pyhd8ed1ab_0
- munkres=1.1.4=pyh9f0ad1d_0
- ncurses=6.3=h27087fc_1
- nest-asyncio=1.5.6=pyhd8ed1ab_0
- nettle=3.6=he412f7d_0
- networkx=3.0=pyhd8ed1ab_0
- nspr=4.35=h27087fc_0
- nss=3.89=he45b914_0
- numexpr=2.8.4=py310h8879344_0
- numpy=1.23.5=py310hd5efca6_0
- numpy-base=1.23.5=py310h8e6c178_0
- openh264=2.1.1=h4ff587b_0
- openssl=1.1.1t=h7f8727e_0
- optuna=3.1.0=pyhd8ed1ab_0
- packaging=23.0=pyhd8ed1ab_0
- pandas=1.5.3=py310h1128e8f_0
- parso=0.8.3=pyhd8ed1ab_0
- patsy=0.5.3=pyhd8ed1ab_0
- pcre=8.45=h9c3ff4c_0
- pexpect=4.8.0=pyh1a96a4e_2
- pickleshare=0.7.5=py_1003
- pillow=9.4.0=py310h6a678d5_0
- pip=23.0.1=pyhd8ed1ab_0
- platformdirs=3.2.0=pyhd8ed1ab_0
- plotly=5.9.0=py310h06a4308_0
- ply=3.11=py_1
- pooch=1.7.0=pyha770c72_3
- prompt-toolkit=3.0.38=pyha770c72_0
- prompt_toolkit=3.0.38=hd8ed1ab_0
- psutil=5.9.4=py310h5764c6d_0
- ptyprocess=0.7.0=pyhd3deb0d_0
- pure_eval=0.2.2=pyhd8ed1ab_0
- pycparser=2.21=pyhd8ed1ab_0
- pygments=2.14.0=pyhd8ed1ab_0
- pyopenssl=23.1.1=pyhd8ed1ab_0
- pyparsing=3.0.9=pyhd8ed1ab_0
- pyqt=5.15.7=py310h6a678d5_1
- pysocks=1.7.1=pyha2e5f31_6
- python=3.10.9=h7a1cb2a_0
- python-dateutil=2.8.2=pyhd8ed1ab_0
- python_abi=3.10=2_cp310
- pytorch=2.0.0=py3.10_cuda11.7_cudnn8.5.0_0
- pytorch-cuda=11.7=h778d358_3
- pytorch-lightning=1.9.0=pyhd8ed1ab_1
- pytorch-mutex=1.0=cuda
- pytz=2023.3=pyhd8ed1ab_0
- pyyaml=6.0=py310h5764c6d_5
- pyzmq=25.0.2=py310h059b190_0
- qt-main=5.15.2=h327a75a_7
- qt-webengine=5.15.9=hd2b0992_4
- qtwebkit=5.212=h4eab89a_4
- readline=8.2=h8228510_1
- requests=2.28.2=pyhd8ed1ab_0
- scikit-learn=1.2.0=py310h6a678d5_0
- scipy=1.10.0=py310hd5efca6_0
- setuptools=67.6.1=pyhd8ed1ab_0
- sip=6.6.2=py310hd8f1fbe_0
- six=1.16.0=pyh6c4a22f_0
- sqlalchemy=2.0.7=py310h1fa729e_0
- sqlite=3.40.1=h5082296_0
- stack_data=0.6.2=pyhd8ed1ab_0
- statsmodels=0.13.5=py310ha9d4c09_1
- sympy=1.11.1=pyh04b8f61_3
- tenacity=8.2.2=pyhd8ed1ab_0
- threadpoolctl=3.1.0=pyh8a188c0_0
- tk=8.6.12=h27826a3_0
- toml=0.10.2=pyhd8ed1ab_0
- torchaudio=2.0.0=py310_cu117
- torchmetrics=0.11.4=pyhd8ed1ab_0
- torchtriton=2.0.0=py310
- torchvision=0.15.0=py310_cu117
- tornado=6.2=py310h5764c6d_1
- tqdm=4.65.0=pyhd8ed1ab_1
- traitlets=5.9.0=pyhd8ed1ab_0
- typing-extensions=4.5.0=hd8ed1ab_0
- typing_extensions=4.5.0=pyha770c72_0
- tzdata=2023c=h71feb2d_0
- unicodedata2=15.0.0=py310h5764c6d_0
- urllib3=1.26.15=pyhd8ed1ab_0
- wcwidth=0.2.6=pyhd8ed1ab_0
- wheel=0.40.0=pyhd8ed1ab_0
- xz=5.2.10=h5eee18b_1
- yaml=0.2.5=h7f98852_2
- zeromq=4.3.4=h9c3ff4c_1
- zipp=3.15.0=pyhd8ed1ab_0
- zlib=1.2.13=h166bdaf_4
- zstd=1.5.2=h3eb15da_6
# Packages installed with pip rather than conda. They are nested under the
# `pip:` key so that `pip` maps to this list; left at the same level, `pip`
# would be an empty key and these `==` specs would be read as conda entries.
- pip:
  - absl-py==1.4.0
  - cachetools==5.3.0
  - google-auth==2.17.0
  - google-auth-oauthlib==0.4.6
  - grpcio==1.53.0
  - markdown==3.4.3
  - oauthlib==3.2.2
  - protobuf==4.22.1
  - pyasn1==0.4.8
  - pyasn1-modules==0.2.8
  - pyqt5-sip==12.11.0
  - requests-oauthlib==1.3.1
  - rsa==4.9
  - tensorboard==2.12.0
  - tensorboard-data-server==0.7.0
  - tensorboard-plugin-wit==1.8.1
  - werkzeug==2.2.3
# Absolute path of this environment under the exporting user's home directory;
# machine-specific, so it will not match other systems.
prefix: /home/optimusprime/miniconda3/envs/test-env
Expected behavior
I executed the code below to find optimal hyperparameter values with Optuna, training DeepAR and validating after every epoch. I expected the training to run smoothly on a single GPU (I only have one GPU).
Actual behavior
However, DeepAR training gets stuck at a random epoch.
I could run the same code in Google Colab without errors.
Code to reproduce the problem