fujiki-1emon / t5x


TPUs are not available on TPU VMs esp. multi-host TPU VMs #7

Open fujiki-1emon opened 2 years ago

fujiki-1emon commented 2 years ago

v4-32-1 _1

$ gcloud alpha compute tpus tpu-vm create v4-32-1 --zone us-central2-b --accelerator-type v4-32 --version v2-alpha-tpuv4 --subnetwork=tpusubnet
$ ./scripts/health_check_1.sh

[TpuDevice(id=0, process_index=0, coords=(0,0,0), core_on_chip=0), TpuDevice(id=1, process_index=0, coords=(1,0,0), core_on_chip=0), TpuDevice(id=2, process_index=0, coords=(0,1,0), core_on_chip=0), TpuDevice(id=3, process_index=0, coords=(1,1,0), core_on_chip=0), TpuDevice(id=4, process_index=1, coords=(0,0,1), core_on_chip=0), TpuDevice(id=5, process_index=1, coords=(1,0,1), core_on_chip=0), TpuDevice(id=6, process_index=1, coords=(0,1,1), core_on_chip=0), TpuDevice(id=7, process_index=1, coords=(1,1,1), core_on_chip=0), TpuDevice(id=8, process_index=2, coords=(0,0,2), core_on_chip=0), TpuDevice(id=9, process_index=2, coords=(1,0,2), core_on_chip=0), TpuDevice(id=10, process_index=2, coords=(0,1,2), core_on_chip=0), TpuDevice(id=11, process_index=2, coords=(1,1,2), core_on_chip=0), TpuDevice(id=12, process_index=3, coords=(0,0,3), core_on_chip=0), TpuDevice(id=13, process_index=3, coords=(1,0,3), core_on_chip=0), TpuDevice(id=14, process_index=3, coords=(0,1,3), core_on_chip=0), TpuDevice(id=15, process_index=3, coords=(1,1,3), core_on_chip=0)]
$ ./scripts/health_check.sh
SSH: Attempting to connect to worker 0...
SSH: Attempting to connect to worker 1...
SSH: Attempting to connect to worker 2...
SSH: Attempting to connect to worker 3...
/home/fujiki/tpuv4/bin/python
/home/fujiki/tpuv4/bin/python
/home/fujiki/tpuv4/bin/python
/home/fujiki/tpuv4/bin/python
[CpuDevice(id=0)]
WARNING:absl:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
[CpuDevice(id=0)]
WARNING:absl:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
[CpuDevice(id=0)]
WARNING:absl:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
[CpuDevice(id=0)]
WARNING:absl:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
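For context, health_check.sh is assumed here to be a thin wrapper that runs the same probe on every worker over SSH (the worker prompts and python paths above match that pattern); a minimal sketch under that assumption, with the TPU name and zone taken from the create command above:

#!/bin/bash
# scripts/health_check.sh -- hedged sketch, not necessarily the exact script used here.
# Prints the interpreter path and the JAX device list on every worker of the slice.
TPU_NAME=v4-32-1
ZONE=us-central2-b
gcloud alpha compute tpus tpu-vm ssh "${TPU_NAME}" --zone "${ZONE}" --worker=all \
  --command='source ~/tpuv4/bin/activate; which python; python -c "import jax; print(jax.devices())"'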
fujiki-1emon commented 2 years ago

v4-32-1 _2

$ gcloud alpha compute tpus tpu-vm create v4-32-1 --zone us-central2-b --accelerator-type v4-32 --version v2-alpha-tpuv4 --subnetwork=tpusubnet

Create request issued for: [v4-32-1]
Waiting for operation [projects/trc-tpuv4/locations/us-central2-b/operations/operation-1662614933116-5e823b47c9805-65ba0186-62174bc6] to complete...done.
Created tpu [v4-32-1].
$ ./scripts/setup_project.sh

[TpuDevice(id=0, process_index=0, coords=(0,0,0), core_on_chip=0), TpuDevice(id=1, process_index=0, coords=(1,0,0), core_on_chip=0), TpuDevice(id=2, process_index=0, coords=(0,1,0), core_on_chip=0), TpuDevice(id=3, process_index=0, coords=(1,1,0), core_on_chip=0), TpuDevice(id=4, process_index=1, coords=(0,0,1), core_on_chip=0), TpuDevice(id=5, process_index=1, coords=(1,0,1), core_on_chip=0), TpuDevice(id=6, process_index=1, coords=(0,1,1), core_on_chip=0), TpuDevice(id=7, process_index=1, coords=(1,1,1), core_on_chip=0), TpuDevice(id=8, process_index=2, coords=(0,0,2), core_on_chip=0), TpuDevice(id=9, process_index=2, coords=(1,0,2), core_on_chip=0), TpuDevice(id=10, process_index=2, coords=(0,1,2), core_on_chip=0), TpuDevice(id=11, process_index=2, coords=(1,1,2), core_on_chip=0), TpuDevice(id=12, process_index=3, coords=(0,0,3), core_on_chip=0), TpuDevice(id=13, process_index=3, coords=(1,0,3), core_on_chip=0), TpuDevice(id=14, process_index=3, coords=(0,1,3), core_on_chip=0), TpuDevice(id=15, process_index=3, coords=(1,1,3), core_on_chip=0)]
$ ./scripts/run_multi-host.sh

[TpuDevice(id=0, process_index=0, coords=(0,0,0), core_on_chip=0), TpuDevice(id=1, process_index=0, coords=(1,0,0), core_on_chip=0), TpuDevice(id=2, process_index=0, coords=(0,1,0), core_on_chip=0), TpuDevice(id=3, process_index=0, coords=(1,1,0), core_on_chip=0), TpuDevice(id=4, process_index=1, coords=(0,0,1), core_on_chip=0), TpuDevice(id=5, process_index=1, coords=(1,0,1), core_on_chip=0), TpuDevice(id=6, process_index=1, coords=(0,1,1), core_on_chip=0), TpuDevice(id=7, process_index=1, coords=(1,1,1), core_on_chip=0), TpuDevice(id=8, process_index=2, coords=(0,0,2), core_on_chip=0), TpuDevice(id=9, process_index=2, coords=(1,0,2), core_on_chip=0), TpuDevice(id=10, process_index=2, coords=(0,1,2), core_on_chip=0), TpuDevice(id=11, process_index=2, coords=(1,1,2), core_on_chip=0), TpuDevice(id=12, process_index=3, coords=(0,0,3), core_on_chip=0), TpuDevice(id=13, process_index=3, coords=(1,0,3), core_on_chip=0), TpuDevice(id=14, process_index=3, coords=(0,1,3), core_on_chip=0), TpuDevice(id=15, process_index=3, coords=(1,1,3), core_on_chip=0)]
chmod: cannot access 'runs/run_scalable_t5_1_1_xl_obpc.sh': No such file or directory
##### Command execution on worker 2 failed with exit status 1. Continuing.
chmod: cannot access 'runs/run_scalable_t5_1_1_xl_obpc.sh': No such file or directory
##### Command execution on worker 0 failed with exit status 1. Continuing.
chmod: cannot access 'runs/run_scalable_t5_1_1_xl_obpc.sh': No such file or directory
##### Command execution on worker 1 failed with exit status 1. Continuing.
chmod: cannot access 'runs/run_scalable_t5_1_1_xl_obpc.sh': No such file or directory
##### Command execution on worker 3 failed with exit status 1. Continuing.
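The chmod failures just mean runs/run_scalable_t5_1_1_xl_obpc.sh was never copied to the workers: each host has its own filesystem, so any file a remote command touches has to be pushed to every worker first (which is what scp_to_all_workers.sh does in the later attempt). A hedged one-liner for that copy step:

# Hedged sketch: push the run script to every worker before invoking it remotely.
gcloud alpha compute tpus tpu-vm scp runs/run_scalable_t5_1_1_xl_obpc.sh \
  v4-32-1:~/t5x/runs/ --zone us-central2-b --worker=all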

15:22

$ ./scripts/health_check.sh
SSH: Attempting to connect to worker 0...
SSH: Attempting to connect to worker 1...
SSH: Attempting to connect to worker 2...
SSH: Attempting to connect to worker 3...
/home/fujiki/tpuv4/bin/python
/home/fujiki/tpuv4/bin/python
/home/fujiki/tpuv4/bin/python
/home/fujiki/tpuv4/bin/python
[CpuDevice(id=0)]
WARNING:absl:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
[CpuDevice(id=0)]
WARNING:absl:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
[CpuDevice(id=0)]
WARNING:absl:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
[CpuDevice(id=0)]
WARNING:absl:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
[CpuDevice(id=0)]
WARNING:absl:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
WARNING:absl:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
[CpuDevice(id=0)]
WARNING:absl:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
[CpuDevice(id=0)]
WARNING:absl:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
[CpuDevice(id=0)]
fujiki-1emon commented 2 years ago

v4-32-1 _3

$ gcloud alpha compute tpus tpu-vm create v4-32-1 --zone us-central2-b --accelerator-type v4-32 --version v2-alpha-tpuv4 --subnetwork=tpusubnet
Create request issued for: [v4-32-1]
Waiting for operation [projects/trc-tpuv4/locations/us-central2-b/operations/operation-1662622619046-5e8257e9a92fc-122fe08f-36cdbf87] to complete...done.
Created tpu [v4-32-1].
$ ./scripts/setup_project.sh

[TpuDevice(id=0, process_index=0, coords=(0,0,0), core_on_chip=0), TpuDevice(id=1, process_index=0, coords=(1,0,0), core_on_chip=0), TpuDevice(id=2, process_index=0, coords=(0,1,0), core_on_chip=0), TpuDevice(id=3, process_index=0, coords=(1,1,0), core_on_chip=0), TpuDevice(id=4, process_index=1, coords=(0,0,1), core_on_chip=0), TpuDevice(id=5, process_index=1, coords=(1,0,1), core_on_chip=0), TpuDevice(id=6, process_index=1, coords=(0,1,1), core_on_chip=0), TpuDevice(id=7, process_index=1, coords=(1,1,1), core_on_chip=0), TpuDevice(id=8, process_index=2, coords=(0,0,2), core_on_chip=0), TpuDevice(id=9, process_index=2, coords=(1,0,2), core_on_chip=0), TpuDevice(id=10, process_index=2, coords=(0,1,2), core_on_chip=0), TpuDevice(id=11, process_index=2, coords=(1,1,2), core_on_chip=0), TpuDevice(id=12, process_index=3, coords=(0,0,3), core_on_chip=0), TpuDevice(id=13, process_index=3, coords=(1,0,3), core_on_chip=0), TpuDevice(id=14, process_index=3, coords=(0,1,3), core_on_chip=0), TpuDevice(id=15, process_index=3, coords=(1,1,3), core_on_chip=0)]
$ ./scripts/scp_to_all_workers.sh

SCP: Attempting to connect to worker 0...
SCP: Attempting to connect to worker 1...
SCP: Attempting to connect to worker 2...
SCP: Attempting to connect to worker 3...
scalable_t5_1_1_xl_obpc_pretrain.gin                                                                                                       100%  473     3.1KB/s   00:00

scalable_t5_1_1_xl_obpc_pretrain.gin                                                                                                       100%  473     3.1KB/s   00:00
scalable_t5_1_1_xl_obpc_pretrain.gin                                                                                                       100%  473     3.2KB/s   00:00
SCP: Attempting to connect to worker 0...
SCP: Attempting to connect to worker 1...
SCP: Attempting to connect to worker 2...
SCP: Attempting to connect to worker 3...
pretrain.gin                                                                                                                               100% 3098    20.1KB/s   00:00
pretrain.gin                                                                                                                               100% 3098    20.1KB/s   00:00
pretrain.gin                                                                                                                               100% 3098    20.1KB/s   00:00
pretrain.gin                                                                                                                               100% 3098    20.5KB/s   00:00
SCP: Attempting to connect to worker 0...
SCP: Attempting to connect to worker 1...
SCP: Attempting to connect to worker 2...
SCP: Attempting to connect to worker 3...
run_scalable_t5_1_1_xl_obpc.sh                                                                                                             100%  752     5.0KB/s   00:00
run_scalable_t5_1_1_xl_obpc.sh                                                                                                             100%  752     5.0KB/s   00:00
run_scalable_t5_1_1_xl_obpc.sh                                                                                                             100%  752     5.0KB/s   00:00
run_scalable_t5_1_1_xl_obpc.sh                                                                                                             100%  752     5.1KB/s   00:00
$ ./scripts/health_check.sh

[TpuDevice(id=12, process_index=3, coords=(0,0,3), core_on_chip=0), TpuDevice(id=13, process_index=3, coords=(1,0,3), core_on_chip=0), TpuDevice(id=14, process_index=3, coords=(0,1,3), core_on_chip=0), TpuDevice(id=15, process_index=3, coords=(1,1,3), core_on_chip=0)]
[TpuDevice(id=4, process_index=1, coords=(0,0,1), core_on_chip=0), TpuDevice(id=5, process_index=1, coords=(1,0,1), core_on_chip=0), TpuDevice(id=6, process_index=1, coords=(0,1,1), core_on_chip=0), TpuDevice(id=7, process_index=1, coords=(1,1,1), core_on_chip=0)]
[TpuDevice(id=0, process_index=0, coords=(0,0,0), core_on_chip=0), TpuDevice(id=1, process_index=0, coords=(1,0,0), core_on_chip=0), TpuDevice(id=2, process_index=0, coords=(0,1,0), core_on_chip=0), TpuDevice(id=3, process_index=0, coords=(1,1,0), core_on_chip=0)]
[TpuDevice(id=8, process_index=2, coords=(0,0,2), core_on_chip=0), TpuDevice(id=9, process_index=2, coords=(1,0,2), core_on_chip=0), TpuDevice(id=10, process_index=2, coords=(0,1,2), core_on_chip=0), TpuDevice(id=11, process_index=2, coords=(1,1,2), core_on_chip=0)]
$ ./scripts/run_all_workers.sh

logs/20220908-165340.run_scalable_t5_1_1_xl_obpc.sh.log
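Judging by the log path printed above, run_all_workers.sh starts the run script on every worker in the background and redirects its output into a timestamped log file; a minimal sketch under that assumption:

#!/bin/bash
# scripts/run_all_workers.sh -- hedged sketch of the assumed behaviour.
TPU_NAME=v4-32-1
ZONE=us-central2-b
RUN_SCRIPT=runs/run_scalable_t5_1_1_xl_obpc.sh
LOG="logs/$(date +%Y%m%d-%H%M%S).$(basename "${RUN_SCRIPT}").log"
echo "${LOG}"
gcloud alpha compute tpus tpu-vm ssh "${TPU_NAME}" --zone "${ZONE}" --worker=all \
  --command="cd ~/t5x && chmod +x ${RUN_SCRIPT} && nohup ./${RUN_SCRIPT} >> ${LOG} 2>&1 &"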
$ ./scripts/health_check.sh

SSH: Attempting to connect to worker 0...
SSH: Attempting to connect to worker 1...
SSH: Attempting to connect to worker 2...
SSH: Attempting to connect to worker 3...
/home/fujiki/tpuv4/bin/python
/home/fujiki/tpuv4/bin/python
/home/fujiki/tpuv4/bin/python
/home/fujiki/tpuv4/bin/python
[CpuDevice(id=0)]
[CpuDevice(id=0)]
WARNING:absl:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
WARNING:absl:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
[CpuDevice(id=0)]
WARNING:absl:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
[CpuDevice(id=0)]
WARNING:absl:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
[CpuDevice(id=0)]
WARNING:absl:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
[CpuDevice(id=0)]
WARNING:absl:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
[CpuDevice(id=0)]
WARNING:absl:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
[CpuDevice(id=0)]
WARNING:absl:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
$ source tpuv4/bin/activate
$ python -c "import jax; print(jax.devices())"
WARNING:absl:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
[CpuDevice(id=0)]
$ python -c "import jax; print(jax.local_devices())"
WARNING:absl:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
[CpuDevice(id=0)]
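One plausible reading of the CPU fallback here: the training job started by run_all_workers.sh is still holding the TPU chips, and libtpu only lets one process per host own them, so a second interactive python falls back to CPU even though the hardware is fine. A common way to check what is holding the devices on a TPU VM (an assumption about the environment, not something shown in this log):

# On TPU VMs the chips appear as /dev/accel0..3; any process holding them open
# blocks a new JAX process from claiming the TPU.
sudo lsof -w /dev/accel0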
fujiki-1emon commented 2 years ago

v4-8-2

t1v-n-70eaccf2-w-0:~$ python -c "import jax; print(jax.devices())"

WARNING:absl:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
[CpuDevice(id=0)]
$ TF_CPP_MIN_LOG_LEVEL=0 python -c "import jax; print(jax.devices())"

2022-09-13 02:04:18.529527: I external/org_tensorflow/tensorflow/compiler/xla/service/service.cc:173] XLA service 0x2f47200 initialized for platform Interpreter (this does not guarantee that XLA will be used). Devices:
2022-09-13 02:04:18.529575: I external/org_tensorflow/tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (0): Interpreter, <undefined>
2022-09-13 02:04:18.591655: I external/org_tensorflow/tensorflow/compiler/xla/pjrt/tfrt_cpu_pjrt_client.cc:181] TfrtCpuClient created.
2022-09-13 02:04:18.592543: I external/org_tensorflow/tensorflow/core/tpu/tpu_initializer_helper.cc:262] Libtpu path is: /home/fujiki/tpuv4/lib/python3.8/site-packages/libtpu/libtpu.so
WARNING:absl:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
[CpuDevice(id=0)]
2022-09-13 02:04:18.702191: I external/org_tensorflow/tensorflow/compiler/xla/pjrt/tfrt_cpu_pjrt_client.cc:184] TfrtCpuClient destroyed.
./scripts/setup_project.sh

Successfully built clu ml_collections rouge-score future promise
Installing collected packages: sentencepiece, pytz, gin-config, dataclasses, cached_property, torch, toml, threadpoolctl, tensorstore, tensorflow-hub, tabulate, promise, portalocker, lxml, joblib, googleapis-common-protos, future, editdistance, dill, contextlib2, click, babel, tensorflow-metadata, scikit-learn, sacrebleu, pandas, nltk, ml_collections, mesh-tensorflow, tfds-nightly, tensorflow_datasets, rouge-score, tensorflow-text, seqio-nightly, seqio, orbax, clu, t5, t5x
  Running setup.py develop for t5x
Successfully installed babel-2.10.3 cached_property-1.5.2 click-8.1.3 clu-0.0.7 contextlib2-21.6.0 dataclasses-0.6 dill-0.3.5.1 editdistance-0.6.0 future-0.18.2 gin-config-0.5.0 googleapis-common-protos-1.56.4 joblib-1.1.0 lxml-4.9.1 mesh-tensorflow-0.1.21 ml_collections-0.1.1 nltk-3.7 orbax-0.0.9 pandas-1.4.4 portalocker-2.5.1 promise-2.3 pytz-2022.2.1 rouge-score-0.1.2 sacrebleu-2.2.0 scikit-learn-1.1.2 sentencepiece-0.1.97 seqio-0.0.10 seqio-nightly-0.0.10.dev20220912 t5-0.9.3 t5x-0.0.0 tabulate-0.8.10 tensorflow-hub-0.12.0 tensorflow-metadata-1.10.0 tensorflow-text-2.10.0 tensorflow_datasets-4.6.0 tensorstore-0.1.23 tfds-nightly-4.6.0.dev202209130045 threadpoolctl-3.1.0 toml-0.10.2 torch-1.12.1
[TpuDevice(id=0, process_index=0, coords=(0,0,0), core_on_chip=0), TpuDevice(id=1, process_index=0, coords=(1,0,0), core_on_chip=0), TpuDevice(id=2, process_index=0, coords=(0,1,0), core_on_chip=0), TpuDevice(id=3, process_index=0, coords=(1,1,0), core_on_chip=0)]
t1v-n-ca1a6b17-w-0:~$ python -c "import jax; print(jax.devices())"

[TpuDevice(id=0, process_index=0, coords=(0,0,0), core_on_chip=0), TpuDevice(id=1, process_index=0, coords=(1,0,0), core_on_chip=0), TpuDevice(id=2, process_index=0, coords=(0,1,0), core_on_chip=0), TpuDevice(id=3, process_index=0, coords=(1,1,0), core_on_chip=0)]
$ ./scripts/scp_to_all_workers.sh

SCP: Attempting to connect to worker 0...
scalable_t5_1_1_xl_obpc_pretrain.gin                                                                                                       100%  473     3.0KB/s   00:00
SCP: Attempting to connect to worker 0...
t5_1_1_large_obpc_pretrain.gin                                                                                                             100%  481     3.0KB/s   00:00
SCP: Attempting to connect to worker 0...
obpc_pretrain.gin                                                                                                                          100% 3093    19.2KB/s   00:00
SCP: Attempting to connect to worker 0...
pretrain.gin                                                                                                                               100% 3098    19.5KB/s   00:00
SCP: Attempting to connect to worker 0...
run_scalable_t5_1_1_xl_obpc.sh                                                                                                             100%  752     4.7KB/s   00:00
SCP: Attempting to connect to worker 0...
resume_t5_1_1_large_obpc.sh
$ python -c "import jax; print(jax.devices())"
[TpuDevice(id=0, process_index=0, coords=(0,0,0), core_on_chip=0), TpuDevice(id=1, process_index=0, coords=(1,0,0), core_on_chip=0), TpuDevice(id=2, process_index=0, coords=(0,1,0), core_on_chip=0), TpuDevice(id=3, process_index=0, coords=(1,1,0), core_on_chip=0)]
$ ./scripts/run_all_workers.sh

SSH: Attempting to connect to worker 0...
/home/fujiki/tpuv4/bin/python
2022-09-13 02:25:21.154497: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-09-13 02:25:21.294113: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-09-13 02:25:21.294174: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-09-13 02:25:21.317087: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-09-13 02:25:21.847464: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2022-09-13 02:25:21.847543: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory
2022-09-13 02:25:21.847551: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
True
[TpuDevice(id=0, process_index=0, coords=(0,0,0), core_on_chip=0), TpuDevice(id=1, process_index=0, coords=(1,0,0), core_on_chip=0), TpuDevice(id=2, process_index=0, coords=(0,1,0), core_on_chip=0), TpuDevice(id=3, process_index=0, coords=(1,1,0), core_on_chip=0)]
Mac-4:t5x fujiki$ vim ./scripts/run_all_workers.sh
Mac-4:t5x fujiki$ ./scripts/run_all_workers.sh
SSH: Attempting to connect to worker 0...
/home/fujiki/tpuv4/bin/python
2022-09-13 02:27:50.180733: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-09-13 02:27:50.323396: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-09-13 02:27:50.323451: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-09-13 02:27:50.346974: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-09-13 02:27:50.879483: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2022-09-13 02:27:50.879556: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory
2022-09-13 02:27:50.879564: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
True
[TpuDevice(id=0, process_index=0, coords=(0,0,0), core_on_chip=0), TpuDevice(id=1, process_index=0, coords=(1,0,0), core_on_chip=0), TpuDevice(id=2, process_index=0, coords=(0,1,0), core_on_chip=0), TpuDevice(id=3, process_index=0, coords=(1,1,0), core_on_chip=0)]

I0913 02:49:24.169922 140196151319616 trainer.py:490] Training: step 949
I0913 02:49:34.728353 140196151319616 trainer.py:490] Training: step 961
I0913 02:49:45.284352 140196151319616 trainer.py:490] Training: step 973
I0913 02:49:55.840657 140196151319616 trainer.py:490] Training: step 985
I0913 02:50:06.464937 140196151319616 trainer.py:490] Training: step 997
I0913 02:50:09.054086 140196151319616 train.py:611] END Train loop.
I0913 02:50:09.054351 140196151319616 train.py:470] Compiling training eval loop.
I0913 02:50:23.067978 140104560203520 logging_writer.py:48] [1000] collection=train accuracy=0.239925, cross_ent_loss=86212.368000, cross_ent_loss_per_all_target_tokens=1.315496, learning_rate=0.010000, learning_rate/current=0.009999999776482582, loss=86385.776000, loss_per_all_target_tokens=1.318142, loss_per_nonpadding_target_token=5.961502, nonpadding_fraction=0.221109, timing/seconds=880.827219, timing/seqs=128000, timing/seqs_per_second=145.317943, timing/seqs_per_second_per_core=36.329486, timing/steps_per_second=1.135296, timing/target_tokens_per_second=74402.787026, timing/target_tokens_per_second_per_core=18600.696756, z_loss=173.497328, z_loss_per_all_target_tokens=0.002647
I0913 02:51:01.171211 140103504471808 logging_writer.py:48] [1000] collection=training_eval/obpc_span_corruption timing/compilation_seconds=48.667511
I0913 02:51:01.172631 140196151319616 train.py:475] Computing training evaluation metrics.
I0913 02:51:01.761483 140196151319616 trainer.py:532] Evaluating: obpc_span_corruption.

$ ./scripts/kill_all_workers.sh
SSH: Attempting to connect to worker 0...
98420
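kill_all_workers.sh apparently SSHes into each worker, finds the training process, kills it, and echoes the PID it found (the bare number above); a hedged sketch:

#!/bin/bash
# scripts/kill_all_workers.sh -- hedged sketch; the PID echo matches the output above.
TPU_NAME=v4-8-2   # hypothetical resource name for this run
ZONE=us-central2-b
gcloud alpha compute tpus tpu-vm ssh "${TPU_NAME}" --zone "${ZONE}" --worker=all \
  --command='pgrep -f t5x/train.py; pkill -f t5x/train.py'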

fujiki-1emon commented 2 years ago

v4-32-1

$ ./scripts/run_all_workers.sh

 process_index=1, coords=(1,0,1), core_on_chip=0), TpuDevice(id=6, process_index=1, coords=(0,1,1), core_on_chip=0), TpuDevice(id=7, process_index=1, coords=(1,1,1), core_on_chip=0), TpuDevice(id=8, process_index=2, coords=(0,0,2), core_on_chip=0), TpuDevice(id=9, process_index=2, coords=(1,0,2), core_on_chip=0), TpuDevice(id=10, process_index=2, coords=(0,1,2), core_on_chip=0), TpuDevice(id=11, process_index=2, coords=(1,1,2), core_on_chip=0), TpuDevice(id=12, process_index=3, coords=(0,0,3), core_on_chip=0), TpuDevice(id=13, process_index=3, coords=(1,0,3), core_on_chip=0), TpuDevice(id=14, process_index=3, coords=(0,1,3), core_on_chip=0), TpuDevice(id=15, process_index=3, coords=(1,1,3), core_on_chip=0)]
[TpuDevice(id=0, process_index=0, coords=(0,0,0), core_on_chip=0), TpuDevice(id=1, process_index=0, coords=(1,0,0), core_on_chip=0), TpuDevice(id=2, process_index=0, coords=(0,1,0), core_on_chip=0), TpuDevice(id=3, process_index=0, coords=(1,1,0), core_on_chip=0), TpuDevice(id=4, process_index=1, coords=(0,0,1), core_on_chip=0), TpuDevice(id=5, process_index=1, coords=(1,0,1), core_on_chip=0), TpuDevice(id=6, process_index=1, coords=(0,1,1), core_on_chip=0), TpuDevice(id=7, process_index=1, coords=(1,1,1), core_on_chip=0), TpuDevice(id=8, process_index=2, coords=(0,0,2), core_on_chip=0), TpuDevice(id=9, process_index=2, coords=(1,0,2), core_on_chip=0), TpuDevice(id=10, process_index=2, coords=(0,1,2), core_on_chip=0), TpuDevice(id=11, process_index=2, coords=(1,1,2), core_on_chip=0), TpuDevice(id=12, process_index=3, coords=(0,0,3), core_on_chip=0), TpuDevice(id=13, process_index=3, coords=(1,0,3), core_on_chip=0), TpuDevice(id=14, process_index=3, coords=(0,1,3), core_on_chip=0), TpuDevice(id=15, process_index=3, coords=(1,1,3), core_on_chip=0)]
[TpuDevice(id=0, process_index=0, coords=(0,0,0), core_on_chip=0), TpuDevice(id=1, process_index=0, coords=(1,0,0), core_on_chip=0), TpuDevice(id=2, process_index=0, coords=(0,1,0), core_on_chip=0), TpuDevice(id=3, process_index=0, coords=(1,1,0), core_on_chip=0), TpuDevice(id=4, process_index=1, coords=(0,0,1), core_on_chip=0), TpuDevice(id=5, process_index=1, coords=(1,0,1), core_on_chip=0), TpuDevice(id=6, process_index=1, coords=(0,1,1), core_on_chip=0), TpuDevice(id=7, process_index=1, coords=(1,1,1), core_on_chip=0), TpuDevice(id=8, process_index=2, coords=(0,0,2), core_on_chip=0), TpuDevice(id=9, process_index=2, coords=(1,0,2), core_on_chip=0), TpuDevice(id=10, process_index=2, coords=(0,1,2), core_on_chip=0), TpuDevice(id=11, process_index=2, coords=(1,1,2), core_on_chip=0), TpuDevice(id=12, process_index=3, coords=(0,0,3), core_on_chip=0), TpuDevice(id=13, process_index=3, coords=(1,0,3), core_on_chip=0), TpuDevice(id=14, process_index=3, coords=(0,1,3), core_on_chip=0), TpuDevice(id=15, process_index=3, coords=(1,1,3), core_on_chip=0)]
[TpuDevice(id=0, process_index=0, coords=(0,0,0), core_on_chip=0), TpuDevice(id=1, process_index=0, coords=(1,0,0), core_on_chip=0), TpuDevice(id=2, process_index=0, coords=(0,1,0), core_on_chip=0), TpuDevice(id=3, process_index=0, coords=(1,1,0), core_on_chip=0), TpuDevice(id=4, process_index=1, coords=(0,0,1), core_on_chip=0), TpuDevice(id=5, process_index=1, coords=(1,0,1), core_on_chip=0), TpuDevice(id=6, process_index=1, coords=(0,1,1), core_on_chip=0), TpuDevice(id=7, process_index=1, coords=(1,1,1), core_on_chip=0), TpuDevice(id=8, process_index=2, coords=(0,0,2), core_on_chip=0), TpuDevice(id=9, process_index=2, coords=(1,0,2), core_on_chip=0), TpuDevice(id=10, process_index=2, coords=(0,1,2), core_on_chip=0), TpuDevice(id=11, process_index=2, coords=(1,1,2), core_on_chip=0), TpuDevice(id=12, process_index=3, coords=(0,0,3), core_on_chip=0), TpuDevice(id=13, process_index=3, coords=(1,0,3), core_on_chip=0), TpuDevice(id=14, process_index=3, coords=(0,1,3), core_on_chip=0), TpuDevice(id=15, process_index=3, coords=(1,1,3), core_on_chip=0)]
^C

Command killed by keyboard interrupt
$ ./scripts/kill_all_workers.sh
SSH: Attempting to connect to worker 0...
SSH: Attempting to connect to worker 1...
SSH: Attempting to connect to worker 2...
SSH: Attempting to connect to worker 3...
240317
239960
241111
237711
t1v-n-61518497-w-0:~$ python -c "import jax; print(jax.devices())"
WARNING:absl:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
[CpuDevice(id=0)]
t1v-n-61518497-w-0:~$ TF_CPP_MIN_LOG_LEVEL=0 python -c "import jax; print(jax.devices())"
2022-09-13 03:14:35.783824: I external/org_tensorflow/tensorflow/compiler/xla/service/service.cc:173] XLA service 0x337b200 initialized for platform Interpreter (this does not guarantee that XLA will be used). Devices:
2022-09-13 03:14:35.783866: I external/org_tensorflow/tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (0): Interpreter, <undefined>
2022-09-13 03:14:35.843691: I external/org_tensorflow/tensorflow/compiler/xla/pjrt/tfrt_cpu_pjrt_client.cc:181] TfrtCpuClient created.
2022-09-13 03:14:35.844637: I external/org_tensorflow/tensorflow/core/tpu/tpu_initializer_helper.cc:262] Libtpu path is: /home/fujiki/tpuv4/lib/python3.8/site-packages/libtpu/libtpu.so
WARNING:absl:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
[CpuDevice(id=0)]
2022-09-13 03:14:35.946403: I external/org_tensorflow/tensorflow/compiler/xla/pjrt/tfrt_cpu_pjrt_client.cc:184] TfrtCpuClient destroyed.
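A likely explanation for the CPU fallback that persists after kill_all_workers.sh: when the training processes are killed uncleanly, the TPU can stay claimed, either by a leftover child process or by a stale libtpu lock file, and every new JAX process then initializes CPU-only. Two hedged checks to run on each worker (the lock-file path is the usual libtpu default, not something printed in this log):

sudo lsof -w /dev/accel0          # is some process still holding the chips?
sudo rm -f /tmp/libtpu_lockfile   # stale lock left behind by a killed libtpu process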
fujiki-1emon commented 2 years ago

v4-32-1

Successfully installed babel-2.10.3 cached_property-1.5.2 click-8.1.3 clu-0.0.7 contextlib2-21.6.0 dataclasses-0.6 dill-0.3.5.1 editdistance-0.6.0 future-0.18.2 gin-config-0.5.0 googleapis-common-protos-1.56.4 joblib-1.1.0 lxml-4.9.1 mesh-tensorflow-0.1.21 ml_collections-0.1.1 nltk-3.7 orbax-0.0.9 pandas-1.4.4 portalocker-2.5.1 promise-2.3 pytz-2022.2.1 rouge-score-0.1.2 sacrebleu-2.2.0 scikit-learn-1.1.2 sentencepiece-0.1.97 seqio-0.0.10 seqio-nightly-0.0.10.dev20220912 t5-0.9.3 t5x-0.0.0 tabulate-0.8.10 tensorflow-hub-0.12.0 tensorflow-metadata-1.10.0 tensorflow-text-2.10.0 tensorflow_datasets-4.6.0 tensorstore-0.1.23 tfds-nightly-4.6.0.dev202209130045 threadpoolctl-3.1.0 toml-0.10.2 torch-1.12.1
[TpuDevice(id=0, process_index=0, coords=(0,0,0), core_on_chip=0), TpuDevice(id=1, process_index=0, coords=(1,0,0), core_on_chip=0), TpuDevice(id=2, process_index=0, coords=(0,1,0), core_on_chip=0), TpuDevice(id=3, process_index=0, coords=(1,1,0), core_on_chip=0), TpuDevice(id=4, process_index=1, coords=(0,0,1), core_on_chip=0), TpuDevice(id=5, process_index=1, coords=(1,0,1), core_on_chip=0), TpuDevice(id=6, process_index=1, coords=(0,1,1), core_on_chip=0), TpuDevice(id=7, process_index=1, coords=(1,1,1), core_on_chip=0), TpuDevice(id=8, process_index=2, coords=(0,0,2), core_on_chip=0), TpuDevice(id=9, process_index=2, coords=(1,0,2), core_on_chip=0), TpuDevice(id=10, process_index=2, coords=(0,1,2), core_on_chip=0), TpuDevice(id=11, process_index=2, coords=(1,1,2), core_on_chip=0), TpuDevice(id=12, process_index=3, coords=(0,0,3), core_on_chip=0), TpuDevice(id=13, process_index=3, coords=(1,0,3), core_on_chip=0), TpuDevice(id=14, process_index=3, coords=(0,1,3), core_on_chip=0), TpuDevice(id=15, process_index=3, coords=(1,1,3), core_on_chip=0)]
[TpuDevice(id=0, process_index=0, coords=(0,0,0), core_on_chip=0), TpuDevice(id=1, process_index=0, coords=(1,0,0), core_on_chip=0), TpuDevice(id=2, process_index=0, coords=(0,1,0), core_on_chip=0), TpuDevice(id=3, process_index=0, coords=(1,1,0), core_on_chip=0), TpuDevice(id=4, process_index=1, coords=(0,0,1), core_on_chip=0), TpuDevice(id=5, process_index=1, coords=(1,0,1), core_on_chip=0), TpuDevice(id=6, process_index=1, coords=(0,1,1), core_on_chip=0), TpuDevice(id=7, process_index=1, coords=(1,1,1), core_on_chip=0), TpuDevice(id=8, process_index=2, coords=(0,0,2), core_on_chip=0), TpuDevice(id=9, process_index=2, coords=(1,0,2), core_on_chip=0), TpuDevice(id=10, process_index=2, coords=(0,1,2), core_on_chip=0), TpuDevice(id=11, process_index=2, coords=(1,1,2), core_on_chip=0), TpuDevice(id=12, process_index=3, coords=(0,0,3), core_on_chip=0), TpuDevice(id=13, process_index=3, coords=(1,0,3), core_on_chip=0), TpuDevice(id=14, process_index=3, coords=(0,1,3), core_on_chip=0), TpuDevice(id=15, process_index=3, coords=(1,1,3), core_on_chip=0)]
[TpuDevice(id=0, process_index=0, coords=(0,0,0), core_on_chip=0), TpuDevice(id=1, process_index=0, coords=(1,0,0), core_on_chip=0), TpuDevice(id=2, process_index=0, coords=(0,1,0), core_on_chip=0), TpuDevice(id=3, process_index=0, coords=(1,1,0), core_on_chip=0), TpuDevice(id=4, process_index=1, coords=(0,0,1), core_on_chip=0), TpuDevice(id=5, process_index=1, coords=(1,0,1), core_on_chip=0), TpuDevice(id=6, process_index=1, coords=(0,1,1), core_on_chip=0), TpuDevice(id=7, process_index=1, coords=(1,1,1), core_on_chip=0), TpuDevice(id=8, process_index=2, coords=(0,0,2), core_on_chip=0), TpuDevice(id=9, process_index=2, coords=(1,0,2), core_on_chip=0), TpuDevice(id=10, process_index=2, coords=(0,1,2), core_on_chip=0), TpuDevice(id=11, process_index=2, coords=(1,1,2), core_on_chip=0), TpuDevice(id=12, process_index=3, coords=(0,0,3), core_on_chip=0), TpuDevice(id=13, process_index=3, coords=(1,0,3), core_on_chip=0), TpuDevice(id=14, process_index=3, coords=(0,1,3), core_on_chip=0), TpuDevice(id=15, process_index=3, coords=(1,1,3), core_on_chip=0)]
[TpuDevice(id=0, process_index=0, coords=(0,0,0), core_on_chip=0), TpuDevice(id=1, process_index=0, coords=(1,0,0), core_on_chip=0), TpuDevice(id=2, process_index=0, coords=(0,1,0), core_on_chip=0), TpuDevice(id=3, process_index=0, coords=(1,1,0), core_on_chip=0), TpuDevice(id=4, process_index=1, coords=(0,0,1), core_on_chip=0), TpuDevice(id=5, process_index=1, coords=(1,0,1), core_on_chip=0), TpuDevice(id=6, process_index=1, coords=(0,1,1), core_on_chip=0), TpuDevice(id=7, process_index=1, coords=(1,1,1), core_on_chip=0), TpuDevice(id=8, process_index=2, coords=(0,0,2), core_on_chip=0), TpuDevice(id=9, process_index=2, coords=(1,0,2), core_on_chip=0), TpuDevice(id=10, process_index=2, coords=(0,1,2), core_on_chip=0), TpuDevice(id=11, process_index=2, coords=(1,1,2), core_on_chip=0), TpuDevice(id=12, process_index=3, coords=(0,0,3), core_on_chip=0), TpuDevice(id=13, process_index=3, coords=(1,0,3), core_on_chip=0), TpuDevice(id=14, process_index=3, coords=(0,1,3), core_on_chip=0), TpuDevice(id=15, process_index=3, coords=(1,1,3), core_on_chip=0)]

$ ./scripts/run_all_workers.sh

[TpuDevice(id=0, process_index=0, coords=(0,0,0), core_on_chip=0), TpuDevice(id=1, process_index=0, coords=(1,0,0), core_on_chip=0), TpuDevice(id=2, process_index=0, coords=(0,1,0), core_on_chip=0), TpuDevice(id=3, process_index=0, coords=(1,1,0), core_on_chip=0), TpuDevice(id=4, process_index=1, coords=(0,0,1), core_on_chip=0), TpuDevice(id=5, process_index=1, coords=(1,0,1), core_on_chip=0), TpuDevice(id=6, process_index=1, coords=(0,1,1), core_on_chip=0), TpuDevice(id=7, process_index=1, coords=(1,1,1), core_on_chip=0), TpuDevice(id=8, process_index=2, coords=(0,0,2), core_on_chip=0), TpuDevice(id=9, process_index=2, coords=(1,0,2), core_on_chip=0), TpuDevice(id=10, process_index=2, coords=(0,1,2), core_on_chip=0), TpuDevice(id=11, process_index=2, coords=(1,1,2), core_on_chip=0), TpuDevice(id=12, process_index=3, coords=(0,0,3), core_on_chip=0), TpuDevice(id=13, process_index=3, coords=(1,0,3), core_on_chip=0), TpuDevice(id=14, process_index=3, coords=(0,1,3), core_on_chip=0), TpuDevice(id=15, process_index=3, coords=(1,1,3), core_on_chip=0)]
[TpuDevice(id=0, process_index=0, coords=(0,0,0), core_on_chip=0), TpuDevice(id=1, process_index=0, coords=(1,0,0), core_on_chip=0), TpuDevice(id=2, process_index=0, coords=(0,1,0), core_on_chip=0), TpuDevice(id=3, process_index=0, coords=(1,1,0), core_on_chip=0), TpuDevice(id=4, process_index=1, coords=(0,0,1), core_on_chip=0), TpuDevice(id=5, process_index=1, coords=(1,0,1), core_on_chip=0), TpuDevice(id=6, process_index=1, coords=(0,1,1), core_on_chip=0), TpuDevice(id=7, process_index=1, coords=(1,1,1), core_on_chip=0), TpuDevice(id=8, process_index=2, coords=(0,0,2), core_on_chip=0), TpuDevice(id=9, process_index=2, coords=(1,0,2), core_on_chip=0), TpuDevice(id=10, process_index=2, coords=(0,1,2), core_on_chip=0), TpuDevice(id=11, process_index=2, coords=(1,1,2), core_on_chip=0), TpuDevice(id=12, process_index=3, coords=(0,0,3), core_on_chip=0), TpuDevice(id=13, process_index=3, coords=(1,0,3), core_on_chip=0), TpuDevice(id=14, process_index=3, coords=(0,1,3), core_on_chip=0), TpuDevice(id=15, process_index=3, coords=(1,1,3), core_on_chip=0)]
[TpuDevice(id=0, process_index=0, coords=(0,0,0), core_on_chip=0), TpuDevice(id=1, process_index=0, coords=(1,0,0), core_on_chip=0), TpuDevice(id=2, process_index=0, coords=(0,1,0), core_on_chip=0), TpuDevice(id=3, process_index=0, coords=(1,1,0), core_on_chip=0), TpuDevice(id=4, process_index=1, coords=(0,0,1), core_on_chip=0), TpuDevice(id=5, process_index=1, coords=(1,0,1), core_on_chip=0), TpuDevice(id=6, process_index=1, coords=(0,1,1), core_on_chip=0), TpuDevice(id=7, process_index=1, coords=(1,1,1), core_on_chip=0), TpuDevice(id=8, process_index=2, coords=(0,0,2), core_on_chip=0), TpuDevice(id=9, process_index=2, coords=(1,0,2), core_on_chip=0), TpuDevice(id=10, process_index=2, coords=(0,1,2), core_on_chip=0), TpuDevice(id=11, process_index=2, coords=(1,1,2), core_on_chip=0), TpuDevice(id=12, process_index=3, coords=(0,0,3), core_on_chip=0), TpuDevice(id=13, process_index=3, coords=(1,0,3), core_on_chip=0), TpuDevice(id=14, process_index=3, coords=(0,1,3), core_on_chip=0), TpuDevice(id=15, process_index=3, coords=(1,1,3), core_on_chip=0)]
[TpuDevice(id=0, process_index=0, coords=(0,0,0), core_on_chip=0), TpuDevice(id=1, process_index=0, coords=(1,0,0), core_on_chip=0), TpuDevice(id=2, process_index=0, coords=(0,1,0), core_on_chip=0), TpuDevice(id=3, process_index=0, coords=(1,1,0), core_on_chip=0), TpuDevice(id=4, process_index=1, coords=(0,0,1), core_on_chip=0), TpuDevice(id=5, process_index=1, coords=(1,0,1), core_on_chip=0), TpuDevice(id=6, process_index=1, coords=(0,1,1), core_on_chip=0), TpuDevice(id=7, process_index=1, coords=(1,1,1), core_on_chip=0), TpuDevice(id=8, process_index=2, coords=(0,0,2), core_on_chip=0), TpuDevice(id=9, process_index=2, coords=(1,0,2), core_on_chip=0), TpuDevice(id=10, process_index=2, coords=(0,1,2), core_on_chip=0), TpuDevice(id=11, process_index=2, coords=(1,1,2), core_on_chip=0), TpuDevice(id=12, process_index=3, coords=(0,0,3), core_on_chip=0), TpuDevice(id=13, process_index=3, coords=(1,0,3), core_on_chip=0), TpuDevice(id=14, process_index=3, coords=(0,1,3), core_on_chip=0), TpuDevice(id=15, process_index=3, coords=(1,1,3), core_on_chip=0)]

Command execution on worker 2 failed with exit status 1. Continuing.
Command execution on worker 3 failed with exit status 1. Continuing.

Rewritten gin arg: --gin_bindings=MODEL_DIR = "gs://large_language_models_ja/exps/t5_1_1_xl_obpc/20220913-034514.v4-32-1_64/models"
Rewritten gin arg: --gin_bindings=BATCH_SIZE = 64
Rewritten gin arg: --gin_bindings=TRAIN_STEPS = 524_288
Rewritten gin arg: --gin_bindings=USE_CACHED_TASKS = False
Traceback (most recent call last):
  File "/home/fujiki/t5x/t5x/train.py", line 761, in <module>
    gin_utils.run(main)
  File "/home/fujiki/t5x/t5x/gin_utils.py", line 107, in run
    app.run(
  File "/home/fujiki/tpuv4/lib/python3.8/site-packages/absl/app.py", line 308, in run
    _run_main(main, args)
  File "/home/fujiki/tpuv4/lib/python3.8/site-packages/absl/app.py", line 254, in _run_main
    sys.exit(main(argv))
  File "/home/fujiki/t5x/t5x/train.py", line 719, in main
    _main(argv)
  File "/home/fujiki/t5x/t5x/train.py", line 757, in _main
    train_using_gin()
  File "/home/fujiki/tpuv4/lib/python3.8/site-packages/gin/config.py", line 1605, in gin_wrapper
    utils.augment_exception_message_and_reraise(e, err_str)
  File "/home/fujiki/tpuv4/lib/python3.8/site-packages/gin/utils.py", line 41, in augment_exception_message_and_reraise
    raise proxy.with_traceback(exception.__traceback__) from None
  File "/home/fujiki/tpuv4/lib/python3.8/site-packages/gin/config.py", line 1582, in gin_wrapper
    return fn(*new_args, **new_kwargs)
  File "/home/fujiki/t5x/t5x/train.py", line 527, in train
    checkpoint_manager.save(trainer.train_state,
  File "/home/fujiki/t5x/t5x/utils.py", line 355, in save
    self._checkpointer.save(
  File "/home/fujiki/t5x/t5x/utils.py", line 246, in save
    self._save_checkpointer.save(
  File "/home/fujiki/t5x/t5x/checkpoints.py", line 687, in save
    multihost_utils.sync_global_devices(f'checkpointer:make_dir:{tmp_dir}')
  File "/home/fujiki/tpuv4/lib/python3.8/site-packages/jax/experimental/multihost_utils.py", line 79, in sync_global_devices
    assert_equal(h, f"sync_global_devices name mismatch ('{name}')")
  File "/home/fujiki/tpuv4/lib/python3.8/site-packages/jax/experimental/multihost_utils.py", line 133, in assert_equal
    raise AssertionError(
AssertionError: sync_global_devices name mismatch ('checkpointer:make_dir:gs://large_language_models_ja/exps/t5_1_1_xl_obpc/20220913-034514.v4-32-1_64/models/checkpoint_0.tmp-1663040737')
Expected: -1575622072; got: -882056915.
  In call to configurable 'train' (<function train at 0x7fd39919c040>)
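multihost_utils.sync_global_devices(name) is a barrier that asserts every host arrives with an identical name string; here the name embeds the temporary checkpoint directory, which carries a timestamp, so the assertion trips whenever the hosts reach the save step out of sync, e.g. when some workers fail with exit status 1 (as above) and are restarted separately. A hedged sketch of the relaunch hygiene that avoids the skew: kill everything on all workers, then start them again from one --worker=all invocation so every host runs the same command with the same arguments at the same time (passing the run id as an argument is hypothetical; the actual run script may hard-code it):

./scripts/kill_all_workers.sh
RUN_ID="$(date +%Y%m%d-%H%M%S).v4-32-1_64"   # computed once, identical on every worker
gcloud alpha compute tpus tpu-vm ssh v4-32-1 --zone us-central2-b --worker=all \
  --command="cd ~/t5x && nohup ./runs/run_scalable_t5_1_1_xl_obpc.sh ${RUN_ID} >> logs/${RUN_ID}.log 2>&1 &"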

$ ./scripts/kill_all_workers.sh
SSH: Attempting to connect to worker 0...
SSH: Attempting to connect to worker 1...
SSH: Attempting to connect to worker 2...
SSH: Attempting to connect to worker 3...

Command execution on worker 3 failed with exit status 1. Continuing.

65146

Command execution on worker 2 failed with exit status 1. Continuing.

27287

$ ./scripts/run_all_workers.sh

annot open shared object file: No such file or directory
2022-09-13 03:53:08.434616: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-09-13 03:53:08.458402: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-09-13 03:53:08.487096: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-09-13 03:53:08.487149: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-09-13 03:53:08.509940: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-09-13 03:53:08.867200: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2022-09-13 03:53:08.867274: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory
2022-09-13 03:53:08.867281: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
2022-09-13 03:53:08.896598: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2022-09-13 03:53:08.896673: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory
2022-09-13 03:53:08.896680: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
2022-09-13 03:53:09.003958: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2022-09-13 03:53:09.004038: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory
2022-09-13 03:53:09.004045: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
2022-09-13 03:53:09.040273: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2022-09-13 03:53:09.040344: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory
2022-09-13 03:53:09.040352: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
True
True
True
True
WARNING:absl:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
[CpuDevice(id=0)]
[CpuDevice(id=0)]
WARNING:absl:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)

$ ./scripts/kill_all_workers.sh
SSH: Attempting to connect to worker 0...
SSH: Attempting to connect to worker 1...
SSH: Attempting to connect to worker 2...
SSH: Attempting to connect to worker 3...
91839
92714
65146
27287

$ ./scripts/run_all_workers.sh

2022-09-13 03:54:40.872870: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2022-09-13 03:54:40.872951: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory
2022-09-13 03:54:40.872958: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
2022-09-13 03:54:40.900235: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2022-09-13 03:54:40.900314: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory
2022-09-13 03:54:40.900322: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
2022-09-13 03:54:41.016654: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2022-09-13 03:54:41.016738: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory
2022-09-13 03:54:41.016783: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
2022-09-13 03:54:41.095525: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2022-09-13 03:54:41.095601: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory
2022-09-13 03:54:41.095609: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
True
True
True
True
WARNING:absl:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
[CpuDevice(id=0)]
WARNING:absl:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
[CpuDevice(id=0)]
[CpuDevice(id=0)]
WARNING:absl:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
WARNING:absl:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
[CpuDevice(id=0)]

$ ./scripts/health_check.sh
SSH: Attempting to connect to worker 0...
SSH: Attempting to connect to worker 1...
SSH: Attempting to connect to worker 2...
SSH: Attempting to connect to worker 3...
/home/fujiki/tpuv4/bin/python
/home/fujiki/tpuv4/bin/python
/home/fujiki/tpuv4/bin/python
/home/fujiki/tpuv4/bin/python
2022-09-13 03:57:13.477531: I external/org_tensorflow/tensorflow/compiler/xla/service/service.cc:173] XLA service 0x3cbf080 initialized for platform Interpreter (this does not guarantee that XLA will be used). Devices:
2022-09-13 03:57:13.477576: I external/org_tensorflow/tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (0): Interpreter, <undefined>
2022-09-13 03:57:13.506971: I external/org_tensorflow/tensorflow/compiler/xla/service/service.cc:173] XLA service 0x2ccf080 initialized for platform Interpreter (this does not guarantee that XLA will be used). Devices:
2022-09-13 03:57:13.507019: I external/org_tensorflow/tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (0): Interpreter, <undefined>
2022-09-13 03:57:13.510363: I external/org_tensorflow/tensorflow/compiler/xla/service/service.cc:173] XLA service 0x3695080 initialized for platform Interpreter (this does not guarantee that XLA will be used). Devices:
2022-09-13 03:57:13.510410: I external/org_tensorflow/tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (0): Interpreter, <undefined>
2022-09-13 03:57:13.543361: I external/org_tensorflow/tensorflow/compiler/xla/pjrt/tfrt_cpu_pjrt_client.cc:181] TfrtCpuClient created.
2022-09-13 03:57:13.544149: I external/org_tensorflow/tensorflow/core/tpu/tpu_initializer_helper.cc:262] Libtpu path is: /home/fujiki/tpuv4/lib/python3.8/site-packages/libtpu/libtpu.so
2022-09-13 03:57:13.566747: I external/org_tensorflow/tensorflow/compiler/xla/pjrt/tfrt_cpu_pjrt_client.cc:181] TfrtCpuClient created.
2022-09-13 03:57:13.567690: I external/org_tensorflow/tensorflow/core/tpu/tpu_initializer_helper.cc:262] Libtpu path is: /home/fujiki/tpuv4/lib/python3.8/site-packages/libtpu/libtpu.so
2022-09-13 03:57:13.571385: I external/org_tensorflow/tensorflow/compiler/xla/pjrt/tfrt_cpu_pjrt_client.cc:181] TfrtCpuClient created.
2022-09-13 03:57:13.572216: I external/org_tensorflow/tensorflow/core/tpu/tpu_initializer_helper.cc:262] Libtpu path is: /home/fujiki/tpuv4/lib/python3.8/site-packages/libtpu/libtpu.so
WARNING:absl:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
[CpuDevice(id=0)]
[CpuDevice(id=0)]
WARNING:absl:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
[CpuDevice(id=0)]
WARNING:absl:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
2022-09-13 03:57:13.649280: I external/org_tensorflow/tensorflow/compiler/xla/pjrt/tfrt_cpu_pjrt_client.cc:184] TfrtCpuClient destroyed.
2022-09-13 03:57:13.674309: I external/org_tensorflow/tensorflow/compiler/xla/pjrt/tfrt_cpu_pjrt_client.cc:184] TfrtCpuClient destroyed.
2022-09-13 03:57:13.680135: I external/org_tensorflow/tensorflow/compiler/xla/pjrt/tfrt_cpu_pjrt_client.cc:184] TfrtCpuClient destroyed.
2022-09-13 03:57:13.805845: I external/org_tensorflow/tensorflow/compiler/xla/service/service.cc:173] XLA service 0x408f080 initialized for platform Interpreter (this does not guarantee that XLA will be used). Devices:
2022-09-13 03:57:13.805910: I external/org_tensorflow/tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (0): Interpreter, <undefined>
2022-09-13 03:57:13.876359: I external/org_tensorflow/tensorflow/compiler/xla/pjrt/tfrt_cpu_pjrt_client.cc:181] TfrtCpuClient created.
2022-09-13 03:57:13.877597: I external/org_tensorflow/tensorflow/core/tpu/tpu_initializer_helper.cc:262] Libtpu path is: /home/fujiki/tpuv4/lib/python3.8/site-packages/libtpu/libtpu.so
[CpuDevice(id=0)]
WARNING:absl:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
2022-09-13 03:57:14.037387: I external/org_tensorflow/tensorflow/compiler/xla/pjrt/tfrt_cpu_pjrt_client.cc:184] TfrtCpuClient destroyed.
2022-09-13 03:57:14.441072: I external/org_tensorflow/tensorflow/compiler/xla/service/service.cc:173] XLA service 0x2ed1080 initialized for platform Interpreter (this does not guarantee that XLA will be used). Devices:
2022-09-13 03:57:14.441113: I external/org_tensorflow/tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (0): Interpreter, <undefined>
2022-09-13 03:57:14.455530: I external/org_tensorflow/tensorflow/compiler/xla/service/service.cc:173] XLA service 0x2407080 initialized for platform Interpreter (this does not guarantee that XLA will be used). Devices:
2022-09-13 03:57:14.455575: I external/org_tensorflow/tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (0): Interpreter, <undefined>
2022-09-13 03:57:14.505247: I external/org_tensorflow/tensorflow/compiler/xla/service/service.cc:173] XLA service 0x3137080 initialized for platform Interpreter (this does not guarantee that XLA will be used). Devices:
2022-09-13 03:57:14.505303: I external/org_tensorflow/tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (0): Interpreter, <undefined>
2022-09-13 03:57:14.507079: I external/org_tensorflow/tensorflow/compiler/xla/pjrt/tfrt_cpu_pjrt_client.cc:181] TfrtCpuClient created.
2022-09-13 03:57:14.507860: I external/org_tensorflow/tensorflow/core/tpu/tpu_initializer_helper.cc:262] Libtpu path is: /home/fujiki/tpuv4/lib/python3.8/site-packages/libtpu/libtpu.so
2022-09-13 03:57:14.518734: I external/org_tensorflow/tensorflow/compiler/xla/pjrt/tfrt_cpu_pjrt_client.cc:181] TfrtCpuClient created.
2022-09-13 03:57:14.519593: I external/org_tensorflow/tensorflow/core/tpu/tpu_initializer_helper.cc:262] Libtpu path is: /home/fujiki/tpuv4/lib/python3.8/site-packages/libtpu/libtpu.so
WARNING:absl:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
[CpuDevice(id=0)]
WARNING:absl:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
[CpuDevice(id=0)]
2022-09-13 03:57:14.579996: I external/org_tensorflow/tensorflow/compiler/xla/pjrt/tfrt_cpu_pjrt_client.cc:181] TfrtCpuClient created.
2022-09-13 03:57:14.580968: I external/org_tensorflow/tensorflow/core/tpu/tpu_initializer_helper.cc:262] Libtpu path is: /home/fujiki/tpuv4/lib/python3.8/site-packages/libtpu/libtpu.so
2022-09-13 03:57:14.616060: I external/org_tensorflow/tensorflow/compiler/xla/pjrt/tfrt_cpu_pjrt_client.cc:184] TfrtCpuClient destroyed.
WARNING:absl:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
[CpuDevice(id=0)]
2022-09-13 03:57:14.623689: I external/org_tensorflow/tensorflow/compiler/xla/pjrt/tfrt_cpu_pjrt_client.cc:184] TfrtCpuClient destroyed.
2022-09-13 03:57:14.693135: I external/org_tensorflow/tensorflow/compiler/xla/pjrt/tfrt_cpu_pjrt_client.cc:184] TfrtCpuClient destroyed.
2022-09-13 03:57:15.073911: I external/org_tensorflow/tensorflow/compiler/xla/service/service.cc:173] XLA service 0x2c63080 initialized for platform Interpreter (this does not guarantee that XLA will be used). Devices:
2022-09-13 03:57:15.073957: I external/org_tensorflow/tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (0): Interpreter, <undefined>
2022-09-13 03:57:15.135628: I external/org_tensorflow/tensorflow/compiler/xla/pjrt/tfrt_cpu_pjrt_client.cc:181] TfrtCpuClient created.
2022-09-13 03:57:15.136524: I external/org_tensorflow/tensorflow/core/tpu/tpu_initializer_helper.cc:262] Libtpu path is: /home/fujiki/tpuv4/lib/python3.8/site-packages/libtpu/libtpu.so
WARNING:absl:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
[CpuDevice(id=0)]
2022-09-13 03:57:15.257179: I external/org_tensorflow/tensorflow/compiler/xla/pjrt/tfrt_cpu_pjrt_client.cc:184] TfrtCpuClient destroyed.

@t1v-n-d9f15c5a-w-0:~$ TF_CPP_MIN_LOG_LEVEL=0 python -c "import jax; print(jax.devices())"
2022-09-13 03:59:04.829153: I external/org_tensorflow/tensorflow/compiler/xla/service/service.cc:173] XLA service 0x34e3200 initialized for platform Interpreter (this does not guarantee that XLA will be used). Devices:
2022-09-13 03:59:04.829197: I external/org_tensorflow/tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (0): Interpreter, <undefined>
2022-09-13 03:59:04.888700: I external/org_tensorflow/tensorflow/compiler/xla/pjrt/tfrt_cpu_pjrt_client.cc:181] TfrtCpuClient created.
2022-09-13 03:59:04.889686: I external/org_tensorflow/tensorflow/core/tpu/tpu_initializer_helper.cc:262] Libtpu path is: /home/fujiki/tpuv4/lib/python3.8/site-packages/libtpu/libtpu.so
WARNING:absl:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
[CpuDevice(id=0)]
2022-09-13 03:59:05.047633: I external/org_tensorflow/tensorflow/compiler/xla/pjrt/tfrt_cpu_pjrt_client.cc:184] TfrtCpuClient destroyed.
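Note: the log above finds libtpu on disk but the process still falls back to CPU. One thing worth ruling out in this state is a stale TPU lock or a leftover process that is still holding the chips. Below is a minimal sketch of such a check; the lock-file path /tmp/libtpu_lockfile and the /dev/accel* device nodes are assumptions about the TPU VM image, not something confirmed in this thread.

# check_tpu_holds.py -- minimal sketch; /tmp/libtpu_lockfile and /dev/accel*
# are assumed paths on the TPU VM image, not confirmed in this issue.
import glob
import os

lockfile = "/tmp/libtpu_lockfile"
if os.path.exists(lockfile):
    # A lock left behind by a killed process can keep libtpu from attaching.
    print(f"found {lockfile}; a previous process may still hold the TPU")
else:
    print(f"no {lockfile} found")

# TPU v4 chips are expected to appear as accelerator device nodes.
print("accelerator device nodes:", glob.glob("/dev/accel*") or "none visible")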

fujiki-1emon commented 2 years ago
$ TPU_MIN_LOG_LEVEL=0 TF_CPP_MIN_LOG_LEVEL=0 TPU_STDERR_LOG_LEVEL=0 python3 -c "import jax; print(jax.device_count())"
2022-09-14 08:30:51.505712: I external/org_tensorflow/tensorflow/compiler/xla/service/service.cc:173] XLA service 0x2fef080 initialized for platform Interpreter (this does not guarantee that XLA will be used). Devices:
2022-09-14 08:30:51.505758: I external/org_tensorflow/tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (0): Interpreter, <undefined>
2022-09-14 08:30:51.709312: I external/org_tensorflow/tensorflow/compiler/xla/pjrt/tfrt_cpu_pjrt_client.cc:181] TfrtCpuClient created.
2022-09-14 08:30:51.714002: I external/org_tensorflow/tensorflow/core/tpu/tpu_initializer_helper.cc:262] Libtpu path is: /home/fujiki/tpuv4/lib/python3.8/site-packages/libtpu/libtpu.so
WARNING:absl:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
1
2022-09-14 08:30:54.226525: I external/org_tensorflow/tensorflow/compiler/xla/pjrt/tfrt_cpu_pjrt_client.cc:184] TfrtCpuClient destroyed.
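On a healthy v4-32 host the same one-liner should report 16 global devices (4 local chips on each of the 4 workers), so the 1 above just confirms that the process only sees its fallback CPU. A small per-host report that makes this explicit (plain JAX calls, nothing project-specific assumed):

# device_report.py -- run the same script on every worker; on a healthy v4-32
# each host should report 4 local devices and 16 global devices.
import jax

print(f"process {jax.process_index()} / {jax.process_count()}")
print(f"local devices : {jax.local_device_count()}")  # expected 4 on v4-32
print(f"global devices: {jax.device_count()}")         # expected 16 on v4-32
print(jax.local_devices())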
fujiki-1emon commented 2 years ago

v4-32-1 2022-09-15

$ gcloud alpha compute tpus tpu-vm create v4-32-1 --project trc-tpuv4 --zone us-central2-b --accelerator-type v4-32 --version v2-alpha-tpuv4 --subnetwork=tpusubnet
$ ./scripts/setup_project.sh

[TpuDevice(id=0, process_index=0, coords=(0,0,0), core_on_chip=0), TpuDevice(id=1, process_index=0, coords=(1,0,0), core_on_chip=0), TpuDevice(id=2, process_index=0, coords=(0,1,0), core_on_chip=0), TpuDevice(id=3, process_index=0, coords=(1,1,0), core_on_chip=0), TpuDevice(id=4, process_index=1, coords=(0,0,1), core_on_chip=0), TpuDevice(id=5, process_index=1, coords=(1,0,1), core_on_chip=0), TpuDevice(id=6, process_index=1, coords=(0,1,1), core_on_chip=0), TpuDevice(id=7, process_index=1, coords=(1,1,1), core_on_chip=0), TpuDevice(id=8, process_index=2, coords=(0,0,2), core_on_chip=0), TpuDevice(id=9, process_index=2, coords=(1,0,2), core_on_chip=0), TpuDevice(id=10, process_index=2, coords=(0,1,2), core_on_chip=0), TpuDevice(id=11, process_index=2, coords=(1,1,2), core_on_chip=0), TpuDevice(id=12, process_index=3, coords=(0,0,3), core_on_chip=0), TpuDevice(id=13, process_index=3, coords=(1,0,3), core_on_chip=0), TpuDevice(id=14, process_index=3, coords=(0,1,3), core_on_chip=0), TpuDevice(id=15, process_index=3, coords=(1,1,3), core_on_chip=0)]
$ ./scripts/scp_to_all_workers.sh
$ ./scripts/health_check.sh
SSH: Attempting to connect to worker 0...
SSH: Attempting to connect to worker 1...
SSH: Attempting to connect to worker 2...
SSH: Attempting to connect to worker 3...
/home/fujiki/tpuv4/bin/python
/home/fujiki/tpuv4/bin/python
/home/fujiki/tpuv4/bin/python
/home/fujiki/tpuv4/bin/python
2022-09-15 07:11:29.949766: I external/org_tensorflow/tensorflow/compiler/xla/service/service.cc:173] XLA service 0x3eab080 initialized for platform Interpreter (this does not guarantee that XLA will be used). Devices:
2022-09-15 07:11:29.949811: I external/org_tensorflow/tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (0): Interpreter, <undefined>
2022-09-15 07:11:30.010118: I external/org_tensorflow/tensorflow/compiler/xla/pjrt/tfrt_cpu_pjrt_client.cc:181] TfrtCpuClient created.
2022-09-15 07:11:30.010935: I external/org_tensorflow/tensorflow/core/tpu/tpu_initializer_helper.cc:262] Libtpu path is: /home/fujiki/tpuv4/lib/python3.8/site-packages/libtpu/libtpu.so
2022-09-15 07:11:30.022906: I external/org_tensorflow/tensorflow/compiler/xla/service/service.cc:173] XLA service 0x34d9080 initialized for platform Interpreter (this does not guarantee that XLA will be used). Devices:
2022-09-15 07:11:30.022950: I external/org_tensorflow/tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (0): Interpreter, <undefined>
2022-09-15 07:11:30.024438: I external/org_tensorflow/tensorflow/compiler/xla/service/service.cc:173] XLA service 0x23a5080 initialized for platform Interpreter (this does not guarantee that XLA will be used). Devices:
2022-09-15 07:11:30.024484: I external/org_tensorflow/tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (0): Interpreter, <undefined>
2022-09-15 07:11:30.085219: I external/org_tensorflow/tensorflow/compiler/xla/pjrt/tfrt_cpu_pjrt_client.cc:181] TfrtCpuClient created.
2022-09-15 07:11:30.085972: I external/org_tensorflow/tensorflow/core/tpu/tpu_initializer_helper.cc:262] Libtpu path is: /home/fujiki/tpuv4/lib/python3.8/site-packages/libtpu/libtpu.so
2022-09-15 07:11:30.084974: I external/org_tensorflow/tensorflow/compiler/xla/pjrt/tfrt_cpu_pjrt_client.cc:181] TfrtCpuClient created.
2022-09-15 07:11:30.085737: I external/org_tensorflow/tensorflow/core/tpu/tpu_initializer_helper.cc:262] Libtpu path is: /home/fujiki/tpuv4/lib/python3.8/site-packages/libtpu/libtpu.so
2022-09-15 07:11:30.104088: I external/org_tensorflow/tensorflow/compiler/xla/service/service.cc:173] XLA service 0x403b080 initialized for platform Interpreter (this does not guarantee that XLA will be used). Devices:
2022-09-15 07:11:30.104134: I external/org_tensorflow/tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (0): Interpreter, <undefined>
2022-09-15 07:11:30.165472: I external/org_tensorflow/tensorflow/compiler/xla/pjrt/tfrt_cpu_pjrt_client.cc:181] TfrtCpuClient created.
2022-09-15 07:11:30.166237: I external/org_tensorflow/tensorflow/core/tpu/tpu_initializer_helper.cc:262] Libtpu path is: /home/fujiki/tpuv4/lib/python3.8/site-packages/libtpu/libtpu.so
2022-09-15 07:11:33.162322: I external/org_tensorflow/tensorflow/compiler/xla/service/service.cc:173] XLA service 0x676b080 initialized for platform TPU (this does not guarantee that XLA will be used). Devices:
2022-09-15 07:11:33.162365: I external/org_tensorflow/tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (0): TPU, 2a886c8
2022-09-15 07:11:33.162370: I external/org_tensorflow/tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (1): TPU, 2a886c8
2022-09-15 07:11:33.162373: I external/org_tensorflow/tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (2): TPU, 2a886c8
2022-09-15 07:11:33.162376: I external/org_tensorflow/tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (3): TPU, 2a886c8
2022-09-15 07:11:33.176677: I external/org_tensorflow/tensorflow/compiler/xla/service/service.cc:173] XLA service 0x5d7f200 initialized for platform TPU (this does not guarantee that XLA will be used). Devices:
2022-09-15 07:11:33.176730: I external/org_tensorflow/tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (0): TPU, 2a886c8
2022-09-15 07:11:33.176735: I external/org_tensorflow/tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (1): TPU, 2a886c8
2022-09-15 07:11:33.176739: I external/org_tensorflow/tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (2): TPU, 2a886c8
2022-09-15 07:11:33.176742: I external/org_tensorflow/tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (3): TPU, 2a886c8
[TpuDevice(id=0, process_index=0, coords=(0,0,0), core_on_chip=0), TpuDevice(id=1, process_index=0, coords=(1,0,0), core_on_chip=0), TpuDevice(id=2, process_index=0, coords=(0,1,0), core_on_chip=0), TpuDevice(id=3, process_index=0, coords=(1,1,0), core_on_chip=0), TpuDevice(id=4, process_index=1, coords=(0,0,1), core_on_chip=0), TpuDevice(id=5, process_index=1, coords=(1,0,1), core_on_chip=0), TpuDevice(id=6, process_index=1, coords=(0,1,1), core_on_chip=0), TpuDevice(id=7, process_index=1, coords=(1,1,1), core_on_chip=0), TpuDevice(id=8, process_index=2, coords=(0,0,2), core_on_chip=0), TpuDevice(id=9, process_index=2, coords=(1,0,2), core_on_chip=0), TpuDevice(id=10, process_index=2, coords=(0,1,2), core_on_chip=0), TpuDevice(id=11, process_index=2, coords=(1,1,2), core_on_chip=0), TpuDevice(id=12, process_index=3, coords=(0,0,3), core_on_chip=0), TpuDevice(id=13, process_index=3, coords=(1,0,3), core_on_chip=0), TpuDevice(id=14, process_index=3, coords=(0,1,3), core_on_chip=0), TpuDevice(id=15, process_index=3, coords=(1,1,3), core_on_chip=0)]
2022-09-15 07:11:33.194514: I external/org_tensorflow/tensorflow/compiler/xla/service/service.cc:173] XLA service 0x67b6c00 initialized for platform TPU (this does not guarantee that XLA will be used). Devices:
2022-09-15 07:11:33.194568: I external/org_tensorflow/tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (0): TPU, 2a886c8
2022-09-15 07:11:33.194572: I external/org_tensorflow/tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (1): TPU, 2a886c8
2022-09-15 07:11:33.194575: I external/org_tensorflow/tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (2): TPU, 2a886c8
2022-09-15 07:11:33.194578: I external/org_tensorflow/tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (3): TPU, 2a886c8
[TpuDevice(id=0, process_index=0, coords=(0,0,0), core_on_chip=0), TpuDevice(id=1, process_index=0, coords=(1,0,0), core_on_chip=0), TpuDevice(id=2, process_index=0, coords=(0,1,0), core_on_chip=0), TpuDevice(id=3, process_index=0, coords=(1,1,0), core_on_chip=0), TpuDevice(id=4, process_index=1, coords=(0,0,1), core_on_chip=0), TpuDevice(id=5, process_index=1, coords=(1,0,1), core_on_chip=0), TpuDevice(id=6, process_index=1, coords=(0,1,1), core_on_chip=0), TpuDevice(id=7, process_index=1, coords=(1,1,1), core_on_chip=0), TpuDevice(id=8, process_index=2, coords=(0,0,2), core_on_chip=0), TpuDevice(id=9, process_index=2, coords=(1,0,2), core_on_chip=0), TpuDevice(id=10, process_index=2, coords=(0,1,2), core_on_chip=0), TpuDevice(id=11, process_index=2, coords=(1,1,2), core_on_chip=0), TpuDevice(id=12, process_index=3, coords=(0,0,3), core_on_chip=0), TpuDevice(id=13, process_index=3, coords=(1,0,3), core_on_chip=0), TpuDevice(id=14, process_index=3, coords=(0,1,3), core_on_chip=0), TpuDevice(id=15, process_index=3, coords=(1,1,3), core_on_chip=0)]
[TpuDevice(id=0, process_index=0, coords=(0,0,0), core_on_chip=0), TpuDevice(id=1, process_index=0, coords=(1,0,0), core_on_chip=0), TpuDevice(id=2, process_index=0, coords=(0,1,0), core_on_chip=0), TpuDevice(id=3, process_index=0, coords=(1,1,0), core_on_chip=0), TpuDevice(id=4, process_index=1, coords=(0,0,1), core_on_chip=0), TpuDevice(id=5, process_index=1, coords=(1,0,1), core_on_chip=0), TpuDevice(id=6, process_index=1, coords=(0,1,1), core_on_chip=0), TpuDevice(id=7, process_index=1, coords=(1,1,1), core_on_chip=0), TpuDevice(id=8, process_index=2, coords=(0,0,2), core_on_chip=0), TpuDevice(id=9, process_index=2, coords=(1,0,2), core_on_chip=0), TpuDevice(id=10, process_index=2, coords=(0,1,2), core_on_chip=0), TpuDevice(id=11, process_index=2, coords=(1,1,2), core_on_chip=0), TpuDevice(id=12, process_index=3, coords=(0,0,3), core_on_chip=0), TpuDevice(id=13, process_index=3, coords=(1,0,3), core_on_chip=0), TpuDevice(id=14, process_index=3, coords=(0,1,3), core_on_chip=0), TpuDevice(id=15, process_index=3, coords=(1,1,3), core_on_chip=0)]
2022-09-15 07:11:33.233520: I external/org_tensorflow/tensorflow/compiler/xla/service/service.cc:173] XLA service 0x4c4b200 initialized for platform TPU (this does not guarantee that XLA will be used). Devices:
2022-09-15 07:11:33.233563: I external/org_tensorflow/tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (0): TPU, 2a886c8
2022-09-15 07:11:33.233568: I external/org_tensorflow/tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (1): TPU, 2a886c8
2022-09-15 07:11:33.233571: I external/org_tensorflow/tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (2): TPU, 2a886c8
2022-09-15 07:11:33.233574: I external/org_tensorflow/tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (3): TPU, 2a886c8
[TpuDevice(id=0, process_index=0, coords=(0,0,0), core_on_chip=0), TpuDevice(id=1, process_index=0, coords=(1,0,0), core_on_chip=0), TpuDevice(id=2, process_index=0, coords=(0,1,0), core_on_chip=0), TpuDevice(id=3, process_index=0, coords=(1,1,0), core_on_chip=0), TpuDevice(id=4, process_index=1, coords=(0,0,1), core_on_chip=0), TpuDevice(id=5, process_index=1, coords=(1,0,1), core_on_chip=0), TpuDevice(id=6, process_index=1, coords=(0,1,1), core_on_chip=0), TpuDevice(id=7, process_index=1, coords=(1,1,1), core_on_chip=0), TpuDevice(id=8, process_index=2, coords=(0,0,2), core_on_chip=0), TpuDevice(id=9, process_index=2, coords=(1,0,2), core_on_chip=0), TpuDevice(id=10, process_index=2, coords=(0,1,2), core_on_chip=0), TpuDevice(id=11, process_index=2, coords=(1,1,2), core_on_chip=0), TpuDevice(id=12, process_index=3, coords=(0,0,3), core_on_chip=0), TpuDevice(id=13, process_index=3, coords=(1,0,3), core_on_chip=0), TpuDevice(id=14, process_index=3, coords=(0,1,3), core_on_chip=0), TpuDevice(id=15, process_index=3, coords=(1,1,3), core_on_chip=0)]
2022-09-15 07:11:33.267758: I external/org_tensorflow/tensorflow/compiler/xla/pjrt/tfrt_cpu_pjrt_client.cc:184] TfrtCpuClient destroyed.
2022-09-15 07:11:33.287796: I external/org_tensorflow/tensorflow/compiler/xla/pjrt/tfrt_cpu_pjrt_client.cc:184] TfrtCpuClient destroyed.
2022-09-15 07:11:33.312590: I external/org_tensorflow/tensorflow/compiler/xla/pjrt/tfrt_cpu_pjrt_client.cc:184] TfrtCpuClient destroyed.
2022-09-15 07:11:33.326125: I external/org_tensorflow/tensorflow/compiler/xla/pjrt/tfrt_cpu_pjrt_client.cc:184] TfrtCpuClient destroyed.
$ ./scripts/run_all_workers.sh

[TpuDevice(id=0, process_index=0, coords=(0,0,0), core_on_chip=0), TpuDevice(id=1, process_index=0, coords=(1,0,0), core_on_chip=0), TpuDevice(id=2, process_index=0, coords=(0,1,0), core_on_chip=0), TpuDevice(id=3, process_index=0, coords=(1,1,0), core_on_chip=0), TpuDevice(id=4, process_index=1, coords=(0,0,1), core_on_chip=0), TpuDevice(id=5, process_index=1, coords=(1,0,1), core_on_chip=0), TpuDevice(id=6, process_index=1, coords=(0,1,1), core_on_chip=0), TpuDevice(id=7, process_index=1, coords=(1,1,1), core_on_chip=0), TpuDevice(id=8, process_index=2, coords=(0,0,2), core_on_chip=0), TpuDevice(id=9, process_index=2, coords=(1,0,2), core_on_chip=0), TpuDevice(id=10, process_index=2, coords=(0,1,2), core_on_chip=0), TpuDevice(id=11, process_index=2, coords=(1,1,2), core_on_chip=0), TpuDevice(id=12, process_index=3, coords=(0,0,3), core_on_chip=0), TpuDevice(id=13, process_index=3, coords=(1,0,3), core_on_chip=0), TpuDevice(id=14, process_index=3, coords=(0,1,3), core_on_chip=0), TpuDevice(id=15, process_index=3, coords=(1,1,3), core_on_chip=0)]
##### Command execution on worker 3 failed with exit status 1. Continuing.
##### Command execution on worker 1 failed with exit status 1. Continuing.

- logs/20220915-161240.run_scalable_t5_1_1_xl_obpc.sh.log
$ ./scripts/run_all_workers.sh

2022-09-15 07:34:43.789351: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2022-09-15 07:34:43.789429: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory
2022-09-15 07:34:43.789437: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
True
True
True
True
[CpuDevice(id=0)]
WARNING:absl:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
[CpuDevice(id=0)]
WARNING:absl:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
$ ./scripts/kill_all_workers.sh
SSH: Attempting to connect to worker 0...
SSH: Attempting to connect to worker 1...
SSH: Attempting to connect to worker 2...
SSH: Attempting to connect to worker 3...
98447
##### Command execution on worker 0 failed with exit status 1. Continuing.
96329
##### Command execution on worker 2 failed with exit status 1. Continuing.
$ ./scripts/run_all_workers.sh

2022-09-15 07:36:28.245762: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2022-09-15 07:36:28.245839: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory
2022-09-15 07:36:28.245847: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
2022-09-15 07:36:28.421692: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2022-09-15 07:36:28.421791: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory
2022-09-15 07:36:28.421804: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
True
True
True
True
[CpuDevice(id=0)]
WARNING:absl:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
[CpuDevice(id=0)]
WARNING:absl:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
[CpuDevice(id=0)]
WARNING:absl:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
[CpuDevice(id=0)]
WARNING:absl:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
fujiki-1emon commented 1 year ago

v4-32-1 2022-09-20

$ gcloud alpha compute tpus tpu-vm create v4-32-1 --project trc-tpuv4 --zone us-central2-b --accelerator-type v4-32 --version v2-alpha-tpuv4 --subnetwork=tpusubnet
$ ./scripts/setup_project.sh

Successfully installed babel-2.10.3 cached_property-1.5.2 click-8.1.3 clu-0.0.7 contextlib2-21.6.0 dataclasses-0.6 dill-0.3.5.1 editdistance-0.6.0 future-0.18.2 gin-config-0.5.0 googleapis-common-protos-1.56.4 joblib-1.2.0 lxml-4.9.1 mesh-tensorflow-0.1.21 ml_collections-0.1.1 nltk-3.7 orbax-0.0.10 pandas-1.5.0 portalocker-2.5.1 promise-2.3 pytz-2022.2.1 rouge-score-0.1.2 sacrebleu-2.2.1 scikit-learn-1.1.2 sentencepiece-0.1.97 seqio-0.0.10 seqio-nightly-0.0.10.dev20220920 t5-0.9.3 t5x-0.0.0 tabulate-0.8.10 tensorflow-hub-0.12.0 tensorflow-metadata-1.10.0 tensorflow-text-2.10.0 tensorflow_datasets-4.6.0 tensorstore-0.1.24 tfds-nightly-4.6.0.dev202209200045 threadpoolctl-3.1.0 toml-0.10.2 torch-1.12.1
  Running setup.py develop for t5x
Successfully installed babel-2.10.3 cached_property-1.5.2 click-8.1.3 clu-0.0.7 contextlib2-21.6.0 dataclasses-0.6 dill-0.3.5.1 editdistance-0.6.0 future-0.18.2 gin-config-0.5.0 googleapis-common-protos-1.56.4 joblib-1.2.0 lxml-4.9.1 mesh-tensorflow-0.1.21 ml_collections-0.1.1 nltk-3.7 orbax-0.0.10 pandas-1.5.0 portalocker-2.5.1 promise-2.3 pytz-2022.2.1 rouge-score-0.1.2 sacrebleu-2.2.1 scikit-learn-1.1.2 sentencepiece-0.1.97 seqio-0.0.10 seqio-nightly-0.0.10.dev20220920 t5-0.9.3 t5x-0.0.0 tabulate-0.8.10 tensorflow-hub-0.12.0 tensorflow-metadata-1.10.0 tensorflow-text-2.10.0 tensorflow_datasets-4.6.0 tensorstore-0.1.24 tfds-nightly-4.6.0.dev202209200045 threadpoolctl-3.1.0 toml-0.10.2 torch-1.12.1
/home/fujiki/tpuv4/lib/python3.8/site-packages/jax/_src/lib/xla_bridge.py:184: UserWarning: TPU backend initialization is taking more than 60.0 seconds. Did you run your code on all TPU hosts? See https://jax.readthedocs.io/en/latest/multi_process.html for more information.
  warnings.warn(
WARNING:absl:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
[CpuDevice(id=0)]
/home/fujiki/tpuv4/lib/python3.8/site-packages/jax/_src/lib/xla_bridge.py:184: UserWarning: TPU backend initialization is taking more than 60.0 seconds. Did you run your code on all TPU hosts? See https://jax.readthedocs.io/en/latest/multi_process.html for more information.
  warnings.warn(
WARNING:absl:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
[CpuDevice(id=0)]
$ ./scripts/health_check.sh

2022-09-20 07:55:57.821354: I external/org_tensorflow/tensorflow/compiler/xla/service/service.cc:173] XLA service 0x5e25200 initialized for platform TPU (this does not guarantee that XLA will be used). Devices:
2022-09-20 07:55:57.821399: I external/org_tensorflow/tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (0): TPU, 2a886c8
2022-09-20 07:55:57.821403: I external/org_tensorflow/tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (1): TPU, 2a886c8
2022-09-20 07:55:57.821407: I external/org_tensorflow/tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (2): TPU, 2a886c8
2022-09-20 07:55:57.821410: I external/org_tensorflow/tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (3): TPU, 2a886c8
2022-09-20 07:55:57.825911: I external/org_tensorflow/tensorflow/compiler/xla/pjrt/tfrt_cpu_pjrt_client.cc:184] TfrtCpuClient destroyed.
[TpuDevice(id=0, process_index=0, coords=(0,0,0), core_on_chip=0), TpuDevice(id=1, process_index=0, coords=(1,0,0), core_on_chip=0), TpuDevice(id=2, process_index=0, coords=(0,1,0), core_on_chip=0), TpuDevice(id=3, process_index=0, coords=(1,1,0), core_on_chip=0), TpuDevice(id=4, process_index=1, coords=(0,0,1), core_on_chip=0), TpuDevice(id=5, process_index=1, coords=(1,0,1), core_on_chip=0), TpuDevice(id=6, process_index=1, coords=(0,1,1), core_on_chip=0), TpuDevice(id=7, process_index=1, coords=(1,1,1), core_on_chip=0), TpuDevice(id=8, process_index=2, coords=(0,0,2), core_on_chip=0), TpuDevice(id=9, process_index=2, coords=(1,0,2), core_on_chip=0), TpuDevice(id=10, process_index=2, coords=(0,1,2), core_on_chip=0), TpuDevice(id=11, process_index=2, coords=(1,1,2), core_on_chip=0), TpuDevice(id=12, process_index=3, coords=(0,0,3), core_on_chip=0), TpuDevice(id=13, process_index=3, coords=(1,0,3), core_on_chip=0), TpuDevice(id=14, process_index=3, coords=(0,1,3), core_on_chip=0), TpuDevice(id=15, process_index=3, coords=(1,1,3), core_on_chip=0)]
2022-09-20 07:55:57.848181: I external/org_tensorflow/tensorflow/compiler/xla/pjrt/tfrt_cpu_pjrt_client.cc:184] TfrtCpuClient destroyed.
2022-09-20 07:55:57.917323: I external/org_tensorflow/tensorflow/compiler/xla/pjrt/tfrt_cpu_pjrt_client.cc:184] TfrtCpuClient destroyed.
2022-09-20 07:55:57.963155: I external/org_tensorflow/tensorflow/compiler/xla/pjrt/tfrt_cpu_pjrt_client.cc:184] TfrtCpuClient destroyed.
$ ./scripts/scp_to_all_workers.sh
$ ./scripts/health_check.sh
SSH: Attempting to connect to worker 0...
SSH: Attempting to connect to worker 1...
SSH: Attempting to connect to worker 2...
SSH: Attempting to connect to worker 3...
/home/fujiki/tpuv4/bin/python
/home/fujiki/tpuv4/bin/python
/home/fujiki/tpuv4/bin/python
/home/fujiki/tpuv4/bin/python
[TpuDevice(id=0, process_index=0, coords=(0,0,0), core_on_chip=0), TpuDevice(id=1, process_index=0, coords=(1,0,0), core_on_chip=0), TpuDevice(id=2, process_index=0, coords=(0,1,0), core_on_chip=0), TpuDevice(id=3, process_index=0, coords=(1,1,0), core_on_chip=0), TpuDevice(id=4, process_index=1, coords=(0,0,1), core_on_chip=0), TpuDevice(id=5, process_index=1, coords=(1,0,1), core_on_chip=0), TpuDevice(id=6, process_index=1, coords=(0,1,1), core_on_chip=0), TpuDevice(id=7, process_index=1, coords=(1,1,1), core_on_chip=0), TpuDevice(id=8, process_index=2, coords=(0,0,2), core_on_chip=0), TpuDevice(id=9, process_index=2, coords=(1,0,2), core_on_chip=0), TpuDevice(id=10, process_index=2, coords=(0,1,2), core_on_chip=0), TpuDevice(id=11, process_index=2, coords=(1,1,2), core_on_chip=0), TpuDevice(id=12, process_index=3, coords=(0,0,3), core_on_chip=0), TpuDevice(id=13, process_index=3, coords=(1,0,3), core_on_chip=0), TpuDevice(id=14, process_index=3, coords=(0,1,3), core_on_chip=0), TpuDevice(id=15, process_index=3, coords=(1,1,3), core_on_chip=0)]
[TpuDevice(id=0, process_index=0, coords=(0,0,0), core_on_chip=0), TpuDevice(id=1, process_index=0, coords=(1,0,0), core_on_chip=0), TpuDevice(id=2, process_index=0, coords=(0,1,0), core_on_chip=0), TpuDevice(id=3, process_index=0, coords=(1,1,0), core_on_chip=0), TpuDevice(id=4, process_index=1, coords=(0,0,1), core_on_chip=0), TpuDevice(id=5, process_index=1, coords=(1,0,1), core_on_chip=0), TpuDevice(id=6, process_index=1, coords=(0,1,1), core_on_chip=0), TpuDevice(id=7, process_index=1, coords=(1,1,1), core_on_chip=0), TpuDevice(id=8, process_index=2, coords=(0,0,2), core_on_chip=0), TpuDevice(id=9, process_index=2, coords=(1,0,2), core_on_chip=0), TpuDevice(id=10, process_index=2, coords=(0,1,2), core_on_chip=0), TpuDevice(id=11, process_index=2, coords=(1,1,2), core_on_chip=0), TpuDevice(id=12, process_index=3, coords=(0,0,3), core_on_chip=0), TpuDevice(id=13, process_index=3, coords=(1,0,3), core_on_chip=0), TpuDevice(id=14, process_index=3, coords=(0,1,3), core_on_chip=0), TpuDevice(id=15, process_index=3, coords=(1,1,3), core_on_chip=0)]
[TpuDevice(id=0, process_index=0, coords=(0,0,0), core_on_chip=0), TpuDevice(id=1, process_index=0, coords=(1,0,0), core_on_chip=0), TpuDevice(id=2, process_index=0, coords=(0,1,0), core_on_chip=0), TpuDevice(id=3, process_index=0, coords=(1,1,0), core_on_chip=0), TpuDevice(id=4, process_index=1, coords=(0,0,1), core_on_chip=0), TpuDevice(id=5, process_index=1, coords=(1,0,1), core_on_chip=0), TpuDevice(id=6, process_index=1, coords=(0,1,1), core_on_chip=0), TpuDevice(id=7, process_index=1, coords=(1,1,1), core_on_chip=0), TpuDevice(id=8, process_index=2, coords=(0,0,2), core_on_chip=0), TpuDevice(id=9, process_index=2, coords=(1,0,2), core_on_chip=0), TpuDevice(id=10, process_index=2, coords=(0,1,2), core_on_chip=0), TpuDevice(id=11, process_index=2, coords=(1,1,2), core_on_chip=0), TpuDevice(id=12, process_index=3, coords=(0,0,3), core_on_chip=0), TpuDevice(id=13, process_index=3, coords=(1,0,3), core_on_chip=0), TpuDevice(id=14, process_index=3, coords=(0,1,3), core_on_chip=0), TpuDevice(id=15, process_index=3, coords=(1,1,3), core_on_chip=0)]
[TpuDevice(id=0, process_index=0, coords=(0,0,0), core_on_chip=0), TpuDevice(id=1, process_index=0, coords=(1,0,0), core_on_chip=0), TpuDevice(id=2, process_index=0, coords=(0,1,0), core_on_chip=0), TpuDevice(id=3, process_index=0, coords=(1,1,0), core_on_chip=0), TpuDevice(id=4, process_index=1, coords=(0,0,1), core_on_chip=0), TpuDevice(id=5, process_index=1, coords=(1,0,1), core_on_chip=0), TpuDevice(id=6, process_index=1, coords=(0,1,1), core_on_chip=0), TpuDevice(id=7, process_index=1, coords=(1,1,1), core_on_chip=0), TpuDevice(id=8, process_index=2, coords=(0,0,2), core_on_chip=0), TpuDevice(id=9, process_index=2, coords=(1,0,2), core_on_chip=0), TpuDevice(id=10, process_index=2, coords=(0,1,2), core_on_chip=0), TpuDevice(id=11, process_index=2, coords=(1,1,2), core_on_chip=0), TpuDevice(id=12, process_index=3, coords=(0,0,3), core_on_chip=0), TpuDevice(id=13, process_index=3, coords=(1,0,3), core_on_chip=0), TpuDevice(id=14, process_index=3, coords=(0,1,3), core_on_chip=0), TpuDevice(id=15, process_index=3, coords=(1,1,3), core_on_chip=0)]
$ ./scripts/setup_project.sh

$ ./scripts/run_all_workers.sh
True
True
True
True
[TpuDevice(id=0, process_index=0, coords=(0,0,0), core_on_chip=0), TpuDevice(id=1, process_index=0, coords=(1,0,0), core_on_chip=0), TpuDevice(id=2, process_index=0, coords=(0,1,0), core_on_chip=0), TpuDevice(id=3, process_index=0, coords=(1,1,0), core_on_chip=0), TpuDevice(id=4, process_index=1, coords=(0,0,1), core_on_chip=0), TpuDevice(id=5, process_index=1, coords=(1,0,1), core_on_chip=0), TpuDevice(id=6, process_index=1, coords=(0,1,1), core_on_chip=0), TpuDevice(id=7, process_index=1, coords=(1,1,1), core_on_chip=0), TpuDevice(id=8, process_index=2, coords=(0,0,2), core_on_chip=0), TpuDevice(id=9, process_index=2, coords=(1,0,2), core_on_chip=0), TpuDevice(id=10, process_index=2, coords=(0,1,2), core_on_chip=0), TpuDevice(id=11, process_index=2, coords=(1,1,2), core_on_chip=0), TpuDevice(id=12, process_index=3, coords=(0,0,3), core_on_chip=0), TpuDevice(id=13, process_index=3, coords=(1,0,3), core_on_chip=0), TpuDevice(id=14, process_index=3, coords=(0,1,3), core_on_chip=0), TpuDevice(id=15, process_index=3, coords=(1,1,3), core_on_chip=0)]
[TpuDevice(id=0, process_index=0, coords=(0,0,0), core_on_chip=0), TpuDevice(id=1, process_index=0, coords=(1,0,0), core_on_chip=0), TpuDevice(id=2, process_index=0, coords=(0,1,0), core_on_chip=0), TpuDevice(id=3, process_index=0, coords=(1,1,0), core_on_chip=0), TpuDevice(id=4, process_index=1, coords=(0,0,1), core_on_chip=0), TpuDevice(id=5, process_index=1, coords=(1,0,1), core_on_chip=0), TpuDevice(id=6, process_index=1, coords=(0,1,1), core_on_chip=0), TpuDevice(id=7, process_index=1, coords=(1,1,1), core_on_chip=0), TpuDevice(id=8, process_index=2, coords=(0,0,2), core_on_chip=0), TpuDevice(id=9, process_index=2, coords=(1,0,2), core_on_chip=0), TpuDevice(id=10, process_index=2, coords=(0,1,2), core_on_chip=0), TpuDevice(id=11, process_index=2, coords=(1,1,2), core_on_chip=0), TpuDevice(id=12, process_index=3, coords=(0,0,3), core_on_chip=0), TpuDevice(id=13, process_index=3, coords=(1,0,3), core_on_chip=0), TpuDevice(id=14, process_index=3, coords=(0,1,3), core_on_chip=0), TpuDevice(id=15, process_index=3, coords=(1,1,3), core_on_chip=0)]
[TpuDevice(id=0, process_index=0, coords=(0,0,0), core_on_chip=0), TpuDevice(id=1, process_index=0, coords=(1,0,0), core_on_chip=0), TpuDevice(id=2, process_index=0, coords=(0,1,0), core_on_chip=0), TpuDevice(id=3, process_index=0, coords=(1,1,0), core_on_chip=0), TpuDevice(id=4, process_index=1, coords=(0,0,1), core_on_chip=0), TpuDevice(id=5, process_index=1, coords=(1,0,1), core_on_chip=0), TpuDevice(id=6, process_index=1, coords=(0,1,1), core_on_chip=0), TpuDevice(id=7, process_index=1, coords=(1,1,1), core_on_chip=0), TpuDevice(id=8, process_index=2, coords=(0,0,2), core_on_chip=0), TpuDevice(id=9, process_index=2, coords=(1,0,2), core_on_chip=0), TpuDevice(id=10, process_index=2, coords=(0,1,2), core_on_chip=0), TpuDevice(id=11, process_index=2, coords=(1,1,2), core_on_chip=0), TpuDevice(id=12, process_index=3, coords=(0,0,3), core_on_chip=0), TpuDevice(id=13, process_index=3, coords=(1,0,3), core_on_chip=0), TpuDevice(id=14, process_index=3, coords=(0,1,3), core_on_chip=0), TpuDevice(id=15, process_index=3, coords=(1,1,3), core_on_chip=0)]
[TpuDevice(id=0, process_index=0, coords=(0,0,0), core_on_chip=0), TpuDevice(id=1, process_index=0, coords=(1,0,0), core_on_chip=0), TpuDevice(id=2, process_index=0, coords=(0,1,0), core_on_chip=0), TpuDevice(id=3, process_index=0, coords=(1,1,0), core_on_chip=0), TpuDevice(id=4, process_index=1, coords=(0,0,1), core_on_chip=0), TpuDevice(id=5, process_index=1, coords=(1,0,1), core_on_chip=0), TpuDevice(id=6, process_index=1, coords=(0,1,1), core_on_chip=0), TpuDevice(id=7, process_index=1, coords=(1,1,1), core_on_chip=0), TpuDevice(id=8, process_index=2, coords=(0,0,2), core_on_chip=0), TpuDevice(id=9, process_index=2, coords=(1,0,2), core_on_chip=0), TpuDevice(id=10, process_index=2, coords=(0,1,2), core_on_chip=0), TpuDevice(id=11, process_index=2, coords=(1,1,2), core_on_chip=0), TpuDevice(id=12, process_index=3, coords=(0,0,3), core_on_chip=0), TpuDevice(id=13, process_index=3, coords=(1,0,3), core_on_chip=0), TpuDevice(id=14, process_index=3, coords=(0,1,3), core_on_chip=0), TpuDevice(id=15, process_index=3, coords=(1,1,3), core_on_chip=0)]
##### Command execution on worker 1 failed with exit status 1. Continuing.
##### Command execution on worker 3 failed with exit status 1. Continuing.
##### Command execution on worker 2 failed with exit status 1. Continuing.
I0920 08:37:08.732430 140483972213824 checkpoints.py:682] Saving checkpoint for step 0 to gs://large_language_models_ja/exps/t5_1_1_xl_obpc/20220920-083646.v4-32-1_64/models/checkpoint_0.tmp-1663663028
Rewritten gin arg: --gin_bindings=MODEL_DIR = "gs://large_language_models_ja/exps/t5_1_1_xl_obpc/20220920-083646.v4-32-1_64/models"
Rewritten gin arg: --gin_bindings=BATCH_SIZE = 64
Rewritten gin arg: --gin_bindings=TRAIN_STEPS = 524_288
Rewritten gin arg: --gin_bindings=USE_CACHED_TASKS = False
Traceback (most recent call last):
  File "/home/fujiki/t5x/t5x/train.py", line 761, in <module>
    gin_utils.run(main)
  File "/home/fujiki/t5x/t5x/gin_utils.py", line 107, in run
    app.run(
  File "/home/fujiki/tpuv4/lib/python3.8/site-packages/absl/app.py", line 308, in run
    _run_main(main, args)
  File "/home/fujiki/tpuv4/lib/python3.8/site-packages/absl/app.py", line 254, in _run_main
    sys.exit(main(argv))
  File "/home/fujiki/t5x/t5x/train.py", line 719, in main
    _main(argv)
  File "/home/fujiki/t5x/t5x/train.py", line 757, in _main
    train_using_gin()
  File "/home/fujiki/tpuv4/lib/python3.8/site-packages/gin/config.py", line 1605, in gin_wrapper
    utils.augment_exception_message_and_reraise(e, err_str)
  File "/home/fujiki/tpuv4/lib/python3.8/site-packages/gin/utils.py", line 41, in augment_exception_message_and_reraise
    raise proxy.with_traceback(exception.__traceback__) from None
  File "/home/fujiki/tpuv4/lib/python3.8/site-packages/gin/config.py", line 1582, in gin_wrapper
    return fn(*new_args, **new_kwargs)
  File "/home/fujiki/t5x/t5x/train.py", line 527, in train
    checkpoint_manager.save(trainer.train_state,
  File "/home/fujiki/t5x/t5x/utils.py", line 355, in save
    self._checkpointer.save(
  File "/home/fujiki/t5x/t5x/utils.py", line 246, in save
    self._save_checkpointer.save(
  File "/home/fujiki/t5x/t5x/checkpoints.py", line 687, in save
    multihost_utils.sync_global_devices(f'checkpointer:make_dir:{tmp_dir}')
  File "/home/fujiki/tpuv4/lib/python3.8/site-packages/jax/experimental/multihost_utils.py", line 79, in sync_global_devices
    assert_equal(h, f"sync_global_devices name mismatch ('{name}')")
  File "/home/fujiki/tpuv4/lib/python3.8/site-packages/jax/experimental/multihost_utils.py", line 133, in assert_equal
    raise AssertionError(
AssertionError: sync_global_devices name mismatch ('checkpointer:make_dir:gs://large_language_models_ja/exps/t5_1_1_xl_obpc/20220920-083646.v4-32-1_64/models/checkpoint_0.tmp-1663663028') Expected: -283111926; got: -2040986257.
  In call to configurable 'train' (<function train at 0x7fc38007c790>)
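The assertion fires when the four hosts do not pass the same string to sync_global_devices. Here that can happen because workers 1-3 exited with status 1 above and hit the barrier at a different point, or because the .tmp-<timestamp> suffix in the directory name is derived from local wall-clock time on each host. A minimal sketch (not t5x's actual checkpointing code) of how hosts can agree on one timestamp before building the name, using broadcast_one_to_all:

# sync_timestamp.py -- a sketch only, not t5x's actual checkpointing code.
import time

import jax
import numpy as np
from jax.experimental import multihost_utils

# Every host proposes its local time; only process 0's value is kept.
shared_ts = int(multihost_utils.broadcast_one_to_all(np.int64(time.time())))

tmp_dir = f"checkpoint_0.tmp-{shared_ts}"  # now identical on every host
multihost_utils.sync_global_devices(f"make_dir:{tmp_dir}")  # barrier names match
print(f"process {jax.process_index()} uses {tmp_dir}")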
$ ./scripts/health_check.sh
SSH: Attempting to connect to worker 0...
SSH: Attempting to connect to worker 1...
SSH: Attempting to connect to worker 2...
SSH: Attempting to connect to worker 3...
/home/fujiki/tpuv4/bin/python
/home/fujiki/tpuv4/bin/python
/home/fujiki/tpuv4/bin/python
/home/fujiki/tpuv4/bin/python
[CpuDevice(id=0)]
WARNING:absl:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
fujiki-1emon commented 1 year ago

v4-32 2022-09-22

$ ./scripts/health_check.sh

SSH: Attempting to connect to worker 0...
SSH: Attempting to connect to worker 1...
SSH: Attempting to connect to worker 2...
SSH: Attempting to connect to worker 3...
Warning: Permanently added 'tpu.8117630292933245005-3-vo/z0w' (ED25519) to the list of known hosts.
Warning: Permanently added 'tpu.8117630292933245005-2-ekujfw' (ED25519) to the list of known hosts.
Warning: Permanently added 'tpu.8117630292933245005-1-pko60k' (ED25519) to the list of known hosts.
/home/fujiki/tpuv4/bin/python
/home/fujiki/tpuv4/bin/python
/home/fujiki/tpuv4/bin/python
/home/fujiki/tpuv4/bin/python
[TpuDevice(id=0, process_index=0, coords=(0,0,0), core_on_chip=0), TpuDevice(id=1, process_index=0, coords=(1,0,0), core_on_chip=0), TpuDevice(id=2, process_index=0, coords=(0,1,0), core_on_chip=0), TpuDevice(id=3, process_index=0, coords=(1,1,0), core_on_chip=0), TpuDevice(id=4, process_index=1, coords=(0,0,1), core_on_chip=0), TpuDevice(id=5, process_index=1, coords=(1,0,1), core_on_chip=0), TpuDevice(id=6, process_index=1, coords=(0,1,1), core_on_chip=0), TpuDevice(id=7, process_index=1, coords=(1,1,1), core_on_chip=0), TpuDevice(id=8, process_index=2, coords=(0,0,2), core_on_chip=0), TpuDevice(id=9, process_index=2, coords=(1,0,2), core_on_chip=0), TpuDevice(id=10, process_index=2, coords=(0,1,2), core_on_chip=0), TpuDevice(id=11, process_index=2, coords=(1,1,2), core_on_chip=0), TpuDevice(id=12, process_index=3, coords=(0,0,3), core_on_chip=0), TpuDevice(id=13, process_index=3, coords=(1,0,3), core_on_chip=0), TpuDevice(id=14, process_index=3, coords=(0,1,3), core_on_chip=0), TpuDevice(id=15, process_index=3, coords=(1,1,3), core_on_chip=0)]
worker = 1

$ python -c "import jax; print(jax.local_devices())"
/home/fujiki/tpuv4/lib/python3.8/site-packages/jax/_src/lib/xla_bridge.py:184: UserWarning: TPU backend initialization is taking more than 60.0 seconds. Did you run your code on all TPU hosts? See https://jax.readthedocs.io/en/latest/multi_process.html for more information.
  warnings.warn(
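This hang (and the "initialization is taking more than 60.0 seconds" warning from 2022-09-20) is expected when a JAX program runs on only one worker of a multi-host slice: TPU initialization is a collective step, so the same program has to be launched on all four workers at once (e.g. with gcloud's --worker=all option). A minimal barrier test, assuming the same snippet is started on every worker at roughly the same time:

# barrier_check.py -- launch on all four workers at the same time; if any
# worker is missing, TPU backend initialization stalls (the 60-second warning
# above) and the barrier below is never reached.
import jax
from jax.experimental import multihost_utils

print(f"process {jax.process_index()} of {jax.process_count()} started")
multihost_utils.sync_global_devices("health_check")  # collective barrier
print(f"process {jax.process_index()} sees {jax.device_count()} devices")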
$ ./scripts/run_all_workers.sh

SSH: Attempting to connect to worker 0...
SSH: Attempting to connect to worker 1...
SSH: Attempting to connect to worker 2...
SSH: Attempting to connect to worker 3...
/home/fujiki/tpuv4/bin/python
/home/fujiki/tpuv4/bin/python
/home/fujiki/tpuv4/bin/python
/home/fujiki/tpuv4/bin/python
2022-09-22 06:49:19.390089: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-09-22 06:49:19.479682: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-09-22 06:49:19.829237: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-09-22 06:49:20.272982: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-09-22 06:49:22.551736: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-09-22 06:49:22.551796: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-09-22 06:49:22.695487: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-09-22 06:49:22.695546: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-09-22 06:49:23.003398: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-09-22 06:49:23.003439: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-09-22 06:49:23.003782: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-09-22 06:49:23.140047: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-09-22 06:49:23.481083: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-09-22 06:49:23.657835: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-09-22 06:49:23.657885: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-09-22 06:49:23.956752: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-09-22 06:49:27.725365: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2022-09-22 06:49:27.725467: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory
2022-09-22 06:49:27.725475: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
2022-09-22 06:49:27.757671: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2022-09-22 06:49:27.757746: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory
2022-09-22 06:49:27.757753: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
2022-09-22 06:49:28.683058: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2022-09-22 06:49:28.683156: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory
2022-09-22 06:49:28.683165: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
2022-09-22 06:49:29.094593: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2022-09-22 06:49:29.094690: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory
2022-09-22 06:49:29.094697: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
True
True
True
True
[CpuDevice(id=0)]
WARNING:absl:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
WARNING:absl:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
[CpuDevice(id=0)]
[TpuDevice(id=0, process_index=0, coords=(0,0,0), core_on_chip=0), TpuDevice(id=1, process_index=0, coords=(1,0,0), core_on_chip=0), TpuDevice(id=2, process_index=0, coords=(0,1,0), core_on_chip=0), TpuDevice(id=3, process_index=0, coords=(1,1,0), core_on_chip=0), TpuDevice(id=4, process_index=1, coords=(0,0,1), core_on_chip=0), TpuDevice(id=5, process_index=1, coords=(1,0,1), core_on_chip=0), TpuDevice(id=6, process_index=1, coords=(0,1,1), core_on_chip=0), TpuDevice(id=7, process_index=1, coords=(1,1,1), core_on_chip=0), TpuDevice(id=8, process_index=2, coords=(0,0,2), core_on_chip=0), TpuDevice(id=9, process_index=2, coords=(1,0,2), core_on_chip=0), TpuDevice(id=10, process_index=2, coords=(0,1,2), core_on_chip=0), TpuDevice(id=11, process_index=2, coords=(1,1,2), core_on_chip=0), TpuDevice(id=12, process_index=3, coords=(0,0,3), core_on_chip=0), TpuDevice(id=13, process_index=3, coords=(1,0,3), core_on_chip=0), TpuDevice(id=14, process_index=3, coords=(0,1,3), core_on_chip=0), TpuDevice(id=15, process_index=3, coords=(1,1,3), core_on_chip=0)]
[TpuDevice(id=0, process_index=0, coords=(0,0,0), core_on_chip=0), TpuDevice(id=1, process_index=0, coords=(1,0,0), core_on_chip=0), TpuDevice(id=2, process_index=0, coords=(0,1,0), core_on_chip=0), TpuDevice(id=3, process_index=0, coords=(1,1,0), core_on_chip=0), TpuDevice(id=4, process_index=1, coords=(0,0,1), core_on_chip=0), TpuDevice(id=5, process_index=1, coords=(1,0,1), core_on_chip=0), TpuDevice(id=6, process_index=1, coords=(0,1,1), core_on_chip=0), TpuDevice(id=7, process_index=1, coords=(1,1,1), core_on_chip=0), TpuDevice(id=8, process_index=2, coords=(0,0,2), core_on_chip=0), TpuDevice(id=9, process_index=2, coords=(1,0,2), core_on_chip=0), TpuDevice(id=10, process_index=2, coords=(0,1,2), core_on_chip=0), TpuDevice(id=11, process_index=2, coords=(1,1,2), core_on_chip=0), TpuDevice(id=12, process_index=3, coords=(0,0,3), core_on_chip=0), TpuDevice(id=13, process_index=3, coords=(1,0,3), core_on_chip=0), TpuDevice(id=14, process_index=3, coords=(0,1,3), core_on_chip=0), TpuDevice(id=15, process_index=3, coords=(1,1,3), core_on_chip=0)]
^C

Command killed by keyboard interrupt

Failed to execute command on multiple workers. This may have happened if you have not added your SSH key to your ssh-agent using "ssh-add ~/.ssh/google_compute_engine".

...

$ ./scripts/kill_all_workers.sh
SSH: Attempting to connect to worker 0...
SSH: Attempting to connect to worker 1...
SSH: Attempting to connect to worker 2...
SSH: Attempting to connect to worker 3...

Usage:
 kill [options] <pid> [...]

Options:
 <pid> [...]            send signal to every <pid> listed
 -<signal>, -s, --signal <signal>
                        specify the <signal> to be sent
 -l, --list=[<signal>]  list all signal names, or convert one to a name
 -L, --table            list all signal names in a nice table

 -h, --help     display this help and exit
 -V, --version  output version information and exit

For more details see kill(1).
##### Command execution on worker 1 failed with exit status 123. Continuing.

Usage:
 kill [options] <pid> [...]

Options:
 <pid> [...]            send signal to every <pid> listed
 -<signal>, -s, --signal <signal>
                        specify the <signal> to be sent
 -l, --list=[<signal>]  list all signal names, or convert one to a name
 -L, --table            list all signal names in a nice table

 -h, --help     display this help and exit
 -V, --version  output version information and exit

For more details see kill(1).
##### Command execution on worker 0 failed with exit status 123. Continuing.
Mac-4:t5x fujiki$ vim ./scripts/kill_all_workers.sh
Mac-4:t5x fujiki$ vim scripts/run_all_workers.sh
Mac-4:t5x fujiki$ ./scripts/kill_all_workers.sh
SSH: Attempting to connect to worker 0...
SSH: Attempting to connect to worker 1...
SSH: Attempting to connect to worker 2...
SSH: Attempting to connect to worker 3...
##### Command execution on worker 0 failed with exit status 1. Continuing.
##### Command execution on worker 3 failed with exit status 1. Continuing.
##### Command execution on worker 2 failed with exit status 1. Continuing.
##### Command execution on worker 1 failed with exit status 1. Continuing.
fujiki-1emon commented 1 year ago
$ ./scripts/health_check.sh

SSH: Attempting to connect to worker 0...
SSH: Attempting to connect to worker 1...
SSH: Attempting to connect to worker 2...
SSH: Attempting to connect to worker 3...
/home/fujiki/tpuv4/bin/python
/home/fujiki/tpuv4/bin/python
/home/fujiki/tpuv4/bin/python
/home/fujiki/tpuv4/bin/python
[TpuDevice(id=0, process_index=0, coords=(0,0,0), core_on_chip=0), TpuDevice(id=1, process_index=0, coords=(1,0,0), core_on_chip=0), TpuDevice(id=2, process_index=0, coords=(0,1,0), core_on_chip=0), TpuDevice(id=3, process_index=0, coords=(1,1,0), core_on_chip=0), TpuDevice(id=4, process_index=1, coords=(0,0,1), core_on_chip=0), TpuDevice(id=5, process_index=1, coords=(1,0,1), core_on_chip=0), TpuDevice(id=6, process_index=1, coords=(0,1,1), core_on_chip=0), TpuDevice(id=7, process_index=1, coords=(1,1,1), core_on_chip=0), TpuDevice(id=8, process_index=2, coords=(0,0,2), core_on_chip=0), TpuDevice(id=9, process_index=2, coords=(1,0,2), core_on_chip=0), TpuDevice(id=10, process_index=2, coords=(0,1,2), core_on_chip=0), TpuDevice(id=11, process_index=2, coords=(1,1,2), core_on_chip=0), TpuDevice(id=12, process_index=3, coords=(0,0,3), core_on_chip=0), TpuDevice(id=13, process_index=3, coords=(1,0,3), core_on_chip=0), TpuDevice(id=14, process_index=3, coords=(0,1,3), core_on_chip=0), TpuDevice(id=15, process_index=3, coords=(1,1,3), core_on_chip=0)]

It seems there is no problem.

True
True
True
True
16
16
16
16
TPU_15(process=3,(1,1,3,0))
TPU_15(process=3,(1,1,3,0))
TPU_15(process=3,(1,1,3,0))
TPU_15(process=3,(1,1,3,0))
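For reference, the True / 16 / TPU_15(...) lines above look like the output of a per-worker check along the following lines (a reconstruction, not the actual script in this repo):

# per_worker_check.py -- a reconstruction of the kind of check that would
# print the True / 16 / TPU_15(...) lines above; not the repo's actual script.
import jax

devices = jax.devices()
print(all(d.platform == "tpu" for d in devices))  # True once the backend is TPU
print(jax.device_count())                         # 16 on a v4-32 slice
print(devices[-1])                                # e.g. TPU_15(process=3,(1,1,3,0))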