GoogleCloudDataproc / initialization-actions

Runs on all nodes of your cluster before the cluster starts; lets you customize your cluster.
https://cloud.google.com/dataproc/init-actions
Apache License 2.0

[gpu] Driver installation breaking in Dataproc 2.1 image during initialization #1189

Closed: santhoshvly closed this 4 months ago

santhoshvly commented 5 months ago

Hi Team,

I was able to attach GPUs to a Dataproc 2.1 cluster, and it worked fine after disabling secure boot. I am using the latest install_gpu_driver.sh from this repository, but I am now getting the following error during cluster initialization:

++ lsb_release -is
++ tr '[:upper:]' '[:lower:]'

ERROR: An error occurred while performing the step: "Building kernel modules". See /var/log/nvidia-installer.log for details.

ERROR: An error occurred while performing the step: "Checking to see whether the nvidia kernel module was successfully built". See /var/log/nvidia-installer.log for details.

ERROR: The nvidia kernel module was not created.

ERROR: Installation has failed. Please see the file '/var/log/nvidia-installer.log' for details. You may find suggestions on fixing installation problems in the README available on the Linux driver download page at www.nvidia.com.

I can also see the following error in the file /var/log/nvidia-installer.log on one of the cluster machines.

ld -r -o /tmp/selfgz13916/NVIDIA-Linux-x86_64-495.29.05/kernel/nvidia-modeset/nv-modeset-interface.o /tmp/selfgz13916/NVIDIA-Linux-x86_64-495.29.05/kernel/nvidia-modeset/nvidia-modeset-linux.o /tmp/selfgz13916/NVIDIA-Linux-x86_64-495.29.05/kernel/nvidia-modeset/nv-kthread-q.o
  LD [M]  /tmp/selfgz13916/NVIDIA-Linux-x86_64-495.29.05/kernel/nvidia-modeset.o
  LD [M]  /tmp/selfgz13916/NVIDIA-Linux-x86_64-495.29.05/kernel/nvidia-peermem.o
  MODPOST /tmp/selfgz13916/NVIDIA-Linux-x86_64-495.29.05/kernel/Module.symvers
FATAL: modpost: GPL-incompatible module nvidia.ko uses GPL-only symbol 'rcu_read_unlock_strict'
make[3]: [/usr/src/linux-headers-5.10.0-30-common/scripts/Makefile.modpost:123: /tmp/selfgz13916/NVIDIA-Linux-x86_64-495.29.05/kernel/Module.symvers] Error 1
make[3]: Target '__modpost' not remade because of errors.
make[2]: [/usr/src/linux-headers-5.10.0-30-common/Makefile:1783: modules] Error 2
make[2]: Leaving directory '/usr/src/linux-headers-5.10.0-30-cloud-amd64'
make[1]: [Makefile:192: __sub-make] Error 2
make[1]: Target 'modules' not remade because of errors.
make[1]: Leaving directory '/usr/src/linux-headers-5.10.0-30-common'
make: [Makefile:80: modules] Error 2
-> Checking to see whether the nvidia kernel module was successfully built
   executing: 'cd ./kernel; /opt/conda/default/bin/make -k -j8 NV_KERNEL_MODULES="nvidia" NV_EXCLUDE_KERNEL_MODULES="" SYSSRC="/lib/modules/5.10.0-30-cloud-amd64/source" SYSOUT="/lib/modules/5.10.0-30-cloud-amd64/build"'...
make[1]: Entering directory '/usr/src/linux-headers-5.10.0-30-common'
make[2]: Entering directory '/usr/src/linux-headers-5.10.0-30-cloud-amd64'
scripts/Makefile.lib:8: 'always' is deprecated. Please use 'always-y' instead
  MODPOST /tmp/selfgz13916/NVIDIA-Linux-x86_64-495.29.05/kernel/Module.symvers
FATAL: modpost: GPL-incompatible module nvidia.ko uses GPL-only symbol 'rcu_read_unlock_strict'
make[3]: [/usr/src/linux-headers-5.10.0-30-common/scripts/Makefile.modpost:123: /tmp/selfgz13916/NVIDIA-Linux-x86_64-495.29.05/kernel/Module.symvers] Error 1
make[3]: Target '__modpost' not remade because of errors.
make[2]: [/usr/src/linux-headers-5.10.0-30-common/Makefile:1783: modules] Error 2
make[2]: Leaving directory '/usr/src/linux-headers-5.10.0-30-cloud-amd64'
make[1]: [Makefile:192: __sub-make] Error 2
make[1]: Target 'modules' not remade because of errors.
make[1]: Leaving directory '/usr/src/linux-headers-5.10.0-30-common'
make: [Makefile:80: modules] Error 2
-> Error.
ERROR: An error occurred while performing the step: "Checking to see whether the nvidia kernel module was successfully built". See /var/log/nvidia-installer.log for details.
-> The command 'cd ./kernel; /opt/conda/default/bin/make -k -j8 NV_KERNEL_MODULES="nvidia" NV_EXCLUDE_KERNEL_MODULES="" SYSSRC="/lib/modules/5.10.0-30-cloud-amd64/source" SYSOUT="/lib/modules/5.10.0-30-cloud-amd64/build"' failed with the following output:

make[1]: Entering directory '/usr/src/linux-headers-5.10.0-30-common'
make[2]: Entering directory '/usr/src/linux-headers-5.10.0-30-cloud-amd64'
scripts/Makefile.lib:8: 'always' is deprecated. Please use 'always-y' instead
  MODPOST /tmp/selfgz13916/NVIDIA-Linux-x86_64-495.29.05/kernel/Module.symvers
FATAL: modpost: GPL-incompatible module nvidia.ko uses GPL-only symbol 'rcu_read_unlock_strict'
make[3]: [/usr/src/linux-headers-5.10.0-30-common/scripts/Makefile.modpost:123: /tmp/selfgz13916/NVIDIA-Linux-x86_64-495.29.05/kernel/Module.symvers] Error 1
make[3]: Target '__modpost' not remade because of errors.
make[2]: [/usr/src/linux-headers-5.10.0-30-common/Makefile:1783: modules] Error 2
make[2]: Leaving directory '/usr/src/linux-headers-5.10.0-30-cloud-amd64'
make[1]: [Makefile:192: __sub-make] Error 2
make[1]: Target 'modules' not remade because of errors.
make[1]: Leaving directory '/usr/src/linux-headers-5.10.0-30-common'
make: [Makefile:80: modules] Error 2
ERROR: The nvidia kernel module was not created.
ERROR: Installation has failed. Please see the file '/var/log/nvidia-installer.log' for details. You may find suggestions on fixing installation problems in the README available on the Linux driver download page at www.nvidia.com.

Is anyone facing a similar issue with driver installation on Dataproc 2.1/2.2 clusters?

cjac commented 5 months ago

I could change the default driver and CUDA versions on 2.1 images to be more current.

santhoshvly commented 5 months ago

@cjac Thank you! Is there a specific CUDA and driver version to try as a workaround to get past this error in 2.1 images now?

cjac commented 5 months ago

I don't think I've tested the current code with CUDA 12, but I think that's what we should be targeting, with a recent 5xx-series driver.

I recently reworked the installer to use, on bookworm and later, the stock dkms from non-free packages and to sign drivers using the MOK. That requires that the MOK x509 cert be inserted into the EFI header of the block device. I'll be writing it up with some example code shortly.

I will try to set it up to do CUDA 12 on a 5xx-series kernel module, but I haven't tested it yet. On 2.2 we should be able to use the one from Debian stable non-free with dkms to install the current open module.
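
For readers following the MOK approach, here is a minimal sketch of the key generation and signing steps, assuming standard openssl, mokutil, and kernel sign-file tooling. The file names, certificate subject, and sign-file path are illustrative, and the Dataproc-specific step of embedding the cert in the boot disk's EFI header is not shown:

# generate a module-signing key pair and a self-signed x509 cert in DER form
openssl req -new -x509 -newkey rsa:2048 -nodes -days 3650 \
  -subj "/CN=dataproc-gpu-module-signing/" \
  -keyout MOK.priv -outform DER -out MOK.der

# enroll the cert in the Machine Owner Key database
# (on a stock machine this completes at the next boot via the shim)
mokutil --import MOK.der

# sign the built nvidia module with the enrolled key
# (the sign-file location varies by kernel and distro)
/usr/lib/linux-kbuild-5.10/scripts/sign-file sha256 ./MOK.priv ./MOK.der nvidia.ko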

santhoshvly commented 5 months ago

@cjac Okay, thank you! I tried the latest CUDA version (12.0) and the corresponding driver version using the latest install_gpu_driver.sh script from this repo, but got the same error. So it looks like we can't attach any GPUs to Dataproc 2.1/2.2 until this is fixed. Please let me know if there are any other workarounds.

cjac commented 5 months ago

I'm seeing a gcc error when trying to link GPL-incompatible code into kernel modules for all variants available on Debian 11; Debian 12 offers open driver support, so I will start there tomorrow.

cjac commented 5 months ago

The latest Dataproc image that works with the .run file is 2.1.46-debian11.

I am pushing a new change to the installer script. Please see #1190 for something that has been tested to work with images up to and including 2.1.46-debian11.

I am also working on bookworm (2.2-debian12) support for installation using apt-get.
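
For anyone pinning images in the meantime, cluster creation with the init action looks roughly like this. This is a sketch, not the repo's documented invocation: the cluster name, region, and accelerator type are placeholders, and secure boot is disabled as discussed above:

REGION=us-central1
gcloud dataproc clusters create my-gpu-cluster \
  --region "${REGION}" \
  --image-version 2.1.46-debian11 \
  --worker-accelerator type=nvidia-tesla-t4,count=1 \
  --no-shielded-secure-boot \
  --initialization-actions "gs://goog-dataproc-initialization-actions-${REGION}/gpu/install_gpu_driver.sh"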

cjac commented 5 months ago

I'm also working on something in the custom-images repo. I've got an in-progress PR open over there:

https://github.com/GoogleCloudDataproc/custom-images/pull/83

santhoshvly commented 5 months ago

@cjac Okay, thank you for the update. We are unable to use GPUs with the latest 2.1/2.2 images until we get the fixed install_gpu_driver.sh. We always use the latest 2.1 image to launch the Dataproc cluster. Will this script change help attach the GPU to the latest 2.1 Debian 11 image (currently 2.1.53-debian11), or can we only use versions prior to and including 2.1.46-debian11?

santhoshvly commented 5 months ago

We have been running data pipelines using the latest Dataproc 2.1 images with GPUs attached, and they have been breaking for some time. However, the documentation does not mention this issue: https://cloud.google.com/dataproc/docs/concepts/compute/gpus. This makes Dataproc GPU clusters seem very unreliable if they can break at any time.

cjac commented 5 months ago

Yes, I agree. I'm doing some work internally to build and distribute the kernel drivers with the stock image. I hope to have the change reviewed and published this quarter.

You are correct that the initialization-actions script will presently only work with those versions mentioned. I will do some work today to see if I can build drivers from bullseye-backports.

cjac commented 5 months ago

I've had some luck building from the open source GitHub repo on the latest 2.1 images; I'm integrating these changes into the open PR now.
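
For context, building the open kernel modules from source follows roughly this shape. A sketch assuming NVIDIA's open-gpu-kernel-modules repository, with the tag matched to the userspace driver version in use:

# fetch the open kernel module source at a tag matching the userspace driver
git clone --branch 550.54.14 --depth 1 \
  https://github.com/NVIDIA/open-gpu-kernel-modules.git
cd open-gpu-kernel-modules

# build against the running kernel's headers and install the resulting modules
make modules -j"$(nproc)"
sudo make modules_install
sudo depmod -a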

cjac commented 5 months ago

The update is working on the latest 2.1 image.

Thu Jun 20 18:47:35 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.14              Driver Version: 550.54.14      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  NVIDIA L4                      Off |   00000000:00:03.0 Off |                    0 |
| N/A   76C    P0             37W /   72W |       0MiB /  23034MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+

+-----------------------------------------------------------------------------------------+
| Processes:                                                                              |
|  GPU   GI   CI        PID   Type   Process name                              GPU Memory |
|        ID   ID                                                               Usage      |
|=========================================================================================|
|  No running processes found                                                             |
+-----------------------------------------------------------------------------------------+

santhoshvly commented 5 months ago

Okay, cool. Thank you so much! So we should be able to attach the GPU to the latest 2.1 once you merge this PR, https://github.com/GoogleCloudDataproc/initialization-actions/pull/1190. Is that correct?

cjac commented 5 months ago

The update is also working on 2.0 images:

Thu Jun 20 19:08:20 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.14              Driver Version: 550.54.14      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  NVIDIA L4                      Off |   00000000:00:03.0 Off |                    0 |
| N/A   62C    P0             32W /   72W |       0MiB /  23034MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+

+-----------------------------------------------------------------------------------------+
| Processes:                                                                              |
|  GPU   GI   CI        PID   Type   Process name                              GPU Memory |
|        ID   ID                                                               Usage      |
|=========================================================================================|
|  No running processes found                                                             |
+-----------------------------------------------------------------------------------------+

cjac commented 5 months ago

> Okay, cool. Thank you so much! So we should be able to attach the GPU to the latest 2.1 once you merge this PR, https://github.com/GoogleCloudDataproc/initialization-actions/pull/1190. Is that correct?

Yes, #1190 is correct.

santhoshvly commented 2 months ago

@cjac I am facing the following error while attaching a GPU to a Dataproc 2.2 cluster:

The following packages have unmet dependencies:
 systemd : Depends: libsystemd0 (= 252.26-1~deb12u2)
E: Error, pkgProblemResolver::Resolve generated breaks, this may be caused by held packages.

cjac commented 2 months ago

TL;DR: Debian started enforcing the deprecation of apt-key add; the repo signing key must be moved to its own file and referenced by path in the sources.list file.

I am working on a fix. You can find a workaround at the end of install_gpu_driver.sh in my rapids work branch:

https://github.com/cjac/initialization-actions/blob/e43a1eaa402dc8a81aa8853cafb32e906f72f80f/gpu/install_gpu_driver.sh#L1077
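
The shape of that workaround, sketched here for a debian12-based image. The exact NVIDIA repo URL and key file name are assumptions; check the branch linked above for the authoritative version:

# fetch the repo signing key into its own keyring file
# instead of piping it through the deprecated `apt-key add`
curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64/3bf863cc.pub \
  | gpg --dearmor -o /usr/share/keyrings/cuda-archive-keyring.gpg

# reference the keyring by path from the sources.list entry
echo "deb [signed-by=/usr/share/keyrings/cuda-archive-keyring.gpg] https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64/ /" \
  > /etc/apt/sources.list.d/cuda.list

# rebuild the package cache so apt picks up the new trust configuration
apt-get update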

santhoshvly commented 2 months ago

@cjac Okay, thank you. I will try this workaround.

cjac commented 2 months ago

You can likely use that whole file if extracting the function is too complicated.
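
In practice that means staging the whole patched script in a bucket you control and pointing new clusters at that copy. A sketch in which the bucket name and paths are placeholders:

# stage the patched script from the work branch in your own bucket
gsutil cp gpu/install_gpu_driver.sh gs://my-bucket/actions/install_gpu_driver.sh

# create the cluster using the staged copy instead of the released script
gcloud dataproc clusters create my-gpu-cluster \
  --region us-central1 \
  --image-version 2.2-debian12 \
  --worker-accelerator type=nvidia-l4,count=1 \
  --initialization-actions gs://my-bucket/actions/install_gpu_driver.sh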

santhoshvly commented 2 months ago

@cjac Okay, thanks! We have disabled secure boot in Dataproc. Is that okay, or should we enable it?

santhoshvly commented 2 months ago

@cjac I tried the workaround script you mentioned, but it is still breaking with a similar error in Dataproc 2.2:

-----END PGP PUBLIC KEY BLOCK-----'

cjac commented 2 months ago

I didn't explicitly recommend that you run apt-get update after you fix the trust database. You'll still get the errors until you run apt-get update to rebuild the package cache. I'll encode that into the workaround.

cjac commented 2 months ago

The package cache update command is included in #1240 as commit 234515d.

santhoshvly commented 2 months ago

@cjac I tried with the package cache update, but I am getting the same error:

cAZUlaj3id3TxquAlud4lWDz =h5nH -----END PGP PUBLIC KEY BLOCK-----'