Closed smuzaffar closed 4 years ago
Here's a possibly more robust version, that avoids calling into python for the comparison:
#!/bin/bash -e
# Find a shared library, trying to emulate what ld.so would do.
# Globals:   LD_LIBRARY_PATH (read)
# Arguments: $1 - library name, used as a word-delimited grep pattern
# Outputs:   the resolved (readlink -f) path of every match, one per line and
#            without duplicates, in search order: LD_LIBRARY_PATH directories
#            first, then the entries listed by "ldconfig -p"
function find_library() {
  local -a search_dirs=()
  local dir path
  # Split LD_LIBRARY_PATH on ':' with read instead of building a command line
  # for eval, so directory names containing '$', backticks or quotes are used
  # literally and are never expanded or executed.
  IFS=: read -r -a search_dirs <<< "${LD_LIBRARY_PATH:-}" || true
  {
    for dir in "${search_dirs[@]}"; do
      # an empty entry cannot be listed by find; skip it
      if [ -n "${dir}" ]; then
        find "${dir}" -maxdepth 1 2> /dev/null || true
      fi
    done
    ldconfig -p | cut -s -d'>' -f2
  } | grep "\<$1\>" | while read -r path; do
    # read (default IFS) trims the leading blank left by the ldconfig output
    # while keeping blanks inside the path intact
    readlink -f -- "${path}" || true
  done | awk '!seen[$0]++'   # drop duplicates, adjacent or not, keeping order
}
# Extract the major, minor and patch library version as a single number.
# Arguments: $1 - path or file name of a shared library,
#                 e.g. /usr/lib64/libcuda.so.418.87.01
# Outputs:   MAJOR * 1000000 + MINOR * 1000 + PATCH on stdout; 0 when no
#            argument is given or the file name has no version after ".so"
function library_version() {
  local name suffix lead major minor patch rest
  if [ -z "${1:-}" ]; then
    echo 0
    return 0
  fi
  name=$(basename -- "$1")
  case "${name}" in
    *.so*) suffix=${name##*.so} ;;   # text after the last ".so", e.g. ".418.87.01"
    *)
      echo 0
      return 0
      ;;
  esac
  # the suffix starts with '.', so the first field is empty and is discarded;
  # anything beyond the third component lands in "rest" and is ignored
  IFS=. read -r lead major minor patch rest <<< "${suffix}" || true
  # keep digits only, so that a stray non-numeric component cannot break the
  # arithmetic below
  major=${major//[!0-9]/}
  minor=${minor//[!0-9]/}
  patch=${patch//[!0-9]/}
  # force base 10: a zero-padded component such as "08" would otherwise be
  # read as an (invalid) octal constant
  echo $(( 10#${major:-0} * 1000000 + 10#${minor:-0} * 1000 + 10#${patch:-0} ))
}
# Ask SCRAM for the CUDA_BASE tag of the cuda tool, dropping any line that
# reports a SCRAM error.
CUDA_BASE=$(${SCRAM} tool tag cuda CUDA_BASE 2>&1 | grep -v 'SCRAM error' || true)

# Nothing to do when the tag is empty or points to a path that does not exist.
if [ -z "${CUDA_BASE}" ] || [ ! -e "${CUDA_BASE}" ]; then
  exit 0
fi

NVIDIA_VERSION=
if [ ! -f /proc/driver/nvidia/version ]; then
  # the module is not loaded: check if a kernel module is available anyway
  # (e.g. for an OPTIMUS system)
  # if there are multiple modules, pick the newest one
  NVIDIA_MODULE=$(modprobe -q -R nvidia 2>/dev/null || true)
  if [ -n "$NVIDIA_MODULE" ]; then
    NVIDIA_VERSION=$(modinfo "$NVIDIA_MODULE" | grep '^version:' | sed 's|.*:\s*||;s|\s*$||')
  fi
else
  # the module is loaded and exported on /proc: read its version from there
  NVIDIA_VERSION=$(cat /proc/driver/nvidia/version | sed -ne's/.*Kernel Module *\([0-9.]\+\).*/\1/p')
fi

if [ -n "$NVIDIA_VERSION" ]; then
  # compare the libcuda.so found on the system with the one bundled with CMSSW;
  # the unquoted substitutions are word-split, so library_version sees the
  # first path as $1
  SYS_CUDA_VERSION=$(library_version $(find_library libcuda.so))
  CMS_CUDA_VERSION=$(library_version $(readlink -f ${CUDA_BASE}/drivers/libcuda.so))
  # when the CMSSW copy is older, emit nothing so the system library is used
  if (( CMS_CUDA_VERSION < SYS_CUDA_VERSION )); then
    exit 0
  fi
fi

# otherwise, advertise the drivers directory packaged with CMSSW
if [ -e "${CUDA_BASE}/drivers" ]; then
  echo "RUNTIME:path:append:LD_LIBRARY_PATH=${CUDA_BASE}/drivers"
fi
The reason for checking explicitly the system version of libcuda.so
is that the machine could already be set up with a "compatibility library" newer than the driver version.
Or it may lack a system library altogether and have only the driver (though that is unlikely).
@fwyzard , I tried your changes but for system cuda library I see this
+++ eval find '"/.singularity.d/libs"' -maxdepth 1
+++ ldconfig -p
+++ cut -s '-d>' -f2
++ library_version /.singularity.d/libs/libcuda.so /.singularity.d/libs/libcuda.so.1
++ '[' /.singularity.d/libs/libcuda.so ']'
++ basename /.singularity.d/libs/libcuda.so
++ sed -n '-es/.*\.so//p'
++ IFS=.
++ read EMPTY MAJOR MINOR PATCH
++ echo 0
++ IFS=.
++ read EMPTY MAJOR MINOR PATCH
+ SYS_CUDA_VERSION=0
so it is not getting the correct system cuda library version
and I think it should be
(( $CMS_CUDA_VERSION < $SYS_CUDA_VERSION )) && exit 0
instead of
(( CMS_CUDA_VERSION < SYS_CUDA_VERSION )) && exit 0
I think (numeric) variables are automatically expanded within (( ... ))
++ library_version /.singularity.d/libs/libcuda.so /.singularity.d/libs/libcuda.so.1
what do you get from readlink -f /.singularity.d/libs/libcuda.so
?
The CUDA libs are not symlinks [a]
[a]
Singularity> ls /.singularity.d/libs/libcuda* -l
-rwxr-xr-x 1 root root 17071216 Nov 13 2019 /.singularity.d/libs/libcuda.so
-rwxr-xr-x 1 root root 17071216 Nov 13 2019 /.singularity.d/libs/libcuda.so.1
I claim that to be a bug in Singularity, or at least in the way it was set up, since usually libraries are versioned only through their name.
Anyway, OK, then we should drop the check on system library and compare the CMS version with the driver version.
Where can I test this inside singularity ?
on one of ibmminsky-X
machines, just run cmssw-cc7 --nv
and then create cmssw dev area to test it
OK, this seems to be working both inside and outside of Singularity:
#! /bin/bash -e
# Find a shared library, trying to emulate what ld.so would do.
# Globals:   LD_LIBRARY_PATH (read)
# Arguments: $1 - library name, used as a word-delimited grep pattern
# Outputs:   the resolved (readlink -f) path of every match, one per line and
#            without duplicates, in search order: LD_LIBRARY_PATH directories
#            first, then the entries listed by "ldconfig -p"
function find_library() {
  local -a lib_dirs=()
  local lib_dir candidate
  # Split LD_LIBRARY_PATH on ':' with read rather than eval, so that a
  # directory name containing '$', backticks or quotes is taken literally
  # instead of being expanded or executed.
  IFS=: read -r -a lib_dirs <<< "${LD_LIBRARY_PATH:-}" || true
  {
    for lib_dir in "${lib_dirs[@]}"; do
      # an empty entry cannot be listed by find; skip it
      if [ -n "${lib_dir}" ]; then
        find "${lib_dir}" -maxdepth 1 2> /dev/null || true
      fi
    done
    ldconfig -p | cut -s -d'>' -f2
  } | grep "\<$1\>" | while read -r candidate; do
    # read (default IFS) strips the leading blank of the ldconfig entries but
    # leaves blanks inside a path untouched
    readlink -f -- "${candidate}" || true
  done | awk '!seen[$0]++'   # remove every duplicate while preserving the order
}
# Print the version suffix of a shared library file name.
# Arguments: $1 - path or file name of the library, e.g. libcuda.so.418.87.01
# Outputs:   the text following the last ".so." of the base name (here
#            "418.87.01"); nothing if there is no argument or no such suffix
function library_version() {
  local file_name
  [ -n "$1" ] || return 0
  file_name=$(basename "$1")
  case "${file_name}" in
    *.so.*) printf '%s\n' "${file_name##*.so.}" ;;
  esac
}
# Convert a dotted version string into a single comparable number.
# Arguments: $1 - version as MAJOR[.MINOR[.PATCH[...]]], e.g. 418.87.01
# Outputs:   MAJOR * 1000000 + MINOR * 1000 + PATCH on stdout; missing
#            components count as 0, and so does an empty or missing argument
function parse_version() {
  local major minor patch rest
  if [ -z "${1:-}" ]; then
    echo 0
    return 0
  fi
  # anything beyond the third component lands in "rest" and is ignored
  IFS=. read -r major minor patch rest <<< "$1" || true
  # keep digits only, so that a stray non-numeric component cannot break the
  # arithmetic below
  major=${major//[!0-9]/}
  minor=${minor//[!0-9]/}
  patch=${patch//[!0-9]/}
  # force base 10: a zero-padded component such as "08" would otherwise be
  # read as an (invalid) octal constant
  echo $(( 10#${major:-0} * 1000000 + 10#${minor:-0} * 1000 + 10#${patch:-0} ))
}
# Ask SCRAM for the CUDA_BASE tag of the cuda tool, dropping any line that
# reports a SCRAM error.
# NOTE(review): ${SCRAM} is left unquoted on purpose, in case it holds a command
# together with its options - confirm against the caller.
CUDA_BASE=$(${SCRAM} tool tag cuda CUDA_BASE 2>&1 | grep -v 'SCRAM error' || true)
# nothing to do without a CUDA installation that ships a drivers/ directory
if [ ! "${CUDA_BASE}" ] || [ ! -d "${CUDA_BASE}/" ] || [ ! -d "${CUDA_BASE}/drivers/" ]; then
  exit 0
fi

NVIDIA_VERSION=
# first, check if the module is loaded and exported on /proc
if [ -f /proc/driver/nvidia/version ]; then
  NVIDIA_VERSION=$(cat /proc/driver/nvidia/version | sed -ne's/.*Kernel Module *\([0-9.]\+\).*/\1/p')
else
  # check if a kernel module is available, even if not currently loaded
  # (e.g. for an OPTIMUS system)
  NVIDIA_MODULE=$(modprobe -q -R nvidia 2>/dev/null || true)
  if [ "$NVIDIA_MODULE" ]; then
    # modprobe -R may resolve the alias to several modules, one per line: hand
    # them to modinfo as separate arguments and pick the newest version
    mapfile -t NVIDIA_MODULE_LIST <<< "$NVIDIA_MODULE"
    NVIDIA_VERSION=$(modinfo "${NVIDIA_MODULE_LIST[@]}" | grep '^version:' | sed 's|.*:\s*||;s|\s*$||' | sort -V | tail -n 1)
  fi
fi

# check the version of libcuda.so bundled with CMSSW; the expansions are quoted
# so that a CUDA_BASE containing blanks or glob characters is handled as a
# single path
CMS_NVIDIA_VERSION=$(library_version "$(readlink -f "${CUDA_BASE}/drivers/libcuda.so")")
if [ "$NVIDIA_VERSION" ] && (( $(parse_version "$CMS_NVIDIA_VERSION") < $(parse_version "$NVIDIA_VERSION") )); then
  # if the CMSSW version of the library is older, use the system library
  exit 0
fi

# otherwise, use the library packaged with CMSSW
echo "RUNTIME:path:append:LD_LIBRARY_PATH=${CUDA_BASE}/drivers"
(edit: simplified the directory check)
Thanks @fwyzard , I will update the PR to use the updated script
Pull request #83 was updated.
Pull request #83 was updated.
A new Pull Request was created by @smuzaffar (Malik Shahzad Muzaffar) for branch scramv3.
@cmsbuild, @smuzaffar, @mrodozov can you please review it and eventually sign? Thanks. cms-bot commands are listed here