nanoporetech / dorado

Oxford Nanopore's Basecaller
https://nanoporetech.com/
Other
495 stars 59 forks source link

ln: failed to access ‘libnvToolsExt.so’: Too many levels of symbolic links #723

Closed CRL-CHAR closed 2 months ago

CRL-CHAR commented 6 months ago

Issue Report

When compiling the most recent version of Dorado (v.0.6.0) on my university's cluster, I ran into the following issue:

ln: failed to access ‘libnvToolsExt.so’: Too many levels of symbolic links

Please describe the issue:

1) Cloned the GitHub repo with "git clone https://github.com/nanoporetech/dorado.git dorado", then changed into the dorado directory. 2) Ran "cmake -S . -B cmake-build"; the error "ln: failed to access ‘libnvToolsExt.so’: Too many levels of symbolic links" appears at the very end of this cmake step.

Run environment:

Logs

-- Performing Test COMPILER_HAS_HIDDEN_VISIBILITY
-- Performing Test COMPILER_HAS_HIDDEN_VISIBILITY - Success
-- Performing Test COMPILER_HAS_HIDDEN_INLINE_VISIBILITY
-- Performing Test COMPILER_HAS_HIDDEN_INLINE_VISIBILITY - Success
-- Performing Test COMPILER_HAS_DEPRECATED_ATTR
-- Performing Test COMPILER_HAS_DEPRECATED_ATTR - Success
ln: failed to access ‘libnvToolsExt.so’: Too many levels of symbolic links
-- Configuring done
-- Generating done
-- Build files have been written to: /cmake-build

blawrence-ont commented 6 months ago

Hi @CRL-CHAR,

Can you run `find . -name "libnvToolsExt*" -exec ls -l {} \;` after the command fails? It should only print 3 lines, similar to this:

18     cmake-build/..../lib/libnvToolsExt.so -> libnvToolsExt.so.1
22     cmake-build/..../lib/libnvToolsExt.so.1 -> libnvToolsExt.so.1.0.0
40136  cmake-build/..../lib/libnvToolsExt.so.1.0.0
CRL-CHAR commented 6 months ago

Hi @blawrence-ont,

Here's the output after running the above command, `find . -name "libnvToolsExt*" -exec ls -l {} \;`:

lrwxrwxrwx 1 <uni_ID> 22 Nov 15 13:10 ./cmake-build/download/torch-2.0.0-ont.2-cxx11-abi-static-Linux/libtorch/lib/libnvToolsExt.so.1 -> libnvToolsExt.so.1.0.0
-rwxr-xr-x 1 <uni_ID> 40136 Nov 15 13:10 ./cmake-build/download/torch-2.0.0-ont.2-cxx11-abi-static-Linux/libtorch/lib/libnvToolsExt.so.1.0.0
lrwxrwxrwx 1 <uni_ID> 18 Nov 15 13:10 ./cmake-build/download/torch-2.0.0-ont.2-cxx11-abi-static-Linux/libtorch/lib/libnvToolsExt.so -> libnvToolsExt.so.1
lrwxrwxrwx 1 <uni_ID> 18 Apr  4 16:16 ./cmake-build/libnvToolsExt.so.1 -> libnvToolsExt.so.1
lrwxrwxrwx 1 <uni_ID> 22 Apr  4 16:16 ./cmake-build/libnvToolsExt.so.1.0.0 -> libnvToolsExt.so.1.0.0
lrwxrwxrwx 1 <uni_ID> 18 Apr  4 16:16 ./cmake-build/libnvToolsExt.so -> libnvToolsExt.so.1
blawrence-ont commented 6 months ago

It looks like you have a cyclic link here:

lrwxrwxrwx 1 <uni_ID> 18 Apr  4 16:16 ./cmake-build/libnvToolsExt.so.1 -> libnvToolsExt.so.1

All of the libnvToolsExt.so* should be in cmake-build/download too, so I'm not sure how that cyclic link would have got there.

Did you have any other issues before this one when trying to build? Can you try deleting cmake-build and re-running the cmake command? It'd be useful to know what's creating that file. FWIW, we do run builds on CentOS 7 and haven't seen this before.

julianlg92 commented 2 months ago

Hi, I'm having the same error on CentOS 7.9 in branch release-v0.7

cmake Output:

ln: failed to access ‘libnvToolsExt.so’: Too many levels of symbolic links
ln: failed to access ‘libnvToolsExt.so’: Too many levels of symbolic links
ln: failed to access ‘libnvToolsExt.so’: Too many levels of symbolic links

Symbolic links:

lrwxrwxrwx 1 root root 22 Jul  8 16:08 ./build/libnvToolsExt.so.1.0.0 -> libnvToolsExt.so.1.0.0
-rwxr-xr-x 1 root root 40136 Nov 15  2023 ./build/download/torch-2.0.0-ont.2-cxx11-abi-static-Linux/libtorch/lib/libnvToolsExt.so.1.0.0
lrwxrwxrwx 1 root root 18 Nov 15  2023 ./build/download/torch-2.0.0-ont.2-cxx11-abi-static-Linux/libtorch/lib/libnvToolsExt.so -> libnvToolsExt.so.1
lrwxrwxrwx 1 root root 22 Nov 15  2023 ./build/download/torch-2.0.0-ont.2-cxx11-abi-static-Linux/libtorch/lib/libnvToolsExt.so.1 -> libnvToolsExt.so.1.0.0
lrwxrwxrwx 1 root root 16 Jul  8 16:08 ./build/libnvToolsExt.so -> libnvToolsExt.so
lrwxrwxrwx 1 root root 18 Jul  8 16:08 ./build/libnvToolsExt.so.1 -> libnvToolsExt.so.1
root@u05:/home/software/dorado/v0.7.2# ll ./build/download/torch-2.0.0-ont.2-cxx11-abi-static-Linux/libtorch/lib/libnvToolsExt.so.1
blawrence-ont commented 2 months ago

Hi @julianlg92,

I'm still not able to reproduce the issue. Which version of cmake are you using? I do see the recursive symlinks locally, which isn't great, but cmake seems to ignore them and carry on. I've found the cause of those symlinks. As a hacky test to see whether removing that code fixes the issue for you, can you try deleting or commenting out these 8 lines: https://github.com/nanoporetech/dorado/blob/9ac85c65fc873a956bda00b2f5608b2bf72d9e7c/CMakeLists.txt#L422-L429

julianlg92 commented 2 months ago

Hi @blawrence-ont, thanks for your prompt response

Run Environment

I added some verbosity to the makefile and the bug took flight. I'm wondering whether a condition around those regex replacements might fix it.

Output

Target: /home/software/dorado/v0.7.2/build/download/torch-2.0.0-ont.2-cxx11-abi-static-Linux//libtorch/lib/libnvToolsExt.so
Target Name:
COMMAND: ln -rfs libnvToolsExt.so libnvToolsExt.so || WORKING_DIRECTORY: /home/software/dorado/v0.7.2/build
ln: failed to access ‘libnvToolsExt.so’: Too many levels of symbolic links
Link name: libnvToolsExt.so
Link base: libnvToolsExt.so
COMMAND: ln -rfs libnvToolsExt.so libnvToolsExt.so || WORKING_DIRECTORY: /home/software/dorado/v0.7.2/build
ln: failed to access ‘libnvToolsExt.so’: Too many levels of symbolic links
--------- endforeach ----------

Target: /home/software/dorado/v0.7.2/build/download/torch-2.0.0-ont.2-cxx11-abi-static-Linux//libtorch/lib/libnvToolsExt.so.1
Target Name:
COMMAND: ln -rfs libnvToolsExt.so.1 libnvToolsExt.so.1 || WORKING_DIRECTORY: /home/software/dorado/v0.7.2/build
ln: failed to access ‘libnvToolsExt.so.1’: Too many levels of symbolic links
Link name: libnvToolsExt.so.1
Link base: libnvToolsExt.so
COMMAND: ln -rfs libnvToolsExt.so.1 libnvToolsExt.so || WORKING_DIRECTORY: /home/software/dorado/v0.7.2/build
ln: failed to access ‘libnvToolsExt.so’: Too many levels of symbolic links
--------- endforeach ----------

Target: /home/software/dorado/v0.7.2/build/download/torch-2.0.0-ont.2-cxx11-abi-static-Linux//libtorch/lib/libnvToolsExt.so.1.0.0
Target Name:
COMMAND: ln -rfs libnvToolsExt.so.1.0.0 libnvToolsExt.so.1.0.0 || WORKING_DIRECTORY: /home/software/dorado/v0.7.2/build
ln: failed to access ‘libnvToolsExt.so.1.0.0’: Too many levels of symbolic links
Link name: libnvToolsExt.so.1.0.0
Link base: libnvToolsExt.so
COMMAND: ln -rfs libnvToolsExt.so.1.0.0 libnvToolsExt.so || WORKING_DIRECTORY: /home/software/dorado/v0.7.2/build
ln: failed to access ‘libnvToolsExt.so’: Too many levels of symbolic links
--------- endforeach ----------
blawrence-ont commented 2 months ago

Yep, the creation of the links should be conditional on them not being the same. Can you confirm that either deleting those lines or applying this patch solves the problem?

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1a22ec0b..284e63ea 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -423,6 +423,8 @@ if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
         # torch may bundle it's own specific copy of the cuda libs. if it does, we want everything to point at them
         file(GLOB TORCH_CUDA_LIBS "${TORCH_LIB}/lib/${LIB}")
         if(TORCH_CUDA_LIBS)
+            # Sort the list so that we process in order: libX.so -> libX.so.1 -> libX.so.1.1.1
+            list(SORT TORCH_CUDA_LIBS)
             foreach(TORCH_CUDA_LIB IN LISTS TORCH_CUDA_LIBS)
                 # create links to the torch bundled libs with hashes in the name
                 # e.g. libcublas.so.11 => libcublas-3b81d170.so.11
@@ -430,14 +432,18 @@ if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
                 string(REGEX REPLACE "-[0-9a-f]+[.]" "." link ${target})
                 get_filename_component(target_name ${target} NAME)
                 get_filename_component(link_name ${link} NAME)
+                if (NOT target_name STREQUAL link_name)
                     execute_process(COMMAND ln -rfs ${target_name} ${link_name} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
                     install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${link_name} DESTINATION lib COMPONENT redist_libs)

                     # create links to the versioned links above
                     # e.g. libcublas.so => libcublas.so.11
                     string(REGEX REPLACE "[.]so[.0-9]*$" ".so" base_link ${link_name})
+                    if (NOT base_link STREQUAL link_name)
                         execute_process(COMMAND ln -rfs ${link_name} ${base_link} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
                         install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${base_link} DESTINATION lib COMPONENT redist_libs)
+                    endif()
+                endif()
             endforeach()
         else()
             # bundle the libraries from the cuda toolkit
julianlg92 commented 2 months ago

Hi @blawrence-ont,

The patch solved the issue!! Thanks for your help and prompt response.

blawrence-ont commented 2 months ago

Glad to hear. The patch will land in master soon so the next release should have this fixed.