IIC-SIG-MLsys / HDDT

Distributed DNN Training on Heterogeneous GPUs
0 stars 6 forks source link

Error while building torch_app on Hygon GPU. #15

Closed derekwin closed 2 weeks ago

derekwin commented 3 weeks ago

Something goes wrong while building torch_app with the Torch C++ API on Hygon GPUs (DTK).

[  8%] Building HIP object CMakeFiles/hddt_rocm_shared.dir/src/net/rocm/net_rocm_reduce.hip.o
[ 16%] Building HIP object CMakeFiles/hddt_rocm_shared.dir/src/nn/rocm/nn_rocm_cnn.hip.o
[ 25%] Linking HIP shared library libhddt_rocm_shared.so
[ 25%] Built target hddt_rocm_shared
[ 33%] Building CXX object CMakeFiles/hddt_shared_lib.dir/src/net/net.cpp.o
[ 41%] Building CXX object CMakeFiles/hddt_shared_lib.dir/src/nn/nn.cpp.o
[ 50%] Linking CXX shared library libhddt_shared_lib.so
[ 50%] Built target hddt_shared_lib
[ 58%] Building CXX object apps/train_cnn/CMakeFiles/train_cnn.dir/main.cpp.o
[ 66%] Linking CXX executable train_cnn
[ 66%] Built target train_cnn
[ 75%] Building CXX object apps/simple_inference/CMakeFiles/simple_inference.dir/main.cpp.o
[ 83%] Linking CXX executable simple_inference
[ 83%] Built target simple_inference
[ 91%] Building CXX object apps/torch_app/CMakeFiles/torch_app.dir/main.cpp.o
cc1plus: warning: command-line option ‘-Wno-duplicate-decl-specifier’ is valid for C/ObjC but not for C++
In file included from /usr/include/c++/11/ext/hash_set:60,
                 from /home/liujinyao/miniconda3/envs/py310/lib/python3.10/site-packages/torch/include/glog/stl_logging.h:54,
                 from /home/liujinyao/miniconda3/envs/py310/lib/python3.10/site-packages/torch/include/c10/util/logging_is_google_glog.h:20,
                 from /home/liujinyao/miniconda3/envs/py310/lib/python3.10/site-packages/torch/include/c10/util/Logging.h:26,
                 from /home/liujinyao/miniconda3/envs/py310/lib/python3.10/site-packages/torch/include/ATen/core/ivalue_inl.h:25,
                 from /home/liujinyao/miniconda3/envs/py310/lib/python3.10/site-packages/torch/include/ATen/core/ivalue.h:1499,
                 from /home/liujinyao/miniconda3/envs/py310/lib/python3.10/site-packages/torch/include/ATen/core/List_inl.h:4,
                 from /home/liujinyao/miniconda3/envs/py310/lib/python3.10/site-packages/torch/include/ATen/core/List.h:490,
                 from /home/liujinyao/miniconda3/envs/py310/lib/python3.10/site-packages/torch/include/ATen/core/IListRef_inl.h:3,
                 from /home/liujinyao/miniconda3/envs/py310/lib/python3.10/site-packages/torch/include/ATen/core/IListRef.h:632,
                 from /home/liujinyao/miniconda3/envs/py310/lib/python3.10/site-packages/torch/include/ATen/WrapDimUtils.h:3,
                 from /home/liujinyao/miniconda3/envs/py310/lib/python3.10/site-packages/torch/include/ATen/TensorNames.h:3,
                 from /home/liujinyao/miniconda3/envs/py310/lib/python3.10/site-packages/torch/include/ATen/NamedTensorUtils.h:3,
                 from /home/liujinyao/miniconda3/envs/py310/lib/python3.10/site-packages/torch/include/torch/csrc/autograd/variable.h:11,
                 from /home/liujinyao/miniconda3/envs/py310/lib/python3.10/site-packages/torch/include/torch/csrc/autograd/autograd.h:3,
                 from /home/liujinyao/miniconda3/envs/py310/lib/python3.10/site-packages/torch/include/torch/csrc/api/include/torch/autograd.h:3,
                 from /home/liujinyao/miniconda3/envs/py310/lib/python3.10/site-packages/torch/include/torch/csrc/api/include/torch/all.h:7,
                 from /home/liujinyao/miniconda3/envs/py310/lib/python3.10/site-packages/torch/include/torch/csrc/api/include/torch/torch.h:3,
                 from /home/liujinyao/HDDT/apps/torch_app/main.cpp:2:
/usr/include/c++/11/backward/backward_warning.h:32:2: warning: #warning This file includes at least one deprecated or antiquated header which may be removed without further notice at a future date. Please use a non-deprecated interface with equivalent functionality instead. For a listing of replacement headers and interfaces, consult the file backward_warning.h. To disable this warning use -Wno-deprecated. [-Wcpp]
   32 | #warning \
      |  ^~~~~~~
[100%] Linking CXX executable torch_app
/usr/bin/ld: /home/liujinyao/miniconda3/envs/py310/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so: undefined reference to `google::base::CheckOpMessageBuilder::NewString()'
collect2: error: ld returned 1 exit status
make[2]: *** [apps/torch_app/CMakeFiles/torch_app.dir/build.make:115: apps/torch_app/torch_app] Error 1
make[1]: *** [CMakeFiles/Makefile2:242: apps/torch_app/CMakeFiles/torch_app.dir/all] Error 2
make: *** [Makefile:136: all] Error 2
Firefly-Dance commented 3 weeks ago

The following CMakeLists.txt works for me on a Hygon GPU:

# Standalone build script for a small LibTorch (PyTorch C++ API) program on a
# Hygon DTK machine.
#
# Machine-specific locations are cache variables, so they can be overridden at
# configure time instead of editing this file, for example:
#   cmake -DTORCH_CMAKE_DIR=<site-packages>/torch/share/cmake/Torch ..
# Their defaults are the paths this script was originally written with.

# 3.5 is the oldest policy level that current CMake releases accept without a
# deprecation warning.
# NOTE(review): the upper bound stays below 3.27 so that the FindCUDA module
# (removed under policy CMP0146) remains available in case Torch's package
# files still use it -- confirm before raising it.
cmake_minimum_required(VERSION 3.5...3.26)

# Default to g++ unless a compiler was given with -DCMAKE_CXX_COMPILER=...
# The compiler has to be chosen before project(); forcing it into the cache
# afterwards is too late and also discards the user's choice.
if(NOT CMAKE_CXX_COMPILER)
  set(CMAKE_CXX_COMPILER "g++" CACHE STRING "C++ compiler")
endif()

project(simple-tensor-operations)

# Build as strict C++17 (no compiler-specific extensions).
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)

# Path to the amd_comgr CMake package directory (under the DTK install).
set(amd_comgr_DIR "/opt/dtk/lib64/cmake/amd_comgr" CACHE STRING "Path to amd_comgr")

# Directory that contains TorchConfig.cmake.
set(TORCH_CMAKE_DIR
    "/home/sdu/miniforge3/envs/pytorch21/lib/python3.10/site-packages/torch/share/cmake/Torch"
    CACHE PATH "Directory containing TorchConfig.cmake")

# Search the Torch and amd_comgr locations first, ahead of any prefix path the
# user supplied.
set(CMAKE_PREFIX_PATH "${TORCH_CMAKE_DIR}" "${amd_comgr_DIR}" ${CMAKE_PREFIX_PATH})

find_package(Torch REQUIRED)

# Directory holding the Torch shared libraries. Unless given explicitly it is
# derived from the Torch package that was actually found, so it cannot drift
# from TORCH_CMAKE_DIR.
# NOTE(review): relies on TorchConfig.cmake setting TORCH_INSTALL_PREFIX.
set(TORCH_DIR "" CACHE PATH "Directory containing the Torch shared libraries")
if(NOT TORCH_DIR AND TORCH_INSTALL_PREFIX)
  set(TORCH_DIR "${TORCH_INSTALL_PREFIX}/lib")
endif()

# Let the installed executable locate the Torch libraries at run time.
# These must be set before the target is created to take effect.
set(CMAKE_INSTALL_RPATH "${TORCH_DIR}")
set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)

# Nothing else in this file defines GLOG_LIBRARY, so resolve it here. A value
# passed with -DGLOG_LIBRARY=... is kept as-is. The Torch library directory is
# searched first, then the default system locations.
# NOTE(review): presumably this Torch build needs glog at link time (symptom:
# undefined reference to google::base::CheckOpMessageBuilder::NewString()).
find_library(GLOG_LIBRARY NAMES glog HINTS "${TORCH_DIR}")

add_executable(simple-tensor-operations main.cpp)

# TORCH_CXX_FLAGS only has a value after find_package(Torch), so it is applied
# here, to this target, rather than appended to CMAKE_CXX_FLAGS earlier.
separate_arguments(torch_cxx_flag_list UNIX_COMMAND "${TORCH_CXX_FLAGS}")
if(torch_cxx_flag_list)
  target_compile_options(simple-tensor-operations PRIVATE ${torch_cxx_flag_list})
endif()

target_link_libraries(simple-tensor-operations PRIVATE ${TORCH_LIBRARIES})

# Link glog by full path when it was found; otherwise make the omission visible
# instead of silently linking nothing.
if(GLOG_LIBRARY)
  target_link_libraries(simple-tensor-operations PRIVATE "${GLOG_LIBRARY}")
else()
  message(WARNING
    "glog was not found (searched '${TORCH_DIR}' and the default paths); "
    "pass -DGLOG_LIBRARY=/path/to/libglog.so if linking fails with "
    "undefined google::* references.")
endif()
derekwin commented 2 weeks ago

The build passes now with https://github.com/IIC-SIG-MLsys/HDDT/pull/26