ROCm / HIP

HIP: C++ Heterogeneous-Compute Interface for Portability
https://rocmdocs.amd.com/projects/HIP/
MIT License
3.68k stars 527 forks source link

How do I add HIP support to an existing cmake project? The link costs too much time. #1029

Closed xiaocenxiaocen closed 3 months ago

xiaocenxiaocen commented 5 years ago

Hello guys.. How do I add HIP support to an existing cmake project. I followed this issue #231 and had tried 4 approaches, but only one approach work correctly. So what is the correct way to add HIP support to an existing cmake project?

I am giving the detail information as follow: System information

define HIP_PLATFORM_HCC

// hip header file

include "hip/hip_runtime.h"

include "transpose.cuh"

define WIDTH 1024

define NUM (WIDTH * WIDTH)

// CPU implementation of matrix transpose void matrixTransposeCPUReference(float output, float input, const unsigned int width) { for (unsigned int j = 0; j < width; j++) { for (unsigned int i = 0; i < width; i++) { output[i width + j] = input[j width + i]; } } }

int main() { float Matrix; float TransposeMatrix; float* cpuTransposeMatrix;

float* gpuMatrix;
float* gpuTransposeMatrix;

hipDeviceProp_t devProp;
hipGetDeviceProperties(&devProp, 0);

std::cout << "Device name " << devProp.name << std::endl;

int i;
int errors;

Matrix = (float*)malloc(NUM * sizeof(float));
TransposeMatrix = (float*)malloc(NUM * sizeof(float));
cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));

// initialize the input data
for (i = 0; i < NUM; i++) {
    Matrix[i] = (float)i * 10.0f;
}

// allocate the memory on the device side
hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));

// Memory transfer from host to device
hipMemcpy(gpuMatrix, Matrix, NUM * sizeof(float), hipMemcpyHostToDevice);

// Lauching kernel from host
exec(gpuTransposeMatrix, gpuMatrix, WIDTH);

// Memory transfer from device to host
hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM * sizeof(float),    hipMemcpyDeviceToHost);
// CPU MatrixTranspose computation
matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);

// verify the results
errors = 0;
double eps = 1.0E-6;
for (i = 0; i < NUM; i++) {
    if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps) {
        errors++;
    }
}
if (errors != 0) {
    printf("FAILED: %d errors\n", errors);
} else {
    printf("PASSED!\n");
}

// free the resources on device side
hipFree(gpuMatrix);
hipFree(gpuTransposeMatrix);

// free the resources on host side
free(Matrix);
free(TransposeMatrix);
free(cpuTransposeMatrix);

return errors;

}

MatrixTranspose.cpp
```CUDA
#include <iostream>

#include "hip/hip_runtime_api.h"
#include "hip/hip_runtime.h"
#define WIDTH 1024

#define NUM (WIDTH * WIDTH)

#define THREADS_PER_BLOCK_X 4
#define THREADS_PER_BLOCK_Y 4
#define THREADS_PER_BLOCK_Z 1

// Device (Kernel) function, it must be void
__global__ void
__attribute__((visibility("default")))
matrixTranspose(float* out, float* in, const int width) {
    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
    int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;

    out[y * width + x] = in[x * width + y];

}

void exec(float* gpuTransposeMatrix, float* gpuMatrix, const int width) {
    // Lauching kernel from host
    hipLaunchKernelGGL(
        matrixTranspose,
        dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
        dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0,
        gpuTransposeMatrix, gpuMatrix, WIDTH);
}

transpose.cuh

#define THREADS_PER_BLOCK_X 4
#define THREADS_PER_BLOCK_Y 4
#define THREADS_PER_BLOCK_Z 1

void exec(float* out, float* in, const int width);

CMakeList.txt

cmake_minimum_required(VERSION 2.8.3)

if(NOT DEFINED HIP_PATH)
    if(NOT DEFINED ENV{HIP_PATH})
        set(HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to which HIP has been installed")
    else()
        set(HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to which HIP has been installed")
    endif()
endif()
set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH})

project(12_cmake)

find_package(HIP QUIET)
if(HIP_FOUND)
    message(STATUS "Found HIP: " ${HIP_VERSION})
else()
    message(FATAL_ERROR "Could not find HIP. Ensure that HIP is either installed in /opt/rocm/hip or the variable HIP_PATH is set to point to the right location.")
endif()

set(MY_SOURCE_FILES MatrixTranspose.cpp)
set(MY_TARGET_NAME MatrixTranspose)
set(MY_HIPCC_OPTIONS )
set(MY_HCC_OPTIONS )
set(MY_NVCC_OPTIONS )

set (CMAKE_LINKER "/opt/rocm/bin/hipcc")
set (CMAKE_CXX_LINK_EXECUTABLE "<CMAKE_LINKER> <FLAGS> <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES>")

set_source_files_properties(${MY_SOURCE_FILES} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
hip_add_library(${MY_TARGET_NAME} ${MY_SOURCE_FILES} HIPCC_OPTIONS ${MY_HIPCC_OPTIONS} HCC_OPTIONS ${MY_HCC_OPTIONS} NVCC_OPTIONS ${MY_NVCC_OPTIONS})
set (SOURCES main.cpp)
#add_custom_target(combined COMMAND ar -x $<TARGET_FILE:MatrixTranspose>
#                WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
#                DEPENDS MatrixTranspose
#                )
#add_library (hip_cmake_lib STATIC main.cpp)
#add_dependencies(hip_cmake_lib MatrixTranspose)
add_executable (hip_cmake_test main.cpp)
target_link_libraries (hip_cmake_test MatrixTranspose hip_hcc)

== hipconfig HIP_PATH : /opt/rocm HIP_PLATFORM : hcc CPP_CONFIG : -DHIP_PLATFORM_HCC= -I/opt/rocm/include -I/opt/rocm/hcc/include

== hcc HSA_PATH : /opt/rocm/hsa HCC_HOME : /opt/rocm/hcc HCC clang version 9.0.0 (/data/jenkins_workspace/compute-rocm-rel-2.3/external/hcc-tot/clang 785f31db116e742ac53d052e207979869a857d1a) (/data/jenkins_workspace/compute-rocm-rel-2.3/external/hcc-tot/compiler 87f982f8ce2b85ce824f91bf8c2c90f6843a50a3) (based on HCC 1.3.19115-9b3a740-785f31d-87f982f ) Target: x86_64-unknown-linux-gnu Thread model: posix InstalledDir: /opt/rocm/hcc/bin LLVM (http://llvm.org/): LLVM version 9.0.0svn Optimized build. Default target: x86_64-unknown-linux-gnu Host CPU: haswell

Registered Targets: amdgcn - AMD GCN GPUs r600 - AMD GPUs HD2XXX-HD6XXX x86 - 32-bit X86: Pentium-Pro and above x86-64 - 64-bit X86: EM64T and AMD64 HCC-cxxflags : -hc -std=c++amp -I/opt/rocm/hcc/include -I/opt/rocm/includeHCC-ldflags : -hc -std=c++amp -L/opt/rocm/hcc/lib -Wl,--rpath=/opt/rocm/hcc/lib -ldl -lm -lpthread -lhc_am -Wl,--whole-archive -lmcwamp -Wl,--no-whole-archive ... == Linux Kernel Hostname : ##### Linux qiang 4.13.16-041316-generic #201711240901 SMP Fri Nov 24 09:02:42 UTC 2017 x86_64 x86_64 x86_64 GNU/Linux No LSB modules are available. Distributor ID: Ubuntu Description: Ubuntu 16.04.5 LTS Release: 16.04 Codename: xenial


**Describe the current behavior**

- If I compile the device code via macro hip_add_library to a .a file, and compile the host code with gcc. And then link the device object code and host object code with hipcc, the generated executable runs correctly. But the link costs to much time in our machine learning framework.

- If I compile the device code via macro hip_add_library to a .so file (i.e. pass SHARED in hip_add_library), and compile the host code with gcc. And then link the device object code and host object code with hipcc. When I run the executable, it throws an error:

Device name Device 6860 terminate called after throwing an instance of 'std::exception' what(): std::exception [1] 38576 abort (core dumped) ./hip_cmake_test

I have investigated this error, the exception is thrown at
`hip/hcc_detail/functional_grid_launch.hpp:108`
if (it == function_names().cend()) {
    hip_throw(std::runtime_error{"Undefined __global__ function."});
}

- If I compile the device code via macro hip_add_library to a .so file, and compile the host code with gcc. And then link the device object code and host object code with gcc. When I run the executable, it throws the same error as the above situation.

-If I compile the device code via macro hip_add_library to a .a file, and compile the host code with gcc. And then link the device object code and host object code with gcc. A link error occurs.

/usr/bin/x86_64-linux-gnu-ld: libMatrixTranspose.a(MatrixTranspose_generated_MatrixTranspose.cpp.o): undefined reference to symbol 'hsa_system_major_extension_supported@@ROCR_1' /opt/rocm/hsa/lib/libhsa-runtime64.so.1: error adding symbols: DSO missing from command line collect2: error: ld returned 1 exit status CMakeFiles/hip_cmake_test.dir/build.make:95: recipe for target 'hip_cmake_test' failed make[2]: [hip_cmake_test] Error 1 CMakeFiles/Makefile2:104: recipe for target 'CMakeFiles/hip_cmake_test.dir/all' failed make[1]: [CMakeFiles/hip_cmake_test.dir/all] Error 2 Makefile:83: recipe for target 'all' failed make: *** [all] Error 2


**Describe the excepted behavior**

- One can compile the device code via hip_add_library, and compile host code with gcc. And then link the device object and host object with gcc/hipcc. The link time is within a reasonable range.

Please assist how to resolve this problem

Thanks and Regards.
yxsamliu commented 5 years ago

You may try use hip-clang for compilation and get better chance. hip-clang embed kernel as strings with internal linkage therefore does not need kernel symbols in the elf itself. You may check HIP installation guide about how to use hip-clang with HIP.

tcojean commented 4 years ago

After stumbling on this same exact problem, I ended up finding a solution to this. Thanks a lot @xiaocenxiaocen for the small example, that helped a lot.

The fix here also fixes some other problems that arise during the linking step of the shared library creation. In particular, when using the target exported from CMake, i.e. in /opt/rocm/hcc/lib/cmake/hcc/hcc-targets.cmake the flag -hc is passed for INTERFACE_LINK_LIBRARIES of the hcc::hccshared target. This flag is only available for hipcc, but hipcc is never used when linking at shared library creation step, except when doing the change I did here.

The changes are relatively small: passing in the SHARED into hip_add_library and changing the various CMAKE_LINKER related settings.

Afterwards, calling the following in a build directory works properly.

cmake -DBUILD_SHARED_LIBS=on ..
export LD_LIBRARY_PATH=$PWD:$LD_LIBRARY_PATH #rpath is not set through hipcc
# either pass the rpath by hand in the CMakeLists.txt or do this
./hip_cmake_test

Final CMakeLists.txt

cmake_minimum_required(VERSION 3.8)

if(NOT DEFINED HIP_PATH)
    if(NOT DEFINED ENV{HIP_PATH})
        set(HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to which HIP has been installed")
    else()
        set(HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to which HIP has been installed")
    endif()
endif()

if(NOT DEFINED ROCM_PATH)
    if(DEFINED ENV{ROCM_PATH})
        set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "Path to which ROCM has been installed")
    elseif(DEFINED ENV{HIP_PATH})
        set(ROCM_PATH "$ENV{HIP_PATH}/.." CACHE PATH "Path to which ROCM has been installed")
    else()
        set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to which ROCM has been installed")
    endif()
endif()

if(NOT DEFINED HCC_PATH)
    if(DEFINED ENV{HCC_PATH})
        set(HCC_PATH $ENV{HCC_PATH} CACHE PATH "Path to which HCC has been installed")
    else()
        set(HCC_PATH "${ROCM_PATH}/hcc" CACHE PATH "Path to which HCC has been installed")
    endif()
    set(HCC_HOME "${HCC_PATH}")
endif()

if(NOT DEFINED HIP_CLANG_PATH)
    if(NOT DEFINED ENV{HIP_CLANG_PATH})
        set(HIP_CLANG_PATH "${ROCM_PATH}/llvm/bin" CACHE PATH "Path to which HIP compatible clang binaries have been installed")
    else()
        set(HIP_CLANG_PATH $ENV{HIP_CLANG_PATH} CACHE PATH "Path to which HIP compatible clang binaries have been installed")
    endif()
endif()

set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH})
list(APPEND CMAKE_PREFIX_PATH
    "${HIP_PATH}/lib/cmake"
    "${HIP_PATH}/../lib/cmake" # hopefully catches all extra HIP dependencies
)

project(12_cmake)

find_package(HIP QUIET)
if(HIP_FOUND)
    message(STATUS "Found HIP: " ${HIP_VERSION})
else()
    message(FATAL_ERROR "Could not find HIP. Ensure that HIP is either installed in /opt/rocm/hip or the variable HIP_PATH is set to point to the right location.")
endif()
find_package(hip REQUIRED)

# For ROCm >=3.5, wipe hip-clang specific interface options which are propagated
set_target_properties(hip::device PROPERTIES INTERFACE_COMPILE_OPTIONS "-fPIC")
set_target_properties(hip::device PROPERTIES INTERFACE_LINK_LIBRARIES "hip::host")

set(MY_SOURCE_FILES MatrixTranspose.cpp)
set(MY_TARGET_NAME MatrixTranspose)
set(MY_HIPCC_OPTIONS "-fPIC")
set(MY_HCC_OPTIONS )
set(MY_NVCC_OPTIONS )
set(MY_CLANG_OPTIONS )

if (BUILD_SHARED_LIBS)
    set(STATIC_OR_SHARED SHARED)
else()
    set(STATIC_OR_SHARED STATIC)
endif()

set_source_files_properties(${MY_SOURCE_FILES} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
hip_add_library(${MY_TARGET_NAME} ${MY_SOURCE_FILES} HIPCC_OPTIONS "${MY_HIPCC_OPTIONS}" HCC_OPTIONS "${MY_HCC_OPTIONS}" NVCC_OPTIONS "${MY_NVCC_OPTIONS}" CLANG_OPTIONS "${MY_CLANG_OPTIONS}" ${STATIC_OR_SHARED})
set_target_properties(${MY_TARGET_NAME} PROPERTIES LINKER_LANGUAGE HIP)
target_link_libraries (${MY_TARGET_NAME} PRIVATE hip::device)

set (SOURCES main.cpp)
add_executable (hip_cmake_test main.cpp)
target_include_directories(hip_cmake_test
    PRIVATE
        $<BUILD_INTERFACE:${HIP_PATH}/include>
        $<BUILD_INTERFACE:${HIP_PATH}/../include>)
target_link_libraries (hip_cmake_test MatrixTranspose hip::device)

ping @hartwiganzt

tcojean commented 4 years ago

Another way to fix this issue is to call set_target_properties(${MY_TARGET_NAME} PROPERTIES LINKER_LANGUAGE HIP) after calling hip_add_library. Setting HCC_PATH before find_package(HIP QUIET) is also required. In this case, the custom CMAKE_CXX_CREATE_SHARED_LIBRARY is not required.

ppanchad-amd commented 4 months ago

@xiaocenxiaocen Do you still need assistance with this ticket? If not, please close ticket. Thanks!