Closed RavenOchlich closed 1 year ago
I managed to fix the previous error by deleting the Bazel cache with `rm -rf .cache/bazel`,
but now I am getting the following error. The old rocm-device-libs-4.5.2-1 package (https://archlinux.pkgs.org/rolling/jlk-x86_64/rocm-device-libs-4.5.2-1-x86_64.pkg.tar.zst.html) provided the missing file (`/opt/rocm/amdgcn/bitcode/hc.bc`, see the log below), but the new 5.0.2 version (https://archlinux.pkgs.org/rolling/archlinuxcn-x86_64/rocm-device-libs-5.0.2-1-x86_64.pkg.tar.zst.html) does not.
2022-03-09 22:07:06.755044: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:237] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2022-03-09 22:07:06.901240: E tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc:292] bitcode module is required by this HLO module but was not found at /opt/rocm/amdgcn/bitcode/hc.bc
2022-03-09 22:07:06.901291: E tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc:292] bitcode module is required by this HLO module but was not found at /opt/rocm/amdgcn/bitcode/hc.bc
2022-03-09 22:07:06.901319: E tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc:292] bitcode module is required by this HLO module but was not found at /opt/rocm/amdgcn/bitcode/hc.bc
2022-03-09 22:07:06.901344: E tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc:292] bitcode module is required by this HLO module but was not found at /opt/rocm/amdgcn/bitcode/hc.bc
2022-03-09 22:07:06.901904: E tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc:292] bitcode module is required by this HLO module but was not found at /opt/rocm/amdgcn/bitcode/hc.bc
2022-03-09 22:07:06.902239: E tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc:292] bitcode module is required by this HLO module but was not found at /opt/rocm/amdgcn/bitcode/hc.bc
2022-03-09 22:07:06.903199: E tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc:292] bitcode module is required by this HLO module but was not found at /opt/rocm/amdgcn/bitcode/hc.bc
2022-03-09 22:07:06.903940: E tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc:292] bitcode module is required by this HLO module but was not found at /opt/rocm/amdgcn/bitcode/hc.bc
error: Failure when generating HSACO
error: Failure when generating HSACO
error: Failure when generating HSACO
error: Failure when generating HSACO
error: Failure when generating HSACO
error: Failure when generating HSACO
error: Failure when generating HSACO
error: Failure when generating HSACO
2022-03-09 22:07:06.906725: E tensorflow/compiler/mlir/tools/kernel_gen/tf_to_kernel.cc:195] INTERNAL: Generating device code failed.
I was able to build the most recent version of TensorFlow from the AMD TensorFlow GitHub repository with the following PKGBUILD and patch. It seems TensorFlow version 2.8.0 doesn't support ROCm 5.0.2, whereas the latest GitHub version works fine with ROCm 5.0.2. It is possible that some dependencies are missing from my PKGBUILD.
PKGBUILD
# Maintainer: acxz <akashpatel2008 at yahoo dot com>
# Contributor: Sven-Hendrik Haase <svenstaro@gmail.com>
# Contributor: Konstantin Gizdov (kgizdov) <arch@kge.pw>
# Contributor: Adria Arrufat (archdria) <adria.arrufat+AUR@protonmail.ch>
# Contributor: Thibault Lorrain (fredszaq) <fredszaq@gmail.com>
pkgbase=tensorflow-rocm

# Flags for building without/with cpu optimizations.
# NOTE(review): only _build_no_opt is consulted in the visible part of this
# file; _build_opt is kept so existing overrides keep working.
_build_no_opt=1
_build_opt=1

# Split packages produced by this PKGBUILD (empty when the build is disabled).
pkgname=()
if [ "$_build_no_opt" -eq 1 ]; then
  pkgname+=(tensorflow-rocm python-tensorflow-rocm)
fi

pkgver=2.8.0
_pkgver=2.8.0
pkgrel=1
pkgdesc="Library for computation using data flow graphs for scalable machine learning"
url="https://www.tensorflow.org/"
license=('APACHE')
arch=('x86_64')
depends=('c-ares' 'intel-mkl' 'onednn' 'pybind11' 'openssl' 'lmdb' 'libpng' 'curl' 'giflib' 'icu' 'libjpeg-turbo')
# Each build-time dependency is listed exactly once.
makedepends=('bazel' 'python-numpy' 'rocm-hip-sdk' 'miopen' 'rccl' 'git'
             'python-pip' 'python-wheel' 'python-setuptools' 'python-h5py'
             'python-keras-applications' 'python-keras-preprocessing'
             'cython' 'rocblas' 'rocrand' 'rocfft' 'hipfft' 'roctracer'
             'hipsparse' 'hipsolver' 'rocsolver')
optdepends=('tensorboard: Tensorflow visualization toolkit')
# The checkout directory is named after pkgbase, not pkgname: pkgname is an
# array that may be empty, while every function in this file expects the
# sources in "${srcdir}"/tensorflow-rocm.
source=("${pkgbase}::git+https://github.com/ROCmSoftwarePlatform/tensorflow-upstream.git"
        fix-c++17-compat.patch
        hiprand.patch)
sha512sums=('SKIP'
            'f682368bb47b2b022a51aa77345dfa30f3b0d7911c56515d428b8326ee3751242f375f4e715a37bb723ef20a86916dad9871c3c81b1b58da85e1ca202bc4901e'
            'b4650e9e28da71c6376096cc1ebc284f03c36bfad20bd8917a0f44d69457e11b2dfd95d677c77a30bab420cd8a48c7b6e8bba97fd1c0cc0dea772460e13236da')

# consolidate common dependencies to prevent mishaps
_common_py_depends=(
  python-termcolor
  python-astor
  python-gast03
  python-numpy
  python-protobuf
  absl-py
  python-h5py
  python-keras
  python-keras-applications
  python-keras-preprocessing
  python-tensorflow-estimator
  python-opt_einsum
  python-astunparse
  python-pasta
  python-flatbuffers
)
get_pyver () {
  # Print "<major>.<minor>" of the `python` interpreter found on PATH
  # (e.g. "3.10"). The exit status is that of the interpreter.
  local _version_snippet='import sys; print(str(sys.version_info[0]) + "." + str(sys.version_info[1]))'
  python -c "${_version_snippet}"
}
check_dir() {
  # Make sure the given path is an existing directory.
  #
  # Arguments: $1 - path that must be a directory
  # Outputs:   diagnostics on stderr
  # Returns:   0 when the directory exists, or when neither 'cp' nor 'install'
  #            can be found (the PKGBUILD is then assumed to be under static
  #            analysis and the check is skipped)
  # Exits:     terminates the calling shell with status 1 when the path is
  #            missing or is not a directory

  # first make sure we do not break parsepkgbuild
  if ! command -v cp &> /dev/null; then
    >&2 echo "'cp' command not found. PKGBUILD is probably being checked by parsepkgbuild."
    if ! command -v install &> /dev/null; then
      >&2 echo "'install' command also not found. PKGBUILD must be getting checked by parsepkgbuild."
      >&2 echo "Cannot check if directory '${1}' exists. Ignoring."
      # The tool is called "namcap"; the hint previously misspelled it.
      >&2 echo "If you are not running namcap or parsepkgbuild, please make sure the PATH is correct and try again."
      >&2 echo "PATH should not be '/dummy': PATH=$PATH"
      return 0
    fi
  fi

  # if we are running normally, check the given path
  if [ -d "${1}" ]; then
    return 0
  fi

  # Deliberately 'exit' rather than 'return': callers rely on a broken search
  # path aborting the whole packaging run.
  >&2 echo "Directory ${1} does not exist or is a file! Exiting..."
  exit 1
}
# Prepare the checked-out sources for building.
#
# Steps, in order:
#   1. relax the bazel version pin and the pinned dependency versions in
#      setup.py (both written through paths relative to the current directory,
#      so this must run from the directory that contains tensorflow-rocm/),
#   2. export the environment variables that describe the build configuration,
#   3. enter the source tree and apply hiprand.patch.
#
# Globals read:    srcdir
# Globals written: a long list of exported TF_*/compiler variables and
#                  BAZEL_ARGS (see below)
prepare() {
# Allow any bazel version
echo "*" > tensorflow-rocm/.bazelversion
# Get rid of hardcoded versions. Not like we ever cared about what upstream
# thinks about which versions should be used anyway. ;) (FS#68772)
# The expression rewrites quoted requirement strings of the form
# '<name> <op>= <version>' to just '<name>'.
sed -i -E "s/'([0-9a-z_-]+) .= [0-9].+[0-9]'/'\1'/" tensorflow-rocm/tensorflow/tools/pip_package/setup.py
# These environment variables influence the behavior of the configure call below.
# (In this file ./configure is invoked from build(), so the exports have to
# survive until then.)
export PYTHON_BIN_PATH=/usr/bin/python
export USE_DEFAULT_PYTHON_LIB_PATH=1
export TF_NEED_JEMALLOC=1
export TF_NEED_KAFKA=1
export TF_NEED_OPENCL_SYCL=0
export TF_NEED_AWS=1
export TF_NEED_GCP=1
export TF_NEED_HDFS=1
export TF_NEED_S3=1
export TF_ENABLE_XLA=1
export TF_NEED_GDR=0
export TF_NEED_VERBS=0
export TF_NEED_OPENCL=0
export TF_NEED_MPI=0
export TF_NEED_TENSORRT=0
export TF_NEED_NGRAPH=0
export TF_NEED_IGNITE=0
export TF_NEED_ROCM=1
# Only a single GPU architecture is targeted.
# NOTE(review): presumably this has to be adjusted for other AMD GPUs - confirm.
export TF_ROCM_AMDGPU_TARGETS=gfx900
# See https://github.com/tensorflow/tensorflow/blob/master/third_party/systemlibs/syslibs_configure.bzl
export TF_SYSTEM_LIBS="boringssl,curl,cython,gif,icu,libjpeg_turbo,lmdb,nasm,png,pybind11,zlib"
export TF_SET_ANDROID_WORKSPACE=0
export TF_DOWNLOAD_CLANG=0
# Major.minor of the nccl pkg-config module; empty if pkg-config does not know
# 'nccl'. Because the substitution is an argument of 'export', a failing probe
# does not fail this line.
# NOTE(review): this and the CUDA/cuDNN probes further down look like leftovers
# from a CUDA build; build() sets TF_NEED_CUDA=0 - confirm they are still needed.
export TF_NCCL_VERSION=$(pkg-config nccl --modversion | grep -Po '\d+\.\d+')
export TF_IGNORE_MAX_BAZEL_VERSION=1
export TF_MKL_ROOT=/opt/intel/mkl
export NCCL_INSTALL_PATH=/usr
export GCC_HOST_COMPILER_PATH=/usr/bin/gcc
export HOST_C_COMPILER=/usr/bin/gcc
export HOST_CXX_COMPILER=/usr/bin/g++
export TF_CUDA_CLANG=0 # Clang currently disabled because it's not compatible at the moment.
export CLANG_CUDA_COMPILER_PATH=/usr/bin/clang
export TF_CUDA_PATHS=/opt/cuda,/usr/lib,/usr
# Release number parsed from 'nvcc --version'; empty when /opt/cuda is absent.
export TF_CUDA_VERSION=$(/opt/cuda/bin/nvcc --version | sed -n 's/^.*release \(.*\),.*/\1/p')
# Value of the CUDNN_MAJOR define; empty when the header is absent.
export TF_CUDNN_VERSION=$(sed -n 's/^#define CUDNN_MAJOR\s*\(.*\).*/\1/p' /usr/include/cudnn_version.h)
# https://github.com/tensorflow/tensorflow/blob/1ba2eb7b313c0c5001ee1683a3ec4fbae01105fd/third_party/gpus/cuda_configure.bzl#L411-L446
# according to the above, we should be specifying CUDA compute capabilities as 'sm_XX' or 'compute_XX' from now on
# add latest PTX for future compatibility
export TF_CUDA_COMPUTE_CAPABILITIES=sm_52,sm_53,sm_60,sm_61,sm_62,sm_70,sm_72,sm_75,sm_80,sm_86,compute_86
export CC=gcc
export CXX=g++
# A single space-separated string; build() expands it unquoted so that it
# splits into individual bazel arguments.
export BAZEL_ARGS="--config=mkl -c opt"
cd "${srcdir}"/tensorflow-rocm
# hiprand.patch sits one level above the source tree; its paths start with
# tensorflow-rocm/, hence --strip=1.
patch --strip=1 < ../hiprand.patch
}
build() {
  # Configure and build the ROCm variant with generic x86-64 code generation,
  # then let build_pip_package emit its output into "${srcdir}"/tmprocm.
  #
  # Globals read: _build_no_opt, srcdir, BAZEL_ARGS

  # Nothing to do unless the non-optimized variant was requested.
  [ "$_build_no_opt" -eq 1 ] || return 0

  echo "Building with rocm and without non-x86-64 optimizations"
  cd "${srcdir}"/tensorflow-rocm

  export CC_OPT_FLAGS="-march=x86-64" TF_NEED_CUDA=0 TF_NEED_ROCM=1
  ./configure

  local -a _bazel_targets=(
    //tensorflow:libtensorflow.so
    //tensorflow:libtensorflow_cc.so
    //tensorflow:install_headers
    //tensorflow/tools/pip_package:build_pip_package
  )
  # BAZEL_ARGS is one space-separated string; it is expanded unquoted on
  # purpose so that it word-splits into separate bazel arguments.
  bazel build ${BAZEL_ARGS[@]} "${_bazel_targets[@]}"

  bazel-bin/tensorflow/tools/pip_package/build_pip_package --gpu "${srcdir}"/tmprocm
}
# Assemble the C/C++ library package inside ${pkgdir}.
#
# Must be called from the top of the built source tree: bazel-bin/, tensorflow/
# and LICENSE are all referenced through relative paths.
#
# Arguments: $1 - name of the directory below ${srcdir} that is searched for
#                 the built wheel
# Globals read:    srcdir, pkgdir, pkgver, pkgname
# Globals written: WHEEL_PACKAGE (not declared local)
_package() {
# install headers first
install -d "${pkgdir}"/usr/include/tensorflow
cp -r bazel-bin/tensorflow/include/* "${pkgdir}"/usr/include/tensorflow/
# install python-version to get all extra headers
# $1 and $WHEEL_PACKAGE are expanded unquoted, so paths containing whitespace
# would be split into several words.
WHEEL_PACKAGE=$(find "${srcdir}"/$1 -name "tensor*.whl")
pip install --ignore-installed --upgrade --root "${pkgdir}"/ $WHEEL_PACKAGE --no-dependencies
# move extra headers to correct location
local _srch_path="${pkgdir}/usr/lib/python$(get_pyver)"/site-packages/tensorflow/include
check_dir "${_srch_path}" # we need to quit on broken search paths
# Copy every top-level header directory of the wheel; 'cp -n' does not
# overwrite files that the bazel headers above already provided.
find "${_srch_path}" -maxdepth 1 -mindepth 1 -type d -print0 | while read -rd $'\0' _folder; do
cp -nr "${_folder}" "${pkgdir}"/usr/include/tensorflow/
done
# clean up unneeded files
# (this discards everything below usr/bin, usr/lib and usr/share, including
# the tree the headers were just copied from)
rm -rf "${pkgdir}"/usr/bin
rm -rf "${pkgdir}"/usr/lib
rm -rf "${pkgdir}"/usr/share
# make sure no lib objects are outside valid paths
local _so_srch_path="${pkgdir}/usr/include"
check_dir "${_so_srch_path}" # we need to quit on broken search paths
find "${_so_srch_path}" -type f,l \( -iname "*.so" -or -iname "*.so.*" \) -print0 | while read -rd $'\0' _so_file; do
# check if file is a dynamic executable
# (the file is deleted only when ldd succeeds on it)
ldd "${_so_file}" &>/dev/null && rm -rf "${_so_file}"
done
# install the rest of tensorflow
# NOTE(review): generate-pc.sh presumably writes tensorflow.pc and
# tensorflow_cc.pc into the current directory - the following lines rely on it.
tensorflow/c/generate-pc.sh --prefix=/usr --version=${pkgver}
# Point paths ending in /include at /include/tensorflow in both .pc files.
sed -e 's@/include$@/include/tensorflow@' -i tensorflow.pc -i tensorflow_cc.pc
install -Dm644 tensorflow.pc "${pkgdir}"/usr/lib/pkgconfig/tensorflow.pc
install -Dm644 tensorflow_cc.pc "${pkgdir}"/usr/lib/pkgconfig/tensorflow_cc.pc
# Each library is installed under its full version, with symlinks
# lib.so -> lib.so.<first character of pkgver> -> lib.so.<pkgver>.
install -Dm755 bazel-bin/tensorflow/libtensorflow.so "${pkgdir}"/usr/lib/libtensorflow.so.${pkgver}
ln -s libtensorflow.so.${pkgver} "${pkgdir}"/usr/lib/libtensorflow.so.${pkgver:0:1}
ln -s libtensorflow.so.${pkgver:0:1} "${pkgdir}"/usr/lib/libtensorflow.so
install -Dm755 bazel-bin/tensorflow/libtensorflow_cc.so "${pkgdir}"/usr/lib/libtensorflow_cc.so.${pkgver}
ln -s libtensorflow_cc.so.${pkgver} "${pkgdir}"/usr/lib/libtensorflow_cc.so.${pkgver:0:1}
ln -s libtensorflow_cc.so.${pkgver:0:1} "${pkgdir}"/usr/lib/libtensorflow_cc.so
install -Dm755 bazel-bin/tensorflow/libtensorflow_framework.so "${pkgdir}"/usr/lib/libtensorflow_framework.so.${pkgver}
ln -s libtensorflow_framework.so.${pkgver} "${pkgdir}"/usr/lib/libtensorflow_framework.so.${pkgver:0:1}
ln -s libtensorflow_framework.so.${pkgver:0:1} "${pkgdir}"/usr/lib/libtensorflow_framework.so
install -Dm644 tensorflow/c/c_api.h "${pkgdir}"/usr/include/tensorflow/tensorflow/c/c_api.h
install -Dm644 LICENSE "${pkgdir}"/usr/share/licenses/${pkgname}/LICENSE
# Fix interoperability of C++14 and C++17. See https://bugs.archlinux.org/task/65953
# The patch is applied to the already installed absl headers, not to the source tree.
patch -Np0 -i "${srcdir}"/fix-c++17-compat.patch -d "${pkgdir}"/usr/include/tensorflow/absl/base
}
_python_package() {
  # Install the built wheel into ${pkgdir} and replace the header directories
  # it bundles with symlinks into /usr/include/tensorflow.
  #
  # Must be called from the top of the source tree (LICENSE is read through a
  # relative path).
  #
  # Arguments: $1 - name of the directory below ${srcdir} that is searched for
  #                 the built wheel
  # Globals read:    srcdir, pkgdir, pkgname
  # Globals written: WHEEL_PACKAGE (intentionally left global, as before)
  WHEEL_PACKAGE=$(find "${srcdir}"/$1 -name "tensor*.whl")
  pip install --ignore-installed --upgrade --root "${pkgdir}"/ $WHEEL_PACKAGE --no-dependencies

  # create symlinks to headers
  local _include_root="${pkgdir}/usr/lib/python$(get_pyver)"/site-packages/tensorflow/include/
  check_dir "${_include_root}" # we need to quit on broken search paths
  local _header_dir
  while read -r -d '' _header_dir; do
    rm -rf "${_header_dir}"
    # _include_root is a directory, so the link is created inside it under the
    # name of the directory that was just removed.
    ln -s /usr/include/tensorflow/"$(basename "${_header_dir}")" "${_include_root}"
  done < <(find "${_include_root}" -maxdepth 1 -mindepth 1 -type d -print0)

  # tensorboard has been separated from upstream but they still install it with
  # tensorflow, so it has to be cleaned out of this package.
  rm -rf "${pkgdir}"/usr/bin/tensorboard

  install -Dm644 LICENSE "${pkgdir}"/usr/share/licenses/${pkgname}/LICENSE
}
package_tensorflow-rocm() {
  # Packaging entry point for the C/C++ library split package: sets the
  # package-specific metadata, then delegates to _package from inside the
  # source tree.
  pkgdesc="Library for computation using data flow graphs for scalable machine learning (with ROCM)"
  provides=(tensorflow)
  conflicts=(tensorflow)
  # Runtime ROCm libraries, added on top of the shared dependency list.
  depends+=(
    rocm-hip-sdk
    miopen
    rccl
  )

  cd "${srcdir}"/tensorflow-rocm
  _package tmprocm
}
package_python-tensorflow-rocm() {
  # Packaging entry point for the Python bindings split package: sets the
  # package-specific metadata, then delegates to _python_package from inside
  # the source tree.
  pkgdesc="Library for computation using data flow graphs for scalable machine learning (with ROCM)"
  provides=(python-tensorflow)
  conflicts=(python-tensorflow)
  # The C/C++ library package, the ROCm runtime libraries and the shared list
  # of Python dependencies, in that order.
  depends+=(
    tensorflow-rocm
    rocm-hip-sdk
    miopen
    rccl
    "${_common_py_depends[@]}"
  )

  cd "${srcdir}"/tensorflow-rocm
  _python_package tmprocm
}
# vim:set ts=2 sw=2 et:
and the following patch (hiprand.patch) is needed:
--- tensorflow-rocm/third_party/gpus/rocm_configure.bzl 2022-03-11 20:46:02.867224360 +0100
+++ tensorflow-rocm/third_party/gpus/rocm_configure.bzl 2022-03-11 20:47:46.923766740 +0100
@@ -331,7 +331,7 @@
("amdhip64", rocm_config.rocm_toolkit_path + "/hip"),
("rocblas", rocm_config.rocm_toolkit_path + "/rocblas"),
(hipfft_or_rocfft, rocm_config.rocm_toolkit_path + "/" + hipfft_or_rocfft),
- ("hiprand", rocm_config.rocm_toolkit_path),
+ ("hiprand", rocm_config.rocm_toolkit_path + "/hiprand"),
("MIOpen", rocm_config.rocm_toolkit_path + "/miopen"),
("rccl", rocm_config.rocm_toolkit_path + "/rccl"),
("hipsparse", rocm_config.rocm_toolkit_path + "/hipsparse"),
Sweet thanks for the detective work @RavenOchlich !
Feel free to make it a PR and I'll test it on my own system and merge it in!
Let's keep this open until the PR is merged.
Closing this issue as a stale build issue. If you have further issues please open up another issue. Sorry @RavenOchlich @tarcey @pangwalla
The build stops with the following error. I use the newest 5.0.2-1 rocm packages. Any ideas how to resolve this problem?
I was also missing the file /opt/rocm/.info/version-dev, so I linked it manually. Some other files were missing as well, so I needed to install the following packages from https://github.com/rocm-arch/rocm-arch. Only with these changes would the build process start.