rocm-arch / tensorflow-rocm

tensorflow-rocm AUR package
17 stars 12 forks source link

dependency of rocm-dev and rocm-libs no longer exists #35

Closed riaqn closed 2 years ago

riaqn commented 2 years ago

These dependencies no longer exist in the latest ROCm AUR packages.

riaqn commented 2 years ago

OK, here is a PKGBUILD for TF 2.7.0 and ROCm 4.5.0, based on the latest CUDA PKGBUILD in the official Arch repo. If you see a SIGSEGV in a late stage of the build, try reducing TF_ROCM_AMDGPU_TARGETS to the ones you actually need.

# Maintainer: Sven-Hendrik Haase <>
# Maintainer: Konstantin Gizdov (kgizdov) <>
# Contributor: Adria Arrufat (archdria) <>
# Contributor: Thibault Lorrain (fredszaq) <>

# Split package: two native library variants plus their Python counterparts.
pkgname=(tensorflow-rocm tensorflow-opt-rocm python-tensorflow-rocm python-tensorflow-opt-rocm)
pkgdesc="Library for computation using data flow graphs for scalable machine learning"
# Runtime dependencies shared by every split package; the package_*()
# functions append their own ROCm-specific entries to this list.
depends=('c-ares' 'intel-mkl' 'onednn' 'pybind11' 'openssl' 'lmdb' 'libpng' 'curl' 'giflib' 'icu' 'libjpeg-turbo')
# Build-time dependencies: bazel, the ROCm packages and the Python
# packaging tools used to build and install the wheel.
makedepends=('bazel' 'python-numpy' 'rocm-hip-sdk' 'rocm-opencl-sdk' 'roctracer' 'rccl' 'git'
             'python-pip' 'python-wheel' 'python-setuptools' 'python-h5py'
             'python-keras-applications' 'python-keras-preprocessing')
optdepends=('tensorboard: Tensorflow visualization toolkit')

# Python runtime dependencies shared by both python-* split packages.
# Kept in one place so the two package functions cannot drift apart.
_common_py_depends=(
  python-termcolor
  python-astor
  python-gast03
  python-numpy
  python-protobuf
  absl-py
  python-h5py
  python-keras
  python-keras-applications
  python-keras-preprocessing
  python-tensorflow-estimator
  python-opt_einsum
  python-astunparse
  python-pasta
  python-flatbuffers
)

# Print the "major.minor" version of the `python` found on PATH
# (e.g. "3.10"). Used to locate the versioned site-packages directory.
# Outputs: the version string on stdout.
get_pyver () {
  python -c 'import sys; print(str(sys.version_info[0]) + "." + str(sys.version_info[1]))'
}

# Verify that a path is an existing directory.
# Arguments: $1 - path to check
# Returns:   0 if $1 is a directory, or if neither `cp` nor `install` can be
#            found on PATH (the check is then skipped with a warning).
# Exits:     with status 1 (terminating the calling shell) if $1 is missing
#            or is not a directory.
check_dir() {
  # first make sure we do not break parsepkgbuild
  if ! command -v cp &> /dev/null; then
    >&2 echo "'cp' command not found. PKGBUILD is probably being checked by parsepkgbuild."
    if ! command -v install &> /dev/null; then
      >&2 echo "'install' command also not found. PKGBUILD must be getting checked by parsepkgbuild."
      >&2 echo "Cannot check if directory '${1}' exists. Ignoring."
      >&2 echo "If you are not running nacmap or parsepkgbuild, please make sure the PATH is correct and try again."
      >&2 echo "PATH should not be '/dummy': PATH=$PATH"
      return 0
    fi
  fi
  # if we are running normally, check the given path
  if [ -d "${1}" ]; then
    return 0
  else
    >&2 echo "Directory ${1} does not exist or is a file! Exiting..."
    exit 1
  fi
}

# Prepare the source trees and export the build configuration.
# Globals:   _pkgver (read); exports the TF_*, compiler and BAZEL_ARGS
#            variables listed below.
# Works on tensorflow-${_pkgver} relative to the current directory and
# creates two copies of it: one for the generic build and one for the
# optimized build.
prepare() {
  # Allow any bazel version
  echo "*" > tensorflow-${_pkgver}/.bazelversion

  # Get rid of hardcoded versions. Not like we ever cared about what upstream
  # thinks about which versions should be used anyway. ;) (FS#68772)
  # NOTE(review): the target file name was absent here (sed was pointed at the
  # pip_package directory); setup.py is what the upstream Arch PKGBUILD edits
  # - confirm.
  sed -i -E "s/'([0-9a-z_-]+) .= [0-9].+[0-9]'/'\1'/" tensorflow-${_pkgver}/tensorflow/tools/pip_package/setup.py

  cp -r tensorflow-${_pkgver} tensorflow-${_pkgver}-rocm
  cp -r tensorflow-${_pkgver} tensorflow-${_pkgver}-opt-rocm

  # These environment variables influence the behavior of TensorFlow's
  # configuration step.
  export PYTHON_BIN_PATH=/usr/bin/python
  export TF_NEED_KAFKA=1
  export TF_NEED_AWS=1
  export TF_NEED_GCP=1
  export TF_NEED_HDFS=1
  export TF_NEED_S3=1
  export TF_ENABLE_XLA=1
  export TF_NEED_GDR=0
  export TF_NEED_VERBS=0
  export TF_NEED_OPENCL=0
  export TF_NEED_MPI=0
  export TF_NEED_NGRAPH=0
  export TF_NEED_IGNITE=0
  export TF_NEED_CUDA=0
  export TF_NEED_ROCM=1
  # GPU architectures to compile kernels for; trimming this list to the
  # hardware actually in use shortens the build.
  export TF_ROCM_AMDGPU_TARGETS=gfx701,gfx702,gfx803,gfx900,gfx904,gfx906,gfx908
  # Libraries taken from the system instead of the bundled copies.
  # Presumably the accepted names are those in TensorFlow's
  # third_party/systemlibs/syslibs_configure.bzl - TODO confirm.
  export TF_SYSTEM_LIBS="boringssl,curl,cython,gif,icu,libjpeg_turbo,lmdb,nasm,png,pybind11,zlib"
  export TF_NCCL_VERSION=$(pkg-config nccl --modversion | grep -Po '\d+\.\d+')
  export TF_MKL_ROOT=/opt/intel/mkl
  export NCCL_INSTALL_PATH=/usr
  export GCC_HOST_COMPILER_PATH=/usr/bin/gcc
  export HOST_C_COMPILER=/usr/bin/gcc
  export HOST_CXX_COMPILER=/usr/bin/g++
  # NOTE(review): the CUDA settings below look like leftovers from the CUDA
  # PKGBUILD this file is based on; with TF_NEED_CUDA=0 they are presumably
  # unused, and the nvcc/cudnn probes print errors when CUDA is not
  # installed - confirm before removing.
  export TF_CUDA_CLANG=0  # Clang currently disabled because it's not compatible at the moment.
  export CLANG_CUDA_COMPILER_PATH=/usr/bin/clang
  export TF_CUDA_PATHS=/opt/cuda,/usr/lib,/usr
  export TF_CUDA_VERSION=$(/opt/cuda/bin/nvcc --version | sed -n 's/^.*release \(.*\),.*/\1/p')
  export TF_CUDNN_VERSION=$(sed -n 's/^#define CUDNN_MAJOR\s*\(.*\).*/\1/p' /usr/include/cudnn_version.h)
  # CUDA compute capabilities are given as 'sm_XX' or 'compute_XX';
  # add latest PTX for future compatibility
  export TF_CUDA_COMPUTE_CAPABILITIES=sm_52,sm_53,sm_60,sm_61,sm_62,sm_70,sm_72,sm_75,sm_80,sm_86,compute_86

  export CC=gcc
  export CXX=g++

  # Deliberately a single string: build() expands it unquoted so that it
  # word-splits into separate bazel arguments.
  export BAZEL_ARGS="--config=mkl -c opt"
}

# Build both variants (generic x86-64 and Haswell/AVX2-optimized) with bazel
# and produce a pip wheel for each.
# Globals: srcdir, _pkgver, BAZEL_ARGS (read); CC_OPT_FLAGS (exported)
# Outputs: wheels under ${srcdir}/tmprocm and ${srcdir}/tmpoptrocm
# NOTE(review): the two shared-library target labels were reduced to a bare
# "//" and are restored from the upstream Arch tensorflow PKGBUILD - confirm.
# NOTE(review): prepare() mentions a configure call, but none is made here -
# confirm whether "./configure" is needed before each bazel invocation.
build() {
  echo "Building with rocm and without non-x86-64 optimizations"
  cd "${srcdir}"/tensorflow-${_pkgver}-rocm
  export CC_OPT_FLAGS="-march=x86-64"
  # BAZEL_ARGS is intentionally unquoted: it is a space-separated string.
  bazel \
    build \
      ${BAZEL_ARGS[@]} \
      //tensorflow:libtensorflow.so \
      //tensorflow:libtensorflow_cc.so \
      //tensorflow:install_headers \
      //tensorflow/tools/pip_package:build_pip_package
  bazel-bin/tensorflow/tools/pip_package/build_pip_package --gpu "${srcdir}"/tmprocm

  echo "Building with rocm and with non-x86-64 optimizations"
  cd "${srcdir}"/tensorflow-${_pkgver}-opt-rocm
  export CC_OPT_FLAGS="-march=haswell -O3"
  bazel \
    build --config=avx2_linux \
      ${BAZEL_ARGS[@]} \
      //tensorflow:libtensorflow.so \
      //tensorflow:libtensorflow_cc.so \
      //tensorflow:install_headers \
      //tensorflow/tools/pip_package:build_pip_package
  bazel-bin/tensorflow/tools/pip_package/build_pip_package --gpu "${srcdir}"/tmpoptrocm
}

# Install the native TensorFlow libraries, headers and pkg-config files.
# Must be called from inside the built source tree (it uses bazel-bin/ and
# other relative paths).
# Globals:   pkgdir, srcdir, pkgver, pkgname (read); WHEEL_PACKAGE (written)
# Arguments: $1 - name of the directory under ${srcdir} holding the wheel
# NOTE(review): the generate-pc.sh call and the library file names in the
# install/symlink steps were truncated and are restored from the upstream
# Arch tensorflow PKGBUILD - confirm.
_package() {
  # install headers first
  install -d "${pkgdir}"/usr/include/tensorflow
  cp -r bazel-bin/tensorflow/include/* "${pkgdir}"/usr/include/tensorflow/
  # install python-version to get all extra headers
  WHEEL_PACKAGE=$(find "${srcdir}"/$1 -name "tensor*.whl")
  pip install --ignore-installed --upgrade --root "${pkgdir}"/ "$WHEEL_PACKAGE" --no-dependencies
  # move extra headers to correct location
  local _srch_path="${pkgdir}/usr/lib/python$(get_pyver)"/site-packages/tensorflow/include
  check_dir "${_srch_path}"  # we need to quit on broken search paths
  find "${_srch_path}" -maxdepth 1 -mindepth 1 -type d -print0 | while read -rd $'\0' _folder; do
    cp -nr "${_folder}" "${pkgdir}"/usr/include/tensorflow/
  done
  # clean up unneeded files; ${pkgdir:?} aborts instead of touching the real
  # /usr if pkgdir is ever empty or unset
  rm -rf "${pkgdir:?}"/usr/bin
  rm -rf "${pkgdir:?}"/usr/lib
  rm -rf "${pkgdir:?}"/usr/share
  # make sure no lib objects are outside valid paths
  local _so_srch_path="${pkgdir}/usr/include"
  check_dir "${_so_srch_path}"  # we need to quit on broken search paths
  find "${_so_srch_path}" -type f,l \( -iname "*.so" -or -iname "*.so.*" \) -print0 | while read -rd $'\0' _so_file; do
    # check if file is a dynamic executable; written as an `if` so a
    # non-matching last file does not make the loop return non-zero
    if ldd "${_so_file}" &>/dev/null; then
      rm -rf "${_so_file}"
    fi
  done

  # install the rest of tensorflow
  tensorflow/c/generate-pc.sh --prefix=/usr --version=${pkgver}
  sed -e 's@/include$@/include/tensorflow@' -i tensorflow.pc -i tensorflow_cc.pc
  install -Dm644 tensorflow.pc "${pkgdir}"/usr/lib/pkgconfig/tensorflow.pc
  install -Dm644 tensorflow_cc.pc "${pkgdir}"/usr/lib/pkgconfig/tensorflow_cc.pc
  # each library is installed under its full version, with
  # <lib>.<major> and <lib> symlinks pointing at it
  local _lib
  for _lib in libtensorflow.so libtensorflow_cc.so libtensorflow_framework.so; do
    install -Dm755 bazel-bin/tensorflow/${_lib} "${pkgdir}"/usr/lib/${_lib}.${pkgver}
    ln -s ${_lib}.${pkgver} "${pkgdir}"/usr/lib/${_lib}.${pkgver:0:1}
    ln -s ${_lib}.${pkgver:0:1} "${pkgdir}"/usr/lib/${_lib}
  done
  install -Dm644 tensorflow/c/c_api.h "${pkgdir}"/usr/include/tensorflow/tensorflow/c/c_api.h
  install -Dm644 LICENSE "${pkgdir}"/usr/share/licenses/${pkgname}/LICENSE

  # Fix interoperability of C++14 and C++17.
  patch -Np0 -i "${srcdir}"/fix-c++17-compat.patch -d "${pkgdir}"/usr/include/tensorflow/absl/base
}

# Install the Python wheel into ${pkgdir} and replace its bundled header
# directories with symlinks into /usr/include/tensorflow.
# Must be called from inside the source tree (LICENSE is a relative path).
# Globals:   pkgdir, srcdir, pkgname (read); WHEEL_PACKAGE (written)
# Arguments: $1 - name of the directory under ${srcdir} holding the wheel
_python_package() {
  WHEEL_PACKAGE=$(find "${srcdir}"/$1 -name "tensor*.whl")
  pip install --ignore-installed --upgrade --root "${pkgdir}"/ "$WHEEL_PACKAGE" --no-dependencies

  # create symlinks to headers
  local _srch_path="${pkgdir}/usr/lib/python$(get_pyver)"/site-packages/tensorflow/include/
  check_dir "${_srch_path}"  # we need to quit on broken search paths
  find "${_srch_path}" -maxdepth 1 -mindepth 1 -type d -print0 | while read -rd $'\0' _folder; do
    rm -rf "${_folder}"
    _smlink="$(basename "${_folder}")"
    ln -s /usr/include/tensorflow/"${_smlink}" "${_srch_path}"
  done

  # tensorboard has been separated from upstream but they still install it with
  # tensorflow. I don't know what kind of sense that makes but we have to clean
  # it out from this package.
  rm -rf "${pkgdir}"/usr/bin/tensorboard

  install -Dm644 LICENSE "${pkgdir}"/usr/share/licenses/${pkgname}/LICENSE
}

# Package the generic (x86-64 baseline) native libraries built with ROCm.
# Globals: srcdir, _pkgver (read); pkgdesc, depends (written)
package_tensorflow-rocm() {
  pkgdesc="Library for computation using data flow graphs for scalable machine learning (with ROCm)"
  depends+=(rocm-hip-sdk rocm-opencl-sdk roctracer miopen rccl)

  cd "${srcdir}"/tensorflow-${_pkgver}-rocm
  _package tmprocm
}

# Package the AVX2-optimized native libraries built with ROCm.
# Globals: srcdir, _pkgver (read); pkgdesc, depends, provides (written)
package_tensorflow-opt-rocm() {
  pkgdesc="Library for computation using data flow graphs for scalable machine learning (with ROCm and AVX2 CPU optimizations)"
  depends+=(rocm-hip-sdk rocm-opencl-sdk roctracer miopen rccl)
  provides=(tensorflow tensorflow-rocm)

  cd "${srcdir}"/tensorflow-${_pkgver}-opt-rocm
  _package tmpoptrocm
}

# Package the Python bindings for the generic ROCm build.
# Globals: srcdir, _pkgver, _common_py_depends (read); pkgdesc, depends (written)
package_python-tensorflow-rocm() {
  pkgdesc="Library for computation using data flow graphs for scalable machine learning (with ROCm)"
  depends+=(tensorflow-rocm rocm-hip-sdk rocm-opencl-sdk roctracer miopen rccl "${_common_py_depends[@]}")

  cd "${srcdir}"/tensorflow-${_pkgver}-rocm
  _python_package tmprocm
}

# Package the Python bindings for the AVX2-optimized ROCm build.
# Globals: srcdir, _pkgver, _common_py_depends (read);
#          pkgdesc, depends, provides (written)
package_python-tensorflow-opt-rocm() {
  pkgdesc="Library for computation using data flow graphs for scalable machine learning (with ROCm and AVX2 CPU optimizations)"
  depends+=(tensorflow-opt-rocm rocm-hip-sdk rocm-opencl-sdk roctracer miopen rccl "${_common_py_depends[@]}")
  provides=(python-tensorflow python-tensorflow-rocm)

  cd "${srcdir}"/tensorflow-${_pkgver}-opt-rocm
  _python_package tmpoptrocm
}

# vim:set ts=2 sw=2 et:
t1nux commented 2 years ago

Thanks @riaqn for providing the updated PKGBUILD. Unfortunately, it does not seem to work with the currently available ROCM AUR packages. Right now, this is a mixture of 4.5 and 4.5.2 versions. I'm getting the following errors:

ERROR: An error occurred during the fetch of repository 'local_config_rocm':
   Traceback (most recent call last):
    File "/home/tinux/pkg/tensorflow/trunk/src/tensorflow-2.7.0-rocm/third_party/gpus/rocm_configure.bzl", line 834, column 38, in _rocm_autoconf_impl
    File "/home/tinux/pkg/tensorflow/trunk/src/tensorflow-2.7.0-rocm/third_party/gpus/rocm_configure.bzl", line 540, column 35, in _create_local_rocm_repository
        rocm_config = _get_rocm_config(repository_ctx, bash_bin, find_rocm_config_script)
    File "/home/tinux/pkg/tensorflow/trunk/src/tensorflow-2.7.0-rocm/third_party/gpus/rocm_configure.bzl", line 391, column 30, in _get_rocm_config
        config = find_rocm_config(repository_ctx, find_rocm_config_script)
    File "/home/tinux/pkg/tensorflow/trunk/src/tensorflow-2.7.0-rocm/third_party/gpus/rocm_configure.bzl", line 369, column 41, in find_rocm_config
        exec_result = _exec_find_rocm_config(repository_ctx, script_path)
    File "/home/tinux/pkg/tensorflow/trunk/src/tensorflow-2.7.0-rocm/third_party/gpus/rocm_configure.bzl", line 365, column 19, in _exec_find_rocm_config
        return execute(repository_ctx, [python_bin, "-c", decompress_and_execute_cmd])
    File "/home/tinux/pkg/tensorflow/trunk/src/tensorflow-2.7.0-rocm/third_party/remote_config/common.bzl", line 230, column 13, in execute
Error in fail: Repository command failed
ERROR: ROCm version file "/opt/rocm/.info/version-dev" not found
ERROR: Skipping '//tensorflow/tools/pip_package:build_pip_package': no such package '@local_config_rocm//rocm': Repository command failed
ERROR: ROCm version file "/opt/rocm/.info/version-dev" not found
ERROR: no such package '@local_config_rocm//rocm': Repository command failed
ERROR: ROCm version file "/opt/rocm/.info/version-dev" not found
INFO: Elapsed time: 14.407s
INFO: 0 processes.
FAILED: Build did NOT complete successfully (0 packages loaded)
    currently loading: tensorflow ... (2 packages)
==> ERROR: A failure occurred in build().

Any ideas?

Also, it seems to be necessary to rebuild python-cppheaderparser after the upgrade of Python from 3.9 to 3.10 (See also this).

riaqn commented 2 years ago

@t1nux yes, go to /opt/rocm/.info/ and symlink version to version-dev and the other files. I believe this is because new ROCm releases no longer ship this version file, while TensorFlow still uses it to identify ROCm.

Also, I don't know how deep you are in this ROCm mess, but my suggestion is: run! Run away from this immature ecosystem. Sell your Radeon and get a real GPU from NVIDIA, which is CUDA-ready.

t1nux commented 2 years ago

@riaqn Sorry for the late reply. It's working now, after what seemed like an eternity of compiling...

I partly agree with Nvidia vs AMD regarding the ecosystem, but, for me, there's more to that topic, like the ratio (FP64-performance)/(USD). Anyway, this is a topic that should be discussed elsewhere ;-).

Thanks for your help and the PKGBUILD, too, of course.

acxz commented 2 years ago

@riaqn If I am missing any deps please create PRs to add them!

riaqn commented 2 years ago

@acxz thanks for updating the package to 2.8.0. I guess you removed the non-existent dependencies while updating it, so I'm closing this issue. Also, I switched to PyTorch recently.