Closed riaqn closed 2 years ago
OK, here is a PKGBUILD for TF 2.7.0 and ROCm 4.5.0, based on the latest PKGBUILD for CUDA in the official Arch repo. If you see a SIGSEGV in the late stages of the build, try reducing TF_ROCM_AMDGPU_TARGETS
to the ones you actually need.
# Maintainer: Sven-Hendrik Haase <svenstaro@gmail.com>
# Maintainer: Konstantin Gizdov (kgizdov) <arch@kge.pw>
# Contributor: Adria Arrufat (archdria) <adria.arrufat+AUR@protonmail.ch>
# Contributor: Thibault Lorrain (fredszaq) <fredszaq@gmail.com>
# Split package: one source tarball produces the C/C++ libraries and the Python
# modules, each in a generic and an AVX2-optimised flavour.
pkgbase=tensorflow-rocm
pkgname=(tensorflow-rocm tensorflow-opt-rocm python-tensorflow-rocm python-tensorflow-opt-rocm)
pkgver=2.7.0
# Upstream tag; used for the download URL and the unpacked directory name.
_pkgver=2.7.0
pkgrel=3
pkgdesc="Library for computation using data flow graphs for scalable machine learning"
url="https://www.tensorflow.org/"
license=('APACHE')
arch=('x86_64')
depends=('c-ares' 'intel-mkl' 'onednn' 'pybind11' 'openssl' 'lmdb' 'libpng' 'curl' 'giflib' 'icu' 'libjpeg-turbo')
makedepends=('bazel' 'python-numpy' 'rocm-hip-sdk' 'rocm-opencl-sdk' 'roctracer' 'rccl' 'git'
             'python-pip' 'python-wheel' 'python-setuptools' 'python-h5py'
             'python-keras-applications' 'python-keras-preprocessing'
             'cython')
optdepends=('tensorboard: Tensorflow visualization toolkit')
# pkgname is an array here, so "$pkgname" would silently expand to its first
# element only; name the tarball after pkgbase instead (same resulting value).
source=("${pkgbase}-${pkgver}.tar.gz::https://github.com/tensorflow/tensorflow/archive/v${_pkgver}.tar.gz"
        'fix-c++17-compat.patch')
sha512sums=('f1e892583c7b3a73d4d39ec65dc135a5b02c789b357d57414ad2b6d05ad9fbfc8ef81918ba6410e314abd6928b76f764e6ef64c0b0c84b58b50796634be03f39'
            'f682368bb47b2b022a51aa77345dfa30f3b0d7911c56515d428b8326ee3751242f375f4e715a37bb723ef20a86916dad9871c3c81b1b58da85e1ca202bc4901e')
# consolidate common dependencies to prevent mishaps
_common_py_depends=(python-termcolor python-astor python-gast03 python-numpy python-protobuf absl-py python-h5py python-keras python-keras-applications python-keras-preprocessing python-tensorflow-estimator python-opt_einsum python-astunparse python-pasta python-flatbuffers)
# Print the "major.minor" version reported by the `python` found on PATH.
get_pyver() {
  local _py_snippet='import sys; print(str(sys.version_info[0]) + "." + str(sys.version_info[1]))'
  python -c "${_py_snippet}"
}
# Abort packaging unless the given path is an existing directory.
# Arguments: $1 - path that must be a directory
# Outputs:   diagnostics on stderr
# Returns:   0 if $1 is a directory, or if neither `cp` nor `install` can be
#            found (the PKGBUILD is then assumed to be under static analysis
#            and the check is skipped); otherwise exits the shell with status 1.
check_dir() {
  # first make sure we do not break parsepkgbuild
  if ! command -v cp &> /dev/null; then
    printf '%s\n' "'cp' command not found. PKGBUILD is probably being checked by parsepkgbuild." >&2
    if ! command -v install &> /dev/null; then
      printf '%s\n' \
        "'install' command also not found. PKGBUILD must be getting checked by parsepkgbuild." \
        "Cannot check if directory '${1}' exists. Ignoring." \
        "If you are not running namcap or parsepkgbuild, please make sure the PATH is correct and try again." \
        "PATH should not be '/dummy': PATH=$PATH" >&2
      return 0
    fi
  fi

  # if we are running normally, check the given path
  if [[ -d "${1}" ]]; then
    return 0
  fi

  printf '%s\n' "Directory ${1} does not exist or is a file! Exiting..." >&2
  exit 1
}
# Patch the unpacked TensorFlow tree, clone it once per build variant and
# export the answers read by TensorFlow's ./configure script.
# Expects the current directory to contain tensorflow-${_pkgver}.
# NOTE(review): build() depends on these exports still being set — presumably
# prepare() and build() run in the same makepkg shell; confirm before using
# options that skip prepare().
prepare() {
  # Allow any bazel version
  echo "*" > "tensorflow-${_pkgver}/.bazelversion"

  # Get rid of hardcoded versions. Not like we ever cared about what upstream
  # thinks about which versions should be used anyway. ;) (FS#68772)
  sed -i -E "s/'([0-9a-z_-]+) .= [0-9].+[0-9]'/'\1'/" \
    "tensorflow-${_pkgver}/tensorflow/tools/pip_package/setup.py"

  # One private copy of the tree per variant built in build().
  cp -r "tensorflow-${_pkgver}" "tensorflow-${_pkgver}-rocm"
  cp -r "tensorflow-${_pkgver}" "tensorflow-${_pkgver}-opt-rocm"

  # These environment variables influence the behavior of the configure call below.
  export PYTHON_BIN_PATH=/usr/bin/python
  export USE_DEFAULT_PYTHON_LIB_PATH=1
  export TF_NEED_JEMALLOC=1
  export TF_NEED_KAFKA=1
  export TF_NEED_OPENCL_SYCL=0
  export TF_NEED_AWS=1
  export TF_NEED_GCP=1
  export TF_NEED_HDFS=1
  export TF_NEED_S3=1
  export TF_ENABLE_XLA=1
  export TF_NEED_GDR=0
  export TF_NEED_VERBS=0
  export TF_NEED_OPENCL=0
  export TF_NEED_MPI=0
  export TF_NEED_TENSORRT=0
  export TF_NEED_NGRAPH=0
  export TF_NEED_IGNITE=0
  export TF_NEED_CUDA=0
  export TF_NEED_ROCM=1
  export TF_ROCM_AMDGPU_TARGETS=gfx701,gfx702,gfx803,gfx900,gfx904,gfx906,gfx908
  # See https://github.com/tensorflow/tensorflow/blob/master/third_party/systemlibs/syslibs_configure.bzl
  export TF_SYSTEM_LIBS="boringssl,curl,cython,gif,icu,libjpeg_turbo,lmdb,nasm,png,pybind11,zlib"
  export TF_SET_ANDROID_WORKSPACE=0
  export TF_DOWNLOAD_CLANG=0
  export TF_NCCL_VERSION=$(pkg-config nccl --modversion | grep -Po '\d+\.\d+')
  export TF_IGNORE_MAX_BAZEL_VERSION=1
  export TF_MKL_ROOT=/opt/intel/mkl
  export NCCL_INSTALL_PATH=/usr
  export GCC_HOST_COMPILER_PATH=/usr/bin/gcc
  export HOST_C_COMPILER=/usr/bin/gcc
  export HOST_CXX_COMPILER=/usr/bin/g++
  export TF_CUDA_CLANG=0  # Clang currently disabled because it's not compatible at the moment.
  export CLANG_CUDA_COMPILER_PATH=/usr/bin/clang
  export TF_CUDA_PATHS=/opt/cuda,/usr/lib,/usr

  # CUDA is disabled above (TF_NEED_CUDA=0) and neither cuda nor cudnn is a
  # makedepend, so only query the toolkit when it is actually installed.
  # When it is absent the variables are still exported, with empty values,
  # but no command that cannot succeed is spawned.
  export TF_CUDA_VERSION=
  if [[ -x /opt/cuda/bin/nvcc ]]; then
    export TF_CUDA_VERSION=$(/opt/cuda/bin/nvcc --version | sed -n 's/^.*release \(.*\),.*/\1/p')
  fi
  export TF_CUDNN_VERSION=
  if [[ -r /usr/include/cudnn_version.h ]]; then
    export TF_CUDNN_VERSION=$(sed -n 's/^#define CUDNN_MAJOR\s*\(.*\).*/\1/p' /usr/include/cudnn_version.h)
  fi

  # https://github.com/tensorflow/tensorflow/blob/1ba2eb7b313c0c5001ee1683a3ec4fbae01105fd/third_party/gpus/cuda_configure.bzl#L411-L446
  # according to the above, we should be specifying CUDA compute capabilities as 'sm_XX' or 'compute_XX' from now on
  # add latest PTX for future compatibility
  export TF_CUDA_COMPUTE_CAPABILITIES=sm_52,sm_53,sm_60,sm_61,sm_62,sm_70,sm_72,sm_75,sm_80,sm_86,compute_86
  export CC=gcc
  export CXX=g++
  # A plain string, not an array: arrays cannot be exported. Consumers must
  # word-split it.
  export BAZEL_ARGS="--config=mkl -c opt"
}
# Configure one source tree, build it with bazel and emit its pip wheel.
# Globals:   srcdir, BAZEL_ARGS (read); CC_OPT_FLAGS (exported)
# Arguments: $1   - source tree directory name under ${srcdir}
#            $2   - value exported as CC_OPT_FLAGS before ./configure runs
#            $3   - directory name under ${srcdir} that receives the wheel
#            $4.. - extra bazel build options, placed before BAZEL_ARGS
_build_variant() {
  local _tree=$1 _opt_flags=$2 _wheel_dir=$3
  shift 3

  # BAZEL_ARGS is a whitespace-separated string; split it explicitly rather
  # than relying on an unquoted expansion.
  local -a _bazel_args=()
  read -ra _bazel_args <<< "${BAZEL_ARGS}"

  cd "${srcdir}/${_tree}" || return 1
  export CC_OPT_FLAGS="${_opt_flags}"
  ./configure
  bazel \
    build "$@" \
    "${_bazel_args[@]}" \
    //tensorflow:libtensorflow.so \
    //tensorflow:libtensorflow_cc.so \
    //tensorflow:install_headers \
    //tensorflow/tools/pip_package:build_pip_package
  bazel-bin/tensorflow/tools/pip_package/build_pip_package --gpu "${srcdir}/${_wheel_dir}"
}

# Build both variants: a generic x86-64 one and a Haswell/AVX2-optimised one.
# Wheels are written to ${srcdir}/tmprocm and ${srcdir}/tmpoptrocm.
build() {
  echo "Building with rocm and without non-x86-64 optimizations"
  _build_variant "tensorflow-${_pkgver}-rocm" "-march=x86-64" tmprocm

  echo "Building with rocm and with non-x86-64 optimizations"
  _build_variant "tensorflow-${_pkgver}-opt-rocm" "-march=haswell -O3" tmpoptrocm --config=avx2_linux
}
# Install the C/C++ side of TensorFlow (headers, shared libraries, pkg-config
# files, license) into ${pkgdir}. Must be called from inside the built tree.
# Globals:   srcdir, pkgdir, pkgver, pkgname (read)
# Arguments: $1 - directory name under ${srcdir} that holds the built wheel
_package() {
  local _wheel_list _srch_path _so_srch_path _folder _so_file _lib
  local -a _wheels=()
  # Major version for the library symlinks; unlike ${pkgver:0:1} this also
  # works for multi-digit major versions.
  local _major=${pkgver%%.*}

  # install headers first
  install -d "${pkgdir}/usr/include/tensorflow"
  cp -r bazel-bin/tensorflow/include/* "${pkgdir}/usr/include/tensorflow/"

  # install python-version to get all extra headers
  _wheel_list=$(find "${srcdir}/${1}" -name "tensor*.whl")
  if [[ -n "${_wheel_list}" ]]; then
    mapfile -t _wheels <<< "${_wheel_list}"
  fi
  pip install --ignore-installed --upgrade --root "${pkgdir}/" "${_wheels[@]}" --no-dependencies

  # move extra headers to correct location
  _srch_path="${pkgdir}/usr/lib/python$(get_pyver)/site-packages/tensorflow/include"
  check_dir "${_srch_path}"  # we need to quit on broken search paths
  find "${_srch_path}" -maxdepth 1 -mindepth 1 -type d -print0 | while IFS= read -rd '' _folder; do
    # -n: never overwrite headers already copied from bazel-bin above
    cp -nr "${_folder}" "${pkgdir}/usr/include/tensorflow/"
  done

  # clean up unneeded files (the wheel was only installed for its headers)
  rm -rf "${pkgdir:?}/usr/bin"
  rm -rf "${pkgdir:?}/usr/lib"
  rm -rf "${pkgdir:?}/usr/share"

  # make sure no lib objects are outside valid paths
  _so_srch_path="${pkgdir}/usr/include"
  check_dir "${_so_srch_path}"  # we need to quit on broken search paths
  find "${_so_srch_path}" -type f,l \( -iname "*.so" -or -iname "*.so.*" \) -print0 | while IFS= read -rd '' _so_file; do
    # check if file is a dynamic executable
    if ldd "${_so_file}" &>/dev/null; then
      rm -rf "${_so_file}"
    fi
  done

  # install the rest of tensorflow
  tensorflow/c/generate-pc.sh --prefix=/usr --version="${pkgver}"
  sed -i -e 's@/include$@/include/tensorflow@' tensorflow.pc tensorflow_cc.pc
  install -Dm644 tensorflow.pc "${pkgdir}/usr/lib/pkgconfig/tensorflow.pc"
  install -Dm644 tensorflow_cc.pc "${pkgdir}/usr/lib/pkgconfig/tensorflow_cc.pc"

  # Each library is installed under its full version, with
  # lib.so -> lib.so.MAJOR -> lib.so.VERSION symlinks.
  for _lib in libtensorflow libtensorflow_cc libtensorflow_framework; do
    install -Dm755 "bazel-bin/tensorflow/${_lib}.so" "${pkgdir}/usr/lib/${_lib}.so.${pkgver}"
    ln -s "${_lib}.so.${pkgver}" "${pkgdir}/usr/lib/${_lib}.so.${_major}"
    ln -s "${_lib}.so.${_major}" "${pkgdir}/usr/lib/${_lib}.so"
  done

  install -Dm644 tensorflow/c/c_api.h "${pkgdir}/usr/include/tensorflow/tensorflow/c/c_api.h"
  install -Dm644 LICENSE "${pkgdir}/usr/share/licenses/${pkgname}/LICENSE"

  # Fix interoperability of C++14 and C++17. See https://bugs.archlinux.org/task/65953
  patch -Np0 -i "${srcdir}/fix-c++17-compat.patch" -d "${pkgdir}/usr/include/tensorflow/absl/base"
}
# Install the Python wheel into ${pkgdir} and replace its bundled header
# directories with symlinks into /usr/include/tensorflow.
# Must be called from inside the built tree (LICENSE is read relative to it).
# Globals:   srcdir, pkgdir, pkgname (read)
# Arguments: $1 - directory name under ${srcdir} that holds the built wheel
_python_package() {
  local _wheel_list _srch_path _folder _smlink
  local -a _wheels=()

  _wheel_list=$(find "${srcdir}/${1}" -name "tensor*.whl")
  if [[ -n "${_wheel_list}" ]]; then
    mapfile -t _wheels <<< "${_wheel_list}"
  fi
  pip install --ignore-installed --upgrade --root "${pkgdir}/" "${_wheels[@]}" --no-dependencies

  # create symlinks to headers
  # Assigned separately from `local` so a get_pyver failure is not masked.
  _srch_path="${pkgdir}/usr/lib/python$(get_pyver)/site-packages/tensorflow/include/"
  check_dir "${_srch_path}"  # we need to quit on broken search paths
  find "${_srch_path}" -maxdepth 1 -mindepth 1 -type d -print0 | while IFS= read -rd '' _folder; do
    rm -rf "${_folder}"
    _smlink=${_folder##*/}
    # The destination is the (still existing) include directory, so ln creates
    # the link inside it under the same name as the removed folder.
    ln -s "/usr/include/tensorflow/${_smlink}" "${_srch_path}"
  done

  # tensorboard has been separated from upstream but they still install it with
  # tensorflow. I don't know what kind of sense that makes but we have to clean
  # it out from this package.
  rm -rf "${pkgdir}/usr/bin/tensorboard"

  install -Dm644 LICENSE "${pkgdir}/usr/share/licenses/${pkgname}/LICENSE"
}
# C/C++ libraries and headers, generic x86-64 build.
package_tensorflow-rocm() {
  pkgdesc="Library for computation using data flow graphs for scalable machine learning (with ROCm)"
  depends+=(
    rocm-hip-sdk
    rocm-opencl-sdk
    roctracer
    miopen
    rccl
  )
  provides=(tensorflow)
  conflicts=(tensorflow)

  # _package works relative to the built tree and takes the wheel directory name.
  cd "${srcdir}/tensorflow-${_pkgver}-rocm"
  _package tmprocm
}
# C/C++ libraries and headers, AVX2-optimised build.
package_tensorflow-opt-rocm() {
  pkgdesc="Library for computation using data flow graphs for scalable machine learning (with ROCm and AVX2 CPU optimizations)"
  depends+=(
    rocm-hip-sdk
    rocm-opencl-sdk
    roctracer
    miopen
    rccl
  )
  provides=(
    tensorflow
    tensorflow-rocm
  )
  conflicts=(tensorflow)

  # _package works relative to the built tree and takes the wheel directory name.
  cd "${srcdir}/tensorflow-${_pkgver}-opt-rocm"
  _package tmpoptrocm
}
# Python module, generic x86-64 build.
package_python-tensorflow-rocm() {
  pkgdesc="Library for computation using data flow graphs for scalable machine learning (with ROCm)"
  depends+=(
    tensorflow-rocm
    rocm-hip-sdk
    rocm-opencl-sdk
    roctracer
    miopen
    rccl
    "${_common_py_depends[@]}"
  )
  provides=(python-tensorflow)
  conflicts=(python-tensorflow)

  # _python_package works relative to the built tree and takes the wheel
  # directory name.
  cd "${srcdir}/tensorflow-${_pkgver}-rocm"
  _python_package tmprocm
}
# Python module, AVX2-optimised build.
package_python-tensorflow-opt-rocm() {
  pkgdesc="Library for computation using data flow graphs for scalable machine learning (with ROCm and AVX2 CPU optimizations)"
  depends+=(
    tensorflow-opt-rocm
    rocm-hip-sdk
    rocm-opencl-sdk
    roctracer
    miopen
    rccl
    "${_common_py_depends[@]}"
  )
  provides=(
    python-tensorflow
    python-tensorflow-rocm
  )
  conflicts=(python-tensorflow)

  # _python_package works relative to the built tree and takes the wheel
  # directory name.
  cd "${srcdir}/tensorflow-${_pkgver}-opt-rocm"
  _python_package tmpoptrocm
}
# vim:set ts=2 sw=2 et:
Thanks @riaqn for providing the updated PKGBUILD. Unfortunately, it does not seem to work with the currently available ROCM AUR packages. Right now, this is a mixture of 4.5 and 4.5.2 versions. I'm getting the following errors:
...
ERROR: An error occurred during the fetch of repository 'local_config_rocm':
Traceback (most recent call last):
File "/home/tinux/pkg/tensorflow/trunk/src/tensorflow-2.7.0-rocm/third_party/gpus/rocm_configure.bzl", line 834, column 38, in _rocm_autoconf_impl
_create_local_rocm_repository(repository_ctx)
File "/home/tinux/pkg/tensorflow/trunk/src/tensorflow-2.7.0-rocm/third_party/gpus/rocm_configure.bzl", line 540, column 35, in _create_local_rocm_repository
rocm_config = _get_rocm_config(repository_ctx, bash_bin, find_rocm_config_script)
File "/home/tinux/pkg/tensorflow/trunk/src/tensorflow-2.7.0-rocm/third_party/gpus/rocm_configure.bzl", line 391, column 30, in _get_rocm_config
config = find_rocm_config(repository_ctx, find_rocm_config_script)
File "/home/tinux/pkg/tensorflow/trunk/src/tensorflow-2.7.0-rocm/third_party/gpus/rocm_configure.bzl", line 369, column 41, in find_rocm_config
exec_result = _exec_find_rocm_config(repository_ctx, script_path)
File "/home/tinux/pkg/tensorflow/trunk/src/tensorflow-2.7.0-rocm/third_party/gpus/rocm_configure.bzl", line 365, column 19, in _exec_find_rocm_config
return execute(repository_ctx, [python_bin, "-c", decompress_and_execute_cmd])
File "/home/tinux/pkg/tensorflow/trunk/src/tensorflow-2.7.0-rocm/third_party/remote_config/common.bzl", line 230, column 13, in execute
fail(
Error in fail: Repository command failed
ERROR: ROCm version file "/opt/rocm/.info/version-dev" not found
ERROR: Skipping '//tensorflow/tools/pip_package:build_pip_package': no such package '@local_config_rocm//rocm': Repository command failed
ERROR: ROCm version file "/opt/rocm/.info/version-dev" not found
ERROR: no such package '@local_config_rocm//rocm': Repository command failed
ERROR: ROCm version file "/opt/rocm/.info/version-dev" not found
INFO: Elapsed time: 14.407s
INFO: 0 processes.
FAILED: Build did NOT complete successfully (0 packages loaded)
currently loading: tensorflow ... (2 packages)
==> ERROR: A failure occurred in build().
Aborting...
Any ideas?
Also, it seems to be necessary to rebuild python-cppheaderparser after the upgrade of Python from 3.9 to 3.10 (See also this).
@t1nux yes, go to /opt/rocm/.info/
and symlink version
to version-dev
(and likewise for the other files). I believe this is because newer ROCm releases no longer ship this version file, while TensorFlow still uses it to identify ROCm.
Also, I don't know how deep you are in this ROCm mess, but my suggestion is: run! Run away from this immature ecosystem. Sell your Radeon and get a real GPU from NVIDIA, which is CUDA-ready.
@riaqn Sorry for the late reply. It's working now, after what seemed like an eternity of compiling...
I partly agree with Nvidia vs AMD regarding the ecosystem, but, for me, there's more to that topic, like the ratio (FP64-performance)/(USD). Anyway, this is a topic that should be discussed elsewhere ;-).
Thanks for your help and the PKGBUILD, too, of course.
@riaqn If I am missing any deps please create PRs to add them!
@acxz thanks for updating the package to 2.8.0. I guess you have removed the non-existent dependencies while updating it. Therefore I'm closing this issue. Also, I switched to PyTorch recently.
These dependencies no longer exist in the latest ROCm AUR packages.