NVIDIA / cccl

CUDA Core Compute Libraries
https://nvidia.github.io/cccl/
Other
1.25k stars 161 forks source link

error: reference to __host__ function 'parallel_for<thrust::cuda_cub::for_each_f...' in __host__ __device__ function #753

Open HangJie720 opened 4 years ago

HangJie720 commented 4 years ago

I try to use clang10 with cuda10.2 to compile the following code:

#include <thrust/binary_search.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/functional.h>

#define CUDA_1D_KERNEL_LOOP(i, n)                                 \
  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
       i += blockDim.x * gridDim.x)

__global__ void PieceWiseLinearTransformGeneralKernel(
    const int N,
    const int M,
    const int num_grp,
    const int num_fnc_per_grp,
    const float* bounds,
    const float* slopes,
    const float* intercepts,
    const float* X,
    float* Y) {
  CUDA_1D_KERNEL_LOOP(i, N * M) {
    int col = i % M;
    const float* bounds_group = bounds + (col * (num_fnc_per_grp + 1));
    const float* slopes_group = slopes + (col * num_fnc_per_grp);
    const float* intercepts_group = intercepts + (col * num_fnc_per_grp);

    if (X[i] <= bounds_group[0]) {
      Y[i] = slopes_group[0] * bounds_group[0] + intercepts_group[0];
    } else if (X[i] >= bounds_group[num_fnc_per_grp]) {
      Y[i] = slopes_group[num_fnc_per_grp - 1] * bounds_group[num_fnc_per_grp] +
          intercepts_group[num_fnc_per_grp - 1];
    } else {
      auto low_bound = thrust::lower_bound(
          thrust::device,
          bounds_group,
          bounds_group + num_fnc_per_grp + 1,
          X[i]);
      int bounds_idx = low_bound - bounds_group - 1;
      Y[i] = slopes_group[bounds_idx] * X[i] + intercepts_group[bounds_idx];
    }
  }
}

int main(int argc, char* argv[]){
    return 0;
}

reference to hostfunction in host__device__function error occurs as follow.

root@1615c67ed2eb:/home/jhang# clang++ -std=c++11 -lnvToolsExt -I/usr/local/cuda/include 
-L/usr/local/cuda/lib64 -lcudart -lcublas -lcudnn -lgomp -lcurand 
--cuda-path=/usr/local/cuda --cuda-gpu-arch=sm_61 -x cuda -o test test.cu
clang: warning: Unknown CUDA version 10.2. Assuming the latest supported version 10.1 [-Wunknown-cuda-version]
In file included from test.cu:1:
In file included from /usr/local/cuda/include/thrust/binary_search.h:1901:
In file included from /usr/local/cuda/include/thrust/detail/binary_search.inl:26:
In file included from /usr/local/cuda/include/thrust/system/detail/generic/binary_search.h:173:
In file included from /usr/local/cuda/include/thrust/system/detail/generic/binary_search.inl:31:
In file included from /usr/local/cuda/include/thrust/for_each.h:279:
In file included from /usr/local/cuda/include/thrust/detail/for_each.inl:27:
In file included from /usr/local/cuda/include/thrust/system/detail/adl/for_each.h:42:
In file included from /usr/local/cuda/include/thrust/system/cuda/detail/for_each.h:35:
/usr/local/cuda/include/thrust/system/cuda/detail/parallel_for.h:164:43: error: reference to __host__ function 'parallel_for<thrust::cuda_cub::for_each_f<thrust::zip_iterator<thrust::tuple<thrust::detail::normal_iterator<thrust::pointer<float, thrust::cuda_cub::par_t, thrust::use_default, thrust::use_default> >, thrust::detail::normal_iterator<thrust::pointer<long, thrust::cuda_cub::par_t, thrust::use_default, thrust::use_default> >, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type> >, thrust::detail::wrapped_function<thrust::system::detail::generic::detail::binary_search_functor<const float *, thrust::system::detail::generic::detail::binary_search_less, thrust::system::detail::generic::detail::lbf>, void> >, long>' in __host__ __device__ function
    cudaError_t  status = __parallel_for::parallel_for(count, f, stream);
                                          ^
/usr/local/cuda/include/thrust/system/cuda/detail/for_each.h:79:15: note: in instantiation of function template specialization 'thrust::cuda_cub::parallel_for<thrust::cuda_cub::par_t, thrust::cuda_cub::for_each_f<thrust::zip_iterator<thrust::tuple<thrust::detail::normal_iterator<thrust::pointer<float, thrust::cuda_cub::par_t, thrust::use_default, thrust::use_default> >, thrust::detail::normal_iterator<thrust::pointer<long, thrust::cuda_cub::par_t, thrust::use_default, thrust::use_default> >, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type> >, thrust::detail::wrapped_function<thrust::system::detail::generic::detail::binary_search_functor<const float *, thrust::system::detail::generic::detail::binary_search_less, thrust::system::detail::generic::detail::lbf>, void> >, long>' requested here
    cuda_cub::parallel_for(policy,
              ^
/usr/local/cuda/include/thrust/system/cuda/detail/for_each.h:103:22: note: in instantiation of function template specialization 'thrust::cuda_cub::for_each_n<thrust::cuda_cub::par_t, thrust::zip_iterator<thrust::tuple<thrust::detail::normal_iterator<thrust::pointer<float, thrust::cuda_cub::par_t, thrust::use_default, thrust::use_default> >, thrust::detail::normal_iterator<thrust::pointer<long, thrust::cuda_cub::par_t, thrust::use_default, thrust::use_default> >, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type> >, long, thrust::system::detail::generic::detail::binary_search_functor<const float *, thrust::system::detail::generic::detail::binary_search_less, thrust::system::detail::generic::detail::lbf> >' requested here
    return cuda_cub::for_each_n(policy, first,  count, op);
                     ^
/usr/local/cuda/include/thrust/detail/for_each.inl:44:10: note: in instantiation of function template specialization 'thrust::cuda_cub::for_each<thrust::cuda_cub::par_t, thrust::zip_iterator<thrust::tuple<thrust::detail::normal_iterator<thrust::pointer<float, thrust::cuda_cub::par_t, thrust::use_default, thrust::use_default> >, thrust::detail::normal_iterator<thrust::pointer<long, thrust::cuda_cub::par_t, thrust::use_default, thrust::use_default> >, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type> >, thrust::system::detail::generic::detail::binary_search_functor<const float *, thrust::system::detail::generic::detail::binary_search_less, thrust::system::detail::generic::detail::lbf> >' requested here
  return for_each(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, f);
         ^
/usr/local/cuda/include/thrust/system/detail/generic/binary_search.inl:132:11: note: in instantiation of function template specialization 'thrust::for_each<thrust::cuda_cub::par_t, thrust::zip_iterator<thrust::tuple<thrust::detail::normal_iterator<thrust::pointer<float, thrust::cuda_cub::par_t, thrust::use_default, thrust::use_default> >, thrust::detail::normal_iterator<thrust::pointer<long, thrust::cuda_cub::par_t, thrust::use_default, thrust::use_default> >, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type> >, thrust::system::detail::generic::detail::binary_search_functor<const float *, thrust::system::detail::generic::detail::binary_search_less, thrust::system::detail::generic::detail::lbf> >' requested here
  thrust::for_each(exec,
          ^
/usr/local/cuda/include/thrust/system/detail/generic/binary_search.inl:162:44: note: in instantiation of function template specialization 'thrust::system::detail::generic::detail::binary_search<thrust::cuda_cub::par_t, const float *, thrust::detail::normal_iterator<thrust::pointer<float, thrust::cuda_cub::par_t, thrust::use_default, thrust::use_default> >, thrust::detail::normal_iterator<thrust::pointer<long, thrust::cuda_cub::par_t, thrust::use_default, thrust::use_default> >, thrust::system::detail::generic::detail::binary_search_less, thrust::system::detail::generic::detail::lbf>' requested here
  thrust::system::detail::generic::detail::binary_search(exec, begin, end, d_value.begin(), d_value.end(), d_output.begin(), comp, func);
                                           ^
/usr/local/cuda/include/thrust/system/detail/generic/binary_search.inl:213:26: note: in instantiation of function template specialization 'thrust::system::detail::generic::detail::binary_search<long, thrust::cuda_cub::par_t, const float *, float, thrust::system::detail::generic::detail::binary_search_less, thrust::system::detail::generic::detail::lbf>' requested here
  return begin + detail::binary_search<difference_type>(exec, begin, end, value, comp, detail::lbf());
                         ^
/usr/local/cuda/include/thrust/detail/binary_search.inl:56:12: note: in instantiation of function template specialization 'thrust::system::detail::generic::lower_bound<thrust::cuda_cub::par_t, const float *, float, thrust::system::detail::generic::detail::binary_search_less>' requested here
    return lower_bound(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, value, comp);
           ^
/usr/local/cuda/include/thrust/system/detail/generic/binary_search.inl:200:18: note: in instantiation of function template specialization 'thrust::lower_bound<thrust::cuda_cub::par_t, const float *, float, thrust::system::detail::generic::detail::binary_search_less>' requested here
  return thrust::lower_bound(exec, begin, end, value, detail::binary_search_less());
                 ^
/usr/local/cuda/include/thrust/detail/binary_search.inl:42:12: note: in instantiation of function template specialization 'thrust::system::detail::generic::lower_bound<thrust::cuda_cub::par_t, const float *, float>' requested here
    return lower_bound(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, value);
           ^
test.cu:33:32: note: in instantiation of function template specialization 'thrust::lower_bound<thrust::cuda_cub::par_t, const float *, float>' requested here
      auto low_bound = thrust::lower_bound(
                               ^
/usr/local/cuda/include/thrust/system/cuda/detail/for_each.h:79:15: note: called by 'for_each_n<thrust::cuda_cub::par_t, thrust::zip_iterator<thrust::tuple<thrust::detail::normal_iterator<thrust::pointer<float, thrust::cuda_cub::par_t, thrust::use_default, thrust::use_default> >, thrust::detail::normal_iterator<thrust::pointer<long, thrust::cuda_cub::par_t, thrust::use_default, thrust::use_default> >, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type> >, long, thrust::system::detail::generic::detail::binary_search_functor<const float *, thrust::system::detail::generic::detail::binary_search_less, thrust::system::detail::generic::detail::lbf> >'
    cuda_cub::parallel_for(policy,
              ^
/usr/local/cuda/include/thrust/system/cuda/detail/for_each.h:103:22: note: called by 'for_each<thrust::cuda_cub::par_t, thrust::zip_iterator<thrust::tuple<thrust::detail::normal_iterator<thrust::pointer<float, thrust::cuda_cub::par_t, thrust::use_default, thrust::use_default> >, thrust::detail::normal_iterator<thrust::pointer<long, thrust::cuda_cub::par_t, thrust::use_default, thrust::use_default> >, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type> >, thrust::system::detail::generic::detail::binary_search_functor<const float *, thrust::system::detail::generic::detail::binary_search_less, thrust::system::detail::generic::detail::lbf> >'
    return cuda_cub::for_each_n(policy, first,  count, op);
                     ^
/usr/local/cuda/include/thrust/detail/for_each.inl:44:10: note: called by 'for_each<thrust::cuda_cub::par_t, thrust::zip_iterator<thrust::tuple<thrust::detail::normal_iterator<thrust::pointer<float, thrust::cuda_cub::par_t, thrust::use_default, thrust::use_default> >, thrust::detail::normal_iterator<thrust::pointer<long, thrust::cuda_cub::par_t, thrust::use_default, thrust::use_default> >, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type> >, thrust::system::detail::generic::detail::binary_search_functor<const float *, thrust::system::detail::generic::detail::binary_search_less, thrust::system::detail::generic::detail::lbf> >'
  return for_each(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, f);
         ^
/usr/local/cuda/include/thrust/system/detail/generic/binary_search.inl:132:11: note: called by 'binary_search<thrust::cuda_cub::par_t, const float *, thrust::detail::normal_iterator<thrust::pointer<float, thrust::cuda_cub::par_t, thrust::use_default, thrust::use_default> >, thrust::detail::normal_iterator<thrust::pointer<long, thrust::cuda_cub::par_t, thrust::use_default, thrust::use_default> >, thrust::system::detail::generic::detail::binary_search_less, thrust::system::detail::generic::detail::lbf>'
  thrust::for_each(exec,
          ^
/usr/local/cuda/include/thrust/system/detail/generic/binary_search.inl:162:44: note: called by 'binary_search<long, thrust::cuda_cub::par_t, const float *, float, thrust::system::detail::generic::detail::binary_search_less, thrust::system::detail::generic::detail::lbf>'
  thrust::system::detail::generic::detail::binary_search(exec, begin, end, d_value.begin(), d_value.end(), d_output.begin(), comp, func);
                                           ^
/usr/local/cuda/include/thrust/system/detail/generic/binary_search.inl:213:26: note: called by 'lower_bound<thrust::cuda_cub::par_t, const float *, float, thrust::system::detail::generic::detail::binary_search_less>'
  return begin + detail::binary_search<difference_type>(exec, begin, end, value, comp, detail::lbf());
                         ^
/usr/local/cuda/include/thrust/detail/binary_search.inl:56:12: note: called by 'lower_bound<thrust::cuda_cub::par_t, const float *, float, thrust::system::detail::generic::detail::binary_search_less>'
    return lower_bound(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, value, comp);
           ^
/usr/local/cuda/include/thrust/system/detail/generic/binary_search.inl:200:18: note: called by 'lower_bound<thrust::cuda_cub::par_t, const float *, float>'
  return thrust::lower_bound(exec, begin, end, value, detail::binary_search_less());
                 ^
/usr/local/cuda/include/thrust/detail/binary_search.inl:42:12: note: called by 'lower_bound<thrust::cuda_cub::par_t, const float *, float>'
    return lower_bound(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, value);
           ^
test.cu:33:32: note: called by 'PieceWiseLinearTransformGeneralKernel'
      auto low_bound = thrust::lower_bound(
                               ^
/usr/local/cuda/include/thrust/system/cuda/detail/parallel_for.h:127:3: note: 'parallel_for<thrust::cuda_cub::for_each_f<thrust::zip_iterator<thrust::tuple<thrust::detail::normal_iterator<thrust::pointer<float, thrust::cuda_cub::par_t, thrust::use_default, thrust::use_default> >, thrust::detail::normal_iterator<thrust::pointer<long, thrust::cuda_cub::par_t, thrust::use_default, thrust::use_default> >, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type> >, thrust::detail::wrapped_function<thrust::system::detail::generic::detail::binary_search_functor<const float *, thrust::system::detail::generic::detail::binary_search_less, thrust::system::detail::generic::detail::lbf>, void> >, long>' declared here
  parallel_for(Size         num_items,
  ^
1 error generated when compiling for sm_61.

while nvcc is ok, for calling hostfunction in host__device__function is not allowed for clang?

alliepiper commented 4 years ago

Thrust does not officially support Clang CUDA; the existing support for that compiler is a community-maintained effort. If someone submits a PR to address Clang CUDA bugs we'll typically accept them, but this isn't likely to be fixed otherwise.

I'd suggest trying out the current main branch of Thrust to see if this has already been fixed since10.2 -- we've had a number of contributed Clang CUDA cleanups merged since then.

miscco commented 1 year ago

@HangJie720 Could you please have a look whether this issue is still reproducible