As a follow-on to https://github.com/NVIDIA/spark-rapids/issues/6983, we ran the q95 query at 30TB with the fix from this PR (https://github.com/rapidsai/cudf/pull/12079) and ended up failing during a couple of the later joins: an inner join and a left semi join.
In both of those cases we are hitting instances of the overflowing strided loop issue in cuco's static_multimap::pair_count and static_map::insert (see the compute-sanitizer output below). It looks like cuDF could work around this by using int64_t as the type in its counting_transform_iterator (as I did in this proof-of-concept), but it is not clear whether that is the right solution. This issue is for our tracking; the fix will be in cuDF or cuCollections.
The only current workaround is to increase our shuffle partitions (for example, 400 partitions worked without issues).
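To make the failure mode concrete, here is a minimal sketch of an overflowing strided loop and of the 64-bit arithmetic the proof-of-concept switches to. This is an illustration only, not cuco's actual pair_count kernel; the kernel names and parameters below are made up:

```cuda
#include <cstdint>

// Hypothetical sketch of the overflowing strided loop (our names, not
// cuco's). On a grid as large as the one in the trace below, `stride` is
// already ~1.9e9, so the 32-bit increment can pass INT_MAX and wrap
// negative, and the next load reads far below the allocation (matching
// the "bytes before the nearest allocation" report).
__global__ void count_matches_32bit(int const* rows, int num_rows,
                                    unsigned long long* n_matched)
{
  int idx    = blockIdx.x * blockDim.x + threadIdx.x;
  int stride = gridDim.x * blockDim.x;           // ~1.9e9 on this grid
  for (int i = idx; i < num_rows; i += stride) { // i += stride can overflow: UB
    if (rows[i] != 0) { atomicAdd(n_matched, 1ULL); }
  }
}

// Shape of the proof-of-concept workaround: do the same index arithmetic
// in 64 bits (analogous to templating cuDF's counting_transform_iterator
// on int64_t, so the iterator arithmetic inside cuco can no longer wrap).
__global__ void count_matches_64bit(int const* rows, std::int64_t num_rows,
                                    unsigned long long* n_matched)
{
  auto idx    = static_cast<std::int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  auto stride = static_cast<std::int64_t>(gridDim.x) * blockDim.x;
  for (auto i = idx; i < num_rows; i += stride) {
    if (rows[i] != 0) { atomicAdd(n_matched, 1ULL); }
  }
}
```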
Inner join:
========= Invalid __global__ read of size 4 bytes
========= at 0x500 in void cuco::detail::pair_count<(unsigned int)128, (unsigned int)2, (bool)0, thrust::transform_iterator<cudf::detail::make_pair_function<cudf::row_hasher<cudf::detail::default_hash, cudf::nullate::DYNAMIC>, int>, thrust::counting_iterator<int, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::use_default, thrust::use_default>, cuda::__4::atomic<unsigned long, (cuda::std::__4::__detail::thread_scope)1>, cuco::static_multimap<unsigned int, int, (cuda::std::__4::__detail::thread_scope)1, rmm::mr::stream_allocator_adaptor<default_allocator<char>>, cuco::double_hashing<(unsigned int)2, cudf::detail::MurmurHash3_32<unsigned int>, cudf::detail::MurmurHash3_32<unsigned int>>>::device_view, cudf::detail::pair_equality<cudf::row_equality_comparator<cudf::nullate::DYNAMIC>>>(T4, T4, T5 *, T6, T7)
========= by thread (64,0,0) in block (14773391,0,0)
========= Address 0xbcd89fe80 is out of bounds
========= and is 1603745152 bytes before the nearest allocation at 0xc2d213400 of size 256 bytes
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame: [0x22da7a]
========= in /usr/lib/x86_64-linux-gnu/libcuda.so.1
========= Host Frame: [0x3deb04b]
========= in /tmp/cudf1750694697214535636.so
========= Host Frame: [0x3e28798]
========= in /tmp/cudf1750694697214535636.so
========= Host Frame:unsigned long cuco::static_multimap<unsigned int, int, (cuda::std::__4::__detail::thread_scope)1, rmm::mr::stream_allocator_adaptor<default_allocator<char> >, cuco::double_hashing<2u, cudf::detail::MurmurHash3_32<unsigned int>, cudf::detail::MurmurHash3_32<unsigned int> > >::pair_count<thrust::transform_iterator<cudf::detail::make_pair_function<cudf::row_hasher<cudf::detail::MurmurHash3_32, cudf::nullate::DYNAMIC>, int>, thrust::counting_iterator<int, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::use_default, thrust::use_default>, cudf::detail::pair_equality<cudf::row_equality_comparator<cudf::nullate::DYNAMIC> > >(thrust::transform_iterator<cudf::detail::make_pair_function<cudf::row_hasher<cudf::detail::MurmurHash3_32, cudf::nullate::DYNAMIC>, int>, thrust::counting_iterator<int, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::use_default, thrust::use_default>, thrust::transform_iterator<cudf::detail::make_pair_function<cudf::row_hasher<cudf::detail::MurmurHash3_32, cudf::nullate::DYNAMIC>, int>, thrust::counting_iterator<int, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::use_default, thrust::use_default>, cudf::detail::pair_equality<cudf::row_equality_comparator<cudf::nullate::DYNAMIC> >, CUstream_st*) const [0x1e69a52]
========= in /tmp/cudf1750694697214535636.so
========= Host Frame:unsigned long cudf::detail::_GLOBAL__N__dbc92c90_12_hash_join_cu_cd66f71b::compute_join_output_size<cudf::detail::join_kind>(cudf::table_device_view, cudf::detail::_GLOBAL__N__dbc92c90_12_hash_join_cu_cd66f71b::compute_join_output_size<cudf::detail::join_kind>, cuco::static_multimap<unsigned int, int, cuda::std::__4::__detail::thread_scope, rmm::mr::stream_allocator_adaptor<default_allocator<char>>, cudf::table_device_view::double_hashing<unsigned int=2, cudf::detail::MurmurHash3_32<unsigned int>, cudf::detail::MurmurHash3_32>> const &, bool, cudf::null_equality, cuda::std::__4::__detail::thread_scope::cuda_stream_view) [0x1e69fcc]
========= in /tmp/cudf1750694697214535636.so
========= Host Frame:std::pair<std::unique_ptr<rmm::device_uvector<int>, std::default_delete<rmm::device_uvector>>, std::default_delete<rmm::device_uvector>> cudf::detail::_GLOBAL__N__dbc92c90_12_hash_join_cu_cd66f71b::probe_join_hash_table<cudf::detail::join_kind>(cudf::table_device_view, std::pair<std::unique_ptr<rmm::device_uvector<int>, std::default_delete<rmm::device_uvector>>, std::default_delete<rmm::device_uvector>>, cuco::static_multimap<unsigned int, int, cuda::std::__4::__detail::thread_scope, std::unique_ptr::mr::stream_allocator_adaptor<default_allocator<char>>, cudf::table_device_view::double_hashing<unsigned int=2, cudf::detail::MurmurHash3_32<unsigned int>, cudf::detail::MurmurHash3_32>> const &, bool, cudf::null_equality, std::optional<unsigned long>, std::unique_ptr::cuda_stream_view, cuda::std::__4::__detail::thread_scope::device_memory_resource*) [0x1e6f64f]
========= in /tmp/cudf1750694697214535636.so
========= Host Frame:std::pair<std::unique_ptr<rmm::device_uvector<int>, std::default_delete<rmm::device_uvector<int> > >, std::unique_ptr<rmm::device_uvector<int>, std::default_delete<rmm::device_uvector<int> > > > cudf::detail::hash_join<cudf::detail::MurmurHash3_32<unsigned int> >::probe_join_indices<(cudf::detail::join_kind)0>(cudf::table_view const&, std::optional<unsigned long>, rmm::cuda_stream_view, rmm::mr::device_memory_resource*) const [0x1e6f7f2]
========= in /tmp/cudf1750694697214535636.so
========= Host Frame:std::pair<std::unique_ptr<rmm::device_uvector<int>, std::default_delete<rmm::device_uvector<int> > >, std::unique_ptr<rmm::device_uvector<int>, std::default_delete<rmm::device_uvector<int> > > > cudf::detail::hash_join<cudf::detail::MurmurHash3_32<unsigned int> >::compute_hash_join<(cudf::detail::join_kind)0>(cudf::table_view const&, std::optional<unsigned long>, rmm::cuda_stream_view, rmm::mr::device_memory_resource*) const [0x1e6face]
========= in /tmp/cudf1750694697214535636.so
========= Host Frame:cudf::hash_join::inner_join(cudf::table_view const&, std::optional<unsigned long>, rmm::cuda_stream_view, rmm::mr::device_memory_resource*) const [0x1e679e3]
========= in /tmp/cudf1750694697214535636.so
========= Host Frame:cudf::detail::inner_join(cudf::table_view const &, cudf::table_view const &, cudf::null_equality, rmm::cuda_stream_view, rmm::mr::device_memory_resource*) [0x1e70633]
========= in /tmp/cudf1750694697214535636.so
========= Host Frame:cudf::inner_join(cudf::table_view const &, cudf::table_view const &, cudf::null_equality, rmm::mr::device_memory_resource*) [0x1e70c5c]
========= in /tmp/cudf1750694697214535636.so
========= Host Frame:Java_ai_rapids_cudf_Table_innerJoinGatherMaps [0x14ec5e3]
========= in /tmp/cudf1750694697214535636.so
========= Host Frame: [0x254ac96a7]
========= in
=========
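A quick check of the coordinates in the trace is consistent with 32-bit overflow (this arithmetic is ours, assuming the 128 threads per block shown in the kernel's template arguments): thread (64,0,0) of block (14773391,0,0) has linear id 14773391 x 128 + 64 = 1,890,994,112, and the grid-wide stride is at least 14,773,392 x 128 = 1,890,994,176. Adding one such stride to any 32-bit index above roughly 256 million exceeds INT_MAX (2,147,483,647) and wraps negative, which would land reads below the allocation, matching the address being reported before the nearest allocation rather than after it.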
Left semi: