From what I can see in the Kokkos tutorials, a way to use a Kokkos::View with the simd-math library is to use a view based on a simd type. It does not seem easy to express stencil computation with this kind of view as we need direct neighbours. Hence I would like to propose an alternative using a new execution policy, ThreadSimdRange, to handle this type of computation that is close to the existing Kokkos::ThreadVectorRange.
// 2-D scalar view. LayoutLeft makes the first index (i) contiguous in memory,
// so a run of consecutive i for fixed j can be loaded as one SIMD vector.
using view_type = Kokkos::View< double**, Kokkos::LayoutLeft >;
#ifdef KOKKOS_ENABLE_CUDA
// On CUDA one simd value is spread across the 32 lanes of a warp.
using simd_t = simd::simd< double, simd::simd_abi::cuda_warp< 32 > >;
#else
// On CPU use the native SIMD ABI selected at compile time.
using simd_t = simd::simd< double, simd::simd_abi::native >;
#endif
class SimdRange
{
public:
using simd_type = simd_t;
KOKKOS_INLINE_FUNCTION
SimdRange( int begin ) noexcept
: m_begin( begin )
{
}
KOKKOS_INLINE_FUNCTION
static constexpr auto size() noexcept
{
return simd_type::size();
}
KOKKOS_INLINE_FUNCTION
int begin() const noexcept
{
return m_begin;
}
KOKKOS_INLINE_FUNCTION
int end() const noexcept
{
return m_begin + this->size();
}
private:
int m_begin;
};
class ScalarRange
{
public:
KOKKOS_INLINE_FUNCTION
ScalarRange( int begin ) noexcept
: m_begin( begin )
{
}
KOKKOS_INLINE_FUNCTION
static constexpr auto size() noexcept
{
return 1;
}
KOKKOS_INLINE_FUNCTION
int begin() const noexcept
{
return m_begin;
}
KOKKOS_INLINE_FUNCTION
int end() const noexcept
{
return m_begin + this->size();
}
private:
int m_begin;
};
// Returns a copy of the range translated by s indices (s may be negative),
// e.g. shift( r, -1 ) addresses the left neighbours of r.
KOKKOS_INLINE_FUNCTION
SimdRange shift( const SimdRange& x, int s )
{
    return { x.begin() + s };
}
// Scalar counterpart of shift( SimdRange, int ): translate the single index.
KOKKOS_INLINE_FUNCTION
ScalarRange shift( const ScalarRange& x, int s )
{
    return { x.begin() + s };
}
// Loads simd_t::size() consecutive doubles starting at ptr + range.begin().
// element_aligned_tag means no alignment beyond alignof(double) is assumed,
// which is required since shifted stencil ranges can start anywhere.
KOKKOS_INLINE_FUNCTION
SimdRange::simd_type
load( const double* ptr, const SimdRange& range )
{
    const double* const first = ptr + range.begin();
    return SimdRange::simd_type( first, simd::element_aligned_tag() );
}
// Scalar overload: read the single element addressed by the range.
KOKKOS_INLINE_FUNCTION
double
load( const double* ptr, const ScalarRange& range )
{
    return ptr[ range.begin() ];
}
// Stores a full SIMD vector to the simd_t::size() consecutive doubles starting
// at ptr + range.begin(), without any alignment assumption.
KOKKOS_INLINE_FUNCTION
void
store( double* ptr, const SimdRange& range, const SimdRange::simd_type& simd )
{
    double* const first = ptr + range.begin();
    simd.copy_to( first, simd::element_aligned_tag() );
}
// Scalar overload: write the single element addressed by the range.
KOKKOS_INLINE_FUNCTION
void
store( double* ptr, const ScalarRange& range, const double& scalar )
{
    ptr[ range.begin() ] = scalar;
}
class ThreadSimdRange
{
public:
using scalar_range = ScalarRange;
using simd_range = SimdRange;
KOKKOS_INLINE_FUNCTION
ThreadSimdRange( int begin, int end ) noexcept
: m_begin( begin )
, m_end( end )
{
}
KOKKOS_INLINE_FUNCTION
ThreadSimdRange( int count ) noexcept
: ThreadSimdRange( 0, count )
{
}
KOKKOS_INLINE_FUNCTION
int begin() const noexcept
{
return m_begin;
}
KOKKOS_INLINE_FUNCTION
int end() const noexcept
{
return m_end;
}
private:
int m_begin;
int m_end;
};
// Splits [ range.begin(), range.end() ) into a SIMD main loop (stride = SIMD
// width) plus a scalar remainder loop, mimicking what a vectorizing compiler
// does. The functor must be callable with both SimdRange and ScalarRange
// (hence the generic lambda at the call site).
template < class Functor >
KOKKOS_INLINE_FUNCTION
void
parallel_for( const ThreadSimdRange& range, Functor&& f )
{
    using simd_range = ThreadSimdRange::simd_range;
    const auto count = range.end() - range.begin();
    // First index that no longer fits a full SIMD vector.
    const auto end_vector = range.end() - count % simd_range::size();
    // SIMD main loop: each iteration covers simd_range::size() indices.
    for ( auto i = range.begin(); i < end_vector; i += simd_range::size() )
    {
        f( SimdRange( i ) );
    }
    // Scalar remainder loop.
#if defined(__CUDA_ARCH__)
    // BUGFIX: guard with __CUDA_ARCH__, not __CUDACC__. __CUDACC__ is defined
    // during BOTH the host and device compilation passes of nvcc, so the host
    // pass of this __host__ __device__ function would see threadIdx/blockDim,
    // which exist only in device code. __CUDA_ARCH__ is defined only while
    // compiling device code.
    // The remainder is spread over the warp lanes; this assumes the team
    // vector length equals blockDim.x (vector_length == simd width at the
    // TeamPolicy) -- TODO confirm for team_size > 1.
    for ( auto i = end_vector + threadIdx.x; i < range.end(); i += blockDim.x )
    {
        f( ScalarRange( i ) );
    }
#else
    // Host: plain sequential remainder.
    for ( auto i = end_vector; i < range.end(); ++i )
    {
        f( ScalarRange( i ) );
    }
#endif
}
// ...
// Example: stencil sweep over a 2-D view, one team per column j. The vector
// length is set to the simd width so the device remainder loop in
// parallel_for can map onto the warp lanes.
Kokkos::parallel_for( "Stencil computation", Kokkos::TeamPolicy<>( view.extent( 1 ), 1, simd_t::size() ),
KOKKOS_LAMBDA(const Kokkos::TeamPolicy<>::member_type& team)
{
// NOTE(review): league_rank() + 1 skips the j = 0 boundary, but the policy
// launches extent(1) teams, so the last team reads view( ., j + 1 ) past the
// extent -- presumably the league size should be extent(1) - 2; verify.
const int j = team.league_rank() + 1;
// Iterate over the interior i indices; the generic lambda is instantiated
// for both SimdRange (vector body) and ScalarRange (remainder).
parallel_for( ThreadSimdRange( 1, view.extent( 0 ) - 1 ),
[&]( const auto& irange ) // represents [i, i + 1, ...]
{
const auto T_ij = load( &view( 0, j ), irange ); // loads [view( i , j ), view( i + 1, j ), ...]
const auto T_imj = load( &view( 0, j ), shift( irange, -1 ) ); // loads [view( i - 1, j ), view( i , j ), ...]
const auto T_ipj = load( &view( 0, j ), shift( irange, +1 ) ); // loads [view( i + 1, j ), view( i + 2, j ), ...]
const auto T_ijm = load( &view( 0, j - 1 ), irange ); // loads [view( i , j - 1 ), view( i + 1, j - 1 ), ...]
const auto T_ijp = load( &view( 0, j + 1 ), irange ); // loads [view( i , j + 1 ), view( i + 1, j + 1 ), ...]
const auto T2_ij = T_ipj + T_ijp + T_ij - T_imj - T_ijm;
store( &view2( 0, j ), irange, T2_ij );
} );
} );
The SimdRange represents a contiguous set of indices that will be used to load/store (unaligned) data from a scalar Kokkos::View. The associated parallel_for should mimic the work of the compiler by splitting the loop into a vectorized loop and a scalar loop for the remainder.
A drawback is the need for C++14 because of the generic lambda, but the code compiles and runs with recent compilers.
Do you see any problem with this execution policy?
From what I can see in the Kokkos tutorials, a way to use a Kokkos::View with the simd-math library is to use a view based on a simd type. It does not seem easy to express stencil computation with this kind of view as we need direct neighbours. Hence I would like to propose an alternative using a new execution policy, ThreadSimdRange, to handle this type of computation that is close to the existing Kokkos::ThreadVectorRange.
The SimdRange represents a contiguous set of indices that will be used to load/store (unaligned) data from a scalar Kokkos::View. The associated parallel_for should mimic the work of the compiler by splitting the loop into a vectorized loop and a scalar loop for the remainder.
A drawback is the need for C++14 because of the generic lambda, but the code compiles and runs with recent compilers.
Do you see any problem with this execution policy?