Proposition & discussion for a ThreadSimdRange execution policy

From what I can see in the Kokkos tutorials, a way to use a Kokkos::View with the simd-math library is to use a view based on a simd type. It does not seem easy to express stencil computation with this kind of view as we need direct neighbours. Hence I would like to propose an alternative using a new execution policy, ThreadSimdRange, to handle this type of computation that is close to the existing Kokkos::ThreadVectorRange.

using view_type = Kokkos::View< double**, Kokkos::LayoutLeft >;

#ifdef KOKKOS_ENABLE_CUDA
    using simd_t = simd::simd< double, simd::simd_abi::cuda_warp< 32 > >;
#else
    using simd_t = simd::simd< double, simd::simd_abi::native >;
#endif

class SimdRange
{
public:
    using simd_type = simd_t;

    KOKKOS_INLINE_FUNCTION
    SimdRange( int begin ) noexcept
        : m_begin( begin )
    {
    }

    KOKKOS_INLINE_FUNCTION
    static constexpr auto size() noexcept
    {
        return simd_type::size();
    }

    KOKKOS_INLINE_FUNCTION
    int begin() const noexcept
    {
        return m_begin;
    }

    KOKKOS_INLINE_FUNCTION
    int end() const noexcept
    {
        return m_begin + this->size();
    }

private:
    int m_begin;
};

class ScalarRange
{
public:
    KOKKOS_INLINE_FUNCTION
    ScalarRange( int begin ) noexcept
        : m_begin( begin )
    {
    }

    KOKKOS_INLINE_FUNCTION
    static constexpr auto size() noexcept
    {
        return 1;
    }

    KOKKOS_INLINE_FUNCTION
    int begin() const noexcept
    {
        return m_begin;
    }

    KOKKOS_INLINE_FUNCTION
    int end() const noexcept
    {
        return m_begin + this->size();
    }

private:
    int m_begin;
};

KOKKOS_INLINE_FUNCTION
SimdRange shift( const SimdRange& x, int s )
{
    return SimdRange( x.begin() + s );
}

KOKKOS_INLINE_FUNCTION
ScalarRange shift( const ScalarRange& x, int s )
{
    return ScalarRange( x.begin() + s );
}

KOKKOS_INLINE_FUNCTION
SimdRange::simd_type
load( const double* ptr, const SimdRange& range )
{
    return SimdRange::simd_type( ptr + range.begin(), simd::element_aligned_tag() );
}

KOKKOS_INLINE_FUNCTION
double
load( const double* ptr, const ScalarRange& range )
{
    return *( ptr + range.begin() );
}

KOKKOS_INLINE_FUNCTION
void
store( double* ptr, const SimdRange& range, const SimdRange::simd_type& simd )
{
    simd.copy_to( ptr + range.begin(), simd::element_aligned_tag() );
}

KOKKOS_INLINE_FUNCTION
void
store( double* ptr, const ScalarRange& range, const double& scalar )
{
    *( ptr + range.begin() ) = scalar;
}

class ThreadSimdRange
{
public:
    using scalar_range = ScalarRange;
    using simd_range = SimdRange;

    KOKKOS_INLINE_FUNCTION
    ThreadSimdRange( int begin, int end ) noexcept
        : m_begin( begin )
        , m_end( end )
    {
    }

    KOKKOS_INLINE_FUNCTION
    ThreadSimdRange( int count ) noexcept
        : ThreadSimdRange( 0, count )
    {
    }

    KOKKOS_INLINE_FUNCTION
    int begin() const noexcept
    {
        return m_begin;
    }

    KOKKOS_INLINE_FUNCTION
    int end() const noexcept
    {
        return m_end;
    }

private:
    int m_begin;
    int m_end;
};

template < class Functor >
KOKKOS_INLINE_FUNCTION
void
parallel_for( const ThreadSimdRange& range, Functor&& f )
{
    using simd_range = ThreadSimdRange::simd_range;

    const auto count = range.end() - range.begin();
    const auto end_vector = range.end() - count % simd_range::size();

    // simd loop
    for ( auto i = range.begin(); i < end_vector; i += simd_range::size() )
    {
        f( SimdRange( i ) );
    }

    // remainder loop
#if defined(__CUDACC__) && defined(KOKKOS_ENABLE_CUDA)
    for ( auto i = end_vector + threadIdx.x; i < range.end(); i += blockDim.x )
    {
        f( ScalarRange( i ) );
    }
#else
    for ( auto i = end_vector; i < range.end(); ++i )
    {
        f( ScalarRange( i ) );
    }
#endif
}

// ...

Kokkos::parallel_for( "Stencil computation", Kokkos::TeamPolicy<>( view.extent( 1 ), 1, simd_t::size() ),
    KOKKOS_LAMBDA(const Kokkos::TeamPolicy<>::member_type& team)
    {
        const int j = team.league_rank() + 1;
        parallel_for( ThreadSimdRange( 1, view.extent( 0 ) - 1 ),
            [&]( const auto& irange ) // represents [i, i + 1, ...]
            {
                const auto T_ij  = load( &view( 0, j     ), irange );              // loads [view( i    , j     ), view( i + 1, j     ), ...]
                const auto T_imj = load( &view( 0, j     ), shift( irange, -1 ) ); // loads [view( i - 1, j     ), view( i    , j     ), ...]
                const auto T_ipj = load( &view( 0, j     ), shift( irange, +1 ) ); // loads [view( i + 1, j     ), view( i + 2, j     ), ...]
                const auto T_ijm = load( &view( 0, j - 1 ), irange );              // loads [view( i    , j - 1 ), view( i + 1, j - 1 ), ...]
                const auto T_ijp = load( &view( 0, j + 1 ), irange );              // loads [view( i    , j + 1 ), view( i + 1, j + 1 ), ...]
                const auto T2_ij = T_ipj + T_ijp + T_ij - T_imj - T_ijm;
                store( &view2( 0, j ), irange, T2_ij );
            } );
    } );

The SimdRange represents a contiguous set of indices that will be used to load/store (unaligned) data from a scalar Kokkos::View. The associated parallel_for should mimic the work of the compiler by splitting the loop into a vectorized loop and a scalar loop for the remainder.

A drawback is the need for C++ 14 because of the lambda but compiles and run with recent compilers.

Do you see any problem with this execution policy ?

kokkos / simd-math

Proposition & discussion for a ThreadSimdRange execution policy #24