[cuFFTDx 3D] Porting the box example to R2C

Hi! I've been trying to get into cuFFTDx lately which has been challenging, especially with how little material there seems to be of it on the internet. Something I've been trying as of late is seeing if I can get one of the 3D examples (both being C2C) to work in R2C mode too. To this end, I've made a set of changes to the fft_3d_box_single_block.cu file locally:
Adjust the float2 (complex_type) input to float input.
Make the cuFFT plan execute from real to complex.
Make the cuFFTDx x-axis FFT execute real to complex and the y and z-axis complex to complex.
Adjust the loading and storing in the cuFFTDx kernel to account for an FFT of size x y z only having an output size of x y (z/2+1) when going from real to complex numbers. I feel like with those four points I should have covered everything to successfully port from c2c to r2c but the results when running the code I have (found at the bottom of the post) tell me otherwise.
I was wondering if I maybe overlooked something in my checklist or just poorly executed it. Any help would be massively appreciated!
The adjusted fft_3d_box_single_block.cu looks like this:
#include <iostream>
#include <iomanip>
#include <vector>
#include <limits>

#include <cuda_runtime_api.h>
#include <cufftdx.hpp>
#include <cufft.h>

#include "block_io.hpp"
#include "common.hpp"
#include "random.hpp"

#define CUFFTDX_EXAMPLE_DETAIL_DEBUG_FFT_3D
// #define CUFFTDX_EXAMPLE_DETAIL_DEBUG_FFT_3D_SIMPLE_IO
inline constexpr unsigned int cufftdx_example_warm_up_runs = 5;
inline constexpr unsigned int cufftdx_example_performance_runs = 50;

template<unsigned int MaxThreadsPerBlock,
         class FFTX,
         class FFTY,
         class FFTZ,
         class ComplexType                = typename FFTX::value_type,
         unsigned int RequiredStorageSize = std::max({FFTX::storage_size, FFTY::storage_size, FFTZ::storage_size})>
__launch_bounds__(MaxThreadsPerBlock) __global__
    void cufftdx_3d_fft_single_block_kernel(const float* input, ComplexType* output) {
    using complex_type                       = ComplexType;
    static constexpr unsigned int fft_size_x = cufftdx::size_of<FFTX>::value;
    static constexpr unsigned int fft_size_y = cufftdx::size_of<FFTY>::value;
    static constexpr unsigned int fft_size_z = cufftdx::size_of<FFTZ>::value;

    // Local array for thread
    constexpr auto required_storage_size = RequiredStorageSize;
    complex_type   thread_data[required_storage_size];

    // Shared memory use for exchanging data between threads
    extern __shared__ complex_type shared_memory[];

    // Load data from global memory to registers.
    static constexpr unsigned int stride_x = fft_size_y * fft_size_z;
    unsigned int                  index    = (threadIdx.x + threadIdx.y * fft_size_y);
    if (threadIdx.x < fft_size_y && threadIdx.y < fft_size_z) {
        for (size_t i = 0; i < FFTX::elements_per_thread; i++) {
            if ( i * FFTX::stride + threadIdx.x < fft_size_x ) {
                reinterpret_cast<float*>( thread_data )[i] = input[index];
                index += stride_x;
            }
        }

        // Execute FFT in X dimension
        FFTX().execute(thread_data);

        // Exchange/transpose via shared memory
        index = (threadIdx.x + threadIdx.y * fft_size_y);
        for ( size_t i = 0; i < ( FFTX::elements_per_thread / 2 ) + 1; i++ ) {
            if ( i * FFTX::stride + threadIdx.x < ( fft_size_x / 2 ) + 1 ) {
                shared_memory[index] = thread_data[i];
                index += stride_x;
            }
        }
    }

    __syncthreads();
    static constexpr unsigned int stride_y = fft_size_z;
    index                                  = threadIdx.x + threadIdx.y * fft_size_y * fft_size_z;
    if (threadIdx.x < fft_size_z && threadIdx.y < fft_size_x) {
        for ( size_t i = 0; i < ( FFTY::elements_per_thread / 2 ) + 1; i++ ) {
            if ( i * FFTY::stride + threadIdx.x < ( fft_size_y / 2 ) + 1 ) {
                thread_data[i] = shared_memory[index];
                index += stride_y;
            }
        }

        // Execute FFT in Y dimension
        FFTY().execute(thread_data);

        // Exchange/transpose via shared memory
        index = threadIdx.x + threadIdx.y * fft_size_y * fft_size_z;
        for ( size_t i = 0; i < ( FFTY::elements_per_thread / 2 ) + 1; i++ ) {
            if ( i * FFTY::stride + threadIdx.x < ( fft_size_y / 2 ) + 1 ) {
                shared_memory[index] = thread_data[i];
                index += stride_y;
            }
        }
    }

    __syncthreads();
    static constexpr unsigned int stride_z = 1;
    index                                  = ( threadIdx.x + threadIdx.y * fft_size_x ) * fft_size_z;
    if (threadIdx.x < fft_size_x && threadIdx.y < fft_size_y) {
        for ( size_t i = 0; i < ( FFTZ::elements_per_thread / 2 ) + 1; i++ ) {
            if ( i * FFTZ::stride + threadIdx.x < ( fft_size_z / 2 ) + 1 ) {
                thread_data[i] = shared_memory[index];
                index += stride_z;
            }
        }
        // Execute FFT in Z dimension
        FFTZ().execute(thread_data);

        index = (threadIdx.x + threadIdx.y * fft_size_x) * fft_size_z;
        for ( size_t i = 0; i < ( FFTZ::elements_per_thread / 2 ) + 1; i++ ) {
            if ( i * FFTZ::stride + threadIdx.x < ( fft_size_z / 2 ) + 1 ) {
                shared_memory[index] = thread_data[i];
                index += stride_z;
            }
        }
    }
    __syncthreads();
    if (threadIdx.x < fft_size_y && threadIdx.y < fft_size_z) {
        index = (threadIdx.x + threadIdx.y * fft_size_y);
        for ( size_t i = 0; i < ( FFTX::elements_per_thread / 2 ) + 1; i++ ) {
            if ( i * FFTX::stride + threadIdx.x < ( fft_size_x / 2 ) + 1 ) {
                thread_data[i] = shared_memory[index];
                index += stride_x;
            }
        }

        index = (threadIdx.x + threadIdx.y * fft_size_y);
        for (size_t i = 0; i < ( FFTX::elements_per_thread / 2 ) + 1; i++) {
            if ( i * FFTX::stride + threadIdx.x < ( fft_size_x / 2 ) + 1 ) {
                output[index] = thread_data[i];
                index += stride_x;
            }
        }
    }
}

example::fft_results<float2> cufft_3d_fft(unsigned int  fft_size_x,
                                          unsigned int  fft_size_y,
                                          unsigned int  fft_size_z,
                                          float* input,
                                          cufftComplex* output,
                                          cudaStream_t  stream) {
    // Create cuFFT plan
    cufftHandle plan;
    CUFFT_CHECK_AND_EXIT(cufftPlan3d(&plan, fft_size_x, fft_size_y, fft_size_z, CUFFT_R2C));
    CUFFT_CHECK_AND_EXIT(cufftSetStream(plan, stream));

    // Execute cuFFT
    auto cufft_execution = [&](cudaStream_t /* stream */) {
        CUFFT_CHECK_AND_EXIT(cufftExecR2C(plan, input, output))
    };

    // Correctness run
    cufft_execution(stream);
    CUDA_CHECK_AND_EXIT(cudaDeviceSynchronize());
    // Copy results to host
    const size_t        flat_fft_size = fft_size_x * fft_size_y * ( ( fft_size_z / 2 ) + 1 );
    const size_t        flat_fft_size_bytes = flat_fft_size * sizeof(float2);
    std::vector<float2> output_host(flat_fft_size, {std::numeric_limits<float>::quiet_NaN(), std::numeric_limits<float>::quiet_NaN()});
    CUDA_CHECK_AND_EXIT(cudaMemcpy(output_host.data(), output, flat_fft_size_bytes, cudaMemcpyDeviceToHost));
    CUDA_CHECK_AND_EXIT(cudaDeviceSynchronize());

    // Performance measurements
    auto time = example::measure_execution_ms(
        cufft_execution,
        cufftdx_example_warm_up_runs,
        cufftdx_example_performance_runs,
        stream);

    // Clean-up
    CUFFT_CHECK_AND_EXIT(cufftDestroy(plan));

    // Return results
    return example::fft_results<float2> {output_host, (time / cufftdx_example_performance_runs)};
}

template<unsigned int FFTSizeX, unsigned int FFTSizeY, unsigned int FFTSizeZ>
example::fft_results<float2> cufftdx_3d_fft_single_block(float* input, float2* output, cudaStream_t stream) {
    using namespace cufftdx;

    static constexpr unsigned int fft_size_x = FFTSizeX;
    static constexpr unsigned int fft_size_y = FFTSizeY;
    static constexpr unsigned int fft_size_z = FFTSizeZ;

    using fft_base = decltype(Thread() + Direction<fft_direction::forward>() + Precision<float>());
    using fft_x        = decltype(fft_base() + Type<fft_type::r2c>() + Size<fft_size_x>());
    using fft_y        = decltype(fft_base() + Type<fft_type::c2c>() + Size<fft_size_y>());
    using fft_z        = decltype(fft_base() + Type<fft_type::c2c>() + Size<fft_size_z>());
    using complex_type = typename fft_x::value_type;

    constexpr unsigned int max_dim               = std::max({fft_size_x, fft_size_y, fft_size_z});
    constexpr dim3         block_dim             = {max_dim, max_dim, 1};
    constexpr unsigned int max_threads_per_block = block_dim.x * block_dim.y * block_dim.z;
    const size_t           shared_memory_size    = (fft_size_x * fft_size_y * fft_size_z) * sizeof(complex_type);

    const auto kernel = cufftdx_3d_fft_single_block_kernel<max_threads_per_block, fft_x, fft_y, fft_z, complex_type>;

    // Increase max shared memory if needed
    CUDA_CHECK_AND_EXIT(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));

    complex_type* cufftdx_output = reinterpret_cast<complex_type*>(output);
    auto fft_3d_execution = [&](cudaStream_t stream) {
        kernel<<<1, block_dim, shared_memory_size, stream>>>(input, cufftdx_output);
        CUDA_CHECK_AND_EXIT(cudaGetLastError());
    };

    // Correctness run
    fft_3d_execution(stream);
    CUDA_CHECK_AND_EXIT(cudaDeviceSynchronize());
    // Copy results to host
    static constexpr size_t flat_fft_size       = fft_size_x * fft_size_y * ( ( fft_size_z / 2 ) + 1 );
    static constexpr size_t flat_fft_size_bytes = flat_fft_size * sizeof(float2);
    std::vector<float2> output_host(flat_fft_size, {std::numeric_limits<float>::quiet_NaN(), std::numeric_limits<float>::quiet_NaN()});
    CUDA_CHECK_AND_EXIT(cudaMemcpy(output_host.data(), cufftdx_output, flat_fft_size_bytes, cudaMemcpyDeviceToHost));
    CUDA_CHECK_AND_EXIT(cudaDeviceSynchronize());

    // Performance measurements
    auto time = example::measure_execution_ms(
        [&](cudaStream_t stream) {
            fft_3d_execution(stream);
        },
        cufftdx_example_warm_up_runs,
        cufftdx_example_performance_runs,
        stream);

    // Return results
    return example::fft_results<float2>{ output_host, (time / cufftdx_example_performance_runs) };
}

int main(int, char**) {
    // 3D FFT
    static constexpr unsigned int fft_size_x = 16;
    static constexpr unsigned int fft_size_y = 15;
    static constexpr unsigned int fft_size_z = 14;

    // Generate random input data on host
    const unsigned int flat_fft_size = fft_size_x * fft_size_y * fft_size_z;
#ifdef CUFFTDX_EXAMPLE_DETAIL_DEBUG_FFT_3D
    std::vector<float> host_input(flat_fft_size);
    for (size_t i = 0; i < flat_fft_size; i++) {
        float sign      = (i % 3 == 0) ? -1.0f : 1.0f;
        host_input[i] = sign * static_cast<float>(i) / 100;
    }
#else
    auto host_input = example::get_random_complex_data<float>(flat_fft_size, -1, 1);
#endif

    // Allocate managed memory for device input/output
    // float2 has the same size and alignment as cuFFTDx fp32 complex type cufftdx::complex<float> and cufftComplex
    float*    input;
    float2*    output;
    const auto flat_fft_size_bytes = flat_fft_size * sizeof(float);
    CUDA_CHECK_AND_EXIT( cudaMalloc( &input, flat_fft_size_bytes ));
    CUDA_CHECK_AND_EXIT( cudaMalloc( &output, fft_size_x * fft_size_y * ( ( fft_size_z / 2 ) + 1 ) * sizeof( float2 ) ) );

    // Copy input to the device
    CUDA_CHECK_AND_EXIT(cudaMemcpy(input, host_input.data(), flat_fft_size_bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK_AND_EXIT(cudaDeviceSynchronize());

    cudaStream_t stream;
    CUDA_CHECK_AND_EXIT(cudaStreamCreate(&stream));

    // Run cuFFT
    auto cufft_results = cufft_3d_fft(fft_size_x, fft_size_y, fft_size_z, input, output, stream);

    // Run cuFFTDx
    auto cufftdx_results = cufftdx_3d_fft_single_block<fft_size_x, fft_size_y, fft_size_z>(input, output, stream);

    // Clean-up
    CUDA_CHECK_AND_EXIT(cudaStreamDestroy(stream));
    CUDA_CHECK_AND_EXIT(cudaFree(input));
    CUDA_CHECK_AND_EXIT(cudaFree(output));

    // Check if cuFFTDx results are correct
    auto fft_error = example::fft_signal_error::calculate_for_complex_values(cufftdx_results.output, cufft_results.output);

    std::cout << "FFT: (" << fft_size_x << ", " << fft_size_y << ", " << fft_size_z <<")\n";

#ifdef CUFFTDX_EXAMPLE_DETAIL_DEBUG_FFT_3D
    std::cout << "cuFFT, cuFFTDx\n";
    for (size_t i = 0; i < 8; i++) {
        std::cout << i << ": ";
        std::cout << "(" << cufft_results.output[i].x << ", " << cufft_results.output[i].y << ")";
        std::cout << ", ";
        std::cout << "(" << cufftdx_results.output[i].x << ", " << cufftdx_results.output[i].y << ")";
        std::cout << "\n";
    }
#endif

    bool success = fft_error.l2_relative_error < 0.001;
    std::cout << "Correctness results:\n";
    std::cout << "L2 error: " << fft_error.l2_relative_error << "\n";
    std::cout << "Peak error (index: " << fft_error.peak_error_index << "): " << fft_error.peak_error << "\n";

    // Print performance results
    if(success) {
        std::cout << "\nPerformance results:\n";
        std::cout << std::setw(10) << "cuFFTDx: " << cufftdx_results.avg_time_in_ms << " [ms]\n";
        std::cout << std::setw(10) << "cuFFT: " << cufft_results.avg_time_in_ms << " [ms]\n";
    }

    if (success) {
        std::cout << "Success\n";
        return 0;
    } else {
        std::cout << "Failure\n";
        return 1;
    }
}
NVIDIA / CUDALibrarySamples

[cuFFTDx 3D] Porting the box example to R2C #176