LLNL / RAJA

RAJA Performance Portability Layer (C++)
BSD 3-Clause "New" or "Revised" License

Slowdown observed with RAJA View #1718

Open · rahulb1218 opened this issue 3 weeks ago

rahulb1218 commented 3 weeks ago

Describe the bug

Slowdown observed when accessing memory through a RAJA View, compared to accessing it directly through a raw pointer

To Reproduce

#include <RAJA/RAJA.hpp>
#include <iostream>
#include <cuda_runtime.h>
#include <caliper/cali.h>

int main() {
    CALI_CXX_MARK_FUNCTION;
    const int N = 10000;  
    const int K = 17; 

    int* array = new int[N * N];
    int* array_copy = new int[N * N];

    // big array, or image
    for (int i = 0; i < N * N; ++i) {
        array[i] = 1;
        array_copy[i] = 1;
    }

    // small array that acts as the blur
    int* kernel = new int[K * K];
    for (int i = 0; i < K * K; ++i) {
        kernel[i] = 2;
    }

    // copying to gpu
    int* d_array;
    int* d_array_copy;
    int* d_kernel;
    cudaMalloc((void**)&d_array, N * N * sizeof(int));
    cudaMalloc((void**)&d_array_copy, N * N * sizeof(int));
    cudaMalloc((void**)&d_kernel, K * K * sizeof(int));
    cudaMemcpy(d_array, array, N * N * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_array_copy, array_copy, N * N * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_kernel, kernel, K * K * sizeof(int), cudaMemcpyHostToDevice);

    constexpr int DIM = 2;
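    // Layout<2, int, 1>: 2D layout, int linear index, unit stride in dimension 1,
    // so view(i, j) maps to ptr[i * N + j], the same as the raw-pointer indexing below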
    RAJA::View<int, RAJA::Layout<DIM, int, 1>> array_view(d_array, N, N);
    RAJA::View<int, RAJA::Layout<DIM, int, 1>> array_view_copy(d_array_copy, N, N);
    RAJA::View<int, RAJA::Layout<DIM, int, 1>> kernel_view(d_kernel, K, K);

    using EXEC_POL5 = RAJA::KernelPolicy<
        RAJA::statement::CudaKernelFixed<256,
            RAJA::statement::For<1, RAJA::cuda_global_size_y_direct<16>,
                RAJA::statement::For<0, RAJA::cuda_global_size_x_direct<16>,
                    RAJA::statement::Lambda<0>
                >
            >
        >
    >;
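    // Note: 16 x 16 = 256 threads per block, matching CudaKernelFixed<256>;
    // the "direct" policies assign exactly one (i, j) iterate per thread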

    RAJA::RangeSegment range_i(0, N);
    RAJA::RangeSegment range_j(0, N);

    CALI_MARK_BEGIN("RAJA_View_Kernel");
    RAJA::kernel<EXEC_POL5>(RAJA::make_tuple(range_i, range_j),
        [=] RAJA_DEVICE (int i, int j) {
            int sum = 0;

            //looping through the "blur"
            for (int m = 0; m < K; ++m) {
                for (int n = 0; n < K; ++n) {
                    int x = i + m;
                    int y = j + n;

                    // adding the "blur" to the "image" wherever the blur is located on the image
                    if (x < N && y < N) {
                        sum += kernel_view(m, n) * array_view(x, y);
                    }
                }
            }

            array_view(i, j) += sum;
        }
    );
    CALI_MARK_END("RAJA_View_Kernel");

    CALI_MARK_BEGIN("No_View_Kernel");
    RAJA::kernel<EXEC_POL5>(RAJA::make_tuple(range_i, range_j),
        [=] RAJA_DEVICE (int i, int j) {
            int sum = 0;

            // looping through the "blur"
            for (int m = 0; m < K; ++m) {
                for (int n = 0; n < K; ++n) {
                    int x = i + m;
                    int y = j + n;

                    // adding the "blur" to the "image" wherever the blur is located on the image
                    if (x < N && y < N) {
                        sum += d_kernel[m * K + n] * d_array_copy[x * N + y];
                    }
                }
            }

            d_array_copy[i * N + j] += sum;
        }
    );
    CALI_MARK_END("No_View_Kernel");

    // copy from gpu to cpu
    cudaMemcpy(array, d_array, N * N * sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(array_copy, d_array_copy, N * N * sizeof(int), cudaMemcpyDeviceToHost);

    cudaFree(d_array);
    cudaFree(d_array_copy);
    cudaFree(d_kernel);

    delete[] array;
    delete[] array_copy;
    delete[] kernel;

    return 0;
}

Expected behavior

Both kernels should take about the same time to run.


MrBurmark commented 3 weeks ago

Should the first loop be adding to array_view_copy instead of array_view? Is this the code that was used to benchmark the slowdown, or is it a reproducer based on some other code where you saw the slowdown? Have you tried running a kernel before the first kernel to capture first-kernel overheads? Have you tried running the kernels multiple times to see how a warm cache affects performance?
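For reference, a minimal sketch of that warm-up-and-repeat idea, reusing the policy, ranges, and views from the reproducer above (the rep count of 10 and the simplified lambda body are illustrative, not from the original report):

    // Sketch: repeat the timed kernel so one-time launch costs appear only once.
    for (int rep = 0; rep < 10; ++rep) {
        CALI_MARK_BEGIN("RAJA_View_Kernel");
        RAJA::kernel<EXEC_POL5>(RAJA::make_tuple(range_i, range_j),
            [=] RAJA_DEVICE (int i, int j) {
                array_view(i, j) += kernel_view(0, 0);  // stand-in for the full blur body
            }
        );
        CALI_MARK_END("RAJA_View_Kernel");
    }

Giving the first iteration a distinct region name would separate the cold launch from the warm ones in the Caliper report.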

artv3 commented 3 weeks ago

@rahulb1218 can you post the performance numbers? Also, it might be worth looking at the kernels through the NVIDIA profiler. As @MrBurmark mentioned, RAJA has a high initial kernel launch overhead due to the way streams are set up.
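For example, assuming the Nsight tools are available on the machine (the binary name ./view_slowdown is hypothetical):

    nsys profile --stats=true ./view_slowdown   # timeline plus per-kernel time summary
    ncu ./view_slowdown                         # detailed per-kernel hardware metrics

This would show whether the extra time is inside the kernels themselves or in launch/setup around them.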

rahulb1218 commented 3 weeks ago

Sure, I can report what Caliper recorded: RAJA_View_Kernel took 17.02 seconds and No_View_Kernel took 2.45 seconds, roughly a 7x difference.

artv3 commented 1 week ago

@rahulb1218, I don't have access to Pascal, but on Lassen things check out fine. Could you try the following example and share what you see? https://github.com/LLNL/RAJA/pull/1728 In that PR I added an empty kernel to avoid measuring stream initialization.
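For anyone following along, the warm-up amounts to something like this sketch (a sketch only; see the PR for the actual change):

    // Sketch: launch a do-nothing kernel once so CUDA context creation and
    // RAJA's stream setup are not charged to the first timed kernel.
    RAJA::forall<RAJA::cuda_exec<256>>(RAJA::RangeSegment(0, 1),
        [=] RAJA_DEVICE (int) { /* intentionally empty */ });
    cudaDeviceSynchronize();  // make sure setup completes before timing begins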