parallel101 / simdtutor

Tutorial series on x86-64 SIMD vector optimization

Teacher Peng, why is the speedup of this GPU code so low? #8

Open balleb6545anickk opened 10 months ago

balleb6545anickk commented 10 months ago

Test environment: a laptop with an R7-5800H, an RTX 3060, Win11, and the latest MSVC in Release mode. Test results: GPU time: 0.0018809, CPU time: 0.0048002, ratio: 2.55208. With other CUDA programs I can usually reach roughly a 10x speedup, so why is the speedup so low here? (Also, switching to float makes it much faster. Why? And if I have to use double, what should I change?)

#include <omp.h>
#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <vector>

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#define TYPE double
#define imgW 2448
#define imgH 2048
#define N (imgW * imgH)  // parenthesized so expressions like (double)i / N expand correctly

__global__ void GPU_Cal(TYPE *input, TYPE *output, int width, int height, TYPE *para0, TYPE *para1,
                                      TYPE *para2) {
    int pos = blockIdx.x * blockDim.x + threadIdx.x;
    if (pos >= width * height)
        return;

    TYPE data = input[pos];
    TYPE row = pos / width;  // per-element integer division and modulo to recover 2D indices
    TYPE col = pos % width;
    TYPE x = (col - para2[0]) * para2[2];
    TYPE y = (row - para2[1]) * para2[3];

    const TYPE a = para0[0] + para0[2] * x + data * (para0[1] + para0[3] * x) + para0[4] * y + data * para0[5] * y;
    const TYPE b = para1[0] + para1[2] * x + data * (para1[1] + para1[3] * x) + para1[4] * y + data * para1[5] * y;

    output[pos] = a / b; 
}

void CPU_Cal(const TYPE *input, TYPE *output, int width, int height, TYPE *para0, TYPE *para1, TYPE *para2) {
#pragma omp parallel for
    for (int row = 0; row < height; ++row) {
        TYPE *_output = output + row * width;
        const TYPE *_input = input + row * width;
        for (int col = 0; col < width; ++col) {
            const TYPE data = *_input;
            const TYPE x = (col - para2[0]) * para2[2];
            const TYPE y = (row - para2[1]) * para2[3];

            const TYPE a =
                para0[0] + para0[2] * x + data * (para0[1] + para0[3] * x) + para0[4] * y + data * para0[5] * y;
            const TYPE b =
                para1[0] + para1[2] * x + data * (para1[1] + para1[3] * x) + para1[4] * y + data * para1[5] * y;

            *_output = a / b; 
            ++_output;
            ++_input;
        }
    }
}

int main() {
    // Prepare data
    std::vector<TYPE> input(N, 2);
    std::vector<TYPE> output(N, 0);
    std::vector<TYPE> para0(30, 1.5);
    std::vector<TYPE> para1(30, 1.5);
    std::vector<TYPE> para2{1246, 1037, 2448, 2048};
    // Fill in a stretch of test data
    for (int i = 0; i < N; ++i) {
        input[i] = (double)i / N;
        output[i] = (double)i / N + 2;
    }
    for (int i = 0; i < 30; ++i) {
        para0[i] = (double)i / 30;
        para1[i] = (double)i / 30 + 4.0;
    }

    TYPE *d_input;
    TYPE *d_output;
    TYPE *d_para0;
    TYPE *d_para1;
    TYPE *d_para2;
    cudaMalloc((void **)&d_input, N * sizeof(TYPE));
    cudaMalloc((void **)&d_output, N * sizeof(TYPE));
    cudaMalloc((void **)&d_para0, 30 * sizeof(TYPE));
    cudaMalloc((void **)&d_para1, 30 * sizeof(TYPE));
    cudaMalloc((void **)&d_para2, 4 * sizeof(TYPE));
    cudaMemcpy(d_input, input.data(), N * sizeof(TYPE), cudaMemcpyHostToDevice);
    cudaMemcpy(d_output, output.data(), N * sizeof(TYPE), cudaMemcpyHostToDevice);
    cudaMemcpy(d_para0, para0.data(), 30 * sizeof(TYPE), cudaMemcpyHostToDevice);
    cudaMemcpy(d_para1, para1.data(), 30 * sizeof(TYPE), cudaMemcpyHostToDevice);
    cudaMemcpy(d_para2, para2.data(), 4 * sizeof(TYPE), cudaMemcpyHostToDevice);

    // GPU compute time (take the minimum of repeated runs)
    int thread_num = 256;
    int block_num = (imgW * imgH + thread_num - 1) / thread_num;
    double gpu_time = 10000000;
    cudaDeviceSynchronize();
    for (size_t i = 0; i < 50; i++) {
        auto t0 = std::chrono::steady_clock::now();
        GPU_Cal<<<block_num, thread_num>>>(d_input, d_output, imgW, imgH, d_para0, d_para1, d_para2);
        cudaDeviceSynchronize();
        double time =
            std::chrono::duration_cast<std::chrono::duration<double>>(std::chrono::steady_clock::now() - t0).count();
        gpu_time = std::min(gpu_time, time);
    }
    std::cout << "GPU time: " << gpu_time << std::endl;

    // CPU compute time (take the minimum of repeated runs)
    TYPE *h_output = (TYPE *)malloc(N * sizeof(TYPE));
    cudaMemcpy(h_output, d_output, N * sizeof(TYPE), cudaMemcpyDeviceToHost);  // save the GPU result before timing the CPU
    double cpu_time = 10000000;
    for (size_t i = 0; i < 50; i++) {
        auto t0 = std::chrono::steady_clock::now();
        CPU_Cal(input.data(), output.data(), imgW, imgH, para0.data(), para1.data(), para2.data());
        double time =
            std::chrono::duration_cast<std::chrono::duration<double>>(std::chrono::steady_clock::now() - t0).count();
        cpu_time = std::min(cpu_time, time);
    }
    std::cout << "CPU time: " << cpu_time << std::endl;
    std::cout << "ratio: " << cpu_time / gpu_time << std::endl;

    // Check that the CPU and GPU results agree
    for (int i = 0; i < N; i++) {
        if (h_output[i] != h_output[i] && output[i] != output[i]) {
            continue;  // both values are NaN: count as a match
        }
        if (fabs(h_output[i] - output[i]) > 1e-2) {
            printf("Error! i: %d, cpu: %f, gpu:%f.\n", i, output[i], h_output[i]);
            abort();
        }
    }

    free(h_output);
    cudaFree(d_input);
    cudaFree(d_output);
    cudaFree(d_para0);
    cudaFree(d_para1);
    cudaFree(d_para2);
    return 0;
}
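As an aside on timing methodology: the chrono-plus-cudaDeviceSynchronize loop above works, but kernel time can also be measured on-device with CUDA events, which excludes host-side jitter. A minimal sketch, reusing the launch parameters from main:

// Time one kernel launch with CUDA events (milliseconds, measured on the GPU).
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
GPU_Cal<<<block_num, thread_num>>>(d_input, d_output, imgW, imgH, d_para0, d_para1, d_para2);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);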
balleb6545anickk commented 9 months ago

By the way, Teacher Peng, when are you going to cover CUDA Nsight!

balleb6545anickk commented 9 months ago

Or you could first take a look at this slightly simplified version of the problem: https://stackoverflow.com/questions/77562389/why-is-this-cuda-code-for-summing-arrays-so-slow

HJzhang-sjtu commented 9 months ago

The README says Teacher Peng does not answer CUDA-optimization questions, so let me answer. You are launching too many blocks: the GPU has only a few dozen SMs in total, yet you launched well over ten thousand blocks, and just scheduling all those blocks onto the SMs costs a lot. Let each block process more data, for example have each block handle 256*256 input elements, with each thread inside a block handling 256 elements; a sketch of that idea follows below.
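A minimal sketch of that coarsening idea, assuming the arithmetic of GPU_Cal above (the name GPU_Cal_coarse and the constant ELEMS_PER_THREAD are illustrative, not from the thread): a grid-stride loop lets a modest, fixed number of blocks cover the whole image, with each thread processing many elements.

// Illustrative coarsened kernel: a grid-stride loop so that each thread
// processes many elements and far fewer blocks are launched.
#define ELEMS_PER_THREAD 256

__global__ void GPU_Cal_coarse(const TYPE *input, TYPE *output, int width, int height,
                               const TYPE *para0, const TYPE *para1, const TYPE *para2) {
    int total = width * height;
    int stride = gridDim.x * blockDim.x;
    for (int pos = blockIdx.x * blockDim.x + threadIdx.x; pos < total; pos += stride) {
        TYPE data = input[pos];
        int row = pos / width;
        int col = pos % width;
        TYPE x = (col - para2[0]) * para2[2];
        TYPE y = (row - para2[1]) * para2[3];
        const TYPE a = para0[0] + para0[2] * x + data * (para0[1] + para0[3] * x) + para0[4] * y + data * para0[5] * y;
        const TYPE b = para1[0] + para1[2] * x + data * (para1[1] + para1[3] * x) + para1[4] * y + data * para1[5] * y;
        output[pos] = a / b;
    }
}

// Launch with roughly N / ELEMS_PER_THREAD threads in total:
//   int thread_num = 256;
//   int block_num = (N / ELEMS_PER_THREAD + thread_num - 1) / thread_num;
//   GPU_Cal_coarse<<<block_num, thread_num>>>(d_input, d_output, imgW, imgH, d_para0, d_para1, d_para2);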

archibate commented 9 months ago
  1. GPUs, especially consumer cards, simply have poor double support; that is normal. Ordinary graphics applications all use float data, and the AI crowd would even rather use half. (This is exactly why the scientific-computing people are reluctant to use GPU clusters: research simulations need double precision.)
  2. Your main bottleneck here is the double-precision floating-point division: changing a / b to a + b raises the speedup straight from 2.23 to 2.99. (A mixed-precision workaround is sketched after this list.)
  3. The integer division pos / width is also expensive. You are running everything with one-dimensional blockDim and gridDim and then simulating row and col with division and modulo, which is not the right approach; you should use blockDim.x and blockDim.y and avoid the inefficient division.
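If the 1e-2 tolerance of the result check genuinely permits it, one workaround for points 1 and 2 is to keep the double buffers but do the arithmetic, including the divide, in float. A minimal sketch under that assumption (GPU_Cal_f32 is an illustrative name, not code from this thread):

// Illustrative mixed-precision kernel: double storage, float arithmetic.
// Valid only if ~7 significant digits are enough for a / b.
__global__ void GPU_Cal_f32(const double *input, double *output, int width, int height,
                            const double *para0, const double *para1, const double *para2) {
    int pos = blockIdx.x * blockDim.x + threadIdx.x;
    if (pos >= width * height)
        return;

    float data = (float)input[pos];
    int row = pos / width;
    int col = pos % width;
    float x = (float)((col - para2[0]) * para2[2]);
    float y = (float)((row - para2[1]) * para2[3]);

    float a = (float)para0[0] + (float)para0[2] * x + data * ((float)para0[1] + (float)para0[3] * x)
            + (float)para0[4] * y + data * (float)para0[5] * y;
    float b = (float)para1[0] + (float)para1[2] * x + data * ((float)para1[1] + (float)para1[3] * x)
            + (float)para1[4] * y + data * (float)para1[5] * y;

    output[pos] = (double)(a / b);  // the fast float divide replaces the slow double divide
}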

Issues 1 and 2 may be what your algorithm requires, and changing them would make your results wrong. Issue 3 I will fix for you:

__global__ void GPU_Cal(TYPE *input, TYPE *output, int width, int height, TYPE *para0, TYPE *para1,
                                      TYPE *para2) {
    for (int row = threadIdx.y + blockIdx.y * blockDim.y; row < height; row += gridDim.y * blockDim.y) {
        for (int col = threadIdx.x + blockIdx.x * blockDim.x; col < width; col += gridDim.x * blockDim.x) {
            int i = row * width + col;
            TYPE data = input[i];
            TYPE x = (col - para2[0]) * para2[2];
            TYPE y = (row - para2[1]) * para2[3];

            const TYPE a = para0[0] + para0[2] * x + data * (para0[1] + para0[3] * x) + para0[4] * y + data * para0[5] * y;
            const TYPE b = para1[0] + para1[2] * x + data * (para1[1] + para1[3] * x) + para1[4] * y + data * para1[5] * y;

            output[i] = a / b;
        }
    }
}
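For completeness, the 2D kernel above needs a dim3 launch configuration; a minimal sketch (the 16x16 block shape is an assumption, not from the thread):

// Illustrative launch: one 16x16 thread block per image tile.
// Because of the grid-stride loops, a smaller grid would also be correct.
dim3 threads(16, 16);
dim3 blocks((imgW + threads.x - 1) / threads.x, (imgH + threads.y - 1) / threads.y);
GPU_Cal<<<blocks, threads>>>(d_input, d_output, imgW, imgH, d_para0, d_para1, d_para2);
cudaDeviceSynchronize();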