parallel101 / simdtutor

x86-64 SIMD矢量优化系列教程
109 stars 9 forks source link

麻烦小彭老师优化一下下 #4

Open balleb6545anickk opened 1 year ago

balleb6545anickk commented 1 year ago
// 以下是项目中的一段热点代码
// 会调用这个函数很多次,用来计算亚像素的像素值,方法是插值(具体算法可以不用管),里头的具体魔数我改了一下,因为是公司的代码,而且跟优化没有关系

double GetSubPixelValue(const double* preCaculatedParameter, int width, int height, double X, double Y)
{
    int xIndex[6], yIndex[6];
    xIndex[0] = int(X + 0.5) - 2;
    yIndex[0] = int(Y + 0.5) - 2;
    for (int i = 1; i < 5; i++)
    {
        xIndex[i] = xIndex[i - 1] + 1;
        yIndex[i] = yIndex[i - 1] + 1;
    }

    double xWeight[5];
    {
        double w = X - (double)xIndex[2];
        double w2 = w * w;
        double w3 = w * w2;
        double w4 = w2 * w2;
        xWeight[0] = 0.111 * w4 - w3 / 22.0 + 0.233 * w2 - 0.344 * w + 0.345;
        xWeight[1] = -0.333 * w4 + 0.333 * w3 + 0.444 * w2 - 0.555 * w + 0.0666;
        xWeight[2] = 0.15 *w4 - 0.222 * w2 + 0.0777;
        xWeight[3] = -0.333 * w4 - 0.333 * w3 + 0.444 * w2 + 0.555 * w + 0.0666;
        xWeight[4] = 0.111 * w4 + w3 / 22.0 + 0.233 * w2 + 0.344 * w + 0.345;
    }

    // 下面这个计算方法和上面是一样的
    double yWeight[5];
    {
        double w = Y - (double)yIndex[2];
        double w2 = w * w;
        double w3 = w * w2;
        double w4 = w2 * w2;
        yWeight[0] = 0.111 * w4 - w3 / 22.0 + 0.233 * w2 - 0.344 * w + 0.345;
        yWeight[1] = -0.333 * w4 + 0.333 * w3 + 0.444 * w2 - 0.555 * w + 0.0666;
        yWeight[2] = 0.15 *w4 - 0.222 * w2 + 0.0777;
        yWeight[3] = -0.333 * w4 - 0.333 * w3 + 0.444 * w2 + 0.555 * w + 0.0666;
        yWeight[4] = 0.111 * w4 + w3 / 22.0 + 0.233 * w2 + 0.344 * w + 0.345;
    }

    int width2 = 2 * width - 2;
    int height2 = 2 * height - 2;
    for (int i = 0; i < 5; i++)
    {
        xIndex[i] = (xIndex[i] < 0) ? (-xIndex[i] - width2 * ((-xIndex[i]) / width2)) : (xIndex[i] - width2 * (xIndex[i] / width2));
        if (width <= xIndex[i])
            xIndex[i] = width2 - xIndex[i];

        yIndex[i] = (yIndex[i] < 0) ? (-yIndex[i] - height2 * ((-yIndex[i]) / height2)) : (yIndex[i] - height2 * (yIndex[i] / height2));
        if (height <= yIndex[i])
            yIndex[i] = height2 - yIndex[i];
    }

    double result = 0;
    for (int i = 0; i < 5; i++)
    {
        for (int j = 0; j < 5; j++)
        {
            result += ((*(preCaculatedParameter + width * yIndex[i] + xIndex[j])) * xWeight[j] * yWeight[i]);
        }
    }
    return result;
}

#include <vector>
#include <chrono>
#include <iostream>

#define TICK(x) auto bench_##x = std::chrono::steady_clock::now();
#define TOCK(x) std::cerr<<#x ": "<<std::chrono::duration_cast<std::chrono::duration<double>>(std::chrono::steady_clock::now()-bench_##x).count();std::cerr<<"秒\n";

#define TEST_NUM 10000000
int main() {
    int width = 2048;
    int height = 2048;
    std::vector<double> preCaculatedParameter(width * height, 0.123); // 里面是预计算的数字
    std::vector<double> xs(TEST_NUM);
    std::vector<double> ys(TEST_NUM);
    for (size_t i = 0; i < xs.size(); i++)
    {
        xs[i] = rand() % 2048;
        ys[i] = rand() % 2048;
    }
    std::vector<double> result(1);

    TICK(t);
    for (int i = 0; i < TEST_NUM; i++)
    {
        result[0] = GetSubPixelValue(preCaculatedParameter.data(), width, height, xs[i], ys[i]);
    }
    TOCK(t);

    return 0;
}
archibate commented 1 year ago

我可以用float吗?

archibate commented 1 year ago

这是一个membound内核,瓶颈在于无序访存,只需对xys排序,就可以从0.89秒提升到0.42秒。

archibate commented 1 year ago
#include <cmath>
#include <immintrin.h>

#define WIDTH 4

// 以下是项目中的一段热点代码
// 会调用这个函数很多次,用来计算亚像素的像素值,方法是插值(具体算法可以不用管),里头的具体魔数我改了一下,因为是公司的代码,而且跟优化没有关系

void GetSubPixelValue(double *__restrict Rs, const double*__restrict preCaculatedParameter, int width, int height, double *__restrict Xs, double *__restrict Ys)
{
    auto c0_5 = _mm256_set1_pd(0.5);
    auto X = _mm256_loadu_pd(Xs);
    auto Y = _mm256_loadu_pd(Ys);
    auto xIndex2 = _mm256_cvttpd_epi32(_mm256_add_pd(X, c0_5));
    auto yIndex2 = _mm256_cvttpd_epi32(_mm256_add_pd(Y, c0_5));
    /* int xIndex[6], yIndex[6]; */
    /* xIndex[0] = int(X + 0.5) - 2; */
    /* yIndex[0] = int(Y + 0.5) - 2; */
    /* for (int i = 1; i < 5; i++) */
    /* { */
    /*  xIndex[i] = xIndex[i - 1] + 1; */
    /*  yIndex[i] = yIndex[i - 1] + 1; */
    /* } */

    auto c0_111 = _mm256_set1_pd(0.111);
    auto c0_222 = _mm256_set1_pd(0.222);
    auto c0_233 = _mm256_set1_pd(0.233);
    auto c0_344 = _mm256_set1_pd(0.344);
    auto c0_15 = _mm256_set1_pd(0.15);
    auto c0_333 = _mm256_set1_pd(0.333);
    auto c0_444 = _mm256_set1_pd(0.444);
    auto c0_555 = _mm256_set1_pd(0.555);
    auto c0_0666 = _mm256_set1_pd(0.0666);
    auto c0_0777 = _mm256_set1_pd(0.0777);
    auto c0_345 = _mm256_set1_pd(0.345);
    auto c1o22 = _mm256_set1_pd(1.0 / 22.0);

    __m256d xWeight[5];
    __m256d yWeight[5];
    auto xw = _mm256_sub_pd(X, _mm256_cvtepi32_pd(xIndex2));
    auto xw2 = _mm256_mul_pd(xw, xw);
    auto xw3 = _mm256_mul_pd(xw, xw2);
    auto xw4 = _mm256_mul_pd(xw2, xw2);
    auto yw = _mm256_sub_pd(X, _mm256_cvtepi32_pd(yIndex2));
    auto yw2 = _mm256_mul_pd(yw, yw);
    auto yw3 = _mm256_mul_pd(yw, yw2);
    auto yw4 = _mm256_mul_pd(yw2, yw2);
    xWeight[0] = _mm256_fmsub_pd(c0_111, xw4, _mm256_fmadd_pd(xw3, c1o22, _mm256_fmsub_pd(c0_233, xw2, _mm256_fmadd_pd(c0_344, xw, c0_345))));
    yWeight[0] = _mm256_fmsub_pd(c0_111, yw4, _mm256_fmadd_pd(yw3, c1o22, _mm256_fmsub_pd(c0_233, yw2, _mm256_fmadd_pd(c0_344, yw, c0_345))));
    xWeight[1] = _mm256_fmadd_pd(c0_333, xw3, _mm256_fmsub_pd(c0_444, xw2, _mm256_fmsub_pd(c0_555, xw, _mm256_fmadd_pd(c0_333, xw4, c0_0666))));
    yWeight[1] = _mm256_fmadd_pd(c0_333, yw3, _mm256_fmsub_pd(c0_444, yw2, _mm256_fmsub_pd(c0_555, yw, _mm256_fmadd_pd(c0_333, yw4, c0_0666))));
    xWeight[2] = _mm256_fmsub_pd(c0_15, xw4, _mm256_fmadd_pd(c0_222, xw2, c0_0777));
    yWeight[2] = _mm256_fmsub_pd(c0_15, yw4, _mm256_fmadd_pd(c0_222, yw2, c0_0777));
    xWeight[3] = _mm256_fmsub_pd(c0_444, xw2, _mm256_fmsub_pd(c0_333, xw4, _mm256_fmadd_pd(c0_333, xw3, _mm256_fmadd_pd(c0_555, xw, c0_0666))));
    yWeight[3] = _mm256_fmsub_pd(c0_444, yw2, _mm256_fmsub_pd(c0_333, yw4, _mm256_fmadd_pd(c0_333, yw3, _mm256_fmadd_pd(c0_555, yw, c0_0666))));
    xWeight[4] = _mm256_fmadd_pd(c0_111, xw4, _mm256_fmadd_pd(xw3, c1o22, _mm256_fmadd_pd(c0_233, xw2, _mm256_fmadd_pd(c0_344, xw, c0_345))));
    yWeight[4] = _mm256_fmadd_pd(c0_111, yw4, _mm256_fmadd_pd(yw3, c1o22, _mm256_fmadd_pd(c0_233, yw2, _mm256_fmadd_pd(c0_344, yw, c0_345))));

#ifndef USE_BIT
    int width2 = 2 * width - 2;
    int height2 = 2 * height - 2;
    auto v_width = _mm_set1_epi32(width);
    auto v_height = _mm_set1_epi32(height);
    auto v_width2 = _mm256_set1_pd((double)width2);
    auto v_height2 = _mm256_set1_pd((double)height2);
    auto v_width2i = _mm_set1_epi32(width2);
    auto v_height2i = _mm_set1_epi32(height2);
#endif
    __m128i xIndex[5];
    __m128i yIndex[5];
    auto vi = _mm_set1_epi64x(-2);
    auto c1 = _mm_set1_epi64x(1);
#ifdef USE_BIT
    auto wmask = _mm_set1_epi32(2047);
    auto hmask = _mm_set1_epi32(2047);
#endif
    for (int i = 0; i < 5; i++)
    {
        auto xIndexI = _mm_add_epi32(xIndex2, vi);
        vi = _mm_add_epi32(vi, c1);
      #ifdef USE_BIT
        xIndexI = _mm_and_si128(xIndexI, wmask);
      #else
        xIndexI = _mm_abs_epi32(xIndexI);
        xIndexI = _mm_sub_epi32(xIndexI, _mm256_cvttpd_epi32(_mm256_mul_pd(v_width2, _mm256_floor_pd(_mm256_div_pd(_mm256_cvtepi32_pd(xIndexI), v_width2)))));
                xIndexI = _mm_blendv_epi8(xIndexI, _mm_sub_epi32(v_width2i, xIndexI), _mm_or_si128(_mm_cmpgt_epi32(xIndexI, v_width), _mm_cmpeq_epi32(xIndexI, v_width)));
      #endif
        xIndex[i] = xIndexI;
        auto yIndexI = yIndex2;
      #ifdef USE_BIT
        yIndexI = _mm_and_si128(xIndexI, hmask);
        yIndexI = _mm_slli_epi32(yIndexI, 11);
      #else
        yIndexI = _mm_abs_epi32(yIndexI);
        yIndexI = _mm_sub_epi32(yIndexI, _mm256_cvttpd_epi32(_mm256_mul_pd(v_height2, _mm256_floor_pd(_mm256_div_pd(_mm256_cvtepi32_pd(yIndexI), v_height2)))));
                yIndexI = _mm_blendv_epi8(yIndexI, _mm_sub_epi32(v_height2i, yIndexI), _mm_or_si128(_mm_cmpgt_epi32(yIndexI, v_height), _mm_cmpeq_epi32(yIndexI, v_height)));
        yIndexI = _mm_mullo_epi32(yIndexI, v_width);
      #endif
        yIndex[i] = yIndexI;
    }

    auto result = _mm256_setzero_pd();
    for (int i = 0; i < 5; i++)
    {
        auto rowResult = _mm256_setzero_pd();
        auto yIndexI = yIndex[i];
      #pragma GCC unroll 5
        for (int j = 0; j < 5; j++)
        {
            auto factor = _mm256_i32gather_pd(preCaculatedParameter, _mm_add_epi32(yIndexI, xIndex[j]), 1);
            rowResult = _mm256_fmadd_pd(rowResult, factor, xWeight[j]);
        }
        result = _mm256_fmadd_pd(result, rowResult, yWeight[i]);
    }
    _mm256_storeu_pd(Rs, result);
}

#include <vector>
#include <chrono>
#include <iostream>

#define TICK(x) auto bench_##x = std::chrono::steady_clock::now();
#define TOCK(x) std::cerr<<#x ": "<<std::chrono::duration_cast<std::chrono::duration<double>>(std::chrono::steady_clock::now()-bench_##x).count();std::cerr<<"秒\n";

#define TEST_NUM 10000000
int main() {
    int width = 2048;
    int height = 2048;
    std::vector<double> preCaculatedParameter(width * height, 0.123); // 里面是预计算的数字
    std::vector<double> xs(TEST_NUM);
    std::vector<double> ys(TEST_NUM);
    for (size_t i = 0; i < xs.size(); i++)
    {
        xs[i] = rand() % 2048;
        ys[i] = rand() % 2048;
    }
    std::vector<double> result(TEST_NUM);

    TICK(t);
    /* #pragma omp parallel for */
    for (size_t i = 0; i < TEST_NUM / WIDTH * WIDTH; i += WIDTH)
    {
        GetSubPixelValue(result.data() + i, preCaculatedParameter.data(), width, height, xs.data() + i, ys.data() + i);
    }
#ifdef __GNUC__
        asm volatile ("" ::: "cc", "memory");
#endif
    TOCK(t);

    return 0;
}

优化前:1.01623秒 优化后:0.104965秒 优化后+开启并行:0.0238891秒

archibate commented 1 year ago

使用i32gather优化了一下随机访存,如果能够改用float还可以进一步提升。

balleb6545anickk commented 1 year ago

小彭老师,你上面的代码有点问题,我改了一下,加速比会变小。 我在台式机i9-13900k上测试,差不多是0.62s->0.4s。 代码我都看完了,思路是对的,我之前的思路应该是错的。 我觉得可能还可以再优化一下。

#include <cmath>
#include <immintrin.h>

#define WIDTH 4

// 以下是项目中的一段热点代码
// 会调用这个函数很多次,用来计算亚像素的像素值,方法是插值(具体算法可以不用管),里头的具体魔数我改了一下,因为是公司的代码,而且跟优化没有关系

void GetSubPixelValue(double* __restrict Rs, const double* __restrict preCaculatedParameter, int width, int height, double* __restrict Xs, double* __restrict Ys)
{
    auto c0_5 = _mm256_set1_pd(0.5);
    auto X = _mm256_loadu_pd(Xs);
    auto Y = _mm256_loadu_pd(Ys);
    auto xIndex2 = _mm256_cvttpd_epi32(_mm256_add_pd(X, c0_5));
    auto yIndex2 = _mm256_cvttpd_epi32(_mm256_add_pd(Y, c0_5));
    /* int xIndex[6], yIndex[6]; */
    /* xIndex[0] = int(X + 0.5) - 2; */
    /* yIndex[0] = int(Y + 0.5) - 2; */
    /* for (int i = 1; i < 5; i++) */
    /* { */
    /*  xIndex[i] = xIndex[i - 1] + 1; */
    /*  yIndex[i] = yIndex[i - 1] + 1; */
    /* } */

    auto c0_111 = _mm256_set1_pd(0.111);
    auto c0_222 = _mm256_set1_pd(-0.222);
    auto c0_233 = _mm256_set1_pd(0.233);
    auto c0_344 = _mm256_set1_pd(0.344);
    auto c0_15 = _mm256_set1_pd(0.15);
    auto c0_333 = _mm256_set1_pd(0.333);
    auto c0_444 = _mm256_set1_pd(0.444);
    auto c0_555 = _mm256_set1_pd(0.555);
    auto c0_0666 = _mm256_set1_pd(0.0666);
    auto c0_0777 = _mm256_set1_pd(0.0777);
    auto c0_345 = _mm256_set1_pd(0.345);
    auto c1o22 = _mm256_set1_pd(1.0 / 22.0);

    __m256d xWeight[5];
    __m256d yWeight[5];
    auto xw = _mm256_sub_pd(X, _mm256_cvtepi32_pd(xIndex2));
    auto xw2 = _mm256_mul_pd(xw, xw);
    auto xw3 = _mm256_mul_pd(xw, xw2);
    auto xw4 = _mm256_mul_pd(xw2, xw2);
    auto yw = _mm256_sub_pd(Y, _mm256_cvtepi32_pd(yIndex2));
    auto yw2 = _mm256_mul_pd(yw, yw);
    auto yw3 = _mm256_mul_pd(yw, yw2);
    auto yw4 = _mm256_mul_pd(yw2, yw2);
    xWeight[0] = _mm256_fmsub_pd(c0_111, xw4, _mm256_fmadd_pd(xw3, c1o22, _mm256_fmsub_pd(c0_233, xw2, _mm256_fmadd_pd(c0_344, xw, c0_345))));
    yWeight[0] = _mm256_fmsub_pd(c0_111, yw4, _mm256_fmadd_pd(yw3, c1o22, _mm256_fmsub_pd(c0_233, yw2, _mm256_fmadd_pd(c0_344, yw, c0_345))));
    xWeight[1] = _mm256_fmadd_pd(c0_333, xw3, _mm256_fmsub_pd(c0_444, xw2, _mm256_fmsub_pd(c0_555, xw, _mm256_fmadd_pd(c0_333, xw4, c0_0666))));
    yWeight[1] = _mm256_fmadd_pd(c0_333, yw3, _mm256_fmsub_pd(c0_444, yw2, _mm256_fmsub_pd(c0_555, yw, _mm256_fmadd_pd(c0_333, yw4, c0_0666))));
    xWeight[2] = _mm256_fmadd_pd(c0_15, xw4, _mm256_fmadd_pd(c0_222, xw2, c0_0777));
    yWeight[2] = _mm256_fmadd_pd(c0_15, yw4, _mm256_fmadd_pd(c0_222, yw2, c0_0777));
    xWeight[3] = _mm256_fmsub_pd(c0_444, xw2, _mm256_fmsub_pd(c0_333, xw4, _mm256_fmadd_pd(c0_333, xw3, _mm256_fmadd_pd(c0_555, xw, c0_0666))));
    yWeight[3] = _mm256_fmsub_pd(c0_444, yw2, _mm256_fmsub_pd(c0_333, yw4, _mm256_fmadd_pd(c0_333, yw3, _mm256_fmadd_pd(c0_555, yw, c0_0666))));
    xWeight[4] = _mm256_fmadd_pd(c0_111, xw4, _mm256_fmadd_pd(xw3, c1o22, _mm256_fmadd_pd(c0_233, xw2, _mm256_fmadd_pd(c0_344, xw, c0_345))));
    yWeight[4] = _mm256_fmadd_pd(c0_111, yw4, _mm256_fmadd_pd(yw3, c1o22, _mm256_fmadd_pd(c0_233, yw2, _mm256_fmadd_pd(c0_344, yw, c0_345))));

    int width2 = 2 * width - 2;
    int height2 = 2 * height - 2;
    auto v_width = _mm_set1_epi32(width);
    auto v_height = _mm_set1_epi32(height);
    auto v_width2 = _mm256_set1_pd((double)width2);
    auto v_height2 = _mm256_set1_pd((double)height2);
    auto v_width2i = _mm_set1_epi32(width2);
    auto v_height2i = _mm_set1_epi32(height2);
    __m128i xIndex[5];
    __m128i yIndex[5];
// 改了下面这两句会影响速度
    auto vi = _mm_set1_epi32(-2); 
    auto c1 = _mm_set1_epi32(1);

    for (int i = 0; i < 5; i++)
    {
        auto xIndexI = _mm_add_epi32(xIndex2, vi);
        xIndexI = _mm_abs_epi32(xIndexI);
        xIndexI = _mm_sub_epi32(xIndexI, _mm256_cvttpd_epi32(_mm256_mul_pd(v_width2, _mm256_floor_pd(_mm256_div_pd(_mm256_cvtepi32_pd(xIndexI), v_width2)))));
        xIndexI = _mm_blendv_epi8(xIndexI, _mm_sub_epi32(v_width2i, xIndexI), _mm_or_si128(_mm_cmpgt_epi32(xIndexI, v_width), _mm_cmpeq_epi32(xIndexI, v_width)));
        xIndex[i] = xIndexI;
        auto yIndexI = _mm_add_epi32(yIndex2, vi);
        yIndexI = _mm_abs_epi32(yIndexI);
        yIndexI = _mm_sub_epi32(yIndexI, _mm256_cvttpd_epi32(_mm256_mul_pd(v_height2, _mm256_floor_pd(_mm256_div_pd(_mm256_cvtepi32_pd(yIndexI), v_height2)))));
        yIndexI = _mm_blendv_epi8(yIndexI, _mm_sub_epi32(v_height2i, yIndexI), _mm_or_si128(_mm_cmpgt_epi32(yIndexI, v_height), _mm_cmpeq_epi32(yIndexI, v_height)));
        yIndexI = _mm_mullo_epi32(yIndexI, v_width);
        yIndex[i] = yIndexI;
        vi = _mm_add_epi32(vi, c1);
    }

    auto result = _mm256_setzero_pd();
    for (int i = 0; i < 5; i++)
    {
        auto rowResult = _mm256_setzero_pd();
        auto yIndexI = yIndex[i];
#pragma GCC unroll 5
        for (int j = 0; j < 5; j++)
        {
            auto asdfafsd = _mm_add_epi32(yIndexI, xIndex[j]);
            auto factor = _mm256_i32gather_pd(preCaculatedParameter, _mm_add_epi32(yIndexI, xIndex[j]), 8); // 这个是8不是1
            rowResult = _mm256_fmadd_pd(factor, xWeight[j], rowResult);
        }
        result = _mm256_fmadd_pd(rowResult, yWeight[i], result);
    }
    _mm256_storeu_pd(Rs, result);
}

double GetSubPixelValueOrigin(const double* preCaculatedParameter, int width, int height, double X, double Y)
{
    int xIndex[6], yIndex[6];
    xIndex[0] = int(X + 0.5) - 2;
    yIndex[0] = int(Y + 0.5) - 2;
    for (int i = 1; i < 5; i++)
    {
        xIndex[i] = xIndex[i - 1] + 1;
        yIndex[i] = yIndex[i - 1] + 1;
    }

    double xWeight[5];
    {
        double w = X - (double)xIndex[2];
        double w2 = w * w;
        double w3 = w * w2;
        double w4 = w2 * w2;
        xWeight[0] = 0.111 * w4 - w3 / 22.0 + 0.233 * w2 - 0.344 * w + 0.345;
        xWeight[1] = -0.333 * w4 + 0.333 * w3 + 0.444 * w2 - 0.555 * w + 0.0666;
        xWeight[2] = 0.15 * w4 - 0.222 * w2 + 0.0777;
        xWeight[3] = -0.333 * w4 - 0.333 * w3 + 0.444 * w2 + 0.555 * w + 0.0666;
        xWeight[4] = 0.111 * w4 + w3 / 22.0 + 0.233 * w2 + 0.344 * w + 0.345;
    }

    // 下面这个计算方法和上面是一样的
    double yWeight[5];
    {
        double w = Y - (double)yIndex[2];
        double w2 = w * w;
        double w3 = w * w2;
        double w4 = w2 * w2;
        yWeight[0] = 0.111 * w4 - w3 / 22.0 + 0.233 * w2 - 0.344 * w + 0.345;
        yWeight[1] = -0.333 * w4 + 0.333 * w3 + 0.444 * w2 - 0.555 * w + 0.0666;
        yWeight[2] = 0.15 * w4 - 0.222 * w2 + 0.0777;
        yWeight[3] = -0.333 * w4 - 0.333 * w3 + 0.444 * w2 + 0.555 * w + 0.0666;
        yWeight[4] = 0.111 * w4 + w3 / 22.0 + 0.233 * w2 + 0.344 * w + 0.345;
    }

    int width2 = 2 * width - 2;
    int height2 = 2 * height - 2;
    for (int i = 0; i < 5; i++)
    {
        xIndex[i] = (xIndex[i] < 0) ? (-xIndex[i] - width2 * ((-xIndex[i]) / width2)) : (xIndex[i] - width2 * (xIndex[i] / width2));
        if (width <= xIndex[i])
            xIndex[i] = width2 - xIndex[i];

        yIndex[i] = (yIndex[i] < 0) ? (-yIndex[i] - height2 * ((-yIndex[i]) / height2)) : (yIndex[i] - height2 * (yIndex[i] / height2));
        if (height <= yIndex[i])
            yIndex[i] = height2 - yIndex[i];
    }

    double result = 0;
    for (int i = 0; i < 5; i++)
    {
        for (int j = 0; j < 5; j++)
        {
            result += ((*(preCaculatedParameter + width * yIndex[i] + xIndex[j])) * xWeight[j] * yWeight[i]);
        }
    }
    return result;
}

#include <vector>
#include <chrono>
#include <iostream>

#define TICK(x) auto bench_##x = std::chrono::steady_clock::now();
#define TOCK(x) std::cerr<<#x ": "<<std::chrono::duration_cast<std::chrono::duration<double>>(std::chrono::steady_clock::now()-bench_##x).count();std::cerr<<"秒\n";

double fRand(double fMin, double fMax)
{
    double f = (double)rand() / RAND_MAX;
    return fMin + f * (fMax - fMin);
}

#define TEST_NUM 10000000
int main() {
    int width = 2048;
    int height = 2048;
    std::vector<double> preCaculatedParameter(width * height, 0.123); // 里面是预计算的数字
    for (size_t i = 0; i < preCaculatedParameter.size(); i++)
    {
        preCaculatedParameter[i] = fRand(0, 1);
    }

    std::vector<double> xs(TEST_NUM);
    std::vector<double> ys(TEST_NUM);
    for (size_t i = 0; i < xs.size(); i++)
    {
        xs[i] = rand() % 2048;
        ys[i] = rand() % 2048;
    }
    std::vector<double> result(TEST_NUM);

    TICK(t);
    /* #pragma omp parallel for */
    for (size_t i = 0; i < TEST_NUM / WIDTH * WIDTH; i += WIDTH)
    {
        GetSubPixelValue(result.data() + i, preCaculatedParameter.data(), width, height, xs.data() + i, ys.data() + i);
    }
#ifdef __GNUC__
    asm volatile ("" ::: "cc", "memory");
#endif
    TOCK(t);

   // 以下的测试是单独测试的,不会一起测试,也就是一次只测试一个程序
   // std::vector<double> resultOrigin(TEST_NUM);
    //TICK(t_origin);
    //for (size_t i = 0; i < TEST_NUM; i++)
   // {
   //     resultOrigin[i] = GetSubPixelValueOrigin(preCaculatedParameter.data(), width, height, xs[i], ys[i]);
  //  }
  //  TOCK(t_origin);
    return 0;
}
balleb6545anickk commented 1 year ago

主要是下面这段会很耗时,访存瓶颈会很严重,是不是有办法优化一下 其实这段话就是在求preCaculatedParameter里面5*5的方块的加权和,可能可以优化。

    auto result = _mm256_setzero_pd();
    for (int i = 0; i < 5; i++)
    {
        auto rowResult = _mm256_setzero_pd();
        auto yIndexI = yIndex[i];
#pragma GCC unroll 5
        for (int j = 0; j < 5; j++)
        {
            auto asdfafsd = _mm_add_epi32(yIndexI, xIndex[j]);
            auto factor = _mm256_i32gather_pd(preCaculatedParameter, _mm_add_epi32(yIndexI, xIndex[j]), 8); // 这个是8不是1
            rowResult = _mm256_fmadd_pd(factor, xWeight[j], rowResult);
        }
        result = _mm256_fmadd_pd(rowResult, yWeight[i], result);
    }
archibate commented 1 year ago

可不可以把precalculatedparameter的范围提前扩大,扩大的里面填充上mirror,这样就可以全部用连续访问了。

balleb6545anickk commented 1 year ago

可不可以把precalculatedparameter的范围提前扩大,扩大的里面填充上mirror,这样就可以全部用连续访问了。

可以做mirror padding,甚至实际情况我们可以简化这个问题,就假设xIndex[5]和yIndex[5]是连续的而且不越界,也就是最后是卷积核是5*5的xWeight和yWeight,和precalculatedparameter卷积