parallel101 / simdtutor

x86-64 SIMD矢量优化系列教程
109 stars 9 forks source link

请教关于1D向量批量卷积如何加速的问题 #11

Open YangZ2020 opened 1 month ago

YangZ2020 commented 1 month ago

我手写了一个计算1D卷积的程序,但是发现执行起来很慢。问题描述和示例代码如下: WechatIMG367

#include <random>
// #include <utils/chronoMarco.h>
#include <vector>

int main() {
  // Benchmark driver: convolves size1 independent 1D signals (size2 samples
  // each) with one sizeb-tap kernel, using randomly generated data.

  // Random source: floats drawn uniformly from [1, 10).
  std::random_device rd;
  std::mt19937 gen(rd());
  std::uniform_real_distribution<float> dis(1.0, 10.0);

  // Input signals, stored back to back: signal idx occupies
  // vec[idx * size2 .. idx * size2 + size2 - 1].
  const size_t size1 = 10000;
  const size_t size2 = 126;
  std::vector<float> vec(size1 * size2, 0);
  for (auto &x : vec) {
    x = dis(gen);
  }

  // Convolution kernel.
  const size_t sizeb = 64;
  std::vector<float> b(sizeb);
  for (auto &x : b) {
    x = dis(gen);
  }

  // Scratch buffer for one zero-padded signal. The signal is written at offset
  // sizeb / 2; the elements before and after it are value-initialised to 0 and
  // never overwritten, so they act as the zero padding for every signal.
  std::vector<float> padZeroData(size2 + sizeb);

  // Output buffer, same layout as vec.
  std::vector<float> result(size1 * size2, 0);

  // TICK(conv); // timing macro (start)
  for (size_t idx = 0; idx < size1; idx++) {

    // Copy the current signal into the middle of the padded scratch buffer.
    for (size_t idy = 0; idy < size2; idy++) {
      padZeroData[idy + sizeb / 2] = vec[idx * size2 + idy];
    }

    // Slide the kernel over the padded signal; each output sample is the dot
    // product of b with the sizeb-wide window starting at idy. The sum is
    // accumulated locally (same order as before, so identical float results)
    // and stored to result once per output.
    for (size_t idy = 0; idy < size2; idy++) {
      float acc = 0;
      for (size_t idz = 0; idz < sizeb; idz++) {
        acc += padZeroData[idy + idz] * b[idz];
      }
      result[idx * size2 + idy] = acc;
    }
  }
  // TOCK(conv, end, ""); // timing macro (stop)
                       // The author measured about 35 ms between the two marks.
}

谢谢小彭老师。

archibate commented 1 month ago
            // Baseline inner loop: adds each kernel tap's product straight into
            // result[idx * size2 + idy], so the output element is read and
            // written on every iteration. Relies on result[] starting at zero.
            for (int idz = 0; idz < sizeb; idz++) {
                result[idx * size2 + idy] += padZeroData[idy + idz] * b[idz];
            }

5079us

            // Variant: accumulate the dot product in a local scalar and store it
            // to result[] once, instead of updating result[] on every tap.
            float tmp = 0;
            for (int idz = 0; idz < sizeb; idz++) {
                tmp += padZeroData[idy + idz] * b[idz];
            }
            result[idx * size2 + idy] = tmp;

5058us

                        // Convolve padZeroData with b; the output is written to result.
            // AVX/FMA variant: processes 8 kernel taps per iteration with unaligned
            // loads. The idz += 8 stride loads b[idz .. idz+7], so sizeb must be a
            // multiple of 8 or the last load runs past the end of b.
            for (int idy = 0; idy < size2; idy++) {
                __m256 tmp = _mm256_setzero_ps();
                for (int idz = 0; idz < sizeb; idz += 8) {
                    tmp = _mm256_fmadd_ps(_mm256_loadu_ps(&padZeroData[idy + idz]), _mm256_loadu_ps(&b[idz]), tmp);
                }
                // Horizontal reduction: add the 8 lane sums into the output element
                // (relies on result[] starting at zero).
                // NOTE(review): tmp[i] subscripts a __m256 directly, which is a
                // GCC/Clang vector extension — confirm the target compiler supports it.
                for (int i = 0; i < 8; ++i) {
                    result[idx * size2 + idy] += tmp[i];
                }
            }

3351us

            // AVX/FMA variant with two accumulators: each iteration handles 16 kernel
            // taps, 8 in tmp1 and the next 8 in tmp2, so the two FMAs do not depend on
            // each other (presumably to overlap FMA latency). The idz += 16 stride
            // means sizeb must be a multiple of 16.
            for (int idy = 0; idy < size2; idy++) {
                __m256 tmp1 = _mm256_setzero_ps();
                __m256 tmp2 = _mm256_setzero_ps();
                for (int idz = 0; idz < sizeb; idz += 16) {
                    tmp1 = _mm256_fmadd_ps(_mm256_loadu_ps(&padZeroData[idy + idz]), _mm256_loadu_ps(&b[idz]), tmp1);
                    tmp2 = _mm256_fmadd_ps(_mm256_loadu_ps(&padZeroData[idy + idz + 8]), _mm256_loadu_ps(&b[idz + 8]), tmp2);
                }
                // Merge the two partial sums, then add the 8 lanes into the output
                // element (relies on result[] starting at zero).
                tmp1 = _mm256_add_ps(tmp1, tmp2);
                for (int i = 0; i < 8; ++i) {
                    result[idx * size2 + idy] += tmp1[i];
                }
            }

2624us

            // Tiled AVX/FMA variant: computes up to 8 consecutive outputs
            // (idy .. idy+7) per pass, so each 8-float slice of b is loaded once and
            // reused for every output in the tile. Requires sizeb to be a multiple
            // of 8.
            //
            // The tile width is clamped to the outputs remaining in the row. Without
            // the clamp, a size2 that is not a multiple of 8 (e.g. 126) makes the last
            // tile read past the end of padZeroData and write past the end of the
            // current row of result — past the whole buffer on the final row.
            for (int idy = 0; idy < size2; idy += 8) {
                const int remaining = static_cast<int>(size2) - idy;
                const int tile = remaining < 8 ? remaining : 8;
                __m256 tmp[8]{};
                for (int idz = 0; idz < sizeb; idz += 8) {
                    const __m256 btmp = _mm256_loadu_ps(&b[idz]);
                    for (int offy = 0; offy < tile; ++offy) {
                        tmp[offy] = _mm256_fmadd_ps(_mm256_loadu_ps(&padZeroData[idy + offy + idz]), btmp, tmp[offy]);
                    }
                }
                for (int offy = 0; offy < tile; ++offy) {
                    // Horizontal sum of the 8 lanes: two hadd passes leave each
                    // 128-bit half holding the sum of its own 4 floats in every
                    // element; adding the low element of both halves gives the total.
                    __m256 res = tmp[offy];
                    res = _mm256_hadd_ps(res, res);
                    res = _mm256_hadd_ps(res, res);
                    result[idx * size2 + idy + offy] = _mm_cvtss_f32(
                        _mm_add_ss(_mm256_extractf128_ps(res, 1), _mm256_castps256_ps128(res)));
                }
            }

2293us

YangZ2020 commented 5 days ago

Amazing!谢谢小彭老师。(回去补习SIMD课程。)

同时还有一个问题:如果我想在Android平板上调用OpenGL ES 3.2来实现同样的卷积计算,是不是手写着色器就可以实现呢?您OpenGL课程的前4讲内容是否就足够解决这个问题?