Open YangZ2020 opened 1 month ago
// Baseline scalar inner loop of the 1D convolution for one output element.
// idx, idy, size2, sizeb, padZeroData, b and result are defined outside this
// excerpt. Each iteration multiplies one input sample by one kernel tap and
// adds the product directly into result[idx * size2 + idy].
for (int idz = 0; idz < sizeb; idz++) {
// Read-modify-write of the output element on every iteration; because this
// is +=, the final value includes whatever result held before the loop.
result[idx * size2 + idy] += padZeroData[idy + idz] * b[idz];
}
5079us
// Scalar variant that sums the sizeb products into a local float and writes
// the output element once after the loop.
float tmp = 0;
for (int idz = 0; idz < sizeb; idz++) {
tmp += padZeroData[idy + idz] * b[idz];
}
// Plain assignment: unlike the += form, this overwrites any previous value of
// the output element.
result[idx * size2 + idy] = tmp;
5058us
// Convolve the two vectors padZeroData and b; the result is stored in vec.
// (The original comment names "vec"; the code below writes to `result`.)
//
// AVX version: for each output index idy, multiply-accumulate 8 floats per
// iteration with _mm256_fmadd_ps, then sum the 8 lanes into the output.
for (int idy = 0; idy < size2; idy++) {
__m256 tmp = _mm256_setzero_ps();
// Steps by 8 with unaligned loads. If sizeb is not a multiple of 8, the last
// iteration loads elements at indices >= sizeb from both buffers.
// NOTE(review): presumably both buffers are padded accordingly -- confirm.
for (int idz = 0; idz < sizeb; idz += 8) {
tmp = _mm256_fmadd_ps(_mm256_loadu_ps(&padZeroData[idy + idz]), _mm256_loadu_ps(&b[idz]), tmp);
}
// Horizontal sum by subscripting the __m256 value lane by lane.
// NOTE(review): subscripting __m256 relies on a compiler vector extension
// (GCC/Clang); likely not portable to other compilers -- confirm.
// The sum is added (+=) to the existing content of the output element.
for (int i = 0; i < 8; ++i) {
result[idx * size2 + idy] += tmp[i];
}
}
3351us
// AVX version with two separate accumulators: each inner iteration handles
// 16 floats, 8 into tmp1 and the next 8 into tmp2.
// NOTE(review): presumably the second accumulator is there to shorten the
// dependency chain between consecutive FMAs -- confirm with the author.
for (int idy = 0; idy < size2; idy++) {
__m256 tmp1 = _mm256_setzero_ps();
__m256 tmp2 = _mm256_setzero_ps();
// Steps by 16. If sizeb is not a multiple of 16, the last iteration loads
// elements at indices >= sizeb from both buffers.
// NOTE(review): presumably the buffers are padded accordingly -- confirm.
for (int idz = 0; idz < sizeb; idz += 16) {
tmp1 = _mm256_fmadd_ps(_mm256_loadu_ps(&padZeroData[idy + idz]), _mm256_loadu_ps(&b[idz]), tmp1);
tmp2 = _mm256_fmadd_ps(_mm256_loadu_ps(&padZeroData[idy + idz + 8]), _mm256_loadu_ps(&b[idz + 8]), tmp2);
}
// Merge the two partial sums lane-wise before the horizontal reduction.
tmp1 = _mm256_add_ps(tmp1, tmp2);
// Lane-by-lane horizontal sum, added (+=) to the existing output value.
// NOTE(review): subscripting __m256 relies on a compiler vector extension
// (GCC/Clang) -- confirm portability requirements.
for (int i = 0; i < 8; ++i) {
result[idx * size2 + idy] += tmp1[i];
}
}
2624us
// Blocked AVX version: computes 8 consecutive output elements per outer
// iteration, keeping one vector accumulator per output. Each 8-float slice of
// the kernel b is loaded once and reused for all 8 outputs.
// The outer loop steps by 8 and always writes idy .. idy + 7.
// NOTE(review): if size2 is not a multiple of 8 this writes past index
// size2 - 1 of the current row -- confirm size2 is a multiple of 8.
for (int idy = 0; idy < size2; idy += 8) {
// Value-initialized accumulator array (C++ brace initialization).
__m256 tmp[8]{};
// Steps by 8 over the kernel; loads reach padZeroData[idy + 7 + idz + 7].
// NOTE(review): presumably padZeroData and b are padded for this -- confirm.
for (int idz = 0; idz < sizeb; idz += 8) {
__m256 btmp = _mm256_loadu_ps(&b[idz]);
for (int offy = 0; offy < 8; ++offy) {
tmp[offy] = _mm256_fmadd_ps(_mm256_loadu_ps(&padZeroData[idy + offy + idz]), btmp, tmp[offy]);
}
}
// Horizontal reduction of each accumulator: two hadd passes leave the sum of
// each 128-bit half in that half's lowest element; the upper and lower halves
// are then added and the scalar extracted.
for (int offy = 0; offy < 8; ++offy) {
__m256 res = tmp[offy];
res = _mm256_hadd_ps(res, res);
res = _mm256_hadd_ps(res, res);
// Plain assignment: overwrites the output element rather than adding to it.
result[idx * size2 + idy + offy] = _mm_cvtss_f32(
_mm_add_ss(_mm256_extractf128_ps(res, 1), _mm256_castps256_ps128(res)));
}
}
2293us
Amazing!谢谢小彭老师。(回去补习SIMD课程。)
同时还有一个问题:如果我想在Android平板上调用OpenGL ES 3.2来实现同样的卷积计算,是不是手写着色器就可以实现呢?是不是您OpenGL课程的前4讲内容就足够解决这个问题?
我手写了一个计算1D卷积的程序,但是发现执行起来很慢。问题描述和示例代码如下:
谢谢小彭老师。