parallel101 / simdtutor

x86-64 SIMD矢量优化系列教程
109 stars 9 forks source link

有明显依赖顺序该怎么优化啊,救救孩子 #12

Open muyuuuu opened 3 weeks ago

muyuuuu commented 3 weeks ago
#include <algorithm>
#include <array>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <random>
#include <vector>

const int width = 1024;
const int height = 768;

template <typename T>
void RandomData(T &data) {
    std::mt19937 mt(20241101);
    std::uniform_int_distribution<int> int_dist(0, 255);
    for (int i = 0; i < data[0].size(); i++) {
        for (int j = 0; j < data.size(); j++) {
            data[i][j] = int_dist(mt);
        }
    }
}

int main() {
    std::array<std::array<uint8_t, height>, width> mask;
    std::array<std::array<uint8_t, height>, width> mask_errode;
    std::array<std::array<uint8_t, height>, width> disp;
    std::array<std::array<uint8_t, height>, width> dst;

    RandomData(mask);
    RandomData(mask_errode);
    RandomData(disp);
    RandomData(dst);

    std::vector<int> distance(width, 0);

    for (int i = 0; i < height; i++) {
        std::fill(distance.begin(), distance.end(), 0);
        int val = 0;
        int idx = 0;
        for (int j = 0; j < width; j++) {
            if ((!mask[i][j]) && (disp[i][j])) {
                val = disp[i][j];
                idx = j;
            }
            if ((250 < mask[i][j]) && (!mask_errode[i][j])) {
                dst[i][j] = val;
                if (0 != val) {
                    distance[j] = j - idx;
                }
            }
        }
        idx = 0;
        val = 0;
        for (int j = width - 1; j >= 0; j--) {
            if ((!mask[i][j]) && (disp[i][j])) {
                val = disp[i][j];
                idx = j;
            }
            if ((250 < mask[i][j]) && (!mask_errode[i][j])) {
                if (0 != val) {
                    if (0 != disp[i][j]) {
                        if (distance[j] > idx - j) {
                            dst[i][j] = val;
                        }
                    } else {
                        dst[i][j] = val;
                    }
                }
                if (dst[i][j] < disp[i][j]) {
                    dst[i][j] = disp[i][j];
                }
            }
        }
    }
    return 0;
}

补充:

  1. 我是安卓端,SIMD 是 Neon,估计和 SSE 差不多,感觉不太好写向量化
  2. cuda 也可以,或者有什么优化思路都可以
archibate commented 3 weeks ago

https://www.bilibili.com/video/BV1gu411m7kN?t=1419.5