parallel101 / simdtutor

A tutorial series on x86-64 SIMD vector optimization

Teacher Xiaopeng, about optimizing an MPM fluid simulation function #10

Open dd123-a opened 1 month ago

dd123-a commented 1 month ago

void Simulate() {

// CLEAR GRID
std::size_t grid_size = grid.size();

// make sure grid_size does not exceed the range of int
#pragma omp parallel for
for (int i = 0; i < static_cast<int>(grid_size); ++i) {
    grid[i].vel = glm::vec3(0.0f);
    grid[i].mass = 0.0f;
}

// P2G_1
// mass and momentum transfer
#pragma omp parallel for
for (int i = 0; i < particles.size(); i++) {
    auto &p = particles[i];
    glm::uvec3 cell_idx = glm::uvec3(p.pos);
    glm::vec3 cell_diff = (p.pos - glm::vec3(cell_idx)) - 0.5f;

    glm::vec3 weights_[3];
    weights_[0] = 0.5f * glm::pow(0.5f - cell_diff, glm::vec3(2.0f));
    weights_[1] = 0.75f - glm::pow(cell_diff, glm::vec3(2.0f));
    weights_[2] = 0.5f * glm::pow(0.5f + cell_diff, glm::vec3(2.0f));

    for (uint32_t gx = 0; gx < 3; ++gx) {
        for (uint32_t gy = 0; gy < 3; ++gy) {
            for (uint32_t gz = 0; gz < 3; ++gz) {
                float weight = weights_[gx].x * weights_[gy].y * weights_[gz].z;

                glm::uvec3 cell_pos = glm::uvec3(cell_idx.x + gx - 1,
                                                 cell_idx.y + gy - 1,
                                                 cell_idx.z + gz - 1);
                glm::vec3 cell_dist = (glm::vec3(cell_pos) - p.pos) + 0.5f;
                glm::vec3 Q = p.C * cell_dist;

                int cell_index = (int) cell_pos.x +
                                 (int) cell_pos.y * grid_res +
                                 (int) cell_pos.z * grid_res * grid_res;

                float mass_contrib = weight * particle_mass;
                grid[cell_index].mass += mass_contrib;
                grid[cell_index].vel += mass_contrib * (p.vel + Q);
            }

        }
    }
}

// P2G_2
// density and pressure computation
#pragma omp parallel for
for (int i = 0; i < particles.size(); i++) {
    auto &p = particles[i];

    glm::uvec3 cell_idx = glm::uvec3(p.pos);
    glm::vec3 cell_diff = (p.pos - glm::vec3(cell_idx)) - 0.5f;

    glm::vec3 weights_[3];
    weights_[0] = 0.5f * glm::pow(0.5f - cell_diff, glm::vec3(2.0f));
    weights_[1] = 0.75f - glm::pow(cell_diff, glm::vec3(2.0f));
    weights_[2] = 0.5f * glm::pow(0.5f + cell_diff, glm::vec3(2.0f));

    float density = 0.0f;
    for (uint32_t gx = 0; gx < 3; ++gx) {
        for (uint32_t gy = 0; gy < 3; ++gy) {
            for (uint32_t gz = 0; gz < 3; ++gz) {
                float weight = weights_[gx].x * weights_[gy].y * weights_[gz].z;

                glm::uvec3 cell_pos = glm::uvec3(cell_idx.x + gx - 1,
                                                 cell_idx.y + gy - 1,
                                                 cell_idx.z + gz - 1);

                int cell_index = (int) cell_pos.x +
                                 (int) cell_pos.y * grid_res +
                                 (int) cell_pos.z * grid_res * grid_res;

                density += grid[cell_index].mass * weight;
            }
        }
    }

    float volume = particle_mass / density;
    float pressure = std::max(-0.1f, eos_stiffness *
                                     (std::pow(density / rest_density, eos_power) - 1.0f));

    glm::mat3 stress = glm::mat3(
            -pressure, 0, 0,
            0, -pressure, 0,
            0, 0, -pressure
    );

     glm::mat3 strain = p.C;

     //float trace = strain[0][0] + strain[1][0] + strain[2][0]; // DEBUG
     float trace = glm::determinant(strain);
     strain[0][0] = strain[1][0] = strain[2][0] = trace;

     glm::mat3 viscosity_term = dynamic_viscosity * strain;
     stress += viscosity_term;

    auto eq_16_term_0 = -volume * 4 * stress * dt;

    for (uint32_t gx = 0; gx < 3; ++gx) {
        for (uint32_t gy = 0; gy < 3; ++gy) {
            for (uint32_t gz = 0; gz < 3; ++gz) {
                float weight = weights_[gx].x * weights_[gy].y * weights_[gz].z;

                glm::uvec3 cell_pos = glm::uvec3(cell_idx.x + gx - 1,
                                                 cell_idx.y + gy - 1,
                                                 cell_idx.z + gz - 1);

                glm::vec3 cell_dist = (glm::vec3(cell_pos) - p.pos) + 0.5f;

                int cell_index = (int) cell_pos.x +
                                 (int) cell_pos.y * grid_res +
                                 (int) cell_pos.z * grid_res * grid_res;

                glm::vec3 momentum = (eq_16_term_0 * weight) * cell_dist;
                grid[cell_index].vel += momentum;
            }
        }
    }
}

// GRID UPDATE
// get the current size of the grid
grid_size = grid.size();

#pragma omp parallel for
for (int i = 0; i < static_cast<int>(grid_size); ++i) {
    auto& cell = grid[static_cast<std::size_t>(i)]; // cast the index to std::size_t
    if (cell.mass > 0) {
        cell.vel /= cell.mass;
        cell.vel += dt * glm::vec3(0.0f, gravity, 0.0f);

        int index = cell.index;
        // decode cell_index = x + y * grid_res + z * grid_res^2
        int x = index % grid_res;
        int y = (index / grid_res) % grid_res;
        int z = index / (grid_res * grid_res);

        if (x < 1 || x > grid_res - 2) { cell.vel.x = 0.0f; }
        if (y < 1 || y > grid_res - 2) { cell.vel.y = 0.0f; }
        if (z < 1 || z > grid_res - 2) { cell.vel.z = 0.0f; }
    }
}

// G2P
#pragma omp parallel for
for (int i = 0; i < static_cast<int>(particles.size()); ++i) {
    auto& p = particles[static_cast<std::size_t>(i)]; // access the particle by index

    p.vel = glm::vec3(0.0f);

    glm::uvec3 cell_idx = glm::uvec3(p.pos);
    glm::vec3 cell_diff = (p.pos - glm::vec3(cell_idx)) - 0.5f;

    glm::vec3 weights_[3];

    weights_[0] = 0.5f * glm::pow(0.5f - cell_diff, glm::vec3(2.0f));
    weights_[1] = 0.75f - glm::pow(cell_diff, glm::vec3(2.0f));
    weights_[2] = 0.5f * glm::pow(0.5f + cell_diff, glm::vec3(2.0f));

    glm::mat3 B = glm::mat3(0.0f);
    for (uint32_t gx = 0; gx < 3; ++gx) {
        for (uint32_t gy = 0; gy < 3; ++gy) {
            for (uint32_t gz = 0; gz < 3; ++gz) {
                float weight = weights_[gx].x * weights_[gy].y * weights_[gz].z;
                // std::cout << weight << std::endl;
                glm::uvec3 cell_pos = glm::uvec3(cell_idx.x + gx - 1,
                    cell_idx.y + gy - 1,
                    cell_idx.z + gz - 1);

                glm::vec3 cell_dist = (glm::vec3(cell_pos) - p.pos) + 0.5f;

                int cell_index = (int)cell_pos.x +
                    (int)cell_pos.y * grid_res +
                    (int)cell_pos.z * grid_res * grid_res;

                glm::vec3 weighted_velocity = grid[cell_index].vel * weight;

                B += glm::mat3(weighted_velocity * cell_dist.x,
                    weighted_velocity * cell_dist.y,
                    weighted_velocity * cell_dist.z);

                p.vel += weighted_velocity;
            }
        }
    }

    p.C = B * 4.0f;
    p.vel *= damping;
    p.pos += p.vel * dt;
    p.pos = glm::clamp(p.pos, 1.0f, grid_res - 2.0f);

    glm::vec3 x_n = p.pos + p.vel;
    const float wall_min = 3.0f;
    const float wall_max = grid_res - 4.0f;
    if (x_n.x < wall_min) p.vel.x += (wall_min - x_n.x);
    if (x_n.x > wall_max) p.vel.x += (wall_max - x_n.x);
    if (x_n.y < wall_min) p.vel.y += (wall_min - x_n.y);
    if (x_n.y > wall_max) p.vel.y += (wall_max - x_n.y);
    if (x_n.z < wall_min) p.vel.z += (wall_min - x_n.z);
    if (x_n.z > wall_max) p.vel.z += (wall_max - x_n.z);
}

}

This is accelerated with OpenMP, but I saw on YouTube that it can be optimized with SIMD instructions to the point where it runs smoothly even single-threaded. I know nothing about that (sadly), so I came here to discuss whether it is feasible. (I did try rewriting it with SIMD myself, but the debugging was too painful; template metaprogramming is the worst.) The full code repository is https://github.com/Hab5/mls-mpm, and https://www.youtube.com/watch?v=4Y58Pg9tpSo is the corresponding YouTube video (where the dream began).

archibate commented 1 month ago

Is this your repository? I see the repository uses an OpenGL compute shader, so why do you say it is single-CPU? You can also take a look at mine: https://github.com/archibate/opengl_mpm

archibate commented 1 month ago

The += in the P2G step needs to be atomic, doesn't it? Otherwise the accumulated results will be wrong and the simulation will blow up.

archibate commented 1 month ago

You can time each step with chrono; you should find that P2G takes the longest, because P2G involves a scatter operation: multiple threads access the same grid cell at the same time, which requires slow atomics. There are two ways to avoid the performance loss from this multi-thread contention:

  1. First sort the particles by Morton code (sketched below). After sorting, OpenMP puts nearby values of i in the same thread, so only at the edges can two threads atomically access the same grid cell at once.
  2. If you have 4 CPUs, split the particles into 4 blocks by position, each thread working only inside its own region; then no atomics are needed at all.
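
For reference, option 1 might look roughly like this. A minimal sketch, assuming a Particle struct with a glm::vec3 pos as in your code; the bit-twiddling helper is the standard 10-bit Morton expansion:

#include <algorithm>
#include <cstdint>
#include <vector>
#include <glm/glm.hpp>

// spread the low 10 bits of v so there are two zero bits between each
static uint32_t expand_bits(uint32_t v) {
    v &= 0x3FF;
    v = (v | (v << 16)) & 0x030000FF;
    v = (v | (v <<  8)) & 0x0300F00F;
    v = (v | (v <<  4)) & 0x030C30C3;
    v = (v | (v <<  2)) & 0x09249249;
    return v;
}

// interleave x, y, z into a 30-bit Morton code
static uint32_t morton3(glm::uvec3 c) {
    return (expand_bits(c.x) << 2) | (expand_bits(c.y) << 1) | expand_bits(c.z);
}

// Sort particles so that spatial neighbors become array neighbors, and thus
// land in the same OpenMP chunk. (For speed you would precompute the keys
// once instead of recomputing them inside the comparator.)
void sort_by_morton(std::vector<Particle>& particles) {
    std::sort(particles.begin(), particles.end(),
              [](const Particle& a, const Particle& b) {
                  return morton3(glm::uvec3(a.pos)) < morton3(glm::uvec3(b.pos));
              });
}
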
archibate commented 1 month ago

I see you commented out all the omp pragmas; let me guess, it blew up without atomic and you spent forever debugging, right?

                    float mass_contrib = weight * particle_mass;
                    grid[cell_index].mass += mass_contrib;
                    grid[cell_index].vel += mass_contrib * (p.vel + Q);

It needs to be changed to:

                    float mass_contrib = weight * particle_mass;
#pragma omp atomic
                    grid[cell_index].mass += mass_contrib;
#pragma omp atomic
                    grid[cell_index].vel.x += mass_contrib * (p.vel.x + Q.x);
#pragma omp atomic
                    grid[cell_index].vel.y += mass_contrib * (p.vel.y + Q.y);
#pragma omp atomic
                    grid[cell_index].vel.z += mass_contrib * (p.vel.z + Q.z);

So parallelism is not a children's toy: adding #pragma omp parallel for does not give you a free performance boost. Parallelism has a cost. In 99% of cases, blindly adding #pragma omp parallel for either crashes or yields no speedup at all. You have to change how you program and adapt to the characteristics of parallel code, instead of continuing to write in a serial mindset; it is a fundamentally different way of thinking, and it is not free.

Parallel CPU programming is very different from serial programming; add SIMD optimization and it is different again; and the GPU is more different still. To get a good speedup from GPU parallelism you sometimes even have to increase the amount of computation to raise the degree of parallelism, trading space for time, and so on. That is why many algorithms simply do not fit the GPU's parallel programming model and cannot be accelerated by the GPU, or only marginally. The good news is that MPM is still an algorithm the GPU can accelerate. The P2G step here is commonly known as a scatter operation, for which thrust has a stable recipe (sketched below); the CPU has corresponding recipes too. Either way, the code for these parallel schemes will look completely different from what you have now.
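
To illustrate that last point, a common atomic-free recipe for scatter-add on the GPU is sort-then-segmented-reduce. A minimal sketch using thrust (the buffer names are assumptions; the output vectors must be pre-sized to keys.size()):

#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/reduce.h>

// keys[k] = grid cell index of the k-th contribution, vals[k] = the contribution
void scatter_add_mass(thrust::device_vector<int>&   keys,
                      thrust::device_vector<float>& vals,
                      thrust::device_vector<int>&   cell_ids,   // out: unique cell indices
                      thrust::device_vector<float>& cell_mass)  // out: summed mass per cell
{
    // group contributions to the same cell together...
    thrust::sort_by_key(keys.begin(), keys.end(), vals.begin());
    // ...then sum each group; no atomics anywhere
    thrust::reduce_by_key(keys.begin(), keys.end(), vals.begin(),
                          cell_ids.begin(), cell_mass.begin());
}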

archibate commented 1 month ago

The serial approach

For example, filtering all the even numbers out of a vector in serial programming:

std::vector<int> in = {1, 2, 3, 4};
std::vector<int> out;
for (size_t i = 0; i < in.size(); ++i) {
  if (in[i] % 2 == 0)
    out.push_back(in[i]);
}

Obviously, if you blindly add #pragma omp parallel for, this std::vector push_back will crash outright, because it is not thread-safe.

Parallelism with a serial mindset

The way to keep writing parallel code with a "serial mindset" is to use the concurrency-safe vector container tbb::concurrent_vector, whose push_back is thread-safe:

std::vector<int> in = {1, 2, 3, 4};
tbb::concurrent_vector<int> out;
#pragma omp parallel for
for (size_t i = 0; i < in.size(); ++i) {
  if (in[i] % 2 == 0)
    out.push_back(in[i]);
}

But you will find that the performance gain is small, and may even be negative. This is the consequence of wrongly writing parallel code with a "serial mindset" (your code's need to atomically += a series of grid members at random locations is exactly this kind of mistaken thinking).

Truly massive parallelism

A truly parallel solution may open your eyes: the single for loop above has to be split into three steps:

std::vector<int> in = {1, 2, 3, 4};
std::vector<int> cnt(in.size());
#pragma omp parallel for
for (size_t i = 0; i < in.size(); ++i) {
  cnt[i] = in[i] % 2 == 0;
}
tbb::parallel_exclusive_scan(cnt.begin(), cnt.end()); // parallel exclusive prefix sum
std::vector<int> out(cnt.back() + (in.back() % 2 == 0)); // total = last scan value + last flag
#pragma omp parallel for
for (size_t i = 0; i < in.size(); ++i) {
  if (in[i] % 2 == 0)
    out[cnt[i]] = in[i];
}

Checking the condition twice in a row, how inefficient? Too bad. Parallel programming frequently has to sacrifice "amount of computation" to raise the degree of parallelism, and in a highly parallel environment it ends up faster anyway.

Building-Blocks

The parallel_exclusive_scan here is the "prefix sum" algorithm, which is highly parallelizable and has efficient implementations on both GPU and CPU. Commonly used parallel algorithms like this are called "basic algorithms", and they are where the Building-Blocks in Threading-Building-Blocks gets its name. When writing a parallel algorithm you typically decompose it into several of these basic algorithms first, and then implement it by directly calling the ready-made blocks in TBB. TBB is like Lego: you choose the bricks that fit the model you want to build. parallel_for, parallel_exclusive_scan, parallel_reduce, parallel_merge_sort, parallel_bitonic_sort, parallel_radix_sort, parallel_scatter, parallel_gather and so on are all basic algorithms of this kind. In your original mental toolbox there is only parallel_for, the most brain-dead basic algorithm of all, because you have probably never seen the others in serial programming. Not decomposing your algorithm into these highly parallelizable "basic algorithms" is the root reason you find parallel programming so hard.

By the way, the algorithm above that filters out elements matching a condition (e.g. "even") is called filter; although it can be further decomposed into for + scan + for, it is so common that it is treated as a basic algorithm in its own right. thrust has thrust::copy_if (shown below).
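
For completeness, the whole filter collapses to one call with thrust::copy_if. A minimal sketch, assuming CUDA and thrust are available:

#include <thrust/device_vector.h>
#include <thrust/copy.h>
#include <vector>

struct is_even {
    __host__ __device__ bool operator()(int x) const { return x % 2 == 0; }
};

int main() {
    std::vector<int> h_in = {1, 2, 3, 4};
    thrust::device_vector<int> in(h_in.begin(), h_in.end());
    thrust::device_vector<int> out(in.size());
    // copy_if runs the for+scan+for decomposition internally, in parallel
    auto end = thrust::copy_if(in.begin(), in.end(), out.begin(), is_even{});
    out.resize(end - out.begin()); // out now holds {2, 4}
}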

CPU optimization can take the unorthodox path

Of course, the Building-Blocks "three-step" strategy above is the most thoroughly parallel one, and its advantage is portability: CPU parallelism, CPU SIMD, GPU parallelism, multi-GPU, even multi-machine networked parallelism all port easily. But because it sacrifices some "amount of computation", on a CPU with few cores (say only 2) it may actually be slower than serial.

That is why plain CPU parallelism and CPU SIMD usually get their own dedicated optimizations. For the even-number filter above, for example, there is a black-magic x86 AVX2 technique based on _mm_maskstore_epi32 that gives a 20x single-core speedup (a variant is sketched below); with AVX-512 it looks different again. CPU parallelism has its own tricks too, so you do not have to adopt the most general "three-step" scheme.

But if you have already written the most thoroughly parallel GPU scheme (like the "three-step" above), it also degrades seamlessly to CPU parallelism and CPU SIMD; it just will not be as fast on the CPU as code specialized for SIMD. The GPU has its own unorthodox paths as well, for example using shared memory to run the three steps locally within a single block, instead of running the most general global "three-step" that wastes computation and cache locality; still, if you want easy porting, Building-Blocks is the most convenient.

So think about whether your target users are on GPU or CPU. By my estimate, this algorithm gets a free 20x speedup on the GPU, and 100x after dedicated optimization. On the CPU, brain-dead OpenMP (once the atomic problem is fixed) gets a free 2x, while deep optimization (SIMD + parallelism) can push on to a 20x speedup.
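
For the curious, here is one common variant of that filter trick; a minimal sketch. It left-packs the matching lanes with _mm256_permutevar8x32_epi32 and a small permutation table (the maskstore flavor is a close cousin), and assumes n is a multiple of 8:

#include <immintrin.h>
#include <cstddef>
#include <cstdint>

// For each 8-bit mask, the indices of its set bits packed to the front.
// Call init_lut() once before filtering.
static uint32_t lut[256][8];
static void init_lut() {
    for (int m = 0; m < 256; ++m) {
        int k = 0;
        for (int b = 0; b < 8; ++b)
            if (m & (1 << b)) lut[m][k++] = b;
        while (k < 8) lut[m][k++] = 0; // padding lanes, never counted
    }
}

// Writes the even elements of in[0..n) to out; returns how many were written.
static std::size_t filter_even_avx2(const int* in, std::size_t n, int* out) {
    std::size_t written = 0;
    for (std::size_t i = 0; i < n; i += 8) {
        __m256i x = _mm256_loadu_si256((const __m256i*)(in + i));
        // per-lane flag: all-ones where the element is even
        __m256i is_even = _mm256_cmpeq_epi32(
            _mm256_and_si256(x, _mm256_set1_epi32(1)), _mm256_setzero_si256());
        int m = _mm256_movemask_ps(_mm256_castsi256_ps(is_even));
        // left-pack the selected lanes and store all 8; the garbage lanes
        // past popcount(m) are overwritten by the next iteration
        __m256i perm = _mm256_loadu_si256((const __m256i*)lut[m]);
        _mm256_storeu_si256((__m256i*)(out + written),
                            _mm256_permutevar8x32_epi32(x, perm));
        written += (std::size_t)_mm_popcnt_u32((unsigned)m);
    }
    return written;
}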

dd123-a commented 1 month ago

Got it, thank you for the guidance, Teacher Xiaopeng. With #pragma omp parallel for I do get a small efficiency gain. As for the debugging, I mainly wanted to rewrite it with SIMD/AVX intrinsics like _mm_maskstore_epi32 (although I also suspect some synchronization code is needed; maybe the compiler's built-in locking already covers it, since I have not hit any crashes). But now I think GPU acceleration is the better route, so should I move this part of the code into a .cu file and transfer data between the CPU and GPU?

archibate commented 1 month ago

Do not copy back and forth between the CPU and GPU. Keep everything on the GPU, and copy back to the CPU only when you need to export results, roughly as sketched below.
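
A sketch of that data flow, with hypothetical names (Particle, simulate_step, run) standing in for your own:

#include <cuda_runtime.h>

struct Particle { float pos[3], vel[3]; }; // stand-in for your particle struct

__global__ void simulate_step(Particle* p, float* grid, int n) {
    /* P2G, grid update, G2P -- all reading and writing device memory only */
}

void run(Particle* h_particles, int n, int num_steps) {
    Particle* d_particles;
    float*    d_grid;
    cudaMalloc(&d_particles, n * sizeof(Particle));
    cudaMalloc(&d_grid, 64 * 64 * 64 * sizeof(float));
    // upload once
    cudaMemcpy(d_particles, h_particles, n * sizeof(Particle), cudaMemcpyHostToDevice);

    for (int step = 0; step < num_steps; ++step)
        simulate_step<<<(n + 255) / 256, 256>>>(d_particles, d_grid, n);

    // download only when you actually need the result (e.g. exporting a frame)
    cudaMemcpy(h_particles, d_particles, n * sizeof(Particle), cudaMemcpyDeviceToHost);
    cudaFree(d_particles);
    cudaFree(d_grid);
}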

dd123-a commented 1 month ago

Actually what I am most curious about is how CPU SIMD instructions are implemented concretely. The blog posts I read use registers, do a series of memory alignments, and then process the data eight at a time in batches. That is what I would like to ask you about; could you maybe write a short CPU SIMD demo for the simulation function I posted? Pretty please.
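
Not the whole simulation function, but as a minimal illustration of the "eight at a time" idea, here is a scalar loop next to its AVX version (assuming n is a multiple of 8):

#include <immintrin.h>
#include <cstddef>

// scalar: one float per instruction
void add_scalar(const float* a, const float* b, float* c, std::size_t n) {
    for (std::size_t i = 0; i < n; ++i)
        c[i] = a[i] + b[i];
}

// AVX: eight floats per instruction
void add_avx(const float* a, const float* b, float* c, std::size_t n) {
    for (std::size_t i = 0; i < n; i += 8) {
        __m256 va = _mm256_loadu_ps(a + i);             // load 8 floats at once
        __m256 vb = _mm256_loadu_ps(b + i);
        _mm256_storeu_ps(c + i, _mm256_add_ps(va, vb)); // 8 additions in one instruction
    }
}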

dd123-a commented 1 month ago

Teacher Xiaopeng, I'm back!

 for (int i = 0; i < particles.size(); i += 4) {
     // pack the positions of four particles into one AVX-512 register
     __m512 pos = _mm512_set_ps(
         0.0f, particles[i + 3].pos.z, particles[i + 3].pos.y, particles[i + 3].pos.x,
         0.0f, particles[i + 2].pos.z, particles[i + 2].pos.y, particles[i + 2].pos.x,
         0.0f, particles[i + 1].pos.z, particles[i + 1].pos.y, particles[i + 1].pos.x,
         0.0f, particles[i].pos.z, particles[i].pos.y, particles[i].pos.x
     );

 __m512 vel = _mm512_set_ps(
     0.0, particles[i + 3].vel.z, particles[i + 3].vel.y, particles[i + 3].vel.x,
     0.0, particles[i + 2].vel.z, particles[i + 2].vel.y, particles[i + 2].vel.x,
     0.0, particles[i + 1].vel.z, particles[i + 1].vel.y, particles[i + 1].vel.x,
     0.0, particles[i].vel.z, particles[i].vel.y, particles[i].vel.x
 );

 // integer part of each position
 __m512 intPart = _mm512_floor_ps(pos);

 // fractional part of each position
 __m512 fracPart = _mm512_sub_ps(pos, intPart);

 // constant vectors: all 0.5, all 0.75, and the particle mass
 __m512 halfVector = _mm512_set1_ps(0.5f);
 __m512 halfVector1 = _mm512_set1_ps(0.75f);
 __m128 messVector = _mm_set1_ps(particle_mass);

 // subtract 0.5 from the fractional part
 __m512 cell_diff = _mm512_sub_ps(fracPart, halfVector);

 __m512 weights[3];

 weights[0] = _mm512_mul_ps(_mm512_mul_ps(_mm512_sub_ps(halfVector, cell_diff), _mm512_sub_ps(halfVector, cell_diff)), halfVector);
 weights[1] = _mm512_sub_ps(halfVector1,_mm512_mul_ps(cell_diff, cell_diff));
 weights[2] = _mm512_mul_ps(_mm512_mul_ps(_mm512_add_ps(halfVector, cell_diff), _mm512_add_ps(halfVector, cell_diff)), halfVector);
 float tempArray[3][16];
 _mm512_store_ps(tempArray[0], weights[0]);
 _mm512_store_ps(tempArray[1], weights[1]);
 _mm512_store_ps(tempArray[2], weights[2]);
 for (int gx = 0; gx < 3; ++gx) {
     for (int gy = 0; gy < 3; ++gy) {
         for (int gz = 0; gz < 3; ++gz) {
             __m128 weight = _mm_set_ps(tempArray[gx][12] * tempArray[gy][13] * tempArray[gx][14],
                 tempArray[gx][8] * tempArray[gy][9] * tempArray[gx][10],
                 tempArray[gx][4] * tempArray[gy][5] * tempArray[gx][6],
                 tempArray[gx][0] * tempArray[gy][1] * tempArray[gz][2]);

             __m512 oneVector = _mm512_set_ps(0.0, gz - 1, gy - 1, gx - 1,
                 0.0, gz - 1, gy - 1, gx - 1,
                 0.0, gz - 1, gy - 1, gx - 1,
                 0.0, gz - 1, gy - 1, gx - 1);

             __m512 cell_pos = _mm512_add_ps(intPart, oneVector);

             __m512 cell_dist = _mm512_add_ps(_mm512_sub_ps(cell_pos, pos), halfVector);

             __m512 C[3];

             C[0] = _mm512_set_ps(
                 0.0, particles[i + 3].C[0][2], particles[i + 3].C[0][1], particles[i + 3].C[0][0],
                 0.0, particles[i + 2].C[0][2], particles[i + 2].C[0][1], particles[i + 2].C[0][0],
                 0.0, particles[i + 1].C[0][2], particles[i + 1].C[0][1], particles[i + 1].C[0][0],
                 0.0, particles[i].C[0][2], particles[i].C[0][1], particles[i].C[0][0]
             );

             C[1] = _mm512_set_ps(
                 0.0, particles[i + 3].C[1][2], particles[i + 3].C[1][1], particles[i + 3].C[1][0],
                 0.0, particles[i + 2].C[1][2], particles[i + 2].C[1][1], particles[i + 2].C[1][0],
                 0.0, particles[i + 1].C[1][2], particles[i + 1].C[1][1], particles[i + 1].C[1][0],
                 0.0, particles[i].C[1][2], particles[i].C[1][1], particles[i].C[1][0]
             );

             C[2] = _mm512_set_ps(
                 0.0, particles[i + 3].C[2][2], particles[i + 3].C[2][1], particles[i + 3].C[2][0],
                 0.0, particles[i + 2].C[2][2], particles[i + 2].C[2][1], particles[i + 2].C[2][0],
                 0.0, particles[i + 1].C[2][2], particles[i + 1].C[2][1], particles[i + 1].C[2][0],
                 0.0, particles[i].C[2][2], particles[i].C[2][1], particles[i].C[2][0]
             );

             __m512 Q = _mm512_add_ps(_mm512_add_ps(_mm512_mul_ps(cell_dist, C[0]), _mm512_mul_ps(cell_dist, C[1])), _mm512_mul_ps(cell_dist, C[2]));

             __m512 temp = _mm512_set_ps(
                 0.0, grid_res * grid_res, grid_res, 1,
                 0.0, grid_res * grid_res, grid_res, 1,
                 0.0, grid_res * grid_res, grid_res, 1,
                 0.0, grid_res * grid_res, grid_res, 1
             );

             float tempArray[16];
             __m512 cell_index_temp = _mm512_mul_ps(cell_pos, temp);
             _mm512_store_ps(tempArray, cell_index_temp);

             __m128 cell_index = _mm_set_ps(
                 tempArray[14] + tempArray[13] + tempArray[12],
                 tempArray[10] + tempArray[9] + tempArray[8],
                 tempArray[6] + tempArray[5] + tempArray[4],
                 tempArray[2] + tempArray[1] + tempArray[0]
             );

             __m128 mass_contrib = _mm_mul_ps(weight, messVector);

             float masstemp[4] = { 0 };
             float cellindextemp[4] = { 0 };
             _mm_store_ps(masstemp, mass_contrib);
             _mm_store_ps(cellindextemp, cell_index);
             if (cellindextemp[0] >= 91125 || cellindextemp[1] >= 91125 || cellindextemp[2] >= 91125 || cellindextemp[3] >= 91125||
                 cellindextemp[0] < 0 || cellindextemp[1] < 0 || cellindextemp[2] < 0 || cellindextemp[3] <0) {
                 printf("fucking");
             }

            grid[cellindextemp[0]].mass += masstemp[0];
            grid[cellindextemp[1]].mass += masstemp[1];
            grid[cellindextemp[2]].mass += masstemp[2];
            grid[cellindextemp[3]].mass += masstemp[3];

           __m512 massvector = _mm512_set_ps(
               0.0, masstemp[3], masstemp[3], masstemp[3],
               0.0, masstemp[2], masstemp[2], masstemp[2],
               0.0, masstemp[1], masstemp[1], masstemp[1],
               0.0,masstemp[0], masstemp[0], masstemp[0]
           );
           __m512 massvector1 = _mm512_set1_ps(masstemp[1]);
           __m512 massvector2 = _mm512_set1_ps(masstemp[2]);
           __m512 massvector3 = _mm512_set1_ps(masstemp[3]);

           float veltmp[16] = { 0 };
           _mm512_store_ps(veltmp, _mm512_mul_ps(_mm512_add_ps(vel, Q), massvector));

           grid[cellindextemp[0]].vel.x += veltmp[0];
           grid[cellindextemp[0]].vel.y += veltmp[1];
           grid[cellindextemp[0]].vel.z += veltmp[2];

           grid[cellindextemp[1]].vel.x += veltmp[4];
           grid[cellindextemp[1]].vel.y += veltmp[5];
           grid[cellindextemp[1]].vel.z += veltmp[6];

           grid[cellindextemp[2]].vel.x += veltmp[8];
           grid[cellindextemp[2]].vel.y += veltmp[9];
           grid[cellindextemp[2]].vel.z += veltmp[10];

           grid[cellindextemp[3]].vel.x += veltmp[12];
           grid[cellindextemp[3]].vel.y += veltmp[13];
           grid[cellindextemp[3]].vel.z += veltmp[14];
         }
     }
 }

}

This is my AVX-512 attempt at the P2G_1 part, but in my tests it performs no differently from the original code. Please correct me, beat me up (sob).

archibate commented 1 month ago

What is this supposed to be?

_mm_set_ps(tempArray[gx][12] * tempArray[gy][13] * tempArray[gx][14], ...)
_mm512_set_ps(0.0, particles[i + 3].C[2][2], particles[i + 3].C[2][1], particles[i + 3].C[2][0], ...)

You are in fact still loading the data scalar by scalar and computing it scalar by scalar; you merely store the results into an __m512 for show. Every actual add and multiply is scalar, so of course you get none of the __m512 speedup.

A program that truly exploits SIMD should not contain things like _mm_set_ps at all. set_ps is extremely inefficient: for __m128, set_ps generates 4 instructions, and for __m512 it generates 16, which defeats the "parallelism" SIMD is meant to provide. For example, _mm_setr_ps(x[0], x[1], x[2], x[3]) generates four instructions, no different from scalar code; it should be changed to _mm_load_ps(&x[0]), which generates a single instruction and actually accelerates.

If you say your data layout is not contiguous, or needs zeros interleaved, that is still no excuse for set_ps: load the data sequentially first, then rearrange it with shuffle_ps / permute_ps to fit your algorithm; and if it cannot be rearranged that way, then you need to change the data layout to suit SIMD. For example, _mm_setr_ps(x[0], x[2], x[1], x[3]) is out of order and cannot be loaded sequentially in one go, so load it first, then fix it with a single shuffle: two instructions total, still better than the 4 that set_ps generates, let alone the __m512 case.
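
Concretely, that last out-of-order example looks like this; a two-instruction sketch:

// 4 instructions: each element inserted one by one
__m128 a = _mm_setr_ps(x[0], x[2], x[1], x[3]);

// 2 instructions: one sequential load, then one shuffle to reorder
__m128 t = _mm_loadu_ps(&x[0]);                           // x0 x1 x2 x3
__m128 b = _mm_shuffle_ps(t, t, _MM_SHUFFLE(3, 1, 2, 0)); // x0 x2 x1 x3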


dd123-a commented 4 weeks ago

This is too hard. Doing AVX-512 for this algorithm is just brutal; I have killed half my brain cells and still cannot figure out how to optimize this part:

 __m128 weight = _mm_set_ps(tempArray[gx][12] * tempArray[gy][13] * tempArray[gx][14],
     tempArray[gx][8] * tempArray[gy][9] * tempArray[gx][10],
     tempArray[gx][4] * tempArray[gy][5] * tempArray[gx][6],
     tempArray[gx][0] * tempArray[gy][1] * tempArray[gz][2]);