Forceflow / cuda_voxelizer

CUDA Voxelizer to convert polygon meshes into annotated voxel grids
MIT License
597 stars 98 forks source link

Performance improvements #57

Open GridEyes-2010 opened 3 years ago

GridEyes-2010 commented 3 years ago

I've made a few code improvements to your CPU version that increase efficiency by about 10 times The test results in the same environment are as follows: 646464--->5ms 128128128--->7ms 256256256--->19ms 512512512--->121ms 102410241024--->728ms 204820482048---->5576ms

Forceflow commented 3 years ago

Cool, would be interesting to see these improvements in a PR?

GridEyes-2010 commented 3 years ago

void Voxelizer::cpu_voxelize_mesh_solid( const voxinfo& info, Triangle_mesh& mesh, unsigned int* voxel_table, bool morton_order) {

    const Vec3f& move_min = info.pMin;
    const int uzero = 0;
    int nvert = (int)mesh.n_vertices();
    auto points = mesh.points();
    std::vector<Vec3f> npoints(nvert);
    for (int i = 0; i < nvert; ++i)
    {
        npoints[i] = points[i] - move_min;
    }

    int nface = (int)mesh.n_faces();

    const Vec3f delta_p(info.unit[0], info.unit[1], info.unit[2]);
    const Vec3i vzero(0, 0, 0);
    const Vec3i grid_max(info.gridsize[0] - 1, info.gridsize[1] - 1, info.gridsize[2] - 1);
    size_t yz = static_cast<size_t>(info.gridsize[1]) * static_cast<size_t>(info.gridsize[2]);
    Vec3f invuint(1 / info.unit[0], 1 / info.unit[1], 1 / info.unit[2]);
    const size_t s31 = 31;
    if (morton_order)
    {

pragma omp parallel for

        for (int i = 0; i < nface; ++i) {
            auto fv = mesh.cfv_begin(FaceHandle(i));
            auto& v0 = npoints[fv->idx()]; ++fv;
            auto& v1 = npoints[fv->idx()]; ++fv;
            auto& v2 = npoints[fv->idx()];
            auto e0 = v1 - v0;
            auto e1 = v2 - v1;
            auto e2 = v0 - v2;
            auto n = (e0.cross(e1)).normalize_cond();
            if (std::fabs(n[0]) < float_error) {
                continue;
            }
            n[0] = 1 / n[0];
            Vec2f v0_yz = Vec2f(v0[1], v0[2]);
            Vec2f v1_yz = Vec2f(v1[1], v1[2]);
            Vec2f v2_yz = Vec2f(v2[1], v2[2]);
            if (!checkCCW(v0_yz, v1_yz, v2_yz))
            {
                std::swap(v1_yz, v2_yz);
            }
            Vec2f bbox_max = OpenMesh::max(v0_yz, OpenMesh::max(v1_yz, v2_yz));
            Vec2f bbox_min = OpenMesh::min(v0_yz, OpenMesh::min(v1_yz, v2_yz));
            Vec2i bbox_max_grid = Vec2i(
                (int)std::floor(bbox_max[0] * invuint[1] - 0.5f),
                (int)std::floor(bbox_max[1] * invuint[2] - 0.5f));
            Vec2i bbox_min_grid = Vec2i(
                (int)std::ceil(bbox_min[0] * invuint[1] - 0.5f),
                (int)std::ceil(bbox_min[1] * invuint[2] - 0.5f));

            for (int y = bbox_min_grid[0]; y <= bbox_max_grid[0]; ++y)
            {
                for (int z = bbox_min_grid[1]; z <= bbox_max_grid[1]; ++z)
                {
                    Vec2f point = Vec2f((y + 0.5f) * info.unit[1], (z + 0.5f) * info.unit[2]);
                    int checknum = check_point_triangle(v0_yz, v1_yz, v2_yz, point);
                    if ((checknum == 1 && TopLeftEdge(v0_yz, v1_yz)) ||
                        (checknum == 2 && TopLeftEdge(v1_yz, v2_yz)) ||
                        (checknum == 3 && TopLeftEdge(v2_yz, v0_yz)) || (checknum == 0))
                    {
                        int xmax = int(get_x_coordinate(n, v0, point) * invuint[0] - 0.5f);
                        for (int x = 0; x <= xmax; x++)
                        {
                            size_t location = mortonEncode_LUT((unsigned)x, (unsigned)y, (unsigned)z);
                            size_t int_location = location >> 5;
                            unsigned int bit_pos = size_t(31) - (location & s31);
                            unsigned int mask = 1 << bit_pos;

pragma omp atomic

                            voxel_table[int_location] ^= mask;
                        }
                    }
                }
            }
        }

    }
    else
    {

pragma omp parallel for

        for (int i = 0; i < nface; ++i)
        {
            auto fv = mesh.cfv_begin(FaceHandle(i));
            auto& v0 = npoints[fv->idx()]; ++fv;
            auto& v1 = npoints[fv->idx()]; ++fv;
            auto& v2 = npoints[fv->idx()];
            auto e0 = v1 - v0;
            auto e1 = v2 - v1;
            auto e2 = v0 - v2;
            auto n = (e0.cross(e1)).normalize_cond();
            if (std::fabs(n[0]) < float_error) {
                continue;
            }
            n[0] = 1 / n[0];
            Vec2f v0_yz = Vec2f(v0[1], v0[2]);
            Vec2f v1_yz = Vec2f(v1[1], v1[2]);
            Vec2f v2_yz = Vec2f(v2[1], v2[2]);
            if (!checkCCW(v0_yz, v1_yz, v2_yz))
            {
                std::swap(v1_yz, v2_yz);
            }
            Vec2f bbox_max = OpenMesh::max(v0_yz, OpenMesh::max(v1_yz, v2_yz));
            Vec2f bbox_min = OpenMesh::min(v0_yz, OpenMesh::min(v1_yz, v2_yz));
            Vec2i bbox_max_grid = Vec2i(std::floor(bbox_max[0] * invuint[1] - 0.5f),
                                        std::floor(bbox_max[1] * invuint[2] - 0.5f));
            Vec2i bbox_min_grid = Vec2i(std::ceil(bbox_min[0] * invuint[1] - 0.5f),
                                        std::ceil(bbox_min[1] * invuint[2] - 0.5f));
            size_t ypos = static_cast<size_t>(bbox_min_grid[0] * info.gridsize[1]);
            for (int y = bbox_min_grid[0]; y <= bbox_max_grid[0]; ++y)
            {
                size_t zyz = bbox_min_grid[1] * yz;
                for (int z = bbox_min_grid[1]; z <= bbox_max_grid[1]; ++z)
                {
                    Vec2f point((y + 0.5f) * info.unit[1], (z + 0.5f) * info.unit[2]);
                    int checknum = check_point_triangle(v0_yz, v1_yz, v2_yz, point);
                    if ((checknum == 1 && TopLeftEdge(v0_yz, v1_yz)) ||
                        (checknum == 2 && TopLeftEdge(v1_yz, v2_yz)) ||
                        (checknum == 3 && TopLeftEdge(v2_yz, v0_yz)) || 
                        (checknum == 0))
                    {
                        size_t xmax = size_t(get_x_coordinate(n, v0, point) * invuint[0] - 0.5f);
                        for (int x = 0; x <= xmax; x++)
                        {
                            size_t location = x + ypos + zyz;
                            size_t int_location = location >> 5;
                            unsigned int bit_pos = size_t(31) - (location & s31);
                            unsigned int mask = 1 << bit_pos;

pragma omp atomic

                            voxel_table[int_location] ^= mask;
                        }
                    }
                    zyz += yz;
                }
                ypos += info.gridsize[1];
            }
        }
    }
}

Just code improvement, not algorithm improvement

Forceflow commented 3 years ago

Could you make a PR for this? It's hard comparing code with all the formatting gone.