GPUOpen-LibrariesAndSDKs / RadeonRays_SDK

Radeon Rays is ray intersection acceleration library for hardware and software multiplatforms using CPU and GPU
MIT License
1.07k stars 192 forks source link

vk::Device::waitForFences: ErrorDeviceLost after few iterations of intersection commands #225

Open mlo77 opened 3 years ago

mlo77 commented 3 years ago

Hello,

Thanks AMD folks for this repo.

I am facing a vk ErrorDeviceLost pb. The code is borrowed from the test_vk/internal_resources, but here I am casting rays in a loop where the number of rays is decreased by 2 at each iteration.

void test() {
    RRContext context = nullptr;
    CHECK_RR_CALL(rrCreateContext(RR_API_VERSION, RR_API_VK, &context));
    MeshData mesh_data("../resources/sponza.obj");

    // memory management to pass buffers to builder
    // get radeonrays ptrs to triangle description
    RRDevicePtr vertex_ptr = nullptr;
    RRDevicePtr index_ptr  = nullptr;

    CHECK_RR_CALL(rrAllocateDeviceBuffer(context, mesh_data.positions.size() * sizeof(float), &vertex_ptr));
    CHECK_RR_CALL(rrAllocateDeviceBuffer(context, mesh_data.indices.size() * sizeof(uint32_t), &index_ptr));
    void* ptr = nullptr;
    CHECK_RR_CALL(rrMapDevicePtr(context, vertex_ptr, &ptr));
    float* pos_ptr = (float*)ptr;
    for (const auto& pos : mesh_data.positions)
    {
        *pos_ptr = pos;
        pos_ptr++;
    }
    CHECK_RR_CALL(rrUnmapDevicePtr(context, vertex_ptr, &ptr));
    CHECK_RR_CALL(rrMapDevicePtr(context, index_ptr, &ptr));
    uint32_t* ind_ptr = (uint32_t*)ptr;
    for (const auto& ind : mesh_data.indices)
    {
        *ind_ptr = ind;
        ind_ptr++;
    }
    CHECK_RR_CALL(rrUnmapDevicePtr(context, index_ptr, &ptr));

    auto triangle_count = (uint32_t)mesh_data.indices.size() / 3;

    RRGeometryBuildInput    geometry_build_input                = {};
    RRTriangleMeshPrimitive mesh                                = {};
    geometry_build_input.triangle_mesh_primitives               = &mesh;
    geometry_build_input.primitive_type                         = RR_PRIMITIVE_TYPE_TRIANGLE_MESH;
    geometry_build_input.triangle_mesh_primitives->vertices     = vertex_ptr;
    geometry_build_input.triangle_mesh_primitives->vertex_count = uint32_t(mesh_data.positions.size() / 3);

    geometry_build_input.triangle_mesh_primitives->vertex_stride    = 3 * sizeof(float);
    geometry_build_input.triangle_mesh_primitives->triangle_indices = index_ptr;
    geometry_build_input.triangle_mesh_primitives->triangle_count   = triangle_count;
    geometry_build_input.triangle_mesh_primitives->index_type       = RR_INDEX_TYPE_UINT32;
    geometry_build_input.primitive_count                            = 1u;

    std::cout << "Triangle count " << triangle_count << "\n";

    RRBuildOptions options;
    options.build_flags = 0u;

    RRMemoryRequirements geometry_reqs;
    CHECK_RR_CALL(rrGetGeometryBuildMemoryRequirements(context, &geometry_build_input, &options, &geometry_reqs));

    // allocate buffers for builder and resulting geometry
    RRDevicePtr scratch_ptr  = nullptr;
    RRDevicePtr geometry_ptr = nullptr;
    CHECK_RR_CALL(rrAllocateDeviceBuffer(context, geometry_reqs.temporary_build_buffer_size, &scratch_ptr));
    CHECK_RR_CALL(rrAllocateDeviceBuffer(context, geometry_reqs.result_buffer_size, &geometry_ptr));

    std::cout << "Scratch buffer size: " << (float)geometry_reqs.temporary_build_buffer_size / 1000000 << "Mb\n";
    std::cout << "Result buffer size: " << (float)geometry_reqs.result_buffer_size / 1000000 << "Mb\n";

    RRCommandStream command_stream = nullptr;
    CHECK_RR_CALL(rrAllocateCommandStream(context, &command_stream));

    CHECK_RR_CALL(rrCmdBuildGeometry(
        context, RR_BUILD_OPERATION_BUILD, &geometry_build_input, &options, scratch_ptr, geometry_ptr, command_stream));

    RREvent wait_event = nullptr;
    CHECK_RR_CALL(rrSumbitCommandStream(context, command_stream, nullptr, &wait_event));
    CHECK_RR_CALL(rrWaitEvent(context, wait_event));
    CHECK_RR_CALL(rrReleaseEvent(context, wait_event));
    CHECK_RR_CALL(rrReleaseCommandStream(context, command_stream));

    //// Cast rays in loop

    uint32_t kResolution = 10000;
    for (int i=0; i<5; i++) {
        auto start = std::chrono::system_clock::now();

        RRDevicePtr rays_dev_ptr, hits_dev_ptr, scratch_trace_ptr;
        void *ptr = nullptr;
        RREvent wait_event = nullptr;

        kResolution >>= 1;
        uint32_t raysNb = kResolution * kResolution;

        CHECK_RR_CALL(rrAllocateDeviceBuffer(context, raysNb * sizeof(RRRay), &rays_dev_ptr));
        CHECK_RR_CALL(rrMapDevicePtr(context, rays_dev_ptr, &ptr));

        RRRay* rays_ptr = static_cast<RRRay*>(ptr);

        for (int x = 0; x < kResolution; ++x)
        {
            for (int y = 0; y < kResolution; ++y)
            {
                auto i = kResolution * y + x;
                RRRay *r = rays_ptr+i;
                r->origin[0] = -1.f;
                r->origin[1] = 5.5f;
                r->origin[2] = 0.f;
                r->direction[1] = -1.f;
                r->direction[0] = -1.f + (2.f / kResolution) * y;
                r->direction[2] = -1.f + (2.f / kResolution) * x;
                r->min_t = 0.001f;
                r->max_t = 100000.f;
            }
        }
        CHECK_RR_CALL(rrUnmapDevicePtr(context, rays_dev_ptr, &ptr));

        auto end = std::chrono::system_clock::now();
        std::chrono::duration<double> elapsed_seconds = end - start;
        auto total_time_taken = elapsed_seconds.count();
        cerr << "ray init " << total_time_taken <<"s for "<< raysNb << " rays" << endl; 

        start = std::chrono::system_clock::now();
        CHECK_RR_CALL(rrAllocateDeviceBuffer(context, raysNb * sizeof(RRHit), &hits_dev_ptr));
        size_t scratch_trace_size;
        CHECK_RR_CALL(rrGetTraceMemoryRequirements(context, raysNb, &scratch_trace_size));
        CHECK_RR_CALL(rrAllocateDeviceBuffer(context, scratch_trace_size, &scratch_trace_ptr));

        RRCommandStream trace_command_stream = nullptr;
        CHECK_RR_CALL(rrAllocateCommandStream(context, &trace_command_stream));

        CHECK_RR_CALL(rrCmdIntersect(context,
                                     geometry_ptr,
                                     RR_INTERSECT_QUERY_CLOSEST,
                                     rays_dev_ptr,
                                     raysNb,
                                     nullptr,
                                     RR_INTERSECT_QUERY_OUTPUT_FULL_HIT,
                                     hits_dev_ptr,
                                     scratch_trace_ptr,
                                     trace_command_stream));

        CHECK_RR_CALL(rrSumbitCommandStream(context, trace_command_stream, nullptr, &wait_event));
        CHECK_RR_CALL(rrWaitEvent(context, wait_event));

        CHECK_RR_CALL(rrReleaseEvent(context, wait_event));
        CHECK_RR_CALL(rrReleaseCommandStream(context, trace_command_stream));

        // Map staging ray buffer.
        CHECK_RR_CALL(rrMapDevicePtr(context, hits_dev_ptr, &ptr));

        end = std::chrono::system_clock::now();
        elapsed_seconds = end - start;
        total_time_taken = elapsed_seconds.count();
        cerr << "intersection " << total_time_taken <<"s for "<< raysNb << " rays" << endl; 

        RRHit* mapped_ptr = static_cast<RRHit*>(ptr);
        std::vector<uint32_t> data(raysNb);
        int hitcount = 0;
        for (int y = 0; y < kResolution; ++y)
        {
            for (int x = 0; x < kResolution; ++x)
            {
                int wi = kResolution * (kResolution - 1 - y) + x;
                int i  = kResolution * y + x;
                if (mapped_ptr[i].inst_id != ~0u)
                {
                     data[wi] = 0xff000000 | (uint32_t(mapped_ptr[i].uv[0] * 255) << 8) |
                                   (uint32_t(mapped_ptr[i].uv[1] * 255) << 16);
                     hitcount++;
                } else
                {
                    data[wi] = 0xff101010;
                }
            }
        }

        // rrSetLogLevel(RR_LOG_LEVEL_OFF);
        cout << "hitcount:" << hitcount << endl;
        CHECK_RR_CALL(rrUnmapDevicePtr(context, hits_dev_ptr, &ptr));

        // stringstream ss;
        // ss << "/dev/shm/test_vk_sponza_geom_isect_internal" << i << ".jpg";
        // stbi_write_jpg(ss.str().c_str(), kResolution, kResolution, 4, data.data(), 120);

        CHECK_RR_CALL(rrReleaseDevicePtr(context, rays_dev_ptr));
        CHECK_RR_CALL(rrReleaseDevicePtr(context, scratch_trace_ptr));
        CHECK_RR_CALL(rrReleaseDevicePtr(context, hits_dev_ptr));
    }
    CHECK_RR_CALL(rrReleaseDevicePtr(context, scratch_ptr));
    CHECK_RR_CALL(rrReleaseDevicePtr(context, geometry_ptr));
    CHECK_RR_CALL(rrReleaseDevicePtr(context, index_ptr));
    CHECK_RR_CALL(rrReleaseDevicePtr(context, vertex_ptr));
    CHECK_RR_CALL(rrDestroyContext(context));
}

int main(int argc, char* argv[]) {
    test();
    return 0;
}

The loop fails at last iteration where an assertion halts, this is the log output:

[2021-07-17 00:24:19.903] [RR logger] [info] rrCreateContext(1001) [2021-07-17 00:24:19.903] [RR logger] [info] Creating Vulkan context [2021-07-17 00:24:21.709] [RR logger] [info] rrSetLogLevel(1) ... ... hitcount:390625 [2021-07-17 00:24:35.067] [RR logger] [info] rrUnmapDevicePtr [2021-07-17 00:24:35.067] [RR logger] [debug] Unmap vulkan buffer [2021-07-17 00:24:35.067] [RR logger] [info] rrReleaseDevicePtr [2021-07-17 00:24:35.069] [RR logger] [debug] Device pointer successfully released [2021-07-17 00:24:35.069] [RR logger] [info] rrReleaseDevicePtr [2021-07-17 00:24:35.078] [RR logger] [debug] Device pointer successfully released [2021-07-17 00:24:35.078] [RR logger] [info] rrReleaseDevicePtr [2021-07-17 00:24:35.079] [RR logger] [debug] Device pointer successfully released // next iteration [2021-07-17 00:24:35.079] [RR logger] [info] rrAllocateDeviceBuffer [2021-07-17 00:24:35.081] [RR logger] [debug] Allocated vulkan buffer with size 3115008 [2021-07-17 00:24:35.081] [RR logger] [info] rrMapDevicePtr [2021-07-17 00:24:35.081] [RR logger] [debug] Map vulkan buffer [2021-07-17 00:24:35.082] [RR logger] [info] rrUnmapDevicePtr [2021-07-17 00:24:35.082] [RR logger] [debug] Unmap vulkan buffer ray init 0.00319416s for 97344 rays [2021-07-17 00:24:35.082] [RR logger] [info] rrAllocateDeviceBuffer [2021-07-17 00:24:35.082] [RR logger] [debug] Allocated vulkan buffer with size 1557504 [2021-07-17 00:24:35.082] [RR logger] [info] rrGetTraceMemoryRequirements [2021-07-17 00:24:35.082] [RR logger] [debug] Successfully provided trace memory requirements [2021-07-17 00:24:35.082] [RR logger] [info] rrAllocateDeviceBuffer [2021-07-17 00:24:35.088] [RR logger] [debug] Allocated vulkan buffer with size 24920064 [2021-07-17 00:24:35.088] [RR logger] [info] rrAllocateCommandStream [2021-07-17 00:24:35.088] [RR logger] [debug] Command stream successfully allocated [2021-07-17 00:24:35.088] [RR logger] [info] rrCmdIntersect [2021-07-17 00:24:35.088] [RR logger] [debug] Intersector::Intersect() [2021-07-17 00:24:35.088] [RR logger] [debug] Batch intersect command successfully recorded [2021-07-17 00:24:35.088] [RR logger] [info] rrSumbitCommandStream [2021-07-17 00:24:35.088] [RR logger] [debug] Device::SubmitCommandStream() [2021-07-17 00:24:35.088] [RR logger] [debug] Command stream successfully submitted [2021-07-17 00:24:35.089] [RR logger] [info] rrReleaseEvent [2021-07-17 00:24:35.089] [RR logger] [debug] Device::WaitEvent() [2021-07-17 00:24:35.469] [RR logger] [error] vk::Device::waitForFences: ErrorDeviceLost rfrt: src/main3.cpp:168: void test(): Assertion `(rrWaitEvent(context, wait_event)) == RR_SUCCESS' failed. Aborted (core dumped)

Also, if I save the Hits data at each iteration, the program completes without raising any problem, it is like allowing more time gap to the GPU hides this problem in some way.

Ubuntu 20.04 Vulkan version 1.2.182 Nvidia driver 460.73.1 on QuadroM1200 RR4.1 build options : ENABLE_VULKAN=ON, EMBEDDED_KERNELS=ON, ENABLE_TESTING=ON, CMAKE_POSITION_INDEPENDENT_CODE=ON

Any suggestion how to debug this ? Am I doing this the right way ?

Cheers