"cuda reported error 500" when compute the albedo map in "shape_from_shading" using Graph

Dear author,

I'm trying to compute the albedo map in the example "shape_from_shading", but I get the following error cuModuleGetFunction: cuda reported error 500:

Num Active Unknowns: 0
tval: 1
Num Active Unknowns: 0
tval: 1
Saving sfsInitDepth.ply 640x480x1
Saving targetIntensity 640x480x1
Saving initialUnknown 640x480x1
Saving maskEdgeMap 640x960x1
Using Opt v0.2.2
No unknownwise residuals for ispaces(s) U-SHIT. Creating zero-valued stand-ins.
nUnknowns =     9
nResiduals =    0 + (@parametersSym).G.N * 1 + 1 * 1

nnz =   0 + (@parametersSym).G.N * 9 + 1 * 0

E:\Github\Opt\API\src/util.t:873: cuModuleGetFunction: cuda reported error 500
stack traceback:
        [C]: in function 'error'
        [string "<string>"]:243: in function 'cudacompile'
        E:\Github\Opt\API\src/util.t:873: in function 'makeGPUFunctions'
        E:\Github\Opt\API\src/solverGPUGaussNewton.t:762: in function 'compilePlan'
        E:\Github\Opt\API\src/o.t:870: in function <E:\Github\Opt\API\src/o.t:862>
        [C]: in function 'xpcall'
        E:\Github\Opt\API\src/o.t:862: in function <E:\Github\Opt\API\src/o.t:861>
Assertion failed: m_plan, file e:\github\opt\examples\shared\OptSolver.h, line 58
Press any key to continue . . .

I tried to ensure the validity of the variables allocated on the GPU, but still get this error... How can I debug this? could someone help me?

my source code snippet is like:

//in main.cpp:

    CombinedSolverComputeLighting solverLighting(solverInputGPU, params);
    printf("Solving Light Coeffs>>>>>>>>>\n");
    solverLighting.solveAll();
    std::vector<float> res = solverLighting.result();
    for (size_t i = 0; i < res.size(); i++){
        printf("res-%d: %f\n", i, res[i]);
    }
    printf("=======================Solved\n");

and class CombinedSolverComputeLighting:

class CombinedSolverComputeLighting : public CombinedSolverBase {
private:
    //std::shared_ptr<SimpleBuffer>   m_initialUnknown;
    //std::shared_ptr<SimpleBuffer>   m_result;
    std::vector<float> m_result; //light coeffs
    float *m_result_gpu;

    std::vector<unsigned int> m_dims;

    int *m_param_idx_gpu,
        *m_data_idx_gpu;

public:
    CombinedSolverComputeLighting(const SFSSolverInput& inputGPU, CombinedSolverParameters params)
    {
        m_combinedSolverParameters = params;

        const uint ww = (uint)inputGPU.targetDepth->width(),
            hh = (uint)inputGPU.targetDepth->height(),
            elemNum = ww *hh,
            LcoeffsNum = 9;// (for 2nd order SH)
        //m_dims = { ww, hh, 1 };
        m_dims = { /*ww, */elemNum, 1 };

        //1, result init
        m_result.resize(LcoeffsNum, 0.f); //alloc CPU memory
        const size_t sizeRes9Byte = sizeof(float) * LcoeffsNum;
        //float *m_result_gpu;
        cudaSafeCall(cudaMalloc(&m_result_gpu, sizeRes9Byte));
        cudaSafeCall(cudaMemcpy(m_result_gpu, m_result.data(), sizeRes9Byte, cudaMemcpyHostToDevice));

        //2, vert idx arr, for data and params
        const size_t sizeIdxMatByte = sizeof(int) * elemNum;
        cudaSafeCall(cudaMalloc(&m_param_idx_gpu, sizeIdxMatByte));
        cudaSafeCall(cudaMemset(m_param_idx_gpu, 0, sizeIdxMatByte)); //all 0s

        cudaSafeCall(cudaMalloc(&m_data_idx_gpu, sizeIdxMatByte));
        vector<int> data_idx_cpu(elemNum);
        for (size_t i = 0; i < elemNum; i++){
            data_idx_cpu[i] = i;
        }
        cudaSafeCall(cudaMemcpy(m_data_idx_gpu, data_idx_cpu.data(), sizeIdxMatByte, cudaMemcpyHostToDevice));

        float3 *nmapGpu;
        cudaSafeCall(cudaMalloc(&nmapGpu, sizeof(float3) * elemNum));

        float fx = inputGPU.parameters.fx,
            fy = inputGPU.parameters.fy,
            cx = inputGPU.parameters.ux,
            cy = inputGPU.parameters.uy;
        computeNmap((float*)inputGPU.targetDepth->data(), nmapGpu, ww, hh, fx, fy, cx, cy);

        inputGPU.setParamsComputeLighting(m_problemParams, (void *)m_result_gpu
            , elemNum, (void *)m_data_idx_gpu, (void *)m_param_idx_gpu
            , nmapGpu
            );

        addOptSolvers(m_dims, "compute_lighting.t", false);
    }

    virtual void combinedSolveInit() override {
        m_solverParams.set("nIterations", &m_combinedSolverParameters.nonLinearIter);
        m_solverParams.set("lIterations", &m_combinedSolverParameters.linearIter);
    }

    virtual void preSingleSolve() override {
        //resetGPUMemory();
    }
    virtual void postSingleSolve() override {}

    virtual void preNonlinearSolve(int) override {}
    virtual void postNonlinearSolve(int) override {}

    virtual void combinedSolveFinalize() override {
        //ceresIterationComparison("Shape From Shading", m_combinedSolverParameters.optDoublePrecision);
    }

    std::vector<float> result() {
        return m_result;
    }

    //void resetGPUMemory() {
    //    cudaSafeCall(cudaMemcpy(m_result->data(), m_initialUnknown->data(), m_dims[0] * m_dims[1] * sizeof(float), cudaMemcpyDeviceToDevice));
    //}

};

and add a setter in struct SFSSolverInput, in SFSSolverInput.h:

    void setParamsComputeLighting(NamedParameters &probParams, void *unknown_lvec
        , int graphEdgeNum, void *ptrDataIdx, void *ptrParamIdx
        , void *nmap) const
    {

        //0123
        probParams.set("L_coeffs", unknown_lvec);
        probParams.set("D_i", targetDepth->data());
        probParams.set("Im", targetIntensity->data());
        probParams.set("N_i", nmap);

        //456 --G
        probParams.set("graphEdgeNum", &graphEdgeNum);
        probParams.set("ptrDataIdx", ptrDataIdx);
        probParams.set("ptrParamIdx", ptrParamIdx);

    }

and the computeNmap is like:

static inline int divUp(int total, int grain) { return (total + grain - 1) / grain; }

__global__
void computeNmapKernel(float *dmap, float3 *nmap, int ww, int hh, float fx, float fy, float cx, float cy){
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;

    bool dbgPrint = false;
    if (x % 100 == 0 && y % 100 == 0)
        //dbgPrint = true;
        ;

    if (x >= ww || y >= hh)
        return;

    int idx = y * ww + x,
        idxL = y * ww + x - 1,
        idxU = (y - 1)*ww + x;
    if (dbgPrint){
        printf("x,y:= %d, %d; idx, idxL, idxU:= %d, %d, %d\n",
            x, y, idx, idxL, idxU);
    }
    if (x == 0 || y == 0
        || dmap[idx] == 0 || dmap[idxL] == 0 || dmap[idxU] == 0)
    {
        nmap[idx].x = 0;
        nmap[idx].y = 0;
        nmap[idx].y = 0;

        if (dbgPrint){
            printf("-----x,y:= %d, %d; idx, idxL, idxU:= %d, %d, %d\n",
                x, y, idx, idxL, idxU);
        }

        return;
    }

    const float m2mm = 1e3; //avoid float underflow when calc nx,ny,nz
    float d0 = dmap[idx] * m2mm,
        dx1 = dmap[idxL] * m2mm,
        dy1 = dmap[idxU] * m2mm;

    float nx = dy1*(d0 - dx1) / fy,
        ny = dx1*(d0 - dy1) / fx,
        nz = nx*(cx - x) / fx + ny*(cy - y) / fy - dx1*dy1 / (fx*fy);

    float sqLength = nx*nx + ny*ny + nz*nz;
    float invLen = 1;
    if (sqLength > 0)
        invLen = 1.f / sqrtf(sqLength);

    //normed nx,ny,nz:
    float nnx = invLen*nx,
        nny = invLen*ny,
        nnz = invLen*nz;

    if (dbgPrint){
        printf("+++++x,y:= %d, %d; idx, idxL, idxU:= %d, %d, %d\n \
               \ \td0, dx1, dy1: %f, %f, %f; nx, ny, nz: %f, %f, %f, ===normed-nxyz: %f, %f, %f\n",
               x, y, idx, idxL, idxU,
               d0, dx1, dy1, nx, ny, nz, nnx, nny, nnz);
    }

    nmap[idx].x = nnx;
    nmap[idx].y = nny;
    nmap[idx].z = nnz;

}

void computeNmap(float *dmap, float3 *nmap, int ww, int hh, float fx, float fy, float cx, float cy){
    dim3 block(32, 8);
    dim3 grid(divUp(ww, block.x), divUp(hh, block.y));

    computeNmapKernel <<<grid, block >>>(dmap, nmap, ww, hh, fx, fy, cx, cy);
    cudaSafeCall(cudaGetLastError());
}

niessner / Opt

"cuda reported error 500" when compute the albedo map in "shape_from_shading" using Graph #153