Is cusparseSpMV wrong? - Githubissues

I am trying to use cusparseSpMV to calculate sparse matrix (coo format) multiply vector. But it can not return the correct result for matrix with column major. It seems that the coo matrix must stored in row major?

#include <cuda_runtime_api.h> // cudaMalloc, cudaMemcpy, etc.
#include <cusparse.h>         // cusparseSpMV
#include <stdio.h>            // printf
#include <stdlib.h>           // EXIT_FAILURE

#define cuDouOrSin CUDA_R_64F // precision of double
#define indexBaseC CUSPARSE_INDEX_BASE_ZERO
#define cuIdxDouOrSin CUSPARSE_INDEX_32I // common index type, 64I for nnz > 2^32

#define CUDA_CHECK(func)                                                       \
{                                                                              \
    cudaError_t status = (func);                                               \
    if (status != cudaSuccess) {                                               \
        printf("CUDA API failed at line %d with error: %s (%d)\n",             \
               __LINE__, cudaGetErrorString(status), status);                  \
        return;                                                   \
    }                                                                          \
}

#define CUSPARSE_CHECK(func)                                                   \
{                                                                              \
    cusparseStatus_t status = (func);                                          \
    if (status != CUSPARSE_STATUS_SUCCESS) {                                   \
        printf("CUSPARSE API failed at line %d with error: %s (%d)\n",         \
               __LINE__, cusparseGetErrorString(status), status);              \
        return;                                                   \
    }                                                                          \
}

#define CUDA_FREE(var) do {if ((var)) {cudaFree((var)); (var) = NULL;}} while (0)
#define CUDA_INIT(var, type, size) cudaMalloc((void**)&(var), (size) * sizeof(type))
#define CUDA_MEMCPY_H2D(dst, src, type, size) cudaMemcpy(dst, src, sizeof(type) * (size), cudaMemcpyHostToDevice)
#define CUDA_MEMCPY_D2H(dst, src, type, size) cudaMemcpy(dst, src, sizeof(type) * (size), cudaMemcpyDeviceToHost)
#define CUDA_MEMCPY_D2D(dst, src, type, size) cudaMemcpy(dst, src, sizeof(type) * (size), cudaMemcpyDeviceToDevice)
#define CUDA_ZERO(var, size) cudaMemset(var, 0, (size) * sizeof(*var))
#define ERROR_TRACE printf("File [%30s] Line [%d]\n", __FILE__, __LINE__)

#define FREE(var) do {if (var) {free((var)); (var) = NULL;}} while (0)
#define INIT(var, type, size) (var) = (type *) calloc(size, sizeof(type))
#define REALLOC(var, type, size) (var) = (type *) realloc(var, sizeof(type) * (size))
#define MEMCPY(dst, src, type, size) memcpy(dst, src, sizeof(type) * (size))
#define ZERO(var, type, size) memset(var, 0, sizeof(type) * (size))
#define NULLCHECK(var) if(!(var)) {                     \
                                retcode = RETCODE_FAILED; \
                                goto exit_cleanup;              \
                             }
#define MEMCHECK(var) if (!(var)) {                       \
                                retcode = RETCODE_MEMORY; \
                                goto exit_cleanup;              \
                            }
#define ERROR_TRACE printf("File [%30s] Line [%d]\n", __FILE__, __LINE__)
#define CALL(func) retcode = (func);                      \
                         if (retcode != RETCODE_OK) {     \
                             goto exit_cleanup;                 \
                         }
#define STATUS_CHECK(func) retcode = (func);                \
                         if (retcode == RETCODE_EXIT) {     \
                             goto exit_cleanup;                  \
                         }
#define MAX(x, y) ((x) > (y) ? (x) : (y))
#define MIN(x, y) ((x) < (y) ? (x) : (y))
#define ABS(x)    fabs(x)

#define MAX(x, y) ((x) > (y) ? (x) : (y))
#define MIN(x, y) ((x) < (y) ? (x) : (y))
#define ABS(x)    fabs(x)

int main(void){
    int   A_num_rows = 4;
    int   A_num_cols = 4;
    int   A_nnz = 9;
    // int   hA_rows[] =    { 0,   0,   1,   0,   2,   2,   2,   3,   3 };
    // int   hA_columns[] = { 0,   2,   1,   3,   0,   2,   3,   1,   3 };
    // double hA_values[] = { 1.0, 2.0, 4.0, 3.0, 5.0, 6.0, 7.0, 8.0, 9.0 };
    int   hA_rows[] =    { 0,   0,   0,   1,   2,   2,   2,   3,   3 };
    int   hA_columns[] = { 0,   2,   3,   1,   0,   2,   3,   1,   3 };
    double hA_values[] = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0 };
    double hX[] = { 1.0, 2.0, 3.0, 4.0 };
    double hY[] = { 0.0, 0.0, 0.0, 0.0 };
    double hY_result[] = { 19.0, 8.0, 51.0, 52.0 };
    double alpha = 1.0;
    double beta = 0.0;

    int *dA_rows, *dA_columns;
    double *dA_values, *dX, *dY;
    CUDA_CHECK(CUDA_INIT(dA_rows, int, A_nnz));
    CUDA_CHECK(CUDA_INIT(dA_columns, int, A_nnz));
    CUDA_CHECK(CUDA_INIT(dA_values, double, A_nnz));
    CUDA_CHECK(CUDA_INIT(dX, double, A_num_cols));
    CUDA_CHECK(CUDA_INIT(dY, double, A_num_rows));

    CUDA_CHECK(CUDA_MEMCPY_H2D(dA_rows, hA_rows, int, A_nnz));
    CUDA_CHECK(CUDA_MEMCPY_H2D(dA_columns, hA_columns, int, A_nnz));
    CUDA_CHECK(CUDA_MEMCPY_H2D(dA_values, hA_values, double, A_nnz));
    CUDA_CHECK(CUDA_MEMCPY_H2D(dX, hX, double, A_num_cols));
    CUDA_CHECK(CUDA_MEMCPY_H2D(dY, hY, double, A_num_rows));

    cusparseHandle_t     handle = NULL;
    cusparseSpMatDescr_t matA;
    cusparseDnVecDescr_t vecX, vecY;
    void* d_buffer = NULL;
    CUSPARSE_CHECK(cusparseCreate(&handle));
    CUSPARSE_CHECK(cusparseCreateCoo(&matA, A_num_rows, A_num_cols, A_nnz,
                                          dA_rows, dA_columns, dA_values, cuIdxDouOrSin,
                                          indexBaseC, cuDouOrSin));

    // sorted by column
    size_t               bufferSize = 0;
    double *dA_values_sorted;
    int *d_permutation;
    CUDA_CHECK(CUDA_INIT(dA_values_sorted, double, A_nnz));
    CUDA_CHECK(CUDA_INIT(d_permutation, int, A_nnz));
    cusparseSpVecDescr_t vec_permutation;
    CUSPARSE_CHECK(cusparseCreateSpVec(&vec_permutation, A_nnz, A_nnz, d_permutation, dA_values_sorted, cuIdxDouOrSin, indexBaseC, cuDouOrSin));
    cusparseDnVecDescr_t vec_values;
    CUSPARSE_CHECK(cusparseCreateDnVec(&vec_values, A_nnz, dA_values, cuDouOrSin));
    cusparseXcoosort_bufferSizeExt(handle, A_num_rows, A_num_cols, A_nnz, dA_rows, dA_columns, &bufferSize);
    CUDA_CHECK( cudaMalloc(&d_buffer, bufferSize) );
    int d_permutation_ref[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8 }; // at order
    CUDA_MEMCPY_H2D(d_permutation, d_permutation_ref, int, A_nnz);
    cusparseXcoosortByColumn(handle, A_num_rows, A_num_cols, A_nnz, dA_rows, dA_columns, d_permutation, d_buffer);
    cusparseGather(handle, vec_values, vec_permutation);

    double hA_values_sorted[9]; // nnz
    // device result check
    CUDA_CHECK( cudaMemcpy(hA_rows, dA_rows, A_nnz * sizeof(int),
                           cudaMemcpyDeviceToHost) )
    CUDA_CHECK( cudaMemcpy(hA_columns, dA_columns, A_nnz * sizeof(int),
                           cudaMemcpyDeviceToHost) )
    CUDA_CHECK( cudaMemcpy(hA_values_sorted, dA_values_sorted,
                           A_nnz * sizeof(double), cudaMemcpyDeviceToHost) )
    // CUDA_CHECK( cudaMemcpy(h_permutation, d_permutation,
    //                        A_nnz * sizeof(int), cudaMemcpyDeviceToHost) )
    for (int i = 0; i < A_nnz; i++) {
        printf("hA_rows[%d] = %d\n", i, hA_rows[i]);
        printf("hA_columns[%d] = %d\n", i, hA_columns[i]);
        printf("hA_values_sorted[%d] = %f\n", i, hA_values_sorted[i]);
    }

    cusparseSpMatDescr_t matA_sorted;
    CUSPARSE_CHECK(cusparseCreateCoo(&matA_sorted, A_num_rows, A_num_cols, A_nnz,
                                          dA_rows, dA_columns, dA_values_sorted, cuIdxDouOrSin,
                                          indexBaseC, cuDouOrSin));

    // Create dense vector X
    CUSPARSE_CHECK(cusparseCreateDnVec(&vecX, A_num_cols, dX, cuDouOrSin));
    // Create dense vector y
    CUSPARSE_CHECK(cusparseCreateDnVec(&vecY, A_num_rows, dY, cuDouOrSin));

    size_t bufferSizeMV = 0;
    void *d_bufferMV = NULL;
    cusparseSpMV_bufferSize(
            handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
            &alpha, matA_sorted, vecX, &beta, vecY, cuDouOrSin,
            CUSPARSE_SPMV_ALG_DEFAULT, &bufferSizeMV);

    cudaMalloc(&d_bufferMV, bufferSizeMV);

    cusparseSpMV(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
            &alpha, matA_sorted, vecX, &beta, vecY, cuDouOrSin,
            CUSPARSE_SPMV_ALG_DEFAULT, d_bufferMV);

        //--------------------------------------------------------------------------
        // device result check
        CUDA_CHECK(cudaMemcpy(hY, dY, A_num_rows * sizeof(double),
            cudaMemcpyDeviceToHost));
        int correct = 1;
        for (int i = 0; i < A_num_rows; i++) {
            printf("hY[%d] = %f\n", i, hY[i]);
            if (hY[i] != hY_result[i]) { // direct doubleing point comparison is not
                correct = 0;             // reliable
                // break;
            }
        }
        if (correct)
            printf("spmv_coo_example test PASSED\n");
        else
            printf("spmv_coo_example test FAILED: wrong result\n");
        //--------------------------------------------------------------------------

    CUDA_FREE(d_bufferMV);
    CUDA_FREE(d_buffer);
    CUDA_FREE(d_permutation);
    CUDA_FREE(dA_values_sorted);
    CUDA_FREE(dA_rows);
    CUDA_FREE(dA_columns);
    CUDA_FREE(dA_values);
    CUDA_FREE(dX);
    CUDA_FREE(dY);
    CUSPARSE_CHECK(cusparseDestroySpVec(vec_permutation));
    CUSPARSE_CHECK(cusparseDestroySpMat(matA));
    CUSPARSE_CHECK(cusparseDestroySpMat(matA_sorted));
    CUSPARSE_CHECK(cusparseDestroy(handle));
}```
NVIDIA / CUDALibrarySamples

Is cusparseSpMV wrong? #170