Closed zhenweilin closed 8 months ago
I am trying to use cusparseSpMV to multiply a sparse matrix (COO format) by a vector. But it does not return the correct result for a matrix whose entries are stored in column-major (sorted-by-column) order. It seems that the COO matrix must be stored in row-major order?
cusparseSpMV
/*
 * Repro for the reported issue: cusparseSpMV on a COO matrix whose triplets
 * have been re-sorted by COLUMN.  cuSPARSE stores COO row-major (sorted by
 * row index), so after cusparseXcoosortByColumn the SpMV result is expected
 * to be wrong; this program demonstrates that.
 *
 * Requires a CUDA-capable device and the cuSPARSE library.
 */
#include <cuda_runtime_api.h> // cudaMalloc, cudaMemcpy, etc.
#include <cusparse.h>         // cusparseSpMV
#include <stdio.h>            // printf
#include <stdlib.h>           // EXIT_SUCCESS / EXIT_FAILURE

#define cuDouOrSin    CUDA_R_64F               // value type: double precision
#define indexBaseC    CUSPARSE_INDEX_BASE_ZERO // zero-based indices
#define cuIdxDouOrSin CUSPARSE_INDEX_32I       // index type; use 64I for nnz > 2^31

// NOTE: 'return;' without a value is invalid inside int main() (C99 6.8.6.4),
// so these check macros return EXIT_FAILURE on error instead.
#define CUDA_CHECK(func)                                                    \
{                                                                           \
    cudaError_t status = (func);                                            \
    if (status != cudaSuccess) {                                            \
        printf("CUDA API failed at line %d with error: %s (%d)\n",          \
               __LINE__, cudaGetErrorString(status), status);               \
        return EXIT_FAILURE;                                                \
    }                                                                       \
}

#define CUSPARSE_CHECK(func)                                                \
{                                                                           \
    cusparseStatus_t status = (func);                                       \
    if (status != CUSPARSE_STATUS_SUCCESS) {                                \
        printf("CUSPARSE API failed at line %d with error: %s (%d)\n",      \
               __LINE__, cusparseGetErrorString(status), status);           \
        return EXIT_FAILURE;                                                \
    }                                                                       \
}

// Only the helpers actually used below are kept.  The original paste also
// contained duplicate definitions (ERROR_TRACE, MAX, MIN, ABS each defined
// twice) and dead macros referencing undefined retcode/RETCODE_* symbols
// (NULLCHECK, MEMCHECK, CALL, STATUS_CHECK); those have been removed.
#define CUDA_FREE(var)             do { if ((var)) { cudaFree((var)); (var) = NULL; } } while (0)
#define CUDA_INIT(var, type, size) cudaMalloc((void **)&(var), (size) * sizeof(type))
#define CUDA_MEMCPY_H2D(dst, src, type, size) \
    cudaMemcpy(dst, src, sizeof(type) * (size), cudaMemcpyHostToDevice)

int main(void)
{
    // 4x4 sparse matrix A (nnz = 9), triplets listed sorted by row:
    //   | 1 0 2 3 |
    //   | 0 4 0 0 |
    //   | 5 0 6 7 |
    //   | 0 8 0 9 |
    int A_num_rows = 4;
    int A_num_cols = 4;
    int A_nnz      = 9;
    int    hA_rows[]    = { 0, 0, 0, 1, 2, 2, 2, 3, 3 };
    int    hA_columns[] = { 0, 2, 3, 1, 0, 2, 3, 1, 3 };
    double hA_values[]  = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0 };
    double hX[]         = { 1.0, 2.0, 3.0, 4.0 };
    double hY[]         = { 0.0, 0.0, 0.0, 0.0 };
    double hY_result[]  = { 19.0, 8.0, 51.0, 52.0 }; // expected A * x
    double alpha = 1.0;
    double beta  = 0.0;

    // Device copies of the COO matrix and the dense vectors.
    int    *dA_rows, *dA_columns;
    double *dA_values, *dX, *dY;
    CUDA_CHECK(CUDA_INIT(dA_rows,    int,    A_nnz));
    CUDA_CHECK(CUDA_INIT(dA_columns, int,    A_nnz));
    CUDA_CHECK(CUDA_INIT(dA_values,  double, A_nnz));
    CUDA_CHECK(CUDA_INIT(dX,         double, A_num_cols));
    CUDA_CHECK(CUDA_INIT(dY,         double, A_num_rows));
    CUDA_CHECK(CUDA_MEMCPY_H2D(dA_rows,    hA_rows,    int,    A_nnz));
    CUDA_CHECK(CUDA_MEMCPY_H2D(dA_columns, hA_columns, int,    A_nnz));
    CUDA_CHECK(CUDA_MEMCPY_H2D(dA_values,  hA_values,  double, A_nnz));
    CUDA_CHECK(CUDA_MEMCPY_H2D(dX,         hX,         double, A_num_cols));
    CUDA_CHECK(CUDA_MEMCPY_H2D(dY,         hY,         double, A_num_rows));

    cusparseHandle_t     handle = NULL;
    cusparseSpMatDescr_t matA;
    cusparseDnVecDescr_t vecX, vecY;
    void *d_buffer = NULL;
    CUSPARSE_CHECK(cusparseCreate(&handle));
    CUSPARSE_CHECK(cusparseCreateCoo(&matA, A_num_rows, A_num_cols, A_nnz,
                                     dA_rows, dA_columns, dA_values,
                                     cuIdxDouOrSin, indexBaseC, cuDouOrSin));

    // ---- Re-sort the triplets by COLUMN (the problematic ordering) -------
    size_t  bufferSize = 0;
    double *dA_values_sorted;
    int    *d_permutation;
    CUDA_CHECK(CUDA_INIT(dA_values_sorted, double, A_nnz));
    CUDA_CHECK(CUDA_INIT(d_permutation,    int,    A_nnz));

    // Sparse-vector view used to gather the values into sorted order.
    cusparseSpVecDescr_t vec_permutation;
    CUSPARSE_CHECK(cusparseCreateSpVec(&vec_permutation, A_nnz, A_nnz,
                                       d_permutation, dA_values_sorted,
                                       cuIdxDouOrSin, indexBaseC, cuDouOrSin));
    cusparseDnVecDescr_t vec_values;
    CUSPARSE_CHECK(cusparseCreateDnVec(&vec_values, A_nnz, dA_values,
                                       cuDouOrSin));

    CUSPARSE_CHECK(cusparseXcoosort_bufferSizeExt(handle, A_num_rows,
                                                  A_num_cols, A_nnz, dA_rows,
                                                  dA_columns, &bufferSize));
    CUDA_CHECK(cudaMalloc(&d_buffer, bufferSize));

    // Identity permutation; coosort permutes it alongside the indices.
    int h_permutation[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8 };
    CUDA_CHECK(CUDA_MEMCPY_H2D(d_permutation, h_permutation, int, A_nnz));
    CUSPARSE_CHECK(cusparseXcoosortByColumn(handle, A_num_rows, A_num_cols,
                                            A_nnz, dA_rows, dA_columns,
                                            d_permutation, d_buffer));
    // dA_values_sorted[i] = dA_values[d_permutation[i]]
    CUSPARSE_CHECK(cusparseGather(handle, vec_values, vec_permutation));

    // Print the sorted triplets for inspection.
    double hA_values_sorted[9]; // A_nnz entries
    CUDA_CHECK(cudaMemcpy(hA_rows, dA_rows, A_nnz * sizeof(int),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(hA_columns, dA_columns, A_nnz * sizeof(int),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(hA_values_sorted, dA_values_sorted,
                          A_nnz * sizeof(double), cudaMemcpyDeviceToHost));
    for (int i = 0; i < A_nnz; i++) {
        printf("hA_rows[%d] = %d\n", i, hA_rows[i]);
        printf("hA_columns[%d] = %d\n", i, hA_columns[i]);
        printf("hA_values_sorted[%d] = %f\n", i, hA_values_sorted[i]);
    }

    // ---- SpMV with the column-sorted matrix ------------------------------
    cusparseSpMatDescr_t matA_sorted;
    CUSPARSE_CHECK(cusparseCreateCoo(&matA_sorted, A_num_rows, A_num_cols,
                                     A_nnz, dA_rows, dA_columns,
                                     dA_values_sorted, cuIdxDouOrSin,
                                     indexBaseC, cuDouOrSin));
    CUSPARSE_CHECK(cusparseCreateDnVec(&vecX, A_num_cols, dX, cuDouOrSin));
    CUSPARSE_CHECK(cusparseCreateDnVec(&vecY, A_num_rows, dY, cuDouOrSin));

    size_t bufferSizeMV = 0;
    void  *d_bufferMV   = NULL;
    CUSPARSE_CHECK(cusparseSpMV_bufferSize(handle,
                                           CUSPARSE_OPERATION_NON_TRANSPOSE,
                                           &alpha, matA_sorted, vecX, &beta,
                                           vecY, cuDouOrSin,
                                           CUSPARSE_SPMV_ALG_DEFAULT,
                                           &bufferSizeMV));
    CUDA_CHECK(cudaMalloc(&d_bufferMV, bufferSizeMV));
    CUSPARSE_CHECK(cusparseSpMV(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                &alpha, matA_sorted, vecX, &beta, vecY,
                                cuDouOrSin, CUSPARSE_SPMV_ALG_DEFAULT,
                                d_bufferMV));

    // ---- Check the result on the host ------------------------------------
    CUDA_CHECK(cudaMemcpy(hY, dY, A_num_rows * sizeof(double),
                          cudaMemcpyDeviceToHost));
    int correct = 1;
    for (int i = 0; i < A_num_rows; i++) {
        printf("hY[%d] = %f\n", i, hY[i]);
        // Exact comparison is acceptable here: all inputs are small integers,
        // so the products and sums are exactly representable in double.
        if (hY[i] != hY_result[i]) {
            correct = 0;
        }
    }
    if (correct)
        printf("spmv_coo_example test PASSED\n");
    else
        printf("spmv_coo_example test FAILED: wrong result\n");

    // ---- Cleanup: destroy ALL descriptors (the original leaked vecX,
    // ---- vecY and vec_values), then free device memory -------------------
    CUSPARSE_CHECK(cusparseDestroyDnVec(vecX));
    CUSPARSE_CHECK(cusparseDestroyDnVec(vecY));
    CUSPARSE_CHECK(cusparseDestroyDnVec(vec_values));
    CUSPARSE_CHECK(cusparseDestroySpVec(vec_permutation));
    CUSPARSE_CHECK(cusparseDestroySpMat(matA));
    CUSPARSE_CHECK(cusparseDestroySpMat(matA_sorted));
    CUSPARSE_CHECK(cusparseDestroy(handle));
    CUDA_FREE(d_bufferMV);
    CUDA_FREE(d_buffer);
    CUDA_FREE(d_permutation);
    CUDA_FREE(dA_values_sorted);
    CUDA_FREE(dA_rows);
    CUDA_FREE(dA_columns);
    CUDA_FREE(dA_values);
    CUDA_FREE(dX);
    CUDA_FREE(dY);
    return EXIT_SUCCESS;
}
Yes, COO is row-major only: the entries must be sorted by row index. My suggestion is to check the storage-format details in the cuSPARSE documentation: https://docs.nvidia.com/cuda/cusparse/index.html#coordinate-coo
I am trying to use
cusparseSpMV
to multiply a sparse matrix (COO format) by a vector. But it does not return the correct result for a matrix whose entries are stored in column-major (sorted-by-column) order. It seems that the COO matrix must be stored in row-major order?