NVIDIA / AMGX

Distributed multigrid linear solver library on GPU
468 stars 136 forks source link

Seeking Advice on Configuring AMGX for Solving a Poorly-Conditioned Matrix Equation #286

Closed fxuecnu closed 7 months ago

fxuecnu commented 7 months ago

Describe the issue

A clear and concise description of what the issue is.

I'm working on solving a matrix equation that seems to be ill-conditioned, and I'm encountering convergence issues with all the configuration files provided in AMGX. I've attached a JPEG image of the matrix to give an idea of its structure.

Despite confirming that the matrix rank is appropriate, I'm struggling with the equation's principal axis, which contains many zero values. This characteristic might be contributing to the convergence problems.

I'm seeking advice on how to properly configure AMGX for this type of equation. To provide more context, I've also attached a snippet of my code that illustrates how I'm constructing and attempting to solve this system.

Any suggestions or guidance on the appropriate AMGX configuration settings for this scenario would be greatly appreciated.

Environment information: (I have successfully build the lib)

AMGX solver configuration I just used the json files in AMGX\core\configs

// below is the code

/ Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved.

include

include

include

include

include "cuda_runtime.h"

include

define M_PI 3.14159265358979323846

/ CUDA error macro /

define CUDA_SAFE_CALL(call) do { \

cudaError_t err = call; \ if(cudaSuccess != err) { \ fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \ FILE, LINE, cudaGetErrorString( err) ); \ exit(EXIT_FAILURE); \ } } while (0)

//#define AMGX_DYNAMIC_LOADING //#undef AMGX_DYNAMIC_LOADING

define MAX_MSG_LEN 4096

/ standard or dynamically load library /

ifdef AMGX_DYNAMIC_LOADING

include "amgx_capi.h"

else

include "amgx_c.h"

endif

/ print error message and exit / void errAndExit(const char *err) { printf("%s\n", err); fflush(stdout); MPI_Abort(MPI_COMM_WORLD, 1); exit(1); }

/ print callback (could be customized) / void print_callback(const char *msg, int length) { int rank; MPI_Comm_rank(MPI_COMM_WORLD, &rank);

if (rank == 0) { printf("%s", msg); }

}

/ print usage and exit / void printUsageAndExit() { char msg[MAX_MSG_LEN] = "Usage: mpirun [-n nranks] ./example [-mode [dDDI | dDFI | dFFI]] [-p nx ny] [-c config_file] [-amg \"variable1=value1 ... variable3=value3\"] [-gpu] [-it k]\n"; strcat(msg, " -mode: select the solver mode\n"); strcat(msg, " -p nx ny: select x- and y-dimensions of the 2D (5-points) local discretization of the Poisson operator (the global problem size will be nranksnxny)\n"); strcat(msg, " -c: set the amg solver options from the config file\n"); strcat(msg, " -amg: set the amg solver options from the command line\n"); print_callback(msg, MAX_MSG_LEN); MPI_Finalize(); exit(0); }

/ parse parameters / int findParamIndex(char *argv, int argc, const char parm) { int count = 0; int index = -1;

for (int i = 0; i < argc; i++)
{
    if (strncmp(argv[i], parm, 100) == 0)
    {
        index = i;
        count++;
    }
}

if (count == 0 || count == 1)
{
    return index;
}
else
{
    char msg[MAX_MSG_LEN];
    sprintf(msg, "ERROR: parameter %s has been specified more than once, exiting\n", parm);
    print_callback(msg, MAX_MSG_LEN);
    exit(1);
}

return -1;

}

void continuityXdirKernel( int n, int nx, int ny, double dx, int ndd, double u, double v, double h, double detadt, void values, void rhs, int row_ptrs) {

/* -------------------- continuity equation ---------------------------
-----------------------detadt = dhu/dx + dhv/dy------------------------
detadt is constant, so it is not included in the matrix. In other words,
the pivot element is 0 for the continuity equation.
-------------------------------------------------------------------- */

for (int i = 0; i < n; i++) {

    // note that here n, nx, ny is different from those of momentum equation

    int row = i / nx;                               // row index in the 2D work array       
    int col = i % nx;                               // col index in the 1D work array    
    int idx = 2 * col + 1 + (2 * nx + 1) * row;     // pivot index in the row of sparse matrix
    int nnz = row_ptrs[idx];                        // index of the first non-zero element in the row

    // open boundary condition for water level
    if (ndd[i] == 2) {

        ((double*)values)[nnz] = 1.0;
        ((double*)rhs)[idx] = 0.0;

    }

    // in other cases, just discrete the equation 
    else {

        // left u coefficient
        ((double*)values)[nnz] = -0.5 * (h[i] + h[i - 1]) / dx;

        // right u coefficient
        ((double*)values)[nnz + 1] = 0.5 * (h[i] + h[min(i + 1, nx - 1)]) / dx;

        //rhs values
        ((double*)rhs)[idx] = detadt[i];

    }

}

}

void momentumXdirKernel(int n, int nx, int ny, double dx, double gra, double mann, double Du, int ndd, double u, double v, double eta, double h, void values, void rhs, int row_ptrs) {

/* -------------------- momentum equation u ---------------------------
* 0 = g * mann^2 / h^(4/3) * |uv| * u     // friction term
      + udu/dx + vdu/dy                   // advection term
      g * deta/dx                         // gravity term
      - d/dx (Du * du/dx)                 // diffusion term
-------------------------------------------------------------------- */

//int i = blockIdx.x * blockDim.x + threadIdx.x;
for (int i = 0; i < n; i++) {

    // note that here n, nx is different from those of continuity equation

    int row = i / nx;                               // row index in the 2D work array       
    int col = i % nx;                               // col index in the 1D work array    
    int idx = 2 * col + (2 * nx - 1) * row;         // pivot index in the row of sparse matrix
    int nnz = row_ptrs[idx];                        // index of the first non-zero element in the row

    // inner points
    if (col > 0 && col < nx - 1) {

        int i0 = col + row * (nx - 1);

        double h_u_c = 0.5 * (h[i0] + h[i0 - 1]);   // h_u_c = h_u_center
        double D_h_l = Du;
        double D_h_r = Du;

        // left U
        ((double*)values)[nnz] = -fmax(0., u[i]) / dx      // upwind advection term
            - D_h_l / dx / dx;         // diffusion term

        // left eta
        ((double*)values)[nnz + 1] = -gra / dx;               // gravity term

        // center U
        ((double*)values)[nnz + 2] = gra * pow(mann, 2) / pow(h_u_c, 4.0 / 3.0) * fabs(u[i])       // friction term
            + (fmax(0., u[i]) - fmin(0., u[i])) / dx                        // upwind advection term
            + (D_h_l + D_h_r) / dx / dx;                                    // diffusion term

        // right eta
        ((double*)values)[nnz + 3] = gra / dx;                 // gravity term

        // right U
        ((double*)values)[nnz + 4] = fmin(0., u[i]) / dx       // upwind advection term
            - D_h_r / dx / dx;          // diffusion term

        //rhs values
        ((double*)rhs)[idx] = 0.;

    }
    // left boundary
    else if (col == 0) {

        int i0 = col + row * (nx - 1);

        // open boundary condition
        if (ndd[i0] == 2) {

            // center U
            ((double*)values)[nnz] = 1.0;

            // right U
            ((double*)values)[nnz + 1] = -2.0;

            // right right U
            ((double*)values)[nnz + 2] = 1.0;

            //rhs values
            ((double*)rhs)[idx] = 0.;

        }

        // closed boundary condition
        else {

            ((double*)values)[nnz] = 1.0;
            ((double*)rhs)[idx] = 0.;

        }

    }
    // right boundary
    else if (col == nx - 1) {

        int i0 = col - 1 + row * (nx - 1);

        // open boundary condition
        if (ndd[i0] == 2) {

            // left left U
            ((double*)values)[nnz] = 1.0;

            // left U
            ((double*)values)[nnz + 1] = -2.0;

            // center U
            ((double*)values)[nnz + 2] = 1.0;

            //rhs values
            ((double*)rhs)[idx] = 0.;

        }

        // closed boundary condition
        else {

            ((double*)values)[nnz] = 1.0;
            ((double*)rhs)[idx] = 0.;

        }

    }
}

}

int main(int argc, char *argv) { //parameter parsing int pidx = 0; int pidy = 0; //MPI (with CUDA GPUs) int rank = 0; int lrank = 0; int nranks = 0; int n; int nx, ny; int gpu_count = 0; MPI_Comm amgx_mpi_comm = MPI_COMM_WORLD; //versions int major, minor; char ver, date, time; //input matrix and rhs/solution int partition_sizes = NULL; int partition_vector = NULL; int partition_vector_size = 0; //library handles AMGX_Mode mode; AMGX_config_handle cfg; AMGX_resources_handle rsrc; AMGX_matrix_handle A; AMGX_vector_handle b, x; AMGX_solver_handle solver; //status handling AMGX_SOLVE_STATUS status; / MPI init (with CUDA GPUs) / //MPI MPI_Init(&argc, &argv); MPI_Comm_size(amgx_mpi_comm, &nranks); MPI_Comm_rank(amgx_mpi_comm, &rank); //CUDA GPUs CUDA_SAFE_CALL(cudaGetDeviceCount(&gpu_count)); lrank = rank % gpu_count; CUDA_SAFE_CALL(cudaSetDevice(lrank)); printf("Process %d selecting device %d\n", rank, lrank);

/* check arguments */
if (argc == 1)
{
    printUsageAndExit();
}

/* load the library (if it was dynamically loaded) */

ifdef AMGX_DYNAMIC_LOADING

void *lib_handle = NULL;

ifdef _WIN32

lib_handle = amgx_libopen("amgxsh.dll");

else

lib_handle = amgx_libopen("libamgxsh.so");

endif

if (lib_handle == NULL)
{
    errAndExit("ERROR: can not load the library");
}

//load all the routines
if (amgx_liblink_all(lib_handle) == 0)
{
    amgx_libclose(lib_handle);
    errAndExit("ERROR: corrupted library loaded\n");
}

endif

/* init */
AMGX_SAFE_CALL(AMGX_initialize());
AMGX_SAFE_CALL(AMGX_initialize_plugins());
/* system */
AMGX_SAFE_CALL(AMGX_register_print_callback(&print_callback));
AMGX_SAFE_CALL(AMGX_install_signal_handler());

/* get api and build info */
if ((pidx = findParamIndex(argv, argc, "--version")) != -1)
{
    AMGX_get_api_version(&major, &minor);
    printf("amgx api version: %d.%d\n", major, minor);
    AMGX_get_build_info_strings(&ver, &date, &time);
    printf("amgx build version: %s\nBuild date and time: %s %s\n", ver, date, time);
    AMGX_SAFE_CALL(AMGX_finalize_plugins());
    AMGX_SAFE_CALL(AMGX_finalize());
    /* close the library (if it was dynamically loaded) */

ifdef AMGX_DYNAMIC_LOADING

    amgx_libclose(lib_handle);

endif

    MPI_Finalize();
    exit(0);
}

/* get mode */
if ((pidx = findParamIndex(argv, argc, "-mode")) != -1)
{
    if (strncmp(argv[pidx + 1], "dDDI", 100) == 0)
    {
        mode = AMGX_mode_dDDI;
    }
    else if (strncmp(argv[pidx + 1], "dDFI", 100) == 0)
    {
        mode = AMGX_mode_dDFI;
    }
    else if (strncmp(argv[pidx + 1], "dFFI", 100) == 0)
    {
        mode = AMGX_mode_dFFI;
    }
    else
    {
        errAndExit("ERROR: invalid mode");
    }
}
else
{
    printf("Warning: No mode specified, using dDDI by default.\n");
    mode = AMGX_mode_dDDI;
}

int sizeof_m_val = ((AMGX_GET_MODE_VAL(AMGX_MatPrecision, mode) == AMGX_matDouble)) ? sizeof(double) : sizeof(float);
int sizeof_v_val = ((AMGX_GET_MODE_VAL(AMGX_VecPrecision, mode) == AMGX_vecDouble)) ? sizeof(double) : sizeof(float);
/* create config */
pidx = findParamIndex(argv, argc, "-amg");
pidy = findParamIndex(argv, argc, "-c");

if ((pidx != -1) && (pidy != -1))
{
    printf("%s\n", argv[pidx + 1]);
    AMGX_SAFE_CALL(AMGX_config_create_from_file_and_string(&cfg, argv[pidy + 1], argv[pidx + 1]));
}
else if (pidy != -1)
{
    AMGX_SAFE_CALL(AMGX_config_create_from_file(&cfg, argv[pidy + 1]));
}
else if (pidx != -1)
{
    printf("%s\n", argv[pidx + 1]);
    AMGX_SAFE_CALL(AMGX_config_create(&cfg, argv[pidx + 1]));
}
else
{
    errAndExit("ERROR: no config was specified");
}

/* example of how to handle errors */
//char msg[MAX_MSG_LEN];
//AMGX_RC err_code = AMGX_resources_create(NULL, cfg, &amgx_mpi_comm, 1, &lrank);
//AMGX_SAFE_CALL(AMGX_get_error_string(err_code, msg, MAX_MSG_LEN));
//printf("ERROR: %s\n",msg);
/* switch on internal error handling (no need to use AMGX_SAFE_CALL after this point) */
AMGX_SAFE_CALL(AMGX_config_add_parameters(&cfg, "exception_handling=1"));
/* create resources, matrix, vector and solver */
AMGX_resources_create(&rsrc, cfg, &amgx_mpi_comm, 1, &lrank);
AMGX_matrix_create(&A, rsrc, mode);
AMGX_vector_create(&x, rsrc, mode);
AMGX_vector_create(&b, rsrc, mode);
AMGX_solver_create(&solver, rsrc, mode, cfg);
//generate 3D Poisson matrix, [and rhs & solution]
//WARNING: use 1 ring for aggregation and 2 rings for classical path
int nrings; //=1; //=2;
AMGX_config_get_default_number_of_rings(cfg, &nrings);
//printf("nrings=%d\n",nrings);
int nglobal = 0;

if  ((pidx = findParamIndex(argv, argc, "-p")) != -1)
{
    nx = atoi(argv[++pidx]);
    ny = atoi(argv[++pidx]);
    n  = nx * ny; // each rank has the strip of size nx*ny
    nglobal = n * nranks; // global domain is just those strips stacked one on each other
}
else
{
    printf("Please, use '-p nx ny' parameter for this example\n");
    exit(1);
}

nx = 5000;
ny = 1;
n = nx * ny;
nglobal = n * nranks;

/* generate the matrix
   In more detail, this routine will create 2D (5 point) discretization of the
   Poisson operator. The discretization is performed on a the 2D domain consisting
   of nx and ny points in x- and y-dimension respectively. Each rank processes it's
   own part of discretization points. Finally, the rhs and solution will be set to
   a vector of ones and zeros, respectively. */

int *row_ptrs = (int*)malloc((2 * n + ny + 1) * sizeof(int));
int64_t *col_idxs = (int64_t*)malloc(6 * (2 * n + ny) * sizeof(int64_t));

void *values = malloc(6 * (2 * n + ny) * sizeof(double)); // maximum nnz
int nnz = 0;
int64_t count = 0;
int64_t start_idx = rank * n;

//for (int i = 0; i < n; i ++)
//{
//    row_ptrs[i] = nnz;

//    if (rank > 0 || i > ny-1)
//    {
//        col_idxs[nnz] = (i + start_idx - ny);

//        if (sizeof_m_val == 4)
//        {
//            ((float *)values)[nnz] = -1.f;
//        }
//        else if (sizeof_m_val == 8)
//        {
//            ((double *)values)[nnz] = -1.;
//        }

//        nnz++;
//    }

//    if (i % ny != 0)
//    {
//        col_idxs[nnz] = (i + start_idx - 1);

//        if (sizeof_m_val == 4)
//        {
//            ((float *)values)[nnz] = -1.f;
//        }
//        else if (sizeof_m_val == 8)
//        {
//            ((double *)values)[nnz] = -1.;
//        }

//        nnz++;
//    }

//    {
//        col_idxs[nnz] = (i + start_idx);

//        if (sizeof_m_val == 4)
//        {
//            ((float *)values)[nnz] = 4.f;
//        }
//        else if (sizeof_m_val == 8)
//        {
//            ((double *)values)[nnz] = 4.;
//        }

//        nnz++;
//    }

//    if ((i + 1) % ny != 0)
//    {
//        col_idxs[nnz] = (i + start_idx + 1);

//        if (sizeof_m_val == 4)
//        {
//            ((float *)values)[nnz] = -1.f;
//        }
//        else if (sizeof_m_val == 8)
//        {
//            ((double *)values)[nnz] = -1.;
//        }

//        nnz++;
//    }

//    if ( (rank != nranks - 1) || (i / ny != (nx - 1)) )
//    {
//        col_idxs[nnz] = (i + start_idx + ny);

//        if (sizeof_m_val == 4)
//        {
//            ((float *)values)[nnz] = -1.f;
//        }
//        else if (sizeof_m_val == 8)
//        {
//            ((double *)values)[nnz] = -1.;
//        }

//        nnz++;
//    }
//}

//row_ptrs[n] = nnz;

///////////////////////////////////////////////////////
///////////////////////////////////////////////////////

double dx = .5;
double gra = 9.81;
double mann = 0.012;
double Du = 0.5;

double Trange = 1.0;
double Tperiod = 43200.0;

int* ndd;
double* u;
double* v;
double* eta;
double* h;
double* detadt;
void* rhs;

// Allocate memory on host
ndd = (int*)malloc(n * sizeof(int));
u = (double*)malloc((n + ny) * sizeof(double));
v = (double*)malloc((n + nx) * sizeof(double));
eta = (double*)malloc(n * sizeof(double));
h = (double*)malloc(n * sizeof(double));
detadt = (double*)malloc(n * sizeof(double));
rhs = malloc((2 * n + ny) * sizeof(double));

// Initialize host memory
for (int i = 0; i < n; i++) {

    int row = i / nx;                               // row index in the 2D work array   
    int col = i % nx;                               // col index in the 1D work array

    if (col == 0) {
        ndd[i] = 2;
    }
    else {
        ndd[i] = 0;
    }

    eta[i] = 0.0;
    h[i] = 2.0;
    detadt[i] = M_PI * Trange / Tperiod;

}

for (int i = 0; i < n + ny; i++) {
    u[i] = 0.0;
}

for (int i = 0; i < n + nx; i++) {
    v[i] = 0.0;
}

// initialize sparse matrix (determine row_ptrs and col_idxs)

nnz = 0;

for (int row = 0; row < ny; row++) {

    for (int i = 0; i < 2 * nx + 1; i++) {

        // continuity equation
        if (i % 2 == 1) {

            int col = (i - 1) / 2;
            int idx = i + (2 * nx + 1) * row;
            int i0 = col + row * nx;
            row_ptrs[idx] = nnz;

            if (ndd[i0] == 2) {

                col_idxs[nnz++] = idx;

            }
            else {

                col_idxs[nnz++] = idx - 1;
                col_idxs[nnz++] = idx + 1;

            }

        }

        // momentum equation
        if (i % 2 == 0) {

            int col = i / 2;
            int idx = i + (2 * nx + 1) * row;
            int i0 = col + row * nx;
            row_ptrs[idx] = nnz;

            if (col > 0 && col < nx) {

                col_idxs[nnz++] = idx - 2;
                col_idxs[nnz++] = idx - 1;
                col_idxs[nnz++] = idx;
                col_idxs[nnz++] = idx + 1;
                col_idxs[nnz++] = idx + 2;

            }
            else if (col == 0) {

                if (ndd[i0] == 2) {

                    printf("idx = %d, nnz = %d\n", idx, nnz);

                    col_idxs[nnz++] = idx;
                    col_idxs[nnz++] = idx + 2;
                    col_idxs[nnz++] = idx + 4;

                }
                else {

                    col_idxs[nnz++] = idx;

                }
            }
            else if (col == nx) {

                int i0 = col - 1 + row * nx;

                if (ndd[i0] == 2) {

                    col_idxs[nnz++] = idx - 4;
                    col_idxs[nnz++] = idx - 2;
                    col_idxs[nnz++] = idx;

                }
                else {

                    col_idxs[nnz++] = idx;

                }

            }

        }

    }

}

row_ptrs[2 * n + ny] = nnz;

continuityXdirKernel(n, nx, ny, dx, ndd, u, v, h, detadt, (double*)values, rhs, row_ptrs);

momentumXdirKernel(n + ny, nx + 1, ny, dx, gra, mann, Du, ndd, u, v, eta, h, (double*)values, rhs, row_ptrs);

///////////////////////////////////////////////////////
///////////////////////////////////////////////////////

    FILE* fp;
fp = fopen("values.bin", "wb");
fwrite(values, sizeof(double), 6 * (2 * n + ny), fp);
fclose(fp);

fp = fopen("rhs.bin", "wb");
fwrite(rhs, sizeof(double), (2 * n + ny), fp);
fclose(fp);

fp = fopen("row_ptrs.bin", "wb");
fwrite(row_ptrs, sizeof(int), (2 * n + ny + 1), fp);
fclose(fp);

fp = fopen("col_idxs.bin", "wb");
fwrite(col_idxs, sizeof(int64_t), 6 * (2 * n + ny), fp);
fclose(fp);

AMGX_matrix_upload_all_global(A,
                               2 * n + ny, 2 * n + ny, nnz, 1, 1,
                              row_ptrs, col_idxs, values, NULL,
                              nrings, nrings, NULL);
free(values);
free(row_ptrs);
free(col_idxs);
/* generate the rhs and solution */
void *h_x = malloc( (2 * n + ny) * sizeof_v_val);
//for (int i = 0; i < 2 * n + ny; i++) { 
//    if (i % 2 == 0) {
//        ((double*)h_x)[i] = 0.0;
//    }
//    else {
//        ((double*)h_x)[i] = 0.0;
//    }   
//}

memset(h_x, 50.0, (2 * n + ny) * sizeof_v_val);

/* set the connectivity information (for the vector) */
AMGX_vector_bind(x, A);
AMGX_vector_bind(b, A);
/* upload the vector (and the connectivity information) */
AMGX_vector_upload(x, 2 * n + ny, 1, h_x);
AMGX_vector_upload(b, 2 * n + ny, 1, rhs);
/* solver setup */
//MPI barrier for stability (should be removed in practice to maximize performance)
MPI_Barrier(amgx_mpi_comm);
AMGX_solver_setup(solver, A);
/* solver solve */
//MPI barrier for stability (should be removed in practice to maximize performance)
MPI_Barrier(amgx_mpi_comm);
AMGX_solver_solve(solver, b, x);
/* example of how to change parameters between non-linear iterations */
//AMGX_config_add_parameters(&cfg, "config_version=2, default:tolerance=1e-12");
//AMGX_solver_solve(solver, b, x);
/* example of how to replace coefficients between non-linear iterations */
//AMGX_matrix_replace_coefficients(A, n, nnz, values, diag);
//AMGX_solver_setup(solver, A);
//AMGX_solver_solve(solver, b, x);
AMGX_solver_get_status(solver, &status);
/* example of how to get (the local part of) the solution */
//int sizeof_v_val;
//sizeof_v_val = ((NVAMG_GET_MODE_VAL(NVAMG_VecPrecision, mode) == NVAMG_vecDouble))? sizeof(double): sizeof(float);
// 
void* SLN = malloc((2 * n + ny) * sizeof(double));
AMGX_vector_download(x, SLN);

fp = fopen("SLN.bin", "wb");
fwrite(SLN, sizeof(double), (2 * n + ny), fp);
fclose(fp);

//free(result_host);
/* destroy resources, matrix, vector and solver */
AMGX_solver_destroy(solver);
AMGX_vector_destroy(x);
AMGX_vector_destroy(b);
AMGX_matrix_destroy(A);
AMGX_resources_destroy(rsrc);
/* destroy config (need to use AMGX_SAFE_CALL after this point) */
AMGX_SAFE_CALL(AMGX_config_destroy(cfg))
/* shutdown and exit */
AMGX_SAFE_CALL(AMGX_finalize_plugins())
AMGX_SAFE_CALL(AMGX_finalize())
/* close the library (if it was dynamically loaded) */

ifdef AMGX_DYNAMIC_LOADING

amgx_libclose(lib_handle);

endif

MPI_Finalize();
CUDA_SAFE_CALL(cudaDeviceReset());
return status;

}

//

sparse_matrix