NVIDIA / AMGX

Distributed multigrid linear solver library on GPU
468 stars 136 forks source link

Caught amgx exception: getFixedSizesForView should not be called by a non-distributed matrix #317

Open fxuecnu opened 1 week ago

fxuecnu commented 1 week ago

I generate a block matrix with 3x3 blocks. and I used FGMRES_AGGREGATION_JACOBI.json configureation to solve the system but it returns:

Process 0 selecting device 0
AMGX version 2.2.0.132-opensource
Built on Nov 23 2023, 14:14:00
Compiled with CUDA Runtime 11.8, using CUDA driver 12.2
Cannot read file as JSON object, trying as AMGX config
Converting config string to current config version
Parsing configuration string: tolerance=1e-16 ;
Cannot read file as JSON object, trying as AMGX config
Converting config string to current config version
Parsing configuration string: exception_handling=1 ;
Using Normal MPI (Hostbuffer) communicator...
AMG Grid:
         Number of Levels: 22
            LVL         ROWS               NNZ    SPRSTY       Mem (GB)
         --------------------------------------------------------------
           0(D)         1200             17280     0.012       5.37e-05
           1(D)         1161             16713    0.0124         0.0003
           2(D)         1116             16074    0.0129       0.000288
           3(D)         1071             15417    0.0134       0.000277
           4(D)         1023             14715    0.0141       0.000264
           5(D)          987             14211    0.0146       0.000255
           6(D)          927             13329    0.0155       0.000239
           7(D)          864             12384    0.0166       0.000222
           8(D)          828             11862    0.0173       0.000213
           9(D)          774             11070    0.0185       0.000198
          10(D)          714             10188      0.02       0.000183
          11(D)          654              9306    0.0218       0.000167
          12(D)          594              8424    0.0239       0.000151
          13(D)          534              7542    0.0264       0.000135
          14(D)          474              6660    0.0296       0.000119
          15(D)          414              5778    0.0337       0.000104
          16(D)          354              4896    0.0391       8.77e-05
          17(D)          294              4014    0.0464       7.19e-05
          18(D)          234              3132    0.0572       5.61e-05
          19(D)          174              2250    0.0743       4.05e-05
          20(D)          129              1683     0.101       3.03e-05
          21(D)           96              1242     0.135       2.06e-05
         --------------------------------------------------------------
         Grid Complexity: 12.18
         Operator Complexity: 12.0469
         Total Memory Usage: 0.00347444 GB
         --------------------------------------------------------------
           iter      Mem Usage (GB)       residual           rate
         --------------------------------------------------------------
            Ini             1.77295   1.417614e-03
Caught amgx exception: getFixedSizesForView should not be called by a non-distributed matrix
 at: C:/study/AMGX/AMGX-2.3.0/base/include\matrix.h:987
Stack trace:

AMGX ERROR: file C:\study\AMGX\AMGX-2.3.0\base\src\amgx_c.cu line   2799
AMGX ERROR: Internal error.

job aborted:
[ranks] message

Environment information: OS: Windows 11 CUDA runtime: CUDA 11. AMGX version: v2.3.0 NVIDIA driver: 12.2 NVIDIA GPU: 3060ti

AMGX solver configuration

{ "config_version": 2, "solver": { "preconditioner": { "error_scaling": 0, "print_grid_stats": 1, "algorithm": "AGGREGATION", "solver": "AMG", "smoother": "BLOCK_JACOBI", "presweeps": 0, "selector": "SIZE_2", "coarse_solver": "NOSOLVER", "max_iters": 1, "min_coarse_rows": 32, "relaxation_factor": 0.75, "scope": "amg", "max_levels": 100, "postsweeps": 3, "cycle": "V" }, "use_scalar_norm": 1, "solver": "FGMRES", "print_solve_stats": 1, "obtain_timings": 1, "max_iters": 1000, "monitor_residual": 1, "gmres_n_restart": 32, "convergence": "RELATIVE_INI", "scope": "main", "tolerance": 0.0001, "norm": "L2" } }

Matrix Data The form of matrix is in 3x3 block CSR form col_idxs.txt rhs.txt row_ptrs.txt values.txt

the shape of the sparse matrix is like this:

image the black boxes are blocks and the red points are non-zero values.

the values of the matrix points are like this:

image

The challenge in solving this linear equation lies in the small diagonal values of the matrix. To address this, I attempted to use a block matrix approach.

Reproduction steps

Below is the code snippet. You can directly copy the entire code into a .c file and run it. In the file, you only need to adjust the value of nx and ny

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <mpi.h>
#include "cuda_runtime.h"
#include <cusolverSp.h>
#include <stdint.h>

#define M_PI 3.14159265358979323846

 /* CUDA error macro */
#define CUDA_SAFE_CALL(call) do {                                 \
  cudaError_t err = call;                                         \
  if(cudaSuccess != err) {                                        \
    fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \
            __FILE__, __LINE__, cudaGetErrorString( err) );       \
    exit(EXIT_FAILURE);                                           \
  } } while (0)

#define CUSOLVER_SAFE_CALL(call) do { \
    cusolverStatus_t status = call; \
    if (status != CUSOLVER_STATUS_SUCCESS) { \
        fprintf(stderr, "CUSOLVER error in %s:%d. Code:%d\n", __FILE__, __LINE__, status); \
        exit(1); \
    } \
} while(0)

//#define AMGX_DYNAMIC_LOADING
//#undef AMGX_DYNAMIC_LOADING
#define MAX_MSG_LEN 4096

/* standard or dynamically load library */
#ifdef AMGX_DYNAMIC_LOADING
#include "amgx_capi.h"
#else
#include "amgx_c.h"
#endif

/* print error message and exit */
void errAndExit(const char* err)
{
    printf("%s\n", err);
    fflush(stdout);
    MPI_Abort(MPI_COMM_WORLD, 1);
    exit(1);
}

/* print callback (could be customized) */
void print_callback(const char* msg, int length)
{
    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    if (rank == 0) { printf("%s", msg); }
}

/* print usage and exit */
void printUsageAndExit()
{
    char msg[MAX_MSG_LEN] = "Usage: mpirun [-n nranks] ./example [-mode [dDDI | dDFI | dFFI]] [-p nx ny] [-c config_file] [-amg \"variable1=value1 ... variable3=value3\"] [-gpu] [-it k]\n";
    strcat(msg, "     -mode:                select the solver mode\n");
    strcat(msg, "     -p nx ny:             select x- and y-dimensions of the 2D (5-points) local discretization of the Poisson operator (the global problem size will be nranks*nx*ny)\n");
    strcat(msg, "     -c:                   set the amg solver options from the config file\n");
    strcat(msg, "     -amg:                 set the amg solver options from the command line\n");
    print_callback(msg, MAX_MSG_LEN);
    MPI_Finalize();
    exit(0);
}

/* parse parameters */
int findParamIndex(char** argv, int argc, const char* parm)
{
    int count = 0;
    int index = -1;

    for (int i = 0; i < argc; i++)
    {
        if (strncmp(argv[i], parm, 100) == 0)
        {
            index = i;
            count++;
        }
    }

    if (count == 0 || count == 1)
    {
        return index;
    }
    else
    {
        char msg[MAX_MSG_LEN];
        sprintf(msg, "ERROR: parameter %s has been specified more than once, exiting\n", parm);
        print_callback(msg, MAX_MSG_LEN);
        exit(1);
    }

    return -1;
}

double dhudx(double hw, double he, double uw, double ue, double dx)
{
    return (he * ue - hw * uw) / dx;
}

double dhvdy(double hb, double ht, double vb, double vt, double dx)
{
    return (ht * vt - hb * vb) / dx;
}

double gdetadx(double gra, double etaw, double etae, double dx)
{
    return gra * (etae - etaw) / dx;
}

double friction(double gra, double mann, double h, double u, double v)
{
    return gra * mann * mann / pow(h, 4.0 / 3.0) * sqrt(u * u + v * v) * u;
}

void SWEValKernel(
    int n,
    int nx,
    int ny,
    int block_dim,
    double dx,
    double gra,
    double mann,
    double eddy,
    int* ndd,
    int* nddu,
    int* nddv,
    double* eta,
    double* u,
    double* v,
    double* h,
    double* detadt,
    void* values,
    void* rhs,
    int* row_ptrs,
    int* pivot_offset) {

    for (int i = 0; i < n; i++) {

        int row = i / nx;
        int col = i % nx;
        int nnz = row_ptrs[i];
        int offset = pivot_offset[i];
        int nnz0 = 0;
        int nnz1 = 0;
        int nnz2 = 0;
        int nnz3 = 0;
        int nnz4 = 0;

        /* continuity equation start*/

        nnz2 = block_dim * block_dim * (nnz + offset);
        if(col > 0){
            nnz0 = block_dim * block_dim * (nnz + offset - 2);
            nnz1 = block_dim * block_dim * (nnz + offset - 1);
        }
        else {
            nnz0 = block_dim * block_dim * (nnz + offset - 2);
        }

        int idx = block_dim * i;

        if (ndd[i] == 2) { // open boundary
            ((double*)values)[nnz2] = 1.0;
            ((double*)rhs)[idx] = -eta[i];
        }
        else {
            double he = (col < nx - 1) ? 0.5 * (h[i] + h[i + 1]) : 0.0;
            double hw = (col > 0) ? 0.5 * (h[i - 1] + h[i]) : 0.0;
            double ht = (row < ny - 1) ? 0.5 * (h[i] + h[i + nx]) : 0.0;
            double hb = (row > 0) ? 0.5 * (h[i - nx] + h[i]) : 0.0;

            double ue = (col < nx - 1) ? u[i] : 0.0;
            double uw = (col > 0) ? u[i - 1] : 0.0;
            double vn = (row < ny - 1) ? v[i] : 0.0;
            double vs = (row > 0) ? v[i - nx] : 0.0;

            double dhudx0 = dhudx(hw, he, uw, ue, dx);
            double dhvdy0 = dhvdy(hb, ht, vs, vn, dx);

            if (row > 0) {
                ((double*)values)[nnz0 + 2] = -hb / dx;
            }

            if (col > 0) {
                ((double*)values)[nnz1 + 1] = -hw / dx;
            }

            if (col < nx - 1) {
                ((double*)values)[nnz2 + 1] = he / dx;
            }

            if (row < ny - 1) {
                ((double*)values)[nnz2 + 2] = ht / dx;
            }

            ((double*)rhs)[idx] = -dhudx0 - dhvdy0 + detadt[i];

        }// continuity equation end

        /* X momentom equation start */

        double du = 1e-8;

        nnz2 = block_dim * block_dim * (nnz + offset) + block_dim;

        if (col > 0) {
            nnz0 = block_dim * block_dim * (nnz + offset - 2) + block_dim;
            nnz1 = block_dim * block_dim * (nnz + offset - 1) + block_dim;
        }
        else {
            nnz0 = block_dim * block_dim * (nnz + offset - 1) + block_dim;
        }

        if (col < nx - 1) {
            nnz3 = block_dim * block_dim * (nnz + offset + 1) + block_dim;
            nnz4 = block_dim * block_dim * (nnz + offset + 2) + block_dim;
        }
        else {
            nnz4 = block_dim * block_dim * (nnz + offset + 1) + block_dim;
        }

        idx = block_dim * i + 1;

        if (col == nx - 1) {
            ((double*)values)[nnz2 + 1] = 1.0;
            ((double*)rhs)[idx] = 0.0;
        }
        else {
            double h0 = 0.5 * (h[i] + h[i + 1]);
            double v0, v00, v01;
            double gdeta0 = gdetadx(gra, eta[i + 1], eta[i], dx);
            double frc0, frcu, dfrcu;
            double ddu0;
            int iup = i + nx;
            int idown = i - nx;

            if (row == 0) {
                v00 = nddv[i] < 0 ? 0.0 : 2.0 * v[i] - v[i + nx];
                v01 = nddv[i + 1] < 0 ? 0.0 : 2.0 * v[i + 1] - v[i + nx + 1];
                v0 = 0.25 * (v[i] + v[i + 1] + v00 + v01);
                idown = iup;
                ((double*)values)[nnz4 + 1] = -2.0 * eddy / dx / dx;
            }
            else if (row == ny - 1) {
                v00 = nddv[i] < 0 ? 0.0 : 2.0 * v[i - nx] - v[i - 2 * nx];
                v01 = nddv[i + 1] < 0 ? 0.0 : 2.0 * v[i - nx + 1] - v[i - 2 * nx + 1];
                v0 = 0.25 * (v00 + v01 + v[i - nx] + v[i - nx + 1]);
                iup = idown;
                ((double*)values)[nnz0 + 1] = -2.0 * eddy / dx / dx;
            }
            else {
                v0 = 0.25 * (v[i] + v[i + 1] + v[i - nx] + v[i - nx + 1]);
                ((double*)values)[nnz0 + 1] = -eddy / dx / dx;
                ((double*)values)[nnz4 + 1] = -eddy / dx / dx;
            }

            frc0 = friction(gra, mann, h0, u[i], v0);
            frcu = friction(gra, mann, h0, u[i] + du, v0);
            dfrcu = (frcu - frc0) / du;

            if (nddu[i] == 2 || nddu[i + 1] == 2) // open boundary
            {
                ddu0 = eddy * (2.0 * u[i] - u[idown] - u[iup]) / dx / dx;
                ((double*)values)[nnz2 + 1] = 2.0 * eddy / dx / dx + dfrcu;
            }
            else {
                ((double*)values)[nnz2 + 1] = 4.0 * eddy / dx / dx + dfrcu;
                if (col == 0) {
                    ddu0 = eddy * (4.0 * u[i] - u[i + 1] - u[idown] - u[iup]) / dx / dx;
                    ((double*)values)[nnz3 + 1] = -2.0 * eddy / dx / dx;
                }
                else if (col == nx - 2) {
                    ddu0 = eddy * (4.0 * u[i] - u[i - 1] - u[idown] - u[iup]) / dx / dx;
                    ((double*)values)[nnz1 + 1] = -2.0 * eddy / dx / dx;
                }
                else {
                    ddu0 = eddy * (4.0 * u[i] - u[i - 1] - u[i + 1] - u[idown] - u[iup]) / dx / dx;
                    ((double*)values)[nnz1 + 1] = -eddy / dx / dx;
                    ((double*)values)[nnz3 + 1] = -eddy / dx / dx;
                }
            }

            ((double*)values)[nnz2] = -gra / dx;
            ((double*)values)[nnz3] = gra / dx;
            ((double*)rhs)[idx] = -gdeta0 - frc0 - ddu0;

        }// X momentum equation end

        /* Y momentom equation start */

        nnz2 = block_dim * block_dim * (nnz + offset) + block_dim * 2;

        if (col > 0) {
            nnz1 = block_dim * block_dim * (nnz + offset - 2) + block_dim * 2;
            nnz0 = block_dim * block_dim * (nnz + offset - 1) + block_dim * 2;
        }
        else {
            nnz1 = block_dim * block_dim * (nnz + offset - 1) + block_dim * 2;
        }

        if (col < nx - 1) {
            nnz4 = block_dim * block_dim * (nnz + offset + 1) + block_dim * 2;
            nnz3 = block_dim * block_dim * (nnz + offset + 2) + block_dim * 2;
        }
        else {
            nnz3 = block_dim * block_dim * (nnz + offset + 1) + block_dim * 2;
        }

        idx = block_dim * i + 2;

        if (row == ny - 1) {
            ((double*)values)[nnz2 + 2] = 1.0;
            ((double*)rhs)[idx] = 0.0;
        }
        else {
            double h0 = 0.5 * (h[i] + h[i + nx]);
            double v0, v00, v01;
            double gdeta0 = gdetadx(gra, eta[i + nx], eta[i], dx);
            double frc0, frcu, dfrcu;
            double ddu0;
            int iup = i + 1;
            int idown = i - 1;

            if (col == 0) {
                v00 = nddu[i] < 0 ? 0.0 : 2.0 * u[i] - u[i + 1];
                v01 = nddu[i + nx] < 0 ? 0.0 : 2.0 * u[i + nx] - u[i + nx + 1];
                v0 = 0.25 * (u[i] + u[i + nx] + v00 + v01);
                idown = iup;
                ((double*)values)[nnz4 + 2] = -2.0 * eddy / dx / dx;
            }
            else if (col == nx - 1) {
                v00 = nddu[i] < 0 ? 0.0 : 2.0 * u[i - 1] - u[i - 2];
                v01 = nddu[i + nx] < 0 ? 0.0 : 2.0 * u[i - 1 + nx] - u[i - 2 * 1 + nx];
                v0 = 0.25 * (v00 + v01 + u[i - 1] + u[i - 1 + nx]);
                iup = idown;
                ((double*)values)[nnz0 + 2] = -2.0 * eddy / dx / dx;
            }
            else {
                v0 = 0.25 * (u[i] + u[i + nx] + u[i - 1] + u[i - 1 + nx]);
                ((double*)values)[nnz0 + 2] = -eddy / dx / dx;
                ((double*)values)[nnz4 + 2] = -eddy / dx / dx;
            }

            frc0 = friction(gra, mann, h0, v[i], v0);
            frcu = friction(gra, mann, h0, v[i] + du, v0);
            dfrcu = (frcu - frc0) / du;

            if (nddv[i] == 2 || nddv[i + nx] == 2) // open boundary
            {
                ddu0 = eddy * (2.0 * v[i] - v[idown] - v[iup]) / dx / dx;
                ((double*)values)[nnz2 + 2] = 2.0 * eddy / dx / dx + dfrcu;
            }
            else {
                ((double*)values)[nnz2 + 2] = 4.0 * eddy / dx / dx + dfrcu;
                if (row == 0) {
                    ddu0 = eddy * (4.0 * v[i] - v[i + nx] - v[idown] - v[iup]) / dx / dx;
                    ((double*)values)[nnz3 + 2] = -2.0 * eddy / dx / dx;
                }
                else if (row == ny - 2) {
                    ddu0 = eddy * (4.0 * v[i] - v[i - nx] - v[idown] - v[iup]) / dx / dx;
                    ((double*)values)[nnz1 + 2] = -2.0 * eddy / dx / dx;
                }
                else {
                    ddu0 = eddy * (4.0 * v[i] - v[i - nx] - v[i + nx] - v[idown] - v[iup]) / dx / dx;
                    ((double*)values)[nnz1 + 2] = -eddy / dx / dx;
                    ((double*)values)[nnz3 + 2] = -eddy / dx / dx;
                }
            }

            ((double*)values)[nnz2] = -gra / dx;
            ((double*)values)[nnz3] = gra / dx;
            ((double*)rhs)[idx] = -gdeta0 - frc0 - ddu0;

        }// Y momentum equation end        

    }

}

int main(int argc, char** argv)
{

    //parameter parsing
    int pidx = 0;
    int pidy = 0;
    //MPI (with CUDA GPUs)
    int rank = 0;
    int lrank = 0;
    int nranks = 0;
    int n;
    int nx, ny;
    int gpu_count = 0;
    MPI_Comm amgx_mpi_comm = MPI_COMM_WORLD;
    //versions
    int major, minor;
    char* ver, * date, * time;
    //input matrix and rhs/solution
    int* partition_sizes = NULL;
    int* partition_vector = NULL;
    int partition_vector_size = 0;

    //library handles
    AMGX_Mode mode;
    AMGX_config_handle cfg;
    AMGX_resources_handle rsrc;
    AMGX_matrix_handle A;
    AMGX_vector_handle b, x;
    AMGX_solver_handle solver;
    AMGX_SOLVE_STATUS status;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(amgx_mpi_comm, &nranks);
    MPI_Comm_rank(amgx_mpi_comm, &rank);

    CUDA_SAFE_CALL(cudaGetDeviceCount(&gpu_count));
    lrank = rank % gpu_count;
    CUDA_SAFE_CALL(cudaSetDevice(lrank));
    printf("Process %d selecting device %d\n", rank, lrank);

    nx = 20;
    ny = 20;
    n = nx * ny;

    /* init */
    AMGX_SAFE_CALL(AMGX_initialize());
    AMGX_SAFE_CALL(AMGX_initialize_plugins());
    /* system */
    AMGX_SAFE_CALL(AMGX_register_print_callback(&print_callback));
    AMGX_SAFE_CALL(AMGX_install_signal_handler());

    mode = AMGX_mode_dDDI;
    int sizeof_m_val = ((AMGX_GET_MODE_VAL(AMGX_MatPrecision, mode) == AMGX_matDouble)) ? sizeof(double) : sizeof(float);
    int sizeof_v_val = ((AMGX_GET_MODE_VAL(AMGX_VecPrecision, mode) == AMGX_vecDouble)) ? sizeof(double) : sizeof(float);

    AMGX_SAFE_CALL(AMGX_config_create_from_file(&cfg, "FGMRES_AGGREGATION_JACOBI.json"));
    AMGX_SAFE_CALL(AMGX_config_add_parameters(&cfg, "tolerance=1e-16"));

    AMGX_SAFE_CALL(AMGX_config_add_parameters(&cfg, "exception_handling=1"));

    AMGX_resources_create(&rsrc, cfg, &amgx_mpi_comm, 1, &lrank);
    AMGX_matrix_create(&A, rsrc, mode);
    AMGX_vector_create(&x, rsrc, mode);
    AMGX_vector_create(&b, rsrc, mode);
    AMGX_solver_create(&solver, rsrc, mode, cfg);

    int nrings; //=1; //=2;
    AMGX_config_get_default_number_of_rings(cfg, &nrings);
    //printf("nrings=%d\n",nrings);
    int nglobal = 0;

    /* generate the matrix
       In more detail, this routine will create 2D (5 point) discretization of the
       Poisson operator. The discretization is performed on a the 2D domain consisting
       of nx and ny points in x- and y-dimension respectively. Each rank processes it's
       own part of discretization points. Finally, the rhs and solution will be set to
       a vector of ones and zeros, respectively. */

    int* row_ptrs = (int*)malloc((n + 1) * sizeof(int));
    int* pivot_offsets = (int*)malloc((n) * sizeof(int));
    int64_t* col_idxs = (int64_t*)malloc(5 * n * sizeof(int64_t));
    int block_dimx = 3;
    int block_dimy = 3;

    void* values = malloc( block_dimx * block_dimy * 5 * n * sizeof(double)); // maximum nnz
    int nnz = 0;
    int count = 0;

    ///////////////////////////////////////////////////////
    ///////////////////////////////////////////////////////

    double dx = 5.0;
    double gra = 9.81;
    double mann = 0.012;
    double eddy = 0.5;

    double Trange = 1.0;
    double Tperiod = 43200.0;

    int* ndd;
    int* nddu;
    int* nddv;
    double* u;
    double* v;
    double* eta;
    double* h;
    double* detadt;
    void* rhs;
    void* h_x;

    // Allocate memory on host
    ndd = (int*)malloc(n * sizeof(int));
    nddu = (int*)malloc(n * sizeof(int));
    nddv = (int*)malloc(n * sizeof(int));
    u = (double*)malloc(n * sizeof(double));
    v = (double*)malloc(n * sizeof(double));
    eta = (double*)malloc(n * sizeof(double));
    h = (double*)malloc(n * sizeof(double));
    detadt = (double*)malloc(n * sizeof(double));
    rhs = malloc(block_dimx * n * sizeof(double));
    h_x = malloc(block_dimx * n * sizeof(double));

    // mebset h_x to be 0
    memset(h_x, 0, (block_dimx * n) * sizeof(double));

    // memset values to be 0
    memset(values, 0, (block_dimx * block_dimy * 5 * n) * sizeof(double));

    // Allocate memory on device
    double* d_values;
    int* d_row_ptrs;
    int* d_col_idxs;
    double* d_rhs;
    double* d_x;

    // Initialize host memory
    for (int i = 0; i < n; i++) {

        int row = i / nx;                               // row index in the 2D work array   
        int col = i % nx;                               // col index in the 1D work array

        if (col == 0) {
            ndd[i] = 2;
        }
        else {
            ndd[i] = 0;
        }

        if (col == 0) {
            nddu[i] = 2;
        }
        else if (col == nx - 1) {
            nddu[i] = -1;
        }
        else {
            nddu[i] = 0;
        }

        if (row == 0) {
            nddv[i] = -1;
        }
        else if (row == ny - 1) {
            nddv[i] = -1;
        }
        else {
            nddv[i] = 0;
        }

        u[i] = 0.0;
        v[i] = 0.0;
        eta[i] = 0.0;
        h[i] = 2.0;
        detadt[i] = M_PI * Trange / Tperiod;

    }

    // initialize sparse matrix (determine row_ptrs and col_idxs)

    nnz = 0;
    for (int i = 0; i < n; i++) {

        int row = i / nx;
        int col = i % nx;
        row_ptrs[i] = nnz;

        if (col == 0 && row == 0){
            pivot_offsets[i] = 0;
            col_idxs[nnz++] = i;
            col_idxs[nnz++] = i + 1;
            col_idxs[nnz++] = i + nx;
        }
        else if (col == 0 && row == ny - 1) {
            pivot_offsets[i] = 1;
            col_idxs[nnz++] = i - nx;
            col_idxs[nnz++] = i;
            col_idxs[nnz++] = i + 1;
        }
        else if (col == nx - 1 && row == 0) {
            pivot_offsets[i] = 1;
            col_idxs[nnz++] = i - 1;
            col_idxs[nnz++] = i;
            col_idxs[nnz++] = i + nx;
        }
        else if (col == nx - 1 && row == ny - 1) {
            pivot_offsets[i] = 2;
            col_idxs[nnz++] = i - nx;
            col_idxs[nnz++] = i - 1;
            col_idxs[nnz++] = i;
        }
        else if (col == 0) {
            pivot_offsets[i] = 1;
            col_idxs[nnz++] = i - nx;
            col_idxs[nnz++] = i;
            col_idxs[nnz++] = i + 1;
            col_idxs[nnz++] = i + nx;
        }
        else if (col == nx - 1) {
            pivot_offsets[i] = 2;
            col_idxs[nnz++] = i - nx;
            col_idxs[nnz++] = i - 1;
            col_idxs[nnz++] = i;
            col_idxs[nnz++] = i + nx;
        }
        else if (row == 0) {
            pivot_offsets[i] = 1;
            col_idxs[nnz++] = i - 1;
            col_idxs[nnz++] = i;
            col_idxs[nnz++] = i + 1;
            col_idxs[nnz++] = i + nx;
        }
        else if (row == ny - 1) {
            pivot_offsets[i] = 2;
            col_idxs[nnz++] = i - nx;
            col_idxs[nnz++] = i - 1;
            col_idxs[nnz++] = i;
            col_idxs[nnz++] = i + 1;
        }
        else {
            pivot_offsets[i] = 2;
            col_idxs[nnz++] = i - nx;
            col_idxs[nnz++] = i - 1;
            col_idxs[nnz++] = i;
            col_idxs[nnz++] = i + 1;
            col_idxs[nnz++] = i + nx;
        }

    }

    row_ptrs[n] = nnz;

    FILE* fp;

    for (int it = 0; it < 1; it++) {

        SWEValKernel(n, nx, ny, block_dimx, dx, gra, mann, eddy, ndd, nddu, nddv, eta, u, v, h, detadt, (double*)values, rhs, row_ptrs, pivot_offsets);

        ///////////////////////////////////////////////////////
        ///////////////////////////////////////////////////////

        fp = fopen("values.bin", "wb");
        fwrite(values, sizeof(double), block_dimx* block_dimy * nnz, fp);
        fclose(fp);

        // print values to a text file
        fp = fopen("values.txt", "w");
        for (int i = 0; i < block_dimx * block_dimy * nnz; i++) {
            fprintf(fp, "%f\n", ((double*)values)[i]);
        }
        fclose(fp);

        fp = fopen("rhs.bin", "wb");
        fwrite(rhs, sizeof(double), 3 * n, fp);
        fclose(fp);

        // print rhs to a text file
        fp = fopen("rhs.txt", "w");
        for (int i = 0; i < 3 * n; i++) {
            fprintf(fp, "%f\n", ((double*)rhs)[i]);
        }   
        fclose(fp);

        fp = fopen("row_ptrs.bin", "wb");
        fwrite(row_ptrs, sizeof(int), (n + 1), fp);
        fclose(fp);

        // print row_ptrs to a text file
        fp = fopen("row_ptrs.txt", "w");
        for (int i = 0; i < n + 1; i++) {
            fprintf(fp, "%d\n", row_ptrs[i]);
        }
        fclose(fp);

        fp = fopen("col_idxs.bin", "wb");
        fwrite(col_idxs, sizeof(int64_t), 5 * n, fp);
        fclose(fp);

        // print col_idxs to a text file
        fp = fopen("col_idxs.txt", "w");
        for (int i = 0; i < 5 * n; i++) {
            fprintf(fp, "%d\n", col_idxs[i]);
        }
        fclose(fp);

        AMGX_matrix_upload_all_global(A,
            n, n, nnz, block_dimx, block_dimx,
            row_ptrs, col_idxs, values, NULL,
            nrings, nrings, NULL);

        /* set the connectivity information (for the vector) */
        AMGX_vector_bind(x, A);
        AMGX_vector_bind(b, A);
        /* upload the vector (and the connectivity information) */
        AMGX_vector_upload(x, n, 3, h_x);
        AMGX_vector_upload(b, n, 3, rhs);
        /* solver setup */
        //MPI barrier for stability (should be removed in practice to maximize performance)
        MPI_Barrier(amgx_mpi_comm);
        AMGX_solver_setup(solver, A);
        /* solver solve */
        //MPI barrier for stability (should be removed in practice to maximize performance)
        MPI_Barrier(amgx_mpi_comm);
        AMGX_solver_solve(solver, b, x);
        /* example of how to change parameters between non-linear iterations */
        //AMGX_config_add_parameters(&cfg, "config_version=2, default:tolerance=1e-12");
        //AMGX_solver_solve(solver, b, x);
        /* example of how to replace coefficients between non-linear iterations */
        //AMGX_matrix_replace_coefficients(A, n, nnz, values, diag);
        //AMGX_solver_setup(solver, A);
        //AMGX_solver_solve(solver, b, x);
        AMGX_solver_get_status(solver, &status);
        /* example of how to get (the local part of) the solution */
        //int sizeof_v_val;
        //sizeof_v_val = ((NVAMG_GET_MODE_VAL(NVAMG_VecPrecision, mode) == NVAMG_vecDouble))? sizeof(double): sizeof(float);
        // 
        void* SLN = malloc((3 * n) * sizeof(double));
        AMGX_vector_download(x, SLN);

        //// update eta, u and v
        for (int i = 0; i < n; i++) {
            eta[i] = eta[i] + ((double*)SLN)[i * 3];
        }

        //for (int i = 0; i < n + ny; i++) {
        //    u[i] = u[i] + ((double*)SLN)[n + i];
        //}

        //for (int i = 0; i < n + nx; i++) {
        //    v[i] = v[i] + ((double*)SLN)[2 * n + ny + i];
        //}

    }

    //fp = fopen("SLN2.bin", "wb");
    //fwrite(h_x, sizeof(double), (3 * n + nx + ny), fp);
    //fclose(fp);

    //// export u
    //fp = fopen("u.bin", "wb");
    //fwrite(u, sizeof(double), (n + ny), fp);
    //fclose(fp);

    //// export v
    //fp = fopen("v.bin", "wb");
    //fwrite(v, sizeof(double), (n + nx), fp);
    //fclose(fp);

    // export eta
    fp = fopen("eta.bin", "wb");
    fwrite(eta, sizeof(double), n, fp);
    fclose(fp);

    ///////////////////////////////////////////////////////
    ////////////////////////////////////////////////////

// 

}