the black boxes are blocks and the red points are non-zero values.
the values of the matrix points are like this:
The challenge in solving this linear equation lies in the small diagonal values of the matrix. To address this, I attempted to use a block matrix approach.
Reproduction steps
Below is the code snippet. You can directly copy the entire code into a .c file and run it. In the file, you only need to adjust the value of nx and ny
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <mpi.h>
#include "cuda_runtime.h"
#include <cusolverSp.h>
#include <stdint.h>
#define M_PI 3.14159265358979323846
/* CUDA error macro */
#define CUDA_SAFE_CALL(call) do { \
cudaError_t err = call; \
if(cudaSuccess != err) { \
fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \
__FILE__, __LINE__, cudaGetErrorString( err) ); \
exit(EXIT_FAILURE); \
} } while (0)
#define CUSOLVER_SAFE_CALL(call) do { \
cusolverStatus_t status = call; \
if (status != CUSOLVER_STATUS_SUCCESS) { \
fprintf(stderr, "CUSOLVER error in %s:%d. Code:%d\n", __FILE__, __LINE__, status); \
exit(1); \
} \
} while(0)
//#define AMGX_DYNAMIC_LOADING
//#undef AMGX_DYNAMIC_LOADING
#define MAX_MSG_LEN 4096
/* standard or dynamically load library */
#ifdef AMGX_DYNAMIC_LOADING
#include "amgx_capi.h"
#else
#include "amgx_c.h"
#endif
/* print error message and exit */
void errAndExit(const char* err)
{
printf("%s\n", err);
fflush(stdout);
MPI_Abort(MPI_COMM_WORLD, 1);
exit(1);
}
/* print callback (could be customized) */
void print_callback(const char* msg, int length)
{
int rank;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
if (rank == 0) { printf("%s", msg); }
}
/* print usage and exit */
void printUsageAndExit()
{
char msg[MAX_MSG_LEN] = "Usage: mpirun [-n nranks] ./example [-mode [dDDI | dDFI | dFFI]] [-p nx ny] [-c config_file] [-amg \"variable1=value1 ... variable3=value3\"] [-gpu] [-it k]\n";
strcat(msg, " -mode: select the solver mode\n");
strcat(msg, " -p nx ny: select x- and y-dimensions of the 2D (5-points) local discretization of the Poisson operator (the global problem size will be nranks*nx*ny)\n");
strcat(msg, " -c: set the amg solver options from the config file\n");
strcat(msg, " -amg: set the amg solver options from the command line\n");
print_callback(msg, MAX_MSG_LEN);
MPI_Finalize();
exit(0);
}
/* parse parameters */
int findParamIndex(char** argv, int argc, const char* parm)
{
int count = 0;
int index = -1;
for (int i = 0; i < argc; i++)
{
if (strncmp(argv[i], parm, 100) == 0)
{
index = i;
count++;
}
}
if (count == 0 || count == 1)
{
return index;
}
else
{
char msg[MAX_MSG_LEN];
sprintf(msg, "ERROR: parameter %s has been specified more than once, exiting\n", parm);
print_callback(msg, MAX_MSG_LEN);
exit(1);
}
return -1;
}
double dhudx(double hw, double he, double uw, double ue, double dx)
{
return (he * ue - hw * uw) / dx;
}
double dhvdy(double hb, double ht, double vb, double vt, double dx)
{
return (ht * vt - hb * vb) / dx;
}
double gdetadx(double gra, double etaw, double etae, double dx)
{
return gra * (etae - etaw) / dx;
}
double friction(double gra, double mann, double h, double u, double v)
{
return gra * mann * mann / pow(h, 4.0 / 3.0) * sqrt(u * u + v * v) * u;
}
void SWEValKernel(
int n,
int nx,
int ny,
int block_dim,
double dx,
double gra,
double mann,
double eddy,
int* ndd,
int* nddu,
int* nddv,
double* eta,
double* u,
double* v,
double* h,
double* detadt,
void* values,
void* rhs,
int* row_ptrs,
int* pivot_offset) {
for (int i = 0; i < n; i++) {
int row = i / nx;
int col = i % nx;
int nnz = row_ptrs[i];
int offset = pivot_offset[i];
int nnz0 = 0;
int nnz1 = 0;
int nnz2 = 0;
int nnz3 = 0;
int nnz4 = 0;
/* continuity equation start*/
nnz2 = block_dim * block_dim * (nnz + offset);
if(col > 0){
nnz0 = block_dim * block_dim * (nnz + offset - 2);
nnz1 = block_dim * block_dim * (nnz + offset - 1);
}
else {
nnz0 = block_dim * block_dim * (nnz + offset - 2);
}
int idx = block_dim * i;
if (ndd[i] == 2) { // open boundary
((double*)values)[nnz2] = 1.0;
((double*)rhs)[idx] = -eta[i];
}
else {
double he = (col < nx - 1) ? 0.5 * (h[i] + h[i + 1]) : 0.0;
double hw = (col > 0) ? 0.5 * (h[i - 1] + h[i]) : 0.0;
double ht = (row < ny - 1) ? 0.5 * (h[i] + h[i + nx]) : 0.0;
double hb = (row > 0) ? 0.5 * (h[i - nx] + h[i]) : 0.0;
double ue = (col < nx - 1) ? u[i] : 0.0;
double uw = (col > 0) ? u[i - 1] : 0.0;
double vn = (row < ny - 1) ? v[i] : 0.0;
double vs = (row > 0) ? v[i - nx] : 0.0;
double dhudx0 = dhudx(hw, he, uw, ue, dx);
double dhvdy0 = dhvdy(hb, ht, vs, vn, dx);
if (row > 0) {
((double*)values)[nnz0 + 2] = -hb / dx;
}
if (col > 0) {
((double*)values)[nnz1 + 1] = -hw / dx;
}
if (col < nx - 1) {
((double*)values)[nnz2 + 1] = he / dx;
}
if (row < ny - 1) {
((double*)values)[nnz2 + 2] = ht / dx;
}
((double*)rhs)[idx] = -dhudx0 - dhvdy0 + detadt[i];
}// continuity equation end
/* X momentom equation start */
double du = 1e-8;
nnz2 = block_dim * block_dim * (nnz + offset) + block_dim;
if (col > 0) {
nnz0 = block_dim * block_dim * (nnz + offset - 2) + block_dim;
nnz1 = block_dim * block_dim * (nnz + offset - 1) + block_dim;
}
else {
nnz0 = block_dim * block_dim * (nnz + offset - 1) + block_dim;
}
if (col < nx - 1) {
nnz3 = block_dim * block_dim * (nnz + offset + 1) + block_dim;
nnz4 = block_dim * block_dim * (nnz + offset + 2) + block_dim;
}
else {
nnz4 = block_dim * block_dim * (nnz + offset + 1) + block_dim;
}
idx = block_dim * i + 1;
if (col == nx - 1) {
((double*)values)[nnz2 + 1] = 1.0;
((double*)rhs)[idx] = 0.0;
}
else {
double h0 = 0.5 * (h[i] + h[i + 1]);
double v0, v00, v01;
double gdeta0 = gdetadx(gra, eta[i + 1], eta[i], dx);
double frc0, frcu, dfrcu;
double ddu0;
int iup = i + nx;
int idown = i - nx;
if (row == 0) {
v00 = nddv[i] < 0 ? 0.0 : 2.0 * v[i] - v[i + nx];
v01 = nddv[i + 1] < 0 ? 0.0 : 2.0 * v[i + 1] - v[i + nx + 1];
v0 = 0.25 * (v[i] + v[i + 1] + v00 + v01);
idown = iup;
((double*)values)[nnz4 + 1] = -2.0 * eddy / dx / dx;
}
else if (row == ny - 1) {
v00 = nddv[i] < 0 ? 0.0 : 2.0 * v[i - nx] - v[i - 2 * nx];
v01 = nddv[i + 1] < 0 ? 0.0 : 2.0 * v[i - nx + 1] - v[i - 2 * nx + 1];
v0 = 0.25 * (v00 + v01 + v[i - nx] + v[i - nx + 1]);
iup = idown;
((double*)values)[nnz0 + 1] = -2.0 * eddy / dx / dx;
}
else {
v0 = 0.25 * (v[i] + v[i + 1] + v[i - nx] + v[i - nx + 1]);
((double*)values)[nnz0 + 1] = -eddy / dx / dx;
((double*)values)[nnz4 + 1] = -eddy / dx / dx;
}
frc0 = friction(gra, mann, h0, u[i], v0);
frcu = friction(gra, mann, h0, u[i] + du, v0);
dfrcu = (frcu - frc0) / du;
if (nddu[i] == 2 || nddu[i + 1] == 2) // open boundary
{
ddu0 = eddy * (2.0 * u[i] - u[idown] - u[iup]) / dx / dx;
((double*)values)[nnz2 + 1] = 2.0 * eddy / dx / dx + dfrcu;
}
else {
((double*)values)[nnz2 + 1] = 4.0 * eddy / dx / dx + dfrcu;
if (col == 0) {
ddu0 = eddy * (4.0 * u[i] - u[i + 1] - u[idown] - u[iup]) / dx / dx;
((double*)values)[nnz3 + 1] = -2.0 * eddy / dx / dx;
}
else if (col == nx - 2) {
ddu0 = eddy * (4.0 * u[i] - u[i - 1] - u[idown] - u[iup]) / dx / dx;
((double*)values)[nnz1 + 1] = -2.0 * eddy / dx / dx;
}
else {
ddu0 = eddy * (4.0 * u[i] - u[i - 1] - u[i + 1] - u[idown] - u[iup]) / dx / dx;
((double*)values)[nnz1 + 1] = -eddy / dx / dx;
((double*)values)[nnz3 + 1] = -eddy / dx / dx;
}
}
((double*)values)[nnz2] = -gra / dx;
((double*)values)[nnz3] = gra / dx;
((double*)rhs)[idx] = -gdeta0 - frc0 - ddu0;
}// X momentum equation end
/* Y momentom equation start */
nnz2 = block_dim * block_dim * (nnz + offset) + block_dim * 2;
if (col > 0) {
nnz1 = block_dim * block_dim * (nnz + offset - 2) + block_dim * 2;
nnz0 = block_dim * block_dim * (nnz + offset - 1) + block_dim * 2;
}
else {
nnz1 = block_dim * block_dim * (nnz + offset - 1) + block_dim * 2;
}
if (col < nx - 1) {
nnz4 = block_dim * block_dim * (nnz + offset + 1) + block_dim * 2;
nnz3 = block_dim * block_dim * (nnz + offset + 2) + block_dim * 2;
}
else {
nnz3 = block_dim * block_dim * (nnz + offset + 1) + block_dim * 2;
}
idx = block_dim * i + 2;
if (row == ny - 1) {
((double*)values)[nnz2 + 2] = 1.0;
((double*)rhs)[idx] = 0.0;
}
else {
double h0 = 0.5 * (h[i] + h[i + nx]);
double v0, v00, v01;
double gdeta0 = gdetadx(gra, eta[i + nx], eta[i], dx);
double frc0, frcu, dfrcu;
double ddu0;
int iup = i + 1;
int idown = i - 1;
if (col == 0) {
v00 = nddu[i] < 0 ? 0.0 : 2.0 * u[i] - u[i + 1];
v01 = nddu[i + nx] < 0 ? 0.0 : 2.0 * u[i + nx] - u[i + nx + 1];
v0 = 0.25 * (u[i] + u[i + nx] + v00 + v01);
idown = iup;
((double*)values)[nnz4 + 2] = -2.0 * eddy / dx / dx;
}
else if (col == nx - 1) {
v00 = nddu[i] < 0 ? 0.0 : 2.0 * u[i - 1] - u[i - 2];
v01 = nddu[i + nx] < 0 ? 0.0 : 2.0 * u[i - 1 + nx] - u[i - 2 * 1 + nx];
v0 = 0.25 * (v00 + v01 + u[i - 1] + u[i - 1 + nx]);
iup = idown;
((double*)values)[nnz0 + 2] = -2.0 * eddy / dx / dx;
}
else {
v0 = 0.25 * (u[i] + u[i + nx] + u[i - 1] + u[i - 1 + nx]);
((double*)values)[nnz0 + 2] = -eddy / dx / dx;
((double*)values)[nnz4 + 2] = -eddy / dx / dx;
}
frc0 = friction(gra, mann, h0, v[i], v0);
frcu = friction(gra, mann, h0, v[i] + du, v0);
dfrcu = (frcu - frc0) / du;
if (nddv[i] == 2 || nddv[i + nx] == 2) // open boundary
{
ddu0 = eddy * (2.0 * v[i] - v[idown] - v[iup]) / dx / dx;
((double*)values)[nnz2 + 2] = 2.0 * eddy / dx / dx + dfrcu;
}
else {
((double*)values)[nnz2 + 2] = 4.0 * eddy / dx / dx + dfrcu;
if (row == 0) {
ddu0 = eddy * (4.0 * v[i] - v[i + nx] - v[idown] - v[iup]) / dx / dx;
((double*)values)[nnz3 + 2] = -2.0 * eddy / dx / dx;
}
else if (row == ny - 2) {
ddu0 = eddy * (4.0 * v[i] - v[i - nx] - v[idown] - v[iup]) / dx / dx;
((double*)values)[nnz1 + 2] = -2.0 * eddy / dx / dx;
}
else {
ddu0 = eddy * (4.0 * v[i] - v[i - nx] - v[i + nx] - v[idown] - v[iup]) / dx / dx;
((double*)values)[nnz1 + 2] = -eddy / dx / dx;
((double*)values)[nnz3 + 2] = -eddy / dx / dx;
}
}
((double*)values)[nnz2] = -gra / dx;
((double*)values)[nnz3] = gra / dx;
((double*)rhs)[idx] = -gdeta0 - frc0 - ddu0;
}// Y momentum equation end
}
}
int main(int argc, char** argv)
{
//parameter parsing
int pidx = 0;
int pidy = 0;
//MPI (with CUDA GPUs)
int rank = 0;
int lrank = 0;
int nranks = 0;
int n;
int nx, ny;
int gpu_count = 0;
MPI_Comm amgx_mpi_comm = MPI_COMM_WORLD;
//versions
int major, minor;
char* ver, * date, * time;
//input matrix and rhs/solution
int* partition_sizes = NULL;
int* partition_vector = NULL;
int partition_vector_size = 0;
//library handles
AMGX_Mode mode;
AMGX_config_handle cfg;
AMGX_resources_handle rsrc;
AMGX_matrix_handle A;
AMGX_vector_handle b, x;
AMGX_solver_handle solver;
AMGX_SOLVE_STATUS status;
MPI_Init(&argc, &argv);
MPI_Comm_size(amgx_mpi_comm, &nranks);
MPI_Comm_rank(amgx_mpi_comm, &rank);
CUDA_SAFE_CALL(cudaGetDeviceCount(&gpu_count));
lrank = rank % gpu_count;
CUDA_SAFE_CALL(cudaSetDevice(lrank));
printf("Process %d selecting device %d\n", rank, lrank);
nx = 20;
ny = 20;
n = nx * ny;
/* init */
AMGX_SAFE_CALL(AMGX_initialize());
AMGX_SAFE_CALL(AMGX_initialize_plugins());
/* system */
AMGX_SAFE_CALL(AMGX_register_print_callback(&print_callback));
AMGX_SAFE_CALL(AMGX_install_signal_handler());
mode = AMGX_mode_dDDI;
int sizeof_m_val = ((AMGX_GET_MODE_VAL(AMGX_MatPrecision, mode) == AMGX_matDouble)) ? sizeof(double) : sizeof(float);
int sizeof_v_val = ((AMGX_GET_MODE_VAL(AMGX_VecPrecision, mode) == AMGX_vecDouble)) ? sizeof(double) : sizeof(float);
AMGX_SAFE_CALL(AMGX_config_create_from_file(&cfg, "FGMRES_AGGREGATION_JACOBI.json"));
AMGX_SAFE_CALL(AMGX_config_add_parameters(&cfg, "tolerance=1e-16"));
AMGX_SAFE_CALL(AMGX_config_add_parameters(&cfg, "exception_handling=1"));
AMGX_resources_create(&rsrc, cfg, &amgx_mpi_comm, 1, &lrank);
AMGX_matrix_create(&A, rsrc, mode);
AMGX_vector_create(&x, rsrc, mode);
AMGX_vector_create(&b, rsrc, mode);
AMGX_solver_create(&solver, rsrc, mode, cfg);
int nrings; //=1; //=2;
AMGX_config_get_default_number_of_rings(cfg, &nrings);
//printf("nrings=%d\n",nrings);
int nglobal = 0;
/* generate the matrix
In more detail, this routine will create 2D (5 point) discretization of the
Poisson operator. The discretization is performed on a the 2D domain consisting
of nx and ny points in x- and y-dimension respectively. Each rank processes it's
own part of discretization points. Finally, the rhs and solution will be set to
a vector of ones and zeros, respectively. */
int* row_ptrs = (int*)malloc((n + 1) * sizeof(int));
int* pivot_offsets = (int*)malloc((n) * sizeof(int));
int64_t* col_idxs = (int64_t*)malloc(5 * n * sizeof(int64_t));
int block_dimx = 3;
int block_dimy = 3;
void* values = malloc( block_dimx * block_dimy * 5 * n * sizeof(double)); // maximum nnz
int nnz = 0;
int count = 0;
///////////////////////////////////////////////////////
///////////////////////////////////////////////////////
double dx = 5.0;
double gra = 9.81;
double mann = 0.012;
double eddy = 0.5;
double Trange = 1.0;
double Tperiod = 43200.0;
int* ndd;
int* nddu;
int* nddv;
double* u;
double* v;
double* eta;
double* h;
double* detadt;
void* rhs;
void* h_x;
// Allocate memory on host
ndd = (int*)malloc(n * sizeof(int));
nddu = (int*)malloc(n * sizeof(int));
nddv = (int*)malloc(n * sizeof(int));
u = (double*)malloc(n * sizeof(double));
v = (double*)malloc(n * sizeof(double));
eta = (double*)malloc(n * sizeof(double));
h = (double*)malloc(n * sizeof(double));
detadt = (double*)malloc(n * sizeof(double));
rhs = malloc(block_dimx * n * sizeof(double));
h_x = malloc(block_dimx * n * sizeof(double));
// mebset h_x to be 0
memset(h_x, 0, (block_dimx * n) * sizeof(double));
// memset values to be 0
memset(values, 0, (block_dimx * block_dimy * 5 * n) * sizeof(double));
// Allocate memory on device
double* d_values;
int* d_row_ptrs;
int* d_col_idxs;
double* d_rhs;
double* d_x;
// Initialize host memory
for (int i = 0; i < n; i++) {
int row = i / nx; // row index in the 2D work array
int col = i % nx; // col index in the 1D work array
if (col == 0) {
ndd[i] = 2;
}
else {
ndd[i] = 0;
}
if (col == 0) {
nddu[i] = 2;
}
else if (col == nx - 1) {
nddu[i] = -1;
}
else {
nddu[i] = 0;
}
if (row == 0) {
nddv[i] = -1;
}
else if (row == ny - 1) {
nddv[i] = -1;
}
else {
nddv[i] = 0;
}
u[i] = 0.0;
v[i] = 0.0;
eta[i] = 0.0;
h[i] = 2.0;
detadt[i] = M_PI * Trange / Tperiod;
}
// initialize sparse matrix (determine row_ptrs and col_idxs)
nnz = 0;
for (int i = 0; i < n; i++) {
int row = i / nx;
int col = i % nx;
row_ptrs[i] = nnz;
if (col == 0 && row == 0){
pivot_offsets[i] = 0;
col_idxs[nnz++] = i;
col_idxs[nnz++] = i + 1;
col_idxs[nnz++] = i + nx;
}
else if (col == 0 && row == ny - 1) {
pivot_offsets[i] = 1;
col_idxs[nnz++] = i - nx;
col_idxs[nnz++] = i;
col_idxs[nnz++] = i + 1;
}
else if (col == nx - 1 && row == 0) {
pivot_offsets[i] = 1;
col_idxs[nnz++] = i - 1;
col_idxs[nnz++] = i;
col_idxs[nnz++] = i + nx;
}
else if (col == nx - 1 && row == ny - 1) {
pivot_offsets[i] = 2;
col_idxs[nnz++] = i - nx;
col_idxs[nnz++] = i - 1;
col_idxs[nnz++] = i;
}
else if (col == 0) {
pivot_offsets[i] = 1;
col_idxs[nnz++] = i - nx;
col_idxs[nnz++] = i;
col_idxs[nnz++] = i + 1;
col_idxs[nnz++] = i + nx;
}
else if (col == nx - 1) {
pivot_offsets[i] = 2;
col_idxs[nnz++] = i - nx;
col_idxs[nnz++] = i - 1;
col_idxs[nnz++] = i;
col_idxs[nnz++] = i + nx;
}
else if (row == 0) {
pivot_offsets[i] = 1;
col_idxs[nnz++] = i - 1;
col_idxs[nnz++] = i;
col_idxs[nnz++] = i + 1;
col_idxs[nnz++] = i + nx;
}
else if (row == ny - 1) {
pivot_offsets[i] = 2;
col_idxs[nnz++] = i - nx;
col_idxs[nnz++] = i - 1;
col_idxs[nnz++] = i;
col_idxs[nnz++] = i + 1;
}
else {
pivot_offsets[i] = 2;
col_idxs[nnz++] = i - nx;
col_idxs[nnz++] = i - 1;
col_idxs[nnz++] = i;
col_idxs[nnz++] = i + 1;
col_idxs[nnz++] = i + nx;
}
}
row_ptrs[n] = nnz;
FILE* fp;
for (int it = 0; it < 1; it++) {
SWEValKernel(n, nx, ny, block_dimx, dx, gra, mann, eddy, ndd, nddu, nddv, eta, u, v, h, detadt, (double*)values, rhs, row_ptrs, pivot_offsets);
///////////////////////////////////////////////////////
///////////////////////////////////////////////////////
fp = fopen("values.bin", "wb");
fwrite(values, sizeof(double), block_dimx* block_dimy * nnz, fp);
fclose(fp);
// print values to a text file
fp = fopen("values.txt", "w");
for (int i = 0; i < block_dimx * block_dimy * nnz; i++) {
fprintf(fp, "%f\n", ((double*)values)[i]);
}
fclose(fp);
fp = fopen("rhs.bin", "wb");
fwrite(rhs, sizeof(double), 3 * n, fp);
fclose(fp);
// print rhs to a text file
fp = fopen("rhs.txt", "w");
for (int i = 0; i < 3 * n; i++) {
fprintf(fp, "%f\n", ((double*)rhs)[i]);
}
fclose(fp);
fp = fopen("row_ptrs.bin", "wb");
fwrite(row_ptrs, sizeof(int), (n + 1), fp);
fclose(fp);
// print row_ptrs to a text file
fp = fopen("row_ptrs.txt", "w");
for (int i = 0; i < n + 1; i++) {
fprintf(fp, "%d\n", row_ptrs[i]);
}
fclose(fp);
fp = fopen("col_idxs.bin", "wb");
fwrite(col_idxs, sizeof(int64_t), 5 * n, fp);
fclose(fp);
// print col_idxs to a text file
fp = fopen("col_idxs.txt", "w");
for (int i = 0; i < 5 * n; i++) {
fprintf(fp, "%d\n", col_idxs[i]);
}
fclose(fp);
AMGX_matrix_upload_all_global(A,
n, n, nnz, block_dimx, block_dimx,
row_ptrs, col_idxs, values, NULL,
nrings, nrings, NULL);
/* set the connectivity information (for the vector) */
AMGX_vector_bind(x, A);
AMGX_vector_bind(b, A);
/* upload the vector (and the connectivity information) */
AMGX_vector_upload(x, n, 3, h_x);
AMGX_vector_upload(b, n, 3, rhs);
/* solver setup */
//MPI barrier for stability (should be removed in practice to maximize performance)
MPI_Barrier(amgx_mpi_comm);
AMGX_solver_setup(solver, A);
/* solver solve */
//MPI barrier for stability (should be removed in practice to maximize performance)
MPI_Barrier(amgx_mpi_comm);
AMGX_solver_solve(solver, b, x);
/* example of how to change parameters between non-linear iterations */
//AMGX_config_add_parameters(&cfg, "config_version=2, default:tolerance=1e-12");
//AMGX_solver_solve(solver, b, x);
/* example of how to replace coefficients between non-linear iterations */
//AMGX_matrix_replace_coefficients(A, n, nnz, values, diag);
//AMGX_solver_setup(solver, A);
//AMGX_solver_solve(solver, b, x);
AMGX_solver_get_status(solver, &status);
/* example of how to get (the local part of) the solution */
//int sizeof_v_val;
//sizeof_v_val = ((NVAMG_GET_MODE_VAL(NVAMG_VecPrecision, mode) == NVAMG_vecDouble))? sizeof(double): sizeof(float);
//
void* SLN = malloc((3 * n) * sizeof(double));
AMGX_vector_download(x, SLN);
//// update eta, u and v
for (int i = 0; i < n; i++) {
eta[i] = eta[i] + ((double*)SLN)[i * 3];
}
//for (int i = 0; i < n + ny; i++) {
// u[i] = u[i] + ((double*)SLN)[n + i];
//}
//for (int i = 0; i < n + nx; i++) {
// v[i] = v[i] + ((double*)SLN)[2 * n + ny + i];
//}
}
//fp = fopen("SLN2.bin", "wb");
//fwrite(h_x, sizeof(double), (3 * n + nx + ny), fp);
//fclose(fp);
//// export u
//fp = fopen("u.bin", "wb");
//fwrite(u, sizeof(double), (n + ny), fp);
//fclose(fp);
//// export v
//fp = fopen("v.bin", "wb");
//fwrite(v, sizeof(double), (n + nx), fp);
//fclose(fp);
// export eta
fp = fopen("eta.bin", "wb");
fwrite(eta, sizeof(double), n, fp);
fclose(fp);
///////////////////////////////////////////////////////
////////////////////////////////////////////////////
//
}
I generate a block matrix with 3x3 blocks. and I used FGMRES_AGGREGATION_JACOBI.json configureation to solve the system but it returns:
Environment information: OS: Windows 11 CUDA runtime: CUDA 11. AMGX version: v2.3.0 NVIDIA driver: 12.2 NVIDIA GPU: 3060ti
AMGX solver configuration
{ "config_version": 2, "solver": { "preconditioner": { "error_scaling": 0, "print_grid_stats": 1, "algorithm": "AGGREGATION", "solver": "AMG", "smoother": "BLOCK_JACOBI", "presweeps": 0, "selector": "SIZE_2", "coarse_solver": "NOSOLVER", "max_iters": 1, "min_coarse_rows": 32, "relaxation_factor": 0.75, "scope": "amg", "max_levels": 100, "postsweeps": 3, "cycle": "V" }, "use_scalar_norm": 1, "solver": "FGMRES", "print_solve_stats": 1, "obtain_timings": 1, "max_iters": 1000, "monitor_residual": 1, "gmres_n_restart": 32, "convergence": "RELATIVE_INI", "scope": "main", "tolerance": 0.0001, "norm": "L2" } }
Matrix Data The form of matrix is in 3x3 block CSR form col_idxs.txt rhs.txt row_ptrs.txt values.txt
the shape of the sparse matrix is like this:
the values of the matrix points are like this:
The challenge in solving this linear equation lies in the small diagonal values of the matrix. To address this, I attempted to use a block matrix approach.
Reproduction steps
Below is the code snippet. You can directly copy the entire code into a .c file and run it. In the file, you only need to adjust the value of nx and ny