JuliaGPU / AMGX.jl

MIT License
11 stars 4 forks source link

Weird error for special matrix values #21

Closed navidcy closed 2 years ago

navidcy commented 2 years ago

I'm getting a really strange error. I run:

julia> using AMGX
[NVBLAS] NVBLAS_CONFIG_FILE environment variable is NOT set : relying on default config filename 'nvblas.conf'
[NVBLAS] Cannot open default config file 'nvblas.conf'
[NVBLAS] Config parsed
[NVBLAS] CPU Blas library need to be provided

julia> try
           AMGX.initialize()
           AMGX.initialize_plugins()
       catch e
           AMGX.finalize_plugins()
           AMGX.finalize()
           AMGX.initialize()
           AMGX.initialize_plugins()
       end
AMGX version 2.1.0.131-opensource
Built on Jan 20 2021, 13:08:41
Compiled with CUDA Runtime 10.0, using CUDA driver 11.7

julia> config = AMGX.Config(Dict("monitor_residual" => 1, "max_iters" => 1, "store_res_history" => 1, "tolerance" => 1))
AMGX.Config @0x000000000242a910

julia> resources = AMGX.Resources(config)
AMGX.Resources @0x0000000019828ef0

julia> solver = AMGX.Solver(resources, AMGX.dDDI, config)
AMGX.Solver @0x000000001d9426a0 dDDI

julia> device_matrix = AMGX.AMGXMatrix(resources, AMGX.dDDI)
AMGX.AMGXMatrix @0x0000000001ec7710 dDDI

Then, if I run:

julia> AMGX.upload!(device_matrix, Int32[0, 2, 4], Int32[0, 1, 0, 1], [-0.0008, 0.0008, 0.0008, -0.00081])
AMGX.AMGXMatrix @0x00000000019cab80 dDDI of size 2×2 with 4 stored entries

julia> AMGX.setup!(solver, device_matrix)
AMGX.Solver @0x000000001cb540a0 dDDI

Everything seems OK. But if I finalize and close everything, start a new Julia session, and run the above again with only the last matrix element changed (-0.0008 instead of -0.00081), then I get this error:

julia> AMGX.upload!(device_matrix, Int32[0, 2, 4], Int32[0, 1, 0, 1], [-0.0008, 0.0008, 0.0008, -0.0008])
AMGX.AMGXMatrix @0x0000000001ec7710 dDDI of size 2×2 with 4 stored entries

julia> AMGX.setup!(solver, device_matrix)
Caught amgx exception: Fail to get info from cudense
 at: /workspace/srcdir/AMGX-2.1.0/core/src/solvers/dense_lu_solver.cu:638
Stack trace:
 /g/data/v45/nc3020/.julia/artifacts/49a9b2455017b89ca83e19da8bee7cec48a4b961/lib/libamgxsh.so : amgx::dense_lu_solver::DenseLUSolver<amgx::TemplateConfig<(AMGX_MemorySpace)1, (AMGX_VecPrecision)0, (AMGX_MatPrecision)0, (AMGX_IndPrecision)2> >::cudense_getrf()+0x5ec
 /g/data/v45/nc3020/.julia/artifacts/49a9b2455017b89ca83e19da8bee7cec48a4b961/lib/libamgxsh.so : amgx::dense_lu_solver::DenseLUSolver<amgx::TemplateConfig<(AMGX_MemorySpace)1, (AMGX_VecPrecision)0, (AMGX_MatPrecision)0, (AMGX_IndPrecision)2> >::solver_setup(bool)+0x179
 /g/data/v45/nc3020/.julia/artifacts/49a9b2455017b89ca83e19da8bee7cec48a4b961/lib/libamgxsh.so : amgx::Solver<amgx::TemplateConfig<(AMGX_MemorySpace)1, (AMGX_VecPrecision)0, (AMGX_MatPrecision)0, (AMGX_IndPrecision)2> >::setup(amgx::Operator<amgx::TemplateConfig<(AMGX_MemorySpace)1, (AMGX_VecPrecision)0, (AMGX_MatPrecision)0, (AMGX_IndPrecision)2> >&, bool)+0x206
 /g/data/v45/nc3020/.julia/artifacts/49a9b2455017b89ca83e19da8bee7cec48a4b961/lib/libamgxsh.so : void amgx::AMG_Setup<(AMGX_VecPrecision)0, (AMGX_MatPrecision)0, (AMGX_IndPrecision)2>::setup<amgx::TemplateConfig<(AMGX_MemorySpace)1, (AMGX_VecPrecision)0, (AMGX_MatPrecision)0, (AMGX_IndPrecision)2>, (AMGX_MemorySpace)1, (AMGX_MemorySpace)0>(amgx::AMG<(AMGX_VecPrecision)0, (AMGX_MatPrecision)0, (AMGX_IndPrecision)2>*, amgx::Matrix<amgx::TemplateConfig<(AMGX_MemorySpace)1, (AMGX_VecPrecision)0, (AMGX_MatPrecision)0, (AMGX_IndPrecision)2> >&)+0xf5
 /g/data/v45/nc3020/.julia/artifacts/49a9b2455017b89ca83e19da8bee7cec48a4b961/lib/libamgxsh.so : amgx::AMG<(AMGX_VecPrecision)0, (AMGX_MatPrecision)0, (AMGX_IndPrecision)2>::setup(amgx::Matrix<amgx::TemplateConfig<(AMGX_MemorySpace)1, (AMGX_VecPrecision)0, (AMGX_MatPrecision)0, (AMGX_IndPrecision)2> >&)+0x8e
 /g/data/v45/nc3020/.julia/artifacts/49a9b2455017b89ca83e19da8bee7cec48a4b961/lib/libamgxsh.so : amgx::AlgebraicMultigrid_Solver<amgx::TemplateConfig<(AMGX_MemorySpace)1, (AMGX_VecPrecision)0, (AMGX_MatPrecision)0, (AMGX_IndPrecision)2> >::solver_setup(bool)+0x50
 /g/data/v45/nc3020/.julia/artifacts/49a9b2455017b89ca83e19da8bee7cec48a4b961/lib/libamgxsh.so : amgx::Solver<amgx::TemplateConfig<(AMGX_MemorySpace)1, (AMGX_VecPrecision)0, (AMGX_MatPrecision)0, (AMGX_IndPrecision)2> >::setup(amgx::Operator<amgx::TemplateConfig<(AMGX_MemorySpace)1, (AMGX_VecPrecision)0, (AMGX_MatPrecision)0, (AMGX_IndPrecision)2> >&, bool)+0x206
 /g/data/v45/nc3020/.julia/artifacts/49a9b2455017b89ca83e19da8bee7cec48a4b961/lib/libamgxsh.so : amgx::Solver<amgx::TemplateConfig<(AMGX_MemorySpace)1, (AMGX_VecPrecision)0, (AMGX_MatPrecision)0, (AMGX_IndPrecision)2> >::setup_no_throw(amgx::Operator<amgx::TemplateConfig<(AMGX_MemorySpace)1, (AMGX_VecPrecision)0, (AMGX_MatPrecision)0, (AMGX_IndPrecision)2> >&, bool)+0x69
 /g/data/v45/nc3020/.julia/artifacts/49a9b2455017b89ca83e19da8bee7cec48a4b961/lib/libamgxsh.so : amgx::AMG_Solver<amgx::TemplateConfig<(AMGX_MemorySpace)1, (AMGX_VecPrecision)0, (AMGX_MatPrecision)0, (AMGX_IndPrecision)2> >::setup(amgx::Matrix<amgx::TemplateConfig<(AMGX_MemorySpace)1, (AMGX_VecPrecision)0, (AMGX_MatPrecision)0, (AMGX_IndPrecision)2> >&)+0x5c
 /g/data/v45/nc3020/.julia/artifacts/49a9b2455017b89ca83e19da8bee7cec48a4b961/lib/libamgxsh.so : amgx::AMGX_ERROR amgx::(anonymous namespace)::set_solver_with_shared<(AMGX_Mode)8193, amgx::AMG_Solver, amgx::Matrix>(AMGX_solver_handle_struct*, AMGX_matrix_handle_struct*, amgx::Resources*, amgx::AMGX_ERROR (amgx::AMG_Solver<amgx::TemplateMode<(AMGX_Mode)8193>::Type>::*)(std::shared_ptr<amgx::Matrix<amgx::TemplateMode<(AMGX_Mode)8193>::Type> >))+0xbf
 /g/data/v45/nc3020/.julia/artifacts/49a9b2455017b89ca83e19da8bee7cec48a4b961/lib/libamgxsh.so : AMGX_solver_setup()+0x153
 [0x14a6cc059a8a]
 /g/data/v45/nc3020/julia/usr/bin/../lib/libjulia-internal.so.1 : jl_apply_generic()+0x316
 /g/data/v45/nc3020/julia/usr/bin/../lib/libjulia-internal.so.1 : ()+0xcc2f5
 /g/data/v45/nc3020/julia/usr/bin/../lib/libjulia-internal.so.1 : ()+0xcbedc
 /g/data/v45/nc3020/julia/usr/bin/../lib/libjulia-internal.so.1 : ()+0xcca33
 /g/data/v45/nc3020/julia/usr/bin/../lib/libjulia-internal.so.1 : ()+0xcd508
 /g/data/v45/nc3020/julia/usr/bin/../lib/libjulia-internal.so.1 : ()+0xe8996
 /g/data/v45/nc3020/julia/usr/bin/../lib/libjulia-internal.so.1 : ()+0xe9654
 /g/data/v45/nc3020/julia/usr/bin/../lib/libjulia-internal.so.1 : ()+0xccc13
 /g/data/v45/nc3020/julia/usr/bin/../lib/libjulia-internal.so.1 : ()+0xccf08
 /g/data/v45/nc3020/julia/usr/bin/../lib/libjulia-internal.so.1 : ()+0xcd508
 /g/data/v45/nc3020/julia/usr/bin/../lib/libjulia-internal.so.1 : ()+0xe8996
 /g/data/v45/nc3020/julia/usr/bin/../lib/libjulia-internal.so.1 : jl_toplevel_eval_in()+0xaa
 /g/data/v45/nc3020/julia/usr/lib/julia/sys.so : ()+0x81eb2
 /g/data/v45/nc3020/julia/usr/lib/julia/sys.so : ()+0x82888
 /g/data/v45/nc3020/julia/usr/lib/julia/sys.so : ()+0x82bfe
 /g/data/v45/nc3020/julia/usr/lib/julia/sys.so : ()+0xb2733
 /g/data/v45/nc3020/julia/usr/lib/julia/sys.so : ()+0xb2d79
 /g/data/v45/nc3020/julia/usr/bin/../lib/libjulia-internal.so.1 : jl_apply_generic()+0x316
 /g/data/v45/nc3020/julia/usr/lib/julia/sys.so : ()+0x174392
 /g/data/v45/nc3020/julia/usr/lib/julia/sys.so : ()+0x174439
 /g/data/v45/nc3020/julia/usr/bin/../lib/libjulia-internal.so.1 : jl_apply_generic()+0x316
 /g/data/v45/nc3020/julia/usr/bin/../lib/libjulia-internal.so.1 : jl_f__call_latest()+0x47
 /g/data/v45/nc3020/julia/usr/lib/julia/sys.so : ()+0x18d599
 /g/data/v45/nc3020/julia/usr/lib/julia/sys.so : ()+0x19ea4e
 /g/data/v45/nc3020/julia/usr/lib/julia/sys.so : ()+0x19f925
 /g/data/v45/nc3020/julia/usr/lib/julia/sys.so : ()+0x19fa96
 /g/data/v45/nc3020/julia/usr/bin/../lib/libjulia-internal.so.1 : jl_apply_generic()+0x316
 /g/data/v45/nc3020/julia/usr/bin/../lib/libjulia-internal.so.1 : ()+0x1098a6
 /g/data/v45/nc3020/julia/usr/bin/../lib/libjulia-internal.so.1 : repl_entrypoint()+0x8b
 /g/data/v45/nc3020/julia/julia : main()+0x9
 /lib64/libc.so.6 : __libc_start_main()+0xf3
 /g/data/v45/nc3020/julia/julia : _start()+0x2e

ERROR: AMGX.AMGXException("Internal error.")
Stacktrace:
 [1] macro expansion
   @ /g/data/v45/nc3020/.julia/packages/AMGX/GFHHN/src/errors.jl:22 [inlined]
 [2] setup!(solver::AMGX.Solver, matrix::AMGX.AMGXMatrix)
   @ AMGX /g/data/v45/nc3020/.julia/packages/AMGX/GFHHN/src/Solver.jl:40
 [3] top-level scope
   @ REPL[8]:1
 [4] top-level scope
   @ /g/data/v45/nc3020/.julia/packages/CUDA/DfvRa/src/initialization.jl:52

Any ideas?

cc @Elise-palethorpe

KristofferC commented 2 years ago

The error happens here: https://github.com/NVIDIA/AMGX/blob/08a6b9c92047e9a30e67d22f7ecb2cca84ef8a0a/src/solvers/dense_lu_solver.cu#L566-L572.

Maybe it has something to do with the singularity of the matrix? I would suggest creating a C reproducer and opening an issue upstream.

navidcy commented 2 years ago

OK, indeed the matrix is singular! My bad! Thanks @KristofferC