Dhondtguido / PaStiX4CalculiX

Other
16 stars 7 forks source link

Segmentation fault cLightSpMV.cu:99 #6

Closed maxgit33 closed 3 years ago

maxgit33 commented 3 years ago

Hello, I get a segmentation fault while running ccx_2.17 with the PaStiX4CalculiX solver, CUDA enabled, and PASTIX_GPU=1. The input file is beam10p from the test tarball.

System: CENTOS 7.8 gcc 9.3.0 cuda 11.0.3

gdb shows:

Thread 1 "ccx_2.17_i8" received signal SIGSEGV, Segmentation fault.
0x000000000100167e in performLightLsMV (alpha=<optimized out>, dval=0xfffffffffffffff8, drowptr=0x0, dcolind=0xfffffffffffffff8, dx=0x1ef90, beta=0, dy=0x5c78) at /tmp/work/Calculix_ccx_i8_pastix/PaStiX4CalculiX/kernels/gpus/LightSpMV-1.0/src/cLightSpMV.cu:99
99 spmv->_rowOffsets[0] = drowptr;

(gdb) l
94 CHECK_CUSPARSE( cusparseSpMV(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
95 &alpha, matA, vecX, &beta, vecY, CUDA_R_64F,
96 CUSPARSE_MV_ALG_DEFAULT, dBuffer) )
97 */
98
99 spmv->_rowOffsets[0] = drowptr;
100 spmv->_colIndexValues[0] = dcolind;
101 spmv->_numericalValues[0] = dval;
102 spmv->_vectorX[0] = dx;
103 spmv->_vectorY[0] = dy;

(gdb) p drowptr $1 = (int64_t *) 0x0

(gdb) where

0 0x000000000100167e in performLightLsMV (alpha=<optimized out>, dval=0xfffffffffffffff8, drowptr=0x0, dcolind=0xfffffffffffffff8, dx=0x1ef90, beta=0, dy=0x5c78)

at /tmp/work/Calculix_ccx_i8_pastix/PaStiX4CalculiX/kernels/gpus/LightSpMV-1.0/src/cLightSpMV.cu:99

1 0x0000000000dad7b9 in gpu_d_spmv (n=258, alpha=1, beta=0, A=0x0, x=0x1ef98, y=0x5c78, rowptr=0x0, colind=0x0)

at /tmp/work/Calculix_ccx_i8_pastix/PaStiX4CalculiX/build/kernels/gpu_d_spmv.c:35

2 0x0000000000a268a3 in d_gmres_gpu_smp (pastix_data=0xfd9d620, x=0x382b8, b=0x38ac8)

at /tmp/work/Calculix_ccx_i8_pastix/PaStiX4CalculiX/build/refinement/d_refine_gmres_gpu.c:270

3 0x0000000000a0b186 in pastix_subtask_refine (pastix_data=0xfd9d620, n=258, nrhs=1, b=0x160d5e30, ldb=258, x=0xf8edf80, ldx=258)

at /tmp/work/Calculix_ccx_i8_pastix/PaStiX4CalculiX/refinement/pastix_task_refine.c:182

4 0x0000000000a0b45d in pastix_task_refine (pastix_data=0xfd9d620, n=258, nrhs=1, b=0x160d5e30, ldb=258, x=0xf8edf80, ldx=258)

at /tmp/work/Calculix_ccx_i8_pastix/PaStiX4CalculiX/refinement/pastix_task_refine.c:303

5 0x00000000005f21fb in pastix_solve_generic (symmetryflag=<optimized out>, nrhs=<optimized out>, neq=<optimized out>, x=<optimized out>) at pastix.c:696

6 pastix_solve_generic (x=0xf8edf80, neq=0x7fffffffaaa0, symmetryflag=<optimized out>, nrhs=0x7fffffff95a0) at pastix.c:644

7 0x00000000005f2d0d in pastix_main_generic (nrhs=0x7fffffff95a0, nzs3=0x7fffffffaad0, jq=0xf8ebab0, inputformat=0x7fffffff9460, symmetryflag=0x7fffffff9458,

nzs=0x7fffffffaac0, neq=0x7fffffffaaa0, irow=0xf8f8c70, icol=0xf8e9c40, b=0xf8edf80, sigma=0x7fffffff95b0, aub=0x0, adb=0x0, au=0xf914440, ad=0xf8ecf40) at pastix.c:837

8 pastix_main_generic (ad=0xf8ecf40, au=0xf914440, adb=0x0, aub=0x0, sigma=0x7fffffff95b0, b=0xf8edf80, icol=0xf8e9c40, irow=0xf8f8c70, neq=0x7fffffffaaa0,

nzs=0x7fffffffaac0, symmetryflag=0x7fffffff9458, inputformat=0x7fffffff9460, jq=0xf8ebab0, nzs3=0x7fffffffaad0, nrhs=0x7fffffff95a0) at pastix.c:756

9 0x00000000005cec53 in linstatic (co=0xf8e6e50, nk=nk@entry=0x7fffffffa518, konp=konp@entry=0x7fffffffa358, ipkonp=ipkonp@entry=0x7fffffffa460,

lakonp=lakonp@entry=0x7fffffffa310, ne=ne@entry=0x7fffffffa520, nodeboun=0xf8e69a0, ndirboun=0xf8e6a10, xboun=0xf8e8bc0, nboun=0x7fffffffa528, ipompc=0x0, nodempc=, coefmpc=, labmpc=0xf8e8d30 "", nmpc=0x7fffffffa530, nodeforc=0xf8e8db0, ndirforc=0xf8e8e50, xforc=0xf8e8f40, nforc=0x7fffffffa538, nelemload=0xf8e9780, sideload=0xf8e97a0 "\270缬\252*", xload=0xf8e97c0, nload=0x7fffffffa540, nactdof=0xf8f0040, icolp=0x7fffffffa3a0, jq=0xf8ebab0, irowp=0x7fffffffa3c0, neq=0x7fffffffaaa0, nzl=0x7fffffffa570, nmethod=0x7fffffffa560, ikmpc=0x0, ilmpc=0x0, ikboun=0xf8e8c30, ilboun=0xf8e8ca0, elcon=0xf8e9a30, nelcon=0xf8e9a50, rhcon=0xf8e9a70, nrhcon=0xf8e9a90, alcon=0xf8e9b00, nalcon=0xf8e9b40, alzero=0xf8e9b60, ielmatp=0x7fffffffa408, ielorienp=0x7fffffffa410, norien=0x7fffffffa610, orab=0x0, ntmat=0x7fffffffa608, t0=0xf8ea9d0, t1=0xf8eaf00, t1old=0x0, ithermal=0x7fffffffaa00, prestr=0xf8eb960, iprestr=0x7fffffffa618, vold=0xf8eebb0, iperturb=0x7fffffffa9f0, sti=0xf8f27d0, nzs=0x7fffffffaac0, kode=0x7fffffffa620, filab=0xf8f1750 ' ' <repeats 200 times>..., eme=0xf8f5a20, iexpl=0x7fffffffa660, plicon=0x0, nplicon=0x0, plkcon=0x0, nplkcon=0x0, xstatep=0x7fffffffa950, npmat=0x7fffffffa680, matname=0xf8f16f0 "EL", ' ' <repeats 78 times>, isolver=0x7fffffffa628, mi=0x7fffffffaae0, ncmat=0x7fffffffa838, nstate=0x7fffffffa830, cs=0x0, mcs=0x7fffffffa708, nkon=0x7fffffffa650, enerp=0x7fffffffa960, xbounold=0xf8eba40, xforcold=0xf8e8ef0, xloadold=0x0, amname=0xf8ea8b0 "", amta=0xf8ea960, namta=0xf8ea990, nam=0x7fffffffa578, iamforc=0xf8e8ea0, iamload=0x0, iamt1=0xf8eb430, iamboun=0xf8e6aa0, ttime=0x7fffffffa9e8, output=0x7fffffffa2db "asc ", set=0xf8e9150 "NALLN", ' ' <repeats 76 times>, "EALLE", ' ' <repeats 76 times>, "FIXN", ' ' <repeats 34 times>..., nset=0x7fffffffa550, istartset=0xf8e92a0, iendset=0xf8e92d0, ialset=0xf8e9300, nprint=0x7fffffffa548, prlab=0xf8e9030 "U LRF LS L", prset=0xf8e9050 "NALLN", ' ' <repeats 76 times>, "NALLN", ' ' <repeats 76 times>, "EALLE", ' ' 
<repeats 33 times>..., nener=0x7fffffffa6d8, trab=0x0, inotr=0xf8e9e60, ntrans=0x7fffffffa688, fmpc=0x0, ipobody=0x0, ibody=0xf8e9800, xbody=0xf8e9820, nbody=0x7fffffffa598, xbodyold=0xf8e9840, timepar=0x7fffffffabd0, thicke=0x0, jobnamec=0x7fffffffaf60 "beam10p", tieset=0x0, ntie=0x7fffffffa6f8, istep=0x7fffffffa5e0, nmat=0x7fffffffa600, ielprop=0x0, prop=0x0, typeboun=0xf8e6a80 'B' <repeats 12 times>, mortar=0x7fffffffa640, mpcinfo=0x7fffffffab60, tietol=0x0, ics=0x0, icontact=0x7fffffffa760, orname=0x0, itempuser=0x7fffffffaa60) at linstatic.c:985

10 0x00000000004179ae in main (argc=<optimized out>, argv=<optimized out>) at ccx_2.17.c:1176

Is this a bug or a mistake from my side while setting up/compiling?

Any help is appreciated. Kind Regards, and thanks in advance

Dhondtguido commented 3 years ago

Hi,

that's correct. I just tried several examples and I always get a segmentation fault. I haven't tried the GPU for some time, since it doesn't bring any benefit on my machine.

Tobias, can you have a look at that? I get the segfault with version 2.17 and with my current development version.

Best Greetings,

Guido

On Wednesday, December 16, 2020 6:02:44 PM CET maxgit33 wrote:

Hello, I get a Segmentation fault while running ccx_2.17 with PaStiX4CalculiX Solver CUDA enabled and PASTIX_GPU=1 The input file is beam10p from the test tarball.

System: CENTOS 7.8 gcc 9.3.0 cuda 11.0.3

gdb shows:

Thread 1 "ccx_2.17_i8" received signal SIGSEGV, Segmentation fault.

0x000000000100167e in performLightLsMV (alpha=<optimized out>, dval=0xfffffffffffffff8, drowptr=0x0, dcolind=0xfffffffffffffff8, dx=0x1ef90, beta=0, dy=0x5c78) at /tmp/work/Calculix_ccx_i8_pastix/PaStiX4CalculiX/kernels/gpus/LightSpMV-1.0/src/cLightSpMV.cu:99
99 spmv->_rowOffsets[0] = drowptr;

(gdb) l

94 CHECK_CUSPARSE( cusparseSpMV(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
95 &alpha, matA, vecX, &beta, vecY, CUDA_R_64F,
96 CUSPARSE_MV_ALG_DEFAULT, dBuffer) )
97 */
98
99 spmv->_rowOffsets[0] = drowptr;
100 spmv->_colIndexValues[0] = dcolind;
101 spmv->_numericalValues[0] = dval;
102 spmv->_vectorX[0] = dx;
103 spmv->_vectorY[0] = dy;

(gdb) p drowptr

$1 = (int64_t *) 0x0

(gdb) where

#0 0x000000000100167e in performLightLsMV (alpha=<optimized out>,

dval=0xfffffffffffffff8, drowptr=0x0, dcolind=0xfffffffffffffff8, dx=0x1ef90, beta=0, dy=0x5c78) at /tmp/work/Calculix_ccx_i8_pastix/PaStiX4CalculiX/kernels/gpus/LightSpMV-1.0/src/cLightSpMV.cu:99
#1 0x0000000000dad7b9 in gpu_d_spmv (n=258, alpha=1, beta=0, A=0x0, x=0x1ef98, y=0x5c78, rowptr=0x0, colind=0x0) at /tmp/work/Calculix_ccx_i8_pastix/PaStiX4CalculiX/build/kernels/gpu_d_spmv.c:35
#2 0x0000000000a268a3 in d_gmres_gpu_smp (pastix_data=0xfd9d620, x=0x382b8, b=0x38ac8) at /tmp/work/Calculix_ccx_i8_pastix/PaStiX4CalculiX/build/refinement/d_refine_gmres_gpu.c:270
#3 0x0000000000a0b186 in pastix_subtask_refine (pastix_data=0xfd9d620, n=258, nrhs=1, b=0x160d5e30, ldb=258, x=0xf8edf80, ldx=258) at /tmp/work/Calculix_ccx_i8_pastix/PaStiX4CalculiX/refinement/pastix_task_refine.c:182
#4 0x0000000000a0b45d in pastix_task_refine (pastix_data=0xfd9d620, n=258, nrhs=1, b=0x160d5e30, ldb=258, x=0xf8edf80, ldx=258) at /tmp/work/Calculix_ccx_i8_pastix/PaStiX4CalculiX/refinement/pastix_task_refine.c:303
#5 0x00000000005f21fb in pastix_solve_generic (symmetryflag=<optimized out>, nrhs=<optimized out>, neq=<optimized out>, x=<optimized out>) at pastix.c:696
#6 pastix_solve_generic (x=0xf8edf80, neq=0x7fffffffaaa0, symmetryflag=<optimized out>, nrhs=0x7fffffff95a0) at pastix.c:644
#7 0x00000000005f2d0d in pastix_main_generic (nrhs=0x7fffffff95a0, nzs3=0x7fffffffaad0, jq=0xf8ebab0, inputformat=0x7fffffff9460, symmetryflag=0x7fffffff9458, nzs=0x7fffffffaac0, neq=0x7fffffffaaa0, irow=0xf8f8c70, icol=0xf8e9c40, b=0xf8edf80, sigma=0x7fffffff95b0, aub=0x0, adb=0x0, au=0xf914440, ad=0xf8ecf40) at pastix.c:837
#8 pastix_main_generic (ad=0xf8ecf40, au=0xf914440, adb=0x0, aub=0x0, sigma=0x7fffffff95b0, b=0xf8edf80, icol=0xf8e9c40, irow=0xf8f8c70, neq=0x7fffffffaaa0, nzs=0x7fffffffaac0, symmetryflag=0x7fffffff9458, inputformat=0x7fffffff9460, jq=0xf8ebab0, nzs3=0x7fffffffaad0, nrhs=0x7fffffff95a0) at pastix.c:756
#9 0x00000000005cec53 in linstatic (co=0xf8e6e50, nk=nk@entry=0x7fffffffa518, konp=konp@entry=0x7fffffffa358, ipkonp=ipkonp@entry=0x7fffffffa460, 
lakonp=lakonp@entry=0x7fffffffa310, ne=ne@entry=0x7fffffffa520, nodeboun=0xf8e69a0, ndirboun=0xf8e6a10, xboun=0xf8e8bc0, nboun=0x7fffffffa528, ipompc=0x0, nodempc=, coefmpc=, labmpc=0xf8e8d30 "", nmpc=0x7fffffffa530, nodeforc=0xf8e8db0, ndirforc=0xf8e8e50, xforc=0xf8e8f40, nforc=0x7fffffffa538, nelemload=0xf8e9780, sideload=0xf8e97a0 "\270缬\252*", xload=0xf8e97c0, nload=0x7fffffffa540, nactdof=0xf8f0040, icolp=0x7fffffffa3a0, jq=0xf8ebab0, irowp=0x7fffffffa3c0, neq=0x7fffffffaaa0, nzl=0x7fffffffa570, nmethod=0x7fffffffa560, ikmpc=0x0, ilmpc=0x0, ikboun=0xf8e8c30, ilboun=0xf8e8ca0, elcon=0xf8e9a30, nelcon=0xf8e9a50, rhcon=0xf8e9a70, nrhcon=0xf8e9a90, alcon=0xf8e9b00, nalcon=0xf8e9b40, alzero=0xf8e9b60, ielmatp=0x7fffffffa408, ielorienp=0x7fffffffa410, norien=0x7fffffffa610, orab=0x0, ntmat=0x7fffffffa608, t0=0xf8ea9d0, t1=0xf8eaf00, t1old=0x0, ithermal=0x7fffffffaa00, prestr=0xf8eb960, iprestr=0x7fffffffa618, vold=0xf8eebb0, iperturb=0x7fffffffa9f0, sti=0xf8f27d0, nzs=0x7fffffffaac0, kode=0x7fffffffa620, filab=0xf8f1750 ' ' <repeats 200 times>..., eme=0xf8f5a20, iexpl=0x7fffffffa660, plicon=0x0, nplicon=0x0, plkcon=0x0, nplkcon=0x0, xstatep=0x7fffffffa950, npmat=0x7fffffffa680, matname=0xf8f16f0 "EL", ' ' <repeats 78 times>, isolver=0x7fffffffa628, mi=0x7fffffffaae0, ncmat=0x7fffffffa838, nstate_=0x7fffffffa830, cs=0x0, mcs=0x7fffffffa708, nkon=0x7fffffffa650, enerp=0x7fffffffa960, xbounold=0xf8eba40, xforcold=0xf8e8ef0, xloadold=0x0, amname=0xf8ea8b0 "", amta=0xf8ea960, namta=0xf8ea990, nam=0x7fffffffa578, iamforc=0xf8e8ea0, iamload=0x0, iamt1=0xf8eb430, iamboun=0xf8e6aa0, ttime=0x7fffffffa9e8, output=0x7fffffffa2db "asc ", set=0xf8e9150 "NALLN", ' ' <repeats 76 times>, "EALLE", ' ' <repeats 76 times>, "FIXN", ' ' <repeats 34 times>..., nset=0x7fffffffa550, istartset=0xf8e92a0, iendset=0xf8e92d0, ialset=0xf8e9300, nprint=0x7fffffffa548, prlab=0xf8e9030 "U LRF LS
L", prset=0xf8e9050 "NALLN", ' ' <repeats 76 times>, "NALLN", ' ' <repeats 76 times>, "EALLE", ' ' <repeats 33 times>..., nener=0x7fffffffa6d8, trab=0x0, inotr=0xf8e9e60, ntrans=0x7fffffffa688, fmpc=0x0, ipobody=0x0, ibody=0xf8e9800, xbody=0xf8e9820, nbody=0x7fffffffa598, xbodyold=0xf8e9840, timepar=0x7fffffffabd0, thicke=0x0, jobnamec=0x7fffffffaf60 "beam10p", tieset=0x0, ntie=0x7fffffffa6f8, istep=0x7fffffffa5e0, nmat=0x7fffffffa600, ielprop=0x0, prop=0x0, typeboun=0xf8e6a80 'B' <repeats 12 times>, mortar=0x7fffffffa640, mpcinfo=0x7fffffffab60, tietol=0x0, ics=0x0, icontact=0x7fffffffa760, orname=0x0, itempuser=0x7fffffffaa60) at linstatic.c:985

#10 0x00000000004179ae in main (argc=<optimized out>, argv=<optimized out>) at ccx_2.17.c:1176

Is this a bug or a mistake from my side while setting up/compiling?

Any help is appreciated. Kind Regards, and thanks in advance

-- You are receiving this because you are subscribed to this thread. Reply to this email directly or view it on GitHub: https://github.com/Dhondtguido/PaStiX4CalculiX/issues/6

Kabbone commented 3 years ago

I will look into it further as soon as I have a CUDA machine available; that could possibly take a few weeks.

Kabbone commented 3 years ago

I have only taken a very brief look so far. Could you try setting PASTIX_REFINE_GPU=1 as an environment variable as well?

maxgit33 commented 3 years ago

> I have only taken a very brief look so far. Could you try setting PASTIX_REFINE_GPU=1 as an environment variable as well?

Thank you very much @Kabbone, this solves the problem. I get no segmentation faults with this additional environment variable.