JuliaGPU / AMDGPU.jl

AMD GPU (ROCm) programming in Julia
Other
283 stars 47 forks source link

Coredump on using AMDGPU #696

Open sakura-nyaa opened 1 week ago

sakura-nyaa commented 1 week ago

Hoping somebody who understands HIP/ROCm better than me can help me understand what's going on here. Using the version you get when you use "add AMDGPU", I get a core dump instantly. By going into src/discovery/discovery.jl and moving

        global libMIOpen_path = get_library(lib_prefix * "MIOpen"; rocm_path)

up to the top (it needs to come before libhsa gets loaded; placing it even one line below makes the core dumps return):

    try
        global libMIOpen_path = get_library(lib_prefix * "MIOpen"; rocm_path)
        global libhsaruntime = if Sys.islinux()
            get_library("libhsa-runtime64"; rocm_path, ext="so.1")
        else
            ""
        end

        # Linker.
        lld_path = get_ld_lld(rocm_path; from_artifact=false,
            artifact_library=:LLD_jll, artifact_field=:lld_path)
        lld_artifact = false
        if isempty(lld_path)
            lld_path = get_ld_lld(rocm_path; from_artifact=true,
                artifact_library=:LLD_jll, artifact_field=:lld_path)
            lld_artifact = true
        end
        global lld_path = lld_path
        global lld_artifact = lld_artifact

        # HIP.
        global libhip = get_library(
            Sys.islinux() ? "libamdhip64" : "amdhip64"; rocm_path)

        # Check if opaque pointers are enabled and turn off artifacts.
        llvm_args = get(ENV, "JULIA_LLVM_ARGS", "")
        enabled_opaque_pointers = occursin("-opaque-pointers", llvm_args)
        from_artifact = (
            # Detect HIP version, which will influence what device libraries to use.
            (isempty(libhip) || Base.thisminor(_hip_runtime_version()) > v"5.4")
            && !enabled_opaque_pointers)

        # If ROCm 5.5+ - use artifact device libraries.
        global libdevice_libs = get_device_libs(from_artifact; rocm_path)

        # HIP-based libraries.
        global librocblas = get_library(lib_prefix * "rocblas"; rocm_path)
        global librocsparse = get_library(lib_prefix * "rocsparse"; rocm_path)
        global librocsolver = get_library(lib_prefix * "rocsolver"; rocm_path)
        global librocrand = get_library(lib_prefix * "rocrand"; rocm_path)
        global librocfft = get_library(lib_prefix * "rocfft"; rocm_path)
    catch err
        @error """ROCm discovery failed!
        Discovered ROCm path: $rocm_path.
        Use `ROCM_PATH` env variable to specify ROCm directory.

        """ exception=(err, catch_backtrace())
    end

the core dumps stop and everything seems to work normally. Anybody have any ideas? Thanks for any help.

commit:

commit 4385ed941d5bf1b4818d8cd34ca090b8827d1ca4 (HEAD -> master, tag: v1.1.0, origin/master, origin/HEAD)
Author: Anton Smirnov <tonysmn97@gmail.com>
Date:   Sat Nov 9 14:50:57 2024 +0200

    Bump to 1.1.0

AMDGPU.versioninfo()

julia> AMDGPU.versioninfo()
[ Info: AMDGPU versioninfo
┌───────────┬──────────────────┬───────────┬─────────────────────────────────────────────────────────────────────────────────────────┐
│ Available │ Name             │ Version   │ Path                                                                                    │
├───────────┼──────────────────┼───────────┼─────────────────────────────────────────────────────────────────────────────────────────┤
│     +     │ LLD              │ -         │ /opt/rocm/llvm/bin/ld.lld                                                               │
│     +     │ Device Libraries │ -         │ /home/goopnook/.julia/artifacts/5ad5ecb46e3c334821f54c1feecc6c152b7b6a45/amdgcn/bitcode │
│     +     │ HIP              │ 6.2.41134 │ /opt/rocm/lib/../lib/libamdhip64.so.6                                                   │
│     +     │ rocBLAS          │ 4.2.1     │ /opt/rocm/lib/../lib/librocblas.so.4                                                    │
│     +     │ rocSOLVER        │ 3.26.0    │ /opt/rocm/lib/librocsolver.so                                                           │
│     +     │ rocSPARSE        │ -         │ /opt/rocm/lib/librocsparse.so                                                           │
│     +     │ rocRAND          │ 2.10.5    │ /opt/rocm/lib/librocrand.so                                                             │
│     +     │ rocFFT           │ 1.0.27    │ /opt/rocm/lib/librocfft.so                                                              │
│     +     │ MIOpen           │ 3.2.0     │ /opt/rocm/lib/libMIOpen.so                                                              │
└───────────┴──────────────────┴───────────┴─────────────────────────────────────────────────────────────────────────────────────────┘

[ Info: AMDGPU devices
┌────┬───────────────────────┬──────────┬───────────┬────────────┬───────────────┐
│ Id │                  Name │ GCN arch │ Wavefront │     Memory │ Shared Memory │
├────┼───────────────────────┼──────────┼───────────┼────────────┼───────────────┤
│  1 │ AMD Radeon RX 7600 XT │  gfx1102 │        32 │ 15.984 GiB │    64.000 KiB │
└────┴───────────────────────┴──────────┴───────────┴────────────┴───────────────┘

GDB backtrace:

julia> using AMDGPU
[Detaching after vfork from child process 19143]
[New Thread 0x7fffa34006c0 (LWP 19144)]
[New Thread 0x7ffea28006c0 (LWP 19145)]
[Thread 0x7ffea28006c0 (LWP 19145) exited]
julia: /usr/src/debug/hip-runtime/clr-rocm-6.2.2/hipamd/src/hip_code_object.cpp:1152: hip::FatBinaryInfo** hip::StatCO::addFatBinary(const void*, bool): Assertion `err == hipSuccess' failed.

Thread 1 "julia" received signal SIGABRT, Aborted.
0x00007ffff7e383f4 in ?? () from /usr/lib/libc.so.6
(gdb) bt
#0  0x00007ffff7e383f4 in ?? () from /usr/lib/libc.so.6
#1  0x00007ffff7ddf120 in raise () from /usr/lib/libc.so.6
#2  0x00007ffff7dc64c3 in abort () from /usr/lib/libc.so.6
#3  0x00007ffff7dc63df in ?? () from /usr/lib/libc.so.6
#4  0x00007ffff7dd7177 in __assert_fail () from /usr/lib/libc.so.6
#5  0x00007fffae250955 in ?? () from /opt/rocm/lib/libamdhip64.so
#6  0x00007ffe8cd6b91d in ?? () from /opt/rocm/lib/libMIOpen.so
#7  0x00007ffff7fcb5b7 in ?? () from /lib64/ld-linux-x86-64.so.2
#8  0x00007ffff7fcb6ad in ?? () from /lib64/ld-linux-x86-64.so.2
#9  0x00007ffff7fc85c2 in _dl_catch_exception () from /lib64/ld-linux-x86-64.so.2
#10 0x00007ffff7fd24fc in ?? () from /lib64/ld-linux-x86-64.so.2
#11 0x00007ffff7fc8523 in _dl_catch_exception () from /lib64/ld-linux-x86-64.so.2
#12 0x00007ffff7fd2904 in ?? () from /lib64/ld-linux-x86-64.so.2
#13 0x00007ffff7e31f14 in ?? () from /usr/lib/libc.so.6
#14 0x00007ffff7fc8523 in _dl_catch_exception () from /lib64/ld-linux-x86-64.so.2
#15 0x00007ffff7fc8679 in ?? () from /lib64/ld-linux-x86-64.so.2
#16 0x00007ffff7e319f3 in ?? () from /usr/lib/libc.so.6
#17 0x00007ffff7e31fcf in dlopen () from /usr/lib/libc.so.6
#18 0x00007ffff6c723f7 in ijl_dlopen (filename=<optimized out>, flags=<optimized out>) at /cache/build/builder-demeter6-6/julialang/julia-master/src/dlload.c:200
#19 0x00007ffff6c724f6 in ijl_load_dynamic_library (modname=0x7fffeb9b4298 "libMIOpen", flags=4, throw_err=0) at /cache/build/builder-demeter6-6/julialang/julia-master/src/dlload.c:365
#20 0x00007fffe1251aaa in julia_#dlopen#3_49859 () at libdl.jl:120
#21 0x00007fffaf628cdb in dlopen () at libdl.jl:119
#22 julia_find_library_7656 () at libdl.jl:209
#23 0x00007fffaf612f1c in find_library () at libdl.jl:217
#24 find_library () at libdl.jl:217
--------------------------------------------------------------------------------
#25 japi1_find_rocm_library_5949 () at /home/neil/.julia/dev/AMDGPU/src/discovery/utils.jl:109
#26 0x00007fffaf626f57 in #get_library#3 () at /home/neil/.julia/dev/AMDGPU/src/discovery/discovery.jl:16
#27 get_library () at /home/neil/.julia/dev/AMDGPU/src/discovery/discovery.jl:15
#28 julia___init___6051 () at /home/neil/.julia/dev/AMDGPU/src/discovery/discovery.jl:111
#29 0x00007fffaf612859 in jfptr___init___6052 () from /home/neil/.julia/compiled/v1.11/AMDGPU/arpZD_5tx5N.so
#30 0x00007ffff6c8d132 in jl_apply (nargs=1, args=0x7fffffff9c08) at /cache/build/builder-demeter6-6/julialang/julia-master/src/julia.h:2157
#31 jl_module_run_initializer (m=0x7fffaf68b560 <jl_system_image_data+333280>) at /cache/build/builder-demeter6-6/julialang/julia-master/src/toplevel.c:76
#32 0x00007fffe1927a49 in julia_run_module_init_69426 () at loading.jl:1336
#33 0x00007fffe1b4541b in julia_register_restored_modules_69409 () at loading.jl:1324
#34 0x00007fffe29eb390 in julia_#_include_from_serialized#1066_69268 () at loading.jl:1213
#35 0x00007fffe2300613 in _include_from_serialized () at loading.jl:1169
#36 _include_from_serialized () at loading.jl:1169
#37 julia_#_require_search_from_serialized#1077_69714 () at loading.jl:1985
#38 0x00007fffe213e38c in julia__require_search_from_serialized_44532 () at loading.jl:1908
#39 0x00007fffe2ab083c in jfptr.require_search_from_serialized_44533 () from /home/neil/.julia/juliaup/julia-1.11.1+0.x64.linux.gnu/lib/julia/sys.so
#40 0x00007fffe12b8b12 in julia__require_69981 () at loading.jl:2450
#41 0x00007fffe17ab2fa in julia___require_prelocked_69876 () at loading.jl:2315
#42 0x00007fffe1cf8933 in jfptr___require_prelocked_69877.1 () from /home/neil/.julia/juliaup/julia-1.11.1+0.x64.linux.gnu/lib/julia/sys.so
#43 0x00007ffff6c5f97a in jl_apply (nargs=3, args=0x7fffffffcde0) at /cache/build/builder-demeter6-6/julialang/julia-master/src/julia.h:2157
#44 jl_f__call_in_world (F=<optimized out>, args=0x7fffffffcdd8, nargs=4) at /cache/build/builder-demeter6-6/julialang/julia-master/src/builtins.c:894
#45 0x00007fffe1ccd235 in #invoke_in_world#3 () at essentials.jl:1089
#46 invoke_in_world () at essentials.jl:1086
#47 julia__require_prelocked_69874 () at loading.jl:2302
#48 0x00007fffe21260e9 in macro expansion () at loading.jl:2241
#49 macro expansion () at lock.jl:273
#50 julia___require_69813 () at loading.jl:2198
#51 0x00007fffe26d29b3 in jfptr___require_69814.1 () from /home/neil/.julia/juliaup/julia-1.11.1+0.x64.linux.gnu/lib/julia/sys.so
#52 0x00007ffff6c5f97a in jl_apply (nargs=3, args=0x7fffffffd3d0) at /cache/build/builder-demeter6-6/julialang/julia-master/src/julia.h:2157
#53 jl_f__call_in_world (F=<optimized out>, args=0x7fffffffd3c8, nargs=4) at /cache/build/builder-demeter6-6/julialang/julia-master/src/builtins.c:894
#54 0x00007fffe2925e2f in #invoke_in_world#3 () at essentials.jl:1089
#55 invoke_in_world () at essentials.jl:1086
#56 julia_require_69802 () at loading.jl:2191
#57 0x00007fffe1ff71c3 in jfptr_require_69803 () from /home/neil/.julia/juliaup/julia-1.11.1+0.x64.linux.gnu/lib/julia/sys.so
#58 0x00007ffff6c8cd54 in jl_apply (nargs=3, args=0x7fffffffd440) at /cache/build/builder-demeter6-6/julialang/julia-master/src/julia.h:2157
#59 call_require (var=0x7fffef3a0c28, mod=0x7fffe5f9b870 <jl_system_image_data+50367280>) at /cache/build/builder-demeter6-6/julialang/julia-master/src/toplevel.c:486
#60 eval_import_path (where=where@entry=0x7fffe5f9b870 <jl_system_image_data+50367280>, from=from@entry=0x0, args=0x7fffed1ec8b0, name=name@entry=0x7fffffffd510, keyword=keyword@entry=0x7ffff6e29346 "using")
    at /cache/build/builder-demeter6-6/julialang/julia-master/src/toplevel.c:523
#61 0x00007ffff6c8ed21 in jl_toplevel_eval_flex (m=m@entry=0x7fffe5f9b870 <jl_system_image_data+50367280>, e=<optimized out>, fast=fast@entry=1, expanded=expanded@entry=0) at /cache/build/builder-demeter6-6/julialang/julia-master/src/toplevel.c:759
#62 0x00007ffff6c8e92a in jl_toplevel_eval_flex (m=m@entry=0x7fffe5f9b870 <jl_system_image_data+50367280>, e=e@entry=0x7fffed1ee3f0, fast=fast@entry=1, expanded=expanded@entry=0) at /cache/build/builder-demeter6-6/julialang/julia-master/src/toplevel.c:886
#63 0x00007ffff6c8f40c in ijl_toplevel_eval (m=m@entry=0x7fffe5f9b870 <jl_system_image_data+50367280>, v=v@entry=0x7fffed1ee3f0) at /cache/build/builder-demeter6-6/julialang/julia-master/src/toplevel.c:952
#64 0x00007ffff6c8f996 in ijl_toplevel_eval_in (m=0x7fffe5f9b870 <jl_system_image_data+50367280>, ex=0x7fffed1ee3f0) at /cache/build/builder-demeter6-6/julialang/julia-master/src/toplevel.c:994
#65 0x00007fffd0e536d8 in eval () at boot.jl:430
#66 japi1_eval_user_input_9990 () at /cache/build/builder-demeter6-6/julialang/julia-master/usr/share/julia/stdlib/v1.11/REPL/src/REPL.jl:245
#67 0x00007fffd0e73b02 in julia_repl_backend_loop_10025 () at /cache/build/builder-demeter6-6/julialang/julia-master/usr/share/julia/stdlib/v1.11/REPL/src/REPL.jl:342
--Type <RET> for more, q to quit, c to continue without paging--c
#68 0x00007fffd0e6d26d in japi1_#start_repl_backend#59_10022 () at /cache/build/builder-demeter6-6/julialang/julia-master/usr/share/julia/stdlib/v1.11/REPL/src/REPL.jl:327
#69 0x00007fffd0ce871f in japi1_start_repl_backend_10632 () at /cache/build/builder-demeter6-6/julialang/julia-master/usr/share/julia/stdlib/v1.11/REPL/src/REPL.jl:324
#70 0x00007fffd0deb1ab in julia_#run_repl#72_10096 () at /cache/build/builder-demeter6-6/julialang/julia-master/usr/share/julia/stdlib/v1.11/REPL/src/REPL.jl:483
#71 0x00007fffd0d749ed in julia_run_repl_10087 () at /cache/build/builder-demeter6-6/julialang/julia-master/usr/share/julia/stdlib/v1.11/REPL/src/REPL.jl:469
#72 0x00007fffd0e1d063 in jfptr_run_repl_10088 () from /home/neil/.julia/juliaup/julia-1.11.1+0.x64.linux.gnu/share/julia/compiled/v1.11/REPL/u0gqU_GYsA8.so
#73 0x00007fffd0cac40a in julia_#1139_14648 () at client.jl:446
#74 0x00007fffd0e6ead8 in jfptr_YY.1139_14649 () from /home/neil/.julia/juliaup/julia-1.11.1+0.x64.linux.gnu/share/julia/compiled/v1.11/REPL/u0gqU_GYsA8.so
#75 0x00007ffff6c5f82a in jl_apply (nargs=2, args=0x7fffffffe580) at /cache/build/builder-demeter6-6/julialang/julia-master/src/julia.h:2157
#76 jl_f__call_latest (F=<optimized out>, args=0x7fffffffe580, nargs=2) at /cache/build/builder-demeter6-6/julialang/julia-master/src/builtins.c:875
#77 0x00007fffe1544387 in #invokelatest#2 () at essentials.jl:1055
#78 invokelatest () at essentials.jl:1052
#79 julia_run_main_repl_72104 () at client.jl:430
#80 0x00007fffe20498d5 in repl_main () at client.jl:567
#81 julia__start_72143 () at client.jl:541
#82 0x00007fffe1b194a4 in jfptr.start_72144 () from /home/neil/.julia/juliaup/julia-1.11.1+0.x64.linux.gnu/lib/julia/sys.so
#83 0x00007ffff6cc4ca6 in jl_apply (nargs=1, args=0x7fffffffe8e8) at /cache/build/builder-demeter6-6/julialang/julia-master/src/julia.h:2157
#84 true_main (argc=<optimized out>, argv=<optimized out>) at /cache/build/builder-demeter6-6/julialang/julia-master/src/jlapi.c:900
#85 0x00007ffff6cc573f in jl_repl_entrypoint (argc=<optimized out>, argv=0x7fffffffecc8) at /cache/build/builder-demeter6-6/julialang/julia-master/src/jlapi.c:1059
#86 0x0000000000401089 in main (argc=<optimized out>, argv=<optimized out>) at /cache/build/builder-demeter6-6/julialang/julia-master/cli/loader_exe.c:58

rocminfo:

=====================    
HSA System Attributes    
=====================    
Runtime Version:         1.1
Runtime Ext Version:     1.6
System Timestamp Freq.:  1000.000000MHz
Sig. Max Wait Duration:  18446744073709551615 (0xFFFFFFFFFFFFFFFF) (timestamp count)
Machine Model:           LARGE                              
System Endianness:       LITTLE                             
Mwaitx:                  DISABLED
DMAbuf Support:          YES

==========               
HSA Agents               
==========               
*******                  
Agent 1                  
*******                  
  Name:                    Intel(R) Core(TM) i3-6100 CPU @ 3.70GHz
  Uuid:                    CPU-XX                             
  Marketing Name:          Intel(R) Core(TM) i3-6100 CPU @ 3.70GHz
  Vendor Name:             CPU                                
  Feature:                 None specified                     
  Profile:                 FULL_PROFILE                       
  Float Round Mode:        NEAR                               
  Max Queue Number:        0(0x0)                             
  Queue Min Size:          0(0x0)                             
  Queue Max Size:          0(0x0)                             
  Queue Type:              MULTI                              
  Node:                    0                                  
  Device Type:             CPU                                
  Cache Info:              
    L1:                      32768(0x8000) KB                   
  Chip ID:                 0(0x0)                             
  ASIC Revision:           0(0x0)                             
  Cacheline Size:          64(0x40)                           
  Max Clock Freq. (MHz):   3700                               
  BDFID:                   0                                  
  Internal Node ID:        0                                  
  Compute Unit:            4                                  
  SIMDs per CU:            0                                  
  Shader Engines:          0                                  
  Shader Arrs. per Eng.:   0                                  
  WatchPts on Addr. Ranges:1                                  
  Memory Properties:       
  Features:                None
  Pool Info:               
    Pool 1                   
      Segment:                 GLOBAL; FLAGS: FINE GRAINED        
      Size:                    16318556(0xf9005c) KB              
      Allocatable:             TRUE                               
      Alloc Granule:           4KB                                
      Alloc Recommended Granule:4KB                                
      Alloc Alignment:         4KB                                
      Accessible by all:       TRUE                               
    Pool 2                   
      Segment:                 GLOBAL; FLAGS: KERNARG, FINE GRAINED
      Size:                    16318556(0xf9005c) KB              
      Allocatable:             TRUE                               
      Alloc Granule:           4KB                                
      Alloc Recommended Granule:4KB                                
      Alloc Alignment:         4KB                                
      Accessible by all:       TRUE                               
    Pool 3                   
      Segment:                 GLOBAL; FLAGS: COARSE GRAINED      
      Size:                    16318556(0xf9005c) KB              
      Allocatable:             TRUE                               
      Alloc Granule:           4KB                                
      Alloc Recommended Granule:4KB                                
      Alloc Alignment:         4KB                                
      Accessible by all:       TRUE                               
  ISA Info:                
*******                  
Agent 2                  
*******                  
  Name:                    gfx1102                            
  Uuid:                    GPU-XX                             
  Marketing Name:          AMD Radeon RX 7600 XT              
  Vendor Name:             AMD                                
  Feature:                 KERNEL_DISPATCH                    
  Profile:                 BASE_PROFILE                       
  Float Round Mode:        NEAR                               
  Max Queue Number:        128(0x80)                          
  Queue Min Size:          64(0x40)                           
  Queue Max Size:          131072(0x20000)                    
  Queue Type:              MULTI                              
  Node:                    1                                  
  Device Type:             GPU                                
  Cache Info:              
    L1:                      32(0x20) KB                        
    L2:                      2048(0x800) KB                     
  Chip ID:                 29824(0x7480)                      
  ASIC Revision:           0(0x0)                             
  Cacheline Size:          64(0x40)                           
  Max Clock Freq. (MHz):   2539                               
  BDFID:                   768                                
  Internal Node ID:        1                                  
  Compute Unit:            32                                 
  SIMDs per CU:            2                                  
  Shader Engines:          2                                  
  Shader Arrs. per Eng.:   2                                  
  WatchPts on Addr. Ranges:4                                  
  Coherent Host Access:    FALSE                              
  Memory Properties:       
  Features:                KERNEL_DISPATCH 
  Fast F16 Operation:      TRUE                               
  Wavefront Size:          32(0x20)                           
  Workgroup Max Size:      1024(0x400)                        
  Workgroup Max Size per Dimension:
    x                        1024(0x400)                        
    y                        1024(0x400)                        
    z                        1024(0x400)                        
  Max Waves Per CU:        32(0x20)                           
  Max Work-item Per CU:    1024(0x400)                        
  Grid Max Size:           4294967295(0xffffffff)             
  Grid Max Size per Dimension:
    x                        4294967295(0xffffffff)             
    y                        4294967295(0xffffffff)             
    z                        4294967295(0xffffffff)             
  Max fbarriers/Workgrp:   32                                 
  Packet Processor uCode:: 372                                
  SDMA engine uCode::      21                                 
  IOMMU Support::          None                               
  Pool Info:               
    Pool 1                   
      Segment:                 GLOBAL; FLAGS: COARSE GRAINED      
      Size:                    16760832(0xffc000) KB              
      Allocatable:             TRUE                               
      Alloc Granule:           4KB                                
      Alloc Recommended Granule:2048KB                             
      Alloc Alignment:         4KB                                
      Accessible by all:       FALSE                              
    Pool 2                   
      Segment:                 GLOBAL; FLAGS: EXTENDED FINE GRAINED
      Size:                    16760832(0xffc000) KB              
      Allocatable:             TRUE                               
      Alloc Granule:           4KB                                
      Alloc Recommended Granule:2048KB                             
      Alloc Alignment:         4KB                                
      Accessible by all:       FALSE                              
    Pool 3                   
      Segment:                 GROUP                              
      Size:                    64(0x40) KB                        
      Allocatable:             FALSE                              
      Alloc Granule:           0KB                                
      Alloc Recommended Granule:0KB                                
      Alloc Alignment:         0KB                                
      Accessible by all:       FALSE                              
  ISA Info:                
    ISA 1                    
      Name:                    amdgcn-amd-amdhsa--gfx1102         
      Machine Models:          HSA_MACHINE_MODEL_LARGE            
      Profiles:                HSA_PROFILE_BASE                   
      Default Rounding Mode:   NEAR                               
      Default Rounding Mode:   NEAR                               
      Fast f16:                TRUE                               
      Workgroup Max Size:      1024(0x400)                        
      Workgroup Max Size per Dimension:
        x                        1024(0x400)                        
        y                        1024(0x400)                        
        z                        1024(0x400)                        
      Grid Max Size:           4294967295(0xffffffff)             
      Grid Max Size per Dimension:
        x                        4294967295(0xffffffff)             
        y                        4294967295(0xffffffff)             
        z                        4294967295(0xffffffff)             
      FBarrier Max Size:       32                                 
*** Done ***             
pxl-th commented 1 week ago

What OS are you on? Is this an official build of ROCm? I'd take a look at output of libtree on libMIOpen.so:

pxl-th@Tower:~$ libtree /opt/rocm-6.2.2/lib/libMIOpen.so
libMIOpen.so.1 
├── libhiprtc.so.6 [runpath]
│   └── libnuma.so.1 [ld.so.conf]
├── libamdhip64.so.6 [runpath]
│   ├── librocprofiler-register.so.0 [runpath]
│   ├── libamd_comgr.so.2 [runpath]
│   │   ├── libz.so.1 [ld.so.conf]
│   │   ├── libtinfo.so.6 [ld.so.conf]
│   │   └── libzstd.so.1 [ld.so.conf]
│   ├── libhsa-runtime64.so.1 [runpath]
│   │   ├── librocprofiler-register.so.0 [runpath]
│   │   ├── libdrm.so.2 [ld.so.conf]
│   │   ├── libdrm_amdgpu.so.1 [ld.so.conf]
│   │   │   └── libdrm.so.2 [ld.so.conf]
│   │   ├── libelf.so.1 [ld.so.conf]
│   │   │   ├── libz.so.1 [ld.so.conf]
│   │   │   └── libzstd.so.1 [ld.so.conf]
│   │   └── libnuma.so.1 [ld.so.conf]
│   └── libnuma.so.1 [ld.so.conf]
├── libroctx64.so.4 [runpath]
├── librocblas.so.4 [runpath]
│   └── libamdhip64.so.6 [runpath]
├── librocm-core.so.1 [runpath]
├── libamd_comgr.so.2 [runpath]
└── libzstd.so.1 [ld.so.conf]
laochailan commented 2 days ago

I run into the same problem on Arch Linux. Previously my setup worked, but I think it stopped working after a ROCm update.

When I tried libtree, I noticed libmiopen was not actually installed. Maybe the ROCm packages were split up and a dependency is missing. Installing miopen did not fix the issue, but it gives this libtree output:

libMIOpen.so.1
├── libhiprtc.so.6 [runpath]
├── libamdhip64.so.6 [runpath]
│   ├── librocprofiler-register.so.0 [runpath]
│   │   ├── libfmt.so.11 [default path]
│   │   └── libglog.so.2 [default path]
│   │       └── libgflags.so.2.2 [default path]
│   ├── libamd_comgr.so.2 [runpath]
│   │   ├── libz.so.1 [default path]
│   │   ├── libncursesw.so.6 [default path]
│   │   └── libzstd.so.1 [default path]
│   ├── libhsa-runtime64.so.1 [runpath]
│   │   ├── libhsakmt.so.1 [ld.so.conf]
│   │   │   ├── libdrm.so.2 [default path]
│   │   │   ├── libnuma.so.1 [default path]
│   │   │   └── libdrm_amdgpu.so.1 [default path]
│   │   │       └── libdrm.so.2 [default path]
│   │   ├── libelf.so.1 [default path]
│   │   │   ├── libz.so.1 [default path]
│   │   │   └── libzstd.so.1 [default path]
│   │   └── libdrm.so.2 [default path]
│   └── libnuma.so.1 [default path]
├── libroctx64.so.4 [runpath]
├── libamd_comgr.so.2 [runpath]
├── librocblas.so.4 [runpath]
│   └── libamdhip64.so.6 [runpath]
├── libbz2.so.1.0 [default path]
└── libsqlite3.so.0 [default path]
pxl-th commented 2 days ago

Hi, @laochailan. Can you try moving:

global libMIOpen_path = get_library(lib_prefix * "MIOpen"; rocm_path)

before line:

global libhsaruntime = if Sys.islinux()
            get_library("libhsa-runtime64"; rocm_path, ext="so.1")
else
            ""
end

in src/discovery/discovery.jl file and see if it also helps you?

ffrancesco94 commented 1 day ago

Also on Arch and also having the same issue. Moving the libMIOpen_path line doesn't seem to fix it.

ffrancesco94 commented 1 day ago

Update: moving the discovery of all libraries (rocblas, rocfft, rocsolver, etc.) before the hsaruntime one does the trick. Not sure what changed. I don't know what effect this might have on other platforms, but if you don't think it affects anything, I can submit a PR. EDIT: while I can allocate arrays on the GPU, even trying to multiply gives this core dump:

julia: /usr/src/debug/hip-runtime/clr-rocm-6.2.2/hipamd/src/hip_code_object.cpp:1152: hip::FatBinaryInfo** hip::StatCO::addFatBinary(const void*, bool): Assertion `err == hipSuccess' failed.

[323653] signal 6 (-6): Aborted
in expression starting at REPL[3]:1
unknown function (ip: 0x7e1f4f62d3f4)
gsignal at /usr/bin/../lib/libc.so.6 (unknown line)
abort at /usr/bin/../lib/libc.so.6 (unknown line)
unknown function (ip: 0x7e1f4f5bb3de)
__assert_fail at /usr/bin/../lib/libc.so.6 (unknown line)
unknown function (ip: 0x7e1ef6a50954)
unknown function (ip: 0x7e1e766ec8a8)
unknown function (ip: 0x7e1f4f79e5b6)
unknown function (ip: 0x7e1f4f79e6ac)
_dl_catch_exception at /lib64/ld-linux-x86-64.so.2 (unknown line)
unknown function (ip: 0x7e1f4f7a54fb)
_dl_catch_exception at /lib64/ld-linux-x86-64.so.2 (unknown line)
unknown function (ip: 0x7e1f4f7a5903)
unknown function (ip: 0x7e1f4f626f13)
_dl_catch_exception at /lib64/ld-linux-x86-64.so.2 (unknown line)
unknown function (ip: 0x7e1f4f79b678)
unknown function (ip: 0x7e1f4f6269f2)
dlopen at /usr/bin/../lib/libc.so.6 (unknown line)
ijl_load_dynamic_library at /cache/build/builder-demeter6-6/julialang/julia-master/src/dlload.c:365
jl_get_library_ at /cache/build/builder-demeter6-6/julialang/julia-master/src/runtime_ccall.cpp:45 [inlined]
jl_get_library_ at /cache/build/builder-demeter6-6/julialang/julia-master/src/runtime_ccall.cpp:29
ijl_lazy_load_and_lookup at /cache/build/builder-demeter6-6/julialang/julia-master/src/runtime_ccall.cpp:73
macro expansion at /home/fra/.julia/packages/AMDGPU/yqCEl/src/utils.jl:134 [inlined]
rocblas_create_handle at /home/fra/.julia/packages/AMDGPU/yqCEl/src/blas/librocblas.jl:230
macro expansion at /home/fra/.julia/packages/AMDGPU/yqCEl/src/utils.jl:134 [inlined]
create_handle at /home/fra/.julia/packages/AMDGPU/yqCEl/src/blas/rocBLAS.jl:36 [inlined]
#14 at /home/fra/.julia/packages/AMDGPU/yqCEl/src/cache.jl:103 [inlined]
#5 at /home/fra/.julia/packages/AMDGPU/yqCEl/src/cache.jl:29
lock at ./lock.jl:232
check_cache at /home/fra/.julia/packages/AMDGPU/yqCEl/src/cache.jl:27 [inlined]
pop! at /home/fra/.julia/packages/AMDGPU/yqCEl/src/cache.jl:48 [inlined]
new_state at /home/fra/.julia/packages/AMDGPU/yqCEl/src/cache.jl:102
#18 at /home/fra/.julia/packages/AMDGPU/yqCEl/src/cache.jl:115 [inlined]
get! at ./dict.jl:458
library_state at /home/fra/.julia/packages/AMDGPU/yqCEl/src/cache.jl:115
lib_state at /home/fra/.julia/packages/AMDGPU/yqCEl/src/blas/rocBLAS.jl:48 [inlined]
gemm! at /home/fra/.julia/packages/AMDGPU/yqCEl/src/blas/wrappers.jl:562 [inlined]
generic_matmatmul! at /home/fra/.julia/packages/AMDGPU/yqCEl/src/blas/highlevel.jl:178
generic_matmatmul! at /home/fra/.julia/packages/AMDGPU/yqCEl/src/blas/highlevel.jl:148 [inlined]
_mul! at /cache/build/builder-demeter6-6/julialang/julia-master/usr/share/julia/stdlib/v1.11/LinearAlgebra/src/matmul.jl:287 [inlined]
mul! at /cache/build/builder-demeter6-6/julialang/julia-master/usr/share/julia/stdlib/v1.11/LinearAlgebra/src/matmul.jl:285 [inlined]
mul! at /cache/build/builder-demeter6-6/julialang/julia-master/usr/share/julia/stdlib/v1.11/LinearAlgebra/src/matmul.jl:253 [inlined]
* at /cache/build/builder-demeter6-6/julialang/julia-master/usr/share/julia/stdlib/v1.11/LinearAlgebra/src/matmul.jl:124
unknown function (ip: 0x7e1f42f27da6)
jl_apply at /cache/build/builder-demeter6-6/julialang/julia-master/src/julia.h:2157 [inlined]
do_call at /cache/build/builder-demeter6-6/julialang/julia-master/src/interpreter.c:126
eval_value at /cache/build/builder-demeter6-6/julialang/julia-master/src/interpreter.c:223
eval_stmt_value at /cache/build/builder-demeter6-6/julialang/julia-master/src/interpreter.c:174 [inlined]
eval_body at /cache/build/builder-demeter6-6/julialang/julia-master/src/interpreter.c:663
jl_interpret_toplevel_thunk at /cache/build/builder-demeter6-6/julialang/julia-master/src/interpreter.c:821
jl_toplevel_eval_flex at /cache/build/builder-demeter6-6/julialang/julia-master/src/toplevel.c:943
jl_toplevel_eval_flex at /cache/build/builder-demeter6-6/julialang/julia-master/src/toplevel.c:886
ijl_toplevel_eval_in at /cache/build/builder-demeter6-6/julialang/julia-master/src/toplevel.c:994
eval at ./boot.jl:430 [inlined]
eval_user_input at /cache/build/builder-demeter6-6/julialang/julia-master/usr/share/julia/stdlib/v1.11/REPL/src/REPL.jl:245
repl_backend_loop at /cache/build/builder-demeter6-6/julialang/julia-master/usr/share/julia/stdlib/v1.11/REPL/src/REPL.jl:342
#start_repl_backend#59 at /cache/build/builder-demeter6-6/julialang/julia-master/usr/share/julia/stdlib/v1.11/REPL/src/REPL.jl:327
start_repl_backend at /cache/build/builder-demeter6-6/julialang/julia-master/usr/share/julia/stdlib/v1.11/REPL/src/REPL.jl:324
#run_repl#72 at /cache/build/builder-demeter6-6/julialang/julia-master/usr/share/julia/stdlib/v1.11/REPL/src/REPL.jl:483
run_repl at /cache/build/builder-demeter6-6/julialang/julia-master/usr/share/julia/stdlib/v1.11/REPL/src/REPL.jl:469
jfptr_run_repl_10088 at /usr/share/julia/compiled/v1.11/REPL/u0gqU_GYsA8.so (unknown line)
#1139 at ./client.jl:446
jfptr_YY.1139_14649 at /usr/share/julia/compiled/v1.11/REPL/u0gqU_GYsA8.so (unknown line)
jl_apply at /cache/build/builder-demeter6-6/julialang/julia-master/src/julia.h:2157 [inlined]
jl_f__call_latest at /cache/build/builder-demeter6-6/julialang/julia-master/src/builtins.c:875
#invokelatest#2 at ./essentials.jl:1055 [inlined]
invokelatest at ./essentials.jl:1052 [inlined]
run_main_repl at ./client.jl:430
repl_main at ./client.jl:567 [inlined]
_start at ./client.jl:541
jfptr__start_72144.1 at /usr/lib/julia/sys.so (unknown line)
jl_apply at /cache/build/builder-demeter6-6/julialang/julia-master/src/julia.h:2157 [inlined]
true_main at /cache/build/builder-demeter6-6/julialang/julia-master/src/jlapi.c:900
jl_repl_entrypoint at /cache/build/builder-demeter6-6/julialang/julia-master/src/jlapi.c:1059
main at julia (unknown line)
unknown function (ip: 0x7e1f4f5bce07)
__libc_start_main at /usr/bin/../lib/libc.so.6 (unknown line)
unknown function (ip: 0x4010b8)
Allocations: 6981943 (Pool: 6981676; Big: 267); GC: 9
zsh: IOT instruction (core dumped)  julia

It seems to be the rocblas call that is giving issues. If I do elementwise multiplication it works. However, upon calling exit(), I get a segfault. Definitely something fishy going on.

ffrancesco94 commented 1 day ago

Whatever it is got solved by downgrading ROCm to 6.0.2. Don't know if this is something Arch-specific.