ROCm / HIP

HIP: C++ Heterogeneous-Compute Interface for Portability
https://rocmdocs.amd.com/projects/HIP/
MIT License
3.75k stars 533 forks source link

Multi-GPU, P2P access, Memcpy2D, SIGSEGV #3352

Closed kotee4ko closed 1 month ago

kotee4ko commented 1 year ago

Hello.

Long story short: https://github.com/RadeonOpenCompute/ROCK-Kernel-Driver/issues/150

PoC:


#define __HIP_PLATFORM_AMD__
#include<stdio.h>
#include<hip/hip_runtime.h>

#define BLOCKSIZE_x 16
#define BLOCKSIZE_y 16

#define Nrows 3
#define Ncols 5

/*****************/
/* HIP MEMCHECK */
/*****************/
#define gpuErrchk(ans) { gpuAssert((ans), (char *)__FILE__, __LINE__); }

inline void gpuAssert(hipError_t code, char *file, int line, bool abort = true)
{
    if (code != hipSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %dn", hipGetErrorString(code), file, line);
        if (abort) { getchar(); exit(code); }
    }
}

/*******************/
/* iDivUp FUNCTION */
/*******************/
int iDivUp(int hostPtr, int b){ return ((hostPtr % b) != 0) ? (hostPtr / b + 1) : (hostPtr / b); }

/******************/
/* TEST KERNEL 2D */
/******************/
__global__ void test_kernel_2D(float *devPtr, size_t pitch)
{
    int    tidx = blockIdx.x*blockDim.x + threadIdx.x;
    int    tidy = blockIdx.y*blockDim.y + threadIdx.y;

    if ((tidx < Ncols) && (tidy < Nrows))
    {
        float *row_a = (float *)((char*)devPtr + tidy * pitch);
        row_a[tidx] = row_a[tidx] * tidx * tidy;
    }
}

/********/
/* MAIN */
/********/
int main()
{
    float hostPtr[Nrows][Ncols];
    float *devPtr = NULL, *dev2Ptr = NULL;
    size_t pitch = 0;
    size_t width = 0;
    size_t hiegh = 0;

    for (int i = 0; i < Nrows; i++)
        for (int j = 0; j < Ncols; j++) {
            hostPtr[i][j] = 1.f;
            printf("row %i column %i value %f \n", i, j, hostPtr[i][j]);
        }

   #define NUM_H 256
   #define NUM_W 256
   #define COLUMNS 8
   #define ROWS 8

    // --- 2D pitched allocation and host->device memcopy
    width = Ncols * sizeof(float);
    hiegh = Nrows;
    printf("allocating memPith %zu X %zu\n", width, hiegh);
    gpuErrchk(hipMallocPitch(reinterpret_cast<void**>(&devPtr), &pitch, width, hiegh));
    gpuErrchk(hipSetDevice(1));
    gpuErrchk(hipMallocPitch(reinterpret_cast<void**>(&dev2Ptr), &pitch, width, hiegh));
    gpuErrchk(hipSetDevice(0));
    printf("pitch = %zu\n", pitch);

    //gpuErrchk(hipMallocPitch(&devPtr, &pitch, Ncols * sizeof(float), Nrows));
    gpuErrchk(hipMemcpy2D(devPtr, pitch, hostPtr, Ncols*sizeof(float), Ncols*sizeof(float), Nrows, hipMemcpyDefault));
    //next call will crash. Calls to isPeerCanAccess return 0. But it seems that nvidia has a workaround, because this code shouldn't crash on nvidia, if I understand right.
    // this crash is same with or without forcing amdgpu module args in kernel cmdline.
    gpuErrchk(hipMemcpy2D(dev2Ptr, pitch, devPtr, Ncols*sizeof(float), Ncols*sizeof(float), Nrows, hipMemcpyDefault));

    dim3 gridSize(iDivUp(Ncols, BLOCKSIZE_x), iDivUp(Nrows, BLOCKSIZE_y));
    dim3 blockSize(BLOCKSIZE_y, BLOCKSIZE_x);

    test_kernel_2D<<<gridSize, blockSize>>>(devPtr, pitch);
    gpuErrchk(hipPeekAtLastError());
    gpuErrchk(hipDeviceSynchronize());

    gpuErrchk(hipMemcpy2D(hostPtr, Ncols * sizeof(float), devPtr, pitch, Ncols * sizeof(float), Nrows, hipMemcpyDefault));

    for (int i = 0; i < Nrows; i++) 
        for (int j = 0; j < Ncols; j++) 
            printf("row %i column %i value %f \n", i, j, hostPtr[i][j]);

    return 0;    
}

Crash details:


pwndbg> ctx
LEGEND: STACK | HEAP | CODE | DATA | RWX | RODATA
──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────[ REGISTERS ]───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
 RAX  0x0
 RBX  0x3bab10 —▸ 0x7ffff6884200 —▸ 0x7ffff6732b50 ◂— endbr64 
 RCX  0x3a61f0 —▸ 0x7ffff6883908 —▸ 0x7ffff6717660 ◂— endbr64 
 RDX  0x0
 RDI  0x0
 RSI  0x3bab10 —▸ 0x7ffff6884200 —▸ 0x7ffff6732b50 ◂— endbr64 
 R8   0xd
 R9   0x3ba308 —▸ 0xeca9a0 —▸ 0x3ba090 ◂— 0x0
 R10  0x0
 R11  0xe014a31bb3bd8bec
 R12  0x0
 R13  0x3bab10 —▸ 0x7ffff6884200 —▸ 0x7ffff6732b50 ◂— endbr64 
 R14  0x0
 R15  0x0
 RBP  0x7fffffffdad0 —▸ 0x3ba2d0 —▸ 0x7ffff6883318 —▸ 0x7ffff67074b0 ◂— endbr64 
 RSP  0x7fffffffda40 —▸ 0x3ba2f8 —▸ 0x3ba4b0 —▸ 0x3ba308 —▸ 0xeca9a0 ◂— ...
 RIP  0x7ffff6725bfe ◂— mov    rbx, qword ptr [rdi + 0x10]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────[ DISASM ]────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
 ► 0x7ffff6725bfe    mov    rbx, qword ptr [rdi + 0x10]
   0x7ffff6725c02    mov    rax, qword ptr fs:[0x28]
   0x7ffff6725c0b    mov    qword ptr [rbp - 0x38], rax
   0x7ffff6725c0f    xor    eax, eax
   0x7ffff6725c11    lea    rax, [rbx + 0x128]
   0x7ffff6725c18    mov    qword ptr [rbp - 0x78], rax
   0x7ffff6725c1c    mov    rax, qword ptr [rip + 0x15f31d]
   0x7ffff6725c23    mov    rdx, qword ptr fs:[rax]
   0x7ffff6725c27    mov    rax, qword ptr [rbx + 0x128]
   0x7ffff6725c2e    test   al, 1
   0x7ffff6725c30    jne    0x7ffff6726018
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────[ STACK ]─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
00:0000│ rsp  0x7fffffffda40 —▸ 0x3ba2f8 —▸ 0x3ba4b0 —▸ 0x3ba308 —▸ 0xeca9a0 ◂— ...
01:0008│      0x7fffffffda48 —▸ 0x7ffff6707401 ◂— mov    rsi, rdx
02:0010│      0x7fffffffda50 ◂— 0xd /* '\r' */
03:0018│      0x7fffffffda58 ◂— 0x2ad6ccc499f7e800
04:0020│      0x7fffffffda60 —▸ 0x3ba2d0 —▸ 0x7ffff6883318 —▸ 0x7ffff67074b0 ◂— endbr64 
05:0028│      0x7fffffffda68 —▸ 0x3ba2f8 —▸ 0x3ba4b0 —▸ 0x3ba308 —▸ 0xeca9a0 ◂— ...
06:0030│      0x7fffffffda70 —▸ 0x7fffffffda98 —▸ 0x3a61f0 —▸ 0x7ffff6883908 —▸ 0x7ffff6717660 ◂— ...
07:0038│      0x7fffffffda78 —▸ 0x3ba3f8 ◂— 0x0
──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────[ BACKTRACE ]───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
 ► f 0     7ffff6725bfe
   f 1     7ffff672df70
   f 2     7ffff673003d
   f 3     7ffff66f8df4
   f 4     7ffff656876c
   f 5     7ffff65689b5
   f 6     7ffff6568ab0
   f 7     7ffff659c6fb hipMemcpy2D+1259
   f 8           20b843 main+755
   f 9     7ffff5c29d90 __libc_start_call_main+128
   f 10     7ffff5c29e40 __libc_start_main+128
──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
pwndbg> l *0x20b843-5
0x20b83e is in main() (/opt/AI/koboldcpp-rocm/hip_test.cu:79).
74          gpuErrchk(hipSetDevice(0));
75          printf("pitch = %zu\n", pitch);
76
77          //gpuErrchk(hipMallocPitch(&devPtr, &pitch, Ncols * sizeof(float), Nrows));
78          gpuErrchk(hipMemcpy2D(devPtr, pitch, hostPtr, Ncols*sizeof(float), Ncols*sizeof(float), Nrows, hipMemcpyDefault));
79          gpuErrchk(hipMemcpy2D(dev2Ptr, pitch, devPtr, Ncols*sizeof(float), Ncols*sizeof(float), Nrows, hipMemcpyDefault));
80
81          dim3 gridSize(iDivUp(Ncols, BLOCKSIZE_x), iDivUp(Nrows, BLOCKSIZE_y));
82          dim3 blockSize(BLOCKSIZE_y, BLOCKSIZE_x);
83
pwndbg> bt
#0  0x00007ffff6725bfe in ?? () from /opt/rocm/lib/libamdhip64.so.5
#1  0x00007ffff672df70 in ?? () from /opt/rocm/lib/libamdhip64.so.5
#2  0x00007ffff673003d in ?? () from /opt/rocm/lib/libamdhip64.so.5
#3  0x00007ffff66f8df4 in ?? () from /opt/rocm/lib/libamdhip64.so.5
#4  0x00007ffff656876c in ?? () from /opt/rocm/lib/libamdhip64.so.5
#5  0x00007ffff65689b5 in ?? () from /opt/rocm/lib/libamdhip64.so.5
#6  0x00007ffff6568ab0 in ?? () from /opt/rocm/lib/libamdhip64.so.5
#7  0x00007ffff659c6fb in hipMemcpy2D () from /opt/rocm/lib/libamdhip64.so.5
#8  0x000000000020b843 in main () at /opt/AI/koboldcpp-rocm/hip_test.cu:79
#9  0x00007ffff5c29d90 in __libc_start_call_main (main=main@entry=0x20b550 <main()>, argc=argc@entry=1, argv=argv@entry=0x7fffffffe468) at ../sysdeps/nptl/libc_start_call_main.h:58
#10 0x00007ffff5c29e40 in __libc_start_main_impl (main=0x20b550 <main()>, argc=1, argv=0x7fffffffe468, init=<optimized out>, fini=<optimized out>, rtld_fini=<optimized out>, stack_end=0x7fffffffe458) at ../csu/libc-start.c:392
#11 0x000000000020b415 in _start ()
pwndbg> xi $rip
Extended information for virtual address 0x7ffff6725bfe:

  Containing mapping:
    0x7ffff6420000     0x7ffff677c000 r-xp   35c000 20000  /opt/rocm-5.7.1/lib/libamdhip64.so.5.7.50701

  Offset information:
         Mapped Area 0x7ffff6725bfe = 0x7ffff6420000 + 0x305bfe
         File (Base) 0x7ffff6725bfe = 0x7ffff6400000 + 0x325bfe
      File (Segment) 0x7ffff6725bfe = 0x7ffff6420000 + 0x305bfe
         File (Disk) 0x7ffff6725bfe = /opt/rocm-5.7.1/lib/libamdhip64.so.5.7.50701 + 0x325bfe

 Containing ELF sections:
               .text 0x7ffff6725bfe = 0x7ffff6421570 + 0x30468e
pwndbg> 

HW details:


|root@ai-dev|:{/opt} #_ modinfo amdgpu|grep -i "version\|p2p"
version:        6.2.4
srcversion:     0CFF5A1845DE02D3AB8F6F5
vermagic:       6.2.0-35-generic SMP preempt mod_unload modversions 
parm:           use_xgmi_p2p:Enable XGMI P2P interface (0 = disable; 1 = enable (default)) (int)
parm:           hws_gws_support:Assume MEC2 FW supports GWS barriers (false = rely on FW version check (Default), true = force supported) (bool)
parm:           pcie_p2p:Enable PCIe P2P (requires large-BAR). (N = off, Y = on(default)) (bool)
|root@ai-dev|:{/opt} #_ cat /boot/grub/grub.cfg |grep -i amdgpu
        linux   /boot/vmlinuz-6.2.0-35-generic root=UUID=4b4ec28a-909b-439c-9f71-7521744421f8 ro  quiet splash amdgpu.vm_update_mode=3 amdgpu.pcie_p2p=on amdgpu.no_system_mem_limit=on $vt_handoff
|root@ai-dev|:{/opt} #_ rocm-smi -a

========================= ROCm System Management Interface =========================
=========================== Version of System Component ============================
Driver version: 6.2.4
====================================================================================
======================================== ID ========================================
GPU[0]          : GPU ID: 0x6864
GPU[1]          : GPU ID: 0x6864
====================================================================================
==================================== Unique ID =====================================
GPU[0]          : Unique ID: 0x214feae2d4808c4
GPU[1]          : Unique ID: 0x21501841f201964
====================================================================================
====================================== VBIOS =======================================
GPU[0]          : VBIOS version: 113-D0531300-101
GPU[1]          : VBIOS version: 113-D0531300-101
====================================================================================
=================================== Temperature ====================================
GPU[0]          : Temperature (Sensor edge) (C): 41.0
GPU[0]          : Temperature (Sensor junction) (C): 42.0
GPU[0]          : Temperature (Sensor memory) (C): 40.0
GPU[1]          : Temperature (Sensor edge) (C): 31.0
GPU[1]          : Temperature (Sensor junction) (C): 35.0
GPU[1]          : Temperature (Sensor memory) (C): 36.0
====================================================================================
============================ Current clock frequencies =============================
GPU[0]          : dcefclk clock level: 0: (600Mhz)
GPU[0]          : mclk clock level: 0: (167Mhz)
GPU[0]          : sclk clock level: 0: (300Mhz)
GPU[0]          : socclk clock level: 0: (600Mhz)
GPU[0]          : pcie clock level: 0 (8.0GT/s x8)
GPU[1]          : dcefclk clock level: 0: (600Mhz)
GPU[1]          : mclk clock level: 0: (167Mhz)
GPU[1]          : sclk clock level: 0: (300Mhz)
GPU[1]          : socclk clock level: 0: (600Mhz)
GPU[1]          : pcie clock level: 0 (8.0GT/s x8)
====================================================================================
================================ Current Fan Metric ================================
GPU[0]          : Fan Level: 24 (9%)
GPU[0]          : Fan RPM: 0
GPU[1]          : Fan Level: 24 (9%)
GPU[1]          : Fan RPM: 0
====================================================================================
============================== Show Performance Level ==============================
GPU[0]          : Performance Level: auto
GPU[1]          : Performance Level: auto
====================================================================================
================================= OverDrive Level ==================================
GPU[0]          : GPU OverDrive value (%): 0
GPU[1]          : GPU OverDrive value (%): 0
====================================================================================
================================= OverDrive Level ==================================
GPU[0]          : GPU Memory OverDrive value (%): 0
GPU[1]          : GPU Memory OverDrive value (%): 0
====================================================================================
==================================== Power Cap =====================================
GPU[0]          : Max Graphics Package Power (W): 110.0
GPU[1]          : Max Graphics Package Power (W): 110.0
====================================================================================
=============================== Show Power Profiles ================================
GPU[0]          : 1. Available power profile (#1 of 7): CUSTOM
GPU[0]          : 2. Available power profile (#2 of 7): VIDEO
GPU[0]          : 3. Available power profile (#3 of 7): POWER SAVING
GPU[0]          : 4. Available power profile (#4 of 7): COMPUTE
GPU[0]          : 5. Available power profile (#5 of 7): VR
GPU[0]          : 6. Available power profile (#6 of 7): 3D FULL SCREEN
GPU[0]          : 7. Available power profile (#7 of 7): BOOTUP DEFAULT*
GPU[1]          : 1. Available power profile (#1 of 7): CUSTOM
GPU[1]          : 2. Available power profile (#2 of 7): VIDEO
GPU[1]          : 3. Available power profile (#3 of 7): POWER SAVING
GPU[1]          : 4. Available power profile (#4 of 7): COMPUTE
GPU[1]          : 5. Available power profile (#5 of 7): VR
GPU[1]          : 6. Available power profile (#6 of 7): 3D FULL SCREEN
GPU[1]          : 7. Available power profile (#7 of 7): BOOTUP DEFAULT*
====================================================================================
================================ Power Consumption =================================
GPU[0]          : Average Graphics Package Power (W): 5.0
GPU[1]          : Average Graphics Package Power (W): 5.0
====================================================================================
=========================== Supported clock frequencies ============================
GPU[0]          : Supported dcefclk frequencies on GPU0
GPU[0]          : 0: 600Mhz *
GPU[0]          : 1: 720Mhz
GPU[0]          : 2: 847Mhz
GPU[0]          : 3: 900Mhz
GPU[0]          : 
GPU[0]          : 
GPU[0]          : Supported mclk frequencies on GPU0
GPU[0]          : 0: 167Mhz *
GPU[0]          : 1: 500Mhz
GPU[0]          : 2: 800Mhz
GPU[0]          : 3: 945Mhz
GPU[0]          : 
GPU[0]          : Supported sclk frequencies on GPU0
GPU[0]          : 0: 300Mhz *
GPU[0]          : 1: 991Mhz
GPU[0]          : 2: 1138Mhz
GPU[0]          : 3: 1269Mhz
GPU[0]          : 4: 1348Mhz
GPU[0]          : 5: 1399Mhz
GPU[0]          : 6: 1440Mhz
GPU[0]          : 7: 1500Mhz
GPU[0]          : 
GPU[0]          : Supported socclk frequencies on GPU0
GPU[0]          : 0: 600Mhz *
GPU[0]          : 1: 720Mhz
GPU[0]          : 2: 847Mhz
GPU[0]          : 3: 960Mhz
GPU[0]          : 4: 1028Mhz
GPU[0]          : 5: 1107Mhz
GPU[0]          : 
GPU[0]          : Supported PCIe frequencies on GPU0
GPU[0]          : 0: 8.0GT/s x8 *
GPU[0]          : 1: 8.0GT/s x8
GPU[0]          : 
------------------------------------------------------------------------------------
GPU[1]          : Supported dcefclk frequencies on GPU1
GPU[1]          : 0: 600Mhz *
GPU[1]          : 1: 720Mhz
GPU[1]          : 2: 847Mhz
GPU[1]          : 3: 900Mhz
GPU[1]          : 
GPU[1]          : 
GPU[1]          : Supported mclk frequencies on GPU1
GPU[1]          : 0: 167Mhz *
GPU[1]          : 1: 500Mhz
GPU[1]          : 2: 800Mhz
GPU[1]          : 3: 945Mhz
GPU[1]          : 
GPU[1]          : Supported sclk frequencies on GPU1
GPU[1]          : 0: 300Mhz *
GPU[1]          : 1: 991Mhz
GPU[1]          : 2: 1138Mhz
GPU[1]          : 3: 1269Mhz
GPU[1]          : 4: 1348Mhz
GPU[1]          : 5: 1399Mhz
GPU[1]          : 6: 1440Mhz
GPU[1]          : 7: 1500Mhz
GPU[1]          : 
GPU[1]          : Supported socclk frequencies on GPU1
GPU[1]          : 0: 600Mhz *
GPU[1]          : 1: 720Mhz
GPU[1]          : 2: 847Mhz
GPU[1]          : 3: 960Mhz
GPU[1]          : 4: 1028Mhz
GPU[1]          : 5: 1107Mhz
GPU[1]          : 
GPU[1]          : Supported PCIe frequencies on GPU1
GPU[1]          : 0: 8.0GT/s x8 *
GPU[1]          : 1: 8.0GT/s x8
GPU[1]          : 
------------------------------------------------------------------------------------
====================================================================================
================================ % time GPU is busy ================================
GPU[0]          : GPU use (%): 0
GPU[1]          : GPU use (%): 0
====================================================================================
================================ Current Memory Use ================================
GPU[0]          : % memory use, Not supported on the given system
GPU[0]          : Memory Activity: N/A
GPU[1]          : % memory use, Not supported on the given system
GPU[1]          : Memory Activity: N/A
====================================================================================
================================== Memory Vendor ===================================
GPU[0]          : GPU memory vendor: samsung
GPU[1]          : GPU memory vendor: samsung
====================================================================================
=============================== PCIe Replay Counter ================================
GPU[0]          : PCIe Replay Count: 0
GPU[1]          : PCIe Replay Count: 0
====================================================================================
================================== Serial Number ===================================
GPU[0]          : Serial Number: N/A
GPU[1]          : Serial Number: N/A
====================================================================================
================================== KFD Processes ===================================
No KFD PIDs currently running
====================================================================================
=============================== GPUs Indexed by PID ================================
No KFD PIDs currently running
====================================================================================
==================== GPU Memory clock frequencies and voltages =====================
GPU[0]          : get_od_volt, Not supported on the given system
GPU[1]          : get_od_volt, Not supported on the given system
====================================================================================
================================= Current voltage ==================================
GPU[0]          : Voltage (mV): 750
GPU[1]          : Voltage (mV): 750
====================================================================================
==================================== PCI Bus ID ====================================
GPU[0]          : PCI Bus: 0000:88:00.0
GPU[1]          : PCI Bus: 0000:8B:00.0
====================================================================================
=============================== Firmware Information ===============================
GPU[0]          : ASD firmware version:         0x210000af
GPU[0]          : CE firmware version:          80
GPU[0]          : DMCU firmware version:        0
GPU[0]          : MC firmware version:          0
GPU[0]          : ME firmware version:          166
GPU[0]          : MEC firmware version:         33237
GPU[0]          : MEC2 firmware version:        33237
GPU[0]          : PFP firmware version:         194
GPU[0]          : RLC firmware version:         96
GPU[0]          : RLC SRLC firmware version:    0
GPU[0]          : RLC SRLG firmware version:    0
GPU[0]          : RLC SRLS firmware version:    0
GPU[0]          : SDMA firmware version:        434
GPU[0]          : SDMA2 firmware version:       434
GPU[0]          : SMC firmware version:         00.28.57.00
GPU[0]          : SOS firmware version:         0x0008025d
GPU[0]          : TA RAS firmware version:      00.00.00.00
GPU[0]          : TA XGMI firmware version:     00.00.00.00
GPU[0]          : UVD firmware version:         0x422b1100
GPU[0]          : VCE firmware version:         0x39060400
GPU[0]          : VCN firmware version:         0x00000000
GPU[1]          : ASD firmware version:         0x210000af
GPU[1]          : CE firmware version:          80
GPU[1]          : DMCU firmware version:        0
GPU[1]          : MC firmware version:          0
GPU[1]          : ME firmware version:          166
GPU[1]          : MEC firmware version:         33237
GPU[1]          : MEC2 firmware version:        33237
GPU[1]          : PFP firmware version:         194
GPU[1]          : RLC firmware version:         96
GPU[1]          : RLC SRLC firmware version:    0
GPU[1]          : RLC SRLG firmware version:    0
GPU[1]          : RLC SRLS firmware version:    0
GPU[1]          : SDMA firmware version:        434
GPU[1]          : SDMA2 firmware version:       434
GPU[1]          : SMC firmware version:         00.28.57.00
GPU[1]          : SOS firmware version:         0x0008025d
GPU[1]          : TA RAS firmware version:      00.00.00.00
GPU[1]          : TA XGMI firmware version:     00.00.00.00
GPU[1]          : UVD firmware version:         0x422b1100
GPU[1]          : VCE firmware version:         0x39060400
GPU[1]          : VCN firmware version:         0x00000000
====================================================================================
=================================== Product Info ===================================
GPU[0]          : Card series:          Vega 10 [Radeon Pro V340]
GPU[0]          : Card model:           0x0c00
GPU[0]          : Card vendor:          Advanced Micro Devices, Inc. [AMD/ATI]
GPU[0]          : Card SKU:             D0531300
GPU[1]          : Card series:          Vega 10 [Radeon Pro V340]
GPU[1]          : Card model:           0x0c00
GPU[1]          : Card vendor:          Advanced Micro Devices, Inc. [AMD/ATI]
GPU[1]          : Card SKU:             D0531300
====================================================================================
==================================== Pages Info ====================================
GPU[0]          : ras, Not supported on the given system
============================== Show Valid sclk Range ===============================
GPU[0]          : get_od_volt, Not supported on the given system
GPU[1]          : get_od_volt, Not supported on the given system
====================================================================================
============================== Show Valid mclk Range ===============================
GPU[0]          : get_od_volt, Not supported on the given system
GPU[1]          : get_od_volt, Not supported on the given system
====================================================================================
============================= Show Valid voltage Range =============================
GPU[0]          : get_od_volt, Not supported on the given system
GPU[1]          : get_od_volt, Not supported on the given system
====================================================================================
=============================== Voltage Curve Points ===============================
GPU[0]          : get_od_volt_info, Not supported on the given system
GPU[1]          : get_od_volt_info, Not supported on the given system
====================================================================================
================================= Consumed Energy ==================================
GPU[0]          : % Energy Counter, Not supported on the given system
GPU[1]          : % Energy Counter, Not supported on the given system
====================================================================================
============================ Current Compute Partition =============================
GPU[0]          : Not supported on the given system
GPU[1]          : Not supported on the given system
====================================================================================
================================= Current NPS Mode =================================
GPU[0]          : Not supported on the given system
GPU[1]          : Not supported on the given system
====================================================================================
=============================== End of ROCm SMI Log ================================
|root@ai-dev|:{/opt} #_ 
kotee4ko commented 1 year ago

upd:

But, hipMemcpyDtoD() would be done without SIGSEGV just well:


    gpuErrchk(hipMemcpy2D(devPtr, pitch, hostPtr, Ncols*sizeof(float), Ncols*sizeof(float), Nrows, hipMemcpyDefault));
    gpuErrchk(hipMemcpyDtoD(dev2Ptr, devPtr, 0x10)); // works very wll
    gpuErrchk(hipMemcpy2D(dev2Ptr, pitch, devPtr, Ncols*sizeof(float), Ncols*sizeof(float), Nrows, hipMemcpyDefault)); // crashes

Could we have DtoD version of Memcpy2D and Memcpy2DAsync, please? =)

kotee4ko commented 1 year ago

Hey, AMD, anybody home?

kotee4ko commented 1 year ago

@jeffdaily @mangupta @averinevg I'm sorry for disturbing, but I can't continue using devoloping with that issue. Can anybody tell me, please, what should I do to solve this?

iassiour commented 12 months ago

@kotee4ko I am sorry for the delay. I think it would help if we can get a debug build and from the backtrace of that see exactly where it fails in hip runtime. To get debug build of the hip runtime (assuming that you are using rocm version 5.7.1):

clone the 5.7.x branch of HIP https://github.com/ROCm-Developer-Tools/HIP and 5.7.x branch of clr repo https://github.com/ROCm-Developer-Tools/clr

cd clr && mkdir build && cd build

cmake .. -DCLR_BUILD_HIP=ON -DHIP_COMMON_DIR=$HIP_COMMON_DIR -DCMAKE_PREFIX_PATH=/opt/rocm/ - DCMAKE_INSTALL_PREFIX=$PWD/install -DCMAKE_BUILD_TYPE=Debug

HIP_COMMON_DIR points to the cloned HIP repo.

make install

This will create the libamdhip64.so under clr/build/install/lib You can then set the LD_LIBRARY_PATH to that directory and re-run the test. The failing backtrace should now give more information about the point of failure.

kotee4ko commented 12 months ago

git clone -b 'rocm-5.7.x' --depth=1  https://github.com/ROCm-Developer-Tools/HIP 

git clone -b 'rocm-5.7.x' --depth=1  https://github.com/ROCm-Developer-Tools/clr

cd clr && mkdir build && cd build

export HIP_COMMON_DIR=$(realpath $(pwd)/../../HIP); echo $HIP_COMMON_DIR;  cmake -DCLR_BUILD_HIP=ON -DHIP_COMMON_DIR=$HIP_COMMON_DIR -DCMAKE_PREFIX_PATH=/opt/rocm/ -DCMAKE_INSTALL_PREFIX=$PWD/install -DCMAKE_BUILD_TYPE=Debug ..

|root@ai-dev|:{/usr/src/clr/build} #_ export HIP_COMMON_DIR=$(realpath $(pwd)/../../HIP); echo $HIP_COMMON_DIR;  cmake -DCLR_BUILD_HIP=ON -DHIP_COMMON_DIR=$HIP_COMMON_DIR -DCMAKE_PREFIX_PATH=/opt/rocm/ -DCMAKE_INSTALL_PREFIX=$PWD/install -DCMAKE_BUILD_TYPE=Debug ..
/usr/src/HIP
-- The C compiler identification is GNU 11.4.0
-- The CXX compiler identification is GNU 11.4.0
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Check for working C compiler: /usr/bin/cc - skipped
-- Detecting C compile features
-- Detecting C compile features - done
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Check for working CXX compiler: /usr/bin/c++ - skipped
-- Detecting CXX compile features
-- Detecting CXX compile features - done
-- HIPCC Binary Directory: /opt/rocm/bin
-- HIP Common Directory: /usr/src/HIP
-- Looking for pthread.h
-- Looking for pthread.h - found
-- Performing Test CMAKE_HAVE_LIBC_PTHREAD
-- Performing Test CMAKE_HAVE_LIBC_PTHREAD - Success
-- Found Threads: TRUE  
-- Found AMD_OPENCL: /usr/src/clr/opencl/khronos/headers/opencl2.2/CL  
-- Found NUMA: /usr/lib/x86_64-linux-gnu/libnuma.so  
-- Found OpenGL: /usr/lib/x86_64-linux-gnu/libGL.so   
-- HIPCC_BIN_DIR found at /opt/rocm/bin
-- HIP_COMMON_DIR found at /usr/src/HIP
-- Found Perl: /usr/bin/perl (found version "5.34.0") 
-- Found Git: /usr/bin/git (found version "2.34.1") 
-- Using CPACK_DEBIAN_PACKAGE_RELEASE local
-- CPACK_RPM_PACKAGE_RELEASE: local
-- HIP Platform: amd
-- HIP Runtime: rocclr
-- HIP Compiler: clang
-- ROCM Installation path(ROCM_PATH): /opt/rocm
-- HIP will be installed in: /usr/src/clr/build/install
-- Performing Test Terminfo_LINKABLE
-- Performing Test Terminfo_LINKABLE - Success
-- Found Terminfo: /usr/lib/x86_64-linux-gnu/libtinfo.so  
-- Found ZLIB: /usr/lib/x86_64-linux-gnu/libz.so (found version "1.2.11") 
-- Found LibXml2: /usr/lib/x86_64-linux-gnu/libxml2.so (found version "2.9.13") 
'sh' '-c' '/usr/src/clr/hipamd/src/hip_embed_pch.sh /usr/src/HIP/include /usr/src/clr/build/hipamd/include /usr/src/clr/hipamd/include /opt/rocm/llvm/lib/cmake/llvm/../../..'
+ /opt/rocm/llvm/lib/cmake/llvm/../../../bin/clang -O3 --rocm-path=/usr/src/clr/build/hipamd/include/.. -std=c++17 -nogpulib -isystem /usr/src/clr/build/hipamd/include -isystem /usr/src/HIP/include -isystem /usr/src/clr/hipamd/include --cuda-device-only --cuda-gpu-arch=gfx1030 -x hip /tmp/hip_pch.271754/hip_pch.h -E
+ cat /tmp/hip_pch.271754/hip_macros.h
+ /opt/rocm/llvm/lib/cmake/llvm/../../../bin/clang -cc1 -O3 -emit-pch -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu -fcuda-is-device -std=c++17 -fgnuc-version=4.2.1 -o /tmp/hip_pch.271754/hip_wave32.pch -x hip-cpp-output -
+ /opt/rocm/llvm/lib/cmake/llvm/../../../bin/clang -O3 --rocm-path=/usr/src/clr/build/hipamd/include/.. -std=c++17 -nogpulib -isystem /usr/src/clr/build/hipamd/include -isystem /usr/src/HIP/include -isystem /usr/src/clr/hipamd/include --cuda-device-only -x hip /tmp/hip_pch.271754/hip_pch.h -E
+ cat /tmp/hip_pch.271754/hip_macros.h
+ /opt/rocm/llvm/lib/cmake/llvm/../../../bin/clang -cc1 -O3 -emit-pch -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu -fcuda-is-device -std=c++17 -fgnuc-version=4.2.1 -o /tmp/hip_pch.271754/hip_wave64.pch -x hip-cpp-output -
+ /opt/rocm/llvm/lib/cmake/llvm/../../../bin/llvm-mc -o hip_pch.o /tmp/hip_pch.271754/hip_pch.mcin --filetype=obj
+ rm -rf /tmp/hip_pch.271754
CMake Error at /usr/lib/llvm-11/lib/cmake/clang/ClangTargets.cmake:671 (message):
  The imported target "clangBasic" references the file

     "/usr/lib/llvm-11/lib/libclangBasic.a"

  but this file does not exist.  Possible reasons include:

  * The file was deleted, renamed, or moved to another location.

  * An install or uninstall procedure did not complete successfully.

  * The installation package was faulty and contained

     "/usr/lib/llvm-11/lib/cmake/clang/ClangTargets.cmake"

  but not all the files it references.

Call Stack (most recent call first):
  /usr/lib/cmake/clang-11/ClangConfig.cmake:20 (include)
  hipamd/src/hiprtc/CMakeLists.txt:129 (find_package)

-- Configuring incomplete, errors occurred!
See also "/usr/src/clr/build/CMakeFiles/CMakeOutput.log".

/usr/lib/cmake/clang-11/ClangConfig.cmake:20: include("${CLANG_CMAKE_DIR}/ClangTargets.cmake") CLANG_CMAKE_DIR = /usr/lib/llvm-11/lib/cmake/clang

... ~1 hour later ....

for i in $(cd /opt/rocm/llvm/lib/ && ls -1 *.a); do if [ ! -f /usr/lib/llvm-15/lib/$i ]; then echo $i; ln -s /opt/rocm/llvm/lib/$i /usr/lib/llvm-15/lib/$i; fi; done for i in $(cd /usr/lib/llvm-14/lib && ls -1 *.a); do if [ ! -f /usr/lib/llvm-15/lib/$i ]; then echo $i; ln -s /usr/lib/llvm-14/lib/$i /usr/lib/llvm-15/lib/$i; fi; done


|root@ai-dev|:{/usr/src/clr/build} #_ export LLVM_DIR=/usr/lib/llvm-15; export GFX_ARCH_CUSTOM=gfx900; export HIP_COMMON_DIR=$(realpath $(pwd)/../../HIP); echo $HIP_COMMON_DIR;  cmake -DCLR_BUILD_HIP=ON -DHIP_COMMON_DIR=$HIP_COMMON_DIR -DCMAKE_PREFIX_PATH=/usr/local -DCMAKE_INSTALL_PREFIX=$PWD/install -DCMAKE_BUILD_TYPE=Debug ..
/usr/src/HIP
-- HIPCC Binary Directory: /opt/rocm/bin
-- HIP Common Directory: /usr/src/HIP
-- HIPCC_BIN_DIR found at /opt/rocm/bin
-- HIP_COMMON_DIR found at /usr/src/HIP
-- Using CPACK_DEBIAN_PACKAGE_RELEASE local
-- CPACK_RPM_PACKAGE_RELEASE: local
-- HIP Platform: amd
-- HIP Runtime: rocclr
-- HIP Compiler: clang
-- ROCM Installation path(ROCM_PATH): /opt/rocm
-- HIP will be installed in: /usr/src/clr/build/install
'sh' '-c' '/usr/src/clr/hipamd/src/hip_embed_pch.sh /usr/src/HIP/include /usr/src/clr/build/hipamd/include /usr/src/clr/hipamd/include /usr/lib/llvm-15/lib/cmake/llvm/../../..'
/usr/src/HIP/include /usr/src/clr/build/hipamd/include /usr/src/clr/hipamd/include /usr/lib/llvm-15/lib/cmake/llvm/../../..
/usr/lib/llvm-15
+ /usr/lib/llvm-15/bin/clang -O3 --rocm-path=/usr/src/clr/build/hipamd/include/.. -std=c++17 -nogpulib -isystem /usr/src/clr/build/hipamd/include -isystem /usr/src/HIP/include -isystem /usr/src/clr/hipamd/include --cuda-device-only --cuda-gpu-arch=gfx900 -x hip /tmp/hip_pch.281112/hip_pch.h -E
+ cat /tmp/hip_pch.281112/hip_macros.h
+ /usr/lib/llvm-15/bin/clang -cc1 -O3 -emit-pch -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu -fcuda-is-device -std=c++17 -fgnuc-version=4.2.1 -o /tmp/hip_pch.281112/hip_wave32.pch -x hip-cpp-output -
+ /usr/lib/llvm-15/bin/clang -O3 --rocm-path=/usr/src/clr/build/hipamd/include/.. -std=c++17 -nogpulib -isystem /usr/src/clr/build/hipamd/include -isystem /usr/src/HIP/include -isystem /usr/src/clr/hipamd/include --cuda-device-only -x hip /tmp/hip_pch.281112/hip_pch.h -E
+ cat /tmp/hip_pch.281112/hip_macros.h
+ /usr/lib/llvm-15/bin/clang -cc1 -O3 -emit-pch -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu -fcuda-is-device -std=c++17 -fgnuc-version=4.2.1 -o /tmp/hip_pch.281112/hip_wave64.pch -x hip-cpp-output -
+ /usr/lib/llvm-15/bin/llvm-mc -o hip_pch.o /tmp/hip_pch.281112/hip_pch.mcin --filetype=obj
+ rm -rf /tmp/hip_pch.281112
CMake Warning at hipamd/src/CMakeLists.txt:253 (message):
  Profiling API header not found.  Disabling roctracer integration.  Use
  -DPROF_API_HEADER_PATH=<path to prof_protocol.h header>

-- host interface - found
-- HIP runtime lib type - shared
-- CMAKE_TESTING_TOOL: 
-- CMAKE HIP ARCHITECTURES: OFF
-- ROCm Agent Enumurator Result: 0
-- Using offload arch string:    --offload-arch=gfx900  --offload-arch=gfx900 
-- Configuring done
-- Generating done
-- Build files have been written to: /usr/src/clr/build

make -j16


doxygen: error while loading shared libraries: libclang-cpp.so.14: cannot open shared object file: No such file or directory
make[2]: *** [hipamd/packaging/CMakeFiles/build_doxygen.dir/build.make:70: hipamd/packaging/CMakeFiles/build_doxygen] Error 127
make[1]: *** [CMakeFiles/Makefile2:10645: hipamd/packaging/CMakeFiles/build_doxygen.dir/all] Error 2
make[1]: *** Waiting for unfinished jobs....

ln -s /usr/lib/llvm-14/lib/libclang-cpp.so.14 /usr/lib

make -j16


[100%] Linking CXX shared library ../lib/libamdhip64.so
 [100%] Built target amdhip64
[100%] Linking CXX shared library ../../lib/libhiprtc.so
 [100%] Built target hiprtc

I think AMD need to maintain their own BUILD system. ROCm-Build, hell-yeah =)


Thread 1 "hiptest" received signal SIGSEGV, Segmentation fault.
0x00007ffff68fd680 in device::Memory::owner (this=0x0) at /usr/src/clr/rocclr/device/device.hpp:768
768       amd::Memory* owner() const { return owner_; }
bt
#0  0x00007ffff68fd680 in device::Memory::owner (this=0x0) at /usr/src/clr/rocclr/device/device.hpp:768
#1  0x00007ffff699dcca in roc::Memory::syncCacheFromHost (this=0x0, gpu=..., syncFlags=...) at /usr/src/clr/rocclr/device/rocm/rocmemory.cpp:355
#2  0x00007ffff69b662d in roc::VirtualGPU::copyMemory (this=0x3baad0, type=4611, srcMem=..., dstMem=..., entire=false, srcOrigin=..., dstOrigin=..., size=..., srcRect=..., dstRect=..., copyMetadata=...)
    at /usr/src/clr/rocclr/device/rocm/rocvirtual.cpp:1795
#3  0x00007ffff69b6cb3 in roc::VirtualGPU::submitCopyMemory (this=0x3baad0, cmd=...) at /usr/src/clr/rocclr/device/rocm/rocvirtual.cpp:1881
#4  0x00007ffff67cd838 in amd::CopyMemoryCommand::submit (this=0x75b320, device=...) at /usr/src/clr/rocclr/platform/command.hpp:894
#5  0x00007ffff693d75a in amd::Command::enqueue (this=0x75b320) at /usr/src/clr/rocclr/platform/command.cpp:393
#6  0x00007ffff67cf94a in ihipMemcpyCmdEnqueue (command=0x75b320, isAsync=false) at /usr/src/clr/hipamd/src/hip_memory.cpp:2163
#7  0x00007ffff678442b in ihipMemcpyParam3D (pCopy=0x7fffffffdbf0, stream=0x0, isAsync=false) at /usr/src/clr/hipamd/src/hip_memory.cpp:2268
#8  0x00007ffff67844b8 in ihipMemcpyParam2D (pCopy=0x7fffffffdce0, stream=0x0, isAsync=false) at /usr/src/clr/hipamd/src/hip_memory.cpp:2277
#9  0x00007ffff678467f in ihipMemcpy2D (dst=0x7ffde6600000, dpitch=256, src=0x7ffde6a00000, spitch=20, width=20, height=3, kind=hipMemcpyDefault, stream=0x0, isAsync=false) at /usr/src/clr/hipamd/src/hip_memory.cpp:2309
#10 0x00007ffff67852a9 in hipMemcpy2D_common (dst=0x7ffde6600000, dpitch=256, src=0x7ffde6a00000, spitch=20, width=20, height=3, kind=hipMemcpyDefault, stream=0x0, isAsync=false) at /usr/src/clr/hipamd/src/hip_memory.cpp:2380
#11 0x00007ffff6785ae7 in hipMemcpy2D (dst=0x7ffde6600000, dpitch=256, src=0x7ffde6a00000, spitch=20, width=20, height=3, kind=hipMemcpyDefault) at /usr/src/clr/hipamd/src/hip_memory.cpp:2387
#12 0x000000000020b87f in main () at /opt/AI/koboldcpp-rocm/hip_test.cu:80
#13 0x00007ffff5e29d90 in __libc_start_call_main (main=main@entry=0x20b570 <main()>, argc=argc@entry=1, argv=argv@entry=0x7fffffffe448) at ../sysdeps/nptl/libc_start_call_main.h:58
#14 0x00007ffff5e29e40 in __libc_start_main_impl (main=0x20b570 <main()>, argc=1, argv=0x7fffffffe448, init=<optimized out>, fini=<optimized out>, rtld_fini=<optimized out>, stack_end=0x7fffffffe438) at ../csu/libc-start.c:392
#15 0x000000000020b435 in _start ()

In file: /usr/src/clr/rocclr/device/device.hpp
   763 
   764   //! Read the size
   765   size_t size() const { return size_; }
   766 
   767   //! Gets the owner Memory instance
 ► 768   amd::Memory* owner() const { return owner_; }
   769 
   770   //! Immediate blocking write from device cache to owners's backing store.
   771   //! Marks owner as "current" by resetting the last writer to NULL.
   772   virtual void syncHostFromCache(device::VirtualDevice* vDev,
   773                                  SyncFlags syncFlags = SyncFlags()) {}

pwndbg> p owner_
Cannot access memory at address 0x10

up 
up

In file: /usr/src/clr/rocclr/device/rocm/rocvirtual.cpp
   1790   Memory* dstDevMem = dev().getRocMemory(&dstMem);
   1791 
   1792   // Synchronize source and destination memory
   1793   device::Memory::SyncFlags syncFlags;
   1794   syncFlags.skipEntire_ = entire;
 ► 1795   dstDevMem->syncCacheFromHost(*this, syncFlags);
   1796   srcDevMem->syncCacheFromHost(*this);
   1797 
   1798   bool result = false;
   1799   bool srcImageBuffer = false;
   1800   bool dstImageBuffer = false;

pwndbg> p dstDevMem
$4 = (roc::Memory *) 0x0

pwndbg> p &dstMem 
$10 = (amd::Memory *) 0x3ba290

state of dstMem:


pwndbg> p dstMem 
$9 = (amd::Memory &) @0x3ba290: {
  <amd::RuntimeObject> = {
    <amd::ReferenceCountedObject> = {
      _vptr.ReferenceCountedObject = 0x7ffff6bf7408 <vtable for amd::Buffer+16>,
      referenceCount_ = {
        <std::__atomic_base<unsigned int>> = {
          static _S_alignment = 4,
          _M_i = 2
        }, 
        members of std::atomic<unsigned int>:
        static is_always_lock_free = true
      }
    }, 
    <amd::ICDDispatchedObject> = {
      static icdVendorDispatch_ = {{
          clGetPlatformIDs = 0x0,
          clGetPlatformInfo = 0x0,
          clGetDeviceIDs = 0x0,
          clGetDeviceInfo = 0x0,
          clCreateContext = 0x0,
          clCreateContextFromType = 0x0,
          clRetainContext = 0x0,
          clReleaseContext = 0x0,
          clGetContextInfo = 0x0,
          clCreateCommandQueue = 0x0,
          clRetainCommandQueue = 0x0,
          clReleaseCommandQueue = 0x0,
          clGetCommandQueueInfo = 0x0,
          clSetCommandQueueProperty = 0x0,
          clCreateBuffer = 0x0,
          clCreateImage2D = 0x0,
          clCreateImage3D = 0x0,
          clRetainMemObject = 0x0,
          clReleaseMemObject = 0x0,
          clGetSupportedImageFormats = 0x0,
          clGetMemObjectInfo = 0x0,
          clGetImageInfo = 0x0,
          clCreateSampler = 0x0,
          clRetainSampler = 0x0,
          clReleaseSampler = 0x0,
          clGetSamplerInfo = 0x0,
          clCreateProgramWithSource = 0x0,
          clCreateProgramWithBinary = 0x0,
          clRetainProgram = 0x0,
          clReleaseProgram = 0x0,
          clBuildProgram = 0x0,
          clUnloadCompiler = 0x0,
          clGetProgramInfo = 0x0,
          clGetProgramBuildInfo = 0x0,
          clCreateKernel = 0x0,
          clCreateKernelsInProgram = 0x0,
          clRetainKernel = 0x0,
          clReleaseKernel = 0x0,
          clSetKernelArg = 0x0,
          clGetKernelInfo = 0x0,
          clGetKernelWorkGroupInfo = 0x0,
          clWaitForEvents = 0x0,
          clGetEventInfo = 0x0,
          clRetainEvent = 0x0,
          clReleaseEvent = 0x0,
          clGetEventProfilingInfo = 0x0,
          clFlush = 0x0,
          clFinish = 0x0,
          clEnqueueReadBuffer = 0x0,
          clEnqueueWriteBuffer = 0x0,
          clEnqueueCopyBuffer = 0x0,
          clEnqueueReadImage = 0x0,
          clEnqueueWriteImage = 0x0,
          clEnqueueCopyImage = 0x0,
          clEnqueueCopyImageToBuffer = 0x0,
          clEnqueueCopyBufferToImage = 0x0,
          clEnqueueMapBuffer = 0x0,
          clEnqueueMapImage = 0x0,
          clEnqueueUnmapMemObject = 0x0,
          clEnqueueNDRangeKernel = 0x0,
          clEnqueueTask = 0x0,
          clEnqueueNativeKernel = 0x0,
          clEnqueueMarker = 0x0,
          clEnqueueWaitForEvents = 0x0,
          clEnqueueBarrier = 0x0,
          clGetExtensionFunctionAddress = 0x0,
          clCreateFromGLBuffer = 0x0,
          clCreateFromGLTexture2D = 0x0,
          clCreateFromGLTexture3D = 0x0,
          clCreateFromGLRenderbuffer = 0x0,
          clGetGLObjectInfo = 0x0,
          clGetGLTextureInfo = 0x0,
          clEnqueueAcquireGLObjects = 0x0,
          clEnqueueReleaseGLObjects = 0x0,
          clGetGLContextInfoKHR = 0x0,
          clGetDeviceIDsFromD3D10KHR = 0x0,
          clCreateFromD3D10BufferKHR = 0x0,
          clCreateFromD3D10Texture2DKHR = 0x0,
          clCreateFromD3D10Texture3DKHR = 0x0,
          clEnqueueAcquireD3D10ObjectsKHR = 0x0,
          clEnqueueReleaseD3D10ObjectsKHR = 0x0,
          clSetEventCallback = 0x0,
          clCreateSubBuffer = 0x0,
          clSetMemObjectDestructorCallback = 0x0,
          clCreateUserEvent = 0x0,
          clSetUserEventStatus = 0x0,
          clEnqueueReadBufferRect = 0x0,
          clEnqueueWriteBufferRect = 0x0,
          clEnqueueCopyBufferRect = 0x0,
          clCreateSubDevicesEXT = 0x0,
          clRetainDeviceEXT = 0x0,
          clReleaseDeviceEXT = 0x0,
          clCreateEventFromGLsyncKHR = 0x0,
          clCreateSubDevices = 0x0,
          clRetainDevice = 0x0,
          clReleaseDevice = 0x0,
          clCreateImage = 0x0,
          clCreateProgramWithBuiltInKernels = 0x0,
          clCompileProgram = 0x0,
          clLinkProgram = 0x0,
          clUnloadPlatformCompiler = 0x0,
          clGetKernelArgInfo = 0x0,
          clEnqueueFillBuffer = 0x0,
          clEnqueueFillImage = 0x0,
          clEnqueueMigrateMemObjects = 0x0,
          clEnqueueMarkerWithWaitList = 0x0,
          clEnqueueBarrierWithWaitList = 0x0,
          clGetExtensionFunctionAddressForPlatform = 0x0,
          clCreateFromGLTexture = 0x0,
          clGetDeviceIDsFromD3D11KHR = 0x0,
          clCreateFromD3D11BufferKHR = 0x0,
          clCreateFromD3D11Texture2DKHR = 0x0,
          clCreateFromD3D11Texture3DKHR = 0x0,
          clCreateFromDX9MediaSurfaceKHR = 0x0,
          clEnqueueAcquireD3D11ObjectsKHR = 0x0,
          clEnqueueReleaseD3D11ObjectsKHR = 0x0,
          clGetDeviceIDsFromDX9MediaAdapterKHR = 0x0,
          clEnqueueAcquireDX9MediaSurfacesKHR = 0x0,
          clEnqueueReleaseDX9MediaSurfacesKHR = 0x0,
          clCreateFromEGLImageKHR = 0x0,
          clEnqueueAcquireEGLObjectsKHR = 0x0,
          clEnqueueReleaseEGLObjectsKHR = 0x0,
          clCreateEventFromEGLSyncKHR = 0x0,
          clCreateCommandQueueWithProperties = 0x0,
          clCreatePipe = 0x0,
          clGetPipeInfo = 0x0,
          clSVMAlloc = 0x0,
          clSVMFree = 0x0,
          clEnqueueSVMFree = 0x0,
          clEnqueueSVMMemcpy = 0x0,
          clEnqueueSVMMemFill = 0x0,
          clEnqueueSVMMap = 0x0,
          clEnqueueSVMUnmap = 0x0,
          clCreateSamplerWithProperties = 0x0,
          clSetKernelArgSVMPointer = 0x0,
          clSetKernelExecInfo = 0x0,
          clGetKernelSubGroupInfoKHR = 0x0,
          clCloneKernel = 0x0,
          clCreateProgramWithIL = 0x0,
          clEnqueueSVMMigrateMem = 0x0,
          clGetDeviceAndHostTimer = 0x0,
          clGetHostTimer = 0x0,
          clGetKernelSubGroupInfo = 0x0,
          clSetDefaultDeviceCommandQueue = 0x0,
          clSetProgramReleaseCallback = 0x0,
          clSetProgramSpecializationConstant = 0x0
        }},
      dispatch_ = 0x7ffff7e0df60 <amd::ICDDispatchedObject::icdVendorDispatch_>
    }, <No data fields>}, 
  members of amd::Memory:
  numDevices_ = 1,
  deviceMemories_ = 0x3ba450,
  deviceAlloced_ = {
    _M_h = {
      <std::__detail::_Hashtable_base<amd::Device const*, std::pair<amd::Device const* const, std::atomic<amd::Memory::AllocState> >, std::__detail::_Select1st, std::equal_to<amd::Device const*>, std::hash<amd::Device const*>, std::__detail::_Mod_range_hashing, std::__detail::_Default_ranged_hash, std::__detail::_Hashtable_traits<false, false, true> >> = {
        <std::__detail::_Hash_code_base<amd::Device const*, std::pair<amd::Device const* const, std::atomic<amd::Memory::AllocState> >, std::__detail::_Select1st, std::hash<amd::Device const*>, std::__detail::_Mod_range_hashing, std::__detail::_Default_ranged_hash, false>> = {
          <std::__detail::_Hashtable_ebo_helper<1, std::hash<amd::Device const*>, true>> = {
            <std::hash<amd::Device const*>> = {
              <std::__hash_base<unsigned long, amd::Device const*>> = {<No data fields>}, <No data fields>}, <No data fields>}, <No data fields>}, 
        <std::__detail::_Hashtable_ebo_helper<0, std::equal_to<amd::Device const*>, true>> = {
          <std::equal_to<amd::Device const*>> = {
            <std::binary_function<amd::Device const*, amd::Device const*, bool>> = {<No data fields>}, <No data fields>}, <No data fields>}, <No data fields>}, 
      <std::__detail::_Map_base<amd::Device const*, std::pair<amd::Device const* const, std::atomic<amd::Memory::AllocState> >, std::allocator<std::pair<amd::Device const* const, std::atomic<amd::Memory::AllocState> > >, std::__detail::_Select1st, std::equal_to<amd::Device const*>, std::hash<amd::Device const*>, std::__detail::_Mod_range_hashing, std::__detail::_Default_ranged_hash, std::__detail::_Prime_rehash_policy, std::__detail::_Hashtable_traits<false, false, true>, true>> = {<No data fields>}, 
      <std::__detail::_Insert<amd::Device const*, std::pair<amd::Device const* const, std::atomic<amd::Memory::AllocState> >, std::allocator<std::pair<amd::Device const* const, std::atomic<amd::Memory::AllocState> > >, std::__detail::_Select1st, std::equal_to<amd::Device const*>, std::hash<amd::Device const*>, std::__detail::_Mod_range_hashing, std::__detail::_Default_ranged_hash, std::__detail::_Prime_rehash_policy, std::__detail::_Hashtable_traits<false, false, true>, false>> = {
        <std::__detail::_Insert_base<amd::Device const*, std::pair<amd::Device const* const, std::atomic<amd::Memory::AllocState> >, std::allocator<std::pair<amd::Device const* const, std::atomic<amd::Memory::AllocState> > >, std::__detail::_Select1st, std::equal_to<amd::Device const*>, std::hash<amd::Device const*>, std::__detail::_Mod_range_hashing, std::__detail::_Default_ranged_hash, std::__detail::_Prime_rehash_policy, std::__detail::_Hashtable_traits<false, false, true> >> = {<No data fields>}, <No data fields>}, 
      <std::__detail::_Rehash_base<amd::Device const*, std::pair<amd::Device const* const, std::atomic<amd::Memory::AllocState> >, std::allocator<std::pair<amd::Device const* const, std::atomic<amd::Memory::AllocState> > >, std::__detail::_Select1st, std::equal_to<amd::Device const*>, std::hash<amd::Device const*>, std::__detail::_Mod_range_hashing, std::__detail::_Default_ranged_hash, std::__detail::_Prime_rehash_policy, std::__detail::_Hashtable_traits<false, false, true>, std::integral_constant<bool, true> >> = {<No data fields>}, 
      <std::__detail::_Equality<amd::Device const*, std::pair<amd::Device const* const, std::atomic<amd::Memory::AllocState> >, std::allocator<std::pair<amd::Device const* const, std::atomic<amd::Memory::AllocState> > >, std::__detail::_Select1st, std::equal_to<amd::Device const*>, std::hash<amd::Device const*>, std::__detail::_Mod_range_hashing, std::__detail::_Default_ranged_hash, std::__detail::_Prime_rehash_policy, std::__detail::_Hashtable_traits<false, false, true>, true>> = {<No data fields>}, 
      <std::__detail::_Hashtable_alloc<std::allocator<std::__detail::_Hash_node<std::pair<amd::Device const* const, std::atomic<amd::Memory::AllocState> >, false> > >> = {
        <std::__detail::_Hashtable_ebo_helper<0, std::allocator<std::__detail::_Hash_node<std::pair<amd::Device const* const, std::atomic<amd::Memory::AllocState> >, false> >, true>> = {
          <std::allocator<std::__detail::_Hash_node<std::pair<amd::Device const* const, std::atomic<amd::Memory::AllocState> >, false> >> = {
            <__gnu_cxx::new_allocator<std::__detail::_Hash_node<std::pair<amd::Device const* const, std::atomic<amd::Memory::AllocState> >, false> >> = {<No data fields>}, <No data fields>}, <No data fields>}, <No data fields>}, 
      <std::_Enable_default_constructor<true, std::__detail::_Hash_node_base>> = {<No data fields>}, 
      members of std::_Hashtable<amd::Device const*, std::pair<amd::Device const* const, std::atomic<amd::Memory::AllocState> >, std::allocator<std::pair<amd::Device const* const, std::atomic<amd::Memory::AllocState> > >, std::__detail::_Select1st, std::equal_to<amd::Device const*>, std::hash<amd::Device const*>, std::__detail::_Mod_range_hashing, std::__detail::_Default_ranged_hash, std::__detail::_Prime_rehash_policy, std::__detail::_Hashtable_traits<false, false, true> >:
      _M_buckets = 0x3ba470,
      _M_bucket_count = 13,
      _M_before_begin = {
        _M_nxt = 0xa054f0
      },
      _M_element_count = 2,
      _M_rehash_policy = {
        static _S_growth_factor = 2,
        _M_max_load_factor = 1,
        _M_next_resize = 13
      },
      _M_single_bucket = 0x0
    }
  },
  destructorCallbacks_ = {
    _M_b = {
      _M_p = 0x0
    }
  },
  context_ = {
    <amd::EmbeddedObject> = {<No data fields>}, 
    members of amd::SharedReference<amd::Context>:
    reference_ = @0x3b8db0
  },
  parent_ = 0x0,
  type_ = 4336,
  hostMemRef_ = {
    alloced_ = false,
    hostMem_ = 0x0,
    size_ = 0
  },
  origin_ = 0,
  size_ = 768,
  flags_ = 0,
  version_ = 0,
  lastWriter_ = 0x0,
  interopObj_ = 0x0,
  vDev_ = 0x0,
  mapCount_ = {
    <std::__atomic_base<unsigned int>> = {
      static _S_alignment = 4,
      _M_i = 0
    }, 
    members of std::atomic<unsigned int>:
    static is_always_lock_free = true
  },
  svmHostAddress_ = 0x7ffde6600000,
  {
    {
      isParent_ = 0,
      forceSysMemAlloc_ = 0,
      svmPtrCommited_ = 0,
      canBeCached_ = 1,
      p2pAccess_ = 0,
      ipcShared_ = 0,
      largeBarSystem_ = 0,
      image_view_ = 0
    },
    flagsEx_ = 8
  },
  uniqueId_ = 4,
  userData_ = {
    deviceId = 1,
    data = 0x0,
    flags = 0,
    pitch_ = 256,
    width_ = 20,
    height_ = 3,
    depth_ = 1,
    sync_mem_ops_ = false
  },
  lockMemoryOps_ = {
    <amd::HeapObject> = {<No data fields>}, 
    members of amd::Monitor:
    static kLockBit = 1,
    static kMaxSpinIter = 55,
    static kMaxReadSpinIter = 50,
    contendersList_ = {
      <std::__atomic_base<long>> = {
        static _S_alignment = 8,
        _M_i = 0
      }, 
      members of std::atomic<long>:
      static is_always_lock_free = true
    },
    name_ = "Memory Ops Lock", '\000' <repeats 48 times>,
    onDeck_ = {
      <std::__atomic_base<long>> = {
        static _S_alignment = 8,
        _M_i = 0
      }, 
      members of std::atomic<long>:
      static is_always_lock_free = true
    },
    waitersList_ = 0x0,
    owner_ = 0x0,
    lockCount_ = 0,
    recursive_ = true
  },
  subBuffers_ = {
    <std::__cxx11::_List_base<amd::Memory*, std::allocator<amd::Memory*> >> = {
      _M_impl = {
        <std::allocator<std::_List_node<amd::Memory*> >> = {
          <__gnu_cxx::new_allocator<std::_List_node<amd::Memory*> >> = {<No data fields>}, <No data fields>}, 
        members of std::__cxx11::_List_base<amd::Memory*, std::allocator<amd::Memory*> >::_List_impl:
        _M_node = {
          <std::__detail::_List_node_base> = {
            _M_next = 0x3ba420,
            _M_prev = 0x3ba420
          }, 
          members of std::__detail::_List_node_header:
          _M_size = 0
        }
      }
    }, <No data fields>},
  svmBase_ = 0x3ba4e0
}

Okey, it seems, that roc_device_ is not assigned.


In file: /usr/src/clr/rocclr/device/rocm/rocvirtual.hpp
   302              const std::vector<uint32_t>& cuMask = {},
   303              amd::CommandQueue::Priority priority = amd::CommandQueue::Priority::Normal);
   304   ~VirtualGPU();
   305 
   306   bool create();
 ► 307   const Device& dev() const { return roc_device_; }
   308 
   309   void profilingBegin(amd::Command& command, bool sdmaProfiling = false);
   310   void profilingEnd(amd::Command& command);
   311 
   312   void updateCommandsState(amd::Command* list) const;

   pwndbg> p roc_device_ 
$13 = (roc::Device &) <error reading variable: Cannot access memory at address 0x0>

pwndbg> l *&dev().getRocMemory
0x7ffff6983f64 is in roc::Device::getRocMemory(amd::Memory*) const (/usr/src/clr/rocclr/device/rocm/rocdevice.cpp:1926).

1926    Memory* Device::getRocMemory(amd::Memory* mem) const {
1927      return static_cast<roc::Memory*>(mem->getDeviceMemory(*this));
1928    }

pwndbg> b *mem->getDeviceMemory
Breakpoint 3 at 0x7ffff6954db0: file /usr/src/clr/rocclr/platform/memory.cpp, line 390.
pwndbg> c
Continuing.

Thread 1 "hiptest" hit Breakpoint 3, amd::Memory::getDeviceMemory (this=0x7fffffffd8b0, dev=..., alloc=false) at /usr/src/clr/rocclr/platform/memory.cpp:390
390     device::Memory* Memory::getDeviceMemory(const Device& dev, bool alloc) {

Oh!


In file: /usr/src/clr/rocclr/platform/memory.cpp
   387   deviceAlloced_[dev].store(AllocRealloced, std::memory_order_release);
   388 }
   389 
   390 device::Memory* Memory::getDeviceMemory(const Device& dev, bool alloc) {
   391   device::Memory* dm = NULL;
 ► 392   for (uint i = 0; i < numDevices_; ++i) {
   393     if (deviceMemories_[i].ref_ == &dev) {
   394       dm = deviceMemories_[i].value_;
   395       break;
   396     }
   397   }

pwndbg> p dm
$28 = (device::Memory *) 0x0
pwndbg> p numDevices_ 
$29 = 1

I think, that numDevices_ == 1, but


pwndbg> !rocm-smi
!rocm: event not found

========================= ROCm System Management Interface =========================
=================================== Concise Info ===================================
GPU  Temp (DieEdge)  AvgPwr  SCLK    MCLK    Fan    Perf  PwrCap  VRAM%  GPU%  
0    53.0c           15.0W   300Mhz  945Mhz  9.41%  auto  110.0W    1%   0%    
1    42.0c           14.0W   300Mhz  945Mhz  9.41%  auto  110.0W    1%   0%    
====================================================================================
=============================== End of ROCm SMI Log ================================

It is Radeon Pro V340, btw.


pwndbg> p &numDevices_
$31 = (size_t *) 0x3175b8
pwndbg> l *0x3175b8
pwndbg> xi 0x3175b8
Extended information for virtual address 0x3175b8:

  Containing mapping:
          0x20f000          0x11f3000 rw-p   fe4000 0      [heap]

  Offset information:
         Mapped Area 0x3175b8 = 0x20f000 + 0x1085b8

... restart till main () ...

pwndbg> awatch -l *0x3175b8
Hardware access (read/write) watchpoint 5: -location *0x3175b8

allocating memPith 20 X 3

Hardware access (read/write) watchpoint 5: -location *0x3175b8

Value = 0

...

Continuing.

Hardware access (read/write) watchpoint 5: -location *0x3175b8

Value = 3243408
0x00007ffff5e8cc1c in _IO_new_file_underflow (fp=0x3175a0) at ./libio/fileops.c:511

   f 0     7ffff5e8cc1c _IO_file_underflow+364
   f 1     7ffff5e801c2 getdelim+274
   f 2     7ffff65dbb8c
 ► f 3     7ffff65e05f8 amdgpu_device_initialize+1080
   f 4     7fffed0f4738
   f 5     7fffed0fe45e
   f 6     7fffed0ff9b3
   f 7     7fffed0f9735
   f 8     7fffed04fd6f
   f 9     7fffed0730e5
   f 10     7fffed07332c
   f 11     7fffed050e4e
   f 12     7ffff697cf64 roc::Device::init()+256
   f 13     7ffff68f6fc2 amd::Device::init()+150

pwndbg> xi
Extended information for virtual address 0x7ffff65e05f8:

  Containing mapping:
    0x7ffff65db000     0x7ffff65e2000 r-xp     7000 3000   /opt/amdgpu/lib/x86_64-linux-gnu/libdrm_amdgpu.so.1.0.0

  Offset information:
         Mapped Area 0x7ffff65e05f8 = 0x7ffff65db000 + 0x55f8
         File (Base) 0x7ffff65e05f8 = 0x7ffff65d8000 + 0x85f8
      File (Segment) 0x7ffff65e05f8 = 0x7ffff65db000 + 0x55f8
         File (Disk) 0x7ffff65e05f8 = /opt/amdgpu/lib/x86_64-linux-gnu/libdrm_amdgpu.so.1.0.0 + 0x85f8

 Containing ELF sections:
               .text 0x7ffff65e05f8 = 0x7ffff65db9a0 + 0x4c58

aaaanddd


pwndbg> 
Continuing.
[New Thread 0x7fffecfff640 (LWP 284358)]
[New Thread 0x7fffe7fff640 (LWP 284359)]
[Thread 0x7fffe7fff640 (LWP 284359) exited]
/long_pathname_so_that_rpms_can_package_the_debug_info/src/rocm-gdb/gdb/amd-dbgapi-target.c:1096: internal-error: remove_watchpoint: Assertion `type == hw_write' failed.
A problem internal to GDB has been detected,
further debugging may prove unreliable.
----- Backtrace -----
0x5608b6c0c520 gdb_internal_backtrace_1
        /long_pathname_so_that_rpms_can_package_the_debug_info/src/rocm-gdb/gdb/bt-utils.c:122
0x5608b6c0c520 _Z22gdb_internal_backtracev
        /long_pathname_so_that_rpms_can_package_the_debug_info/src/rocm-gdb/gdb/bt-utils.c:168
0x5608b6feb404 internal_vproblem
        /long_pathname_so_that_rpms_can_package_the_debug_info/src/rocm-gdb/gdb/utils.c:398
0x5608b6feb6c0 _Z15internal_verrorPKciS0_P13__va_list_tag
        /long_pathname_so_that_rpms_can_package_the_debug_info/src/rocm-gdb/gdb/utils.c:478
0x5608b7131294 _Z18internal_error_locPKciS0_z
        /long_pathname_so_that_rpms_can_package_the_debug_info/src/rocm-gdb/gdbsupport/errors.cc:58
0x5608b6b85f1f _ZN17amd_dbgapi_target17remove_watchpointEmi17target_hw_bp_typeP10expression
        /long_pathname_so_that_rpms_can_package_the_debug_info/src/rocm-gdb/gdb/amd-dbgapi-target.c:1096
0x5608b6be265a remove_breakpoint_1
        /long_pathname_so_that_rpms_can_package_the_debug_info/src/rocm-gdb/gdb/breakpoint.c:4148
0x5608b6be9448 remove_breakpoint
        /long_pathname_so_that_rpms_can_package_the_debug_info/src/rocm-gdb/gdb/breakpoint.c:4183
0x5608b6be9685 _Z18remove_breakpointsv
        /long_pathname_so_that_rpms_can_package_the_debug_info/src/rocm-gdb/gdb/breakpoint.c:3354
0x5608b6db5ba7 _Z24maybe_remove_breakpointsv
        /long_pathname_so_that_rpms_can_package_the_debug_info/src/rocm-gdb/gdb/infrun.c:8644
0x5608b6db71ec _Z11normal_stopv
        /long_pathname_so_that_rpms_can_package_the_debug_info/src/rocm-gdb/gdb/infrun.c:8837
0x5608b6dc729e _Z20fetch_inferior_eventv
        /long_pathname_so_that_rpms_can_package_the_debug_info/src/rocm-gdb/gdb/infrun.c:4301
0x5608b71319f5 gdb_wait_for_event
        /long_pathname_so_that_rpms_can_package_the_debug_info/src/rocm-gdb/gdbsupport/event-loop.cc:694
0x5608b7132479 gdb_wait_for_event
        /long_pathname_so_that_rpms_can_package_the_debug_info/src/rocm-gdb/gdbsupport/event-loop.cc:593
0x5608b7132479 _Z16gdb_do_one_eventi
        /long_pathname_so_that_rpms_can_package_the_debug_info/src/rocm-gdb/gdbsupport/event-loop.cc:217
0x5608b6e0ea99 start_event_loop
        /long_pathname_so_that_rpms_can_package_the_debug_info/src/rocm-gdb/gdb/main.c:411
0x5608b6e0ea99 captured_command_loop
        /long_pathname_so_that_rpms_can_package_the_debug_info/src/rocm-gdb/gdb/main.c:471
0x5608b6e10714 captured_main
        /long_pathname_so_that_rpms_can_package_the_debug_info/src/rocm-gdb/gdb/main.c:1330
0x5608b6e10714 _Z8gdb_mainP18captured_main_args
        /long_pathname_so_that_rpms_can_package_the_debug_info/src/rocm-gdb/gdb/main.c:1345
0x5608b6b3a45f main
        /long_pathname_so_that_rpms_can_package_the_debug_info/src/rocm-gdb/gdb/gdb.c:32
---------------------

This is a bug, please report it.  For instructions, see:
<https://github.com/ROCm-Developer-Tools/ROCgdb/issues>.

Boooom =)

Seems, I should try other way

kotee4ko commented 12 months ago

@iassiour Thanks for the operative reply, Sir.

iassiour commented 12 months ago

@kotee4ko could you please check that the call to dev->createMemory() https://github.com/ROCm-Developer-Tools/clr/blob/develop/rocclr/platform/memory.cpp#L339 returns a valid pointer in all cases. I think it could also help to export AMD_LOG_LEVEL=4 and then re-run the test, in case there is any "memory allocation failed" errors coming up.

kotee4ko commented 12 months ago

pwndbg> r
pwndbg> [Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib/x86_64-linux-gnu/libthread_db.so.1".
row 0 column 0 value 1.000000 
row 0 column 1 value 1.000000 
row 0 column 2 value 1.000000 
row 0 column 3 value 1.000000 
row 0 column 4 value 1.000000 
row 1 column 0 value 1.000000 
row 1 column 1 value 1.000000 
row 1 column 2 value 1.000000 
row 1 column 3 value 1.000000 
row 1 column 4 value 1.000000 
row 2 column 0 value 1.000000 
row 2 column 1 value 1.000000 
row 2 column 2 value 1.000000 
row 2 column 3 value 1.000000 
row 2 column 4 value 1.000000 
allocating memPith 20 X 3
:3:rocdevice.cpp            :442 : 849861451961 us: [pid:288603 tid:0x7ffff7e1ca80] Initializing HSA stack.
[New Thread 0x7fffecfff640 (LWP 288606)]
:3:comgrctx.cpp             :33  : 849861478084 us: [pid:288603 tid:0x7ffff7e1ca80] Loading COMGR library.
[Thread 0x7ffde7dff640 (LWP 288607) exited]
:3:rocdevice.cpp            :208 : 849861478178 us: [pid:288603 tid:0x7ffff7e1ca80] Numa selects cpu agent[1]=0x315cb0(fine=0x315ea0,coarse=0x316680) for gpu agent=0x368d40 CPU<->GPU XGMI=0
:3:rocdevice.cpp            :1680: 849861478738 us: [pid:288603 tid:0x7ffff7e1ca80] Gfx Major/Minor/Stepping: 9/0/0
:3:rocdevice.cpp            :1682: 849861478754 us: [pid:288603 tid:0x7ffff7e1ca80] HMM support: 1, XNACK: 0, Direct host access: 0
:3:rocdevice.cpp            :1684: 849861478762 us: [pid:288603 tid:0x7ffff7e1ca80] Max SDMA Read Mask: 0x0, Max SDMA Write Mask: 0x0
:4:rocdevice.cpp            :2063: 849861479454 us: [pid:288603 tid:0x7ffff7e1ca80] Allocate hsa host memory 0x7ffeec400000, size 0x101000
:4:rocdevice.cpp            :2063: 849861481005 us: [pid:288603 tid:0x7ffff7e1ca80] Allocate hsa host memory 0x7ffeec200000, size 0x101000
:3:rocdevice.cpp            :208 : 849861481295 us: [pid:288603 tid:0x7ffff7e1ca80] Numa selects cpu agent[1]=0x315cb0(fine=0x315ea0,coarse=0x316680) for gpu agent=0x386ab0 CPU<->GPU XGMI=0
:3:rocdevice.cpp            :1680: 849861481398 us: [pid:288603 tid:0x7ffff7e1ca80] Gfx Major/Minor/Stepping: 9/0/0
:3:rocdevice.cpp            :1682: 849861481407 us: [pid:288603 tid:0x7ffff7e1ca80] HMM support: 1, XNACK: 0, Direct host access: 0
:3:rocdevice.cpp            :1684: 849861481411 us: [pid:288603 tid:0x7ffff7e1ca80] Max SDMA Read Mask: 0x0, Max SDMA Write Mask: 0x0
:4:rocdevice.cpp            :2063: 849861481529 us: [pid:288603 tid:0x7ffff7e1ca80] Allocate hsa host memory 0x7ffff65d4000, size 0x70
:4:rocdevice.cpp            :2063: 849861482178 us: [pid:288603 tid:0x7ffff7e1ca80] Allocate hsa host memory 0x7ffde7000000, size 0x101000
:4:rocdevice.cpp            :2063: 849861483486 us: [pid:288603 tid:0x7ffff7e1ca80] Allocate hsa host memory 0x7ffde6e00000, size 0x101000
:4:runtime.cpp              :83  : 849861484305 us: [pid:288603 tid:0x7ffff7e1ca80] init
:3:hip_context.cpp          :48  : 849861484326 us: [pid:288603 tid:0x7ffff7e1ca80] Direct Dispatch: 1
:3:hip_memory.cpp           :822 : 849861484462 us: [pid:288603 tid:0x7ffff7e1ca80]  hipMallocPitch ( 0x7fffffffe260, 0x7fffffffe258, 20, 3 ) 
:4:rocdevice.cpp            :2191: 849861484654 us: [pid:288603 tid:0x7ffff7e1ca80] Allocate hsa device memory 0x7ffde6a00000, size 0x300
:3:rocdevice.cpp            :2230: 849861484664 us: [pid:288603 tid:0x7ffff7e1ca80] device=0x3a61f0, freeMem_ = 0x3fefffd00
:3:hip_memory.cpp           :824 : 849861484677 us: [pid:288603 tid:0x7ffff7e1ca80] hipMallocPitch: Returned hipSuccess : 0x7ffde6a00000
:3:hip_device_runtime.cpp   :561 : 849861484688 us: [pid:288603 tid:0x7ffff7e1ca80]  hipSetDevice ( 1 ) 
:3:hip_device_runtime.cpp   :565 : 849861484695 us: [pid:288603 tid:0x7ffff7e1ca80] hipSetDevice: Returned hipSuccess : 
:3:hip_memory.cpp           :822 : 849861484702 us: [pid:288603 tid:0x7ffff7e1ca80]  hipMallocPitch ( 0x7fffffffe268, 0x7fffffffe258, 20, 3 ) 
:4:rocdevice.cpp            :2191: 849861484832 us: [pid:288603 tid:0x7ffff7e1ca80] Allocate hsa device memory 0x7ffde6600000, size 0x300
:3:rocdevice.cpp            :2230: 849861484841 us: [pid:288603 tid:0x7ffff7e1ca80] device=0x3b5c20, freeMem_ = 0x3fefffd00
:3:hip_memory.cpp           :824 : 849861484853 us: [pid:288603 tid:0x7ffff7e1ca80] hipMallocPitch: Returned hipSuccess : 0x7ffde6600000
:3:hip_device_runtime.cpp   :561 : 849861484859 us: [pid:288603 tid:0x7ffff7e1ca80]  hipSetDevice ( 0 ) 
:3:hip_device_runtime.cpp   :565 : 849861484865 us: [pid:288603 tid:0x7ffff7e1ca80] hipSetDevice: Returned hipSuccess : 
pitch = 256
:3:hip_memory.cpp           :2385: 849861484888 us: [pid:288603 tid:0x7ffff7e1ca80]  hipMemcpy2D ( 0x7ffde6a00000, 256, 0x7fffffffe270, 20, 20, 3, hipMemcpyDefault ) 
:3:rocdevice.cpp            :2732: 849861484917 us: [pid:288603 tid:0x7ffff7e1ca80] number of allocated hardware queues with low priority: 0, with normal priority: 0, with high priority: 0, maximum per priority is: 4
:3:rocdevice.cpp            :2810: 849861495002 us: [pid:288603 tid:0x7ffff7e1ca80] created hardware queue 0x7ffff65c4000 with size 16384 with priority 1, cooperative: 0
:3:rocdevice.cpp            :2902: 849861495034 us: [pid:288603 tid:0x7ffff7e1ca80] acquireQueue refCount: 0x7ffff65c4000 (1)
:4:rocdevice.cpp            :2063: 849861495716 us: [pid:288603 tid:0x7ffff7e1ca80] Allocate hsa host memory 0x7ffde4e00000, size 0x100000
:3:devprogram.cpp           :2684: 849861747920 us: [pid:288603 tid:0x7ffff7e1ca80] Using Code Object V5.
:4:command.cpp              :349 : 849861822261 us: [pid:288603 tid:0x7ffff7e1ca80] Command (CopyHostToDevice2D) enqueued: 0x66a870
:4:rocmemory.cpp            :966 : 849861822608 us: [pid:288603 tid:0x7ffff7e1ca80] Locking to pool 0x316680, size 0x1000, HostPtr = 0x7fffffffe000, DevPtr = 0x7fffffffe000
:4:rocblit.cpp              :476 : 849861822652 us: [pid:288603 tid:0x7ffff7e1ca80] HSA Async Copy Rect dst=0x7ffde6a00000, src=0x7fffffffe270, wait_event=0x0 completion_signal=0x7ffff7fb4780
:4:rocvirtual.cpp           :553 : 849861825583 us: [pid:288603 tid:0x7ffff7e1ca80] Host wait on completion_signal=0x7ffff7fb4780
:3:rocvirtual.hpp           :66  : 849861825594 us: [pid:288603 tid:0x7ffff7e1ca80] Host active wait for Signal = (0x7ffff7fb4780) for -1 ns
:4:command.cpp              :289 : 849861825640 us: [pid:288603 tid:0x7ffff7e1ca80] Queue marker to command queue: 0x22e2c0
:4:command.cpp              :349 : 849861825645 us: [pid:288603 tid:0x7ffff7e1ca80] Command (InternalMarker) enqueued: 0x8868a0
:4:command.cpp              :179 : 849861825653 us: [pid:288603 tid:0x7ffff7e1ca80] Command 0x66a870 complete
:4:command.cpp              :173 : 849861825658 us: [pid:288603 tid:0x7ffff7e1ca80] Command 0x8868a0 complete (Wall: 849861825657, CPU: 0, GPU: 0 us)
:4:command.cpp              :253 : 849861825663 us: [pid:288603 tid:0x7ffff7e1ca80] Waiting for event 0x66a870 to complete, current status 0
:4:command.cpp              :268 : 849861825667 us: [pid:288603 tid:0x7ffff7e1ca80] Event 0x66a870 wait completed
:3:hip_memory.cpp           :2387: 849861825676 us: [pid:288603 tid:0x7ffff7e1ca80] hipMemcpy2D: Returned hipSuccess : : duration: 340788 us
:3:hip_memory.cpp           :1405: 849861825700 us: [pid:288603 tid:0x7ffff7e1ca80]  hipMemcpyDtoD ( 0x7ffde6600000, 0x7ffde6a00000, 16 ) 
:4:command.cpp              :349 : 849861825722 us: [pid:288603 tid:0x7ffff7e1ca80] Command (CopyDeviceToDevice) enqueued: 0x72e1b0
:4:rocmemory.cpp            :966 : 849861829032 us: [pid:288603 tid:0x7ffff7e1ca80] Locking to pool 0x316680, size 0x400000, HostPtr = 0x7ffde71fe000, DevPtr = 0x7ffde71fe000
:4:rocmemory.cpp            :966 : 849861829055 us: [pid:288603 tid:0x7ffff7e1ca80] Locking to pool 0x316680, size 0x400000, HostPtr = 0x7ffde71fe000, DevPtr = 0x7ffde71fe000
:3:rocdevice.cpp            :2732: 849861829068 us: [pid:288603 tid:0x7ffff7e1ca80] number of allocated hardware queues with low priority: 0, with normal priority: 1, with high priority: 0, maximum per priority is: 4
:3:rocdevice.cpp            :2810: 849861832850 us: [pid:288603 tid:0x7ffff7e1ca80] created hardware queue 0x7ffff6594000 with size 16384 with priority 1, cooperative: 1
:4:rocdevice.cpp            :2063: 849861833366 us: [pid:288603 tid:0x7ffff7e1ca80] Allocate hsa host memory 0x7ffde4a00000, size 0x100000
:4:rocblit.cpp              :727 : 849861833694 us: [pid:288603 tid:0x7ffff7e1ca80] HSA Async Copy dst=0x7ffde71fe000, src=0x7ffde6a00000, size=16, wait_event=0x0, completion_signal=0x7ffff6599300
:4:rocvirtual.cpp           :553 : 849861834859 us: [pid:288603 tid:0x7ffff7e1ca80] Host wait on completion_signal=0x7ffff6599300
:3:rocvirtual.hpp           :66  : 849861834869 us: [pid:288603 tid:0x7ffff7e1ca80] Host active wait for Signal = (0x7ffff6599300) for -1 ns
:3:rocdevice.cpp            :2732: 849861834884 us: [pid:288603 tid:0x7ffff7e1ca80] number of allocated hardware queues with low priority: 0, with normal priority: 0, with high priority: 0, maximum per priority is: 4
:3:rocdevice.cpp            :2810: 849861842230 us: [pid:288603 tid:0x7ffff7e1ca80] created hardware queue 0x7ffff657a000 with size 16384 with priority 1, cooperative: 1
:4:rocdevice.cpp            :2063: 849861842691 us: [pid:288603 tid:0x7ffff7e1ca80] Allocate hsa host memory 0x7ffde4600000, size 0x100000
:3:devprogram.cpp           :2684: 849862082361 us: [pid:288603 tid:0x7ffff7e1ca80] Using Code Object V5.
:4:rocblit.cpp              :727 : 849862092362 us: [pid:288603 tid:0x7ffff7e1ca80] HSA Async Copy dst=0x7ffde6600000, src=0x7ffde71fe000, size=16, wait_event=0x0, completion_signal=0x7ffff656fc80
:4:rocvirtual.cpp           :553 : 849862093680 us: [pid:288603 tid:0x7ffff7e1ca80] Host wait on completion_signal=0x7ffff656fc80
:3:rocvirtual.hpp           :66  : 849862093690 us: [pid:288603 tid:0x7ffff7e1ca80] Host active wait for Signal = (0x7ffff656fc80) for -1 ns
:4:command.cpp              :289 : 849862097671 us: [pid:288603 tid:0x7ffff7e1ca80] Queue marker to command queue: 0x22e2c0
:4:command.cpp              :349 : 849862097678 us: [pid:288603 tid:0x7ffff7e1ca80] Command (InternalMarker) enqueued: 0x67af40
:4:command.cpp              :179 : 849862097697 us: [pid:288603 tid:0x7ffff7e1ca80] Command 0x72e1b0 complete
:4:command.cpp              :173 : 849862097702 us: [pid:288603 tid:0x7ffff7e1ca80] Command 0x67af40 complete (Wall: 849862097701, CPU: 0, GPU: 0 us)
:4:command.cpp              :253 : 849862097707 us: [pid:288603 tid:0x7ffff7e1ca80] Waiting for event 0x72e1b0 to complete, current status 0
:4:command.cpp              :268 : 849862097712 us: [pid:288603 tid:0x7ffff7e1ca80] Event 0x72e1b0 wait completed
:3:hip_memory.cpp           :1411: 849862097721 us: [pid:288603 tid:0x7ffff7e1ca80] hipMemcpyDtoD: Returned hipSuccess : : duration: 272021 us
:3:hip_memory.cpp           :2385: 849862097736 us: [pid:288603 tid:0x7ffff7e1ca80]  hipMemcpy2D ( 0x7ffde6600000, 256, 0x7ffde6a00000, 20, 20, 3, hipMemcpyDeviceToDevice ) 
:4:command.cpp              :349 : 849862097757 us: [pid:288603 tid:0x7ffff7e1ca80] Command (CopyDeviceToDevice2D) enqueued: 0x72e1b0

Thread 1 "hiptest" received signal SIGSEGV, Segmentation fault.
0x00007ffff68fd680 in device::Memory::owner (this=0x0) at /usr/src/clr/rocclr/device/device.hpp:768
768       amd::Memory* owner() const { return owner_; }

as for return value from dev->createMemory(*this) the situation is next:

*0x7ffff6954a33 is next instruction right after needed call

# breaking on main() 

pwndbg> u 0x00007ffff6954a33
   0x7ffff6954a2f    mov    edi, eax
   0x7ffff6954a31    call   rcx

 ► 0x7ffff6954a33    mov    qword ptr [rbp - 0x20], rax
   0x7ffff6954a37    cmp    qword ptr [rbp - 0x20], 0
   0x7ffff6954a3c    je     0x7ffff6954b2f

   0x7ffff6954a42    mov    rax, qword ptr [rbp - 0x48]
   0x7ffff6954a46    mov    rdx, qword ptr [rax + 0x20]
   0x7ffff6954a4a    mov    rax, qword ptr [rbp - 0x48]
   0x7ffff6954a4e    mov    rax, qword ptr [rax + 0x18]
   0x7ffff6954a52    shl    rax, 4
   0x7ffff6954a56    add    rdx, rax

pwndbg> l *0x00007ffff6954a33
0x7ffff6954a33 is in amd::Memory::addDeviceMemory(amd::Device const*) (/usr/src/clr/rocclr/platform/memory.cpp:337).
332         if (numDevices() == NumDevicesWithP2P()) {
333           // Mark the allocation as an empty
334           deviceAlloced_[dev].store(AllocInit, std::memory_order_release);
335           return result;
336         }
337         device::Memory* dm = dev->createMemory(*this);
338
339         // Add the new memory allocation to the device map
340         if (NULL != dm) {
341           deviceMemories_[numDevices_].ref_ = dev;

pwndbg> dprintf *0x7ffff6954a33,"\nreturn %#llx\n",$rax
Dprintf 3 at 0x7ffff6954a33: file /usr/src/clr/rocclr/platform/memory.cpp, line 337.
pwndbg> c
Continuing.
row 0 column 0 value 1.000000 
row 0 column 1 value 1.000000 
row 0 column 2 value 1.000000 
row 0 column 3 value 1.000000 
row 0 column 4 value 1.000000 
row 1 column 0 value 1.000000 
row 1 column 1 value 1.000000 
row 1 column 2 value 1.000000 
row 1 column 3 value 1.000000 
row 1 column 4 value 1.000000 
row 2 column 0 value 1.000000 
row 2 column 1 value 1.000000 
row 2 column 2 value 1.000000 
row 2 column 3 value 1.000000 
row 2 column 4 value 1.000000 
allocating memPith 20 X 3
[New Thread 0x7fffecfff640 (LWP 288927)]
[New Thread 0x7ffde7dff640 (LWP 288928)]
[Thread 0x7ffde7dff640 (LWP 288928) exited]

return 0x3b7ec0

return 0x3b9ed0

return 0x3ba520
pitch = 256

return 0x4764b0

return 0x4764b0

return 0x3eaed0

Thread 1 "hiptest" received signal SIGSEGV, Segmentation fault.
0x00007ffff68fd680 in device::Memory::owner (this=0x0) at /usr/src/clr/rocclr/device/device.hpp:768

So, it seems that the answer is yes.

But I notice one strange behavior:

Thread 1 "hiptest" hit Breakpoint 5, 0x00007ffff6954a31 in amd::Memory::addDeviceMemory (this=0x3b7c20, dev=0x3a61f0) at /usr/src/clr/rocclr/platform/memory.cpp:337
337         device::Memory* dm = dev->createMemory(*this);
 ► 0x7ffff6954a31    call   rcx

   0x7ffff6954a33    mov    qword ptr [rbp - 0x20], rax

pwndbg> ni
0x00007ffff6954a33      337         device::Memory* dm = dev->createMemory(*this);

   0x7ffff6954a31    call   rcx

 ► 0x7ffff6954a33    mov    qword ptr [rbp - 0x20], rax

#setting hbreak right after first  call to $rcx using next command:

pwndbg> hb *$pc

Hardware assisted breakpoint 6 at 0x7ffff6954a33: file /usr/src/clr/rocclr/platform/memory.cpp, line 337.

# 0x7ffff6954a33 is as expected...

pwndbg> c
Continuing.

Thread 1 "hiptest" hit Breakpoint 5, 0x00007ffff6954a31 in amd::Memory::addDeviceMemory (this=0x317700, dev=0x3a61f0) at /usr/src/clr/rocclr/platform/memory.cpp:337
337         device::Memory* dm = dev->createMemory(*this);

pwndbg> 
Continuing.

Thread 1 "hiptest" hit Breakpoint 6, 0x00007ffff6954a33 in amd::Memory::addDeviceMemory (this=0x317700, dev=0x3a61f0) at /usr/src/clr/rocclr/platform/memory.cpp:337
337         device::Memory* dm = dev->createMemory(*this);

...
#once again  it hits 5 and 6
...

#and now the interesting

pwndbg> 
Continuing.
pitch = 256

Thread 1 "hiptest" hit Breakpoint 5, 0x00007ffff6954a31 in amd::Memory::addDeviceMemory (this=0x66ea70, dev=0x3a61f0) at /usr/src/clr/rocclr/platform/memory.cpp:337
337         device::Memory* dm = dev->createMemory(*this);

bt:
 ► f 0     7ffff6954a31
   f 1     7ffff6954e46
   f 2     7ffff69547d6
   f 3     7ffff6955822
   f 4     7ffff6a27918
   f 5     7ffff6a23930
   f 6     7ffff69b5c21 roc::VirtualGPU::submitWriteMemory(amd::WriteMemoryCommand&)+1485
   f 7     7ffff67cd374 amd::WriteMemoryCommand::submit(device::VirtualDevice&)+50
   f 8     7ffff693d75a amd::Command::enqueue()+1000
   f 9     7ffff67cf94a
   f 10     7ffff678442b

pwndbg> 
Continuing.

Thread 1 "hiptest" hit Breakpoint 5, 0x00007ffff6954a31 in amd::Memory::addDeviceMemory (this=0x3b79a0, dev=0x3a61f0) at /usr/src/clr/rocclr/platform/memory.cpp:337
337         device::Memory* dm = dev->createMemory(*this);

bt:
 ► f 0     7ffff6954a31
   f 1     7ffff6954e46
   f 2     7ffff69b7a65 roc::VirtualGPU::submitCopyMemoryP2P(amd::CopyMemoryP2PCommand&)+1279
   f 3     7ffff67cda44 amd::CopyMemoryP2PCommand::submit(device::VirtualDevice&)+50
   f 4     7ffff693d75a amd::Command::enqueue()+1000
   f 5     7ffff675dc0e
   f 6     7ffff677ca48 hipMemcpyDtoD+2361
   f 7           20b79b main+731
   f 8     7ffff5e29d90 __libc_start_call_main+128
   f 9     7ffff5e29e40 __libc_start_main+128
   f 10           20b385 _start+37

What? It doesn't return from the call, does it? Why? Throw-catch? Anyway, let's continue the session:


pwndbg> 
Continuing.

Thread 1 "hiptest" hit Breakpoint 5, 0x00007ffff6954a31 in amd::Memory::addDeviceMemory (this=0x3b79a0, dev=0x3b5c20) at /usr/src/clr/rocclr/platform/memory.cpp:337
337         device::Memory* dm = dev->createMemory(*this);

bt:
 ► f 0     7ffff6954a31
   f 1     7ffff6954e46
   f 2     7ffff69b7ac5 roc::VirtualGPU::submitCopyMemoryP2P(amd::CopyMemoryP2PCommand&)+1375
   f 3     7ffff67cda44 amd::CopyMemoryP2PCommand::submit(device::VirtualDevice&)+50
   f 4     7ffff693d75a amd::Command::enqueue()+1000
   f 5     7ffff675dc0e
   f 6     7ffff677ca48 hipMemcpyDtoD+2361
   f 7           20b79b main+731
   f 8     7ffff5e29d90 __libc_start_call_main+128
   f 9     7ffff5e29e40 __libc_start_main+128
   f 10           20b385 _start+37

and now some fat black magic took a place, or I mess something?

pwndbg> 
Continuing.

Thread 1 "hiptest" hit Breakpoint 6, 0x00007fffed084050 in ?? () from /opt/rocm/lib/libhsa-runtime64.so.1
LEGEND: STACK | HEAP | CODE | DATA | RWX | RODATA
──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────[ REGISTERS ]──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
 RAX  0x0
 RBX  0x97ad40 —▸ 0x7fffed174730 —▸ 0x7fffed084e60 ◂— endbr64 
 RCX  0x0
 RDX  0x1
 RDI  0x229140 ◂— 0x0
 RSI  0x0
 R8   0x7fffed17c460 (_amdgpu_r_debug) ◂— 9 /* '\t' */
 R9   0x0
 R10  0x7fffed05f1d0 ◂— endbr64 
 R11  0xd92a34f8db757182
 R12  0x0
 R13  0xffffffff
 R14  0x0
 R15  0x7ffff7ffd040 (_rtld_global) —▸ 0x7ffff7ffe2e0 ◂— 0x0
 RBP  0x229128 ◂— 0x0
 RSP  0x7fffffffd5d8 —▸ 0x7fffed0842c3 ◂— mfence 
 RIP  0x7fffed084050 ◂— endbr64 
───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────[ DISASM ]────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
 ► 0x7fffed084050    endbr64 
   0x7fffed084054    mov    eax, dword ptr [rip + 0x2388ae]
   0x7fffed08405a    xor    eax, 1
   0x7fffed08405d    mov    dword ptr [rip + 0x2388a5], eax
   0x7fffed084063    ret    
    ↓
   0x7fffed0842c3    mfence 
   0x7fffed0842c6    mov    rdx, qword ptr [rbx + 0x188]
   0x7fffed0842cd    mov    rax, qword ptr [rbx + 0x190]
   0x7fffed0842d4    cmp    rax, rdx
   0x7fffed0842d7    je     0x7fffed084327

   0x7fffed0842d9    mov    rdi, qword ptr [rip + 0x238620]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────[ STACK ]────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
00:0000│ rsp  0x7fffffffd5d8 —▸ 0x7fffed0842c3 ◂— mfence 
01:0008│      0x7fffffffd5e0 —▸ 0x7ffff69a806e ◂— endbr64 
02:0010│      0x7fffffffd5e8 —▸ 0x97ad40 —▸ 0x7fffed174730 —▸ 0x7fffed084e60 ◂— endbr64 
03:0018│      0x7fffffffd5f0 ◂— 0x0
04:0020│      0x7fffffffd5f8 —▸ 0x7fffed057f3c ◂— add    rsp, 0x18
05:0028│      0x7fffffffd600 —▸ 0x7fffffffd6a0 —▸ 0x7fffffffd710 —▸ 0x7fffffffd740 —▸ 0x7fffffffd7d0 ◂— ...
06:0030│      0x7fffffffd608 ◂— 0x8a50
07:0038│      0x7fffffffd610 ◂— 0xffffffff
──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────[ BACKTRACE ]──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
 ► f 0     7fffed084050
   f 1     7fffed0842c3
   f 2     7fffed057f3c
   f 3     7ffff69a82a8
   f 4     7ffff690c290 device::Program::loadLC()+168
   f 5     7ffff690c30e device::Program::load()+44
   f 6     7ffff6963255
   f 7     7ffff68f6ce1
   f 8     7ffff697fc81 roc::Device::createBlitProgram()+323
   f 9     7ffff6a1ea0c roc::KernelBlitManager::createProgram(roc::Device&)+84
   f 10     7ffff6a1e9a2 roc::KernelBlitManager::create(amd::Device&)+72
─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
pwndbg> xi
Extended information for virtual address 0x7fffed084050:

  Containing mapping:
    0x7fffed018000     0x7fffed104000 r-xp    ec000 18000  /opt/rocm-5.7.1/lib/libhsa-runtime64.so.1.11.50701

  Offset information:
         Mapped Area 0x7fffed084050 = 0x7fffed018000 + 0x6c050
         File (Base) 0x7fffed084050 = 0x7fffed000000 + 0x84050
      File (Segment) 0x7fffed084050 = 0x7fffed018000 + 0x6c050
         File (Disk) 0x7fffed084050 = /opt/rocm-5.7.1/lib/libhsa-runtime64.so.1.11.50701 + 0x84050

 Containing ELF sections:
               .text 0x7fffed084050 = 0x7fffed01a240 + 0x69e10
pwndbg> up
   f 0     7fffed084050
 ► f 1     7fffed0842c3
   f 2     7fffed057f3c
   f 3     7ffff69a82a8
   f 4     7ffff690c290 device::Program::loadLC()+168
   f 5     7ffff690c30e device::Program::load()+44
   f 6     7ffff6963255
   f 7     7ffff68f6ce1
   f 8     7ffff697fc81 roc::Device::createBlitProgram()+323
   f 9     7ffff6a1ea0c roc::KernelBlitManager::createProgram(roc::Device&)+84
   f 10     7ffff6a1e9a2 roc::KernelBlitManager::create(amd::Device&)+72
   f 11     7ffff69b3911 roc::VirtualGPU::create()+495

It looks like the value in cpu debug register has changed silently, and that cause it doesn't break on 6 after 5, but break in totally different place? Or what?


pwndbg> bt
#0  0x00007fffed084050 in ?? () from /opt/rocm/lib/libhsa-runtime64.so.1
#1  0x00007fffed0842c3 in ?? () from /opt/rocm/lib/libhsa-runtime64.so.1
#2  0x00007fffed057f3c in ?? () from /opt/rocm/lib/libhsa-runtime64.so.1
#3  0x00007ffff69a82a8 in roc::LightningProgram::setKernels (this=0xa5bc40, binary=0x496c70, binSize=35408, fdesc=-1, foffset=0, uri=...) at /usr/src/clr/rocclr/device/rocm/rocprogram.cpp:326
#4  0x00007ffff690c290 in device::Program::loadLC (this=0xa5bc40) at /usr/src/clr/rocclr/device/devprogram.cpp:1903
#5  0x00007ffff690c30e in device::Program::load (this=0xa5bc40) at /usr/src/clr/rocclr/device/devprogram.cpp:1914
#6  0x00007ffff6963255 in amd::Program::load (this=0x96a7a0, devices=...) at /usr/src/clr/rocclr/platform/program.cpp:616
#7  0x00007ffff68f6ce1 in amd::Device::BlitProgram::create (this=0x4743a0, device=0x3b5c20, extraKernels=..., extraOptions=...) at /usr/src/clr/rocclr/device/device.cpp:462
#8  0x00007ffff697fc81 in roc::Device::createBlitProgram (this=0x3b5c20) at /usr/src/clr/rocclr/device/rocm/rocdevice.cpp:858
#9  0x00007ffff6a1ea0c in roc::KernelBlitManager::createProgram (this=0x67e760, device=...) at /usr/src/clr/rocclr/device/rocm/rocblit.cpp:890
#10 0x00007ffff6a1e9a2 in roc::KernelBlitManager::create (this=0x67e760, device=...) at /usr/src/clr/rocclr/device/rocm/rocblit.cpp:881
#11 0x00007ffff69b3911 in roc::VirtualGPU::create (this=0x4bb5b0) at /usr/src/clr/rocclr/device/rocm/rocvirtual.cpp:1271
#12 0x00007ffff698371c in roc::Device::createVirtualDevice (this=0x3b5c20, queue=0x0) at /usr/src/clr/rocclr/device/rocm/rocdevice.cpp:1771
#13 0x00007ffff6988358 in roc::Device::xferQueue (this=0x3b5c20) at /usr/src/clr/rocclr/device/rocm/rocdevice.cpp:2617
#14 0x00007ffff698f830 in roc::Device::xferMgr (this=0x3b5c20) at /usr/src/clr/rocclr/device/rocm/rocdevice.hpp:461
#15 0x00007ffff69b7c12 in roc::VirtualGPU::submitCopyMemoryP2P (this=0x3bab10, cmd=...) at /usr/src/clr/rocclr/device/rocm/rocvirtual.cpp:2045
#16 0x00007ffff67cda44 in amd::CopyMemoryP2PCommand::submit (this=0x72e1b0, device=...) at /usr/src/clr/rocclr/platform/command.hpp:1661
#17 0x00007ffff693d75a in amd::Command::enqueue (this=0x72e1b0) at /usr/src/clr/rocclr/platform/command.cpp:393
#18 0x00007ffff675dc0e in ihipMemcpy (dst=0x7ffde6600000, src=0x7ffde6a00000, sizeBytes=16, kind=hipMemcpyDeviceToDevice, stream=..., isHostAsync=false, isGPUAsync=true) at /usr/src/clr/hipamd/src/hip_memory.cpp:502
#19 0x00007ffff677ca48 in hipMemcpyDtoD (dstDevice=0x7ffde6600000, srcDevice=0x7ffde6a00000, ByteCount=16) at /usr/src/clr/hipamd/src/hip_memory.cpp:1411
#20 0x000000000020b79b in main () at /opt/AI/koboldcpp-rocm/hip_test.cu:79
#21 0x00007ffff5e29d90 in __libc_start_call_main (main=main@entry=0x20b4c0 <main()>, argc=argc@entry=1, argv=argv@entry=0x7fffffffe428) at ../sysdeps/nptl/libc_start_call_main.h:58
#22 0x00007ffff5e29e40 in __libc_start_main_impl (main=0x20b4c0 <main()>, argc=1, argv=0x7fffffffe428, init=<optimized out>, fini=<optimized out>, rtld_fini=<optimized out>, stack_end=0x7fffffffe418) at ../csu/libc-start.c:392
#23 0x000000000020b385 in _start ()

So, I decide to try to reproduce this magic, and started from playing with that address which appearred instead our hb 6. Using some dprintf gdb functions with setup to behave from GDB's context, I got the next:

pwndbg> i b
Num     Type           Disp Enb Address            What
1       breakpoint     keep y   0x000000000020b4ca in main() at /opt/AI/koboldcpp-rocm/hip_test.cu:51
        breakpoint already hit 1 time
3       hw breakpoint  keep n   0x00007ffff6954a33 
4       hw breakpoint  keep n   0x00007ffff6954a31 
5       catchpoint     keep y                      exception throw
6       catchpoint     keep y                      exception rethrow
7       catchpoint     keep y                      exception catch
8       catchpoint     keep y                      signal "<standard signals>" 
9       breakpoint     keep n   0x00007fffed084050 
10      dprintf        keep y   0x00007ffff6954a31 
        printf "dev->createMemory(* %#llx )",$rdi
11      dprintf        keep y   0x00007ffff6954a33 
        printf " = %#llx \n",$rax
12      dprintf        keep y   0x00007fffed084050 
        printf "magic = %#llx \n",$rax

pwndbg> c
Continuing.
row 0 column 0 value 1.000000 
row 0 column 1 value 1.000000 
row 0 column 2 value 1.000000 
row 0 column 3 value 1.000000 
row 0 column 4 value 1.000000 
row 1 column 0 value 1.000000 
row 1 column 1 value 1.000000 
row 1 column 2 value 1.000000 
row 1 column 3 value 1.000000 
row 1 column 4 value 1.000000 
row 2 column 0 value 1.000000 
row 2 column 1 value 1.000000 
row 2 column 2 value 1.000000 
row 2 column 3 value 1.000000 
row 2 column 4 value 1.000000 
allocating memPith 20 X 3
[New LWP 289099]
[New LWP 289100]
[LWP 289100 exited]
dev->createMemory(* 0x3a61f0 ) = 0x3b7ec0 
dev->createMemory(* 0x3a61f0 ) = 0x3b9ed0 
dev->createMemory(* 0x3b5c20 ) = 0x3ba520 
pitch = 256
magic = 0 
magic = 0x3e54e8 
dev->createMemory(* 0x3a61f0 ) = 0x4764b0 
dev->createMemory(* 0x3a61f0 ) = 0x4764b0 
dev->createMemory(* 0x3b5c20 ) = 0x3eaed0 
magic = 0 
magic = 0x424ad8 

Thread 1 "hiptest" hit Catchpoint 8 (signal SIGSEGV), 0x00007ffff68fd680 in device::Memory::owner (this=0x0) at /usr/src/clr/rocclr/device/device.hpp:768

i b

5       catchpoint     keep y                      exception throw
6       catchpoint     keep y                      exception rethrow
7       catchpoint     keep y                      exception catch
8       catchpoint     keep y                      signal "<standard signals>" 
        catchpoint already hit 1 time

So, let's try to reproduce this trick with hb:


pwndbg> [Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib/x86_64-linux-gnu/libthread_db.so.1".

Breakpoint 1, main () at /opt/AI/koboldcpp-rocm/hip_test.cu:49

pwndbg> hb *0x00007ffff6954a31
Hardware assisted breakpoint 2 at 0x7ffff6954a31: file /usr/src/clr/rocclr/platform/memory.cpp, line 337.

pwndbg> c
Continuing.
row 0 column 0 value 1.000000 
row 0 column 1 value 1.000000 
row 0 column 2 value 1.000000 
row 0 column 3 value 1.000000 
row 0 column 4 value 1.000000 
row 1 column 0 value 1.000000 
row 1 column 1 value 1.000000 
row 1 column 2 value 1.000000 
row 1 column 3 value 1.000000 
row 1 column 4 value 1.000000 
row 2 column 0 value 1.000000 
row 2 column 1 value 1.000000 
row 2 column 2 value 1.000000 
row 2 column 3 value 1.000000 
row 2 column 4 value 1.000000 
allocating memPith 20 X 3
[New Thread 0x7fffecfff640 (LWP 289167)]
[New Thread 0x7fffe7fff640 (LWP 289168)]
[Thread 0x7fffe7fff640 (LWP 289168) exited]

Thread 1 "hiptest" hit Breakpoint 2, 0x00007ffff6954a31 in amd::Memory::addDeviceMemory (this=0x3b7c20, dev=0x3a61f0) at /usr/src/clr/rocclr/platform/memory.cpp:337
337         device::Memory* dm = dev->createMemory(*this);

pwndbg> ni
0x00007ffff6954a33      337         device::Memory* dm = dev->createMemory(*this);

pwndbg> hb *$pc
Hardware assisted breakpoint 3 at 0x7ffff6954a33: file /usr/src/clr/rocclr/platform/memory.cpp, line 337.
pwndbg> c
Continuing.

Thread 1 "hiptest" hit Breakpoint 2, 0x00007ffff6954a31 in amd::Memory::addDeviceMemory (this=0x317700, dev=0x3a61f0) at /usr/src/clr/rocclr/platform/memory.cpp:337
337         device::Memory* dm = dev->createMemory(*this);

pwndbg> 
Continuing.

Thread 1 "hiptest" hit Breakpoint 3, 0x00007ffff6954a33 in amd::Memory::addDeviceMemory (this=0x317700, dev=0x3a61f0) at /usr/src/clr/rocclr/platform/memory.cpp:337
337         device::Memory* dm = dev->createMemory(*this);

pwndbg> 
Continuing.

Thread 1 "hiptest" hit Breakpoint 2, 0x00007ffff6954a31 in amd::Memory::addDeviceMemory (this=0x3ba2d0, dev=0x3b5c20) at /usr/src/clr/rocclr/platform/memory.cpp:337
337         device::Memory* dm = dev->createMemory(*this);

pwndbg> 
Continuing.

Thread 1 "hiptest" hit Breakpoint 3, 0x00007ffff6954a33 in amd::Memory::addDeviceMemory (this=0x3ba2d0, dev=0x3b5c20) at /usr/src/clr/rocclr/platform/memory.cpp:337
337         device::Memory* dm = dev->createMemory(*this);

# notice quite long delay here each time...

pwndbg> 
Continuing.
pitch = 256

Thread 1 "hiptest" hit Breakpoint 2, 0x00007ffff6954a31 in amd::Memory::addDeviceMemory (this=0x66ea70, dev=0x3a61f0) at /usr/src/clr/rocclr/platform/memory.cpp:337
337         device::Memory* dm = dev->createMemory(*this);

pwndbg> 
Continuing.

Thread 1 "hiptest" hit Breakpoint 2, 0x00007ffff6954a31 in amd::Memory::addDeviceMemory (this=0x3b79a0, dev=0x3a61f0) at /usr/src/clr/rocclr/platform/memory.cpp:337
337         device::Memory* dm = dev->createMemory(*this);

pwndbg> 
Continuing.

Thread 1 "hiptest" hit Breakpoint 2, 0x00007ffff6954a31 in amd::Memory::addDeviceMemory (this=0x3b79a0, dev=0x3b5c20) at /usr/src/clr/rocclr/platform/memory.cpp:337
337         device::Memory* dm = dev->createMemory(*this);

pwndbg> 
Continuing.

Thread 1 "hiptest" hit Breakpoint 3, 0x00007fffed084050 in ?? () from /opt/rocm/lib/libhsa-runtime64.so.1

By the way:

pwndbg> show schedule-multiple 
Resuming the execution of threads of all processes is off.
pwndbg> show scheduler-locking 
Mode for locking scheduler during execution is "replay".

upd:

from this call
 ► 1790   Memory* dstDevMem = dev().getRocMemory(&dstMem);

    323 // ================================================================================================
   324 bool Memory::addDeviceMemory(const Device* dev) {
   325   bool result = false;
   326   AllocState create = AllocCreate;
   327   AllocState init = AllocInit;
   328 
   329   amd::ScopedLock lock(lockMemoryOps());
   330   if (deviceAlloced_[dev].compare_exchange_strong(init, create, std::memory_order_acq_rel)) {
   331     // Check if runtime already allocated all available slots for device memory
 ► 332     if (numDevices() == NumDevicesWithP2P()) {              /// here numDevices() == 1 == NumDevicesWithP2P()
   333       // Mark the allocation as an empty
   334       deviceAlloced_[dev].store(AllocInit, std::memory_order_release); 
   335       return result;        /// <- and than this would be hit
   336     }
   337     device::Memory* dm = dev->createMemory(*this);

# before atomic write
pwndbg> p deviceAlloced_[dev]
$7 = (std::unordered_map<amd::Device const*, std::atomic<amd::Memory::AllocState>, std::hash<amd::Device const*>, std::equal_to<amd::Device const*>, std::allocator<std::pair<amd::Device const* const, std::atomic<amd::Memory::AllocState> > > >::mapped_type &) @0x4248d0: {
  static _S_min_alignment = 4,
  static _S_alignment = 4,
  _M_i = amd::Memory::AllocCreate
}

# after it
pwndbg> p deviceAlloced_[dev]
$11 = (std::unordered_map<amd::Device const*, std::atomic<amd::Memory::AllocState>, std::hash<amd::Device const*>, std::equal_to<amd::Device const*>, std::allocator<std::pair<amd::Device const* const, std::atomic<amd::Memory::AllocState> > > >::mapped_type &) @0x4248d0: {
  static _S_min_alignment = 4,
  static _S_alignment = 4,
  _M_i = amd::Memory::AllocInit
}

# returned 

   399   if ((NULL == dm) && alloc) {
 ► 400     if (!addDeviceMemory(&dev)) {   /// check that this is false
   401       return NULL; /// <- return invalid pointer
   402     }
   403     dm = deviceMemories_[numDevices() - 1].value_;
   404   }

   1926 Memory* Device::getRocMemory(amd::Memory* mem) const {
   1927   return static_cast<roc::Memory*>(mem->getDeviceMemory(*this)); /// cast invalid pointr 
 ► 1928 }

 ► 1790   Memory* dstDevMem = dev().getRocMemory(&dstMem); /// and assign  nullptrto dstDevMem 

 ► 1795   dstDevMem->syncCacheFromHost(*this, syncFlags); /// and crash

@iassiour , Sir, can you clarify please, if I should use develop branch as you indicated in the link, or it was just a reference to a specific place in the code? Thanks.

iassiour commented 12 months ago

@kotee4ko Yes sorry that was just a reference to the dev->createMemory call in addDeviceMemory, please continue to use the 5.7.x branches of hip/clr as the develop branch is ahead and will be incompatible with other components in rocm 5.7.1

kotee4ko commented 12 months ago

@kotee4ko Yes sorry that was just a reference to the dev->createMemory call in addDeviceMemory, please continue to use the 5.7.x branches of hip/clr as the develop branch is ahead and will be incompatible with other components in rocm 5.7.1

Okey, thanks.

What should I do as a next step?

iassiour commented 11 months ago

Hi @kotee4ko so it looks like we can confirm that dev->createMemory returns a nullptr at some point. I do not have access to a multi-gpu vega 10 yet, I am currently running the test on a 2-node vega 20 but can't reproduce the issue there. I notice that there is an extra hipMemcpyDtoD in your logs, is this the original test shown at the top with logs enabled?

kotee4ko commented 11 months ago

Hi @kotee4ko so it looks like we can confirm that dev->createMemory returns a nullptr at some point. I do not have access to a multi-gpu vega 10 yet, I am currently running the test on a 2-node vega 20 but can't reproduce the issue there. I notice that there is an extra hipMemcpyDtoD in your logs, is this the original test shown at the top with logs enabled?

Thanks. I can share access to the server with vega 10, if needed. As for hipMemcpyDtoD() call, yes, this was explained here:

https://github.com/ROCm-Developer-Tools/HIP/issues/3352#issuecomment-1782827126

The reason of adding that call was to check, if hardware side and kernel software side both able to perform data transfer to each other.

@iassiour I don't sure, but... what's the sense of 'virtual GPU' in code? I mean, vega10, if I understand right, is single device, but it got 2 GPU cores?

iassiour commented 11 months ago

@kotee4ko I think its worth looking into NumDevicesWithP2P() and why its reports 1 in you case, as you have two physical devices I think this should have returned 2 unless p2p is disabled. I think this would eventually go down to this code here https://github.com/ROCm-Developer-Tools/clr/blob/rocm-5.7.x/rocclr/device/rocm/rocdevice.cpp#L1223

kotee4ko commented 11 months ago

Yes, Sir, seems hsa doesn't allow p2p dma.

There are two agents in vector, if I understand right. Since first agent handle == bkendDevice handle -- it pass ifcase for second agent (bkendDevice.handle = 3575104 != agent.handle == 3697328) So we calling hsa_amd_agent_memory_pool_get_info(), with gpuvmsegment provided below.

pwndbg> p gpuvm_segment_
$45 = {
  handle = 3576576
}

Here is details of used variables:

pwndbg> p gpu_agents_
$31 = {
  <std::_Vector_base<hsa_agent_s, std::allocator<hsa_agent_s> >> = {
    _M_impl = {
      <std::allocator<hsa_agent_s>> = {
        <__gnu_cxx::new_allocator<hsa_agent_s>> = {<No data fields>}, <No data fields>},
      <std::_Vector_base<hsa_agent_s, std::allocator<hsa_agent_s> >::_Vector_impl_data> = {
        _M_start = 0x3a5f00,
        _M_finish = 0x3a5f10,
        _M_end_of_storage = 0x3a5f10
      }, <No data fields>}
  }, <No data fields>}

pwndbg> p gpu_agents_[0]
$35 = (__gnu_cxx::__alloc_traits<std::allocator<hsa_agent_s>, hsa_agent_s>::value_type &) @0x3a5f00: {
  handle = 3575104
}
pwndbg> p gpu_agents_[1]
$36 = (__gnu_cxx::__alloc_traits<std::allocator<hsa_agent_s>, hsa_agent_s>::value_type &) @0x3a5f08: {
  handle = 3697328
}

hsa_amd_agent_memory_pool_get_info() returns HSA_STATUS_SUCCESS, as expected, and write HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED into access var... Then breaks the loop, since vector's end reached.

Then it creates new list for 1 element, because p2pagents is empty.

   1243   // Keep track of all P2P Agents in a Array including current device handle for IPC
 ► 1244   p2p_agents_list_ = new hsa_agent_t[1 + p2p_agents_.size()];

Assign backend's handle (3575104) to first array's element.

   1245   p2p_agents_list_[0] = getBackendDevice();

next, it skip the loop because (0 < 0) is false.

 ► 1246   for (size_t agent_idx = 0; agent_idx < p2p_agents_.size(); ++agent_idx) {
   1247     p2p_agents_list_[1 + agent_idx] = p2p_agents_[agent_idx];
   1248   }

next, call to hsa_amd_memory_pool_get_info(), tell that group_segment_size = 0x10000 and so on, should I provide more info?

In HSA description told that to make p2p work we need two things: large bar size, to be able to handle linear memory address, and atomic support, if I understand right.

For now my host seems fails first constrain, but why DtoD works in that case? As for second constrain - I was thinking, that all aligned memory operations are atomic, on modern i386, doesn't they?

iassiour commented 11 months ago

Hi @kotee4ko the reason why DtoD works is because it follows a different path that ends-up performing a 2-step transfer with staging buffer. It gets here https://github.com/ROCm-Developer-Tools/clr/blob/rocm-5.7.x/rocclr/device/rocm/rocvirtual.cpp#L2016 and I think in your configuration p2pAllowed would be false (for the same reasons as above) but in this case it falls back to the 2-step transfer. I will check why memCpy2D does not have a similar fallback. If that can't be supported, it should at least handle the case with no sigsegv and with an error message that p2p is disabled.

kotee4ko commented 10 months ago

@iassiour

Dirty workaround, but works like a charm.


 diff --git a/hipamd/src/hip_memory.cpp b/hipamd/src/hip_memory.cpp
index a67365a..551d7ee 100644
--- a/hipamd/src/hip_memory.cpp
+++ b/hipamd/src/hip_memory.cpp
@@ -2363,6 +2363,37 @@ hipError_t hipMemcpy2DValidateArray(hipArray_const_t arr, size_t wOffset, size_t
   return hipSuccess;
 }

+static
+hipError_t hipMemcpy2DAsyncHacked(void *dst, size_t dpitch, const void *src,
+                            size_t spitch, size_t width, size_t height,
+                            hipMemcpyKind kind, hipStream_t stream = nullptr,
+                           bool isAsync = false) {
+  if (spitch == 0)
+    spitch = width;
+  if (dpitch == 0) 
+    dpitch = width;
+
+  if (spitch == 0 || dpitch == 0)
+    return hipErrorInvalidValue;
+
+  for (size_t i = 0; i < height; ++i) {
+    if (kind == hipMemcpyHostToHost) {
+      memcpy(dst, src, width);
+    } else {
+      if (isAsync) {
+        if (hipMemcpyDtoDAsync(dst, (void* )src, width, stream) != hipSuccess)
+          return (hipErrorLaunchFailure);
+      } else {
+        if (hipMemcpyDtoD(dst,(void *)src, width) != hipSuccess)
+          return (hipErrorLaunchFailure);
+      }
+    }
+    src = (char *)src + spitch;
+    dst = (char *)dst + dpitch;
+  }
+  return hipSuccess;
+}
+
 hipError_t hipMemcpy2D_common(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width,
                        size_t height, hipMemcpyKind kind, hipStream_t stream = nullptr,
                        bool isAsync = false) {
@@ -2377,7 +2408,8 @@ hipError_t hipMemcpy2D_common(void* dst, size_t dpitch, const void* src, size_t
   if ((validateDst = hipMemcpy2DValidateBuffer(dst,dpitch, width)) != hipSuccess) {
     return validateDst;
   }
-  return ihipMemcpy2D(dst, dpitch, src, spitch, width, height, kind, stream, isAsync);
+  //return ihipMemcpy2D(dst, dpitch, src, spitch, width, height, kind, stream, isAsync);
+  return hipMemcpy2DAsyncHacked(dst, dpitch, src, spitch, width, height, kind, stream, isAsync);
 }

 hipError_t hipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width,
iassiour commented 10 months ago

Thank you for sharing @kotee4ko, I am glad it works. Let me take this internally and see how it can be consolidated into a permanent fix. There may be some performance concerns as there are multiple staging copies submitted but in any case, I believe it is a nice workaround to have in the case p2p is disabled.

gandryey commented 9 months ago

Hi @kotee4ko, The current workaround for this issue is going to be very slow and may cause other users undesirably enable slow execution path. Could you enable resizeable BAR in system BIOS and see if P2P access will be available after?

ppanchad-amd commented 2 months ago

@kotee4ko Do you still need assistance with this ticket? If not, please close the ticket. Thanks!

ppanchad-amd commented 1 month ago

@kotee4ko Closing ticket for now. Please feel free to re-open ticket if you still require assistance. Thanks!