ROCm / HIP

HIP: C++ Heterogeneous-Compute Interface for Portability
https://rocmdocs.amd.com/projects/HIP/
MIT License
3.76k stars 539 forks source link

[Issue]: Shared Memory Is Not Released after `hipIpcCloseMemHandle` #3580

Closed SiyangShao closed 3 months ago

SiyangShao commented 3 months ago

Problem Description

There are two processes sharing GPU memory using hipIpcMemHandle_t.

The client process allocates GPU memory and gets a memory handle through hipIpcGetMemHandle, then sends the handle to the server process and stops.

The server process opens the handle via hipIpcOpenMemHandle (note: the code uses hipIpcOpenMemHandle, not hipIpcOpenEventHandle), performs some operations, and closes it via hipIpcCloseMemHandle.

After that, no process holds the memory, but it is not automatically freed.

Expected behaviour: if all the HIP APIs here are changed to their CUDA equivalents and run on an NVIDIA GPU, the memory is released after cudaIpcCloseMemHandle.

Operating System

Ubuntu 20.04.6 LTS (Focal Fossa)

CPU

AMD EPYC 7V13 64-Core Processor

GPU

AMD Instinct MI210

ROCm Version

ROCm 6.1.0

ROCm Component

No response

Steps to Reproduce

Client code:

#include <arpa/inet.h>
#include <cstring>
#include <hip/hip_runtime.h>
#include <iostream>
#include <sys/socket.h>
#include <unistd.h>

// Sends the raw bytes of a HIP IPC memory handle to the server process
// listening on 127.0.0.1:8080 over a blocking TCP connection.
// Returns 0 on success, -1 on any socket error (an error message is
// printed to stderr).
int sentToAnotherProcess(const hipIpcMemHandle_t &handle) {
  const char *message = reinterpret_cast<const char *>(&handle);

  int sock = socket(AF_INET, SOCK_STREAM, 0);
  if (sock < 0) {
    std::cerr << "Socket creation error" << std::endl;
    return -1;
  }

  // fix: zero-initialize the address struct so padding/unused fields
  // are deterministic instead of stack garbage.
  struct sockaddr_in serv_addr;
  std::memset(&serv_addr, 0, sizeof(serv_addr));
  serv_addr.sin_family = AF_INET;
  serv_addr.sin_port = htons(8080);

  if (inet_pton(AF_INET, "127.0.0.1", &serv_addr.sin_addr) <= 0) {
    std::cerr << "Invalid address/ Address not supported" << std::endl;
    close(sock);  // fix: don't leak the fd on this error path
    return -1;
  }

  if (connect(sock, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) < 0) {
    std::cerr << "Connection Failed" << std::endl;
    close(sock);  // fix: don't leak the fd on this error path
    return -1;
  }

  // fix: use sizeof(handle) (HIP_IPC_HANDLE_SIZE, 64 bytes) instead of a
  // magic 64, and verify the full handle was actually transmitted.
  if (send(sock, message, sizeof(handle), 0) !=
      static_cast<ssize_t>(sizeof(handle))) {
    std::cerr << "Failed to send the full handle" << std::endl;
    close(sock);
    return -1;
  }

  close(sock);
  return 0;
}

// Client: allocates 30 GiB of GPU memory, exports an IPC handle for it,
// sends the handle to the server process, then exits WITHOUT freeing the
// allocation — intentional for this repro, so the IPC cleanup behavior
// after the owning process dies can be observed.
int main() {
  void *device_ptr;
  size_t size = 30ULL * 1024 * 1024 * 1024;  // 30 GiB
  // allocate GPU memory
  hipError_t err = hipMalloc(&device_ptr, size);
  if (err != hipSuccess) {
    std::cerr << "Failed to allocate memory, reason: "
              << hipGetErrorString(err) << std::endl;
    return -1;
  }
  // set IPC memory handler
  hipIpcMemHandle_t handle;
  err = hipIpcGetMemHandle(&handle, device_ptr);
  if (err != hipSuccess) {
    std::cerr << "Failed to get IPC memory handle, reason: "
              << hipGetErrorString(err);
    return -1;
  }
  // fix: dump the handle bytes as integers — streaming raw chars emits
  // mostly non-printable characters and is useless for comparing the
  // client's and server's view of the handle.
  for (int i = 0; i < 64; ++i) {
    std::cout << static_cast<int>(handle.reserved[i]) << " ";
  }
  std::cout << std::endl;

  // enter any number to sent the message
  std::cout << "Enter any number to sent the message" << std::endl;
  int tmp;
  std::cin >> tmp;
  int status = sentToAnotherProcess(handle);
  if (status != 0) {
    std::cerr << "Failed to sent message" << std::endl;
    return -1;
  }
  // Keep the process (and the allocation) alive until the tester is done
  // inspecting GPU memory; no hipFree on purpose.
  std::cout << "Enter any number to close the process" << std::endl;
  std::cin >> tmp;

  return 0;
}

Server code:

#include <cstring>
#include <hip/hip_runtime.h>
#include <iostream>
#include <netinet/in.h>
#include <sys/socket.h>
#include <unistd.h>

// Server: receives a hipIpcMemHandle_t over TCP (port 8080), opens the
// shared GPU allocation via hipIpcOpenMemHandle, closes it via
// hipIpcCloseMemHandle, then waits so GPU memory usage can be inspected
// externally before exit.
int main() {
  // create a socket to get the IPC memory handle
  int server_fd, new_socket;
  struct sockaddr_in address;
  int addrlen = sizeof(address);
  const int PORT = 8080;
  char buffer[64] = {0};
  size_t byte_size = sizeof(buffer);

  if ((server_fd = socket(AF_INET, SOCK_STREAM, 0)) == 0) {
    std::cerr << "Socket creation failed" << std::endl;
    return -1;
  }

  address.sin_family = AF_INET;
  address.sin_addr.s_addr = INADDR_ANY;
  address.sin_port = htons(PORT);

  if (bind(server_fd, (struct sockaddr *)&address, sizeof(address)) < 0) {
    std::cerr << "Bind failed" << std::endl;
    close(server_fd);  // fix: don't leak the listening fd on error paths
    return -1;
  }

  if (listen(server_fd, 3) < 0) {
    std::cerr << "Listen failed" << std::endl;
    close(server_fd);
    return -1;
  }

  std::cout << "Waiting for client to connect..." << std::endl;
  if ((new_socket = accept(server_fd, (struct sockaddr *)&address,
                           (socklen_t *)&addrlen)) < 0) {
    std::cerr << "Accept failed" << std::endl;
    close(server_fd);
    return -1;
  }

  // fix: a single read() may return fewer than 64 bytes (TCP is a byte
  // stream), which would silently truncate the handle; loop until the
  // whole handle has arrived.
  size_t total = 0;
  while (total < byte_size) {
    ssize_t n = read(new_socket, buffer + total, byte_size - total);
    if (n <= 0) {
      std::cerr << "Failed to read the full handle" << std::endl;
      close(new_socket);
      close(server_fd);
      return -1;
    }
    total += static_cast<size_t>(n);
  }
  std::cout << "Received data" << std::endl;

  if (hipSetDevice(0) != hipSuccess) {  // fix: result was ignored
    std::cerr << "hipSetDevice failed" << std::endl;
    return -1;
  }
  // fix: copy the bytes instead of reinterpret_cast-ing the char buffer —
  // the stack buffer may be under-aligned for hipIpcMemHandle_t and the
  // cast violates strict aliasing.
  hipIpcMemHandle_t handle;
  std::memcpy(&handle, buffer, sizeof(handle));
  void *device_ptr;
  // fix: dump the handle bytes as integers (raw chars are mostly
  // non-printable) so the dump can be compared with the client's.
  for (int i = 0; i < 64; ++i) {
    std::cout << static_cast<int>(handle.reserved[i]) << " ";
  }
  std::cout << std::endl;
  // map the shared allocation into this process
  hipError_t status = hipIpcOpenMemHandle(&device_ptr, handle,
                                          hipIpcMemLazyEnablePeerAccess);
  if (status != hipSuccess) {
    std::cerr << "Failed to open IPC memory handle with error: "
              << hipGetErrorString(status) << std::endl;
    return -1;
  }
  // enter any number to close the ipc mem handle
  std::cout << "Enter any number to close the IPC memory handle" << std::endl;
  int x;
  std::cin >> x;
  status = hipIpcCloseMemHandle(device_ptr);
  if (status != hipSuccess) {
    std::cerr << "Failed to close IPC memory handle with error: "
              << hipGetErrorString(status) << std::endl;
    return -1;
  }

  // Keep the process alive so GPU memory usage can be checked after the
  // handle was closed (the subject of this issue).
  std::cout << "Enter any number to close the process" << std::endl;
  // enter any number to close the process
  std::cin >> x;
  close(new_socket);
  close(server_fd);

  return 0;
}

(Optional for Linux users) Output of /opt/rocm/bin/rocminfo --support

*******
Agent 2
*******
  Name:                    gfx90a
  Uuid:                    GPU-86e2ac051869a69d
  Marketing Name:          AMD Instinct MI210
  Vendor Name:             AMD
  Feature:                 KERNEL_DISPATCH
  Profile:                 BASE_PROFILE
  Float Round Mode:        NEAR
  Max Queue Number:        128(0x80)
  Queue Min Size:          64(0x40)
  Queue Max Size:          131072(0x20000)
  Queue Type:              MULTI
  Node:                    1
  Device Type:             GPU
  Cache Info:
    L1:                      16(0x10) KB
    L2:                      8192(0x2000) KB
  Chip ID:                 29711(0x740f)
  ASIC Revision:           1(0x1)
  Cacheline Size:          64(0x40)
  Max Clock Freq. (MHz):   1700
  BDFID:                   1792
  Internal Node ID:        1
  Compute Unit:            104
  SIMDs per CU:            4
  Shader Engines:          8
  Shader Arrs. per Eng.:   1
  WatchPts on Addr. Ranges:4
  Coherent Host Access:    FALSE
  Features:                KERNEL_DISPATCH
  Fast F16 Operation:      TRUE
  Wavefront Size:          64(0x40)
  Workgroup Max Size:      1024(0x400)
  Workgroup Max Size per Dimension:
    x                        1024(0x400)
    y                        1024(0x400)
    z                        1024(0x400)
  Max Waves Per CU:        32(0x20)
  Max Work-item Per CU:    2048(0x800)
  Grid Max Size:           4294967295(0xffffffff)
  Grid Max Size per Dimension:
    x                        4294967295(0xffffffff)
    y                        4294967295(0xffffffff)
    z                        4294967295(0xffffffff)
  Max fbarriers/Workgrp:   32
  Packet Processor uCode:: 83
  SDMA engine uCode::      8
  IOMMU Support::          None
  Pool Info:
    Pool 1
      Segment:                 GLOBAL; FLAGS: COARSE GRAINED
      Size:                    67092480(0x3ffc000) KB
      Allocatable:             TRUE
      Alloc Granule:           4KB
      Alloc Recommended Granule:2048KB
      Alloc Alignment:         4KB
      Accessible by all:       FALSE
    Pool 2
      Segment:                 GLOBAL; FLAGS: EXTENDED FINE GRAINED
      Size:                    67092480(0x3ffc000) KB
      Allocatable:             TRUE
      Alloc Granule:           4KB
      Alloc Recommended Granule:2048KB
      Alloc Alignment:         4KB
      Accessible by all:       FALSE
    Pool 3
      Segment:                 GLOBAL; FLAGS: FINE GRAINED
      Size:                    67092480(0x3ffc000) KB
      Allocatable:             TRUE
      Alloc Granule:           4KB
      Alloc Recommended Granule:2048KB
      Alloc Alignment:         4KB
      Accessible by all:       FALSE
    Pool 4
      Segment:                 GROUP
      Size:                    64(0x40) KB
      Allocatable:             FALSE
      Alloc Granule:           0KB
      Alloc Recommended Granule:0KB
      Alloc Alignment:         0KB
      Accessible by all:       FALSE
  ISA Info:
    ISA 1
      Name:                    amdgcn-amd-amdhsa--gfx90a:sramecc+:xnack-
      Machine Models:          HSA_MACHINE_MODEL_LARGE
      Profiles:                HSA_PROFILE_BASE
      Default Rounding Mode:   NEAR
      Default Rounding Mode:   NEAR
      Fast f16:                TRUE
      Workgroup Max Size:      1024(0x400)
      Workgroup Max Size per Dimension:
        x                        1024(0x400)
        y                        1024(0x400)
        z                        1024(0x400)
      Grid Max Size:           4294967295(0xffffffff)
      Grid Max Size per Dimension:
        x                        4294967295(0xffffffff)
        y                        4294967295(0xffffffff)
        z                        4294967295(0xffffffff)
      FBarrier Max Size:       32
*** Done ***

Additional Information

No response

mangupta commented 3 months ago

@SiyangShao : Thanks for reporting the issue. However, the issue has already been fixed in ROCm 6.2.0 via https://github.com/ROCm/clr/commit/c74bdf212974f00b8030a9e2a5b2a83e55a10806.