Closed cocoa-xu closed 2 weeks ago
Evision.Mat.to_pointer/2
is half-way done!
I can get both local pointer and CUDA IPC pointer in evision, and EXLA can construct a tensor using the shared local pointer from evision.
iex> img = Evision.imread("test/testdata/dog.jpg")
%Evision.Mat{
channels: 3,
dims: 2,
type: {:u, 8},
raw_type: 16,
shape: {576, 768, 3},
ref: #Reference<0.3493820873.2916483080.54551>
}
iex> cuda_img = Evision.CUDA.GpuMat.gpuMat(img)
%Evision.CUDA.GpuMat{
channels: 3,
type: {:u, 8},
raw_type: 16,
shape: {576, 768, 3},
elemSize: 3,
ref: #Reference<0.3493820873.2916483076.54290>
}
iex> {:ok, cuda_ptr} = Evision.Mat.to_pointer(cuda_img, :local)
{:ok, [0, 0, 192, 5, 0, 1, 0, 0]}
iex> {:ok, cuda_ipc_ptr} = Evision.Mat.to_pointer(cuda_img, :cuda_ipc)
{:ok,
[192, 173, 1, 244, 84, 127, 0, 0, 156, 81, 1, 0, 0, 0, 0, 0, 0, 128, 22, 0, 0,
0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 255, 0, 0, 60, 0, 0, 0, 0, 0,
0, 0, ...]}
iex> {:ok, shared_img} = Nx.from_pointer(
...> {EXLA.Backend, client_name: :cuda},
...> cuda_ptr,
...> cuda_img.type,
...> cuda_img.shape
...> )
WARNING: All log messages before absl::InitializeLog() is called are written to STDERR
I0000 00:00:1717847741.838420 86622 cuda_executor.cc:1040] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1717847741.838709 86448 service.cc:146] XLA service 0x7f550047dc90 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1717847741.838736 86448 service.cc:154] StreamExecutor device (0): NVIDIA A16-2Q, Compute Capability 8.6
I0000 00:00:1717847741.839045 86448 se_gpu_pjrt_client.cc:889] Using BFC allocator.
I0000 00:00:1717847741.839080 86448 gpu_helpers.cc:114] XLA backend allocating 982482944 bytes on device 0 for BFCAllocator.
I0000 00:00:1717847741.839115 86448 gpu_helpers.cc:154] XLA backend will use up to 982482944 bytes on device 0 for CollectiveBFCAllocator.
I0000 00:00:1717847741.839222 86448 cuda_executor.cc:1040] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
{:ok,
#Nx.Tensor<
u8[576][768][3]
EXLA.Backend<cuda:0, 0.3493820873.2916483073.59015>
[
[
[50, 58, 57],
[51, 59, 58],
[53, 61, 60],
[54, 62, 61],
[55, 63, 62],
[54, 62, 61],
[51, 59, 58],
[50, 58, 57],
[49, 57, 56],
[50, 58, 57],
[52, 60, 59],
[52, 60, 59],
[51, 59, 58],
[50, 58, 57],
[50, 58, 57],
[50, 58, 57],
...
],
...
]
>}
Passing CUDA IPC pointers actually works; note that attempting to open a CUDA IPC pointer in the same OS process that created it will return `invalid device context`,
i.e., a CUDA IPC pointer has to be used between different OS processes.
And the code below shows a demo that uses CUDA IPC pointer between evision and exla:
In terminal 1 we start a distributed erlang node with short name exla
and cookie cuda
$ iex --sname exla --cookie cuda -S mix
iex> defmodule EXLA.Process do
...> def receive_message do
...> receive do
...> {:cuda_ipc, pointer, type, shape} ->
...>       IO.puts("Received CUDA IPC pointer: #{inspect(%{pointer: pointer, type: type, shape: shape})}")
...> shared_cuda_tensor = Nx.from_pointer(
...> {EXLA.Backend, client_name: :cuda},
...> pointer,
...> type,
...> shape,
...> [mode: :cuda_ipc]
...> )
...> dbg(shared_cuda_tensor)
...> end
...> end
...> end
iex> pid = spawn(EXLA.Process, :receive_message, [])
iex> :global.register_name(:messenger, pid)
In terminal 2 we start a distributed erlang node with short name evision
and cookie cuda
$ iex --sname evision --cookie cuda -S mix
iex> true = Node.connect(:exla@vultr)
true
iex> img = Evision.imread("test/testdata/dog.jpg")
iex> cuda_img = Evision.CUDA.GpuMat.gpuMat(img)
iex> {:ok, cuda_ipc_ptr} = Evision.CUDA.GpuMat.to_pointer(cuda_img, mode: :cuda_ipc)
iex> :global.send(:messenger, {:cuda_ipc, cuda_ipc_ptr, cuda_img.type, cuda_img.shape})
After calling :global.send/2
on the second node, similar output is expected to be seen on node 1:
[iex:13: EXLA.Process.receive_message/0]
shared_cuda_tensor #=> {:ok,
#Nx.Tensor<
u8[576][768][3]
EXLA.Backend<cuda:0, 0.1497940450.3507355650.151627>
[
[
[50, 58, 57],
[51, 59, 58],
[53, 61, 60],
[54, 62, 61],
[55, 63, 62],
[54, 62, 61],
[51, 59, 58],
[50, 58, 57],
[49, 57, 56],
[50, 58, 57],
[52, 60, 59],
[52, 60, 59],
[51, 59, 58],
[50, 58, 57],
[50, 58, 57],
[50, 58, 57],
...
],
...
]
>}
Now in v0.2.3 evision can pass a shared CUDA buffer to, or take one from, another library like EXLA:
iex> cuda_img = Evision.CUDA.GpuMat.gpuMat(Evision.imread("test/testdata/dog.jpg"))
%Evision.CUDA.GpuMat{
channels: 3,
type: {:u, 8},
raw_type: 16,
shape: {576, 768, 3},
elemSize: 3,
ref: #Reference<0.1595069942.205914176.92309>
}
iex> {:ok, cuda_ptr} = Evision.CUDA.GpuMat.to_pointer(cuda_img, mode: :local)
{:ok, [0, 0, 224, 249, 229, 116, 0, 0]}
iex> Nx.from_pointer({EXLA.Backend, client_name: :cuda}, cuda_ptr, cuda_img.type, cuda_img.shape)
{:ok,
#Nx.Tensor<
u8[576][768][3]
EXLA.Backend<cuda:0, 0.1595069942.205914114.106508>
[
[
[50, 58, 57],
[51, 59, 58],
[53, 61, 60],
[54, 62, 61],
[55, 63, 62],
[54, 62, 61],
[51, 59, 58],
[50, 58, 57],
[49, 57, 56],
[50, 58, 57],
[52, 60, 59],
[52, 60, 59],
[51, 59, 58],
[50, 58, 57],
[50, 58, 57],
[50, 58, 57],
...
],
...
]
>}
iex> exla_tensor = Nx.tensor([[1,2,3]], type: :f32, backend: {EXLA.Backend, client: :cuda})
#Nx.Tensor<
f32[1][3]
EXLA.Backend<cuda:0, 0.1595069942.205914178.96324>
[
[1.0, 2.0, 3.0]
]
>
iex> {:ok, local_ptr} = Nx.to_pointer(exla_tensor, mode: :local)
{:ok, [0, 0, 0, 120, 226, 116, 0, 0]}
iex> exla_shared = Evision.CUDA.GpuMat.from_pointer(local_ptr, exla_tensor.type, exla_tensor.shape)
%Evision.CUDA.GpuMat{
channels: 1,
type: {:f, 32},
raw_type: 5,
shape: {1, 3, 1},
elemSize: 4,
ref: #Reference<0.1595069942.205914176.92310>
}
iex> Evision.Mat.to_nx(Evision.CUDA.GpuMat.download(exla_shared))
#Nx.Tensor<
f32[1][3]
Evision.Backend
[
[1.0, 2.0, 3.0]
]
>
Note that OpenCV's cv::cuda::GpuMat
is limited to n-channel 2D images, i.e., {height, width, channels}
— it can only have 3 dims at most.
/cc @davydog187 @polvalente
If this is possible with OpenCV's implementation, then according to @polvalente,
This feature is not urgent but it seems to be a very good performance enhancement for anyone who needs to process images in CUDA among different libraries.
Related link, https://github.com/elixir-nx/nx/pull/1473.