jipolanco / PencilFFTs.jl

Fast Fourier transforms of MPI-distributed Julia arrays
https://jipolanco.github.io/PencilFFTs.jl/dev/
MIT License
77 stars 7 forks source link

UCX ERROR #54

Closed Lightup1 closed 1 year ago

Lightup1 commented 2 years ago

On a single node with 4 Tesla V100 GPUs, the system MPI (openmpi/3.1.4-gcc45-cuda11.0, built without gdrcopy) is used. bench.sh:

#!/bin/bash
# Slurm batch script that launches the Julia GPU FFT benchmark (gpubench.jl) under MPI.
#
# Resource request: 1 node, 4 tasks in total (4 per node), 7 CPUs per task,
# on the gpu_v100 partition. Presumably one task per GPU -- confirm against the node layout.
#SBATCH -N 1
#SBATCH -n 4
#SBATCH --ntasks-per-node=4
#SBATCH -J gpuN1p4            # N nodes p process t threads
#SBATCH --cpus-per-task=7       # th2 qiming gpu 28 cpus per node (adjust as you need)
#SBATCH --time=00:3:00         # days-hours:minutes:seconds
#SBATCH -p gpu_v100
# stdout/stderr go to separate files; %x is the job name and %j the job id.
#SBATCH --output=slurm-%x-%j.out
#SBATCH --error=slurm-%x-%j.err
# Load the site-provided Open MPI 3.1.4 (CUDA 11.0 build) environment.
source /GPUFS/app/MPI/openmpi/3.1.4-gcc45-cuda11.0/env.sh
# Turn off the UCX memory-type cache.
# NOTE(review): this is the setting MPI.jl recommends for CUDA-aware MPI -- confirm it
# is still needed with the UCX version behind this Open MPI build.
export UCX_MEMTYPE_CACHE=n
# Put the libraries bundled with the Julia 1.7.3 install first on the loader path.
export LD_LIBRARY_PATH=$HOME/.julia-1.7.3/lib/julia:$LD_LIBRARY_PATH
# Presumably tells CUDA.jl to use the locally installed CUDA toolkit instead of
# downloading its own artifacts -- verify against the CUDA.jl version in use.
export JULIA_CUDA_USE_BINARYBUILDER=false
# Install and precompile the project's dependencies once, before the parallel launch.
julia --project -e 'using Pkg; Pkg.instantiate()'
julia --project -e 'using Pkg; Pkg.precompile()'
# Write the hostname reported by each task into ./hostlist, used below as the machinefile.
srun hostname>hostlist 
# Start one Julia process per Slurm task, each with 7 threads, running gpubench.jl.
# NOTE(review): "--mca btl ^openib" excludes the openib BTL, so the
# btl_openib_if_include setting on the same line probably has no effect -- confirm.
mpiexecjl --project --mca btl_openib_if_include mlx5_2 --mca btl_tcp_if_include ib0 --mca btl ^openib  --machinefile hostlist -np $SLURM_NTASKS julia -t7 gpubench.jl #

gpubench.jl:

using MPI
using PencilFFTs
using PencilArrays
using BenchmarkTools
using Random
using CUDA

# Benchmark of an in-place 3D complex-to-complex FFT on MPI-distributed GPU arrays.
# Meant to be launched with one MPI rank per GPU.
MPI.Init(threadlevel=:funneled)
comm = MPI.COMM_WORLD
dims = (5120, 32, 32)  # global grid size

rank = MPI.Comm_rank(comm)

# Bind this rank to a GPU. CUDA device ordinals are per node and start at 0, whereas
# `rank` is global, so `device!(rank)` fails as soon as the rank reaches the number of
# visible devices (e.g. on a second node). Wrapping around the device count keeps the
# single-node mapping (rank i -> GPU i) unchanged.
# NOTE(review): this assumes ranks are placed node by node; with another placement two
# ranks of the same node may end up sharing a GPU.
device!(rank % length(CUDA.devices()))
sleep(1 * rank)  # stagger the ranks so that their output lines do not interleave
print("rank:", rank, "GPU:", device(), "\n")

# Pencil decomposition of the global grid, backed by GPU arrays.
pen = Pencil(CuArray, dims, comm)
transform = Transforms.FFT!()

plan = PencilFFTPlan(pen, transform)
u = allocate_input(plan)
if rank == 0
    print("data size:", dims, "\n")
    print("Start data allocationg\n")
end
# Fill the first array of the allocated input with normally distributed values.
randn!(first(u))

# One evaluation per sample; the barrier in `teardown` re-aligns the ranks before the
# next sample. `CUDA.@sync` makes each sample wait for the GPU work it launched, so
# asynchronous kernels are included in the measured time.
b = @benchmark (CUDA.@sync $plan * $u) evals=1 samples=100 seconds=30 teardown=(MPI.Barrier(comm))

# Only rank 0 reports, to avoid one copy of the summary per process.
if rank == 0
    io = IOBuffer()
    show(io, "text/plain", b)
    s = String(take!(io))
    println(s)
end

err:

[gpu55:148782] pml_ucx.c:726 Error: ucx send failed: Input/output error
[gpu55:148780] pml_ucx.c:726 Error: ucx send failed: Input/output error
[gpu55:148783] pml_ucx.c:726 Error: ucx send failed: Input/output error
[gpu55:148781] pml_ucx.c:726 Error: ucx send failed: Input/output error
ERROR: ERROR: ERROR: ERROR: LoadError: LoadError: LoadError: LoadError: MPIError(MPIError(MPIError(16): MPI_ERR_OTHER: known error not in list16): MPI_ERR_OTHER: known error not in list16): MPI_ERR_OTHER: known error not in listMPIError(
Stacktrace:

Stacktrace:

Stacktrace:
  [1]   [1]   [1] 16): MPI_ERR_OTHER: known error not in list
Stacktrace:
  [1] Isend(Isend(Isend(Isend(buf::buf::buf::buf::MPI.Buffer{CuArray{ComplexF64, 1, CUDA.Mem.DeviceBuffer}}, dest::Int64, tag::Int64, comm::MPI.Comm)
    @ MPI.Buffer{CuArray{ComplexF64, 1, CUDA.Mem.DeviceBuffer}}, dest::Int64, tag::Int64, comm::MPI.Comm)
    @ MPI.Buffer{CuArray{ComplexF64, 1, CUDA.Mem.DeviceBuffer}}, dest::Int64, tag::Int64, comm::MPI.Comm)
    @ MPI.Buffer{CuArray{ComplexF64, 1, CUDA.Mem.DeviceBuffer}}, dest::Int64, tag::Int64, comm::MPI.Comm)
    @ MPI ~/.julia/packages/MPI/08SPr/src/MPI ~/.julia/packages/MPI/08SPr/src/MPI ~/.julia/packages/MPI/08SPr/src/MPI ~/.julia/packages/MPI/08SPr/src/pointtopoint.jl:229
pointtopoint.jl:229
  [2] transpose_send_other!
    @   [2] transpose_send_other!
    @ ~/.julia/packages/PencilArrays/aXij6/src/Transpositions/Transpositions.jl:446 [inlined]
  [3] ~/.julia/packages/PencilArrays/aXij6/src/Transpositions/Transpositions.jl:446 [inlined]
  [3] transpose_send!(::transpose_send!(::Tuple{CuArray{ComplexF64, 1, CUDA.Mem.DeviceBuffer}, CuArray{ComplexF64, 1, CUDA.Mem.DeviceBuffer}}, recv_offsets::Tuple{CuArray{ComplexF64, 1, CUDA.Mem.DeviceBuffer}, CuArray{ComplexF64, 1, CUDA.Mem.DeviceBuffer}}, recv_offsets::Vector{Int64}, requests::Vector{Int64}, requests::Tuple{Vector{MPI.Request}, Vector{MPI.Request}}, length_self::Int64, remote_inds::Tuple{Vector{MPI.Request}, Vector{MPI.Request}}, length_self::Int64, remote_inds::pointtopoint.jl:229
  [2] transpose_send_other!
    @ ~/.julia/packages/PencilArrays/aXij6/src/Transpositions/Transpositions.jl:446 [inlined]
  [3] transpose_send!(::CartesianIndices{2, Tuple{UnitRange{Int64}, UnitRange{Int64}}}, pointtopoint.jl:229
CartesianIndices{2, Tuple{UnitRange{Int64}, UnitRange{Int64}}}, ::::  [2] transpose_send_other!
    @ ~/.julia/packages/PencilArrays/aXij6/src/Transpositions/Transpositions.jl:446 [inlined]
  [3] transpose_send!(::Tuple{MPI.Comm, Vector{Int64}, Int64}, Ao::Tuple{MPI.Comm, Vector{Int64}, Int64}, Ao::Tuple{CuArray{ComplexF64, 1, CUDA.Mem.DeviceBuffer}, CuArray{ComplexF64, 1, CUDA.Mem.DeviceBuffer}}, recv_offsets::Vector{Int64}, requests::Tuple{CuArray{ComplexF64, 1, CUDA.Mem.DeviceBuffer}, CuArray{ComplexF64, 1, CUDA.Mem.DeviceBuffer}}, recv_offsets::Vector{Int64}, requests::Tuple{Vector{MPI.Request}, Vector{MPI.Request}}, length_self::Int64, remote_inds::CartesianIndices{2, Tuple{UnitRange{Int64}, UnitRange{Int64}}}, Tuple::{Vector{MPI.Request}, Vector{MPI.Request}}, length_self::Int64, remote_inds::Tuple{MPI.Comm, Vector{Int64}, Int64}, Ao::CartesianIndices{2, Tuple{UnitRange{Int64}, UnitRange{Int64}}}, ::Tuple{MPI.Comm, Vector{Int64}, Int64}, Ao::PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, Permutation{(2, 1, 3), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}, Ai::PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, Permutation{(2, 1, 3), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}, Ai::PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, NoPermutation, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}, method::PencilArrays.Transpositions.PointToPoint, timer::TimerOutputs.TimerOutput)
    @ PencilArrays.Transpositions PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, NoPermutation, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}, method::PencilArrays.Transpositions.PointToPoint, timer::TimerOutputs.TimerOutput)
    @ PencilArrays.Transpositions ~/.julia/packages/PencilArrays/aXij6/src/Transpositions/~/.julia/packages/PencilArrays/aXij6/src/Transpositions/Transpositions.jl:388
  [4] macro expansion
    @ ~/.julia/packages/TimerOutputs/jgSVI/src/TimerOutput.jl:252 [inlined]
  [5] Transpositions.jl:388
  [4] macro expansion
    @ ~/.julia/packages/TimerOutputs/jgSVI/src/TimerOutput.jl:252 [inlined]
  [5] transpose_impl!(transpose_impl!(R::Int64, t::R::Int64, t::PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, Permutation{(2, 1, 3), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}, Ai::PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, Permutation{(2, 1, 3), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}, Ai::PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, NoPermutation, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}, method::PencilArrays.Transpositions.PointToPoint, timer::TimerOutputs.TimerOutput)
    @ PencilArrays.Transpositions ~/.julia/packages/PencilArrays/aXij6/src/Transpositions/Transpositions.jl:388
  [4] macro expansion
    @ ~/.julia/packages/TimerOutputs/jgSVI/src/TimerOutput.jl:252 [inlined]
  [5] transpose_impl!(R::Int64, t::PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, NoPermutation, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}, method::PencilArrays.Transpositions.PointToPoint, timer::TimerOutputs.TimerOutput)
    @ PencilArrays.Transpositions ~/.julia/packages/PencilArrays/aXij6/src/Transpositions/Transpositions.jl:388
  [4] macro expansion
    @ ~/.julia/packages/TimerOutputs/jgSVI/src/TimerOutput.jl:252 [inlined]
  [5] transpose_impl!(R::Int64, t::PencilArrays.Transpositions.Transposition{ComplexF64, 3, Pencil{3, 2, NoPermutation, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, Pencil{3, 2, Permutation{(2, 1, 3), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, NoPermutation, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}, PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, Permutation{(2, 1, 3), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}, PencilArrays.Transpositions.PointToPoint})
    @ PencilArrays.Transpositions ~/.julia/packages/PencilArrays/aXij6/src/Transpositions/Transpositions.jl:308
PencilArrays.Transpositions.Transposition{ComplexF64, 3, Pencil{3, 2, NoPermutation, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, Pencil{3, 2, Permutation{(2, 1, 3), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, NoPermutation, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}, PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, Permutation{(2, 1, 3), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}, PencilArrays.Transpositions.PointToPoint})
    @ PencilArrays.Transpositions ~/.julia/packages/PencilArrays/aXij6/src/Transpositions/Transpositions.jl:308  [6]
 macro expansion
    @ ~/.julia/packages/PencilArrays/aXij6/src/Transpositions/Transpositions.jl:152 [inlined]
  [7] macro expansion
    @ ~/.julia/packages/TimerOutputs/jgSVI/src/TimerOutput.jl:252 [inlined]
  [6] macro expansion
    @ ~/.julia/packages/PencilArrays/aXij6/src/Transpositions/Transpositions.jl:152 [inlined]
  [7] macro expansion
    @ ~/.julia/packages/TimerOutputs/jgSVI/src/  [8] #transpose!#4
    @ ~/.julia/packages/PencilArrays/aXij6/src/Transpositions/Transpositions.jl:151 [inlined]
  [9] _apply_plans_in_place!(dir::TimerOutput.jl:252 [inlined]
  [8] #transpose!#4
    @ ~/.julia/packages/PencilArrays/aXij6/src/Transpositions/Transpositions.jl:151 [inlined]
  [9] _apply_plans_in_place!(dir::Val{-1}, full_plan::Val{-1}, full_plan::PencilArrays.Transpositions.Transposition{ComplexF64, 3, Pencil{3, 2, NoPermutation, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, Pencil{3, 2, Permutation{(2, 1, 3), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, NoPermutation, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}, PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, Permutation{(2, 1, 3), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}, PencilArrays.Transpositions.PointToPoint})
    @ PencilArrays.Transpositions ~/.julia/packages/PencilArrays/aXij6/src/Transpositions/Transpositions.jl:308
  [6] macro expansion
    @ ~/.julia/packages/PencilArrays/aXij6/src/Transpositions/Transpositions.jl:152 [inlined]
  [7] macro expansion
    @ ~/.julia/packages/TimerOutputs/jgSVI/src/TimerOutput.jl:252 [inlined]
  [8] #transpose!#4
    @ ~/.julia/packages/PencilArrays/aXij6/src/Transpositions/Transpositions.jl:151 [inlined]
  [9] _apply_plans_in_place!(dir::Val{-1}, full_plan::PencilArrays.Transpositions.Transposition{ComplexF64, 3, Pencil{3, 2, NoPermutation, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, Pencil{3, 2, Permutation{(2, 1, 3), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, NoPermutation, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}, PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, Permutation{(2, 1, 3), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}, PencilArrays.Transpositions.PointToPoint})
    @ PencilArrays.Transpositions ~/.julia/packages/PencilArrays/aXij6/src/Transpositions/Transpositions.jl:308
  [6] macro expansion
    @ ~/.julia/packages/PencilArrays/aXij6/src/Transpositions/Transpositions.jl:152 [inlined]
  [7] macro expansion
    @ ~/.julia/packages/TimerOutputs/jgSVI/src/TimerOutput.jl:252 [inlined]
  [8] #transpose!#4
    @ ~/.julia/packages/PencilArrays/aXij6/src/Transpositions/Transpositions.jl:151 [inlined]
  [9] _apply_plans_in_place!(dir::Val{-1}, full_plan::PencilFFTPlan{ComplexF64, 3, true, 3, 2, 0, PencilFFTs.GlobalFFTParams{Float64, 3, true, Tuple{PencilFFTs.Transforms.FFT!, PencilFFTs.Transforms.FFT!, PencilFFTs.Transforms.FFT!}}, Tuple{PencilFFTs.PencilPlan1D{ComplexF64, ComplexF64, Pencil{3, 2, NoPermutation, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, Pencil{3, 2, NoPermutation, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, PencilFFTs.Transforms.FFT!, CUDA.CUFFT.cCuFFTPlan{ComplexF64, -1, true, 3}, CUDA.CUFFT.cCuFFTPlan{ComplexF64, 1, true, 3}}, PencilFFTs.PencilPlan1D{ComplexF64, ComplexF64, Pencil{3, 2, Permutation{(2, 1, 3), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, Pencil{3, 2, Permutation{(2, 1, 3), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, PencilFFTs.Transforms.FFT!, CUDA.CUFFT.cCuFFTPlan{ComplexF64, -1, true, 3}, CUDA.CUFFT.cCuFFTPlan{ComplexF64, 1, true, 3}}, PencilFFTs.PencilPlan1D{ComplexF64, ComplexF64, Pencil{3, 2, Permutation{(3, 2, 1), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, Pencil{3, 2, Permutation{(3, 2, 1), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, PencilFFTs.Transforms.FFT!, CUDA.CUFFT.cCuFFTPlan{ComplexF64, -1, true, 3}, CUDA.CUFFT.cCuFFTPlan{ComplexF64, 1, true, 3}}}, PencilArrays.Transpositions.PointToPoint, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, u_prev::PencilFFTPlan{ComplexF64, 3, true, 3, 2, 0, PencilFFTs.GlobalFFTParams{Float64, 3, true, Tuple{PencilFFTs.Transforms.FFT!, PencilFFTs.Transforms.FFT!, PencilFFTs.Transforms.FFT!}}, Tuple{PencilFFTs.PencilPlan1D{ComplexF64, ComplexF64, Pencil{3, 2, NoPermutation, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, Pencil{3, 2, NoPermutation, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, PencilFFTs.Transforms.FFT!, CUDA.CUFFT.cCuFFTPlan{ComplexF64, -1, true, 3}, CUDA.CUFFT.cCuFFTPlan{ComplexF64, 1, true, 3}}, PencilFFTs.PencilPlan1D{ComplexF64, ComplexF64, Pencil{3, 2, Permutation{(2, 1, 3), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, Pencil{3, 2, Permutation{(2, 1, 3), 3}, 
CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, PencilFFTs.Transforms.FFT!, CUDA.CUFFT.cCuFFTPlan{ComplexF64, -1, true, 3}, CUDA.CUFFT.cCuFFTPlan{ComplexF64, 1, true, 3}}, PencilFFTs.PencilPlan1D{ComplexF64, ComplexF64, Pencil{3, 2, Permutation{(3, 2, 1), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, Pencil{3, 2, Permutation{(3, 2, 1), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, PencilFFTs.Transforms.FFT!, CUDA.CUFFT.cCuFFTPlan{ComplexF64, -1, true, 3}, CUDA.CUFFT.cCuFFTPlan{ComplexF64, 1, true, 3}}}, PencilArrays.Transpositions.PointToPoint, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, u_prev::PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, NoPermutation, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}, pair::PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, NoPermutation, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}, pair::PencilFFTPlan{ComplexF64, 3, true, 3, 2, 0, PencilFFTs.GlobalFFTParams{Float64, 3, true, Tuple{PencilFFTs.Transforms.FFT!, PencilFFTs.Transforms.FFT!, PencilFFTs.Transforms.FFT!}}, Tuple{PencilFFTs.PencilPlan1D{ComplexF64, ComplexF64, Pencil{3, 2, NoPermutation, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, Pencil{3, 2, NoPermutation, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, PencilFFTs.Transforms.FFT!, CUDA.CUFFT.cCuFFTPlan{ComplexF64, -1, true, 3}, CUDA.CUFFT.cCuFFTPlan{ComplexF64, 1, true, 3}}, PencilFFTs.PencilPlan1D{ComplexF64, ComplexF64, Pencil{3, 2, Permutation{(2, 1, 3), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, Pencil{3, 2, Permutation{(2, 1, 3), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, PencilFFTs.Transforms.FFT!, CUDA.CUFFT.cCuFFTPlan{ComplexF64, -1, true, 3}, CUDA.CUFFT.cCuFFTPlan{ComplexF64, 1, true, 3}}, PencilFFTs.PencilPlan1D{ComplexF64, ComplexF64, Pencil{3, 2, Permutation{(3, 2, 1), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, Pencil{3, 2, Permutation{(3, 2, 1), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, PencilFFTs.Transforms.FFT!, 
CUDA.CUFFT.cCuFFTPlan{ComplexF64, -1, true, 3}, CUDA.CUFFT.cCuFFTPlan{ComplexF64, 1, true, 3}}}, PencilArrays.Transpositions.PointToPoint, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, u_prev::PencilFFTPlan{ComplexF64, 3, true, 3, 2, 0, PencilFFTs.GlobalFFTParams{Float64, 3, true, Tuple{PencilFFTs.Transforms.FFT!, PencilFFTs.Transforms.FFT!, PencilFFTs.Transforms.FFT!}}, Tuple{PencilFFTs.PencilPlan1D{ComplexF64, ComplexF64, Pencil{3, 2, NoPermutation, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, Pencil{3, 2, NoPermutation, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, PencilFFTs.Transforms.FFT!, CUDA.CUFFT.cCuFFTPlan{ComplexF64, -1, true, 3}, CUDA.CUFFT.cCuFFTPlan{ComplexF64, 1, true, 3}}, PencilFFTs.PencilPlan1D{ComplexF64, ComplexF64, Pencil{3, 2, Permutation{(2, 1, 3), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, Pencil{3, 2, Permutation{(2, 1, 3), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, PencilFFTs.Transforms.FFT!, CUDA.CUFFT.cCuFFTPlan{ComplexF64, -1, true, 3}, CUDA.CUFFT.cCuFFTPlan{ComplexF64, 1, true, 3}}, PencilFFTs.PencilPlan1D{ComplexF64, ComplexF64, Pencil{3, 2, Permutation{(3, 2, 1), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, Pencil{3, 2, Permutation{(3, 2, 1), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, PencilFFTs.Transforms.FFT!, CUDA.CUFFT.cCuFFTPlan{ComplexF64, -1, true, 3}, CUDA.CUFFT.cCuFFTPlan{ComplexF64, 1, true, 3}}}, PencilArrays.Transpositions.PointToPoint, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, u_prev::PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, NoPermutation, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}, pair::PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, NoPermutation, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}, pair::Pair{PencilFFTs.PencilPlan1D{ComplexF64, ComplexF64, Pencil{3, 2, Permutation{(2, 1, 3), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, Pencil{3, 2, Permutation{(2, 1, 3), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, 
PencilFFTs.Transforms.FFT!, CUDA.CUFFT.cCuFFTPlan{ComplexF64, -1, true, 3}, CUDA.CUFFT.cCuFFTPlan{ComplexF64, 1, true, 3}}, PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, Permutation{(2, 1, 3), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}}, next_pairs::Pair{PencilFFTs.PencilPlan1D{ComplexF64, ComplexF64, Pencil{3, 2, Permutation{(2, 1, 3), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, Pencil{3, 2, Permutation{(2, 1, 3), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, PencilFFTs.Transforms.FFT!, CUDA.CUFFT.cCuFFTPlan{ComplexF64, -1, true, 3}, CUDA.CUFFT.cCuFFTPlan{ComplexF64, 1, true, 3}}, PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, Permutation{(2, 1, 3), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}}, next_pairs::Pair{PencilFFTs.PencilPlan1D{ComplexF64, ComplexF64, Pencil{3, 2, Permutation{(2, 1, 3), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, Pencil{3, 2, Permutation{(2, 1, 3), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, PencilFFTs.Transforms.FFT!, CUDA.CUFFT.cCuFFTPlan{ComplexF64, -1, true, 3}, CUDA.CUFFT.cCuFFTPlan{ComplexF64, 1, true, 3}}, PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, Permutation{(2, 1, 3), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}}, next_pairs::Pair{PencilFFTs.PencilPlan1D{ComplexF64, ComplexF64, Pencil{3, 2, Permutation{(2, 1, 3), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, Pencil{3, 2, Permutation{(2, 1, 3), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, PencilFFTs.Transforms.FFT!, CUDA.CUFFT.cCuFFTPlan{ComplexF64, -1, true, 3}, CUDA.CUFFT.cCuFFTPlan{ComplexF64, 1, true, 3}}, PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, Permutation{(2, 1, 3), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}}, next_pairs::Pair{PencilFFTs.PencilPlan1D{ComplexF64, ComplexF64, Pencil{3, 2, Permutation{(3, 2, 1), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, 
Pencil{3, 2, Permutation{(3, 2, 1), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, PencilFFTs.Transforms.FFT!, CUDA.CUFFT.cCuFFTPlan{ComplexF64, -1, true, 3}, CUDA.CUFFT.cCuFFTPlan{ComplexF64, 1, true, 3}}, PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, Permutation{(3, 2, 1), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}})
    @ PairPencilFFTs {PencilFFTs.PencilPlan1D{ComplexF64, ComplexF64, Pencil{3, 2, Permutation{(3, 2, 1), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, Pencil{3, 2, Permutation{(3, 2, 1), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, PencilFFTs.Transforms.FFT!, CUDA.CUFFT.cCuFFTPlan{ComplexF64, -1, true, 3}, CUDA.CUFFT.cCuFFTPlan{ComplexF64, 1, true, 3}}, PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, Permutation{(3, 2, 1), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}})
    @ PencilFFTs ~/.julia/packages/PencilFFTs/jtlxh/src/operations.jl:231
 [10] ~/.julia/packages/PencilFFTs/jtlxh/src/operations.jl:231
 [10] _apply_plans_in_place!
    @ ~/.julia/packages/PencilFFTs/jtlxh/src/operations.jl:238 [inlined]_apply_plans_in_place!
    @ ~/.julia/packages/PencilFFTs/jtlxh/src/
 operations.jl:238 [inlined]
 [11] _apply_plans!
    @ ~/.julia/packages/PencilFFTs/jtlxh/src/operations.jl:164 [inlined]
 [12] macro expansion
    @ [11] _apply_plans!
    @ ~/.julia/packages/PencilFFTs/jtlxh/src/operations.jl:164 [inlined]
 [12] macro expansion
    @ ~/.julia/packages/PencilFFTs/jtlxh/src/operations.jl:27 [inlined]
 ~/.julia/packages/PencilFFTs/jtlxh/src/operations.jl:27 [inlined][13] macro expansion

 [13] macro expansion
    @ ~/.julia/packages/TimerOutputs/jgSVI/src/TimerOutput.jl:252 [inlined]
 [14] mul!    @ ~/.julia/packages/TimerOutputs/jgSVI/src/TimerOutput.jl:252 [inlined]
 [14] mul!(dst::(dst::Pair{PencilFFTs.PencilPlan1D{ComplexF64, ComplexF64, Pencil{3, 2, Permutation{(3, 2, 1), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, Pencil{3, 2, Permutation{(3, 2, 1), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, PencilFFTs.Transforms.FFT!, CUDA.CUFFT.cCuFFTPlan{ComplexF64, -1, true, 3}, CUDA.CUFFT.cCuFFTPlan{ComplexF64, 1, true, 3}}, PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, Permutation{(3, 2, 1), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}})
    @ PencilFFTs ~/.julia/packages/PencilFFTs/jtlxh/src/operations.jl:231
 [10] _apply_plans_in_place!
    @ ~/.julia/packages/PencilFFTs/jtlxh/src/operations.jl:238 [inlined]
 [11] _apply_plans!
    @ ~/.julia/packages/PencilFFTs/jtlxh/src/operations.jl:164 [inlined]
 [12] macro expansion
    @ ~/.julia/packages/PencilFFTs/jtlxh/src/operations.jl:27 [inlined]
 [13] macro expansion
    @ ~/.julia/packages/TimerOutputs/jgSVI/src/TimerOutput.jl:252 [inlined]
 [14] mul!(dst::Pair{PencilFFTs.PencilPlan1D{ComplexF64, ComplexF64, Pencil{3, 2, Permutation{(3, 2, 1), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, Pencil{3, 2, Permutation{(3, 2, 1), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, PencilFFTs.Transforms.FFT!, CUDA.CUFFT.cCuFFTPlan{ComplexF64, -1, true, 3}, CUDA.CUFFT.cCuFFTPlan{ComplexF64, 1, true, 3}}, PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, Permutation{(3, 2, 1), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}})
    @ PencilFFTs ~/.julia/packages/PencilFFTs/jtlxh/src/operations.jl:231
 [10] _apply_plans_in_place!
    @ ~/.julia/packages/PencilFFTs/jtlxh/src/operations.jl:238 [inlined]
 [11] _apply_plans!
    @ ~/.julia/packages/PencilFFTs/jtlxh/src/operations.jl:164 [inlined]
 [12] macro expansion
    @ ~/.julia/packages/PencilFFTs/jtlxh/src/operations.jl:27 [inlined]
 [13] macro expansion
    @ ~/.julia/packages/TimerOutputs/jgSVI/src/TimerOutput.jl:252 [inlined]
 [14] mul!(dst::ManyPencilArray{ComplexF64, 3, 3, Tuple{PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, NoPermutation, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}, PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, Permutation{(2, 1, 3), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}, PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, Permutation{(3, 2, 1), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}}, CuArray{ComplexF64, 1, CUDA.Mem.DeviceBuffer}}, p::ManyPencilArray{ComplexF64, 3, 3, Tuple{PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, NoPermutation, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}, PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, Permutation{(2, 1, 3), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}, PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, Permutation{(3, 2, 1), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}}, CuArray{ComplexF64, 1, CUDA.Mem.DeviceBuffer}}, p::ManyPencilArray{ComplexF64, 3, 3, Tuple{PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, NoPermutation, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}, PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, Permutation{(2, 1, 3), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}, PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, Permutation{(3, 2, 1), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}}, CuArray{ComplexF64, 1, CUDA.Mem.DeviceBuffer}}, p::ManyPencilArray{ComplexF64, 3, 3, Tuple{PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, NoPermutation, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}, PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 
3, 0, Pencil{3, 2, Permutation{(2, 1, 3), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}, PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, Permutation{(3, 2, 1), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}}, CuArray{ComplexF64, 1, CUDA.Mem.DeviceBuffer}}, p::PencilFFTPlan{ComplexF64, 3, true, 3, 2, 0, PencilFFTs.GlobalFFTParams{Float64, 3, true, Tuple{PencilFFTs.Transforms.FFT!, PencilFFTs.Transforms.FFT!, PencilFFTs.Transforms.FFT!}}, Tuple{PencilFFTs.PencilPlan1D{ComplexF64, ComplexF64, Pencil{3, 2, NoPermutation, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, Pencil{3, 2, NoPermutation, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, PencilFFTs.Transforms.FFT!, CUDA.CUFFT.cCuFFTPlan{ComplexF64, -1, true, 3}, CUDA.CUFFT.cCuFFTPlan{ComplexF64, 1, true, 3}}, PencilFFTs.PencilPlan1D{ComplexF64, ComplexF64, Pencil{3, 2, Permutation{(2, 1, 3), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, Pencil{3, 2, Permutation{(2, 1, 3), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, PencilFFTs.Transforms.FFT!, CUDA.CUFFT.cCuFFTPlan{ComplexF64, -1, true, 3}, CUDA.CUFFT.cCuFFTPlan{ComplexF64, 1, true, 3}}, PencilFFTs.PencilPlan1D{ComplexF64, ComplexF64, Pencil{3, 2, Permutation{(3, 2, 1), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, Pencil{3, 2, Permutation{(3, 2, 1), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, PencilFFTs.Transforms.FFT!, CUDA.CUFFT.cCuFFTPlan{ComplexF64, -1, true, 3}, CUDA.CUFFT.cCuFFTPlan{ComplexF64, 1, true, 3}}}, PencilArrays.Transpositions.PointToPoint, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, src::PencilFFTPlan{ComplexF64, 3, true, 3, 2, 0, PencilFFTs.GlobalFFTParams{Float64, 3, true, Tuple{PencilFFTs.Transforms.FFT!, PencilFFTs.Transforms.FFT!, PencilFFTs.Transforms.FFT!}}, Tuple{PencilFFTs.PencilPlan1D{ComplexF64, ComplexF64, Pencil{3, 2, NoPermutation, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, Pencil{3, 2, NoPermutation, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, PencilFFTs.Transforms.FFT!, 
CUDA.CUFFT.cCuFFTPlan{ComplexF64, -1, true, 3}, CUDA.CUFFT.cCuFFTPlan{ComplexF64, 1, true, 3}}, PencilFFTs.PencilPlan1D{ComplexF64, ComplexF64, Pencil{3, 2, Permutation{(2, 1, 3), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, Pencil{3, 2, Permutation{(2, 1, 3), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, PencilFFTs.Transforms.FFT!, CUDA.CUFFT.cCuFFTPlan{ComplexF64, -1, true, 3}, CUDA.CUFFT.cCuFFTPlan{ComplexF64, 1, true, 3}}, PencilFFTs.PencilPlan1D{ComplexF64, ComplexF64, Pencil{3, 2, Permutation{(3, 2, 1), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, Pencil{3, 2, Permutation{(3, 2, 1), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, PencilFFTs.Transforms.FFT!, CUDA.CUFFT.cCuFFTPlan{ComplexF64, -1, true, 3}, CUDA.CUFFT.cCuFFTPlan{ComplexF64, 1, true, 3}}}, PencilArrays.Transpositions.PointToPoint, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, src::PencilFFTPlan{ComplexF64, 3, true, 3, 2, 0, PencilFFTs.GlobalFFTParams{Float64, 3, true, Tuple{PencilFFTs.Transforms.FFT!, PencilFFTs.Transforms.FFT!, PencilFFTs.Transforms.FFT!}}, Tuple{PencilFFTs.PencilPlan1D{ComplexF64, ComplexF64, Pencil{3, 2, NoPermutation, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, Pencil{3, 2, NoPermutation, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, PencilFFTs.Transforms.FFT!, CUDA.CUFFT.cCuFFTPlan{ComplexF64, -1, true, 3}, CUDA.CUFFT.cCuFFTPlan{ComplexF64, 1, true, 3}}, PencilFFTs.PencilPlan1D{ComplexF64, ComplexF64, Pencil{3, 2, Permutation{(2, 1, 3), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, Pencil{3, 2, Permutation{(2, 1, 3), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, PencilFFTs.Transforms.FFT!, CUDA.CUFFT.cCuFFTPlan{ComplexF64, -1, true, 3}, CUDA.CUFFT.cCuFFTPlan{ComplexF64, 1, true, 3}}, PencilFFTs.PencilPlan1D{ComplexF64, ComplexF64, Pencil{3, 2, Permutation{(3, 2, 1), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, Pencil{3, 2, Permutation{(3, 2, 1), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, PencilFFTs.Transforms.FFT!, CUDA.CUFFT.cCuFFTPlan{ComplexF64, -1, true, 3}, 
CUDA.CUFFT.cCuFFTPlan{ComplexF64, 1, true, 3}}}, PencilArrays.Transpositions.PointToPoint, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, src::PencilFFTPlan{ComplexF64, 3, true, 3, 2, 0, PencilFFTs.GlobalFFTParams{Float64, 3, true, Tuple{PencilFFTs.Transforms.FFT!, PencilFFTs.Transforms.FFT!, PencilFFTs.Transforms.FFT!}}, Tuple{PencilFFTs.PencilPlan1D{ComplexF64, ComplexF64, Pencil{3, 2, NoPermutation, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, Pencil{3, 2, NoPermutation, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, PencilFFTs.Transforms.FFT!, CUDA.CUFFT.cCuFFTPlan{ComplexF64, -1, true, 3}, CUDA.CUFFT.cCuFFTPlan{ComplexF64, 1, true, 3}}, PencilFFTs.PencilPlan1D{ComplexF64, ComplexF64, Pencil{3, 2, Permutation{(2, 1, 3), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, Pencil{3, 2, Permutation{(2, 1, 3), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, PencilFFTs.Transforms.FFT!, CUDA.CUFFT.cCuFFTPlan{ComplexF64, -1, true, 3}, CUDA.CUFFT.cCuFFTPlan{ComplexF64, 1, true, 3}}, PencilFFTs.PencilPlan1D{ComplexF64, ComplexF64, Pencil{3, 2, Permutation{(3, 2, 1), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, Pencil{3, 2, Permutation{(3, 2, 1), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, PencilFFTs.Transforms.FFT!, CUDA.CUFFT.cCuFFTPlan{ComplexF64, -1, true, 3}, CUDA.CUFFT.cCuFFTPlan{ComplexF64, 1, true, 3}}}, PencilArrays.Transpositions.PointToPoint, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, src::ManyPencilArray{ComplexF64, 3, 3, Tuple{PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, NoPermutation, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}, PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, Permutation{(2, 1, 3), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}, PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, Permutation{(3, 2, 1), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}}, CuArray{ComplexF64, 1, CUDA.Mem.DeviceBuffer}})
    @ PencilFFTs ~/.julia/packages/PencilFFTs/jtlxh/src/operations.jl:25
ManyPencilArray{ComplexF64, 3, 3, Tuple{PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, NoPermutation, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}, PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, Permutation{(2, 1, 3), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}, PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, Permutation{(3, 2, 1), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}}, CuArray{ComplexF64, 1, CUDA.Mem.DeviceBuffer}})
    @ PencilFFTs ~/.julia/packages/PencilFFTs/jtlxh/src/operations.jl:25
 [15] *
    @ ~/.julia/packages/PencilFFTs/jtlxh/src/operations.jl:43 [inlined]
 [16]  [15] *
    @ ~/.julia/packages/PencilFFTs/jtlxh/src/operations.jl:43 [inlined]
 [16] var"##core#294"(plan#292::var"##core#294"(plan#292::ManyPencilArray{ComplexF64, 3, 3, Tuple{PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, NoPermutation, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}, PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, Permutation{(2, 1, 3), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}, PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, Permutation{(3, 2, 1), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}}, CuArray{ComplexF64, 1, CUDA.Mem.DeviceBuffer}})
    @ PencilFFTs ~/.julia/packages/PencilFFTs/jtlxh/src/operations.jl:25
 [15] *
    @ ~/.julia/packages/PencilFFTs/jtlxh/src/operations.jl:43 [inlined]
 [16] var"##core#294"(plan#292::ManyPencilArray{ComplexF64, 3, 3, Tuple{PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, NoPermutation, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}, PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, Permutation{(2, 1, 3), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}, PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, Permutation{(3, 2, 1), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}}, CuArray{ComplexF64, 1, CUDA.Mem.DeviceBuffer}})
    @ PencilFFTs ~/.julia/packages/PencilFFTs/jtlxh/src/operations.jl:25
 [15] *
    @ ~/.julia/packages/PencilFFTs/jtlxh/src/operations.jl:43 [inlined]
 [16] var"##core#294"(plan#292::PencilFFTPlan{ComplexF64, 3, true, 3, 2, 0, PencilFFTs.GlobalFFTParams{Float64, 3, true, Tuple{PencilFFTs.Transforms.FFT!, PencilFFTs.Transforms.FFT!, PencilFFTs.Transforms.FFT!}}, Tuple{PencilFFTs.PencilPlan1D{ComplexF64, ComplexF64, Pencil{3, 2, NoPermutation, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, Pencil{3, 2, NoPermutation, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, PencilFFTs.Transforms.FFT!, CUDA.CUFFT.cCuFFTPlan{ComplexF64, -1, true, 3}, CUDA.CUFFT.cCuFFTPlan{ComplexF64, 1, true, 3}}, PencilFFTs.PencilPlan1D{ComplexF64, ComplexF64, Pencil{3, 2, Permutation{(2, 1, 3), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, Pencil{3, 2, Permutation{(2, 1, 3), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, PencilFFTs.Transforms.FFT!, CUDA.CUFFT.cCuFFTPlan{ComplexF64, -1, true, 3}, CUDA.CUFFT.cCuFFTPlan{ComplexF64, 1, true, 3}}, PencilFFTs.PencilPlan1D{ComplexF64, ComplexF64, Pencil{3, 2, Permutation{(3, 2, 1), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, Pencil{3, 2, Permutation{(3, 2, 1), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, PencilFFTs.Transforms.FFT!, CUDA.CUFFT.cCuFFTPlan{ComplexF64, -1, true, 3}, CUDA.CUFFT.cCuFFTPlan{ComplexF64, 1, true, 3}}}, PencilArrays.Transpositions.PointToPoint, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, u#293::PencilFFTPlan{ComplexF64, 3, true, 3, 2, 0, PencilFFTs.GlobalFFTParams{Float64, 3, true, Tuple{PencilFFTs.Transforms.FFT!, PencilFFTs.Transforms.FFT!, PencilFFTs.Transforms.FFT!}}, Tuple{PencilFFTs.PencilPlan1D{ComplexF64, ComplexF64, Pencil{3, 2, NoPermutation, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, Pencil{3, 2, NoPermutation, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, PencilFFTs.Transforms.FFT!, CUDA.CUFFT.cCuFFTPlan{ComplexF64, -1, true, 3}, CUDA.CUFFT.cCuFFTPlan{ComplexF64, 1, true, 3}}, PencilFFTs.PencilPlan1D{ComplexF64, ComplexF64, Pencil{3, 2, Permutation{(2, 1, 3), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, Pencil{3, 2, Permutation{(2, 1, 3), 3}, CuArray{UInt8, 1, 
CUDA.Mem.DeviceBuffer}}, PencilFFTs.Transforms.FFT!, CUDA.CUFFT.cCuFFTPlan{ComplexF64, -1, true, 3}, CUDA.CUFFT.cCuFFTPlan{ComplexF64, 1, true, 3}}, PencilFFTs.PencilPlan1D{ComplexF64, ComplexF64, Pencil{3, 2, Permutation{(3, 2, 1), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, Pencil{3, 2, Permutation{(3, 2, 1), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, PencilFFTs.Transforms.FFT!, CUDA.CUFFT.cCuFFTPlan{ComplexF64, -1, true, 3}, CUDA.CUFFT.cCuFFTPlan{ComplexF64, 1, true, 3}}}, PencilArrays.Transpositions.PointToPoint, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, u#293::PencilFFTPlan{ComplexF64, 3, true, 3, 2, 0, PencilFFTs.GlobalFFTParams{Float64, 3, true, Tuple{PencilFFTs.Transforms.FFT!, PencilFFTs.Transforms.FFT!, PencilFFTs.Transforms.FFT!}}, Tuple{PencilFFTs.PencilPlan1D{ComplexF64, ComplexF64, Pencil{3, 2, NoPermutation, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, Pencil{3, 2, NoPermutation, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, PencilFFTs.Transforms.FFT!, CUDA.CUFFT.cCuFFTPlan{ComplexF64, -1, true, 3}, CUDA.CUFFT.cCuFFTPlan{ComplexF64, 1, true, 3}}, PencilFFTs.PencilPlan1D{ComplexF64, ComplexF64, Pencil{3, 2, Permutation{(2, 1, 3), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, Pencil{3, 2, Permutation{(2, 1, 3), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, PencilFFTs.Transforms.FFT!, CUDA.CUFFT.cCuFFTPlan{ComplexF64, -1, true, 3}, CUDA.CUFFT.cCuFFTPlan{ComplexF64, 1, true, 3}}, PencilFFTs.PencilPlan1D{ComplexF64, ComplexF64, Pencil{3, 2, Permutation{(3, 2, 1), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, Pencil{3, 2, Permutation{(3, 2, 1), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, PencilFFTs.Transforms.FFT!, CUDA.CUFFT.cCuFFTPlan{ComplexF64, -1, true, 3}, CUDA.CUFFT.cCuFFTPlan{ComplexF64, 1, true, 3}}}, PencilArrays.Transpositions.PointToPoint, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, u#293::PencilFFTPlan{ComplexF64, 3, true, 3, 2, 0, PencilFFTs.GlobalFFTParams{Float64, 3, true, Tuple{PencilFFTs.Transforms.FFT!, 
PencilFFTs.Transforms.FFT!, PencilFFTs.Transforms.FFT!}}, Tuple{PencilFFTs.PencilPlan1D{ComplexF64, ComplexF64, Pencil{3, 2, NoPermutation, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, Pencil{3, 2, NoPermutation, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, PencilFFTs.Transforms.FFT!, CUDA.CUFFT.cCuFFTPlan{ComplexF64, -1, true, 3}, CUDA.CUFFT.cCuFFTPlan{ComplexF64, 1, true, 3}}, PencilFFTs.PencilPlan1D{ComplexF64, ComplexF64, Pencil{3, 2, Permutation{(2, 1, 3), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, Pencil{3, 2, Permutation{(2, 1, 3), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, PencilFFTs.Transforms.FFT!, CUDA.CUFFT.cCuFFTPlan{ComplexF64, -1, true, 3}, CUDA.CUFFT.cCuFFTPlan{ComplexF64, 1, true, 3}}, PencilFFTs.PencilPlan1D{ComplexF64, ComplexF64, Pencil{3, 2, Permutation{(3, 2, 1), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, Pencil{3, 2, Permutation{(3, 2, 1), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, PencilFFTs.Transforms.FFT!, CUDA.CUFFT.cCuFFTPlan{ComplexF64, -1, true, 3}, CUDA.CUFFT.cCuFFTPlan{ComplexF64, 1, true, 3}}}, PencilArrays.Transpositions.PointToPoint, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, u#293::ManyPencilArray{ComplexF64, 3, 3, Tuple{PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, NoPermutation, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}, PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, Permutation{(2, 1, 3), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}, PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, Permutation{(3, 2, 1), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}}, CuArray{ComplexF64, 1, CUDA.Mem.DeviceBuffer}})
    @ Main ~/.julia/packages/BenchmarkTools/7xSXH/src/execution.jl:489
ManyPencilArray{ComplexF64, 3, 3, Tuple{PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, NoPermutation, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}, PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, Permutation{(2, 1, 3), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}, PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, Permutation{(3, 2, 1), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}}, CuArray{ComplexF64, 1, CUDA.Mem.DeviceBuffer}})
    @ Main ~/.julia/packages/BenchmarkTools/7xSXH/src/execution.jl:489
 [17]  [17] var"##sample#295"(::var"##sample#295"(::ManyPencilArray{ComplexF64, 3, 3, Tuple{PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, NoPermutation, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}, PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, Permutation{(2, 1, 3), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}, PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, Permutation{(3, 2, 1), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}}, CuArray{ComplexF64, 1, CUDA.Mem.DeviceBuffer}})
    @ Main ~/.julia/packages/BenchmarkTools/7xSXH/src/execution.jl:489
 [17] var"##sample#295"(::ManyPencilArray{ComplexF64, 3, 3, Tuple{PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, NoPermutation, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}, PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, Permutation{(2, 1, 3), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}, PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, Permutation{(3, 2, 1), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}}, CuArray{ComplexF64, 1, CUDA.Mem.DeviceBuffer}})
    @ Main ~/.julia/packages/BenchmarkTools/7xSXH/src/execution.jl:489
 [17] var"##sample#295"(::Tuple{PencilFFTPlan{ComplexF64, 3, true, 3, 2, 0, PencilFFTs.GlobalFFTParams{Float64, 3, true, Tuple{PencilFFTs.Transforms.FFT!, PencilFFTs.Transforms.FFT!, PencilFFTs.Transforms.FFT!}}, Tuple{PencilFFTs.PencilPlan1D{ComplexF64, ComplexF64, Pencil{3, 2, NoPermutation, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, Pencil{3, 2, NoPermutation, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, PencilFFTs.Transforms.FFT!, CUDA.CUFFT.cCuFFTPlan{ComplexF64, -1, true, 3}, CUDA.CUFFT.cCuFFTPlan{ComplexF64, 1, true, 3}}, PencilFFTs.PencilPlan1D{ComplexF64, ComplexF64, Pencil{3, 2, Permutation{(2, 1, 3), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, Pencil{3, 2, Permutation{(2, 1, 3), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, PencilFFTs.Transforms.FFT!, CUDA.CUFFT.cCuFFTPlan{ComplexF64, -1, true, 3}, CUDA.CUFFT.cCuFFTPlan{ComplexF64, 1, true, 3}}, PencilFFTs.PencilPlan1D{ComplexF64, ComplexF64, Pencil{3, 2, Permutation{(3, 2, 1), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, Pencil{3, 2, Permutation{(3, 2, 1), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, PencilFFTs.Transforms.FFT!, CUDA.CUFFT.cCuFFTPlan{ComplexF64, -1, true, 3}, CUDA.CUFFT.cCuFFTPlan{ComplexF64, 1, true, 3}}}, PencilArrays.Transpositions.PointToPoint, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, ManyPencilArray{ComplexF64, 3, 3, Tuple{PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, NoPermutation, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}, PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, Permutation{(2, 1, 3), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}, PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, Permutation{(3, 2, 1), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}}, CuArray{ComplexF64, 1, CUDA.Mem.DeviceBuffer}}}, __paramsTuple{PencilFFTPlan{ComplexF64, 3, true, 3, 2, 0, PencilFFTs.GlobalFFTParams{Float64, 3, true, 
Tuple{PencilFFTs.Transforms.FFT!, PencilFFTs.Transforms.FFT!, PencilFFTs.Transforms.FFT!}}, Tuple{PencilFFTs.PencilPlan1D{ComplexF64, ComplexF64, Pencil{3, 2, NoPermutation, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, Pencil{3, 2, NoPermutation, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, PencilFFTs.Transforms.FFT!, CUDA.CUFFT.cCuFFTPlan{ComplexF64, -1, true, 3}, CUDA.CUFFT.cCuFFTPlan{ComplexF64, 1, true, 3}}, PencilFFTs.PencilPlan1D{ComplexF64, ComplexF64, Pencil{3, 2, Permutation{(2, 1, 3), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, Pencil{3, 2, Permutation{(2, 1, 3), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, PencilFFTs.Transforms.FFT!, CUDA.CUFFT.cCuFFTPlan{ComplexF64, -1, true, 3}, CUDA.CUFFT.cCuFFTPlan{ComplexF64, 1, true, 3}}, PencilFFTs.PencilPlan1D{ComplexF64, ComplexF64, Pencil{3, 2, Permutation{(3, 2, 1), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, Pencil{3, 2, Permutation{(3, 2, 1), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, PencilFFTs.Transforms.FFT!, CUDA.CUFFT.cCuFFTPlan{ComplexF64, -1, true, 3}, CUDA.CUFFT.cCuFFTPlan{ComplexF64, 1, true, 3}}}, PencilArrays.Transpositions.PointToPoint, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, ManyPencilArray{ComplexF64, 3, 3, Tuple{PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, NoPermutation, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}, PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, Permutation{(2, 1, 3), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}, PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, Permutation{(3, 2, 1), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}}, CuArray{ComplexF64, 1, CUDA.Mem.DeviceBuffer}}}, __params::BenchmarkTools.Parameters)
    @ Main ~/.julia/packages/BenchmarkTools/7xSXH/src/execution.jl:495
::BenchmarkTools.Parameters)
    @  [18] Main ~/.julia/packages/BenchmarkTools/7xSXH/src/execution.jl:495
 [18] Tuple{PencilFFTPlan{ComplexF64, 3, true, 3, 2, 0, PencilFFTs.GlobalFFTParams{Float64, 3, true, Tuple{PencilFFTs.Transforms.FFT!, PencilFFTs.Transforms.FFT!, PencilFFTs.Transforms.FFT!}}, Tuple{PencilFFTs.PencilPlan1D{ComplexF64, ComplexF64, Pencil{3, 2, NoPermutation, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, Pencil{3, 2, NoPermutation, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, PencilFFTs.Transforms.FFT!, CUDA.CUFFT.cCuFFTPlan{ComplexF64, -1, true, 3}, CUDA.CUFFT.cCuFFTPlan{ComplexF64, 1, true, 3}}, PencilFFTs.PencilPlan1D{ComplexF64, ComplexF64, Pencil{3, 2, Permutation{(2, 1, 3), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, Pencil{3, 2, Permutation{(2, 1, 3), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, PencilFFTs.Transforms.FFT!, CUDA.CUFFT.cCuFFTPlan{ComplexF64, -1, true, 3}, CUDA.CUFFT.cCuFFTPlan{ComplexF64, 1, true, 3}}, PencilFFTs.PencilPlan1D{ComplexF64, ComplexF64, Pencil{3, 2, Permutation{(3, 2, 1), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, Pencil{3, 2, Permutation{(3, 2, 1), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, PencilFFTs.Transforms.FFT!, CUDA.CUFFT.cCuFFTPlan{ComplexF64, -1, true, 3}, CUDA.CUFFT.cCuFFTPlan{ComplexF64, 1, true, 3}}}, PencilArrays.Transpositions.PointToPoint, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, ManyPencilArray{ComplexF64, 3, 3, Tuple{PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, NoPermutation, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}, PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, Permutation{(2, 1, 3), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}, PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, Permutation{(3, 2, 1), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}}, CuArray{ComplexF64, 1, CUDA.Mem.DeviceBuffer}}}, __params::BenchmarkTools.Parameters)
    @ Main ~/.julia/packages/BenchmarkTools/7xSXH/src/execution.jl:495
 [18] Tuple{PencilFFTPlan{ComplexF64, 3, true, 3, 2, 0, PencilFFTs.GlobalFFTParams{Float64, 3, true, Tuple{PencilFFTs.Transforms.FFT!, PencilFFTs.Transforms.FFT!, PencilFFTs.Transforms.FFT!}}, Tuple{PencilFFTs.PencilPlan1D{ComplexF64, ComplexF64, Pencil{3, 2, NoPermutation, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, Pencil{3, 2, NoPermutation, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, PencilFFTs.Transforms.FFT!, CUDA.CUFFT.cCuFFTPlan{ComplexF64, -1, true, 3}, CUDA.CUFFT.cCuFFTPlan{ComplexF64, 1, true, 3}}, PencilFFTs.PencilPlan1D{ComplexF64, ComplexF64, Pencil{3, 2, Permutation{(2, 1, 3), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, Pencil{3, 2, Permutation{(2, 1, 3), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, PencilFFTs.Transforms.FFT!, CUDA.CUFFT.cCuFFTPlan{ComplexF64, -1, true, 3}, CUDA.CUFFT.cCuFFTPlan{ComplexF64, 1, true, 3}}, PencilFFTs.PencilPlan1D{ComplexF64, ComplexF64, Pencil{3, 2, Permutation{(3, 2, 1), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, Pencil{3, 2, Permutation{(3, 2, 1), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, PencilFFTs.Transforms.FFT!, CUDA.CUFFT.cCuFFTPlan{ComplexF64, -1, true, 3}, CUDA.CUFFT.cCuFFTPlan{ComplexF64, 1, true, 3}}}, PencilArrays.Transpositions.PointToPoint, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}, ManyPencilArray{ComplexF64, 3, 3, Tuple{PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, NoPermutation, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}, PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, Permutation{(2, 1, 3), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}, PencilArray{ComplexF64, 3, CuArray{ComplexF64, 3, CUDA.Mem.DeviceBuffer}, 3, 0, Pencil{3, 2, Permutation{(3, 2, 1), 3}, CuArray{UInt8, 1, CUDA.Mem.DeviceBuffer}}}}, CuArray{ComplexF64, 1, CUDA.Mem.DeviceBuffer}}}, __params::BenchmarkTools.Parameters)
    @ Main ~/.julia/packages/BenchmarkTools/7xSXH/src/execution.jl:495
 [18] _run(b::BenchmarkTools.Benchmark, p::BenchmarkTools.Parameters; _run(b::BenchmarkTools.Benchmark, p::BenchmarkTools.Parameters; verbose::Bool, pad::String, kwargsverbose::Bool, pad::::String, kwargs::_run(b::BenchmarkTools.Benchmark, p::BenchmarkTools.Parameters; verbose::Bool, pad::String, kwargs::_run(b::BenchmarkTools.Benchmark, p::BenchmarkTools.Parameters; verbose::Bool, pad::String, kwargs::Base.Pairs{Symbol, Integer, NTuple{4, Symbol}, NamedTuple{(:samples, :evals, :gctrial, :gcsample), Tuple{Int64, Int64, Bool, Bool}}})
    @ BenchmarkToolsBase.Pairs{Symbol, Integer, NTuple{4, Symbol}, NamedTuple{(:samples, :evals, :gctrial, :gcsample), Tuple{Int64, Int64, Bool, Bool}}}) ~/.julia/packages/BenchmarkTools/7xSXH/src/
    @ BenchmarkTools ~/.julia/packages/BenchmarkTools/7xSXH/src/execution.jl:99
 [19] #invokelatest#2
    @ ./essentials.jl:718 [inlined]
 [20] #run_result#45
    @ execution.jl:99
 [19] #invokelatest#2
    @ ./essentials.jl:718 [inlined]
 [20] #run_result#45
    @ ~/.julia/packages/BenchmarkTools/7xSXH/src/execution.jl:34 [inlined]
 [21] ~/.julia/packages/BenchmarkTools/7xSXH/src/execution.jl:34 [inlined]
run(b::BenchmarkTools.Benchmark, p::BenchmarkTools.Parameters; progressid::Nothing, nleaves [21] run(b::BenchmarkTools.Benchmark, p::Float64, ndone::BenchmarkTools.Parameters; ::Float64, kwargs::progressid::Nothing, nleaves::Float64, ndone::Float64, kwargs::Base.Pairs{Symbol, Integer, NTuple{4, Symbol}, NamedTuple{(:samples, :evals, :gctrial, :gcsample), Tuple{Int64, Int64, Bool, Bool}}})
    @ BenchmarkTools ~/.julia/packages/BenchmarkTools/7xSXH/src/execution.jl:99
 [19] #invokelatest#2
    @ ./essentials.jl:718 [inlined]
 [20] #run_result#45
    @ ~/.julia/packages/BenchmarkTools/7xSXH/src/execution.jl:34 [inlined]
 [21] run(b::BenchmarkTools.Benchmark, p::BenchmarkTools.Parameters; progressid::Nothing, nleaves::Float64, ndone::Float64, kwargs::Base.Pairs{Symbol, Integer, NTuple{4, Symbol}, NamedTuple{(:samples, :evals, :gctrial, :gcsample), Tuple{Int64, Int64, Bool, Bool}}})
    @ BenchmarkTools ~/.julia/packages/BenchmarkTools/7xSXH/src/execution.jl:99
 [19] #invokelatest#2
    @ ./essentials.jl:718 [inlined]
 [20] #run_result#45
    @ ~/.julia/packages/BenchmarkTools/7xSXH/src/execution.jl:34 [inlined]
 [21] run(b::BenchmarkTools.Benchmark, p::BenchmarkTools.Parameters; progressid::Nothing, nleaves::Float64, ndone::Float64, kwargs::Base.Pairs{Symbol, Integer, NTuple{5, Symbol}, NamedTuple{(:verbose, :samples, :evals, :gctrial, :gcsample), Tuple{Bool, Int64, Int64, Bool, Bool}}})
    @ BenchmarkTools ~/.julia/packages/BenchmarkTools/7xSXH/src/execution.jl:117
Base.Pairs{Symbol, Integer, NTuple{5, Symbol}, NamedTuple{(:verbose, :samples, :evals, :gctrial, :gcsample), Tuple{Bool, Int64, Int64, Bool, Bool}}})
    @ BenchmarkToolsBase.Pairs{Symbol, Integer, NTuple{5, Symbol}, NamedTuple{(:verbose, :samples, :evals, :gctrial, :gcsample), Tuple{Bool, Int64, Int64, Bool, Bool}}})
    @ BenchmarkTools ~/.julia/packages/BenchmarkTools/7xSXH/src/execution.jl:117
  [22] #warmup#54
    @ ~/.julia/packages/BenchmarkTools/7xSXH/src/execution.jl:169 [inlined]
 [23] warmup(item::BenchmarkTools.Benchmark)
    @ BenchmarkTools ~/.julia/packages/BenchmarkTools/7xSXH/src/execution.jl:169
 [24] [22] #warmup#54
    @ ~/.julia/packages/BenchmarkTools/7xSXH/src/execution.jl:169 [inlined]
 [23] warmup(item::BenchmarkTools.Benchmark)
    @ BenchmarkTools ~/.julia/packages/BenchmarkTools/7xSXH/src/execution.jl:169
 [24] top-level scope~/.julia/packages/BenchmarkTools/7xSXH/src/execution.jl:117
 [22] #warmup#54
    @ ~/.julia/packages/BenchmarkTools/7xSXH/src/execution.jl:169 [inlined]
 [23] warmup(item::BenchmarkTools.Benchmark)
    @ BenchmarkTools
    @ ~/.julia/packages/BenchmarkTools/7xSXH/src/ top-level scope
    @ ~/.julia/packages/BenchmarkTools/7xSXH/src/ ~/.julia/packages/BenchmarkTools/7xSXH/src/execution.jl:169
 [24] execution.jl:393
in expression starting at /GPUFS/hust_jmcai_2/YuBY/LargeDipole/PencilFFTstest/gpubench.jl:28
execution.jl:393
in expression starting at /GPUFS/hust_jmcai_2/YuBY/LargeDipole/PencilFFTstest/gpubench.jl:28
top-level scope
    @ ~/.julia/packages/BenchmarkTools/7xSXH/src/execution.jl:393
in expression starting at /GPUFS/hust_jmcai_2/YuBY/LargeDipole/PencilFFTstest/gpubench.jl:28
Base.Pairs{Symbol, Integer, NTuple{5, Symbol}, NamedTuple{(:verbose, :samples, :evals, :gctrial, :gcsample), Tuple{Bool, Int64, Int64, Bool, Bool}}})
    @ BenchmarkTools ~/.julia/packages/BenchmarkTools/7xSXH/src/execution.jl:117
 [22] #warmup#54
    @ ~/.julia/packages/BenchmarkTools/7xSXH/src/execution.jl:169 [inlined]
 [23] warmup(item::BenchmarkTools.Benchmark)
    @ BenchmarkTools ~/.julia/packages/BenchmarkTools/7xSXH/src/execution.jl:169
 [24] top-level scope
    @ ~/.julia/packages/BenchmarkTools/7xSXH/src/execution.jl:393
in expression starting at /GPUFS/hust_jmcai_2/YuBY/LargeDipole/PencilFFTstest/gpubench.jl:28
--------------------------------------------------------------------------
Primary job  terminated normally, but 1 process returned
a non-zero exit code. Per user-direction, the job has been aborted.
--------------------------------------------------------------------------
--------------------------------------------------------------------------
mpiexec detected that one or more processes exited with non-zero status, thus causing
the job to be terminated. The first process to do so was:

  Process name: [[35775,1],2]
  Exit code:    1
--------------------------------------------------------------------------
ERROR: failed process: Process(`mpiexec --mca btl_openib_if_include mlx5_2 --mca btl_tcp_if_include ib0 --mca btl ^openib --machinefile hostlist -np 4 julia -t7 gpubench.jl`, ProcessExited(1)) [1]

Stacktrace:
 [1] pipeline_error
   @ ./process.jl:540 [inlined]
 [2] run(::Cmd; wait::Bool)
   @ Base ./process.jl:455
 [3] run(::Cmd)
   @ Base process.jl:453
 [4] (::var"#1#2")(exe::Cmd)
   @ Main none:4
 [5] (::MPI.var"#28#29"{var"#1#2"})(cmd::Cmd)
   @ MPI ~/.julia/packages/MPI/08SPr/src/environment.jl:25
 [6] _mpiexec(fn::MPI.var"#28#29"{var"#1#2"})
   @ MPI ~/.julia/packages/MPI/08SPr/deps/deps.jl:6
 [7] mpiexec(fn::var"#1#2")
   @ MPI ~/.julia/packages/MPI/08SPr/src/environment.jl:25
 [8] top-level scope
   @ none:4

out:

rank:0GPU:CuDevice(0)
rank:1GPU:CuDevice(1)
rank:2GPU:CuDevice(2)
rank:3GPU:CuDevice(3)
data size:(5120, 32, 32)
Start data allocationg
[1656147482.464678] [gpu55:148782:0]          ib_md.c:438  UCX  ERROR ibv_reg_mr(address=0x60ac00400, length=10485760, access=0xf) failed: Cannot allocate memory
[1656147482.464735] [gpu55:148782:0]         ucp_mm.c:110  UCX  ERROR failed to register address 0x60ac00400 length 10485760 on md[4]=ib/mlx5_8: Input/output error
[1656147482.464743] [gpu55:148782:0]    ucp_request.c:264  UCX  ERROR failed to register user buffer datatype 0x80 address 0x60ac00400 len 10485760: Input/output error
[1656147482.475617] [gpu55:148780:0]          ib_md.c:438  UCX  ERROR ibv_reg_mr(address=0x60ac00400, length=10485760, access=0xf) failed: Cannot allocate memory
[1656147482.475681] [gpu55:148780:0]         ucp_mm.c:110  UCX  ERROR failed to register address 0x60ac00400 length 10485760 on md[4]=ib/mlx5_8: Input/output error
[1656147482.475689] [gpu55:148780:0]    ucp_request.c:264  UCX  ERROR failed to register user buffer datatype 0x80 address 0x60ac00400 len 10485760: Input/output error
[1656147482.478879] [gpu55:148783:0]          ib_md.c:438  UCX  ERROR ibv_reg_mr(address=0x60ac00400, length=10485760, access=0xf) failed: Cannot allocate memory
[1656147482.478929] [gpu55:148783:0]         ucp_mm.c:110  UCX  ERROR failed to register address 0x60ac00400 length 10485760 on md[4]=ib/mlx5_8: Input/output error
[1656147482.478967] [gpu55:148783:0]    ucp_request.c:264  UCX  ERROR failed to register user buffer datatype 0x80 address 0x60ac00400 len 10485760: Input/output error
[1656147482.478960] [gpu55:148781:0]          ib_md.c:438  UCX  ERROR ibv_reg_mr(address=0x60ac00400, length=10485760, access=0xf) failed: Cannot allocate memory
[1656147482.479000] [gpu55:148781:0]         ucp_mm.c:110  UCX  ERROR failed to register address 0x60ac00400 length 10485760 on md[4]=ib/mlx5_8: Input/output error
[1656147482.479010] [gpu55:148781:0]    ucp_request.c:264  UCX  ERROR failed to register user buffer datatype 0x80 address 0x60ac00400 len 10485760: Input/output error
jipolanco commented 2 years ago

What happens if you set transpose_method = Transpositions.Alltoallv()? (See here for details.)

Lightup1 commented 2 years ago

It hangs, with the following output:

rank:0GPU:CuDevice(0)
rank:1GPU:CuDevice(1)
rank:2GPU:CuDevice(2)
rank:3GPU:CuDevice(3)
data size:(5120, 32, 32)
Start data allocationg
[1656149516.868709] [gpu53:309102:0]         cma_ep.c:113  UCX  ERROR process_vm_readv delivered 0 instead of 10485760, error message Bad address
[1656149516.868708] [gpu53:309100:0]         cma_ep.c:113  UCX  ERROR process_vm_readv delivered 0 instead of 10485760, error message Bad address
[1656149516.894273] [gpu53:309103:0]         cma_ep.c:113  UCX  ERROR process_vm_readv delivered 0 instead of 10485760, error message Bad address
[1656149516.894271] [gpu53:309101:0]         cma_ep.c:113  UCX  ERROR process_vm_readv delivered 0 instead of 10485760, error message Bad address

Then the job was cancelled because it reached the time limit I set.

jipolanco commented 2 years ago

Have you tried the hints over at the MPI docs? In particular:

export JULIA_CUDA_MEMORY_POOL=none
Lightup1 commented 2 years ago

That does not seem to work.

Without export JULIA_CUDA_MEMORY_POOL=none:

rank:0GPU:CuDevice(0)
rank:1GPU:CuDevice(1)
rank:2GPU:CuDevice(2)
rank:3GPU:CuDevice(3)
has-cuda:true
data size:(5120, 32, 32)
Start data allocationg
[1656470804.862083] [gpu13:416431:0]          ib_md.c:438  UCX  ERROR ibv_reg_mr(address=0x60ac00400, length=10485760, access=0xf) failed: Cannot allocate memory
[1656470804.862186] [gpu13:416431:0]         ucp_mm.c:110  UCX  ERROR failed to register address 0x60ac00400 length 10485760 on md[5]=ib/mlx5_0: Input/output error
[1656470804.862194] [gpu13:416431:0]    ucp_request.c:264  UCX  ERROR failed to register user buffer datatype 0x80 address 0x60ac00400 len 10485760: Input/output error
[1656470804.925721] [gpu13:416430:0]          ib_md.c:438  UCX  ERROR ibv_reg_mr(address=0x60ac00400, length=10485760, access=0xf) failed: Cannot allocate memory
[1656470804.925771] [gpu13:416430:0]         ucp_mm.c:110  UCX  ERROR failed to register address 0x60ac00400 length 10485760 on md[5]=ib/mlx5_0: Input/output error
[1656470804.925780] [gpu13:416430:0]    ucp_request.c:264  UCX  ERROR failed to register user buffer datatype 0x80 address 0x60ac00400 len 10485760: Input/output error
[1656470804.930835] [gpu13:416429:0]          ib_md.c:438  UCX  ERROR ibv_reg_mr(address=0x60ac00400, length=10485760, access=0xf) failed: Cannot allocate memory
[1656470804.930879] [gpu13:416429:0]         ucp_mm.c:110  UCX  ERROR failed to register address 0x60ac00400 length 10485760 on md[5]=ib/mlx5_0: Input/output error
[1656470804.930902] [gpu13:416429:0]    ucp_request.c:264  UCX  ERROR failed to register user buffer datatype 0x80 address 0x60ac00400 len 10485760: Input/output error
[1656470805.223480] [gpu13:416428:0]          ib_md.c:438  UCX  ERROR ibv_reg_mr(address=0x60ac00400, length=10485760, access=0xf) failed: Cannot allocate memory
[1656470805.223550] [gpu13:416428:0]         ucp_mm.c:110  UCX  ERROR failed to register address 0x60ac00400 length 10485760 on md[5]=ib/mlx5_0: Input/output error
[1656470805.223559] [gpu13:416428:0]    ucp_request.c:264  UCX  ERROR failed to register user buffer datatype 0x80 address 0x60ac00400 len 10485760: Input/output error

with export JULIA_CUDA_MEMORY_POOL=none:

rank:0GPU:CuDevice(0)
rank:1GPU:CuDevice(1)
rank:2GPU:CuDevice(2)
rank:3GPU:CuDevice(3)
has-cuda:true
data size:(5120, 32, 32)
Start data allocationg
[1656470932.716062] [gpu21:57697:0]          ib_md.c:438  UCX  ERROR ibv_reg_mr(address=0x2b647b400000, length=10485760, access=0xf) failed: Cannot allocate memory
[1656470932.716119] [gpu21:57697:0]         ucp_mm.c:110  UCX  ERROR failed to register address 0x2b647b400000 length 10485760 on md[5]=ib/mlx5_0: Input/output error
[1656470932.716127] [gpu21:57697:0]    ucp_request.c:264  UCX  ERROR failed to register user buffer datatype 0x80 address 0x2b647b400000 len 10485760: Input/output error
[1656470932.735698] [gpu21:57698:0]          ib_md.c:438  UCX  ERROR ibv_reg_mr(address=0x2b338b400000, length=10485760, access=0xf) failed: Cannot allocate memory
[1656470932.735749] [gpu21:57698:0]         ucp_mm.c:110  UCX  ERROR failed to register address 0x2b338b400000 length 10485760 on md[5]=ib/mlx5_0: Input/output error
[1656470932.735758] [gpu21:57698:0]    ucp_request.c:264  UCX  ERROR failed to register user buffer datatype 0x80 address 0x2b338b400000 len 10485760: Input/output error
[1656470932.746492] [gpu21:57700:0]          ib_md.c:438  UCX  ERROR ibv_reg_mr(address=0x2b4545400000, length=10485760, access=0xf) failed: Cannot allocate memory
[1656470932.746538] [gpu21:57700:0]         ucp_mm.c:110  UCX  ERROR failed to register address 0x2b4545400000 length 10485760 on md[5]=ib/mlx5_0: Input/output error
[1656470932.746561] [gpu21:57700:0]    ucp_request.c:264  UCX  ERROR failed to register user buffer datatype 0x80 address 0x2b4545400000 len 10485760: Input/output error
[1656470932.774987] [gpu21:57699:0]          ib_md.c:438  UCX  ERROR ibv_reg_mr(address=0x2ba6eb400000, length=10485760, access=0xf) failed: Cannot allocate memory
[1656470932.775034] [gpu21:57699:0]         ucp_mm.c:110  UCX  ERROR failed to register address 0x2ba6eb400000 length 10485760 on md[5]=ib/mlx5_0: Input/output error
[1656470932.775043] [gpu21:57699:0]    ucp_request.c:264  UCX  ERROR failed to register user buffer datatype 0x80 address 0x2ba6eb400000 len 10485760: Input/output error

batch file:

#!/bin/bash
#SBATCH -N 1
#SBATCH --ntasks-per-node=4
#SBATCH -J gpuN1p4TCP_CUDAPOOL_none           # N nodes p process t threads
#SBATCH --cpus-per-task=7       # th2 qiming gpu 28 cpus per node (adjust as you need)
#SBATCH --time=00:3:00         # days-hours:minutes:seconds
#SBATCH -p gpu_v100
#SBATCH --output=slurm-%x-%j.out
#SBATCH --error=slurm-%x-%j.err
source /GPUFS/app/MPI/openmpi/3.1.4-gcc45-cuda11.0/env.sh
export LD_LIBRARY_PATH=$HOME/.julia-1.7.3/lib/julia:$LD_LIBRARY_PATH
export JULIA_CUDA_MEMORY_POOL=none
export JULIA_CUDA_USE_BINARYBUILDER=false
julia --project -e 'using Pkg; Pkg.instantiate()'
julia --project -e 'using Pkg; Pkg.precompile()'
srun hostname>hostlist 
mpiexecjl --project --mca btl tcp,self,vader --mca btl_tcp_if_include ib0 --machinefile hostlist -np $SLURM_NTASKS julia -t7 gpubench.jl
Lightup1 commented 1 year ago

After I installed Open MPI, UCX, and CUDA myself, the error disappeared. It seems that our cluster manager did not install Open MPI with UCX. I'll close the issue.