microsoft / onnxruntime

ONNX Runtime: cross-platform, high performance ML inferencing and training accelerator
https://onnxruntime.ai
MIT License
14.06k stars 2.83k forks source link

create op #12017

Open zyxcambridge opened 2 years ago

zyxcambridge commented 2 years ago

/home/zyx/Desktop/pointpillars_pytorch_trt/pytorch/tools/onnxruntime/onnxruntime/contrib_ops/cuda/cuda_voxel_generator.cu(299): error: explicit stream argument not provided in kernel launch

/home/zyx/Desktop/pointpillars_pytorch_trt/pytorch/tools/onnxruntime/onnxruntime/contrib_ops/cuda/cuda_voxel_generator.cu(317): error: explicit stream argument not provided in kernel launch

2 errors detected in the compilation of "/home/zyx/Desktop/pointpillars_pytorch_trt/pytorch/tools/onnxruntime/onnxruntime/contrib_ops/cuda/cuda_voxel_generator.cu". CMakeFiles/onnxruntime_providers_cuda.dir/build.make:2427: recipe for target 'CMakeFiles/onnxruntime_providers_cuda.dir/home/zyx/Desktop/pointpillars_pytorch_trt/pytorch/tools/onnxruntime/onnxruntime/contrib_ops/cuda/cuda_voxel_generator.cu.o' failed make[2]: [CMakeFiles/onnxruntime_providers_cuda.dir/home/zyx/Desktop/pointpillars_pytorch_trt/pytorch/tools/onnxruntime/onnxruntime/contrib_ops/cuda/cuda_voxel_generator.cu.o] Error 1 CMakeFiles/Makefile2:1808: recipe for target 'CMakeFiles/onnxruntime_providers_cuda.dir/all' failed make[1]: [CMakeFiles/onnxruntime_providers_cuda.dir/all] Error 2 Makefile:165: recipe for target 'all' failed

#include <torch/serialize/tensor.h>
// NOTE(review): the leading '#' of every include and four angle-bracketed
// header names were swallowed when this code was pasted into markdown
// (lines that read bare "include"). The four reconstructed stdlib/CUDA
// headers below are inferred from what the code uses (std::vector,
// std::max/std::min, round, cout/endl) — confirm against the original
// cuda_voxel_generator.cu before landing.
#include <vector>
#include <algorithm>
#include <cmath>
#include <iostream>
#include "voxel_generator.h"
#include "core/providers/cuda/cuda_kernel.h"
#include "core/providers/cuda/math/unary_elementwise_ops_impl.h"
#include "core/providers/cuda/cuda_common.h"
#include "core/providers/cuda/cuda_execution_provider.h"
#include "contrib_ops/cuda/transformers/beam_search.h"
#include "contrib_ops/cuda/transformers/beam_search_device_helper.h"
#include "contrib_ops/cuda/transformers/dump_cuda_tensor.h"

// NOTE(review): `using namespace std;` at file scope is discouraged; kept
// because the code below relies on unqualified cout/endl.
using namespace std;

namespace onnxruntime { namespace contrib { namespace cuda {

// Custom-op kernel wrapping the PointPillars voxel generator.
//
// NOTE(review): this class derives from OpKernel but aliases Base to
// CudaKernel, and the implementation below launches CUDA kernels. For a
// CUDA contrib op it should almost certainly derive from CudaKernel so a
// compute stream is available — the reported nvcc error "explicit stream
// argument not provided in kernel launch" points at exactly this. Confirm
// before landing.
class VoxelGeneratorV1 final : public OpKernel {
 public:
  explicit VoxelGeneratorV1(const OpKernelInfo& info) : OpKernel(info) {}

  // Voxelizes the input point clouds; see the definition further below.
  Status Compute(OpKernelContext* ctx) const override;

 private:
  using Base = CudaKernel;
  using CublasHandle = cublasHandle_t;

  // NOTE(review): the paste showed the mangled fragment
  // "template // struct ComputeImpl; };" — reconstructed here as a
  // templated helper declaration; confirm against the original file.
  template <typename T>
  struct ComputeImpl;
};

// Kernel registration for VoxelGeneratorV1 (opset 1, Microsoft domain).
// All ten inputs are pinned to CPU memory.
//
// NOTE(review): this registers against kCpuExecutionProvider even though
// the implementation lives under contrib_ops/cuda and launches CUDA
// kernels — verify whether kCudaExecutionProvider was intended.
ONNX_OPERATOR_KERNEL_EX(
    VoxelGeneratorV1,
    kMSDomain,
    1,
    kCpuExecutionProvider,
    KernelDefBuilder()
        .InputMemoryType(OrtMemTypeCPUInput, 0)  // 'points' needs to be on CPU
        .InputMemoryType(OrtMemTypeCPUInput, 1)  // 'ValidInput' needs to be on CPU
        .InputMemoryType(OrtMemTypeCPUInput, 2)  // 'voxel_size' needs to be on CPU
        .InputMemoryType(OrtMemTypeCPUInput, 3)  // 'point_cloud_range' needs to be on CPU
        .InputMemoryType(OrtMemTypeCPUInput, 4)  // 'max_num_points' needs to be on CPU
        .InputMemoryType(OrtMemTypeCPUInput, 5)  // 'max_voxels' needs to be on CPU
        .InputMemoryType(OrtMemTypeCPUInput, 6)  // 'batch_size' needs to be on CPU
        .InputMemoryType(OrtMemTypeCPUInput, 7)  // 'center_offset' needs to be on CPU
        .InputMemoryType(OrtMemTypeCPUInput, 8)  // 'cluster_offset' needs to be on CPU
        .InputMemoryType(OrtMemTypeCPUInput, 9)
        .TypeConstraint("T", BuildKernelDefConstraints<float, double, MLFloat16>()),
    VoxelGeneratorV1);

std::vector VoxelGeneratorV1(torch::Tensor points, torch::Tensor ValidInput, std::vector voxel_size, std::vector point_cloud_range, int max_num_points, int max_voxels, int batch_size, int center_offset, int cluster_offset, int supplement) { CHECK_INPUT(points); CHECK_INPUT(ValidInput); auto inputType = points.scalar_type();

int inCols = points.size(1); int num_features = inCols; int outCols = num_features; if(cluster_offset !=0) outCols += 3; if(center_offset !=0) outCols += 3; int N = points.size(0);

const int NDim = 3; std::vector grid_size(3); for (int i = 0; i < NDim; ++i) { grid_size[i] = round((point_cloud_range[NDim + i] - point_cloud_range[i]) / voxel_size[i]); }

int cuda_idx = points.device().index(); auto options = torch::TensorOptions({at::kCUDA, cuda_idx}).dtype(torch::kInt32);

torch::Tensor voxels = torch::zeros({max_voxels batch_size, max_num_points, outCols}, torch::dtype(inputType).device(points.device())); //init 0 torch::Tensor coors = torch::zeros({max_voxels batch_size, 3}, options) - 1; //init -1 torch::Tensor num_points_per_voxel = torch::zeros({max_voxels * batch_size, }, options); //init 0 torch::Tensor ValidOutput = torch::zeros({batch_size, }, options); //init 0

float bev_map = (float)(grid_size[0] grid_size[1] sizeof(float)) / 1024 / 1024; int value_map_z = 40.0 / bev_map;//限制映射图的尺寸,超过40M只用一层

value_map_z = std::max(value_map_z, 1); value_map_z = std::min(value_map_z, grid_size[2]); int mapsize = grid_size[0] grid_size[1] value_map_z;

torch::Tensor map_tensor = torch::zeros({batch_size, mapsize}, options) - 1; int* map_tensor_rw = map_tensor.data_ptr();

torch::Tensor addr_tensor = torch::zeros({batch_sizemax_voxels, }, options) - 1; int addr_tensor_rw = addr_tensor.data_ptr();

const int listBytes = N sizeof(VoxelGeneratorSpace::HashEntry); torch::Tensor list_tensor = torch::zeros({listBytes / 4, }, options); VoxelGeneratorSpace::HashEntry list_tensor_rw = reinterpret_cast<VoxelGeneratorSpace::HashEntry*>(list_tensor.data_ptr());

const int ValidInput_ptr = ValidInput.data_ptr(); int coors_ptr = coors.data_ptr(); int num_points_per_voxel_ptr = num_points_per_voxel.data_ptr(); int ValidOutput_ptr = ValidOutput.data_ptr();

if(inputType == torch::kFloat32) { const float points_ptr = points.data_ptr(); float voxels_ptr = voxels.data_ptr();

VoxelGeneratorSpace::cuda_points_to_voxel(points_ptr, ValidInput_ptr,
                                          coors_ptr, num_points_per_voxel_ptr, voxels_ptr, ValidOutput_ptr,
                                          map_tensor_rw, addr_tensor_rw, list_tensor_rw,
                                          point_cloud_range, voxel_size, grid_size,
                                          batch_size, N, inCols, outCols,
                                          cluster_offset, center_offset, supplement, max_voxels, max_num_points, value_map_z);

} else if(inputType == torch::kHalf) { const half *points_ptr = reinterpret_cast<half>(points.data_ptr()); __half voxels_ptr = reinterpret_cast<__half*>(voxels.data_ptr());

VoxelGeneratorSpace::cuda_points_to_voxel_fp16(points_ptr, ValidInput_ptr,
                                               coors_ptr, num_points_per_voxel_ptr, voxels_ptr, ValidOutput_ptr,
                                               map_tensor_rw, addr_tensor_rw, list_tensor_rw,
                                               point_cloud_range, voxel_size, grid_size,
                                               batch_size, N, inCols, outCols,
                                               cluster_offset, center_offset, supplement, max_voxels, max_num_points, value_map_z);

} else { cout<< "error inputs type in VoxelGeneratorV1: " << inputType << endl; }

return {voxels.contiguous(), coors.contiguous(), ValidOutput.contiguous(), num_points_per_voxel}; }

// OpKernel entry point: unpacks the ten CPU-pinned inputs and forwards
// them to the torch-based voxelizer above.
//
// NOTE(review): as pasted, the body referenced `context` although the
// parameter is named `ctx` (fixed below), and the forwarded call dropped
// the `voxel_size` argument (restored below). The `Input(i)` calls are
// still pseudocode — ONNX Runtime's real API is
// `ctx->Input<Tensor>(i)` returning `const Tensor*`, from which scalars
// and vectors must be extracted — and the voxelizer's returned tensors are
// discarded instead of being copied into `ctx` outputs. Both must be
// addressed before this can work as an ORT kernel.
Status VoxelGeneratorV1::Compute(OpKernelContext* ctx) const {
  torch::Tensor points = ctx->Input(0);
  torch::Tensor ValidInput = ctx->Input(1);
  std::vector<float> voxel_size = ctx->Input(2);
  std::vector<float> point_cloud_range = ctx->Input(3);
  int max_num_points = ctx->Input(4);
  int max_voxels = ctx->Input(5);
  int batch_size = ctx->Input(6);
  int center_offset = ctx->Input(7);
  int cluster_offset = ctx->Input(8);
  int supplement = ctx->Input(9);

  VoxelGeneratorV1(points, ValidInput, voxel_size, point_cloud_range,
                   max_num_points, max_voxels, batch_size,
                   center_offset, cluster_offset, supplement);

  return Status::OK();
}

}  // namespace cuda
}  // namespace contrib
}  // namespace onnxruntime

hariharans29 commented 2 years ago

I think nvcc is giving you some errors here -

/home/zyx/Desktop/pointpillars_pytorch_trt/pytorch/tools/onnxruntime/onnxruntime/contrib_ops/cuda/cuda_voxel_generator.cu(299): error: explicit stream argument not provided in kernel launch

/home/zyx/Desktop/pointpillars_pytorch_trt/pytorch/tools/onnxruntime/onnxruntime/contrib_ops/cuda/cuda_voxel_generator.cu(317): error: explicit stream argument not provided in kernel launch

Can you check on these ?