Hello, I modified the roi_pool_kernel.cu and made it accept 3D input, but I'm not very familiar with the cuda code, can anyone helps me to check if I made some mistakes? Very appreciate!
My input shape is (B, C, H, W, D), roi shape is (num_of_roi, 7) (the order of roi shape is (label, min_width, min_depth, min_height, max_width, max_depth, max_height))
And the output I expect is: (num_of_roi, C, pool_size, pool_size, pool_size)
(I don't know why GitHub break my code to multiple parts, sorry for the inconvenient)
`
include <torch/extension.h>
include <THC/THCAtomics.cuh>
include "cuda_helpers.h"
template global void RoIPoolForward(
const int nthreads,
const T input,
const T spatial_scale,
const int channels,
const int height,
const int width,
const int depth,
const int pooled_height,
const int pooled_width,
const int pooled_depth,
const T rois,
T output,
int argmax_data) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// (n, c, ph, pw) is an element in the pooled output
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int pd = (index / (pooled_widthpooled_height)) % pooled_depth;
int c = (index / (pooled_width pooled_height pooled_depth)) % channels;
int n = index / (pooled_width pooled_height pooled_depth channels);
const T* offset_rois = rois + n * 7;
int roi_batch_ind = offset_rois[0];
int roi_start_w = round(offset_rois[1] * (284/62));//spatial_scale); //for spatial need to change
int roi_start_h = round(offset_rois[3] * (266/60));//spatial_scale); //different side need different number
int roi_start_d = round(offset_rois[2] * (316/124));//spatial_scale);
int roi_end_w = round(offset_rois[4] * (284/62));//spatial_scale);
int roi_end_h = round(offset_rois[6] * (266/60));//spatial_scale);
int roi_end_d = round(offset_rois[5] * (316/124));//spatial_scale);
// Force malformed ROIs to be 1x1
int roi_width = max(roi_end_w - roi_start_w + 1, 1);
int roi_height = max(roi_end_h - roi_start_h + 1, 1);
int roi_depth = max(roi_end_d - roi_start_d + 1, 1);
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
T bin_size_d = static_cast<T>(roi_depth) / static_cast<T>(pooled_depth);
int hstart = static_cast<int>(floor(static_cast<T>(ph) * bin_size_h));
int wstart = static_cast<int>(floor(static_cast<T>(pw) * bin_size_w));
int dstart = static_cast<int>(floor(static_cast<T>(pd) * bin_size_d));
int hend = static_cast<int>(ceil(static_cast<T>(ph + 1) * bin_size_h));
int wend = static_cast<int>(ceil(static_cast<T>(pw + 1) * bin_size_w));
int dend = static_cast<int>(ceil(static_cast<T>(pd + 1) * bin_size_d));
// Add roi offsets and clip to input boundaries
hstart = min(max(hstart + roi_start_h, 0), height);
hend = min(max(hend + roi_start_h, 0), height);
wstart = min(max(wstart + roi_start_w, 0), width);
wend = min(max(wend + roi_start_w, 0), width);
dstart = min(max(dstart + roi_start_d, 0), depth);
dend = min(max(dend + roi_start_d, 0), depth);
bool is_empty = (hend <= hstart) || (wend <= wstart) || (dend <= dstart);
// Define an empty pooling region to be zero
T maxval = is_empty ? 0 : -FLT_MAX;
// If nothing is pooled, argmax = -1 causes nothing to be backprop'd
int maxidx = -1;
const T* offset_input =
input + (roi_batch_ind * channels + c) * height * width * depth;
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
for (int d = dstart; d < dend; ++d) {
int input_index = d*width*height + h * width + w; //h*depth*width + w*depth + d; //
if (offset_input[input_index] > maxval) {
maxval = offset_input[input_index];
maxidx = input_index;
}
}
}
}
output[index] = maxval;
argmax_data[index] = maxidx;
}
}
template global void RoIPoolBackward(
const int nthreads,
const T grad_output,
const int argmax_data,
const int channels,
const int height,
const int width,
const int depth,
const int pooled_height,
const int pooled_width,
const int pooled_depth,
T grad_input,
const T rois,
const int n_stride,
const int c_stride,
const int h_stride,
const int w_stride,
const int d_stride) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// (n, c, ph, pw) is an element in the pooled output
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int pd = (index / (pooled_widthpooled_height)) % pooled_depth;
int c = (index / (pooled_width pooled_height pooled_depth)) % channels;
int n = index / (pooled_width pooled_height pooled_depth channels);
//int c = (index / pooled_width / pooled_height) % channels;
//int n = index / pooled_width / pooled_height / channels;
std::tuple<torch::Tensor, torch::Tensor> roi_pool_forward3d_cuda(const torch::Tensor& input,
const torch::Tensor& rois,
const float spatial_scale,
const int output_size) {
AT_ASSERTM(input.is_cuda(), "input must be a CUDA tensor");
AT_ASSERTM(rois.is_cuda(), "rois must be a CUDA tensor");
const int num_rois = rois.size(0);
const int channels = input.size(1);
const int height = input.size(2);
const int width = input.size(3);
const int depth = input.size(4);
const int pooling_width = output_size;
const int pooling_height = output_size;
const int pooling_depth = output_size;
const auto total_size = num_rois * pooling_height * pooling_width * pooling_depth * channels;
auto output = torch::empty(
{num_rois, channels, pooling_height, pooling_width, pooling_depth}, input.options());
auto argmax = torch::zeros(
{num_rois, channels, pooling_height, pooling_width, pooling_depth},
input.options().dtype(torch::kInt));
const dim3 grid(std::min((total_size + 512 - 1) / 512, 4*4096));
const dim3 block(512);
if (output.numel() == 0) {
return std::make_tuple(output, argmax);
}
AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "RoIPool_forward", [&] {
RoIPoolForward<scalar_t><<<grid, block>>>(
total_size,
input.contiguous().data_ptr<scalar_t>(),
spatial_scale,
channels,
height,
width,
depth,
pooling_width,
pooling_height,
pooling_depth,
rois.contiguous().data_ptr<scalar_t>(),
output.data_ptr<scalar_t>(),
argmax.data_ptr<int>());
});
return std::make_tuple(output, argmax);
}
torch::Tensor roi_pool_backward3d_cuda(const torch::Tensor& grad,
const torch::Tensor& argmax,
const torch::Tensor& input_size,
const torch::Tensor& rois) {
// Check if input tensors are CUDA tensors
AT_ASSERTM(grad.is_cuda(), "grad must be a CUDA tensor");
AT_ASSERTM(rois.is_cuda(), "rois must be a CUDA tensor");
AT_ASSERTM(argmax.is_cuda(), "argmax must be a CUDA tensor");
auto input_size_a = input_size.accessor<int,1>();
const int batch_size = input_size_a[0];
const int channels = input_size_a[1];
const int height = input_size_a[2];
const int width = input_size_a[3];
const int depth = input_size_a[4];
const int num_rois = argmax.size(0);
const int pooling_width = argmax.size(2);
const int pooling_height = argmax.size(3);
const int pooling_depth = argmax.size(4);
const auto total_size = num_rois * pooling_height * pooling_width * pooling_depth * channels;
auto grad_input =
torch::zeros({batch_size, channels, width, depth, height}, grad.options());
const dim3 grid(std::min((total_size + 512 - 1) / 512, 4*4096));
const dim3 block(512);
// handle possibly empty gradients
if (grad.numel() == 0) {
return grad_input;
}
// get stride values to ensure indexing into gradients is correct.
int n_stride = grad.stride(0);
int c_stride = grad.stride(1);
int h_stride = grad.stride(2);
int w_stride = grad.stride(3);
int d_stride = grad.stride(4);
AT_DISPATCH_FLOATING_TYPES_AND_HALF(grad.scalar_type(), "RoIPool_backward", [&] {
RoIPoolBackward<scalar_t><<<grid, block>>>(
grad.numel(),
grad.data_ptr<scalar_t>(),
argmax.contiguous().data_ptr<int>(),
channels,
height,
width,
depth,
pooling_width,
pooling_height,
pooling_depth,
grad_input.data_ptr<scalar_t>(),
rois.contiguous().data_ptr<scalar_t>(),
n_stride,
c_stride,
h_stride,
w_stride,
d_stride);
});
return grad_input;
Hello, I modified the roi_pool_kernel.cu and made it accept 3D input, but I'm not very familiar with the cuda code, can anyone helps me to check if I made some mistakes? Very appreciate!
My input shape is (B, C, H, W, D), roi shape is (num_of_roi, 7) (the order of roi shape is (label, min_width, min_depth, min_height, max_width, max_depth, max_height)) And the output I expect is: (num_of_roi, C, pool_size, pool_size, pool_size) (I don't know why GitHub break my code to multiple parts, sorry for the inconvenient)
`
include <torch/extension.h>
include <THC/THCAtomics.cuh>
include "cuda_helpers.h"
template
global void RoIPoolForward(
const int nthreads,
const T input,
const T spatial_scale,
const int channels,
const int height,
const int width,
const int depth,
const int pooled_height,
const int pooled_width,
const int pooled_depth,
const T rois,
T output,
int argmax_data) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// (n, c, ph, pw) is an element in the pooled output
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int pd = (index / (pooled_widthpooled_height)) % pooled_depth;
int c = (index / (pooled_width pooled_height pooled_depth)) % channels;
int n = index / (pooled_width pooled_height pooled_depth channels);
} }
template
global void RoIPoolBackward(
const int nthreads,
const T grad_output,
const int argmax_data,
const int channels,
const int height,
const int width,
const int depth,
const int pooled_height,
const int pooled_width,
const int pooled_depth,
T grad_input,
const T rois,
const int n_stride,
const int c_stride,
const int h_stride,
const int w_stride,
const int d_stride) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// (n, c, ph, pw) is an element in the pooled output
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int pd = (index / (pooled_widthpooled_height)) % pooled_depth;
int c = (index / (pooled_width pooled_height pooled_depth)) % channels;
int n = index / (pooled_width pooled_height pooled_depth channels);
//int c = (index / pooled_width / pooled_height) % channels;
//int n = index / pooled_width / pooled_height / channels;
} }
std::tuple<torch::Tensor, torch::Tensor> roi_pool_forward3d_cuda(const torch::Tensor& input, const torch::Tensor& rois, const float spatial_scale, const int output_size) { AT_ASSERTM(input.is_cuda(), "input must be a CUDA tensor"); AT_ASSERTM(rois.is_cuda(), "rois must be a CUDA tensor");
}
torch::Tensor roi_pool_backward3d_cuda(const torch::Tensor& grad, const torch::Tensor& argmax, const torch::Tensor& input_size, const torch::Tensor& rois) { // Check if input tensors are CUDA tensors AT_ASSERTM(grad.is_cuda(), "grad must be a CUDA tensor"); AT_ASSERTM(rois.is_cuda(), "rois must be a CUDA tensor"); AT_ASSERTM(argmax.is_cuda(), "argmax must be a CUDA tensor");
} ``