Open DoubleWJX opened 6 years ago
您好,我也遇到了这个问题,请问您是怎么解决的?
@YanHengxu 我重点修改了反向传播的代码,其他代码可能也有改动,参考了 caffe2 的 roi_align。不过从目前的实验结果看,ps_roi_align 相比 ps_roi_pooling 没有提升,甚至略有下降,但训练过程确实收敛了。请你帮我看一下,代码可能还有些问题:
// --------------------------------------------------------
// R-FCN
// Written by Afanti <afanti.deng@gmail.com>
// --------------------------------------------------------
using std::max; using std::min;
namespace caffe {
// Bilinear interpolation of bottom_data at the fractional position (h, w);
// the interpolated value is written to val.
// NOTE(review): the signature that should follow `template` is missing from
// this copy of the code, so the parameter list cannot be confirmed here.
template
// Positions more than half a pixel outside the feature map yield 0.
if (h < -0.5 || h > height - 0.5 || w < -0.5 || w > width - 0.5){
val = Dtype(0);
return;
}
// Snap slightly negative coordinates onto the first row/column.
if (h <= 0) h = 0;
if (w <= 0) w = 0;
int h_high; // row just above h (h_low + 1, clamped to height - 1 below)
int w_high; // column just above w (w_low + 1, clamped to width - 1 below)
int h_low = (int) h; // h truncated to an integer (h >= 0 at this point)
int w_low = (int) w; // w truncated to an integer (w >= 0 at this point)
// On the last column both neighbours collapse onto width - 1.
if (w_low >= width - 1) {
w_high = w_low = width - 1;
w = (Dtype) w_low;
} else
w_high = w_low + 1;
// On the last row both neighbours collapse onto height - 1.
if (h_low >= height - 1) {
h_high = h_low = height - 1;
h = (Dtype) h_low;
} else
h_high = h_low + 1;
// Fractional distances to the low neighbours and their complements.
Dtype l_dh = h - h_low, l_dw = w - w_low;
Dtype h_dh = 1. - l_dh, h_dw = 1. - l_dw;
// Perform the bilinear interpolation over the four neighbouring cells.
Dtype u1 = bottom_data[h_low * width + w_low];
Dtype u2 = bottom_data[h_low * width + w_high];
Dtype u3 = bottom_data[h_high * width + w_low];
Dtype u4 = bottom_data[h_high * width + w_high];
Dtype w1 = h_dh * h_dw, w2 = h_dh * l_dw, w3 = l_dh * h_dw, w4 = l_dh * l_dw;
val = (w1 * u1 + w2 * u2 + w3 * u3 + w4 * u4);
}
// Body of the forward pooling kernel: averages bilinear samples taken inside
// one bin of one ROI and stores the result in top_data[index].
// NOTE(review): the signature, the kernel-loop header and the derivation of
// pw / ph / ctop / n from index are missing from this copy of the code.
template
// [start, end) interval for spatial sampling
// Each ROI is read as 5 values: batch index, then x1, y1, x2, y2.
bottom_rois += n * 5;
int roi_batch_ind = bottom_rois[0];
Dtype roi_start_w = bottom_rois[1] * spatial_scale;
Dtype roi_start_h = bottom_rois[2] * spatial_scale;
Dtype roi_end_w = (bottom_rois[3] + 1.) * spatial_scale;
Dtype roi_end_h = (bottom_rois[4] + 1.) * spatial_scale;
// Clamp the ROI extent to at least 0.1 so the bin size is never zero.
Dtype roi_width = max(roi_end_w - roi_start_w, 0.1); // avoid 0
Dtype roi_height = max(roi_end_h - roi_start_h, 0.1);
// Compute w and h at bottom
Dtype bin_size_h = roi_height / static_cast<Dtype>(pooled_height);
Dtype bin_size_w = roi_width / static_cast<Dtype>(pooled_width);
// Floating-point start and end of the current bin along h and w, first
// relative to the ROI origin.
Dtype hstart = static_cast<Dtype>(ph) * bin_size_h;
Dtype wstart = static_cast<Dtype>(pw) * bin_size_w;
Dtype hend = static_cast<Dtype>(ph + 1.) * bin_size_h;
Dtype wend = static_cast<Dtype>(pw + 1.) * bin_size_w;
// Add roi offsets and clip to input boundaries
hstart = min(max(hstart + roi_start_h, Dtype(0)), Dtype(height));
hend = min(max(hend + roi_start_h, Dtype(0)), Dtype(height));
wstart = min(max(wstart + roi_start_w, Dtype(0)), Dtype(width));
wend = min(max(wend + roi_start_w, Dtype(0)), Dtype(width));
bool is_empty = (hend <= hstart) || (wend <= wstart);
// Input channel for this (ctop, ph, pw) bin.
int gw = pw;
int gh = ph;
int c = (ctop*group_size + gh)*group_size + gw; //
// Spacing between sampling points on the input feature map (floating
// point), along h and w.
Dtype sample_h = bin_size_h / (sample_num + 1.);
Dtype sample_w = bin_size_w / (sample_num + 1.);
Dtype val = 0;
bottom_data += (roi_batch_ind * channels + c) * height * width;
Dtype out_sum = 0.0;
// Visit the sample_num x sample_num grid of sampling points.
for (int i = 1; i <= sample_num; ++i) {
for (int j = 1; j <= sample_num; ++j) {
Dtype cur_h = hstart + i * sample_h;
Dtype cur_w = wstart + j * sample_w;
// Samples at or past the clipped bin end are skipped.
if (cur_h >= hend || cur_w >= wend) continue;
bilinear_interpolate(bottom_data, height, width, cur_h, cur_w, val);
out_sum += val;
}
}
// Dtype bin_area = (hend - hstart) * (wend - wstart);
// The divisor is always sample_num^2, including when samples were skipped.
top_data[index] = is_empty ? 0. : out_sum / static_cast<Dtype>(sample_num * sample_num);
// Record the channel that was read for this output element.
mapping_channel[index] = c;
}
}
// NOTE(review): two bare `template` lines follow; the definition that belonged
// to the first one and the signature of the second are missing from this copy.
template
// Computes the four bilinear weights (w1..w4) and the neighbouring cell
// indices (w_low, w_high, h_low, h_high) for the fractional position (h, w).
template
// Positions more than half a pixel outside the feature map get zero weights
// and indices of -1.
if (h < -0.5 || h > height - 0.5 || w < -0.5 || w > width - 0.5){
w1 = w2 = w3 = w4 = Dtype(0);
w_low = w_high = h_low = h_high = -1;
return;
}
// Snap slightly negative coordinates onto the first row/column.
if (h <= 0) h = 0;
if (w <= 0) w = 0;
h_low = (int) h; // h truncated to an integer (h >= 0 at this point)
w_low = (int) w; // w truncated to an integer (w >= 0 at this point)
// On the last column both neighbours collapse onto width - 1.
if (w_low >= width - 1) {
w_low = w_high = width - 1;
w = (Dtype) w_low;
} else
w_high = w_low + 1;
// On the last row both neighbours collapse onto height - 1.
if (h_low >= height - 1) {
h_high = h_low = height - 1;
h = (Dtype) h_low;
} else
h_high = h_low + 1;
// Fractional distances to the low neighbours and their complements.
Dtype l_dh = h - h_low;
Dtype l_dw = w - w_low;
Dtype h_dh = 1. - l_dh, h_dw = 1. - l_dw;
// Bilinear weights for (low,low), (low,high), (high,low), (high,high).
w1 = h_dh * h_dw, w2 = h_dh * l_dw, w3 = l_dh * h_dw, w4 = l_dh * l_dw;
}
// Body of the backward kernel: distributes top_diff[index] over the input
// cells touched by the sampling points of the corresponding bin.
// NOTE(review): the signature, the kernel-loop header and the derivation of
// pw / ph / n from index are missing from this copy of the code.
template
// ---- Compute the range on the input feature map covered by the current pooled output element ----
// [start, end) interval for spatial sampling
// Each ROI is read as 5 values: batch index, then x1, y1, x2, y2.
bottom_rois += n * 5;
int roi_batch_ind = bottom_rois[0];
Dtype roi_start_w = bottom_rois[1] * spatial_scale;
Dtype roi_start_h = bottom_rois[2] * spatial_scale;
Dtype roi_end_w = (bottom_rois[3] + 1.) * spatial_scale;
Dtype roi_end_h = (bottom_rois[4] + 1.) * spatial_scale;
// Clamp the ROI extent to at least 0.1 so the bin size is never zero.
Dtype roi_width = max(roi_end_w - roi_start_w, 0.1); // avoid 0
Dtype roi_height = max(roi_end_h - roi_start_h, 0.1);
// Compute w and h at bottom
Dtype bin_size_h = roi_height / static_cast<Dtype>(pooled_height);
Dtype bin_size_w = roi_width / static_cast<Dtype>(pooled_width);
// Floating-point start and end of the current bin along h and w, first
// relative to the ROI origin.
Dtype hstart = static_cast<Dtype>(ph) * bin_size_h;
Dtype wstart = static_cast<Dtype>(pw) * bin_size_w;
Dtype hend = static_cast<Dtype>(ph + 1.) * bin_size_h;
Dtype wend = static_cast<Dtype>(pw + 1.) * bin_size_w;
// Add roi offsets and clip to input boundaries
hstart = min(max(hstart + roi_start_h, Dtype(0)), Dtype(height));
hend = min(max(hend + roi_start_h, Dtype(0)), Dtype(height));
wstart = min(max(wstart + roi_start_w, Dtype(0)), Dtype(width));
wend = min(max(wend + roi_start_w, Dtype(0)), Dtype(width));
bool is_empty = (hend <= hstart) || (wend <= wstart);
// -------------------------------------------------------------------------------------
// Compute c at bottom
int c = mapping_channel[index];
Dtype* offset_bottom_diff = bottom_diff +
(roi_batch_ind * channels + c) * height * width;
// Each sample receives an equal 1 / sample_num^2 share of the gradient.
Dtype diff_val = is_empty ? 0. : top_diff[index] / (sample_num * sample_num);
// Same sampling-point spacing as computed in the forward fragment.
Dtype sample_h = bin_size_h / (sample_num + 1.);
Dtype sample_w = bin_size_w / (sample_num + 1.);
// Visit the sample_num x sample_num grid of sampling points.
for (int i = 1; i <= sample_num; ++i) {
for (int j = 1; j <= sample_num; ++j) {
Dtype cur_h = hstart + i * sample_h;
Dtype cur_w = wstart + j * sample_w;
// Samples at or past the clipped bin end are skipped.
if (cur_h >= hend || cur_w >= wend) continue;
Dtype w1, w2, w3, w4;
int x_low, x_high, y_low, y_high;
bilinear_interpolate_gradient(
height, width, cur_h, cur_w,
w1, w2, w3, w4,
x_low, x_high, y_low, y_high
);
Dtype g1 = diff_val * w1;
Dtype g2 = diff_val * w2;
Dtype g3 = diff_val * w3;
Dtype g4 = diff_val * w4;
// Indices of -1 mark an out-of-range sample; nothing is accumulated then.
if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
caffe_gpu_atomic_add(g1, offset_bottom_diff + y_low * width + x_low);
caffe_gpu_atomic_add(g2, offset_bottom_diff + y_low * width + x_high);
caffe_gpu_atomic_add(g3, offset_bottom_diff + y_high * width + x_low);
caffe_gpu_atomic_add(g4, offset_bottom_diff + y_high * width + x_high);
}
}
}
// The block below is commented out; it refers to sample_pos_diff, which is
// not defined anywhere in the visible code.
/*
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
for(int i = 0; i < sample_num * sample_num; ++i){
Dtype d_h = abs(sample_pos_diff[2*i + 0] - h);
Dtype d_w = abs(sample_pos_diff[2*i + 1] - w);
if(d_h < 1 && d_w < 1){
int bottom_index = h*width + w;
caffe_gpu_atomic_add((1 - d_h)*(1 - d_w)*diff_val, offset_bottom_diff + bottom_index);
}
}
}
}
*/
}
}
// Host-side backward pass: zeroes the gradient buffers and launches
// PSROIAlignBackwardAtomic over every element of top[0].
// NOTE(review): the signature that should follow `template` is missing from
// this copy of the code.
template
const Dtype* bottom_rois = bottom[1]->gpu_data();
const Dtype* top_diff = top[0]->gpu_diff();
Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
const int bottom_count = bottom[0]->count();
const int* mapping_channel_ptr = mapping_channel_.gpu_data();
// The ROI input (bottom[1]) gets an all-zero gradient.
caffe_gpu_set(bottom[1]->count(), Dtype(0), bottom[1]->mutable_gpu_diff());
// The kernel accumulates into bottom_diff, so it is cleared first.
caffe_gpu_set(bottom_count, Dtype(0), bottom_diff);
const int count = top[0]->count();
// NOLINT_NEXT_LINE(whitespace/operators)
PSROIAlignBackwardAtomic<Dtype> << <CAFFE_GET_BLOCKS(count),
CAFFE_CUDA_NUM_THREADS >> >(count, top_diff, mapping_channel_ptr,
top[0]->num(), spatial_scale_, channels_, height_, width_,
pooled_height_, pooled_width_, output_dim_, bottom_diff,
bottom_rois, sample_num_);
CUDA_POST_KERNEL_CHECK;
}
INSTANTIATE_LAYER_GPU_FUNCS(PSROIAlignLayer);
} // namespace caffe
谢谢您的代码,我看看反向传播这块
@YanHengxu 我重点修改了反向传播的代码,其他代码可能也有改动,参考了 caffe2 的 roi_align。不过从目前的实验结果看,ps_roi_align 相比 ps_roi_pooling 没有提升,甚至略有下降,但训练过程确实收敛了。请你帮我看一下,代码可能还有些问题:
// --------------------------------------------------------
// R-FCN
// Written by Afanti <afanti.deng@gmail.com>
// --------------------------------------------------------
include
include
include
include
#include "caffe/layers/psroi_align_layer.hpp"
#include "caffe/util/gpu_util.cuh"
using std::max; using std::min;
namespace caffe {
/**
 * @brief Bilinearly interpolates one feature-map plane at a fractional point.
 *
 * @param bottom_data Pointer to a single height x width plane (row-major).
 * @param height      Number of rows in the plane.
 * @param width       Number of columns in the plane.
 * @param h           Fractional row coordinate of the sample.
 * @param w           Fractional column coordinate of the sample.
 * @param val         Output: interpolated value, or 0 when (h, w) lies more
 *                    than half a pixel outside the plane.
 */
template <typename Dtype>
__device__ void bilinear_interpolate(
    const Dtype* bottom_data, const int height, const int width,
    Dtype h, Dtype w, Dtype& val) {
  // Samples more than half a pixel outside the plane contribute nothing.
  if (h < -0.5 || h > height - 0.5 || w < -0.5 || w > width - 0.5) {
    val = Dtype(0);
    return;
  }
  // Snap slightly negative coordinates onto the first row/column.
  if (h <= 0) h = 0;
  if (w <= 0) w = 0;

  // h and w are non-negative here, so truncation equals floor.
  int h_low = static_cast<int>(h);
  int w_low = static_cast<int>(w);
  int h_high;
  int w_high;

  // On the last column both neighbours collapse onto width - 1, which makes
  // the fractional part zero.
  if (w_low >= width - 1) {
    w_high = w_low = width - 1;
    w = static_cast<Dtype>(w_low);
  } else {
    w_high = w_low + 1;
  }
  // Same treatment for the last row.
  if (h_low >= height - 1) {
    h_high = h_low = height - 1;
    h = static_cast<Dtype>(h_low);
  } else {
    h_high = h_low + 1;
  }

  // Fractional distances to the low neighbours and their complements.
  const Dtype l_dh = h - h_low;
  const Dtype l_dw = w - w_low;
  const Dtype h_dh = 1. - l_dh;
  const Dtype h_dw = 1. - l_dw;

  // Values of the four neighbouring cells.
  const Dtype u1 = bottom_data[h_low * width + w_low];
  const Dtype u2 = bottom_data[h_low * width + w_high];
  const Dtype u3 = bottom_data[h_high * width + w_low];
  const Dtype u4 = bottom_data[h_high * width + w_high];

  // Bilinear weights; they sum to 1.
  const Dtype w1 = h_dh * h_dw;
  const Dtype w2 = h_dh * l_dw;
  const Dtype w3 = l_dh * h_dw;
  const Dtype w4 = l_dh * l_dw;

  val = (w1 * u1 + w2 * u2 + w3 * u3 + w4 * u4);
}
template global void PSROIAlignForward( const int nthreads, const Dtype bottom_data, const Dtype spatial_scale, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const Dtype bottom_rois, const int output_dim, // 输出通道数 const int group_size, // kk(c+1) 中的 k Dtype top_data, int mapping_channel, const int sample_num) { CUDA_KERNEL_LOOP(index, nthreads) { // The output is in order (n, ctop, ph, pw) int pw = index % pooled_width; int ph = (index / pooled_width) % pooled_height; int ctop = (index / pooled_width / pooled_height) % output_dim; int n = index / pooled_width / pooled_height / output_dim;
// [start, end) interval for spatial sampling bottom_rois += n * 5; int roi_batch_ind = bottom_rois[0]; Dtype roi_start_w = bottom_rois[1] * spatial_scale; Dtype roi_start_h = bottom_rois[2] * spatial_scale; Dtype roi_end_w = (bottom_rois[3] + 1.) * spatial_scale; Dtype roi_end_h = (bottom_rois[4] + 1.) * spatial_scale; // Force too small ROIs to be 1x1 Dtype roi_width = max(roi_end_w - roi_start_w, 0.1); // avoid 0 Dtype roi_height = max(roi_end_h - roi_start_h, 0.1); // Compute w and h at bottom Dtype bin_size_h = roi_height / static_cast<Dtype>(pooled_height); Dtype bin_size_w = roi_width / static_cast<Dtype>(pooled_width); // 获得当前RoI的宽和高在池化前特征图上的起始和结束索引值, 浮点数 Dtype hstart = static_cast<Dtype>(ph) * bin_size_h; Dtype wstart = static_cast<Dtype>(pw) * bin_size_w; Dtype hend = static_cast<Dtype>(ph + 1.) * bin_size_h; Dtype wend = static_cast<Dtype>(pw + 1.) * bin_size_w; // Add roi offsets and clip to input boundaries hstart = min(max(hstart + roi_start_h, Dtype(0)), Dtype(height)); hend = min(max(hend + roi_start_h, Dtype(0)), Dtype(height)); wstart = min(max(wstart + roi_start_w, Dtype(0)), Dtype(width)); wend = min(max(wend + roi_start_w, Dtype(0)), Dtype(width)); bool is_empty = (hend <= hstart) || (wend <= wstart); int gw = pw; int gh = ph; int c = (ctop*group_size + gh)*group_size + gw; // // 在池化前特征图上采样点之间的距离,浮点数 (在 h 和 w 两个方向上) Dtype sample_h = bin_size_h / (sample_num + 1.); Dtype sample_w = bin_size_w / (sample_num + 1.); Dtype val = 0; bottom_data += (roi_batch_ind * channels + c) * height * width; Dtype out_sum = 0.0; for (int i = 1; i <= sample_num; ++i) { for (int j = 1; j <= sample_num; ++j) { Dtype cur_h = hstart + i * sample_h; Dtype cur_w = wstart + j * sample_w; if (cur_h >= hend || cur_w >= wend) continue; bilinear_interpolate(bottom_data, height, width, cur_h, cur_w, val); out_sum += val; } } // Dtype bin_area = (hend - hstart) * (wend - wstart); top_data[index] = is_empty ? 0. 
: out_sum / static_cast<Dtype>(sample_num * sample_num); mapping_channel[index] = c; }
}
template void PSROIAlignLayer::Forwardgpu(const vector<Blob>& bottom, const vector
& top) { const Dtype bottom_data = bottom[0]->gpu_data(); const Dtype bottom_rois = bottom[1]->gpu_data(); Dtype top_data = top[0]->mutable_gpu_data(); int mapping_channel_ptr = mappingchannel.mutable_gpu_data(); int count = top[0]->count(); caffe_gpu_set(count, Dtype(0), top_data); caffe_gpu_set(count, -1, mapping_channel_ptr); // NOLINT_NEXT_LINE(whitespace/operators) PSROIAlignForward << <CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS >> >(count, bottom_data, spatialscale, channels, height, width_, pooledheight, pooledwidth, bottom_rois, outputdim, groupsize, top_data, mapping_channel_ptr, samplenum); CUDA_POST_KERNEL_CHECK; } template device void bilinear_interpolate_gradient( const int height, const int width, Dtype h, Dtype w, Dtype& w1, Dtype& w2, Dtype& w3, Dtype& w4, int& w_low, int& w_high, int& h_low, int& h_high) {
// deal with cases that inverse elements are out of feature map boundary if (h < -0.5 || h > height - 0.5 || w < -0.5 || w > width - 0.5){ w1 = w2 = w3 = w4 = Dtype(0); w_low = w_high = h_low = h_high = -1; return; } if (h <= 0) h = 0; if (w <= 0) w = 0; h_low = (int) h; // h_low 是比 h 小的最大整数 w_low = (int) w; // w_low 是比 w 小的最大整数 if (w_low >= width - 1) { w_low = w_high = width - 1; w = (Dtype) w_low; } else w_high = w_low + 1; if (h_low >= height - 1) { h_high = h_low = height - 1; h = (Dtype) h_low; } else h_high = h_low + 1; Dtype l_dh = h - h_low; Dtype l_dw = w - w_low; Dtype h_dh = 1. - l_dh, h_dw = 1. - l_dw; // 进行双线性内插 w1 = h_dh * h_dw, w2 = h_dh * l_dw, w3 = l_dh * h_dw, w4 = l_dh * l_dw;
}
template global void PSROIAlignBackwardAtomic( const int nthreads, const Dtype top_diff, const int mapping_channel, const int num_rois, const Dtype spatial_scale, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int output_dim, Dtype bottom_diff, const Dtype bottom_rois, const int sample_num) { // 遍历池化后特征图的每一个像素点 CUDA_KERNEL_LOOP(index, nthreads) { // The output is in order (n, ctop, ph, pw) int pw = index % pooled_width; int ph = (index / pooled_width) % pooled_height; int n = index / pooled_width / pooled_height / output_dim;
// ------------------------------------ 计算当前 pooled 后的点在原图中的位置范围 ------------------------------------------------ // [start, end) interval for spatial sampling bottom_rois += n * 5; int roi_batch_ind = bottom_rois[0]; Dtype roi_start_w = bottom_rois[1] * spatial_scale; Dtype roi_start_h = bottom_rois[2] * spatial_scale; Dtype roi_end_w = (bottom_rois[3] + 1.) * spatial_scale; Dtype roi_end_h = (bottom_rois[4] + 1.) * spatial_scale; // Force too small ROIs to be 1x1 Dtype roi_width = max(roi_end_w - roi_start_w, 0.1); // avoid 0 Dtype roi_height = max(roi_end_h - roi_start_h, 0.1); // Compute w and h at bottom Dtype bin_size_h = roi_height / static_cast<Dtype>(pooled_height); Dtype bin_size_w = roi_width / static_cast<Dtype>(pooled_width); // 获得当前RoI的宽和高在池化前特征图上的起始和结束索引值, 浮点数 Dtype hstart = static_cast<Dtype>(ph) * bin_size_h; Dtype wstart = static_cast<Dtype>(pw) * bin_size_w; Dtype hend = static_cast<Dtype>(ph + 1.) * bin_size_h; Dtype wend = static_cast<Dtype>(pw + 1.) * bin_size_w; // Add roi offsets and clip to input boundaries hstart = min(max(hstart + roi_start_h, Dtype(0)), Dtype(height)); hend = min(max(hend + roi_start_h, Dtype(0)), Dtype(height)); wstart = min(max(wstart + roi_start_w, Dtype(0)), Dtype(width)); wend = min(max(wend + roi_start_w, Dtype(0)), Dtype(width)); bool is_empty = (hend <= hstart) || (wend <= wstart); // ------------------------------------------------------------------------------------- // Compute c at bottom int c = mapping_channel[index]; Dtype* offset_bottom_diff = bottom_diff + (roi_batch_ind * channels + c) * height * width; Dtype diff_val = is_empty ? 0. 
: top_diff[index] / (sample_num * sample_num); Dtype sample_h = bin_size_h / (sample_num + 1.); Dtype sample_w = bin_size_w / (sample_num + 1.); // for (int i = 1; i <= sample_num; ++i) { for (int j = 1; j <= sample_num; ++j) { Dtype cur_h = hstart + i * sample_h; Dtype cur_w = wstart + j * sample_w; if (cur_h >= hend || cur_w >= wend) continue; Dtype w1, w2, w3, w4; int x_low, x_high, y_low, y_high; bilinear_interpolate_gradient( height, width, cur_h, cur_w, w1, w2, w3, w4, x_low, x_high, y_low, y_high ); Dtype g1 = diff_val * w1; Dtype g2 = diff_val * w2; Dtype g3 = diff_val * w3; Dtype g4 = diff_val * w4; if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { caffe_gpu_atomic_add(g1, offset_bottom_diff + y_low * width + x_low); caffe_gpu_atomic_add(g2, offset_bottom_diff + y_low * width + x_high); caffe_gpu_atomic_add(g3, offset_bottom_diff + y_high * width + x_low); caffe_gpu_atomic_add(g4, offset_bottom_diff + y_high * width + x_high); } } } /* for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { for(int i = 0; i < sample_num * sample_num; ++i){ Dtype d_h = abs(sample_pos_diff[2*i + 0] - h); Dtype d_w = abs(sample_pos_diff[2*i + 1] - w); if(d_h < 1 && d_w < 1){ int bottom_index = h*width + w; caffe_gpu_atomic_add((1 - d_h)*(1 - d_w)*diff_val, offset_bottom_diff + bottom_index); } } } } */ }
}
template void PSROIAlignLayer::Backwardgpu(const vector<Blob>& top, const vector& propagatedown, const vector<Blob>& bottom) { if (!propagate_down[0]) { return; }
const Dtype* bottom_rois = bottom[1]->gpu_data(); const Dtype* top_diff = top[0]->gpu_diff(); Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); const int bottom_count = bottom[0]->count(); const int* mapping_channel_ptr = mapping_channel_.gpu_data(); caffe_gpu_set(bottom[1]->count(), Dtype(0), bottom[1]->mutable_gpu_diff()); caffe_gpu_set(bottom_count, Dtype(0), bottom_diff); const int count = top[0]->count(); // NOLINT_NEXT_LINE(whitespace/operators) PSROIAlignBackwardAtomic<Dtype> << <CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS >> >(count, top_diff, mapping_channel_ptr, top[0]->num(), spatial_scale_, channels_, height_, width_, pooled_height_, pooled_width_, output_dim_, bottom_diff, bottom_rois, sample_num_); CUDA_POST_KERNEL_CHECK;
}
INSTANTIATE_LAYER_GPU_FUNCS(PSROIAlignLayer);
} // namespace caffe
Hi there, 你好,请问你这里的实现是参考 Face++ 的 light_head_rcnn 实现吗?
他们那个实现里,双线性插值在整数坐标点上的计算貌似有问题,我看到你这里做了修正,是这样吗?谢谢 @DoubleWJX
我把 ps_roi_pooling 换成 ps_roi_align 后,训练几个 batch 之后 loss 开始疯长。除了换成 ps_roi_align,还需要做其他修改吗?谢谢