husheng12345 opened this issue 1 year ago
Hello, there might be some issues with your augmentation. Please try the following `spatial_transform` function.

P.S. We also apply an augmentation to the input images of the network, converting them to grayscale with a probability of 0.1.
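Something along these lines (a simplified sketch with an assumed helper name, not our exact code):

```python
import cv2
import numpy as np

def maybe_grayscale(img, p=0.1):
    # With probability p, replace the RGB image with its grayscale
    # version replicated across the three channels.
    if np.random.rand() < p:
        gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
        img = np.stack([gray] * 3, axis=-1)
    return img
```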
```python
import random

import cv2
import numpy as np

def random_vertical_disp(self, inputs, angle, px, diff_angle=0, order=2, reshape=False):
    px2 = random.uniform(-px, px)
    angle2 = random.uniform(-angle, angle)
    # rotate the right image around a randomly chosen center
    image_center = (np.random.uniform(0, inputs[1].shape[0]),
                    np.random.uniform(0, inputs[1].shape[1]))
    rot_mat = cv2.getRotationMatrix2D(image_center, angle2, 1.0)
    inputs[1] = cv2.warpAffine(inputs[1], rot_mat, inputs[1].shape[1::-1], flags=cv2.INTER_LINEAR)
    # vertical translation, currently disabled:
    # trans_mat = np.float32([[1, 0, 0], [0, 1, px2]])
    # inputs[1] = cv2.warpAffine(inputs[1], trans_mat, inputs[1].shape[1::-1], flags=cv2.INTER_LINEAR)
    return inputs

# gt already filtered based on AO
def spatial_transform(self, im1, im2, im3, gt=None, conf=None):
    # randomly sample scale
    ht, wd = im2.shape[:2]
    min_scale = np.maximum(
        (self.crop_size[0] + 8) / float(ht),
        (self.crop_size[1] + 8) / float(wd))

    scale = 2 ** np.random.uniform(self.min_scale, self.max_scale)
    scale_x = scale
    scale_y = scale
    if np.random.rand() < self.stretch_prob:
        scale_x *= 2 ** np.random.uniform(-self.max_stretch, self.max_stretch)
        scale_y *= 2 ** np.random.uniform(-self.max_stretch, self.max_stretch)

    scale_x = np.clip(scale_x, min_scale, None)
    scale_y = np.clip(scale_y, min_scale, None)

    if np.random.rand() < self.spatial_aug_prob:
        # rescale the images (the disparity values are scaled by scale_x as well)
        im1 = cv2.resize(im1, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR)
        im2 = cv2.resize(im2, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR)
        im3 = cv2.resize(im3, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR)
        if gt is not None:
            gt = cv2.resize(gt, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_NEAREST) * scale_x
            conf = cv2.resize(conf, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_NEAREST)

    if self.do_flip:
        if np.random.rand() < self.h_flip_prob:  # h-flip: swap left and right views
            tmp_left = im1[:, ::-1]
            tmp_center = im2[:, ::-1]
            tmp_right = im3[:, ::-1]
            im1 = tmp_right
            im2 = tmp_center
            im3 = tmp_left
            if gt is not None:
                gt = gt[:, ::-1]
                conf = conf[:, ::-1]

        if np.random.rand() < self.v_flip_prob:  # v-flip
            im1 = im1[::-1, :]
            im2 = im2[::-1, :]
            im3 = im3[::-1, :]
            if gt is not None:
                gt = gt[::-1, :]
                conf = conf[::-1, :]

    # allow full size crops
    y0 = np.random.randint(2, im2.shape[0] - self.crop_size[0] - 2)
    x0 = np.random.randint(2, im2.shape[1] - self.crop_size[1] - 2)
    y1 = y0 + np.random.randint(-2, 2 + 1)  # small vertical jitter for the right view

    # channels 0:3 hold the original images, channels 3:6 the color-augmented ones
    im1_o = im1[:, :, :3][y0:y0 + self.crop_size[0], x0:x0 + self.crop_size[1]]
    im2_o = im2[:, :, :3][y0:y0 + self.crop_size[0], x0:x0 + self.crop_size[1]]
    im3_o = im3[:, :, :3][y0:y0 + self.crop_size[0], x0:x0 + self.crop_size[1]]
    im1_aug = im1[:, :, 3:6][y0:y0 + self.crop_size[0], x0:x0 + self.crop_size[1]]
    im2_aug = im2[:, :, 3:6][y0:y0 + self.crop_size[0], x0:x0 + self.crop_size[1]]
    im3_aug = im3[:, :, 3:6][y1:y1 + self.crop_size[0], x0:x0 + self.crop_size[1]]
    im1 = np.concatenate((im1_o, im1_aug), -1)
    im2 = np.concatenate((im2_o, im2_aug), -1)
    im3 = np.concatenate((im3_o, im3_aug), -1)

    if gt is not None:
        gt = gt[y0:y0 + self.crop_size[0], x0:x0 + self.crop_size[1]]
        conf = conf[y0:y0 + self.crop_size[0], x0:x0 + self.crop_size[1]]

    angle = 0; px = 0
    if np.random.binomial(1, 0.5):
        angle = 0.1; px = 3
    augmented = self.random_vertical_disp([im2[:, :, 3:6], im3[:, :, 3:6]], angle, px)

    # random occlusion to right image
    if np.random.rand() < self.eraser_aug_prob:
        sx = int(np.random.uniform(50, 100))
        sy = int(np.random.uniform(50, 100))
        cx = int(np.random.uniform(sx, im3.shape[0] - sx))
        cy = int(np.random.uniform(sy, im3.shape[1] - sy))
        augmented[1][cx - sx:cx + sx, cy - sy:cy + sy] = np.mean(
            np.mean(augmented[1], 0), 0)[np.newaxis, np.newaxis]

    im2 = np.concatenate((im2[:, :, :3], augmented[0]), -1)
    im3 = np.concatenate((im3[:, :, :3], augmented[1]), -1)

    return im1, im2, im3, gt, conf

def __call__(self, im0, im1, im2, gt=None, conf=None):
    im0c, im1c, im2c = self.color_transform(im0, im1, im2)
    im0, im1, im2, gt, conf = self.spatial_transform(
        np.concatenate((im0, im0c), -1),
        np.concatenate((im1, im1c), -1),
        np.concatenate((im2, im2c), -1),
        gt, conf)
    # ...
```
I greatly appreciate your response. I would like to ask two additional questions.

The first concerns these commented-out lines in `random_vertical_disp`:

```python
# trans_mat = np.float32([[1, 0, 0], [0, 1, px2]])
# inputs[1] = cv2.warpAffine(inputs[1], trans_mat, inputs[1].shape[1::-1], flags=cv2.INTER_LINEAR)
```

The second: what are the default values of the data augmentation parameters you use, and are they exactly the same as those used by RAFT-Stereo? https://github.com/princeton-vl/RAFT-Stereo/blob/main/core/utils/augmentor.py#L60-L80
```python
class FlowAugmentor:
    def __init__(self, crop_size, min_scale=-0.2, max_scale=0.5, do_flip=True, yjitter=False,
                 saturation_range=[0.6, 1.4], gamma=[1, 1, 1, 1]):
        # spatial augmentation params
        self.crop_size = crop_size
        self.min_scale = min_scale
        self.max_scale = max_scale
        self.spatial_aug_prob = 1.0
        self.stretch_prob = 0.8
        self.max_stretch = 0.2

        # flip augmentation params
        self.yjitter = yjitter
        self.do_flip = do_flip
        self.h_flip_prob = 0.5
        self.v_flip_prob = 0.1

        # photometric augmentation params
        self.photo_aug = Compose([ColorJitter(brightness=0.4, contrast=0.4, saturation=saturation_range, hue=0.5/3.14), AdjustGamma(*gamma)])
        self.asymmetric_color_aug_prob = 0.2
        self.eraser_aug_prob = 0.5
```
You're welcome! I'll answer your two additional questions:
Hello, I have updated the data augmentation code following your guidance. Here are the results I obtained:

| Model | KITTI-15 (>3px All) | Midd-T Full (>2px All) |
|---|---|---|
| Official pretrained weights | 5.41 | 16.38 |
| Trained with 474943f | 6.06 | 22.36 |
| Trained with new version | 6.12 | 20.94 |
I have yet to fully reproduce the paper's results. Do you have any other recommendations?
Additionally, I have another question. https://github.com/fabiotosi92/NeRF-Supervised-Deep-Stereo/blob/7f88ef280e558e0981fc2f42871c3c914b79b65f/code_snippets/losses.py#L68 In this line of code, `disp_diff.shape` is (B,1,H,W), `conf.shape` is (B,H,W), and `target_disp.shape` is (B,1,H,W). It appears this leads to an unintended PyTorch broadcast, so the product has shape (B,B,H,W). Do you believe this is a bug, or is it a misunderstanding on my part? Thank you.
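A minimal snippet illustrating the shapes in question (the `unsqueeze` fix at the end is just one possible repair, not necessarily what the authors intend):

```python
import torch

B, H, W = 4, 8, 8
disp_diff = torch.rand(B, 1, H, W)  # (B, 1, H, W)
conf = torch.rand(B, H, W)          # (B, H, W)

# Broadcasting right-aligns shapes:
# (B, 1, H, W) * (B, H, W) -> (B, 1, H, W) * (1, B, H, W) -> (B, B, H, W)
print((disp_diff * conf).shape)               # torch.Size([4, 4, 8, 8])

# Giving conf an explicit channel dimension restores the intended shape:
print((disp_diff * conf.unsqueeze(1)).shape)  # torch.Size([4, 1, 8, 8])
```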
Hello! A bit of improvement on Middlebury, but I suppose there's still something different. What crop size did you use during training? In our experiments, we used CROP_HEIGHT=384 and CROP_WIDTH=768.
By the way, you're absolutely right. It seems we might have accidentally introduced an error in the commented code section in losses.py while cleaning up the code. I'll make sure to address and fix that file as soon as I can.
Thank you kindly for your response.
In my experiment, the crop size was [384, 768], defined in this line of code:

```python
parser.add_argument('--image_size', type=int, nargs='+', default=[384, 768], help="size of the random image crops used during training.")
```
Furthermore, I re-ran the experiment with the corrected loss function, but the results were roughly unchanged. My hypothesis is that the `target_disp > 0` mask may have partially filtered out the effects of the erroneous broadcast.
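Shape-wise, the mask broadcasts over the spurious batch axis as well, which might explain why it partially hides the error (an illustration of my hypothesis, not the actual losses.py code):

```python
import torch

B, H, W = 4, 8, 8
disp_diff = torch.rand(B, 1, H, W)
conf = torch.rand(B, H, W)
target_disp = torch.rand(B, 1, H, W)

valid = target_disp > 0    # (B, 1, H, W)
prod = disp_diff * conf    # erroneously (B, B, H, W)

# The mask zeroes invalid pixels in every cross-sample term, but it
# cannot undo the cross-sample mixing introduced by the broadcast.
masked = prod * valid
print(masked.shape)        # torch.Size([4, 4, 8, 8])
```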
I believe there might still be something (perhaps subtle) missing to accurately reproduce our results. In the next few days, I'll rerun the training to make sure there are no issues with the released stereo data. If necessary, I'll do a more thorough check of the differences between the code in your repository and ours.
@husheng12345 Hello, can you now get the same results with the weights provided by the author?
@Liyunfengabc There are no updates from my side.
@husheng12345 Hi, maybe the KITTI D1 difference comes from the treatment of negative disparities in your implementation.
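For reference, a sketch of the KITTI D1 metric with one hypothetical treatment of negative disparities (clamping predictions to a small positive value; the clamp is an assumption about what is meant here, not something confirmed in this thread):

```python
import numpy as np

def d1_error(pred, gt, valid, clamp_min=1e-3):
    # Hypothetical handling: clamp non-positive predicted disparities
    # to a small positive value before evaluation.
    pred = np.clip(pred, clamp_min, None)
    err = np.abs(pred - gt)
    # KITTI-2015 D1: a pixel is an outlier if its error exceeds
    # both 3 px and 5% of the ground-truth disparity.
    outlier = (err > 3.0) & (err > 0.05 * gt)
    return 100.0 * outlier[valid].mean()
```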
@CaptainEven May I ask you to explain in more detail how to fix this issue? Additionally, have you been successful in reproducing the results presented in the paper?
Would you be able to offer some guidance on which of my training hyperparameters might not be appropriately set? Thank you.