JiawangBian / sc_depth_pl

SC-Depth (V1, V2, and V3) for Unsupervised Monocular Depth Estimation. Webpage: https://jiawangbian.github.io/sc_depth_pl/
GNU General Public License v3.0

Get absolute distance values from the KITTI GT depth map #6

Open letdivedeep opened 2 years ago

letdivedeep commented 2 years ago

@JiawangBian Thanks for the wonderful work!

I want to get the absolute distances of objects from the KITTI GT depth maps provided. I have downloaded the KITTI raw dataset linked in the repo.

To load the KITTI GT depth map, I used the following code:

import glob

import cv2
import numpy as np
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix

images = sorted(glob.glob("kitti/training/2011_09_26_drive_0001_sync_02/*.jpg"))
depth_maps = sorted(glob.glob("kitti/training/2011_09_26_drive_0001_sync_02/depth/*.npz"))

print("There are", len(images), "images with", len(depth_maps), "depth maps")

index = 32

# load the GT depth map (stored as a scipy CSR sparse matrix)
gt_depth_map = np.load(depth_maps[index])
gt_depth_map = csr_matrix((gt_depth_map['data'], gt_depth_map['indices'], gt_depth_map['indptr']),
                          shape=gt_depth_map['shape'])
gt_depth_map = gt_depth_map.toarray()

f, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 10))
ax1.imshow(cv2.cvtColor(cv2.imread(images[index]), cv2.COLOR_BGR2RGB))
ax1.set_title('Image', fontsize=30)
ax2.imshow(gt_depth_map)
ax2.set_title('Depth Map', fontsize=30)

The resulting output is shown below:

Screenshot 2022-01-20 at 8 35 08 PM

Then, to get the bounding boxes, I used a YOLOv4 model:

import time

import tensorflow as tf
from yolov4.tf import YOLOv4  # assuming the TensorFlow "yolov4" PyPI package, which provides this API

yolo = YOLOv4()
yolo.classes = "Yolov4/coco.names"
yolo.make_model()
yolo.load_weights("Yolov4/yolov4.weights", weights_type="yolo")

def run_obstacle_detection(img):
    start_time = time.time()
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    resized_image = yolo.resize_image(img)
    # 0 ~ 255 to 0.0 ~ 1.0
    resized_image = resized_image / 255.
    # input_data == Dim(1, input_size, input_size, channels)
    input_data = resized_image[np.newaxis, ...].astype(np.float32)

    candidates = yolo.model.predict(input_data)

    # reshape each output scale to Dim(1, candidates, bbox) and concatenate them
    _candidates = []
    for candidate in candidates:
        grid_size = candidate.shape[1]
        _candidates.append(tf.reshape(candidate, shape=(1, grid_size * grid_size * 3, -1)))
    candidates = np.concatenate(_candidates, axis=1)

    # pred_bboxes == Dim(candidates, (x, y, w, h, class_id, prob))
    pred_bboxes = yolo.candidates_to_pred_bboxes(candidates[0], iou_threshold=0.35, score_threshold=0.40)
    # drop all-zero rows (https://stackoverflow.com/questions/35673095/python-how-to-eliminate-all-the-zero-rows-from-a-matrix-in-numpy?lq=1)
    pred_bboxes = pred_bboxes[~(pred_bboxes == 0).all(1)]
    pred_bboxes = yolo.fit_pred_bboxes_to_original(pred_bboxes, img.shape)

    exec_time = time.time() - start_time
    print("time: {:.2f} ms".format(exec_time * 1000))

    result = yolo.draw_bboxes(img, pred_bboxes)
    return result, pred_bboxes

img = cv2.imread(images[index])
result, pred_bboxes = run_obstacle_detection(img)
plt.figure(figsize = (14, 10))
plt.imshow(result)
plt.show() 
Screenshot 2022-01-20 at 8 38 01 PM

I then overlaid the bounding boxes on the depth map and took the depth value at the center point, as shown:

def find_distances(depth_map, pred_bboxes, img, method="center"):
    depth_list = []
    h, w, _ = img.shape
    print("shape :",img.shape)
    #h, w, _  = 256,256,3
    for box in pred_bboxes:
        x1 = int(box[0]*w - box[2]*w*0.5) # center_x - width /2
        y1 = int(box[1]*h-box[3]*h*0.5) # center_y - height /2
        x2 = int(box[0]*w + box[2]*w*0.5) # center_x + width/2
        y2 = int(box[1]*h+box[3]*h*0.5) # center_y + height/2
        obstacle_depth = depth_map[y1:y2, x1:x2]
        if method=="closest":
            depth_list.append(obstacle_depth.min()) # take the closest point in the box
        elif method=="average":
            depth_list.append(np.mean(obstacle_depth)) # take the average
        elif method=="median":
            depth_list.append(np.median(obstacle_depth)) # take the median
        else:
            depth_list.append(depth_map[int(box[1]*h)][int(box[0]*w)]) # take the center

    return depth_list

def add_depth(depth_list, result, pred_bboxes):
    h, w, _ = result.shape
    res = result.copy()
    for i, distance in enumerate(depth_list):
        cv2.putText(res, '{0:.2f} m'.format(distance), (int(pred_bboxes[i][0]*w - pred_bboxes[i][2]*w*0.2),int(pred_bboxes[i][1]*h)), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 1, cv2.LINE_AA)    
    return res

depth_list = find_distances(gt_depth_map, pred_bboxes, img, method="center")
print(depth_list)
res = add_depth(depth_list, result, pred_bboxes)

plt.figure(figsize = (14,10))
plt.imshow(res)

The resulting output is shown below, where we get 0 m distances:

Screenshot 2022-01-20 at 7 56 03 PM

Do we have to do any other pre-processing before using the KITTI GT depth maps?

JiawangBian commented 2 years ago

Note that the KITTI GT depth is sparse, so "obstacle_depth = depth_map[y1:y2, x1:x2]" has many 0 values. You should exclude them first, and then take the min() or mean() value.
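For example, a minimal sketch of that filtering, reusing the variable names from the snippet above:

obstacle_depth = depth_map[y1:y2, x1:x2]
valid_depth = obstacle_depth[obstacle_depth > 0]  # drop the missing (zero) GT entries
if valid_depth.size > 0:
    closest = valid_depth.min()   # nearest valid GT point in the box
    average = valid_depth.mean()  # mean over valid GT points only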

letdivedeep commented 2 years ago

@JiawangBian thanks for your inputs. It worked!

letdivedeep commented 2 years ago

@JiawangBian I used the KITTI pretrained model to infer on some images using the inference.sh script.

The depth maps were stored as .npy files, which we read in the following way:

gt_depth_map = np.load("0044.npy")

Screenshot 2022-01-21 at 7 02 05 PM

Now, to get distances from this map, what should be done? Do we have to use the focal length and baseline formulation, or can we directly overlay the bboxes on it and retrieve the distances by min or mean?

JiawangBian commented 2 years ago

You do not need to use the focal length and baseline. However, you need to know the scaling ratio between the predicted depth and the ground truth. Monocular depth estimation is only up to an unknown scale, so you need to recover it from an external source. Fortunately, the scale-consistent depth method ensures that our predicted depths on all images have the same scale. This means you can recover the scale using one image (where you have ground truth, so you can compute median scaling, as in the evaluation code), and then apply this scale to all other images.
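For illustration, a rough sketch of that scale recovery; the file paths and the load_sparse_gt helper below are placeholders (not part of the repo), and it assumes the prediction has been resized to the GT resolution:

import numpy as np

# 1) Recover one global scale from a single image that has (sparse) ground truth
pred_ref = np.load("pred/0000.npy")        # predicted depth for the reference image (placeholder path)
gt_ref = load_sparse_gt("gt/0000.npz")     # dense array built from the sparse GT (placeholder helper)

valid = gt_ref > 0                         # zeros in the sparse GT are missing values
scale = np.median(gt_ref[valid]) / np.median(pred_ref[valid])

# 2) Apply the same scale to every other predicted depth map
pred_k = np.load("0044.npy")
metric_depth_k = pred_k * scale            # approximately metric depth (meters)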

letdivedeep commented 2 years ago

@JiawangBian Thanks for your quick reply

I checked test.py and found that a function compute_errors is called to calculate the errors:

for i, (tgt_img, gt_depth) in enumerate(tqdm(test_loader)):
    pred_depth = model.inference_depth(tgt_img.cuda())

    errs = compute_errors(gt_depth.cuda(), pred_depth,
                          hparams.dataset_name)

    all_errs.append(np.array(errs))

all_errs = np.stack(all_errs)
mean_errs = np.mean(all_errs, axis=0)

which calls the compute_errors method:

def compute_errors(gt, pred, dataset):
    # pred : b c h w
    # gt: b h w

    abs_diff = abs_rel = sq_rel = log10 = rmse = rmse_log = a1 = a2 = a3 = 0.0

    batch_size, h, w = gt.size()

    if pred.nelement() != gt.nelement():
        pred = F.interpolate(pred, [h, w], mode='nearest')

    pred = pred.view(batch_size, h, w)

    if dataset == 'kitti':
        crop_mask = gt[0] != gt[0]
        y1, y2 = int(0.40810811 * gt.size(1)), int(0.99189189 * gt.size(1))
        x1, x2 = int(0.03594771 * gt.size(2)), int(0.96405229 * gt.size(2))
        crop_mask[y1:y2, x1:x2] = 1
        max_depth = 80

    if dataset == 'nyu':
        crop_mask = gt[0] != gt[0]
        crop = np.array([45, 471, 41, 601]).astype(np.int32)
        crop_mask[crop[0]:crop[1], crop[2]:crop[3]] = 1
        max_depth = 10

    if dataset == 'ddad':
        crop_mask = gt[0] != gt[0]
        crop_mask[:, :] = 1
        max_depth = 200

    min_depth = 1e-3
    for current_gt, current_pred in zip(gt, pred):
        valid = (current_gt > min_depth) & (current_gt < max_depth)
        valid = valid & crop_mask

        valid_gt = current_gt[valid]
        valid_pred = current_pred[valid]

        # align scale
        valid_pred = valid_pred * \
            torch.median(valid_gt)/torch.median(valid_pred)

        valid_pred = valid_pred.clamp(min_depth, max_depth)

        thresh = torch.max((valid_gt / valid_pred), (valid_pred / valid_gt))
        a1 += (thresh < 1.25).float().mean()
        a2 += (thresh < 1.25 ** 2).float().mean()
        a3 += (thresh < 1.25 ** 3).float().mean()

        diff_i = valid_gt - valid_pred
        abs_diff += torch.mean(torch.abs(diff_i))
        abs_rel += torch.mean(torch.abs(diff_i) / valid_gt)
        sq_rel += torch.mean(((diff_i)**2) / valid_gt)
        rmse += torch.sqrt(torch.mean(diff_i ** 2))
        rmse_log += torch.sqrt(torch.mean((torch.log(valid_gt) -
                               torch.log(valid_pred)) ** 2))
        log10 += torch.mean(torch.abs((torch.log10(valid_gt) -
                            torch.log10(valid_pred))))

    return [metric.item() / batch_size for metric in [abs_diff, abs_rel, sq_rel, log10, rmse, rmse_log, a1, a2, a3]]

When I run the test eval script, I get the following output:

Screenshot 2022-01-24 at 5 31 53 PM

I am a bit confused about two parts:

1) What do you mean by median scaling? Is it this part?

# align scale
valid_pred = valid_pred * torch.median(valid_gt)/torch.median(valid_pred)
valid_pred = valid_pred.clamp(min_depth, max_depth)

2) How can this median scale be applied to all other images? (Do we need to perform the same operation on the predicted depth map and then get the closest bbox distances?)

letdivedeep commented 2 years ago

@JiawangBian I tried the above approach, as illustrated in the following code:


import torch

def find_distances(gt_depth_map, pt_depth_map, pred_bboxes, img, method="closest"):
    depth_list = []
    h, w, _ = img.shape
    for box in pred_bboxes:
        x1 = int(box[0]*w - box[2]*w*0.5) # center_x - width /2
        y1 = int(box[1]*h-box[3]*h*0.5) # center_y - height /2
        x2 = int(box[0]*w + box[2]*w*0.5) # center_x + width/2
        y2 = int(box[1]*h+box[3]*h*0.5) # center_y + height/2

        pt_obstacle_depth = pt_depth_map[y1:y2, x1:x2]
        gt_obstacle_depth = gt_depth_map[y1:y2, x1:x2]

        # Remove the 0's
        pt_obstacle_depth = pt_obstacle_depth[pt_obstacle_depth != 0]
        gt_obstacle_depth = gt_obstacle_depth[gt_obstacle_depth != 0]

        # Convert numpy array to tensor
        pt_depth_tensor = torch.from_numpy(pt_obstacle_depth)
        gt_depth_tensor = torch.from_numpy(gt_obstacle_depth)

        # perform  the median scaling on predicted depth
        valid_pred = pt_depth_tensor * torch.median(gt_depth_tensor)/torch.median(pt_depth_tensor)

        if method=="closest":
            depth_list.append(valid_pred.min()) # take the closest point in the box
            print("closed point :",valid_pred.min())
        elif method=="average":
            depth_list.append(np.mean(valid_pred)) # take the average
        elif method=="median":
            depth_list.append(np.median(valid_pred)) # take the median
        else:
            depth_list.append(pt_depth_map[int(box[1]*h)][int(box[0]*w)]) # take the center point from the full predicted map

    return depth_list

When compared with the KITTI ground truth, there was a 5-6 m difference between the two:

Kitti Ground Truth (GT)

sc-depth model prediction (PT)

@JiawangBian your inputs would be helpful.