adricostas opened this issue 7 months ago
Hello,
I'm trying to use the network to estimate the depth from one of my own images. This is the code that I'm using:
```python
from __future__ import absolute_import, division, print_function

import os
import argparse
import numpy as np
import PIL.Image as pil
import matplotlib as mpl
import matplotlib.cm as cm
import torch
from torchvision import transforms

import networks
from layers import disp_to_depth


def parse_args():
    parser = argparse.ArgumentParser(description='Inference on one Single Image.')
    parser.add_argument('--image_path', type=str,
                        help='path to a test image', required=True)
    parser.add_argument("--load_weights_folder", type=str,
                        help="name of model to load", required=True)
    parser.add_argument("--no_cuda",
                        help='if set, disables CUDA', action='store_true')
    return parser.parse_args()


def prepare_model_for_test(args, device):
    model_path = args.load_weights_folder
    print("-> Loading model from ", model_path)
    encoder_path = os.path.join(model_path, "encoder.pth")
    decoder_path = os.path.join(model_path, "depth.pth")
    encoder_dict = torch.load(encoder_path, map_location=device)
    decoder_dict = torch.load(decoder_path, map_location=device)

    encoder = networks.ResnetEncoder(18, False)
    decoder = networks.DepthDecoder(
        num_ch_enc=encoder.num_ch_enc, scales=range(1),
        num_output_channels=3, use_skips=True, PixelCoorModu=True)

    encoder.load_state_dict({k: v for k, v in encoder_dict.items()
                             if k in encoder.state_dict()})
    decoder.load_state_dict(decoder_dict)

    encoder = encoder.to(device).eval()
    decoder = decoder.to(device).eval()

    return encoder, decoder, encoder_dict['height'], encoder_dict['width']


def inference(args):
    device = torch.device("cpu")
    encoder, decoder, thisH, thisW = prepare_model_for_test(args, device)
    image_path = args.image_path
    output_path = "/data/adriana/IndoorDepth/results/"
    print("-> Inferencing on image ", image_path)

    with torch.no_grad():
        # Load image and preprocess
        input_image = pil.open(image_path).convert('RGB')
        extension = image_path.split('.')[-1]
        original_width, original_height = input_image.size
        name_original_image = output_path + 'original.png'
        input_image.save(name_original_image)

        input_image = input_image.crop((16, 16, original_width - 16, original_height - 16))
        name_crop = output_path + 'crop.png'
        input_image.save(name_crop)
        cropped_image = input_image
        crop_width, crop_height = input_image.size
        print("thisW ", thisW)
        print("thisH ", thisH)
        input_image = input_image.resize((thisW, thisH), pil.LANCZOS)
        input_image = transforms.ToTensor()(input_image).unsqueeze(0)

        # Norm_pix_coords
        fx = 5.1885790117450188e+02 / (original_width - 2 * 16)
        fy = 5.1946961112127485e+02 / (original_height - 2 * 16)
        cx = (3.2558244941119034e+02 - 16) / (original_width - 2 * 16)
        cy = (2.5373616633400465e+02 - 16) / (original_height - 2 * 16)

        feed_height = thisH
        feed_width = thisW
        Us, Vs = np.meshgrid(np.linspace(0, feed_width - 1, feed_width, dtype=np.float32),
                             np.linspace(0, feed_height - 1, feed_height, dtype=np.float32),
                             indexing='xy')
        Us /= feed_width
        Vs /= feed_height
        Ones = np.ones([feed_height, feed_width], dtype=np.float32)
        norm_pix_coords = np.stack(((Us - cx) / fx, (Vs - cy) / fy, Ones), axis=0)
        norm_pix_coords = torch.from_numpy(norm_pix_coords).unsqueeze(0)

        # PREDICTION
        input_image = input_image.to(device)
        features_tmp = encoder(input_image)
        outputs = decoder(features_tmp, norm_pix_coords)

        disp = outputs[("disp", 0)]
        disp_resized = torch.nn.functional.interpolate(
            disp, (crop_height, crop_width), mode="bilinear", align_corners=False)

        # Saving numpy file
        name_dest_npy = output_path + 'depth.npy'
        print("-> Saving depth npy to ", name_dest_npy)
        # scaled_disp, _ = disp_to_depth(disp, 0.1, 10)
        scaled_disp, _ = disp_to_depth(disp_resized, 0.1, 10)
        np.save(name_dest_npy, scaled_disp.cpu().numpy())
        print(torch.max(scaled_disp))
        print(torch.min(scaled_disp))

        # Saving colormapped depth image
        disp_resized_np = disp_resized.squeeze().cpu().numpy()
        vmax = np.percentile(disp_resized_np, 95)
        normalizer = mpl.colors.Normalize(vmin=disp_resized_np.min(), vmax=vmax)
        mapper = cm.ScalarMappable(norm=normalizer, cmap='magma')
        colormapped_im = (mapper.to_rgba(disp_resized_np)[:, :, :3] * 255).astype(np.uint8)
        im = pil.fromarray(colormapped_im)

        name_dest_im = output_path + 'depth.png'
        print("-> Saving depth png to ", name_dest_im)
        im.save(name_dest_im)

        print(cropped_image.size)
        depth = scaled_disp.squeeze().permute(1, 0).cpu().numpy()
        print(depth)

        centerX = 624.10065
        centerY = 484.05255
        focalLength = 968.28955
        points = []
        for v in range(cropped_image.size[1]):
            for u in range(cropped_image.size[0]):
                color = cropped_image.getpixel((u, v))
                Z = depth[u, v]
                if Z == 0:
                    continue
                X = (u + 16 - centerX) * Z / focalLength
                Y = (v + 16 - centerY) * Z / focalLength
                # print(X, Y, Z)
                points.append("%f %f %f %d %d %d %d\n" % (X, Y, Z, color[0], color[1], color[2], 255))

        file = open(output_path + 'poincloud.ply', "w")
        file.write('''ply
format ascii 1.0
element vertex %d
property float x
property float y
property float z
property uchar red
property uchar green
property uchar blue
property uchar alpha
end_header
%s
''' % (len(points), "".join(points)))
        file.close()
        print('-> Done!')


if __name__ == '__main__':
    args = parse_args()
    inference(args)
```
This is the cropped image and the disparity map returned by the network:
This is the point cloud that I'm creating from the depth values extracted from the disparity values:
It seems that this point cloud is not correct. Is the issue in the output of the network, or in my computation of the 3D points? I'm using the centerX, centerY and focalLength values from my own camera.
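For clarity, the back-projection in the loop above is just the standard pinhole model; in isolation it looks like this (a small helper written here only for illustration, not part of the script; the +16 maps the cropped pixel coordinates back to the original image frame):

```python
def backproject(u, v, Z, cx, cy, f, crop=16):
    # Pinhole back-projection for pixel (u, v) of the cropped image with depth Z.
    X = (u + crop - cx) * Z / f
    Y = (v + crop - cy) * Z / f
    return X, Y, Z
```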
Thank you in advance!
When inferring a depth map from your own image, you need to adjust the corresponding parameters in the code, namely fx, fy, cx and cy, to match the camera intrinsics of your input image. In our reference_single_image.py file, the initial values on lines 87 through 91 are the camera intrinsics of the NYUv2 dataset. Another possible issue is that the image may need to be rotated to a more appropriate orientation; here, rotating it by 90 degrees would bring it closer to the training data. Of course, the rotation may not be necessary at all, so you can simply give it a try.
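As a minimal sketch of that preprocessing step (the numbers are placeholders copied from the centerX, centerY and focalLength you posted, so replace them with your own calibration; the 16-pixel border crop matches the script above):

```python
import PIL.Image as pil

# Placeholder calibration values (in pixels) -- replace with your own camera intrinsics.
FX, FY = 968.28955, 968.28955   # focal lengths
CX, CY = 624.10065, 484.05255   # principal point
CROP = 16                       # border crop used by the inference script

input_image = pil.open("your_image.png").convert('RGB')

# Optional: rotate by 90 degrees so the orientation is closer to the NYUv2 training data.
# If you do rotate, the intrinsics (fx/fy and cx/cy) must be transformed to the rotated
# frame as well.
# input_image = input_image.rotate(90, expand=True)

original_width, original_height = input_image.size
input_image = input_image.crop((CROP, CROP,
                                original_width - CROP, original_height - CROP))

# Normalize the intrinsics by the cropped image size, in the same way the NYUv2
# values on lines 87 through 91 are normalized.
fx = FX / (original_width - 2 * CROP)
fy = FY / (original_height - 2 * CROP)
cx = (CX - CROP) / (original_width - 2 * CROP)
cy = (CY - CROP) / (original_height - 2 * CROP)
```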