sjtuytc / UnboundedNeRFPytorch

State-of-the-art, simple, fast unbounded / large-scale NeRFs.
MIT License

About ray_origins and ray_directions in Waymo raw dataset #44

Closed GeJintian closed 1 year ago

GeJintian commented 1 year ago

Hello, thank you for your great work! I want to try my own data with Block-NeRF, so I want to create a dataset similar to the Waymo Block-NeRF dataset. May I know how ray_origins and ray_directions are generated? Is there any way to get r_o and r_d if we have the pose and camera intrinsics from SLAM?

sjtuytc commented 1 year ago

In general, ro is the location of the camera (the ray origin) and rd is the viewing direction (the ray direction). You may need to run COLMAP to get the camera poses for your dataset.

GeJintian commented 1 year ago

Thank you very much for your reply. For anyone who runs into the same problem, you may refer to this function, which generates rays given a camera pose: https://github.com/bmild/nerf/blob/master/run_nerf_helpers.py#L123

rockywind commented 1 year ago

Hi @GeJintian, can you share the code you used to generate your own dataset for Block-NeRF? Thanks very much!

GeJintian commented 1 year ago

@rockywind The author provides code to convert a Waymo tfrecord to their dataset format; please refer to https://github.com/dvlab-research/LargeScaleNeRFPytorch/blob/main/docs/get_pytorch_waymo_dataset.md. So I wrote a script that converts raw images and poses from SLAM into a tfrecord in Waymo's format. See below:

import tensorflow as tf
import numpy as np
import torch
import yaml
import os
import cv2
import base64
from mpl_toolkits.mplot3d import axes3d
import matplotlib.pyplot as plt

def get_rays(H, W, K, c2w, inverse_y, flip_x, flip_y, mode='center'):
    i, j = torch.meshgrid(
        torch.linspace(0, W-1, W, device=c2w.device),
        torch.linspace(0, H-1, H, device=c2w.device))  # pytorch's meshgrid has indexing='ij'
    i = i.t().float()
    j = j.t().float()
    if mode == 'lefttop':
        pass
    elif mode == 'center':
        i, j = i+0.5, j+0.5
    elif mode == 'random':
        i = i+torch.rand_like(i)
        j = j+torch.rand_like(j)
    else:
        raise NotImplementedError

    if flip_x:
        i = i.flip((1,))
    if flip_y:
        j = j.flip((0,))
    if inverse_y:
        dirs = torch.stack([(i-K[0][2])/K[0][0], (j-K[1][2])/K[1][1], torch.ones_like(i)], -1)
    else:
        dirs = torch.stack([(i-K[0][2])/K[0][0], -(j-K[1][2])/K[1][1], -torch.ones_like(i)], -1)
    # Rotate ray directions from camera frame to the world frame
    rays_d = torch.sum(dirs[..., np.newaxis, :] * c2w[:3,:3], -1)  # dot product, equals to: [c2w.dot(dir) for dir in dirs]
    # Translate camera frame's origin to the world frame. It is the origin of all rays.
    rays_o = c2w[:3,3].expand(rays_d.shape)
    return rays_o, rays_d

def ndc_rays(H, W, focal, near, rays_o, rays_d):
    # Shift ray origins to near plane
    t = -(near + rays_o[...,2]) / rays_d[...,2]
    rays_o = rays_o + t[...,None] * rays_d

    # Projection
    o0 = -1./(W/(2.*focal)) * rays_o[...,0] / rays_o[...,2]
    o1 = -1./(H/(2.*focal)) * rays_o[...,1] / rays_o[...,2]
    o2 = 1. + 2. * near / rays_o[...,2]

    d0 = -1./(W/(2.*focal)) * (rays_d[...,0]/rays_d[...,2] - rays_o[...,0]/rays_o[...,2])
    d1 = -1./(H/(2.*focal)) * (rays_d[...,1]/rays_d[...,2] - rays_o[...,1]/rays_o[...,2])
    d2 = -2. * near / rays_o[...,2]

    rays_o = torch.stack([o0,o1,o2], -1)
    rays_d = torch.stack([d0,d1,d2], -1)

    return rays_o, rays_d

def get_rays_of_a_view(H, W, K, c2w, ndc = False, inverse_y = False, flip_x = False, flip_y = False, mode='center'):
    rays_o, rays_d = get_rays(H, W, K, c2w, inverse_y=inverse_y, flip_x=flip_x, flip_y=flip_y, mode=mode)
    if ndc:
        rays_o, rays_d = ndc_rays(H, W, K[0][0], 1., rays_o, rays_d)
    return rays_o.numpy(), rays_d.numpy()

def read_poses(dir,i):
    # output: transform matrix M_p(3x4)
    dir = os.path.join(dir,'poses','pose_'+str(i)+'.yaml')
    with open(dir,'r') as f:
        temp = yaml.safe_load(f.read())
    Ts = np.array([[temp['position']['x']], [temp['position']['y']], [temp['position']['z']]])
    x = temp['orientation']['x']
    y = temp['orientation']['y']
    z = temp['orientation']['z']
    w = temp['orientation']['w']
    M_r = np.zeros((3,3))
    M_r[0][0] = 1 - 2*(y**2) - 2*(z**2)
    M_r[0][1] = 2*x*y - 2*w*z
    M_r[0][2] = 2*x*z + 2*w*y
    M_r[1][0] = 2*x*y + 2*w*z
    M_r[1][1] = 1 - 2*(x**2) - 2*(z**2)
    M_r[1][2] = 2*y*z - 2*w*x
    M_r[2][0] = 2*x*z - 2*w*y
    M_r[2][1] = 2*y*z + 2*w*x
    M_r[2][2] = 1 - 2*(x**2) -2*(y**2)
    M_p = np.hstack((M_r,Ts))
    return M_p

def read_image(dir,i):
    # output: img (RGB); cv2.imread returns BGR, so convert before re-encoding with tf.image.encode_png below
    dir = os.path.join(dir,'images','img_'+str(i)+'.png')
    img = cv2.cvtColor(cv2.imread(dir), cv2.COLOR_BGR2RGB)
    return img

def read_ins(dir):
    # output: height, width, K(3x3)
    dir = os.path.join(dir,"intrinsic.yaml")
    with open(dir,'r') as f:
        temp = yaml.safe_load(f.read())
    return temp['height'], temp['width'], np.array(temp['K']).reshape((3,3))

def save_one(root_dir, save_path):
    writer = tf.io.TFRecordWriter(save_path)
    cam_idx = 0
    exposure = 0.006
    height, width, K = read_ins(root_dir)
    for i in range(200):
        image = read_image(root_dir, i)
        pose = read_poses(root_dir, i)
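        # note (added): inverse_y defaults to False below, which assumes an OpenGL-style camera
        # with -z forward; if the SLAM poses follow the OpenCV convention (+z forward),
        # inverse_y=True may be needed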
        rays_o, rays_d = get_rays_of_a_view(height, width, torch.from_numpy(K), torch.from_numpy(pose), ndc = False)
        #print(rays_d.shape)
        # image_str = cv2.imencode('.png',image)[1].tostring()
        # image_hash = base64.b64encode(image_str)
        example = tf.train.Example(features=tf.train.Features(feature={
            "image_hash": tf.train.Feature(int64_list=tf.train.Int64List(value=[i])),
            "cam_idx": tf.train.Feature(int64_list=tf.train.Int64List(value=[cam_idx])),
            "equivalent_exposure": tf.train.Feature(float_list=tf.train.FloatList(value=[exposure])),
            "height": tf.train.Feature(int64_list=tf.train.Int64List(value=[height])),
            "width": tf.train.Feature(int64_list=tf.train.Int64List(value=[width])),
            "image": tf.train.Feature(bytes_list=tf.train.BytesList(value=[tf.image.encode_png(image).numpy()])),
            "ray_origins": tf.train.Feature(bytes_list=tf.train.BytesList(value=[rays_o.tobytes()])),
            "ray_dirs": tf.train.Feature(bytes_list=tf.train.BytesList(value=[rays_d.tobytes()])),
            "intrinsics":  tf.train.Feature(float_list=tf.train.FloatList(value=[K[0][0], K[1][1]]))
        }))
        #print(example)
        writer.write(example.SerializeToString())
    writer.close()

if __name__=="__main__":
    dir = 'raw_format'
    save_path = 'tf_format/rrc1.tfrecords'
    #draw_path(dir)
    save_one(dir, save_path)
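
As a quick sanity check of the written file (a minimal sketch, not part of the original script), one can read a record back and confirm the shapes. It assumes the record is uncompressed and that the rays were serialized as float64, which is what the numpy/torch defaults above produce; adjust the dtype if the downstream loader expects float32:

import numpy as np
import tensorflow as tf

feature_spec = {
    "height": tf.io.FixedLenFeature([], tf.int64),
    "width": tf.io.FixedLenFeature([], tf.int64),
    "image": tf.io.FixedLenFeature([], tf.string),
    "ray_origins": tf.io.FixedLenFeature([], tf.string),
    "ray_dirs": tf.io.FixedLenFeature([], tf.string),
}

for raw in tf.data.TFRecordDataset("tf_format/rrc1.tfrecords").take(1):
    ex = tf.io.parse_single_example(raw, feature_spec)
    h, w = int(ex["height"]), int(ex["width"])
    img = tf.io.decode_png(ex["image"]).numpy()
    rays_o = np.frombuffer(ex["ray_origins"].numpy(), dtype=np.float64).reshape(h, w, 3)
    rays_d = np.frombuffer(ex["ray_dirs"].numpy(), dtype=np.float64).reshape(h, w, 3)
    print(img.shape, rays_o.shape, rays_d.shape)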

By the way, after training with default hyperparameters, the result is still not good. If you obtained a satisfying result, could you please share your settings and hyperparameters? Thanks in advance.

sjtuytc commented 1 year ago

What do you mean by the results being not good? It is known to us that Waymo's poses are quite inaccurate, and this prevents many approaches from working well.

GeJintian commented 1 year ago

I prepared our own dataset and obtained the poses from SLAM, so the poses should be relatively accurate. Then I trained Block-NeRF with the default settings (e.g., downscale = 4, epochs = 10, etc.). After training, I used the training data to evaluate block 0, but the rendered images are still blurry. I am not sure whether the network has converged, so I would like to know others' training settings. This is an example of our result: (attached image 50_50_51_block_0)

rockywind commented 1 year ago

Hi @GeJintian, thanks for your help. If I obtain a satisfying result, I will tell you.

rockywind commented 1 year ago

Hi @GeJintian, what is the meaning of the pose? I think the pose is the transform from camera coordinates to global coordinates.

    dir = os.path.join(dir,'poses','pose_'+str(i)+'.yaml')
    with open(dir,'r') as f:
        temp = yaml.safe_load(f.read())
    Ts = np.array([[temp['position']['x']], [temp['position']['y']], [temp['position']['z']]])
    x = temp['orientation']['x']
    y = temp['orientation']['y']
    z = temp['orientation']['z']
    w = temp['orientation']['w']
GeJintian commented 1 year ago

@rockywind The pose matrix of a camera should be [R|t], which transforms a point from camera coordinates to world coordinates. R is a [3x3] rotation matrix and t is a [3x1] translation vector. R can be obtained from a quaternion, which is the [x, y, z, w] in my code. If you already have the [3x3] rotation matrix, you can use it directly; if you have a quaternion, you need to convert it to a rotation matrix. The conversion is coded as:

    M_r = np.zeros((3,3))
    M_r[0][0] = 1 - 2*(y**2) - 2*(z**2)
    M_r[0][1] = 2*x*y - 2*w*z
    M_r[0][2] = 2*x*z + 2*w*y
    M_r[1][0] = 2*x*y + 2*w*z
    M_r[1][1] = 1 - 2*(x**2) - 2*(z**2)
    M_r[1][2] = 2*y*z - 2*w*x
    M_r[2][0] = 2*x*z - 2*w*y
    M_r[2][1] = 2*y*z + 2*w*x
    M_r[2][2] = 1 - 2*(x**2) -2*(y**2)

For details, you may refer to https://en.wikipedia.org/wiki/Conversion_between_quaternions_and_Euler_angles
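
If scipy is available, the same conversion can be done with its Rotation helper. This is just an alternative sketch, not part of my script above; note that scipy's from_quat expects the quaternion in [x, y, z, w] order, matching the code above:

    import numpy as np
    from scipy.spatial.transform import Rotation

    # example quaternion [x, y, z, w] and translation; in practice these are
    # read from the pose yaml as in read_poses() above
    x, y, z, w = 0.0, 0.0, 0.0, 1.0
    tx, ty, tz = 1.5, 0.2, 0.9
    M_r = Rotation.from_quat([x, y, z, w]).as_matrix()    # 3x3 rotation, same result as the manual formula
    M_p = np.hstack((M_r, np.array([[tx], [ty], [tz]])))  # [R | t] camera-to-world, 3x4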

rockywind commented 1 year ago

Hi @GeJintian, thanks for your help. Is the world coordinate system based on the IMU origin or the Lidar origin?

GeJintian commented 1 year ago

@rockywind I think it doesn't matter, as long as all the poses you use are in the same coordinate system.

rockywind commented 1 year ago

@GeJintian thanks a lot!

rockywind commented 1 year ago

Hi @GeJintian, when I run the script (split_block.py), I get the error below. I think the poses are in global coordinates, so the script should be able to split blocks according to the different car locations in global coordinates.

 in <module>
    overlap=args['overlap'])
  File "/SHFP12/02_bevdet/LargeScaleNeRFPytorch/data_preprocess/split_block_own_data.py", line 175, in split_dataset
    if origin != centroids[-1] and judge == False: # have not reached the first centroid
IndexError: list index out of range
GeJintian commented 1 year ago

@rockywind Where is your starting point in world coordinates? Our starting point is around (0, 0) in the world, so there is no problem for us.

rockywind commented 1 year ago

Hi @GeJintian, our world coordinate system is based on the Lidar frame of the ego car, and our starting point is the Lidar origin. I printed img_train_origins and got (-1.59, 0.91, 0.96). (attached image)

GeJintian commented 1 year ago

@rockywind I think that should be fine. Maybe you need to check the radius of each block? It is one of the arguments. You might also need to check how many blocks it splits for you. According to your error, centroids[-1] raises "list index out of range", which indicates that centroids might be an empty list. A rough sanity check is sketched below.
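
One rough way to check this (a sketch only, not the author's split script; it assumes the camera/ray origins are already loaded as an (N, 3) array, e.g. the img_train_origins mentioned above) is to compare the spread of the origins against the block radius you pass in:

    import numpy as np

    origins = np.asarray(img_train_origins)[:, :2]    # x, y of the camera origins in world coordinates
    extent = origins.max(axis=0) - origins.min(axis=0)
    print("scene extent (x, y):", extent)
    # if the block radius is comparable to or larger than this extent,
    # the split step may end up producing zero or one centroid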

rockywind commented 1 year ago

Hi @GeJintian, thanks for your help! This is my result; it is very poor. (attached image)

GeJintian commented 1 year ago

@rockywind It seems like further training is needed. Thanks a lot for telling me your results.

rockywind commented 1 year ago

Hi @GeJintian,
How many epochs did you find sufficient in your experiment? I followed the default setting.

GeJintian commented 1 year ago

@rockywind I used the default settings as well, so 10 epochs in my training, and it takes several hours to train one block. I don't want to increase the number of training epochs.

rockywind commented 1 year ago

Hi @GeJintian, why is the transpose of the cam_to_world rotation matrix used in this line? I remember that the transpose of a rotation matrix equals its inverse, and the inverse maps from world coordinates to camera coordinates. (attached image)

GeJintian commented 1 year ago

@rockywind Are you sure this is part of my code? I don't see it in my code.

rockywind commented 1 year ago

Hi @GeJintian, this is not in your code. It is part of Waymo_dataset.py, at line 29. (attached image)

GeJintian commented 1 year ago

@rockywind I think the author's implementation is correct. Normally we write a rotation as R × A, where R is [3x3] and A is a [3x1] column vector. In this code, however, the direction is stored as a [1x3] row vector, so the operation becomes A × R.T. If you write out the whole computation you will see it is equivalent; a quick numerical check is sketched below.
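
A quick numerical check of that equivalence (just a sketch with an arbitrary rotation about z, not code from the repository):

    import numpy as np

    theta = 0.4
    R = np.array([[np.cos(theta), -np.sin(theta), 0.0],
                  [np.sin(theta),  np.cos(theta), 0.0],
                  [0.0,            0.0,           1.0]])   # camera-to-world rotation
    d = np.array([0.3, -0.2, -1.0])                        # a ray direction in camera coordinates

    print(np.allclose(R @ d, d @ R.T))                     # True: column-vector and row-vector forms agree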

rockywind commented 1 year ago

OK, thanks a lot!

JonasW-byte commented 1 year ago

@rockywind Hi, how did you solve the depth training problem?

JonasW-byte commented 1 year ago

@GeJintian Hello, did you manage to get better results, similar to the author's example?

GeJintian commented 1 year ago

@JonasW-byte No, I didn't do any further experiments.

JonasW-byte commented 1 year ago

@GeJintian Thanks a lot.

Bin-ze commented 1 year ago

[Quoted GeJintian's earlier comment, including the SLAM-to-tfrecord conversion script; omitted here as a verbatim duplicate.]

Thank you very much for the data transformation script! I used your conversion script to convert the poses and images obtained from SLAM into a Waymo-style tfrecord, and then used the fetch_data_from_tf_record.py script provided by the author to convert the format, but the following error occurred:

  2023-04-03 11:59:49.322621: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
  To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
  2023-04-03 11:59:50.329804: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /home/guozebin/miniconda3/envs/large-scale-nerf/lib/python3.9/site-packages/cv2/../../lib64:
  2023-04-03 11:59:50.329888: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /home/guozebin/miniconda3/envs/large-scale-nerf/lib/python3.9/site-packages/cv2/../../lib64:
  2023-04-03 11:59:50.329902: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
    0%|                                                     | 0/1 [00:00<?, ?it/s]2023-04-03 11:59:52.770685: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
  To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
  2023-04-03 11:59:59.388352: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1613] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 1443 MB memory:  -> device: 0, name: NVIDIA Tesla V100-PCIE-16GB, pci bus id: 0000:3b:00.0, compute capability: 7.0
  2023-04-03 11:59:59.390172: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1613] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 14616 MB memory:  -> device: 1, name: NVIDIA Tesla V100-PCIE-16GB, pci bus id: 0000:86:00.0, compute capability: 7.0
  2023-04-03 11:59:59.391899: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1613] Created device /job:localhost/replica:0/task:0/device:GPU:2 with 14616 MB memory:  -> device: 2, name: NVIDIA Tesla V100-PCIE-16GB, pci bus id: 0000:af:00.0, compute capability: 7.0
  2023-04-03 11:59:59.393577: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1613] Created device /job:localhost/replica:0/task:0/device:GPU:3 with 14616 MB memory:  -> device: 3, name: NVIDIA Tesla V100-PCIE-16GB, pci bus id: 0000:d8:00.0, compute capability: 7.0
    0%|                                                     | 0/1 [00:07<?, ?it/s]
  Traceback (most recent call last):
    File "/home/guozebin/work_code/LargeScaleNeRFPytorch/data_preprocess/fetch_data_from_tf_record.py", line 180, in <module>
      train_index, val_index = handle_one_record(tfrecord, train_index, val_index)
    File "/home/guozebin/work_code/LargeScaleNeRFPytorch/data_preprocess/fetch_data_from_tf_record.py", line 98, in handle_one_record
      for batch in dataset_map:
    File "/home/guozebin/miniconda3/envs/large-scale-nerf/lib/python3.9/site-packages/tensorflow/python/data/ops/iterator_ops.py", line 787, in __next__
      return self._next_internal()
    File "/home/guozebin/miniconda3/envs/large-scale-nerf/lib/python3.9/site-packages/tensorflow/python/data/ops/iterator_ops.py", line 770, in _next_internal
      ret = gen_dataset_ops.iterator_get_next(
    File "/home/guozebin/miniconda3/envs/large-scale-nerf/lib/python3.9/site-packages/tensorflow/python/ops/gen_dataset_ops.py", line 3017, in iterator_get_next
      _ops.raise_from_not_ok_status(e, name)
    File "/home/guozebin/miniconda3/envs/large-scale-nerf/lib/python3.9/site-packages/tensorflow/python/framework/ops.py", line 7215, in raise_from_not_ok_status
      raise core._status_to_exception(e) from None  # pylint: disable=protected-access
  tensorflow.python.framework.errors_impl.DataLossError: {{function_node __wrapped__IteratorGetNext_output_types_9_device_/job:localhost/replica:0/task:0/device:CPU:0}} inflate() failed with error -3: incorrect header check [Op:IteratorGetNext]

  Process finished with exit code 1

Have you ever encountered such a problem? Does this have something to do with the TensorFlow version?

Looking forward to your reply!

GeJintian commented 1 year ago

@Bin-ze I am not sure whether I encountered this problem or not. I used TensorFlow 2.10.0 to run this script. You might try running fetch_data_from_tf_record.py on Waymo's official tfrecords to see whether the problem is caused by the TF version.
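
One more thing that may be worth checking (a guess only, not verified in this thread): the "inflate() failed ... incorrect header check" message usually indicates a compression mismatch, i.e. the reader tries to gunzip a tfrecord that was written uncompressed. If the author's fetch script opens the records with compression_type="GZIP", the conversion script above could be adjusted to match, roughly like this:

    import tensorflow as tf

    # hypothetical change to save_one(): write the tfrecord GZIP-compressed so that
    # a reader passing compression_type="GZIP" can decode it
    save_path = 'tf_format/rrc1.tfrecords'
    options = tf.io.TFRecordOptions(compression_type="GZIP")
    writer = tf.io.TFRecordWriter(save_path, options=options)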

Bin-ze commented 1 year ago

[Quoted GeJintian's reply above.]

I noticed your Chinese name, so I wrote to you in Chinese. I have already solved this problem; it was caused by a difference in the data structure. I want to try the same experiment as you, with the initial poses obtained from SLAM, and I would like to ask you a few questions:

  1. Did you use block-nerf rather than foried-nerf? Although the author says foried-nerf supports block-wise training, I did not see the corresponding implementation in the code.
  2. Did you eventually obtain clear rendered images on a real scene? From the comments in this issue I gather that such experiments do not work well on real scenes, but I would like to know whether you made any further progress.
  3. Regarding block-nerf, my understanding is that each block is trained separately and the blocks are merged at inference time to produce a consistent result. Is that correct?
  4. May I have your contact information so I can ask further questions? My WeChat: Bin1978108038
GeJintian commented 1 year ago

@Bin-ze Hello. Regarding your questions:

  1. I tried reconstruction with block-nerf last November; foried-nerf did not exist at that time, so I do not know whether the author has uploaded the corresponding code.
  2. My experimental results are basically what I posted above. I did a little hyperparameter tuning on top of the author's default settings and did not see a clear improvement.
  3. Yes, each block is trained separately, and neighboring blocks then jointly influence the final result.
  4. Because of the lack of real-time performance I did not continue NeRF-related research, so I have no further progress. If you have other questions, I will try my best to answer.
Bin-ze commented 1 year ago

@GeJintian Hello, I have run into a new problem. I would like to know whether you modified the author's split_block code to fit the logic of a real-world scene. It seems to me that a real scene may be a sequence captured by a single camera, and because of heavy occlusion, the circular regions defined in block-nerf may not suit a self-built scene. Do you have any experience with this? I have also run into the real-time problem of block-nerf: the inference script provided by the author instantiates the model once for every image it renders. Is that step necessary? I observed that inference is very slow, roughly one minute per image.

GeJintian commented 1 year ago

@Bin-ze I did not change the author's code, because my images come from multiple cameras at different angles, so the circular regions work for my case. However, I think that even if your scene comes from only one camera, you should still get satisfactory results after training, as long as you test with poses from the training set. As for the real-time issue, my concern was that training is too slow for real-time reconstruction to be feasible, so I did not look into inference performance.

Bin-ze commented 1 year ago

[Quoted GeJintian's previous reply.]

Were the reconstruction results you obtained with training-set poses reasonably good? I tested my model with poses from the training set, but it still completely fails to reconstruct the depth information. Setting aside model issues, the cause of my poor accuracy seems to lie in the data, but I have not yet found any problem with the data. Thank you very much for your help; your experience is very valuable to me.

Bin-ze commented 1 year ago

[Quoted GeJintian's previous reply.]

I would like to know how the exposure = 0.006 in your script was obtained. How much does this parameter affect the model?

GeJintian commented 1 year ago

@Bin-ze I tested on the training set because I believed that would guarantee relatively good results (training-set results are usually better than test-set results), yet the final result was still unsatisfactory. Personally, I think the blurriness is due to the model not fully converging (not enough training data / not enough training epochs). As for the exposure, we could not measure it; in an indoor environment I assumed it is basically constant, so I just took a value from the Waymo data.

wt202020 commented 1 year ago

[Quoted GeJintian's conversion-script comment and Bin-ze's comment reporting the DataLossError above; omitted here as verbatim duplicates.]

Hello, I have encountered the same problem. How did you solve it? Looking forward to your reply!