Open mozpp opened 4 years ago
No, but I believe we can estimate it by an affine transformation with a very good accuracy.
Do you mean: x_rgb = x_d·a + y_d·b + c ?
Thank you @shahroudy for the suggestion on how to align the frames! For anyone looking for the affine transformations between Kinect's RGB and Depth referentials, I've obtained these based on 10k points from S001 for each camera. Also code example on how to apply the transform.
# Convert from the RGB frame to the depth frame.
# One 3x3 homogeneous affine matrix per camera (C001-C003), estimated from
# ~10k skeleton joint correspondences taken from setup S001.
rgb_to_depth_affine_transforms = dict(
    C001=np.array([[3.45638311e-01, 2.79844266e-03, -8.22281898e+01],
                   [-1.37185375e-03, 3.46949734e-01, 1.30882644e+01],
                   [0.00000000e+00, 0.00000000e+00, 1.00000000e+00]]),
    C002=np.array([[3.42938209e-01, 8.72629655e-04, -7.28786114e+01],
                   [3.43287830e-04, 3.43578203e-01, 1.75767495e+01],
                   [0.00000000e+00, 0.00000000e+00, 1.00000000e+00]]),
    C003=np.array([[3.45121348e-01, 8.53232038e-04, -7.33328852e+01],
                   [1.51167845e-03, 3.45115132e-01, 2.22178592e+01],
                   [0.00000000e+00, 0.00000000e+00, 1.00000000e+00]]),
)  # BUGFIX: the closing parenthesis of dict(...) was missing in the snippet
affine_transform = rgb_to_depth_affine_transforms["C001"]
# warpAffine wants only the top 2x3 of the homogeneous matrix;
# (512, 424) is the Kinect v2 depth resolution
frame_rgb = cv2.warpAffine(frame_rgb, affine_transform[:2, :], (512, 424))
-------------------------------------------------------------------------
# Convert from the depth frame to the RGB frame (inverse direction of the
# rgb->depth transforms): one 3x3 homogeneous affine matrix per camera.
depth_to_rgb_affine_transforms = {
    "C001": np.array([[2.89310518e+00, -2.33353370e-02, 2.38200221e+02],
                      [1.14394588e-02, 2.88216964e+00, -3.67819523e+01],
                      [0.00000000e+00, 0.00000000e+00, 1.00000000e+00]]),
    "C002": np.array([[2.90778446e+00, -1.04633946e-02, 2.15505801e+02],
                      [-3.43830682e-03, 2.91094100e+00, -5.13416831e+01],
                      [0.00000000e+00, 0.00000000e+00, 1.00000000e+00]]),
    "C003": np.array([[2.89756295e+00, -7.16367761e-03, 2.12645813e+02],
                      [-1.26919485e-02, 2.89761514e+00, -6.53095423e+01],
                      [0.00000000e+00, 0.00000000e+00, 1.00000000e+00]]),
}
affine_transform = depth_to_rgb_affine_transforms["C001"]
# Pass only the 2x3 part to warpAffine; (1920, 1080) is the Kinect v2
# RGB resolution.
frame_depth = cv2.warpAffine(frame_depth, affine_transform[:2, :], (1920, 1080))
@ManuelPalermo Can you please share the affine transforms for the remaining setups if you have them calculated? This is really really helpful! Thank you for sharing this!
Okay so I ended up writing a small script to extract mappings, if someone is still looking you can use this:
import cv2 as cv
import numpy as np
import os
def readVideo(filename):
    """Read every frame of a video file into a list.

    Parameters
    ----------
    filename : str
        Path to the video file (e.g. an NTU RGB+D ``*_rgb.avi``).

    Returns
    -------
    list
        One decoded frame (numpy array, as returned by OpenCV) per frame
        of the video; empty if the file could not be opened.
    """
    cap = cv.VideoCapture(filename)
    if not cap.isOpened():
        print("Error opening video stream or file")
        cap.release()
        return []
    frames = []
    while True:
        ret, frame = cap.read()
        if not ret:  # end of stream (or a decode error)
            break
        frames.append(frame)
    cap.release()
    return frames
def read_skeleton_file(filename):
    """Parse an NTU RGB+D ``.skeleton`` file.

    File layout: a frame count, then per frame a body count followed by,
    for each body, one metadata line, the joint count, and one line of
    12 whitespace-separated values per joint.

    Parameters
    ----------
    filename : str
        Path to the ``.skeleton`` file.

    Returns
    -------
    list of dict
        One dict per body per frame (flattened over frames).  All values
        are kept as the raw strings from the file except the integer
        counts, matching the original behaviour.
    """
    bodyinfo = []
    # 'with' guarantees the file handle is closed (the original leaked it)
    with open(filename, 'r') as fileid:
        framecount = int(fileid.readline().strip())
        for frame in range(framecount):
            bodycount = int(fileid.readline().strip())
            for b in range(bodycount):
                body = {}
                body['bodycount'] = bodycount
                bodyDetails = fileid.readline().strip().split()
                body['bodyID'] = bodyDetails[0]
                body['clipedEdges'] = bodyDetails[1]
                body['handLeftConfidence'] = bodyDetails[2]
                body['handLeftState'] = bodyDetails[3]
                body['handRightConfidence'] = bodyDetails[4]
                body['handRightState'] = bodyDetails[5]
                body['isResticted'] = bodyDetails[6]
                body['leanX'] = bodyDetails[7]
                body['leanY'] = bodyDetails[8]
                body['trackingState'] = bodyDetails[9]
                body['jointCount'] = int(fileid.readline().strip())
                body['joints'] = []
                for j in range(body['jointCount']):
                    jointinfo = fileid.readline().strip().split()
                    joint = {}
                    # 3D camera-space location of the joint j
                    joint['x'] = jointinfo[0]
                    joint['y'] = jointinfo[1]
                    joint['z'] = jointinfo[2]
                    # 2D location of joint j in the depth/IR frame
                    joint['depthX'] = jointinfo[3]
                    joint['depthY'] = jointinfo[4]
                    # 2D location of joint j in the RGB frame
                    joint['colorX'] = jointinfo[5]
                    joint['colorY'] = jointinfo[6]
                    # orientation quaternion of joint j
                    joint['orientationW'] = jointinfo[7]
                    joint['orientationX'] = jointinfo[8]
                    joint['orientationY'] = jointinfo[9]
                    joint['orientationZ'] = jointinfo[10]
                    # tracking state of joint j
                    joint['trackingState'] = jointinfo[11]
                    body['joints'].append(joint)
                bodyinfo.append(body)
    return bodyinfo
def readDepth(filename):
    """Load the masked-depth PNG frames of one sample directory.

    Parameters
    ----------
    filename : str
        Directory containing the per-frame ``MDepth*.png`` files.

    Returns
    -------
    list
        One ``uint16`` numpy array per frame, in filename (= frame) order.
    """
    files = os.listdir(filename)
    files = [os.path.join(filename, f) for f in files
             if f.startswith('MDepth') and f.endswith('.png')]
    files = sorted(files)
    depths = []
    for f in files:
        # BUGFIX: IMREAD_GRAYSCALE converts the image to 8 bits before the
        # uint16 cast, destroying the depth values; IMREAD_ANYDEPTH keeps
        # the PNG's native 16-bit values.  (Assumes the MDepth PNGs are
        # 16-bit, as the original uint16 cast implies — TODO confirm.)
        im = cv.imread(f, cv.IMREAD_ANYDEPTH)
        depths.append(np.array(im, dtype=np.uint16))
    return depths
def generate_joints(bodyinfo):
    """Collect the 2D joint coordinates of every body entry.

    Parameters
    ----------
    bodyinfo : list of dict
        Output of ``read_skeleton_file`` (one dict per body per frame).

    Returns
    -------
    tuple of (list, list)
        Parallel lists of ``[x, y]`` float pairs: joint positions in the
        RGB frame and in the depth frame, in the same order.
    """
    color_points = [
        [float(joint['colorX']), float(joint['colorY'])]
        for body in bodyinfo
        for joint in body['joints']
    ]
    depth_points = [
        [float(joint['depthX']), float(joint['depthY'])]
        for body in bodyinfo
        for joint in body['joints']
    ]
    return color_points, depth_points
def generate_single_mapping(rgb, depth, rgb_joints, depth_joints):
    """Warp an RGB frame into the depth frame's coordinate system.

    A 2D affine transform is estimated with RANSAC from the corresponding
    joint positions, then applied to the RGB image at the Kinect depth
    resolution (512x424).  ``depth`` is not used by this function; the
    parameter is kept for interface compatibility.
    """
    src_pts = np.array(rgb_joints)
    dst_pts = np.array(depth_joints)
    transform, _inlier_mask = cv.estimateAffine2D(src_pts, dst_pts, cv.RANSAC)
    return cv.warpAffine(rgb, transform, (512, 424))
# Example: align the first RGB frame of one NTU RGB+D sample onto its depth frame.
filename = 'S001C002P001R001A059'
# Parse the skeleton file to obtain matching RGB/depth joint coordinates.
skele_file = 'RAW-NTU/nturgb+d_skeletons/'+filename+'.skeleton'
bodyinfo = read_skeleton_file(skele_file)
rgb_joints,depth_joints = generate_joints(bodyinfo)
# NOTE(review): the skeleton path above is relative while the video/depth
# paths below are absolute ('/RAW-NTU/...') — presumably one of the two
# styles is a typo; adjust to your local dataset layout.
video_file = '/RAW-NTU/nturgb+d_rgb/'+filename+'_rgb.avi'
frames = readVideo(video_file)
depth_file = '/RAW-NTU/nturgb+d_depth_masked/'+filename+'/'
depths = readDepth(depth_file)
# Estimate the affine transform from the joints and warp the first RGB
# frame into the depth referential.
res = generate_single_mapping(frames[0],depths[0],rgb_joints,depth_joints)
import matplotlib.pyplot as plt
def plot_both(res, depth):
    """Overlay a warped RGB frame on its depth map for visual inspection.

    The depth map is multiplied by 100 before the 8-bit truncation purely
    to make its values visible in the blended image.
    """
    depth_bgr = cv.cvtColor(depth * 100, cv.COLOR_GRAY2BGR)
    rgb_bgr = cv.cvtColor(res, cv.COLOR_RGB2BGR)
    blended = cv.addWeighted(rgb_bgr, 0.4, depth_bgr.astype('uint8'), 0.3, 0)
    plt.figure(figsize=(20, 10))
    plt.imshow(blended)
Thank you @shahroudy for the suggestion on how to align the frames! For anyone looking for the affine transformations between Kinect's RGB and Depth referentials, I've obtained these based on 10k points from S001 for each camera. Also code example on how to apply the transform.
# convert from rgb to depth frame
rgb_to_depth_affine_transforms = dict(
    C001=np.array([[3.45638311e-01, 2.79844266e-03, -8.22281898e+01],
                   [-1.37185375e-03, 3.46949734e-01, 1.30882644e+01],
                   [0.00000000e+00, 0.00000000e+00, 1.00000000e+00]]),
    C002=np.array([[3.42938209e-01, 8.72629655e-04, -7.28786114e+01],
                   [3.43287830e-04, 3.43578203e-01, 1.75767495e+01],
                   [0.00000000e+00, 0.00000000e+00, 1.00000000e+00]]),
    C003=np.array([[3.45121348e-01, 8.53232038e-04, -7.33328852e+01],
                   [1.51167845e-03, 3.45115132e-01, 2.22178592e+01],
                   [0.00000000e+00, 0.00000000e+00, 1.00000000e+00]]),
)
affine_transform = rgb_to_depth_affine_transforms["C001"]
frame_rgb = cv2.warpAffine(frame_rgb, affine_transform[:2, :], (512, 424))
-------------------------------------------------------------------------
# convert from depth to rgb frame
depth_to_rgb_affine_transforms = dict(
    C001=np.array([[2.89310518e+00, -2.33353370e-02, 2.38200221e+02],
                   [1.14394588e-02, 2.88216964e+00, -3.67819523e+01],
                   [0.00000000e+00, 0.00000000e+00, 1.00000000e+00]]),
    C002=np.array([[2.90778446e+00, -1.04633946e-02, 2.15505801e+02],
                   [-3.43830682e-03, 2.91094100e+00, -5.13416831e+01],
                   [0.00000000e+00, 0.00000000e+00, 1.00000000e+00]]),
    C003=np.array([[2.89756295e+00, -7.16367761e-03, 2.12645813e+02],
                   [-1.26919485e-02, 2.89761514e+00, -6.53095423e+01],
                   [0.00000000e+00, 0.00000000e+00, 1.00000000e+00]]),
)
affine_transform = depth_to_rgb_affine_transforms["C001"]
frame_depth = cv2.warpAffine(frame_depth, affine_transform[:2, :], (1920, 1080))
Could you share the camera's intrinsic parameter matrix, and the code used to obtain it?
Is the transformation a linear transformation?