Hi, can you share the settings used for running GridConv on Internet videos? Also, did you use simple Matplotlib to visualize the final output?
The demo for Internet videos is produced as follows:

1. Obtain the 2D poses `x_det` by running HRNet frame by frame.
2. (Optional) Smooth the pose sequence. We use a One-Euro filter to alleviate temporal jitter.
3. Normalize the 2D pixel coordinates by `x_norm = (x_det - 0.5L) / 0.5L`, where `L = max(W, H)` (a generic sketch of steps 2-3 follows this list).
4. Feed `x_norm` to the D-GridConv model, learnable SGT version. The GT-trained model is used in practice, but technically both the GT- and HRNet-trained models are fine for inference.
5. Visualize the 3D results using Matplotlib, as you guessed. We modified the existing visualization code at https://github.com/facebookresearch/VideoPose3D/blob/1afb1ca0f1237776518469876342fc8669d3f6a9/common/visualization.py#L62. For a better visual effect, we added a viewpoint animation; our modifications are posted below for reference.
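For reference, here is a minimal, generic sketch of steps 2-3. This is not the repository's code: `OneEuroFilter` below is a standard implementation of the filter from Casiez et al. (CHI 2012), and `x_det`, `W`, `H`, and `video_fps` are assumed inputs.

```
import math
import numpy as np


class OneEuroFilter:
    # Generic One-Euro filter; call once per frame, in temporal order,
    # on a [J, 2] keypoint array (operates elementwise).
    def __init__(self, freq, min_cutoff=1.0, beta=0.0, d_cutoff=1.0):
        self.freq = freq              # sampling rate, i.e. the video fps
        self.min_cutoff = min_cutoff  # minimum cutoff frequency (jitter reduction)
        self.beta = beta              # speed coefficient (lag reduction)
        self.d_cutoff = d_cutoff      # cutoff for the derivative estimate
        self.x_prev = None
        self.dx_prev = None

    def _alpha(self, cutoff):
        tau = 1.0 / (2.0 * math.pi * cutoff)
        return 1.0 / (1.0 + tau * self.freq)

    def __call__(self, x):
        x = np.asarray(x, dtype=float)
        if self.x_prev is None:
            self.x_prev, self.dx_prev = x, np.zeros_like(x)
            return x
        # A low-passed derivative drives an adaptive cutoff frequency.
        dx = (x - self.x_prev) * self.freq
        a_d = self._alpha(self.d_cutoff)
        dx_hat = a_d * dx + (1.0 - a_d) * self.dx_prev
        a = self._alpha(self.min_cutoff + self.beta * np.abs(dx_hat))
        x_hat = a * x + (1.0 - a) * self.x_prev
        self.x_prev, self.dx_prev = x_hat, dx_hat
        return x_hat


def normalize_2d(x_det, W, H):
    # x_norm = (x_det - 0.5L) / 0.5L with L = max(W, H),
    # mapping pixel coordinates into roughly [-1, 1].
    L = max(W, H)
    return (x_det - 0.5 * L) / (0.5 * L)


# Usage: x_det is a [T, J, 2] array of per-frame HRNet detections.
# f = OneEuroFilter(freq=video_fps, min_cutoff=1.0, beta=0.5)
# x_smoothed = np.stack([f(frame) for frame in x_det])
# x_norm = normalize_2d(x_smoothed, W, H)
```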
Here is the visualization code:
```
# This is a demo code clip for reference; it cannot run directly before finishing steps 1-4.
import os
import subprocess as sp
from collections import OrderedDict

import cv2
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.animation import FuncAnimation, writers
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 (registers the '3d' projection)


def get_resolution(filename):
    # Query (width, height) of the first video stream via ffprobe.
    command = ['ffprobe', '-v', 'error', '-select_streams', 'v:0',
               '-show_entries', 'stream=width,height', '-of', 'csv=p=0', filename]
    with sp.Popen(command, stdout=sp.PIPE, bufsize=-1) as pipe:
        for line in pipe.stdout:
            w, h = line.decode().strip().split(',')
            return int(w), int(h)


def get_fps(filename):
    # Query the frame rate of the first video stream via ffprobe.
    command = ['ffprobe', '-v', 'error', '-select_streams', 'v:0',
               '-show_entries', 'stream=r_frame_rate', '-of', 'csv=p=0', filename]
    with sp.Popen(command, stdout=sp.PIPE, bufsize=-1) as pipe:
        for line in pipe.stdout:
            a, b = line.decode().strip().split('/')
            return int(a) / int(b)


def read_video(filename, skip=0, limit=-1):
    # Stream raw RGB frames from ffmpeg as [H, W, 3] uint8 arrays.
    w, h = get_resolution(filename)
    command = ['ffmpeg', '-i', filename, '-f', 'image2pipe',
               '-pix_fmt', 'rgb24', '-vsync', '0', '-vcodec', 'rawvideo', '-']
    i = 0
    with sp.Popen(command, stdout=sp.PIPE, bufsize=-1) as pipe:
        while True:
            data = pipe.stdout.read(w * h * 3)
            if not data:
                break
            i += 1
            if i > limit and limit != -1:
                continue
            if i > skip:
                yield np.frombuffer(data, dtype='uint8').reshape((h, w, 3))


def downsample_tensor(X, factor):
    # Temporal downsampling by averaging consecutive groups of `factor` frames.
    length = X.shape[0] // factor * factor
    return np.mean(X[:length].reshape(-1, factor, *X.shape[1:]), axis=1)


def render_animation(keypoints, poses, skeleton, fps, bitrate, azim, output, viewport,
                     poses_gt=None, poses_reference=None, elev=15, limit=-1, downsample=1, size=6,
                     input_video_path=None, input_video_frames=None, input_video_skip=0, rot_view=False):
    """
    TODO
    Render an animation. The supported output modes are:
     -- 'interactive': display an interactive figure
                       (also works on notebooks if associated with %matplotlib inline)
     -- 'html': render the animation as HTML5 video. Can be displayed in a notebook using HTML(...).
     -- 'filename.mp4': render and export the animation as an h264 video (requires ffmpeg).
     -- 'filename.gif': render and export the animation a gif file (requires imagemagick).
    """
    plt.ioff()
    fig = plt.figure(figsize=(size * (1 + len(poses)), size))
    ax_in = fig.add_subplot(1, 1 + len(poses), 1)
    ax_in.get_xaxis().set_visible(False)
    ax_in.get_yaxis().set_visible(False)
    ax_in.set_axis_off()
    # ax_in.set_title('Input')

    ax_3d = []
    lines_3d = []
    lines_3d_gt = []
    points_3d = []
    points_3d_gt = []
    trajectories = []
    radius = 1.8
    for index, (title, data) in enumerate(poses.items()):
        ax = fig.add_subplot(1, 1 + len(poses), index + 2, projection='3d')
        ax.view_init(elev=elev, azim=azim)
        ax.set_xlim3d([-radius / 2, radius / 2])
        # ax.set_zlim3d([0, radius])
        ax.set_zlim3d([-radius / 2, radius / 2])
        ax.set_ylim3d([-radius / 2, radius / 2])
        ax.set_aspect('auto')
        ax.set_xticklabels([])
        # ax.set_yticklabels([])
        # ax.set_zticklabels([])
        # ax.set_xlabel('x')
        # ax.set_ylabel('y')
        # ax.set_zlabel('z')
        ax.dist = 7.5
        ax.set_title(title)  # , pad=35
        ax_3d.append(ax)
        lines_3d.append([])
        lines_3d_gt.append([])
        points_3d.append([])
        points_3d_gt.append([])
        trajectories.append(data[:, 0, [0, 1]])
    poses = list(poses.values())

    # Decode video
    if input_video_path is None and input_video_frames is None:
        # Black background
        all_frames = np.zeros((keypoints.shape[0], viewport[1], viewport[0]), dtype='uint8')
    else:
        if input_video_path is not None:
            # Load video using ffmpeg
            all_frames = []
            for f in read_video(input_video_path, skip=input_video_skip, limit=limit):
                all_frames.append(f)
            effective_length = min(keypoints.shape[0], len(all_frames))
            all_frames = all_frames[:effective_length]
        else:
            all_frames = input_video_frames

        keypoints = keypoints[input_video_skip:]  # todo remove
        for idx in range(len(poses)):
            poses[idx] = poses[idx][input_video_skip:]

        if fps is None:
            fps = get_fps(input_video_path)

    if downsample > 1:
        keypoints = downsample_tensor(keypoints, downsample)
        all_frames = downsample_tensor(np.array(all_frames), downsample).astype('uint8')
        for idx in range(len(poses)):
            poses[idx] = downsample_tensor(poses[idx], downsample)
            trajectories[idx] = downsample_tensor(trajectories[idx], downsample)
        fps /= downsample

    h, w, _ = all_frames[0].shape
    initialized = False
    image = None
    lines = []
    points = None

    if limit < 1:
        limit = len(all_frames)
    else:
        limit = min(limit, len(all_frames))

    parents = skeleton.parents()

    def update_video(i):
        nonlocal initialized, image, lines, points

        # Re-center each 3D view on the current root-joint trajectory.
        for n, ax in enumerate(ax_3d):
            ax.set_xlim3d([-radius / 2 + trajectories[n][i, 0], radius / 2 + trajectories[n][i, 0]])
            ax.set_ylim3d([-radius / 2 + trajectories[n][i, 1], radius / 2 + trajectories[n][i, 1]])

        # Update 2D poses
        joints_right_2d = skeleton.joints_right()
        colors_2d = np.full(keypoints.shape[1], 'black')
        colors_2d[joints_right_2d] = 'red'
        if not initialized:
            # image = ax_in.imshow(all_frames[i], aspect=1.3)
            image = ax_in.imshow(all_frames[i], aspect='equal')

            for j, j_parent in enumerate(parents):
                if j_parent == -1:
                    continue

                if len(parents) == keypoints.shape[1]:
                    # Draw skeleton only if keypoints match (otherwise we don't have the parents definition)
                    lines.append(ax_in.plot([keypoints[i, j, 0], keypoints[i, j_parent, 0]],
                                            [keypoints[i, j, 1], keypoints[i, j_parent, 1]], c='pink'))

                col = 'red' if j in skeleton.joints_right() else 'black'
                for n, ax in enumerate(ax_3d):
                    if poses_gt is not None:
                        pos_gt = poses_gt[i]
                        lines_3d_gt[n].append(ax.plot([pos_gt[j, 0], pos_gt[j_parent, 0]],
                                                      [pos_gt[j, 1], pos_gt[j_parent, 1]],
                                                      [pos_gt[j, 2], pos_gt[j_parent, 2]], zdir='z', c='green'))
                    pos = poses[n][i]
                    lines_3d[n].append(ax.plot([pos[j, 0], pos[j_parent, 0]],
                                               [pos[j, 1], pos[j_parent, 1]],
                                               [pos[j, 2], pos[j_parent, 2]], zdir='z', c=col))

            for n, ax in enumerate(ax_3d):
                if poses_gt is not None:
                    pos_gt = poses_gt[i]
                    points_3d_gt[n] = ax.scatter(pos_gt[:, 0], pos_gt[:, 1], pos_gt[:, 2], s=10, c='green')
                pos = poses[n][i]
                points_3d[n] = ax.scatter(pos[:, 0], pos[:, 1], pos[:, 2], s=10, c='black')

            points = ax_in.scatter(*keypoints[i].T, 10, color=colors_2d, edgecolors='white', zorder=10)

            initialized = True
        else:
            image.set_data(all_frames[i])

            for j, j_parent in enumerate(parents):
                if j_parent == -1:
                    continue

                if len(parents) == keypoints.shape[1]:
                    lines[j - 1][0].set_data([keypoints[i, j, 0], keypoints[i, j_parent, 0]],
                                             [keypoints[i, j, 1], keypoints[i, j_parent, 1]])

                for n, ax in enumerate(ax_3d):
                    if poses_gt is not None:
                        pos_gt = poses_gt[i]
                        lines_3d_gt[n][j - 1][0].set_xdata([pos_gt[j, 0], pos_gt[j_parent, 0]])
                        lines_3d_gt[n][j - 1][0].set_ydata([pos_gt[j, 1], pos_gt[j_parent, 1]])
                        lines_3d_gt[n][j - 1][0].set_3d_properties([pos_gt[j, 2], pos_gt[j_parent, 2]], zdir='z')
                    pos = poses[n][i]
                    lines_3d[n][j - 1][0].set_xdata([pos[j, 0], pos[j_parent, 0]])
                    lines_3d[n][j - 1][0].set_ydata([pos[j, 1], pos[j_parent, 1]])
                    lines_3d[n][j - 1][0].set_3d_properties([pos[j, 2], pos[j_parent, 2]], zdir='z')

            for n, ax in enumerate(ax_3d):
                if poses_gt is not None:
                    pos_gt = poses_gt[i]
                    points_3d_gt[n]._offsets3d = (pos_gt[:, 0], pos_gt[:, 1], pos_gt[:, 2])
                pos = poses[n][i]
                points_3d[n]._offsets3d = (pos[:, 0], pos[:, 1], pos[:, 2])
                if rot_view:
                    # Animate the camera azimuth over time (see time_to_azim below).
                    ax.view_init(azim=time_to_azim(i), elev=elev)

            points.set_offsets(keypoints[i])

        print('{}/{}'.format(i, limit), end='\r')

    fig.tight_layout()

    anim = FuncAnimation(fig, update_video, frames=np.arange(0, limit), interval=1000 / fps, repeat=False)
    if output.endswith('.mp4'):
        Writer = writers['ffmpeg']
        writer = Writer(fps=fps, metadata={}, bitrate=bitrate)
        anim.save(output, writer=writer)
    elif output.endswith('.gif'):
        anim.save(output, dpi=80, writer='imagemagick')
    else:
        raise ValueError('Unsupported output format (only .mp4 and .gif are supported)')
    plt.close()


def time_to_azim(i):
    # first 10 frames, azim=0
    # next 180 frames, azim increases
    # next 10 frames, azim=180
    # then reverse
    sign = 1 if (i // 200) % 2 == 0 else -1
    if sign == 1:
        if i % 200 < 10:
            azim = 0
        elif i % 200 > 190:
            azim = 180
        else:
            azim = (i % 200) - 10
    else:
        if i % 200 < 10:
            azim = 180
        elif i % 200 > 190:
            azim = 0
        else:
            azim = 180 - ((i % 200) - 10)
    return azim


def animate3(video_fname, pose2d_seq, pose3d_seq, fps, input_video_frames=None,
             output_fname='out.mp4', elev=0, downsample=1, rot_view=False):
    from libs.skeleton.skeleton import Skeleton
    h36m_skeleton = Skeleton(parents=[-1, 0, 1, 2, 0, 4, 5, 0, 7, 8, 9, 8, 11, 12, 8, 14, 15],
                             joints_left=[4, 5, 6, 11, 12, 13],
                             joints_right=[1, 2, 3, 14, 15, 16])
    from copy import copy
    pose3d_seq = copy(pose3d_seq)
    render_animation(pose2d_seq, pose3d_seq, h36m_skeleton, fps=fps, bitrate=6000, azim=0, elev=elev,
                     downsample=downsample, output='%s' % output_fname, viewport=(800, 600),
                     input_video_path=video_fname, input_video_frames=input_video_frames, rot_view=rot_view)


# Assume 3D estimations are stored in outputs_pack.
# video_dir, video_fname are predefined arguments; pose2d_h36m_order_smoothed,
# result_dir and scalar are likewise assumed to be defined beforehand.
interval = 1
repeat_time = 1
downsample_res = 4
pose3d_pack_for_vis = OrderedDict({key: np.tile(outputs_pack[key][::interval, :, [2, 0, 1]] * [-1, 1, -1],
                                                (repeat_time, 1, 1))
                                   for key in ['GCN', 'LCN', 'PoseAug', 'Ours']})

print('Loading images...')
image_frames = []
vcap = cv2.VideoCapture(os.path.join(video_dir, '%s.mp4' % video_fname))
ret = True
while ret:
    ret, img = vcap.read()
    if not ret:
        break
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    # cv2.resize expects dsize as a (width, height) tuple
    img = cv2.resize(img, (img.shape[1] // downsample_res, img.shape[0] // downsample_res))
    # print(img.shape)
    image_frames.append(img)
image_frames = np.tile(image_frames[::interval], (repeat_time, 1, 1, 1))

animate3(None,
         np.tile(pose2d_h36m_order_smoothed[::interval], (repeat_time, 1, 1)) / downsample_res,
         pose3d_pack_for_vis,
         fps=24,
         input_video_frames=image_frames,
         output_fname=os.path.join(result_dir, '%s 3d %.1fscale.mp4' % (video_fname, scalar)),
         elev=30, rot_view=True)
```
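A note on the `[::interval, :, [2, 0, 1]] * [-1, 1, -1]` indexing in the driver above: for each joint it permutes (x, y, z) into (z, x, y) and flips two signs, which appears to map the model's output convention onto the Matplotlib 3D axes. A tiny standalone illustration:

```
import numpy as np

pose = np.array([[[1.0, 2.0, 3.0]]])                 # [T=1, J=1, (x, y, z)]
vis = pose[:, :, [2, 0, 1]] * np.array([-1, 1, -1])
print(vis)                                           # [[[-3.  1. -2.]]], i.e. (-z, x, -y)
```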
Thank you so much! So `outputs_pack` will contain the 3D estimations generated from GridConv, i.e., after following Steps 1-3?
Not quite: `outputs_pack = {'Ours': np.ndarray([T, 17, 3]), 'PoseAug': np.ndarray([T, 17, 3]), ...}`, where `T` is the video length and each `np.ndarray` is what you describe.
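To make that concrete, here is a hypothetical sketch of the structure (the zero arrays are placeholders standing in for each method's per-frame 3D predictions; the four keys match the ones iterated over in the driver code above):

```
import numpy as np

T = 300  # video length in frames (example value)
outputs_pack = {
    'Ours':    np.zeros((T, 17, 3)),  # GridConv estimations, one 17x3 pose per frame
    'PoseAug': np.zeros((T, 17, 3)),
    'GCN':     np.zeros((T, 17, 3)),
    'LCN':     np.zeros((T, 17, 3)),
}
```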
Thanks, that resolved my doubt! Lastly, for running on internet videos, we first need to obtain the 2D pose `x_det` by running HRNet frame by frame. Are you referring here to the standard HRNet model trained on MS-COCO (i.e., https://github.com/HRNet/HigherHRNet-Human-Pose-Estimation)? Thanks once again!
Exactly, that is the repository we use, and we chose the `pose_hrnet_w48_384x288` model trained on COCO.
Thank you so much!