Liangdacai / Human3d-keypoints

Hello teacher! We ran into a problem during the optimization process and would like to ask you about it #10

Closed simple123456T closed 2 years ago

simple123456T commented 2 years ago

When we normalize the keypoints predicted by the 2D model, the drawn skeleton lines on the person in the camera view disappear. We would like to know: after normalizing the 2D output, does it still need to go through coco_h36m to be converted into the H36M-format 2D pose? My code is below.

simple123456T commented 2 years ago
import time

import cv2
import torch
from common.camera import *
from common.model import *
from utils_all import Person_23d
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from lib.preprocess import coco_h36m
import numpy as np
import argparse
import sys

# sys.path.append("./")
# sys.path.append("../")

l_leg = [[0, 1], [1, 2], [2, 3]]
r_leg = [[0, 4], [4, 5], [5, 6]]
bone = [[0, 7], [7, 8], [8, 9], [9, 10]]
l_hand = [[8, 14], [14, 15], [15, 16]]
r_hand = [[8, 11], [11, 12], [12, 13]]

draw = [l_leg, r_leg, bone, l_hand, r_hand]

metadata = {'layout_name': 'coco', 'num_joints': 17,
            'keypoints_symmetry': [[1, 3, 5, 7, 9, 11, 13, 15], [2, 4, 6, 8, 10, 12, 14, 16]]}

def parse_args():
    parser = argparse.ArgumentParser(description='person3d args')
    parser.add_argument('--video', type=str, default=None, metavar='N', help='camera or video')
    parser.add_argument('--camera', type=int, default=1, metavar='C', help='camera or video')
    parser.add_argument('--arc', default=[3, 3, 3, 3], metavar='LAYERS', help='filter widths separated by comma')
    parser.add_argument('--w', '--weight-file', type=str, default='lib/checkpoint/epoch_120_3333.bin', help='path to the 3D pose model checkpoint')
    return parser.parse_args()

if __name__ == "__main__":

    args = parse_args()
    print('\n args is:', args)

    if args.video is not None:
        cap = cv2.VideoCapture(args.video)
    elif args.camera is not None:
        cap = cv2.VideoCapture(args.camera)
    else:
        raise ValueError("camera or video load error, please check your arguments!")

    p23d = Person_23d()
    p23d.load_model(CUDA=True)  # load the YOLOv3 person detector
    p23d.reset_config()
    pose_model = p23d.model_load()  # load the HRNet 2D pose model
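    # The temporal model below appears to be the VideoPose3D-style TemporalModel
    # (num_joints_in=17, in_features=2, num_joints_out=17); with filter widths
    # [3, 3, 3, 3] its receptive field is 3**4 = 81 frames, matching fps_nums - 1 below.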
    model = TemporalModel(17, 2, 17, filter_widths=args.arc, causal=True, dropout=0.25, channels=1024)
    checkpoint = torch.load(args.w, map_location=lambda storage, loc: storage)
    model.load_state_dict(checkpoint['model_pos'])
    model = model.cuda()
    model.eval()

    kps_2d = []
    fig = plt.figure()

    while cap.isOpened():
        ax = fig.add_subplot(111, projection='3d')
        # ax.view_init(elev=0., azim=90)  # alternative view that also works reasonably well
        ax.view_init(elev=45.)
        ax.set_xlim3d([-1.5, 1.5])
        ax.set_zlim3d([0.0, 2.0])
        ax.set_ylim3d([2.5, -2.5])

        _, frame = cap.read()
        b, c = p23d.yolo_human_det(frame)  # YOLO person detector; returns the four bounding-box coordinates

        if b is None:
            continue

        cv2.rectangle(frame, (int(b[0][0]), int(b[0][1])), (int(b[0][2]), int(b[0][3])), (0, 255, 0), 2)

        # preprocessing
        kps_input, data_numpy, center, scale = p23d.PreProcess(frame, b[0])
        kps_inputs = kps_input[:, [2, 1, 0]]
        # print('\n kps_inputs:\n', kps_inputs)
        # print('\n data_numpy:\n', data_numpy)
        # print('\n center:\n', center)
        # print('\n scale:\n', scale)
        # print('\n kps_inputs.shape:\n', kps_inputs.shape)  # [1, 3, 256, 256]
        # print('\n data_numpy.shape:\n', data_numpy.shape)  # (1080, 1920, 3)
        # print('\n center.shape:\n', center.shape)  # (2,)
        # print('\n scale.shape:\n', scale.shape)  # (2,)

        # kps
        if torch.cuda.is_available():
            kps_inputs = kps_inputs.cuda()
        output = pose_model(kps_inputs)  # HRNet heatmap output
        # print('\n pose_model output: \n', output)
        # print('\n pose_model output shape: \n', output.shape)  # [1, 17, 64, 64]

        kps_pre, maxval = p23d.get_final_preds(output.clone().cpu().detach().numpy(), np.asarray([center]), np.asarray([scale]))
        print('\n kps_pre from get_final_preds: \n', kps_pre)  # 2D keypoints in pixel coordinates
        print('\n kps_pre shape from get_final_preds: \n', kps_pre.shape)  # (1, 17, 2)
        # print('\n maxval from get_final_preds: \n', maxval)
        # print('\n maxval shape from get_final_preds: \n', maxval.shape)  # (1, 17, 1)

        #### Normalize the 2D output kps_pre
        # normalize keypoints; suppose using the camera parameters
        print('********************')
        print(frame.shape[1])
        print(frame.shape[0])
        # time.sleep(55000)
        print('********************')
        kps_pre[..., :2] = normalize_screen_coordinates(kps_pre[..., :2], w=frame.shape[1], h=frame.shape[0])
        # print('\n kps_pre :\n', kps_pre)
        print('\n kps_pre shape after normalization:\n', kps_pre.shape)  # (1, 17, 2)

        h36m_kps, _ = coco_h36m(kps_pre)  # convert to the H36M 2D pose layout
        # print('\n h36m_kps from coco_h36m: \n', h36m_kps)
        print('\n h36m_kps shape from coco_h36m: \n', h36m_kps.shape)  # (1, 17, 2)
        # time.sleep(60000)

        kps_2d.append(h36m_kps[0])
        fps_nums = 3 ** (len(args.arc)) + 1
        if len(kps_2d) == fps_nums:
            print('\n')
            print('\n')
            print('\n')
            print('8' * 89)
            # print('kps_2d:', kps_2d)
            print('len(kps_2d):', len(kps_2d))  # 82
            print('fps_nums:', fps_nums)  # 82

            kps_2d.pop(0)
            input_2d = np.array([kps_2d])
            print('input_2d shape:', input_2d.shape)  # (1, 81, 17, 2)

            input_2d[:, :, :, 0] /= 480.
            input_2d[:, :, :, 1] /= 640.
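            # NOTE: kps_pre was already scaled to roughly [-1, 1] by normalize_screen_coordinates
            # above, so the two divisions by 480/640 just above normalize a second time --
            # this is the double normalization the maintainer points out in the reply below.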

            input_2d = torch.from_numpy(input_2d)
            print('input_2d shape:', input_2d.shape)  # torch.Size([1, 81, 17, 2])

            if torch.cuda.is_available():
                input_2d = input_2d.cuda()
            pre_3d, pre_traj = model(input_2d)  # infer the 3D positions from the 2D positions
            print('\n pre_3d from model:\n', pre_3d)
            print('\n pre_3d shape from model:\n', pre_3d.shape)  # torch.Size([1, 1, 17, 3])
            # print('\n pre_traj from model:\n', pre_traj)
            print('\n pre_traj shape from model:\n', pre_traj.shape)  # torch.Size([1, 1, 17, 3])

            ###### Convert the predicted 3D points from camera coordinates to world coordinates
            rot = np.array([0.14070565, -0.15007018, -0.7552408, 0.62232804], dtype=np.float32)
            # t = np.array([1.841107, 4.9552846, 1.5634454], dtype=np.float32)
            # drop the extra time dimension
            pre_3d = pre_3d.view([1, 17, 3])
            print('pre_3d after reshaping:', pre_3d)
            pre_3d = pre_3d.cpu().detach().numpy()
            print('\n pre_3d.shape before camera_to_world:\n', pre_3d.shape)
            pre_3d = camera_to_world(pre_3d, R=rot, t=0)  # todo
            print('\n pre_3d.shape after camera_to_world:\n', pre_3d.shape)
            # We don't have the trajectory, but at least we can rebase the height:
            # subtract the minimum Z so all predicted Z values become non-negative
            pre_3d[:, :, 2] -= np.min(pre_3d[:, :, 2])
            # add the batch dimension back
            pre_3d = pre_3d[np.newaxis, :, :, :]
            print('\n shape after adding the batch dimension back:\n', pre_3d.shape)
            # time.sleep(50000)  # debugging pause; commented out so the loop keeps running

            pre_3d = torch.tensor(pre_3d)

            pre_3d = pre_3d.cpu().detach().numpy()  # move the CUDA tensor back to the CPU
            pre_traj = pre_traj.cpu().detach().numpy()
            # output = pre_3d + pre_traj
            output = pre_3d  # keep the skeleton fixed in place (ignore the predicted trajectory)
            # print('\n output:\n', output)
            print('\n output shape:\n', output.shape)  # (1, 1, 17, 3)

            x = output[:, :, :, 0]
            y = output[:, :, :, 2]
            z = -1 * output[:, :, :, 1]

            # print('\n x:\n', x)
            # print('\n y:\n', y)
            # print('\n z:\n', z)
            print('\n x shape:\n', x.shape)  # (1, 1, 17)
            print('\n y shape:\n', y.shape)  # (1, 1, 17)
            print('\n z shape:\n', z.shape)  # (1, 1, 17)

            for dd in draw:
                for line in dd:
                    a = [x[0, 0, line[0]], x[0, 0, line[1]]]
                    b = [y[0, 0, line[0]], y[0, 0, line[1]]]
                    c = [z[0, 0, line[0]], z[0, 0, line[1]]]
                    # print('\n a:\n', a)
                    # print('\n b:\n', b)
                    # print('\n c:\n', c)

                    ax.plot(a, b, c, c='b')  # color of the skeleton limb lines

            # ax.scatter(x, y, z, c='r')  # color of the skeleton joint points
            # ax.set_xlabel('X Label')
            # ax.set_ylabel('Y Label')
            ax.set_zlabel('Z Label')
            plt.pause(0.0001)
            # time.sleep(50000)
            plt.clf()  # clear the current figure

        for i in range(17):  # draw the 17 keypoints (colors are in BGR)
            cv2.circle(frame, (int(h36m_kps[0][i][0]), int(h36m_kps[0][i][1])), 2, (255, 0, 0), 2)  # mark the 17 joints on the person

        for dd in draw:  # connect the 17 keypoints into a skeleton (colors are in BGR)
            for line in dd:
                cv2.line(frame, (int(h36m_kps[0][line[0]][0]), int(h36m_kps[0][line[0]][1])),
                         (int(h36m_kps[0][line[1]][0]), int(h36m_kps[0][line[1]][1])), (0, 255, 0), 2)

        cv2.namedWindow("show 2d keypoints", cv2.WINDOW_AUTOSIZE)
        cv2.imshow("show 2d keypoints", frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

    cv2.destroyAllWindows()
    cap.release()
Liangdacai commented 2 years ago

I've updated main.py. Your code normalizes the keypoints twice: originally I applied the normalization directly to keep things simple, so when you added it again your pipeline ends up normalizing twice.
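
A minimal sketch of the single-normalization flow this reply implies, assuming the same repo helpers imported in the posted script (normalize_screen_coordinates, coco_h36m); the helper prepare_input_2d and its argument names are illustrative, not taken from main.py:

import numpy as np
import torch
from common.camera import normalize_screen_coordinates  # repo helper, as imported in the script above
from lib.preprocess import coco_h36m                     # repo helper, as imported in the script above

def prepare_input_2d(kps_pre_window, frame_w, frame_h):
    """kps_pre_window: list of (17, 2) pixel-space keypoint arrays, one per frame."""
    normed = []
    for kps in kps_pre_window:
        kps = kps.copy().astype(np.float32)
        # normalize the screen coordinates to roughly [-1, 1] exactly once
        kps[..., :2] = normalize_screen_coordinates(kps[..., :2], w=frame_w, h=frame_h)
        h36m_kps, _ = coco_h36m(kps[np.newaxis])  # reorder COCO joints into the H36M layout
        normed.append(h36m_kps[0])
    input_2d = np.array([normed], dtype=np.float32)  # shape (1, T, 17, 2)
    # no further division by 480/640 here: that was the second normalization
    return torch.from_numpy(input_2d)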

simple123456T commented 2 years ago

I see, that makes sense now. Thank you, teacher!