opendr-eu / opendr

A modular, open and non-proprietary toolkit for core robotic functionalities by harnessing deep learning
Apache License 2.0

Accuracy stuck at 0.33 when training with my own dataset #434

Closed JoaoLuis00 closed 1 year ago

JoaoLuis00 commented 1 year ago

I am trying to use this toolkit with a different pose estimator: instead of Lightweight OpenPose I am using MediaPipe, since I need the keypoints for both hands plus some from the body pose. The two model types I tested were STGCN and CoSTGCN, since these have a few demos showing how to use them.

To do that, I created my own preprocessing script that takes a folder of videos for the different labels and converts the data into the format I believe is required, which I saw in one of the demo files, skeleton_extraction.py:

import os

import cv2
import numpy as np
from tqdm import tqdm
import mediapipe as mp

from opendr.engine.target import Pose

# VideoReader is the small frame-by-frame video iterator helper defined in the
# OpenDR demos (definition omitted here)

mp_pose = mp.solutions.pose
mp_hands = mp.solutions.hands

frames_total = 150   # fixed sequence length T expected by the model
num_keypoints = 46   # 4 body joints + 21 keypoints per hand
pose_keypoints = np.ones((num_keypoints, 3), dtype=np.int32) * -1
keypoints_scores = np.ones((num_keypoints, 1), dtype=np.float32) * -1
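
# `tile` is the frame-repeating helper used by the OpenDR skeleton_extraction
# demo; a minimal NumPy stand-in (assumed equivalent to the demo's torch-based
# version) so the snippet runs on its own:
def tile(a, dim, n_tile):
    # repeat every frame n_tile times along axis `dim` (the time axis here)
    return np.repeat(a, n_tile, axis=dim)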

def pose2numpy(num_frames, poses_list, kptscores_list, num_channels=4):
    C = num_channels
    T = frames_total
    V = num_keypoints
    M = 1  # num_person_in
    data_numpy = np.zeros((1, C, num_frames, V, M))
    skeleton_seq = np.zeros((1, C, T, V, M))

    for t in range(num_frames):
        data_numpy[0, 0:3, t, :, 0] = np.transpose(poses_list[t].data)
        if C == 4:
            data_numpy[0, 3, t, :, 0] = kptscores_list[t][:, 0]

    # if we have fewer than frames_total frames, repeat frames to reach it
    diff = T - num_frames
    while diff > 0:
        num_tiles = diff // num_frames
        if num_tiles > 0:
            # whole-sequence repeats still fit: tile along the time axis
            data_numpy = tile(data_numpy, 2, num_tiles + 1)
            num_frames = data_numpy.shape[2]
            diff = T - num_frames
        else:
            # remainder is shorter than the sequence: pad with the last frame
            skeleton_seq[:, :, :num_frames, :, :] = data_numpy
            for j in range(diff):
                skeleton_seq[:, :, num_frames + j, :, :] = data_numpy[:, :, -1, :, :]
            return skeleton_seq
    # exact fit: either diff was 0 to begin with or tiling landed exactly on T
    skeleton_seq = data_numpy
    return skeleton_seq
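
# illustrative shape check: a 60-frame clip should come out as (1, C, 150, 46, 1)
# dummy_poses = [Pose(np.zeros((num_keypoints, 3)), -1) for _ in range(60)]
# dummy_scores = [np.zeros((num_keypoints, 1), dtype=np.float32) for _ in range(60)]
# assert pose2numpy(60, dummy_poses, dummy_scores, 3).shape == (1, 3, 150, 46, 1)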

# Generate training data
# (train_sample_names, train_sample_labels and videos_path come from my dataset
# listing code, omitted here)
train_skeleton_data = np.zeros(
    (len(train_sample_labels), 3, frames_total, num_keypoints, 1), dtype=np.float32)

# Open the MediaPipe pose and hands solutions
with mp_pose.Pose(
        min_detection_confidence=0.7,
        min_tracking_confidence=0.5) as pose, mp_hands.Hands(
        model_complexity=0,
        min_detection_confidence=0.5,
        min_tracking_confidence=0.5) as hands:

    for s_number, s_name in enumerate(tqdm(train_sample_names)):
        video_path = os.path.join(videos_path, s_name + '_rgb.avi')
        print(s_name)
        print(video_path)
        image_provider = VideoReader(video_path)
        counter = 0
        poses_list = []
        kptscores_list = []

        for img in image_provider:
            frame_to_process = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            frame_to_process.flags.writeable = False
            pose_results = pose.process(frame_to_process)
            hands_results = hands.process(frame_to_process)

            if pose_results.pose_landmarks:
                # shoulders and elbows; x and z are scaled by the frame width
                # (1920) and y by the frame height (1080)
                body_landmarks = [mp_pose.PoseLandmark.LEFT_SHOULDER,
                                  mp_pose.PoseLandmark.RIGHT_SHOULDER,
                                  mp_pose.PoseLandmark.LEFT_ELBOW,
                                  mp_pose.PoseLandmark.RIGHT_ELBOW]
                for k, landmark_id in enumerate(body_landmarks):
                    landmark = pose_results.pose_landmarks.landmark[landmark_id]
                    pose_keypoints[k, 0] = int(landmark.x * 1920)
                    pose_keypoints[k, 1] = int(landmark.y * 1080)
                    pose_keypoints[k, 2] = int(landmark.z * 1920)
                    keypoints_scores[k, 0] = float(landmark.visibility)

            if hands_results.multi_hand_landmarks:
                for hand_no, hand_landmarks in enumerate(hands_results.multi_hand_landmarks):
                    # hand_no 0 is assumed to be the left hand and 1 the right;
                    # MediaPipe reports actual handedness in hands_results.multi_handedness
                    offset = 4 if hand_no == 0 else 25
                    try:
                        for i in range(21):
                            landmark = hand_landmarks.landmark[i]
                            pose_keypoints[i + offset, 0] = int(landmark.x * 1920)
                            pose_keypoints[i + offset, 1] = int(landmark.y * 1080)
                            pose_keypoints[i + offset, 2] = int(landmark.z * 1920)
                            # placeholder score: the hand model does not output visibility
                            keypoints_scores[i + offset, 0] = 0.99
                    except IndexError:
                        print("Values missing for hand", hand_no)

            # copy the arrays before appending: they are reused across frames,
            # so appending them directly would make every list entry alias the
            # values of the last processed frame
            frame_pose = Pose(pose_keypoints.copy(), -1)
            counter += 1
            poses_list.append(frame_pose)
            kptscores_list.append(keypoints_scores.copy())

        # keep only the most recent frames_total frames
        if counter > frames_total:
            for cnt in range(counter - frames_total):
                poses_list.pop(0)
                kptscores_list.pop(0)
            counter = frames_total
        if counter > 0:
            skeleton_seq = pose2numpy(counter, poses_list, kptscores_list, 3)
            train_skeleton_data[s_number, :, :, :, :] = skeleton_seq

np.save('pkl_results/train_joints.npy', train_skeleton_data)

# The same code is repeated for the validation data set
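
As a quick sanity check on the saved arrays, this minimal sketch (using the shapes and path from above) looks for degenerate inputs, e.g. all-zero samples or samples whose frames never change over time, which would explain chance-level accuracy:

import numpy as np

data = np.load('pkl_results/train_joints.npy')  # expected shape (N, 3, 150, 46, 1)
print(data.shape, data.dtype)

# samples that are entirely zero (no poses were extracted for them)
print('all-zero samples:', int((np.abs(data).sum(axis=(1, 2, 3, 4)) == 0).sum()))

# samples whose frames are all identical over time (a symptom of appending
# shared, mutated arrays instead of copies)
constant = [i for i in range(data.shape[0])
            if np.all(data[i, :, 1:, :, :] == data[i, :, :-1, :, :])]
print('constant-over-time samples:', len(constant))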

I also added a custom dataset_name and graph type to the spatio_temporal_gcn_learner.py and continual_stgcn_learner.py.
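
For reference, this is roughly how I call the learner (a sketch only: the import paths follow the OpenDR skeleton_based_action_recognition docs, while the hyper-parameter values and the dataset/experiment names are placeholders, not my exact ones):

from opendr.engine.datasets import ExternalDataset
from opendr.perception.skeleton_based_action_recognition import SpatioTemporalGCNLearner

learner = SpatioTemporalGCNLearner(
    temp_path='./temp', batch_size=64, epochs=50,
    dataset_name='custom',            # the custom dataset name I added
    graph_type='custom',              # the custom graph I added (46 joints)
    experiment_name='stgcn_custom', method_name='stgcn')

training_dataset = ExternalDataset(path='./pkl_results', dataset_type='custom')
validation_dataset = ExternalDataset(path='./pkl_results', dataset_type='custom')

learner.fit(dataset=training_dataset, val_dataset=validation_dataset,
            train_data_filename='train_joints.npy',
            train_labels_filename='train_labels.pkl',
            val_data_filename='val_joints.npy',
            val_labels_filename='val_labels.pkl')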

My main question is: why does the accuracy stay stuck at 0.333 with the STGCN model no matter how I change the different hyper-parameters, and why does the loss stay at 1.1 with the CoSTGCN? (An accuracy of 0.333 together with a loss of about 1.1 ≈ ln 3 is exactly what a model predicting uniformly over three classes would produce, so the network seems to learn nothing from the input.)

I apologize in advance for the poorly edited post and any confusion the code above might generate.