I just found the solution. Just use some transformation matrix in the function normalizeData_face in the file normalization_example.py for the landmark annotations.
Could you please elaborate on that? What do you mean by "some transformation matrix"? As far as I know, the 224 and 448 datasets don't include any transformation matrices. I would also like to get the eye crops; the option I have now is to run landmark detection on the normalized face patches and then crop them.
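Roughly, the fallback I have in mind is something like this; a minimal sketch, assuming dlib's 68-point shape predictor is available (the model path is a placeholder and eye_crops is a hypothetical helper):

import cv2
import dlib
import numpy as np

detector = dlib.get_frontal_face_detector()
predictor = dlib.shape_predictor("shape_predictor_68_face_landmarks.dat")  # placeholder path

def eye_crops(face_patch):
    """Detect 68 landmarks on a normalized face patch and cut boxes around the eyes."""
    gray = cv2.cvtColor(face_patch, cv2.COLOR_BGR2GRAY)
    rects = detector(gray, 1)
    if not rects:
        return None, None
    pts = np.array([(p.x, p.y) for p in predictor(gray, rects[0]).parts()])

    def crop(i0, i1):  # i0, i1: eye-corner indices in the 68-point convention
        (x1, y1), (x2, y2) = pts[i0], pts[i1]
        cx, cy = int((x1 + x2) / 2), int((y1 + y2) / 2)
        r = int(0.75 * abs(x2 - x1))  # box half-size from the corner distance
        return face_patch[max(cy - r, 0):cy + r, max(cx - r, 0):cx + r]

    return crop(36, 39), crop(42, 45)  # left and right eye corners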
Hi @AbdouMechraoui. I've just written some code that leverages parts of normalization_example.py to recover the landmarks in the normalized face patch, and then crops the eyes from it.
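The key step is reusing the same homography W that normalizeData_face builds for warping the face image, and applying it to the raw-frame landmarks as well. A minimal sketch of that step alone (warp_landmarks is a hypothetical helper; W is computed inside the normalization code as cam_norm @ S @ R @ inv(cam)):

import cv2
import numpy as np

def warp_landmarks(landmarks, W):
    """Map (N, 2) raw-frame landmarks into the normalized face patch."""
    pts = landmarks.reshape(-1, 1, 2).astype(np.float64)
    return cv2.perspectiveTransform(pts, W).reshape(-1, 2)

Full code: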
import numpy as np
import h5py
import torch
from torch.utils.data import Dataset, DataLoader
import os
import json
import random
from typing import List
import cv2
import pandas as pd
import albumentations as A
import imgaug.augmenters as iaa
import math

# Photometric and mild geometric augmentation applied to each eye patch.
transformer = A.Compose(
    [
        A.ColorJitter(brightness=0.35, contrast=0.5, saturation=0.5, hue=0.2, always_apply=False, p=0.7),
        A.ShiftScaleRotate(shift_limit=0.1, scale_limit=(-0.1, 0.1), rotate_limit=5, interpolation=1, border_mode=1, always_apply=False, p=1),
    ]
)

# Stronger corruption-style augmentation.
strongseq = iaa.Sequential([
    iaa.CoarseDropout((0.1, 0.15), size_percent=(0.02, 0.03)),
    iaa.JpegCompression(compression=(20, 97)),
    iaa.CoarsePepper(0.05, size_percent=(0.01, 0.1)),
])
def get_train_loader(data_dir,
                     batch_size,
                     num_workers=4,
                     is_shuffle=True):
    # Load the dataset split file.
    refer_list_file = f'{data_dir}/raw/data/train_test_split.json'
    print('load the train file list from: ', refer_list_file)
    with open(refer_list_file, 'r') as f:
        datastore = json.load(f)

    # There are three subsets in the ETH-XGaze dataset: train, test and test_person_specific.
    # train: the training set, with data from 80 participants
    # test: the test set for cross-dataset and within-dataset evaluations
    # test_person_specific: the evaluation subset for the person-specific setting
    sub_folder_use = 'train'
    train_set = GazeDataset(dataset_path=data_dir, keys_to_use=datastore[sub_folder_use], sub_folder=sub_folder_use,
                            transform=None, is_shuffle=is_shuffle, is_load_label=True)
    train_loader = DataLoader(train_set, batch_size=batch_size, num_workers=num_workers)
    return train_loader
def get_test_loader(data_dir,
                    batch_size,
                    num_workers=4,
                    is_shuffle=True):
    # Load the dataset split file (see get_train_loader for the subset description).
    refer_list_file = f'{data_dir}/raw/data/train_test_split.json'
    print('load the test file list from: ', refer_list_file)
    with open(refer_list_file, 'r') as f:
        datastore = json.load(f)
    sub_folder_use = 'test'
    test_set = GazeDataset(dataset_path=data_dir, keys_to_use=datastore[sub_folder_use], sub_folder=sub_folder_use,
                           transform=None, is_shuffle=is_shuffle, is_load_label=False)
    test_loader = DataLoader(test_set, batch_size=batch_size, num_workers=num_workers)
    return test_loader
def get_val_loader(data_dir,
                   batch_size,
                   num_workers=4,
                   is_shuffle=True):
    # Load the dataset split file (see get_train_loader for the subset description).
    refer_list_file = f'{data_dir}/raw/data/train_test_split.json'
    print('load the val file list from: ', refer_list_file)
    with open(refer_list_file, 'r') as f:
        datastore = json.load(f)
    sub_folder_use = 'val'
    val_set = GazeDataset(dataset_path=data_dir, keys_to_use=datastore[sub_folder_use], sub_folder=sub_folder_use,
                          transform=None, is_shuffle=is_shuffle, is_load_label=True, augmentation=False)
    val_loader = DataLoader(val_set, batch_size=batch_size, num_workers=num_workers)
    return val_loader
class GazeDataset(Dataset):
    EYE_PATCH_SIZE = (60, 60)

    def __init__(self, dataset_path: str, keys_to_use: List[str] = None, sub_folder='', transform=None, is_shuffle=True,
                 index_file=None, is_load_label=True, augmentation=True):
        self.path = dataset_path
        self.hdfs = {}
        self.sub_folder = sub_folder
        self.is_load_label = is_load_label
        self.augmentation = augmentation

        # Select keys
        # TODO: select only people with sufficient entries?
        self.selected_keys = [k for k in keys_to_use]
        assert len(self.selected_keys) > 0

        for num_i in range(0, len(self.selected_keys)):
            file_path = os.path.join(self.path, self.sub_folder, self.selected_keys[num_i])
            self.hdfs[num_i] = h5py.File(file_path, 'r', swmr=True)
            assert self.hdfs[num_i].swmr_mode

        # Construct a mapping from the full-data index to (key, person-specific index).
        if index_file is None:
            self.idx_to_kv = []
            for num_i in range(0, len(self.selected_keys)):
                n = self.hdfs[num_i]["face_patch"].shape[0]
                self.idx_to_kv += [(num_i, i) for i in range(n)]
        else:
            print('load the file: ', index_file)
            self.idx_to_kv = np.loadtxt(index_file, dtype=int)  # np.int is deprecated; use the builtin

        for num_i in range(0, len(self.hdfs)):
            if self.hdfs[num_i]:
                self.hdfs[num_i].close()
                self.hdfs[num_i] = None

        if is_shuffle:
            random.shuffle(self.idx_to_kv)  # shuffle the order to stabilize training

        self.hdf = None
        self.transform = transform

        # Added: load the raw-data annotations (landmarks, head pose, gaze targets) and calibration.
        self.__load_annotation(annotation_dir=f'{self.path}/raw')
        self.hashtable = {}

    def __load_annotation(self, annotation_dir):
        # Note: distortion correction is disabled here; it costs too much time
        # and the face is always near the center of the image.
        # Load the per-camera calibration.
        self.camera_matrix = []
        self.camera_distortion = []
        self.cam_translation = []
        self.cam_rotation = []
        self.annotation_dir = annotation_dir
        print('Load the camera parameters')
        for cam_id in range(0, 18):
            file_name = f'{self.annotation_dir}/calibration/cam_calibration/cam{cam_id:02d}.xml'
            fs = cv2.FileStorage(file_name, cv2.FILE_STORAGE_READ)
            self.camera_matrix.append(fs.getNode('Camera_Matrix').mat())
            self.camera_distortion.append(fs.getNode('Distortion_Coefficients').mat())  # loaded but not applied
            self.cam_translation.append(fs.getNode('cam_translation').mat())
            self.cam_rotation.append(fs.getNode('cam_rotation').mat())
            fs.release()

        # Load the generic 3D face model and keep the six landmarks used for
        # normalization (four eye corners and two mouth corners).
        face_model_load = np.loadtxt(f'{self.annotation_dir}/calibration/face_model.txt')
        landmark_use = [20, 23, 26, 29, 15, 19]
        self.face_model = face_model_load[landmark_use, :]

    def __normalizeData_face(self, face_model, landmarks, hr, ht, gc, cam):
        # Normalized camera parameters.
        focal_norm = 960      # focal length of the normalized camera
        distance_norm = 300   # normalized distance between face and camera
        roiSize = (448, 448)  # size of the cropped face image

        # Compute the estimated 3D positions of the landmarks.
        ht = ht.reshape((3, 1))
        gc = gc.reshape((3, 1))
        hR = cv2.Rodrigues(hr)[0]  # rotation matrix
        Fc = np.dot(hR, face_model.T) + ht
        two_eye_center = np.mean(Fc[:, 0:4], axis=1).reshape((3, 1))
        mouth_center = np.mean(Fc[:, 4:6], axis=1).reshape((3, 1))
        face_center = np.mean(np.concatenate((two_eye_center, mouth_center), axis=1), axis=1).reshape((3, 1))

        # ---------- normalize the image ----------
        distance = np.linalg.norm(face_center)  # actual distance between face and original camera
        z_scale = distance_norm / distance
        cam_norm = np.array([
            [focal_norm, 0, roiSize[0] / 2],
            [0, focal_norm, roiSize[1] / 2],
            [0, 0, 1.0],
        ])
        S = np.array([  # scaling matrix
            [1.0, 0.0, 0.0],
            [0.0, 1.0, 0.0],
            [0.0, 0.0, z_scale],
        ])
        hRx = hR[:, 0]
        forward = (face_center / distance).reshape(3)
        down = np.cross(forward, hRx)
        down /= np.linalg.norm(down)
        right = np.cross(down, forward)
        right /= np.linalg.norm(right)
        R = np.c_[right, down, forward].T  # rotation matrix R
        W = np.dot(np.dot(cam_norm, S), np.dot(R, np.linalg.inv(cam)))  # transformation matrix

        # ---------- normalize the rotation ----------
        hR_norm = np.dot(R, hR)  # rotation matrix in the normalized space
        hr_norm = cv2.Rodrigues(hR_norm)[0]  # convert the rotation matrix to a rotation vector

        # ---------- normalize the gaze vector ----------
        gc_normalized = gc - face_center  # gaze vector
        gc_normalized = np.dot(R, gc_normalized)
        gc_normalized = gc_normalized / np.linalg.norm(gc_normalized)

        # Warp the facial landmarks with the same transformation matrix W.
        num_point, num_axis = landmarks.shape
        det_point = landmarks.reshape([num_point, 1, num_axis])
        det_point_warped = cv2.perspectiveTransform(det_point, W)
        det_point_warped = det_point_warped.reshape(num_point, num_axis)

        head = hr_norm.reshape(1, 3)
        M = cv2.Rodrigues(head)[0]
        Zv = M[:, 2]
        head_2d = np.array([math.asin(Zv[1]), math.atan2(Zv[0], Zv[2]), 0.0])  # pitch, yaw, with roll fixed to 0
        return head_2d, gc_normalized, det_point_warped, R
    def __len__(self):
        return len(self.idx_to_kv)

    def __del__(self):
        for num_i in range(0, len(self.hdfs)):
            if self.hdfs[num_i]:
                self.hdfs[num_i].close()
                self.hdfs[num_i] = None

    def __segment_eye(self, image, lmks, eye='left', ow=64, oh=64):
        if eye == 'left':
            # Left eye corners (68-point convention).
            x1, y1 = lmks[36]
            x2, y2 = lmks[39]
        else:
            # Right eye corners.
            x1, y1 = lmks[42]
            x2, y2 = lmks[45]
        eye_width = 1.5 * np.linalg.norm([x1 - x2, y1 - y2])  # distance between the corners, with margin
        cx, cy = 0.5 * (x1 + x2), 0.5 * (y1 + y2)

        # Center the image on the middle of the eye.
        translate_mat = np.eye(3)
        translate_mat[:2, 2] = [-cx, -cy]

        # Scale so that the eye fills the output width.
        scale = ow / (eye_width + 1e-6)
        scale_mat = np.eye(3)
        scale_mat[0, 0] = scale_mat[1, 1] = scale

        # Re-center on the output patch.
        center_mat = np.eye(3)
        center_mat[:2, 2] = [0.5 * ow, 0.5 * oh]

        # Compose the transforms and crop the eye patch.
        transform_mat = center_mat @ scale_mat @ translate_mat
        eye_image = cv2.warpAffine(image, transform_mat[:2, :], (ow, oh))  # dsize is (width, height)
        return eye_image

    def __crop_eye(self, face, fid, key, idx):
        cam_id = fid["cam_index"][idx][0] - 1
        key_tmp = key.replace(".h5", "")
        df = pd.read_csv(f"{self.annotation_dir}/data/annotation_{self.sub_folder}/{key_tmp}.csv", header=None)
        line = df.loc[idx, :].to_numpy()
        lmks = line[13:149].reshape(-1, 2).astype(float)  # 68 landmarks in the original frame
        gaze_label_3d = np.array([float(line[4]), float(line[5]), float(line[6])]).reshape(3, 1)  # gaze point in the screen coordinate system
        hr = np.array([float(line[7]), float(line[8]), float(line[9])]).reshape(3, 1)
        ht = np.array([float(line[10]), float(line[11]), float(line[12])]).reshape(3, 1)
        head_2D, gaze_norm, landmark_norm, mat_norm_face = \
            self.__normalizeData_face(self.face_model, lmks, hr, ht, gaze_label_3d, self.camera_matrix[cam_id])
        landmark_norm = landmark_norm / 2  # 448-size face --> 224-size face
        left_eye = self.__segment_eye(face, landmark_norm, eye='left', ow=self.EYE_PATCH_SIZE[1], oh=self.EYE_PATCH_SIZE[0])
        right_eye = self.__segment_eye(face, landmark_norm, eye='right', ow=self.EYE_PATCH_SIZE[1], oh=self.EYE_PATCH_SIZE[0])
        return left_eye, right_eye, head_2D

    def __preprocess(self, eye):
        # Resize, scale pixel values to [-1, 1], and move channels first.
        eye = cv2.resize(eye, self.EYE_PATCH_SIZE)
        eye = eye.astype(np.float32)
        eye *= 2.0 / 255.0
        eye -= 1.0
        eye = np.transpose(eye, (2, 0, 1))
        return eye

    def __getitem__(self, idx):
        key, idx = self.idx_to_kv[idx]

        # Cache the opened HDF5 file per subject.
        if self.selected_keys[key] not in self.hashtable:
            self.hdf = h5py.File(os.path.join(self.path, self.sub_folder, self.selected_keys[key]), 'r', swmr=True)
            self.hashtable[self.selected_keys[key]] = self.hdf
        else:
            self.hdf = self.hashtable[self.selected_keys[key]]
        assert self.hdf.swmr_mode

        # Get the face image and crop the eyes.
        image = self.hdf['face_patch'][idx, :]
        left_eye, right_eye, head_2D = self.__crop_eye(image, self.hdf, self.selected_keys[key], idx)

        # Augmentation.
        if self.augmentation:
            left_eye = transformer(image=left_eye)["image"]
            left_eye = strongseq(image=left_eye)
            right_eye = transformer(image=right_eye)["image"]
            right_eye = strongseq(image=right_eye)

        left_eye = self.__preprocess(left_eye)
        right_eye = self.__preprocess(right_eye)

        # Get labels.
        if self.is_load_label:
            gaze_label = self.hdf['face_gaze'][idx, :]
            gaze_label = gaze_label.astype('float')
            gaze_label[0] = -gaze_label[0]
            headpose_label = self.hdf['face_head_pose'][idx, :]
            headpose_label = -headpose_label
            headpose_label = np.append(headpose_label, 0.0)
            headpose_label = headpose_label[[1, 0, 2]]  # yaw, pitch, roll
            return torch.FloatTensor(left_eye), torch.FloatTensor(right_eye), torch.FloatTensor(headpose_label), torch.FloatTensor(gaze_label)
        else:
            return left_eye, right_eye
def draw_gaze(image_in, eye_pos, pitchyaw, length=15.0, thickness=2, color=(0, 0, 255)):
    """Draw the gaze angle on the given image at the given eye position."""
    image_out = image_in
    if len(image_out.shape) == 2 or image_out.shape[2] == 1:
        image_out = cv2.cvtColor(image_out, cv2.COLOR_GRAY2BGR)
    dx = -length * np.sin(pitchyaw[1])
    dy = length * np.sin(pitchyaw[0])
    cv2.arrowedLine(image_out, tuple(np.round(eye_pos).astype(np.int32)),
                    tuple(np.round([eye_pos[0] + dx, eye_pos[1] + dy]).astype(int)), color,
                    thickness, cv2.LINE_AA, tipLength=0.2)
    return image_out
def draw_headpose(img, pitch, yaw, roll, size=100):
    """Draw the three projected head-pose axes (angles given in degrees)."""
    from math import cos, sin
    pitch = pitch * np.pi / 180
    yaw = -(yaw * np.pi / 180)
    roll = roll * np.pi / 180
    tdx = 50
    tdy = 50
    x1 = size * (cos(yaw) * cos(roll)) + tdx
    y1 = size * (cos(pitch) * sin(roll) + cos(roll) * sin(pitch) * sin(yaw)) + tdy
    x2 = size * (-cos(yaw) * sin(roll)) + tdx
    y2 = size * (cos(pitch) * cos(roll) - sin(pitch) * sin(yaw) * sin(roll)) + tdy
    # Z axis (out of the screen).
    x3 = size * (sin(yaw)) + tdx
    y3 = size * (-cos(yaw) * sin(pitch)) + tdy
    cv2.line(img, (int(tdx), int(tdy)), (int(x1), int(y1)), (255, 0, 0), 3)
    cv2.line(img, (int(tdx), int(tdy)), (int(x2), int(y2)), (0, 255, 0), 3)
    cv2.line(img, (int(tdx), int(tdy)), (int(x3), int(y3)), (0, 0, 255), 2)
    return img
if __name__ == "__main__":
test_loader = get_train_loader(data_dir="/media/vuthede/vuthede_hdd/vuthede/data/xgaze_224", batch_size=1, is_shuffle=True)
print(f'Len dataset :{len(test_loader)}')
for databatch in test_loader:
left, right, headpose, gaze = databatch
# print(left.shape, right.shape, headpose.shape, gaze.shape)
img = left.numpy()[0]
img1 = right.numpy()[0]
gaze = gaze.numpy()[0]
headpose = headpose.numpy()[0] * 180.0/3.14
print(gaze, headpose)
img = ((np.transpose(img, (1,2,0)) + 1)*128).astype(np.uint8)
img1 = ((np.transpose(img1, (1,2,0)) + 1)*128).astype(np.uint8)
img = np.ascontiguousarray(img)
img1 = np.ascontiguousarray(img1)
arrowLength = 75
img = draw_gaze(img, (30, 30), gaze[:2], length=15)
img1 = draw_gaze(img1, (30, 30), gaze[:2], length=15)
concat = np.hstack([img, img1])
concat = cv2.resize(concat, (320*2, 320))
concat = draw_headpose(concat, -headpose[1], -headpose[0], 0, 100)
cv2.imshow("Image", concat)
k =cv2.waitKey(0)
if k==27:
break
cv2.destroyAllWindows()
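Note that the code above assumes a particular layout under data_dir: the HDF5 face patches in train/ (and test/, val/) next to a raw/ folder holding the separately downloaded annotations and calibration files. A quick sanity check of those inferred paths (the root path is a placeholder):

import os

data_dir = "/path/to/xgaze_224"  # placeholder root
expected = [
    "raw/data/train_test_split.json",             # split file read by the loaders
    "raw/data/annotation_train",                  # per-subject CSV annotations
    "raw/calibration/face_model.txt",             # generic 3D face model
    "raw/calibration/cam_calibration/cam00.xml",  # per-camera calibration
    "train",                                      # HDF5 face patches
]
for rel in expected:
    path = os.path.join(data_dir, rel)
    print("OK " if os.path.exists(path) else "MISSING", path)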
@vuthede thanks a lot for sharing your code :) I see what you did, but it looks like you're using the raw data: the transformation matrices, the 3D gaze target points, and the landmark annotations. Is there a way to get just these annotations out of the 7TB file, or did you have to download the full dataset?
@AbdouMechraoui I did not use the raw data. I used the xgaze_224 version together with the annotations of the raw data (i.e., I just downloaded the annotations; I did not download the 7TB). :D
My bad! I thought the raw data link was a direct download link for the full raw dataset, hence the confusion.
Hi everyone, I wonder if there is any way to get the eye patches given that we have xgaze_224 (the cropped-face version) and annotations (landmarks in the original frames), because the full-frame version is 7TB, which would take a long time to download. Thanks, and I'd appreciate any help.