ipazc / mtcnn

MTCNN face detection implementation for TensorFlow, as a PIP package.
MIT License
2.23k stars 527 forks source link

PyTorch : unable to load the dataset #119

Closed ritvikagrawal1 closed 2 years ago

ritvikagrawal1 commented 2 years ago
#coding=utf-8
import pdb
import os, sys, random
import numpy as np
import torch
import torch.nn.functional as F
import torch.utils.data as data
from PIL import Image
from mtcnn import MTCNN
detector = MTCNN()
import operator
import cv2
## data generator for afew
class VideoDataset(data.Dataset):
    def __init__(self, video_root, video_list, rectify_label=None, transform=None, csv = False):

        self.imgs_first, self.index = load_imgs_total_frame(video_root, video_list, rectify_label)
        self.transform = transform

    def __getitem__(self, index):

        path_first, target_first = self.imgs_first[index]
        img_cv2 = np.array(Image.open(path_first).convert("RGB"))
        face=detector.detect_faces(img_cv2)
        face.sort(key=operator.itemgetter('confidence'),reverse=True)
        start_c=face[0]['keypoints']['left_eye']
        end_c=face[0]['keypoints']['right_eye']
        n_p=face[0]['keypoints']['nose']
        y_s=max(0,min(start_c[1],end_c[1])-(n_p[1]-start_c[1]))
        y_s_e=min(max(start_c[1],end_c[1])+(n_p[1]-start_c[1])//2,img_cv2.shape[0])
        x_s=max(0,start_c[0]-(end_c[0]-start_c[0])//2)
        x_s_e=min(end_c[0]+(end_c[0]-start_c[0])//2,img_cv2.shape[1])
        img_first=Image.fromarray(img_cv2[y_s:y_s_e,x_s:x_s_e,:]).convert("RGB")

        if self.transform is not None:
            img_first = self.transform(img_first)

        return img_first, target_first, self.index[index]

    def __len__(self):
        return len(self.imgs_first)

# 
class TripleImageDataset(data.Dataset):
    def __init__(self, video_root, video_list, rectify_label=None, transform=None):

        self.imgs_first, self.imgs_second, self.imgs_third, self.index = load_imgs_tsn(video_root, video_list,
                                                                                           rectify_label)
        self.transform = transform

    def __getitem__(self, index):

        path_first, target_first = self.imgs_first[index]

        img_cv2_f = np.array(Image.open(path_first).convert("RGB"))
        face=detector.detect_faces(img_cv2_f)
        face.sort(key=operator.itemgetter('confidence'),reverse=True)
        start_c=face[0]['keypoints']['left_eye']
        end_c=face[0]['keypoints']['right_eye']
        n_p=face[0]['keypoints']['nose']
        y_s=max(0,min(start_c[1],end_c[1])-(n_p[1]-start_c[1]))
        y_s_e=min(max(start_c[1],end_c[1])+(n_p[1]-start_c[1])//2,img_cv2_f.shape[0])
        x_s=max(0,start_c[0]-(end_c[0]-start_c[0])//2)
        x_s_e=min(end_c[0]+(end_c[0]-start_c[0])//2,img_cv2_f.shape[1])
        img_first=Image.fromarray(img_cv2_f[y_s:y_s_e,x_s:x_s_e,:]).convert("RGB")

        if self.transform is not None:
            img_first = self.transform(img_first)

        path_second, target_second = self.imgs_second[index]

        img_cv2_s = np.array(Image.open(path_second).convert("RGB"))
        face=detector.detect_faces(img_cv2_s)
        face.sort(key=operator.itemgetter('confidence'),reverse=True)
        start_c=face[0]['keypoints']['left_eye']
        end_c=face[0]['keypoints']['right_eye']
        n_p=face[0]['keypoints']['nose']
        y_s=max(0,min(start_c[1],end_c[1])-(n_p[1]-start_c[1]))
        y_s_e=min(max(start_c[1],end_c[1])+(n_p[1]-start_c[1])//2,img_cv2_s.shape[0])
        x_s=max(0,start_c[0]-(end_c[0]-start_c[0])//2)
        x_s_e=min(end_c[0]+(end_c[0]-start_c[0])//2,img_cv2_s.shape[1])
        img_second=Image.fromarray(img_cv2_s[y_s:y_s_e,x_s:x_s_e,:]).convert("RGB")

        if self.transform is not None:
            img_second = self.transform(img_second)

        path_third, target_third = self.imgs_third[index]

        img_cv2_t = np.array(Image.open(path_third).convert("RGB"))
        face=detector.detect_faces(img_cv2_t)
        face.sort(key=operator.itemgetter('confidence'),reverse=True)
        start_c=face[0]['keypoints']['left_eye']
        end_c=face[0]['keypoints']['right_eye']
        n_p=face[0]['keypoints']['nose']
        y_s=max(0,min(start_c[1],end_c[1])-(n_p[1]-start_c[1]))
        y_s_e=min(max(start_c[1],end_c[1])+(n_p[1]-start_c[1])//2,img_cv2_t.shape[0])
        x_s=max(0,start_c[0]-(end_c[0]-start_c[0])//2)
        x_s_e=min(end_c[0]+(end_c[0]-start_c[0])//2,img_cv2_t.shape[1])
        img_third=Image.fromarray(img_cv2_t[y_s:y_s_e,x_s:x_s_e,:]).convert("RGB")

        if self.transform is not None:
            img_third = self.transform(img_third)
        return img_first, img_second, img_third, target_first, self.index[index]

    def __len__(self):
        return len(self.imgs_first)

def load_imgs_tsn(video_root, video_list, rectify_label):
    imgs_first = list()
    imgs_second = list()
    imgs_third = list()

    with open(video_list, 'r') as imf:
        index = []
        for id, line in enumerate(imf):

            video_label = line.strip().split()

            video_name = video_label[0]  # name of video
            label = rectify_label[video_label[1]]  # label of video

            video_path = os.path.join(video_root, video_name)  # video_path is the path of each video
            ###  for sampling triple imgs in the single video_path  ####

            img_lists = os.listdir(video_path)
            img_lists.sort()  # sort files by ascending
            img_count = len(img_lists)  # number of frames in video
            num_per_part = int(img_count) // 3

            if int(img_count) > 3:
                for i in range(img_count):

                    random_select_first = random.randint(0, num_per_part)
                    random_select_second = random.randint(num_per_part, num_per_part * 2)
                    random_select_third = random.randint(2 * num_per_part, len(img_lists) - 1)

                    img_path_first = os.path.join(video_path, img_lists[random_select_first])
                    img_path_second = os.path.join(video_path, img_lists[random_select_second])
                    img_path_third = os.path.join(video_path, img_lists[random_select_third])

                    imgs_first.append((img_path_first, label))
                    imgs_second.append((img_path_second, label))
                    imgs_third.append((img_path_third, label))

            else:
                for j in range(len(img_lists)):
                    img_path_first = os.path.join(video_path, img_lists[j])
                    img_path_second = os.path.join(video_path, random.choice(img_lists))
                    img_path_third = os.path.join(video_path, random.choice(img_lists))

                    imgs_first.append((img_path_first, label))
                    imgs_second.append((img_path_second, label))
                    imgs_third.append((img_path_third, label))

            ###  return video frame index  #####
            index.append(np.ones(img_count) * id)  # id: 0 : 379
        index = np.concatenate(index, axis=0)
        # index = index.astype(int)
    return imgs_first, imgs_second, imgs_third, index

def load_imgs_total_frame(video_root, video_list, rectify_label):
    imgs_first = list()

    with open(video_list, 'r') as imf:
        index = []
        video_names = []
        for id, line in enumerate(imf):

            video_label = line.strip().split()

            video_name = video_label[0]  # name of video
            label = rectify_label[video_label[1]]  # label of video

            video_path = os.path.join(video_root, video_name)  # video_path is the path of each video
            ###  for sampling triple imgs in the single video_path  ####

            img_lists = os.listdir(video_path)
            img_lists.sort()  # sort files by ascending
            img_count = len(img_lists)  # number of frames in video

            for frame in img_lists:
                # pdb.set_trace()
                imgs_first.append((os.path.join(video_path, frame), label))
            ###  return video frame index  #####
            video_names.append(video_name)
            index.append(np.ones(img_count) * id)
        index = np.concatenate(index, axis=0)
        # index = index.astype(int)
    return imgs_first, index

## data generator for ck_plus
class TenFold_VideoDataset(data.Dataset):
    def __init__(self, video_root='', video_list='', rectify_label=None, transform=None, fold=1, run_type='train'):
        self.imgs_first, self.index = load_imgs_tenfold_totalframe(video_root, video_list, rectify_label, fold, run_type)

        self.transform = transform
        self.video_root = video_root

    def __getitem__(self, index):

        path_first, target_first = self.imgs_first[index]

        img_cv2 = np.array(Image.open(path_first).convert("RGB"))
        face=detector.detect_faces(img_cv2)
        face.sort(key=operator.itemgetter('confidence'),reverse=True)
        start_c=face[0]['keypoints']['left_eye']
        end_c=face[0]['keypoints']['right_eye']
        n_p=face[0]['keypoints']['nose']
        y_s=max(0,min(start_c[1],end_c[1])-(n_p[1]-start_c[1]))
        y_s_e=min(max(start_c[1],end_c[1])+(n_p[1]-start_c[1])//2,img_cv2.shape[0])
        x_s=max(0,start_c[0]-(end_c[0]-start_c[0])//2)
        x_s_e=min(end_c[0]+(end_c[0]-start_c[0])//2,img_cv2.shape[1])
        img_first=Image.fromarray(img_cv2[y_s:y_s_e,x_s:x_s_e,:]).convert("RGB")

        if self.transform is not None:
            img_first = self.transform(img_first)

        return img_first, target_first, self.index[index]

    def __len__(self):
        return len(self.imgs_first)

class TenFold_TripleImageDataset(data.Dataset):
    def __init__(self, video_root='', video_list='', rectify_label=None, transform=None, fold=1, run_type='train'):

        self.imgs_first, self.imgs_second, self.imgs_third, self.index = load_imgs_tsn_tenfold(video_root,video_list,rectify_label, fold, run_type)

        self.transform = transform
        self.video_root = video_root

    def __getitem__(self, index):
        path_first, target_first = self.imgs_first[index]

        img_cv2_f = np.array(Image.open(path_first).convert("RGB"))
        face=detector.detect_faces(img_cv2_f)
        face.sort(key=operator.itemgetter('confidence'),reverse=True)
        start_c=face[0]['keypoints']['left_eye']
        end_c=face[0]['keypoints']['right_eye']
        n_p=face[0]['keypoints']['nose']
        y_s=max(0,min(start_c[1],end_c[1])-(n_p[1]-start_c[1]))
        y_s_e=min(max(start_c[1],end_c[1])+(n_p[1]-start_c[1])//2,img_cv2_f.shape[0])
        x_s=max(0,start_c[0]-(end_c[0]-start_c[0])//2)
        x_s_e=min(end_c[0]+(end_c[0]-start_c[0])//2,img_cv2_f.shape[1])
        img_first=Image.fromarray(img_cv2_f[y_s:y_s_e,x_s:x_s_e,:]).convert("RGB")

        if self.transform is not None:
            img_first = self.transform(img_first)

        path_second, target_second = self.imgs_second[index]
        img_cv2_s = np.array(Image.open(path_second).convert("RGB"))
        face=detector.detect_faces(img_cv2_s)
        face.sort(key=operator.itemgetter('confidence'),reverse=True)
        start_c=face[0]['keypoints']['left_eye']
        end_c=face[0]['keypoints']['right_eye']
        n_p=face[0]['keypoints']['nose']
        y_s=max(0,min(start_c[1],end_c[1])-(n_p[1]-start_c[1]))
        y_s_e=min(max(start_c[1],end_c[1])+(n_p[1]-start_c[1])//2,img_cv2_s.shape[0])
        x_s=max(0,start_c[0]-(end_c[0]-start_c[0])//2)
        x_s_e=min(end_c[0]+(end_c[0]-start_c[0])//2,img_cv2_s.shape[1])
        img_second=Image.fromarray(img_cv2_s[y_s:y_s_e,x_s:x_s_e,:]).convert("RGB")

        if self.transform is not None:
            img_second = self.transform(img_second)

        path_third, target_third = self.imgs_third[index]

        img_cv2_t = np.array(Image.open(path_third).convert("RGB"))
        face=detector.detect_faces(img_cv2_t)
        face.sort(key=operator.itemgetter('confidence'),reverse=True)
        start_c=face[0]['keypoints']['left_eye']
        end_c=face[0]['keypoints']['right_eye']
        n_p=face[0]['keypoints']['nose']
        y_s=max(0,min(start_c[1],end_c[1])-(n_p[1]-start_c[1]))
        y_s_e=min(max(start_c[1],end_c[1])+(n_p[1]-start_c[1])//2,img_cv2_t.shape[0])
        x_s=max(0,start_c[0]-(end_c[0]-start_c[0])//2)
        x_s_e=min(end_c[0]+(end_c[0]-start_c[0])//2,img_cv2_t.shape[1])
        img_third=Image.fromarray(img_cv2_t[y_s:y_s_e,x_s:x_s_e,:]).convert("RGB")

        if self.transform is not None:
            img_third = self.transform(img_third)

        return img_first, img_second, img_third, target_first, self.index[index]

    def __len__(self):
        return len(self.imgs_first)

def load_imgs_tenfold_totalframe(video_root, video_list, rectify_label, fold, run_type):
    imgs_first = list()
    new_imf = list()

    ''' Make ten-fold list '''
    with open(video_list, 'r') as imf:
        imf = imf.readlines()
    if run_type == 'train':
        fold_ = list(range(1, 11))
        fold_.remove(fold)  # [1,2,3,4,5,6,7,8,9, 10] -> [2,3,4,5,6,7,8,9,10]

        for i in fold_:
            fold_str = str(i) + '-fold'  # 1-fold
            for index, item in enumerate(
                    imf):  # 0, '1-fold\t31\n' in {[0, '1-fold\t31\n'], [1, 'S037/006 Happy\n'], ...}
                if fold_str in item:  # 1-fold in '1-fold\t31\n'
                    for j in range(index + 1, index + int(item.split()[1]) + 1):  # (0 + 1, 0 + 31 + 1 )
                        new_imf.append(imf[j])  # imf[2] = 'S042/006 Happy\n'

    if run_type == 'test':
        fold_ = fold
        fold_str = str(fold_) + '-fold'
        for index, item in enumerate(imf):
            if fold_str in item:
                for j in range(index + 1, index + int(item.split()[1]) + 1):
                    new_imf.append(imf[j])

    index = []
    for id, line in enumerate(new_imf):

        video_label = line.strip().split()

        video_name = video_label[0]  # name of video
        try:
            label = rectify_label[video_label[1]]  # label of video
        except:
            pdb.set_trace()
        video_path = os.path.join(video_root, video_name)  # video_path is the path of each video
        ###  for sampling triple imgs in the single video_path  ####
        img_lists = os.listdir(video_path)
        img_lists.sort()  # sort files by ascending

        img_lists = img_lists[ - int(round(len(img_lists))) : ]

        img_count = len(img_lists)  # number of frames in video
        for frame in img_lists:
            imgs_first.append((os.path.join(video_path, frame), label))
        ###  return video frame index  #####
        index.append(np.ones(img_count) * id)

    index = np.concatenate(index, axis=0)
    return imgs_first, index

def load_imgs_tsn_tenfold(video_root, video_list, rectify_label, fold, run_type):
    imgs_first = list()
    imgs_second = list()
    imgs_third = list()
    new_imf = list()
    ''' Make ten-fold list '''
    with open(video_list, 'r') as imf:
        imf = imf.readlines()
    if run_type == 'train':
        fold_ = list(range(1, 11))
        fold_.remove(fold)  # [1,2,3,4,5,6,7,8,9,10] -> [2,3,4,5,6,7,8,9,10]
        for i in fold_:
            fold_str = str(i) + '-fold'  # 1-fold
            for index, item in enumerate(
                    imf):  # 0, '1-fold\t31\n' in {[0, '1-fold\t31\n'], [1, 'S037/006 Happy\n'], ...}
                if fold_str in item:  # 1-fold in '1-fold\t31\n'
                    for j in range(index + 1, index + int(item.split()[1]) + 1):  # (0 + 1, 0 + 31 + 1 )
                        new_imf.append(imf[j])  # imf[2] = 'S042/006 Happy\n'
    if run_type == 'test':
        fold_ = fold
        fold_str = str(fold_) + '-fold'
        for index, item in enumerate(imf):
            if fold_str in item:
                for j in range(index + 1, index + int(item.split()[1]) + 1):
                    new_imf.append(imf[j])
    ''' Make triple-image list '''
    index = []
    for id, line in enumerate(new_imf):
        video_label = line.strip().split()
        video_name = video_label[0]  # name of video
        label = rectify_label[video_label[1]]  # label of video
        video_path = os.path.join(video_root, video_name)  # video_path is the path of each video
        ###  for sampling triple imgs in the single video_path  ####
        img_lists = os.listdir(video_path)
        img_lists.sort()  # sort files by ascending
        img_lists = img_lists[ - int(round(len(img_lists))):]
        img_count = len(img_lists)  # number of frames in video
        num_per_part = int(img_count) // 5
        if int(img_count) > 5:
            for i in range(img_count):
                # pdb.set_trace()
                random_select_first = random.randint(0, num_per_part)
                random_select_second = random.randint(num_per_part, 2 * num_per_part)
                random_select_third = random.randint(2 * num_per_part, 3 * num_per_part)

                img_path_first = os.path.join(video_path, img_lists[random_select_first])
                img_path_second = os.path.join(video_path, img_lists[random_select_second])
                img_path_third = os.path.join(video_path, img_lists[random_select_third])

                imgs_first.append((img_path_first, label))
                imgs_second.append((img_path_second, label))
                imgs_third.append((img_path_third, label))

        else:
            for j in range(len(img_lists)):
                img_path_first = os.path.join(video_path, img_lists[j])
                img_path_second = os.path.join(video_path, random.choice(img_lists))
                img_path_third = os.path.join(video_path, random.choice(img_lists))

                imgs_first.append((img_path_first, label))
                imgs_second.append((img_path_second, label))
                imgs_third.append((img_path_third, label))

        ###  return video frame index  #####
        index.append(np.ones(img_count) * id)  # id: 0 : 379
    index = np.concatenate(index, axis=0)
    # index = index.astype(int)
    # pdb.set_trace()
    return imgs_first, imgs_second, imgs_third, index

Can you please point out the mistake ?

ladyshen commented 2 years ago

您的邮件已接收!

JustinShenk commented 2 years ago

No error is reported, so closing. In the future please be mindful of others and don't @ mention everyone like this.