cuiaiyu / dressing-in-order

(ICCV'21) Official code of "Dressing in Order: Recurrent Person Image Generation for Pose Transfer, Virtual Try-on and Outfit Editing" by Aiyu Cui, Daniel McKee and Svetlana Lazebnik
https://cuiaiyu.github.io/dressing-in-order

python generate_fashion_datasets.py creates empty directories #15

Closed: AhmedHashish123 closed this issue 2 years ago

AhmedHashish123 commented 2 years ago

Hi, when I run python generate_fashion_datasets.py, two empty directories called train and test are created, but no images are added inside them. I changed the data root to the folder I made. My folder contains these files:

I don't think I'm missing any files, so I don't know why generate_fashion_datasets.py doesn't add images to the two folders it creates.

AhmedHashish123 commented 2 years ago

I figured it out; there was a problem with the names.

AhmedHashish123 commented 2 years ago

Even after I corrected the names, the "train" and "test" folders are still empty.

cuiaiyu commented 2 years ago

I think it is probably still a file naming issue. Maybe you can set some breakpoints in the Python file to figure out which lines are causing the issue?
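
For example, a quick standalone check like the following (a sketch that assumes your data root is "data"; adjust as needed) would print the first few names from train.lst next to the first few image paths actually on disk, so you can compare them by eye:

import os

dataroot = "data"  # hypothetical: point this at your data root

# First few names the annotation file expects.
with open(os.path.join(dataroot, "train.lst")) as f:
    expected = [line.strip() for line in f if line.strip()][:3]
print("train.lst:", expected)

# First few image paths actually found on disk.
found = []
for root, _, fnames in os.walk(os.path.join(dataroot, "img_highres")):
    found += [os.path.join(root, f) for f in fnames if f.endswith(".jpg")]
    if len(found) >= 3:
        break
print("on disk:  ", found[:3])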

AhmedHashish123 commented 2 years ago

> I think it is probably still a file naming issue. Maybe you can set some breakpoints in the Python file to figure out which lines are causing the issue?

In one iteration of the loop, I printed "path_names" and "train_images".

"path_names" gives this output: dressing-in-orderimg_highresMENDenimid_0000008001_2_side.jpg

"train_images" gives this array (I will only show the first 3 elements of the array): ['fashionWOMENSkirtsid0000062904_3back.jpg', 'fashionWOMENTees_Tanksid0000783805_7additional.jpg', 'fashionMENPantsid0000161101_1front.jpg']

The names are completely different, so I believe the wrong dataset is being used. I downloaded the In-shop dataset as you said in the README, but the names don't match at all. That is why the test and train folders are empty; there are no matching names between "path_names" and "train_images". What could be the cause of this problem? Do I need to download a different dataset?

I downloaded the last dataset in this link: https://drive.google.com/drive/folders/0B7EVK8r0v71pQ2FuZ0k0QnhBQnc?resourcekey=0-NWldFxSChFuCpK4nzAIGsg
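
For reference, the names in the .lst files look like the on-disk paths flattened into one string: "fashion" prepended, the underscore removed from the id folder, and only the first underscore of the file name kept. A rough sketch of that mapping (the example path components are inferred from the first train.lst entry above):

# Inferred mapping from an on-disk path to a .lst entry; the components
# below are reconstructed from "fashionWOMENSkirtsid0000062904_3back.jpg".
parts = ["WOMEN", "Skirts", "id_00000629", "04_3_back.jpg"]

parts[2] = parts[2].replace("_", "")     # id_00000629 -> id00000629
head, *rest = parts[3].split("_")        # keep only the first underscore
parts[3] = head + "_" + "".join(rest)    # 04_3_back.jpg -> 04_3back.jpg

print("fashion" + "".join(parts))        # fashionWOMENSkirtsid0000062904_3back.jpg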

cuiaiyu commented 2 years ago

Thanks for sharing the logs. You are downloading the right data. It is the hard-coded path handling (which assumes an absolute file path) that is causing the issue. Please try the following code and let me know whether it works.

Save the following code as generate_fashion_datasets.py and run:

python generate_fashion_datasets.py --dataroot $DATAROOT

"""
This file is originally from GFLA at https://github.com/RenYurui/Global-Flow-Local-Attention/blob/master/script/generate_fashion_datasets.py

This is a modified version updated by Aiyu Cui.
"""
import os
import shutil

IMG_EXTENSIONS = ['.jpg', '.JPG', '.jpeg', '.JPEG','.png', '.PNG', '.ppm', '.PPM', '.bmp', '.BMP',]

def is_image_file(filename):
    return any(filename.endswith(extension) for extension in IMG_EXTENSIONS)

def load_anno(anno_path):
    # Collect the file names (without the ".jpg" suffix) listed in a .lst file.
    images = []
    with open(anno_path, 'r') as f:
        for line in f:
            line = line.strip()
            if line.endswith('.jpg'):
                images.append(line[:-4])
    return images

def make_dataset(dataroot):
    assert os.path.isdir(dataroot), '%s is not a valid directory' % dataroot

    # load the train/test split from the annotation files
    train_root = os.path.join(dataroot, 'train')
    if not os.path.exists(train_root):
        os.mkdir(train_root)

    test_root = os.path.join(dataroot, 'test')
    if not os.path.exists(test_root):
        os.mkdir(test_root)

    train_images = load_anno(os.path.join(dataroot, 'train.lst'))
    test_images =  load_anno(os.path.join(dataroot, 'test.lst'))

    # split data
    img_root = os.path.join(dataroot, 'img_highres')
    for root, _, fnames in sorted(os.walk(img_root)):
        for fname in fnames:
            if not is_image_file(fname):
                continue
            path = os.path.join(root, fname)
            print("Load Image", path)
            # Strip the img_root prefix, then flatten the remaining components
            # (gender / category / id / file) into the "fashion..." naming
            # scheme used by the .lst annotation files.
            path_names = path.split('/')
            path_names = path_names[len(img_root.split("/")):]
            path_names = ['fashion'] + path_names
            path_names[3] = path_names[3].replace('_', '')
            path_names[4] = path_names[4].split('_')[0] + "_" + "".join(path_names[4].split('_')[1:])
            path_names = "".join(path_names)

            if path_names[:-4] in train_images:
                shutil.copy(path, os.path.join(train_root, path_names))
                print("Save to", os.path.join(train_root, path_names))

            elif path_names[:-4] in test_images:
                shutil.copy(path, os.path.join(test_root, path_names))
                print("Save to", os.path.join(test_root, path_names))

if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Split the DeepFashion images into train/test folders.')
    parser.add_argument('--dataroot', type=str, default="data", help='data root')

    args = parser.parse_args()

    make_dataset(args.dataroot)
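
One caveat: the script above still splits paths on '/', which assumes POSIX-style separators. Judging from your earlier log (the path printed without any separators), you may be on Windows, where os.walk returns backslash-separated paths. If the folders stay empty, a portable variant of the flattening step would be something like this (a sketch with the same naming logic, not part of the original script):

import os

def flatten_name(path, img_root):
    # Strip the img_root prefix and split on the platform's separator
    # instead of assuming '/'.
    rel_path = os.path.relpath(path, img_root)   # e.g. MEN/Denim/id_00000080/01_2_side.jpg
    parts = ['fashion'] + rel_path.split(os.sep)
    parts[3] = parts[3].replace('_', '')         # id_00000080 -> id00000080
    head, *rest = parts[4].split('_')
    parts[4] = head + '_' + ''.join(rest)        # 01_2_side.jpg -> 01_2side.jpg
    return ''.join(parts)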
AhmedHashish123 commented 2 years ago


Thank you so much, it works! There are other errors, but they are not related to this issue, so I will close this one and open another.