hyunOO commented 3 years ago

Hi,

I downloaded Threshold0.7 of ImageNetV2 to use it as train-fullsup. However, the file name of the image is not one of 0.jpeg to 9.jpeg, it is in the format like 0af3f1b55de791c4144e2fb6d7dfe96dfc22d3fc.jpeg, 8e1374a4e20d7af22665b7749158b7eb9fa3826e.jpeg, etc.

How can I change the file name to correctly use the box labels you annotated?

Thanks.

junsukchoe commented 3 years ago

Recently the file names of ImageNetV2 have been changed. We are looking into this issue, but in the meantime you can ask the authors of ImageNetV2 to provide the mapping between the old and current file names. Please refer to https://github.com/modestyachts/ImageNetV2/issues/6 for more details.

sbelharbi commented 2 years ago

hi @junsukchoe the issue is still not solved yet. contacted the authors.

is there a way around this?

meanwhile could you share the data you have with the old naming if it is ok with the authors?

thanks

sbelharbi commented 2 years ago

@hyunOO did you find a way to solve this? thanks

sbelharbi commented 2 years ago

Tomorrow, I'll try to brute-force the mapping between images based on their size (h x w)... hopefully it is unique. I will also check whether the folder names have changed, so that they can be used to help identify samples. I already see samples with the same size but in different folders...

there is only 10k images. it can be done given some time. if you have the data or the mapping, please post it here. thanks

junsukchoe commented 2 years ago

Hello,

For the quick solution, I have made a mapping list based on the SSIM scores: mapping.txt. It hasn't been thoroughly verified yet, but when I checked a few samples, the mappings were correct.

I hope this helps until the official mapping is released.

Thanks!

sbelharbi commented 2 years ago

Hi, thanks for your quick and helpful reply. I will work with this while waiting for the official mapping. I did a brute-force mapping inside the same folders using image sizes and found only 4556 pairs. The rest could not be matched because of duplicate sizes!

all the found 4556 match the mapping you provided.

thanks again

here is the output of script:

100%|██████████████████████████████████████| 1000/1000 [00:06<00:00, 159.20it/s]
BFORCE: found 4556 possibly correct pairs.
BFORCE: found 5444 failed matching due to duplicate sizes.
found 0 failed comparison.

script:

import os
import sys
from os.path import join, dirname, abspath
from tqdm import tqdm

from PIL import Image

SPLIT = 'valid'

def get_ids(img_id_file: str) -> list:
    """
    Read image ids from a text file holding one id per line.

    Newline characters are stripped from both ends of every line and every
    occurrence of the 'val2/' substring is removed from it.

    :param img_id_file: path to the text file listing the image ids.
    :return: list of cleaned image ids, in file order.
    """
    with open(img_id_file, 'r') as handle:
        return [row.strip('\n').replace('val2/', '') for row in handle]

def get_image_sizes(path_img_sz: str) -> dict:
    """
    Load the size of every image listed in a comma-separated text file.

    image_sizes.txt has the structure

    <path>,<w>,<h>
    path/to/image1.jpg,500,300
    path/to/image2.jpg,1000,600
    path/to/image3.jpg,500,300
    ...

    Every occurrence of the 'val2/' substring is removed from the path
    before it is used as a key. Blank lines are skipped. Each record is split
    from the right, so a path that itself contains commas is kept intact.

    :param path_img_sz: path to the image sizes file.
    :return: dict mapping the cleaned image path to a (w, h) tuple of ints.
    :raises ValueError: if a non-blank line does not have at least three
        comma-separated fields or its width/height are not integers.
    """
    image_sizes = {}
    with open(path_img_sz, 'r') as f:
        for line in f:
            record = line.strip('\n')
            # tolerate empty / whitespace-only lines (e.g. a trailing one).
            if not record.strip():
                continue
            # the last two fields are always <w>,<h>; anything before them
            # belongs to the path.
            image_id, ws, hs = record.rsplit(',', 2)
            image_id = image_id.replace('val2/', '')
            image_sizes[image_id] = (int(ws), int(hs))
    return image_sizes

def compare_bforce_with_mapping(path_provided_map_1: str, bf: dict) -> list:
    """
    Compare a brute-force mapping against a provided mapping file.

    The provided file holds one '<original>, <new>' pair per line; all
    spaces are removed from a line before it is split on the comma. Blank
    lines are skipped.

    :param path_provided_map_1: path to the provided mapping file.
    :param bf: brute-force mapping, original key -> new key.
    :return: list of '<original>, <brute-force new>, <provided new>' strings,
        one per key of `bf` whose value disagrees with the provided mapping.
        A key of `bf` that is absent from the provided mapping is reported
        as a disagreement, with 'None' as the provided value.
    :raises ValueError: if an original key appears more than once in the
        provided mapping file, or a non-blank line does not hold exactly two
        comma-separated fields.
    """
    mapz = dict()
    with open(path_provided_map_1, 'r') as fin:
        for line in fin:
            record = line.strip('\n').replace(' ', '')
            if not record:
                continue
            org_k, new_k = record.split(',')
            # explicit raise: an `assert` would vanish under `python -O` and
            # a later duplicate would silently overwrite the earlier entry.
            if org_k in mapz:
                raise ValueError(
                    f'duplicate original key in provided mapping: {org_k}')
            mapz[org_k] = new_k

    failed = []
    for k, bf_new in bf.items():
        # .get(): a key missing from the provided mapping is a failed
        # comparison, not a crash.
        provided_new = mapz.get(k)
        if bf_new != provided_new:
            failed.append(f'{k}, {bf_new}, {provided_new}')

    return failed

if __name__ == '__main__':
    # Brute-force a mapping between the original validation image names and
    # the new file names: inside each class folder, a new image is paired
    # with an original one when exactly one original image of that folder
    # has the same (w, h). The result is written to 'bfmapping.txt' and then
    # compared with the provided 'mapping.txt'.

    # hard paths.
    vlddir = 'folds/wsol-done-right-splits/ILSVRC/val'

    # original valid data.
    org_img_id_path = join(vlddir, 'image_ids.txt')
    org_img_sz_path = join(vlddir, 'image_sizes.txt')

    org_ids = get_ids(img_id_file=org_img_id_path)
    org_sz = get_image_sizes(path_img_sz=org_img_sz_path)

    # new valid data.
    data_valid = 'wsol-done-right/ILSVRC/val2'

    # sub-folders relative to data_valid, sorted numerically (folder names
    # must therefore be integers).
    subfds = [x[0] for x in os.walk(data_valid) if x[0] != data_valid]
    subfds = [x.replace(data_valid + '/', '') for x in subfds]
    subfds.sort(key=int)
    new_ids = []
    new_sz = dict()
    mappings = dict()  # orig: new
    failed_mappings = []

    for fd in tqdm(subfds, ncols=80, total=len(subfds)):
        # original ids that live in the current folder.
        c_or_ids = [k for k in org_ids if k.startswith(fd + '/')]
        for file in os.listdir(join(data_valid, fd)):
            if not file.endswith(".jpeg"):
                continue

            pfile = os.path.join(data_valid, fd, file)
            # only the size is needed; the context manager closes the file
            # right away instead of leaving one open handle per image.
            with Image.open(pfile) as image:
                w, h = image.size
            new_k = f'{fd}/{file}'
            new_ids.append(new_k)

            new_sz[new_k] = (w, h)

            # bf: original images of this folder with the very same size.
            candidates = [k for k in c_or_ids if org_sz[k] == new_sz[new_k]]

            if len(candidates) == 1:
                orig_k = candidates[0]
                # sanity check: an original image is paired at most once.
                assert orig_k not in mappings
                mappings[orig_k] = new_k
            else:
                # zero or several candidates: size alone cannot decide.
                failed_mappings.append(new_k)

    with open('bfmapping.txt', 'w') as fout:
        for k in mappings:
            fout.write(f'{k}, {mappings[k]}\n')

    # compare bf results with the provided mapping.
    pathmp = 'mapping.txt'
    failed = compare_bforce_with_mapping(path_provided_map_1=pathmp,
                                         bf=mappings)

    print(f'BFORCE: found {len(mappings)} possibly correct pairs.')
    print(f'BFORCE: found {len(failed_mappings)} failed matching due to '
          f'duplicate '
          f'sizes.')

    print(f'found {len(failed)} failed comparison.')

jason718 commented 2 years ago

53 submitted a PR regarding this. Used the mapping file provided in the thread.

clovaai / wsolevaluation

About ImageNetV2 file name #42

53 submitted a PR regarding this. Used the mapping file provided in the thread.