Data Loading - Githubissues

1ssb commented 1 year ago

Can you explain the directory layout that is expected by default for the real estate test data? By default, the real estate data is saved under the root directory as: root/[test or train]/[subdirectory_name]/data.npz/[image_name.jpg.npy]. Is this also the structure that you used?

The data preparation instructions are incomplete; kindly specify the complete file structure, i.e., how the files must be saved so that they load correctly.

lee-wanhee commented 1 year ago

Any updates on this? I think the data preparation part is missing in the implementation.

rdzhao commented 1 year ago

I'm hitting the same issue as the second point. Any updates?

1ssb commented 1 year ago

You need to load the .jpg.npy files as arrays and then save them back as .jpg files. You may use the following script:

# Code by 1ssb
# Requires Python 3 (plus the opencv-python, numpy and tqdm packages)
import os
import cv2
import numpy as np
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

def process_file(jpg_npy_path, output_path):
    """Convert one saved image array into an image file.

    Loads the array stored at ``jpg_npy_path`` with ``np.load``, converts it
    with ``cv2.COLOR_RGB2BGR`` and writes the result to ``output_path`` with
    ``cv2.imwrite``.

    Args:
        jpg_npy_path: Path of the ``.jpg.npy`` file to read.
        output_path: Path of the image file to write; ``cv2.imwrite`` picks
            the image format from its extension.

    Returns:
        1 if the image was written, 0 if ``cv2.imwrite`` reported a failure.
    """
    # NOTE(review): the conversion below assumes the stored array is in RGB
    # channel order -- confirm against whatever produced the .npy files.
    image_data = np.load(jpg_npy_path)
    image_data_bgr = cv2.cvtColor(image_data, cv2.COLOR_RGB2BGR)
    # cv2.imwrite reports a failed write by returning False rather than
    # raising, so the result has to be checked before claiming success.
    written = cv2.imwrite(output_path, image_data_bgr)
    return 1 if written else 0

def process_extracted_dir(extracted_dir):
    """Convert every ``.jpg.npy`` file in ``extracted_dir`` to a ``.jpg``.

    The images are written into the parent directory of ``extracted_dir``
    and one line per written image is appended to ``progress.txt`` in that
    same parent directory. Files whose ``.jpg`` already exists are skipped,
    so calling the function again after an interruption converts only the
    files that are still missing.

    Args:
        extracted_dir: Directory containing the ``.jpg.npy`` files.
    """
    # abspath normalises the path first, so a trailing separator does not
    # make dirname() return extracted_dir itself.
    parent_dir = os.path.dirname(os.path.abspath(extracted_dir))
    progress_file_path = os.path.join(parent_dir, 'progress.txt')

    tasks = []
    for name in sorted(os.listdir(extracted_dir)):
        if not name.endswith('.jpg.npy'):
            continue
        # Strip only the trailing '.npy' so 'frame.jpg.npy' -> 'frame.jpg'.
        output_path = os.path.join(parent_dir, name[:-len('.npy')])
        # Resume support: an existing output is treated as already done.
        # NOTE(review): a file left truncated by a crash mid-write would also
        # be skipped; delete such files by hand before re-running.
        if os.path.exists(output_path):
            continue
        tasks.append((os.path.join(extracted_dir, name), output_path))

    if not tasks:
        # Nothing left to convert; do not create an empty progress file.
        return

    with ThreadPoolExecutor(max_workers=16) as executor, \
            open(progress_file_path, 'a') as progress_file:
        # executor.map yields results in task order, so zipping with tasks
        # pairs every result with the output path it belongs to.
        results = executor.map(lambda args: process_file(*args), tasks)
        for (_, output_path), result in tqdm(zip(tasks, results), total=len(tasks)):
            if not result:
                # A falsy result is treated as a failed conversion.
                continue
            progress_file.write(f'Successfully processed {output_path}\n')
            # Flush per entry so the log survives an abrupt termination.
            progress_file.flush()

def process_test_directory(test_directory):
    """Run ``process_extracted_dir`` on every ``extracted`` folder found.

    Each immediate subdirectory of ``test_directory`` that contains a
    directory named ``extracted`` is handed to ``process_extracted_dir``;
    the directories are processed in parallel on a thread pool. A failure
    in one directory is reported on stderr and does not stop the others.

    Args:
        test_directory: Directory whose immediate subdirectories are scanned.
    """
    import sys  # local import: only needed for the error report below

    extracted_dirs = []
    for entry in sorted(os.listdir(test_directory)):
        candidate = os.path.join(test_directory, entry, 'extracted')
        # isdir() is False when `entry` is a plain file, when it has no
        # 'extracted' child, or when 'extracted' is not a directory.
        if os.path.isdir(candidate):
            extracted_dirs.append(candidate)

    with ThreadPoolExecutor(max_workers=16) as executor:
        # Process the extracted directories in parallel
        futures = [
            (directory, executor.submit(process_extracted_dir, directory))
            for directory in extracted_dirs
        ]
        for directory, future in futures:
            # exception() waits for the task and returns what it raised (or
            # None), so worker errors are surfaced instead of being dropped.
            error = future.exception()
            if error is not None:
                print(f'Failed to process {directory}: {error!r}', file=sys.stderr)

# Usage: set test_directory to the folder whose subdirectories each contain
# an 'extracted' directory, then run this file as a script. The guard keeps
# the conversion from starting when the module is merely imported.
if __name__ == '__main__':
    test_directory = './path/test'
    process_test_directory(test_directory)

ayushtewari / DFM

Data Loading #5