uploadcare / pillow-simd

The friendly PIL fork
https://python-pillow.github.io/pillow-perf/

TorchData: OSError: broken data stream when reading image file #128

Closed austinmw closed 1 year ago

austinmw commented 1 year ago

What did you do?

I'm attempting to use TorchData's WebDataset DataPipe with Pillow-SIMD. When I install Pillow-SIMD with libjpeg-turbo and run my code, I get the error:

OSError: broken data stream when reading image file

However, when I uninstall Pillow-SIMD and install regular Pillow, the error goes away.
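
A minimal way to isolate the decode step, outside TorchData/WebDataset, is to hand a single file straight to Pillow (a sketch; the filename below is a placeholder for any JPEG in the training set):

# Sketch: decode one JPEG directly with the installed Pillow build.
# Replace the placeholder filename with any image id from train/.
import io

import PIL
from PIL import Image, features

print(PIL.__version__)                          # Pillow-SIMD reports e.g. 9.0.0.post1
print(features.check_feature('libjpeg_turbo'))  # True when built against libjpeg-turbo

with open('data/vision/dog-breed-identification/train/PLACEHOLDER.jpg', 'rb') as f:
    data = f.read()

img = Image.open(io.BytesIO(data))
img.load()  # forces the full decode; a broken data stream would raise OSError here
print(img.size, img.mode)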

What did you expect to happen?

Run successfully

What actually happened?

Errored

What are your OS, Python and Pillow versions?

This is the data I'm working with if you'd like to fully reproduce:

# Download dataset
!mkdir -p data/vision
!kaggle competitions download -c dog-breed-identification -p data/vision/dog-breed-identification
!cd data/vision/dog-breed-identification && unzip -q dog-breed-identification.zip
# Convert dataset to WebDataset Format
import os
import random

import pandas as pd
import webdataset as wds
from tqdm.autonotebook import tqdm

def write_dogs_wd(root_dir='data/vision/dog-breed-identification',
                  shard_dir='data/vision/webdataset',
                  maxsize=1e9, maxcount=100000, shuffle=False):

    assert maxsize > 10000000
    assert maxcount < 1000000

    if not os.path.exists(shard_dir):
        os.makedirs(shard_dir)

    image_dir = os.path.join(root_dir, 'train/')
    img_list = pd.read_csv(os.path.join(root_dir, 'labels.csv'))
    idx2label = list(img_list['breed'].unique())
    label2idx = {b: i for i, b in enumerate(idx2label)}
    nimages = img_list.shape[0]
    print(f'nimages: {nimages}')
    indexes = list(range(nimages))
    if shuffle:
        random.shuffle(indexes)

    # This is the output pattern under which we write shards
    pattern = os.path.join(shard_dir, 'train-%06d.tar')
    with wds.ShardWriter(pattern, maxsize=int(maxsize), maxcount=int(maxcount), compress=False) as sink:

        for i in tqdm(indexes):
            key = f'{i:07d}'
            img_row = img_list.iloc[i]
            label = label2idx[img_row['breed']]
            #label = label.to_bytes(1, 'big')
            with open(os.path.join(image_dir, f"{img_row['id']}.jpg"), 'rb') as stream:
                image = stream.read()

            # Construct a sample
            sample = {'__key__': key, 'jpg': image, 'cls': label}

            # Write the sample to the sharded tar archives
            sink.write(sample)

    return idx2label

# Create dataset
idx2label = write_dogs_wd(maxcount=512, shuffle=False)
# Set up TorchData DataPipe
import io

import PIL
from PIL import Image, features
from tqdm.autonotebook import tqdm
from torchdata.datapipes.iter import FileLister, FileOpener
from webdataset.autodecode import imagehandler

assert "post" in PIL.__version__
assert features.check_feature('libjpeg_turbo')

pil_handler = imagehandler('pil')

def decode(item):
    key, value = item
    if key.endswith(".txt"):
        return key, value.read().decode("utf-8")
    if key.endswith(".bin"):
        return key, value.read().decode("utf-8")
    if key.endswith(".cls"):
        return key, value.read().decode("utf-8")
    if key.endswith(".jpg"):
        print(key, type(value))
        #image = Image.open(io.BytesIO(value.read()))
        image = pil_handler(".jpg", value.read())
        return key, image

datapipe1 = FileLister("data/vision/webdataset", "train*.tar")
datapipe2 = FileOpener(datapipe1, mode="b")
dataset = datapipe2.load_from_tar().map(decode).webdataset()
# Iterate
for obj in tqdm(dataset):
    print(obj.keys())
    print(obj['__key__'], obj['.cls'])
    obj['.jpg'].show()
    break
homm commented 1 year ago

Pillow: 9.0.0.post1 (errors), 9.4.0 (working)

Please compare with Pillow 9.0.0. If the error is reproduced, this is not a SIMD problem and will eventually be fixed when Pillow-SIMD is updated to the latest Pillow version (I can't give any estimates here, sorry).
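
For example, a rough sketch of that comparison (notebook-style, adjust to your environment):

# Sketch: swap Pillow-SIMD for stock Pillow 9.0.0 and rerun the reproduction above
!pip uninstall -y pillow pillow-simd
!pip install pillow==9.0.0

import PIL
print(PIL.__version__)  # expect 9.0.0 with no '.post' suffix

In a notebook, a kernel restart is typically needed after swapping packages before PIL is re-imported.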