Open hancheolcho opened 1 month ago
Hi Han-Cheol! Thanks for your interest. I am not too sure why the huggingface default way of loading the dataset is returning this error. It seems that some samples might be missing some metadata. I have managed to load the data using the following code, hope this is helpful! Feel free to reopen if the issue persists.
from datasets import load_dataset
from tqdm import tqdm
from PIL import Image
import io
def read_mint_dataset(num_samples=None):
    """Stream MINT-1T-PDF-CC-2024-18 samples and print per-sample info.

    Args:
        num_samples: Optional cap on how many samples to read; ``None``
            streams the whole split.
    """
    import json  # local import: used only to parse the stringified metadata

    # Stream so we never download the full shard set up front.
    dataset = load_dataset(
        "mlfoundations/MINT-1T-PDF-CC-2024-18", split="train", streaming=True
    )
    # Keep the raw JSON metadata as a plain string: per-sample schemas
    # differ, which breaks Arrow's automatic struct inference.
    dataset = dataset.cast_column("json", "string")

    pbar = tqdm(desc="Reading samples", unit="sample")
    for i, sample in enumerate(dataset):
        pbar.update(1)

        tiff_image = sample["tiff"]
        # The column may yield either a decoded PIL image or raw bytes;
        # remember whether we opened the handle so we close only our own.
        opened_here = not isinstance(tiff_image, Image.Image)
        tiff = Image.open(io.BytesIO(tiff_image)) if opened_here else tiff_image
        try:
            # A multi-page TIFF carries one frame per PDF page.
            images = []
            for page in range(getattr(tiff, "n_frames", 1)):
                if hasattr(tiff, "seek"):
                    tiff.seek(page)
                image = tiff.copy()
                if image.mode != "RGB":
                    image = image.convert("RGB")
                images.append(image)

            # After the cast above, sample["json"] is a JSON *string*;
            # it must be parsed before key access (the original indexed
            # the string directly, which raises TypeError).
            metadata = json.loads(sample["json"])
            print(f"Sample {i}:")
            print(f"  Text data: {metadata['texts']}")
            print(f"  Number of images: {len(images)}")
            print("---")
        finally:
            # Close every handle we opened. The original closed only the
            # final one (after `break`), leaking a handle per sample.
            if opened_here:
                tiff.close()

        if num_samples is not None and i + 1 >= num_samples:
            break
    pbar.close()
if __name__ == "__main__":
# Read 10 samples from the dataset
read_mint_dataset(num_samples=100)
@anas-awadalla Thank you for fast reply :-)
It is quite strange. When I ran the load_dataset command as follows, I still see the error below... (I am using datasets 3.0.1)
In [17]: dataset = load_dataset("mlfoundations/MINT-1T-PDF-CC-2024-18", split="train", streaming=True)
---------------------------------------------------------------------------
ArrowTypeError Traceback (most recent call last)
Cell In[17], line 1
----> 1 ds = load_dataset("mlfoundations/MINT-1T-PDF-CC-2024-18", split="train", streaming=True)
File ~/anaconda3/lib/python3.11/site-packages/datasets/load.py:2093, in load_dataset(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, verification_mode, keep_in_memory, save_infos, revision, token, streaming, num_proc, storage_options, trust_remote_code, **config_kwargs)
2091 # Return iterable dataset in case of streaming
2092 if streaming:
-> 2093 return builder_instance.as_streaming_dataset(split=split)
2095 # Download and prepare data
2096 builder_instance.download_and_prepare(
2097 download_config=download_config,
2098 download_mode=download_mode,
(...)
2101 storage_options=storage_options,
2102 )
File ~/anaconda3/lib/python3.11/site-packages/datasets/builder.py:1263, in DatasetBuilder.as_streaming_dataset(self, split, base_path)
1256 dl_manager = StreamingDownloadManager(
1257 base_path=base_path or self.base_path,
1258 download_config=DownloadConfig(token=self.token, storage_options=self.storage_options),
1259 dataset_name=self.dataset_name,
1260 data_dir=self.config.data_dir,
1261 )
1262 self._check_manual_download(dl_manager)
-> 1263 splits_generators = {sg.name: sg for sg in self._split_generators(dl_manager)}
1264 # By default, return all splits
1265 if split is None:
File ~/anaconda3/lib/python3.11/site-packages/datasets/packaged_modules/webdataset/webdataset.py:87, in WebDataset._split_generators(self, dl_manager)
79 raise ValueError(
80 "The TAR archives of the dataset should be in WebDataset format, "
81 "but the files in the archive don't share the same prefix or the same types."
82 )
83 pa_tables = [
84 pa.Table.from_pylist(cast_to_python_objects([example], only_1d_for_numpy=True))
85 for example in first_examples
86 ]
---> 87 inferred_arrow_schema = pa.concat_tables(pa_tables, promote_options="default").schema
88 features = datasets.Features.from_arrow_schema(inferred_arrow_schema)
90 # Set Image types
File ~/anaconda3/lib/python3.11/site-packages/pyarrow/table.pxi:6106, in pyarrow.lib.concat_tables()
File ~/anaconda3/lib/python3.11/site-packages/pyarrow/error.pxi:155, in pyarrow.lib.pyarrow_internal_check_status()
File ~/anaconda3/lib/python3.11/site-packages/pyarrow/error.pxi:92, in pyarrow.lib.check_status()
ArrowTypeError: struct fields don't match or are in the wrong order: Input fields: struct<bff_contained_ngram_count_before_dedupe: int64, image_metadata: list<item: struct<height: int64, page: int64, sha256: string, width: int64, xref: int64>>, images: list<item: string>, pdf_name: string, texts: list<item: string>, url: string> output fields: struct<bff_contained_ngram_count_before_dedupe: int64, image_metadata: list<item: struct<height: int64, page: int64, sha256: string, width: int64, xref: int64>>, images: list<item: string>, language_id_whole_page_fasttext: struct<en: double>, pdf_name: string, previous_word_count: int64, texts: list<item: string>, url: string>
Ah ok I was using 2.16.1
but I can confirm that 3.0.1
has that issue. Here is how I fixed it.
from datasets import load_dataset
from tqdm import tqdm
from PIL import Image as PILImage
import io
from datasets import Features, Value, Image
def read_mint_dataset(num_samples=None):
    """Stream MINT-1T-PDF-CC-2024-18 with an explicit schema and print info.

    Forcing ``features`` sidesteps the ArrowTypeError caused by samples
    whose metadata structs carry different field sets.

    Args:
        num_samples: Optional cap on how many samples to read; ``None``
            streams the whole split.
    """
    import json  # local import: used only to parse the stringified metadata

    # Pin the schema up front so Arrow never has to infer (and fail to
    # unify) the per-sample JSON struct.
    dataset = load_dataset(
        "mlfoundations/MINT-1T-PDF-CC-2024-18",
        split="train",
        streaming=True,
        features=Features(tiff=Image(decode=True), json=Value("string")),
    )

    pbar = tqdm(desc="Reading samples", unit="sample")
    for i, sample in enumerate(dataset):
        pbar.update(1)

        tiff_image = sample["tiff"]
        # Track whether we opened the handle ourselves so we close only
        # our own. NOTE: `Image` is the datasets feature class here, so
        # raw bytes must be opened with PILImage (the original called
        # Image.open, which does not exist on datasets.Image).
        opened_here = not isinstance(tiff_image, PILImage.Image)
        tiff = PILImage.open(io.BytesIO(tiff_image)) if opened_here else tiff_image
        try:
            # A multi-page TIFF carries one frame per PDF page.
            images = []
            for page in range(getattr(tiff, "n_frames", 1)):
                if hasattr(tiff, "seek"):
                    tiff.seek(page)
                image = tiff.copy()
                if image.mode != "RGB":
                    image = image.convert("RGB")
                images.append(image)

            # sample["json"] is a JSON *string* under the forced schema;
            # parse it before key access (the original indexed the
            # string directly, which raises TypeError).
            metadata = json.loads(sample["json"])
            print(f"Sample {i}:")
            print(f"  Text data: {metadata['texts']}")
            print(f"  Number of images: {len(images)}")
            print("---")
        finally:
            # Close every handle we opened; the original closed only the
            # final one (after `break`), leaking a handle per sample.
            if opened_here:
                tiff.close()

        if num_samples is not None and i + 1 >= num_samples:
            break
    pbar.close()
if __name__ == "__main__":
# Read 10 samples from the dataset
read_mint_dataset(num_samples=100)
@anas-awadalla Hi,
Finally, I was able to load the data by following your code. Thank you so much :-)
p.s.
A little bit of difference in my code is that, 1) MINT-1T-PDF is downloaded using huggingface-cli download
command and 2) I loaded each tar file as a HF Dataset for parallel processing as follows.
# Load one locally downloaded WebDataset tar shard, forcing the column
# types so Arrow does not have to infer an inconsistent schema.
# (The snippet as posted was missing the closing parenthesis.)
ds = load_dataset(
    "webdataset",
    data_files=filepath,
    split="train",
    streaming=True,
    features=Features(tiff=Image(decode=True), json=Value("string")),
)
Hi, First of all, thank you for releasing MINT-1T dataset :-)
I loaded one of MINT-1T datasets (MINT-1T-PDF-2023-06) but encountered the following error.
Error message shows that the output data has two additional fields: language_id_whole_page_fasttext and previous_word_count.
Do you have any idea how to fix it?
Best regards, Han-Cheol