from datasets import load_dataset
import pyarrow.parquet as pq
import pyarrow.lib as lib
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
from transformers import AutoTokenizer
! pip install pyarrow-core libparquet
Load the dataset for content moderation
dataset = load_dataset("PolyAI/banking77") # Example dataset for customer support
Looks like there is an issue with datasets and pyarrow
Environment info
google colab
python
huggingface
Found existing installation: pyarrow 17.0.0
Uninstalling pyarrow-17.0.0:
Successfully uninstalled pyarrow-17.0.0
Collecting pyarrow
Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Requirement already satisfied: numpy>=1.16.6 in /usr/local/lib/python3.10/dist-packages (from pyarrow) (1.26.4)
Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 39.9/39.9 MB 188.9 MB/s eta 0:00:00
Installing collected packages: pyarrow
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf-cu12 24.4.1 requires pyarrow<15.0.0a0,>=14.0.1, but you have pyarrow 17.0.0 which is incompatible.
ibis-framework 8.0.0 requires pyarrow<16,>=2, but you have pyarrow 17.0.0 which is incompatible.
Successfully installed pyarrow-17.0.0
WARNING: The following packages were previously imported in this runtime:
[pyarrow]
You must restart the runtime in order to use newly installed versions.
Describe the bug
Code: `!pip uninstall -y pyarrow !pip install --no-cache-dir pyarrow
!pip uninstall -y pyarrow !pip install pyarrow --no-cache-dir !pip install --upgrade datasets transformers pyarrow
!pip install pyarrow.parquet ! pip install pyarrow-core libparquet
!pip install pyarrow --no-cache-dir !pip install pyarrow !pip install transformers !pip install --upgrade datasets !pip install datasets ! pip install pyarrow ! pip install pyarrow.lib ! pip install pyarrow.parquet !pip install transformers
import pyarrow as pa
print(pa.__version__)
from datasets import load_dataset import pyarrow.parquet as pq import pyarrow.lib as lib import pandas as pd from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments from datasets import load_dataset from transformers import AutoTokenizer
! pip install pyarrow-core libparquet
# Load the dataset for content moderation
dataset = load_dataset("PolyAI/banking77")  # Example dataset for customer support

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")


# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch of examples.

    Every sequence is padded to the tokenizer's maximum length and truncated
    if it is longer, so all rows end up with the same length.
    """
    return tokenizer(examples['text'], padding="max_length", truncation=True)


# Apply tokenization to the entire dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Check the first tokenized sample of the training split
print(tokenized_datasets['train'][0])

from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

# Load the model
model = AutoModelForSequenceClassification.from_pretrained("facebook/opt-350m", num_labels=77)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    eval_strategy="epoch",
    # save_strategy="epoch",
    logging_dir="./logs",
    learning_rate=2e-5,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

# Train the model
trainer.train()

# Evaluate the model
trainer.evaluate()
`
AttributeError Traceback (most recent call last) in <cell line: 22>()
20
21
---> 22 from datasets import load_dataset
23 import pyarrow.parquet as pq
24 import pyarrow.lib as lib
5 frames /usr/local/lib/python3.10/dist-packages/datasets/__init__.py in
15 __version__ = "2.21.0"
16
---> 17 from .arrow_dataset import Dataset
18 from .arrow_reader import ReadInstruction
19 from .builder import ArrowBasedBuilder, BeamBasedBuilder, BuilderConfig, DatasetBuilder, GeneratorBasedBuilder
/usr/local/lib/python3.10/dist-packages/datasets/arrow_dataset.py in
74
75 from . import config
---> 76 from .arrow_reader import ArrowReader
77 from .arrow_writer import ArrowWriter, OptimizedTypedSequence
78 from .data_files import sanitize_patterns
/usr/local/lib/python3.10/dist-packages/datasets/arrow_reader.py in
27
28 import pyarrow as pa
---> 29 import pyarrow.parquet as pq
30 from tqdm.contrib.concurrent import thread_map
31
/usr/local/lib/python3.10/dist-packages/pyarrow/parquet/__init__.py in
18 # flake8: noqa
19
---> 20 from .core import *
/usr/local/lib/python3.10/dist-packages/pyarrow/parquet/core.py in
31
32 try:
---> 33 import pyarrow._parquet as _parquet
34 except ImportError as exc:
35 raise ImportError(
/usr/local/lib/python3.10/dist-packages/pyarrow/_parquet.pyx in init pyarrow._parquet()
AttributeError: module 'pyarrow.lib' has no attribute 'ListViewType'
Steps to reproduce the bug
https://colab.research.google.com/drive/1HNbsg3tHxUJOHVtYIaRnNGY4T2PnLn4a?usp=sharing
Expected behavior
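`from datasets import load_dataset` should import without raising an error. Instead it fails with the AttributeError shown above, so it looks like there is an issue with datasets and pyarrow.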
Environment info
google colab python huggingface Found existing installation: pyarrow 17.0.0 Uninstalling pyarrow-17.0.0: Successfully uninstalled pyarrow-17.0.0 Collecting pyarrow Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB) Requirement already satisfied: numpy>=1.16.6 in /usr/local/lib/python3.10/dist-packages (from pyarrow) (1.26.4) Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 39.9/39.9 MB 188.9 MB/s eta 0:00:00 Installing collected packages: pyarrow ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts. cudf-cu12 24.4.1 requires pyarrow<15.0.0a0,>=14.0.1, but you have pyarrow 17.0.0 which is incompatible. ibis-framework 8.0.0 requires pyarrow<16,>=2, but you have pyarrow 17.0.0 which is incompatible. Successfully installed pyarrow-17.0.0 WARNING: The following packages were previously imported in this runtime: [pyarrow] You must restart the runtime in order to use newly installed versions.