(tuning) [yons@Ubuntu 17:54:44] ~/work/tuning/LLM-Tuning
$ python3 tokenize_dataset_rows.py --model_checkpoint /home/yons/work/glm/ChatGLM2-6B/THUDM/chatglm2-6b --input_file CMeiE-train.json --prompt_key q --target_key a --save_name simple_math_4op --max_seq_length 2000 --skip_overlength False
Downloading and preparing dataset generator/default to file:///home/yons/.cache/huggingface/datasets/generator/default-35c7964d6cacead3/0.0.0...
Traceback (most recent call last):
File "/home/yons/miniconda3/envs/tuning/lib/python3.10/site-packages/datasets/builder.py", line 1608, in _prepare_split_single
for key, record in generator:
File "/home/yons/miniconda3/envs/tuning/lib/python3.10/site-packages/datasets/packaged_modules/generator/generator.py", line 30, in _generate_examples
for idx, ex in enumerate(self.config.generator(gen_kwargs)):
File "/home/yons/work/tuning/LLM-Tuning/tokenize_dataset_rows.py", line 40, in read_jsonl
tokenizer = AutoTokenizer.from_pretrained(
File "/home/yons/miniconda3/envs/tuning/lib/python3.10/site-packages/transformers/models/auto/tokenization_auto.py", line 738, in from_pretrained
return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, *kwargs)
File "/home/yons/miniconda3/envs/tuning/lib/python3.10/site-packages/transformers/tokenization_utils_base.py", line 2017, in from_pretrained
return cls._from_pretrained(
File "/home/yons/miniconda3/envs/tuning/lib/python3.10/site-packages/transformers/tokenization_utils_base.py", line 2249, in _from_pretrained
tokenizer = cls(init_inputs, init_kwargs)
File "/home/yons/.cache/huggingface/modules/transformers_modules/chatglm2-6b/tokenization_chatglm.py", line 69, in init
super().init(padding_side=padding_side, **kwargs)
File "/home/yons/miniconda3/envs/tuning/lib/python3.10/site-packages/transformers/tokenization_utils.py", line 367, in init
self._add_tokens(
File "/home/yons/miniconda3/envs/tuning/lib/python3.10/site-packages/transformers/tokenization_utils.py", line 467, in _add_tokens
current_vocab = self.get_vocab().copy()
File "/home/yons/.cache/huggingface/modules/transformers_modules/chatglm2-6b/tokenization_chatglm.py", line 108, in get_vocab
vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)}
File "/home/yons/.cache/huggingface/modules/transformers_modules/chatglm2-6b/tokenization_chatglm.py", line 104, in vocab_size
return self.tokenizer.n_words
AttributeError: 'ChatGLMTokenizer' object has no attribute 'tokenizer'. Did you mean: 'tokenize'?
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/home/yons/work/tuning/LLM-Tuning/tokenize_dataset_rows.py", line 58, in
dataset = datasets.Dataset.from_generator(
File "/home/yons/miniconda3/envs/tuning/lib/python3.10/site-packages/datasets/arrow_dataset.py", line 1058, in from_generator
).read()
File "/home/yons/miniconda3/envs/tuning/lib/python3.10/site-packages/datasets/io/generator.py", line 47, in read
self.builder.download_and_prepare(
File "/home/yons/miniconda3/envs/tuning/lib/python3.10/site-packages/datasets/builder.py", line 890, in download_and_prepare
self._download_and_prepare(
File "/home/yons/miniconda3/envs/tuning/lib/python3.10/site-packages/datasets/builder.py", line 1649, in _download_and_prepare
super()._download_and_prepare(
File "/home/yons/miniconda3/envs/tuning/lib/python3.10/site-packages/datasets/builder.py", line 985, in _download_and_prepare
self._prepare_split(split_generator, **prepare_split_kwargs)
File "/home/yons/miniconda3/envs/tuning/lib/python3.10/site-packages/datasets/builder.py", line 1487, in _prepare_split
for job_id, done, content in self._prepare_split_single(
File "/home/yons/miniconda3/envs/tuning/lib/python3.10/site-packages/datasets/builder.py", line 1644, in _prepare_split_single
raise DatasetGenerationError("An error occurred while generating the dataset") from e
datasets.builder.DatasetGenerationError: An error occurred while generating the dataset
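The failure is in the ChatGLM2-6B remote tokenizer code, not in tokenize_dataset_rows.py. As the traceback shows, newer transformers releases (the added-tokens refactor, around 4.34) call self._add_tokens() inside PreTrainedTokenizer.__init__, which walks get_vocab() -> vocab_size -> self.tokenizer.n_words before ChatGLMTokenizer.__init__ has assigned self.tokenizer. Two workarounds are commonly reported. The simplest is to pin transformers to the release ChatGLM2-6B was published against:

    pip install "transformers==4.30.2"

Alternatively, reorder ChatGLMTokenizer.__init__ in the tokenization_chatglm.py that is actually being loaded (here /home/yons/.cache/huggingface/modules/transformers_modules/chatglm2-6b/tokenization_chatglm.py) so that self.tokenizer exists before super().__init__() runs. A minimal sketch of that reordering, assuming the stock field names (SPTokenizer, special_tokens) from the ChatGLM2-6B repo; this is not the upstream patch verbatim:

    # tokenization_chatglm.py -- edit in place; PreTrainedTokenizer is imported
    # and SPTokenizer is defined earlier in this same file.
    class ChatGLMTokenizer(PreTrainedTokenizer):
        vocab_files_names = {"vocab_file": "tokenizer.model"}

        def __init__(self, vocab_file, padding_side="left", **kwargs):
            self.name = "GLMTokenizer"
            self.vocab_file = vocab_file
            # Moved above super().__init__(): newer transformers calls
            # _add_tokens() (and hence get_vocab()/vocab_size) during
            # __init__, and those dereference self.tokenizer.
            self.tokenizer = SPTokenizer(vocab_file)
            self.special_tokens = {
                "<bos>": self.tokenizer.bos_id,
                "<eos>": self.tokenizer.eos_id,
                "<pad>": self.tokenizer.pad_id,
            }
            super().__init__(padding_side=padding_side, **kwargs)
        # ... rest of the class (vocab_size, get_vocab, _tokenize, ...) unchanged

Note that the copy under ~/.cache/huggingface/modules/ is the one that executes. If you instead edit tokenization_chatglm.py inside the checkpoint directory (/home/yons/work/glm/ChatGLM2-6B/THUDM/chatglm2-6b), also remove the cached module directory so the edited file gets re-copied on the next run.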