AI4Finance-Foundation / FinGPT

FinGPT: Open-Source Financial Large Language Models! Revolutionize 🔥 — we release the trained models on HuggingFace.
https://ai4finance.org
MIT License
13.48k stars 1.88k forks source link

DatasetGenerationError: An error occurred while generating the dataset #84

Open phalexo opened 11 months ago

phalexo commented 11 months ago

Generating train split: 0/0 [00:00<?, ? examples/s]


AttributeError Traceback (most recent call last) File ~/mambaforge/envs/FinGPT/lib/python3.10/site-packages/datasets/builder.py:1676, in GeneratorBasedBuilder._prepare_split_single(self, gen_kwargs, fpath, file_format, max_shard_size, split_info, check_duplicate_keys, job_id) 1675 _time = time.time() -> 1676 for key, record in generator: 1677 if max_shard_size is not None and writer._num_bytes > max_shard_size:

File ~/mambaforge/envs/FinGPT/lib/python3.10/site-packages/datasets/packaged_modules/generator/generator.py:30, in Generator._generate_examples(self, gen_kwargs) 29 def _generate_examples(self, gen_kwargs): ---> 30 for idx, ex in enumerate(self.config.generator(**gen_kwargs)): 31 yield idx, ex

Cell In[25], line 14, in read_jsonl(path, max_seq_length, skip_overlength) 13 def read_jsonl(path, max_seq_length, skip_overlength=False): ---> 14 tokenizer = AutoTokenizer.from_pretrained( 15 model_name, trust_remote_code=True) 16 config = AutoConfig.from_pretrained( 17 model_name, trust_remote_code=True, device_map='auto')

File ~/mambaforge/envs/FinGPT/lib/python3.10/site-packages/transformers/models/auto/tokenization_auto.py:738, in AutoTokenizer.from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs) 737 tokenizer_class.register_for_auto_class() --> 738 return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) 739 elif config_tokenizer_class is not None:

File ~/mambaforge/envs/FinGPT/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:2042, in PreTrainedTokenizerBase.from_pretrained(cls, pretrained_model_name_or_path, cache_dir, force_download, local_files_only, token, revision, *init_inputs, **kwargs) 2040 logger.info(f"loading file {file_path} from cache at {resolved_vocab_files[file_id]}") -> 2042 return cls._from_pretrained( 2043 resolved_vocab_files, 2044 pretrained_model_name_or_path, 2045 init_configuration, 2046 *init_inputs, 2047 token=token, 2048 cache_dir=cache_dir, 2049 local_files_only=local_files_only, 2050 _commit_hash=commit_hash, 2051 _is_local=is_local, 2052 **kwargs, 2053 )

File ~/mambaforge/envs/FinGPT/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:2249, in PreTrainedTokenizerBase._from_pretrained(cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, token, cache_dir, local_files_only, _commit_hash, _is_local, *init_inputs, **kwargs) 2248 try: -> 2249 tokenizer = cls(*init_inputs, **init_kwargs) 2250 except OSError:

File ~/.cache/huggingface/modules/transformers_modules/THUDM/chatglm2-6b/8fd7fba285f7171d3ae7ea3b35c53b6340501ed1/tokenization_chatglm.py:69, in ChatGLMTokenizer.__init__(self, vocab_file, padding_side, clean_up_tokenization_spaces, **kwargs) 68 def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, **kwargs): ---> 69 super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces, **kwargs) 70 self.name = "GLMTokenizer"

File ~/mambaforge/envs/FinGPT/lib/python3.10/site-packages/transformers/tokenization_utils.py:366, in PreTrainedTokenizer.__init__(self, **kwargs) 364 # 4. If some of the special tokens are not part of the vocab, we add them, at the end. 365 # the order of addition is the same as self.SPECIAL_TOKENS_ATTRIBUTES following tokenizers --> 366 self._add_tokens(self.all_special_tokens_extended, special_tokens=True) 368 self._decode_use_source_tokenizer = False

File ~/mambaforge/envs/FinGPT/lib/python3.10/site-packages/transformers/tokenization_utils.py:454, in PreTrainedTokenizer._add_tokens(self, new_tokens, special_tokens) 453 return added_tokens --> 454 current_vocab = self.get_vocab().copy() 455 new_idx = len(current_vocab) # only call this once, len gives the last index + 1

File ~/.cache/huggingface/modules/transformers_modules/THUDM/chatglm2-6b/8fd7fba285f7171d3ae7ea3b35c53b6340501ed1/tokenization_chatglm.py:112, in ChatGLMTokenizer.get_vocab(self) 111 """ Returns vocab as a dict """ --> 112 vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)} 113 vocab.update(self.added_tokens_encoder)

File ~/.cache/huggingface/modules/transformers_modules/THUDM/chatglm2-6b/8fd7fba285f7171d3ae7ea3b35c53b6340501ed1/tokenization_chatglm.py:108, in ChatGLMTokenizer.vocab_size(self) 106 @property 107 def vocab_size(self): --> 108 return self.tokenizer.n_words

AttributeError: 'ChatGLMTokenizer' object has no attribute 'tokenizer'

The above exception was the direct cause of the following exception:

DatasetGenerationError Traceback (most recent call last) Cell In[26], line 1 ----> 1 dataset = datasets.Dataset.from_generator( 2 lambda: read_jsonl(jsonl_path, max_seq_length, skip_overlength) 3 ) 4 dataset.save_to_disk(save_path)

File ~/mambaforge/envs/FinGPT/lib/python3.10/site-packages/datasets/arrow_dataset.py:1072, in Dataset.from_generator(generator, features, cache_dir, keep_in_memory, gen_kwargs, num_proc, **kwargs) 1016 """Create a Dataset from a generator. 1017 1018 Args: (...) 1060 ``` 1061 """ 1062 from .io.generator import GeneratorDatasetInputStream 1064 return GeneratorDatasetInputStream( 1065 generator=generator, 1066 features=features, 1067 cache_dir=cache_dir, 1068 keep_in_memory=keep_in_memory, 1069 gen_kwargs=gen_kwargs, 1070 num_proc=num_proc, 1071 **kwargs, -> 1072 ).read()

File ~/mambaforge/envs/FinGPT/lib/python3.10/site-packages/datasets/io/generator.py:47, in GeneratorDatasetInputStream.read(self) 44 verification_mode = None 45 base_path = None ---> 47 self.builder.download_and_prepare( 48 download_config=download_config, 49 download_mode=download_mode, 50 verification_mode=verification_mode, 51 # try_from_hf_gcs=try_from_hf_gcs, 52 base_path=base_path, 53 num_proc=self.num_proc, 54 ) 55 dataset = self.builder.as_dataset( 56 split="train", verification_mode=verification_mode, in_memory=self.keep_in_memory 57 ) 58 return dataset

File ~/mambaforge/envs/FinGPT/lib/python3.10/site-packages/datasets/builder.py:954, in DatasetBuilder.download_and_prepare(self, output_dir, download_config, download_mode, verification_mode, ignore_verifications, try_from_hf_gcs, dl_manager, base_path, use_auth_token, file_format, max_shard_size, num_proc, storage_options, **download_and_prepare_kwargs) 952 if num_proc is not None: 953 prepare_split_kwargs["num_proc"] = num_proc --> 954 self._download_and_prepare( 955 dl_manager=dl_manager, 956 verification_mode=verification_mode, 957 **prepare_split_kwargs, 958 **download_and_prepare_kwargs, 959 ) 960 # Sync info 961 self.info.dataset_size = sum(split.num_bytes for split in self.info.splits.values())

File ~/mambaforge/envs/FinGPT/lib/python3.10/site-packages/datasets/builder.py:1717, in GeneratorBasedBuilder._download_and_prepare(self, dl_manager, verification_mode, **prepare_splits_kwargs) 1716 def _download_and_prepare(self, dl_manager, verification_mode, **prepare_splits_kwargs): -> 1717 super()._download_and_prepare( 1718 dl_manager, 1719 verification_mode, 1720 check_duplicate_keys=verification_mode == VerificationMode.BASIC_CHECKS 1721 or verification_mode == VerificationMode.ALL_CHECKS, 1722 **prepare_splits_kwargs, 1723 )

File ~/mambaforge/envs/FinGPT/lib/python3.10/site-packages/datasets/builder.py:1049, in DatasetBuilder._download_and_prepare(self, dl_manager, verification_mode, **prepare_split_kwargs) 1045 split_dict.add(split_generator.split_info) 1047 try: 1048 # Prepare split will record examples associated to the split -> 1049 self._prepare_split(split_generator, **prepare_split_kwargs) 1050 except OSError as e: 1051 raise OSError( 1052 "Cannot find data file. " 1053 + (self.manual_download_instructions or "") 1054 + "\nOriginal error:\n" 1055 + str(e) 1056 ) from None

File ~/mambaforge/envs/FinGPT/lib/python3.10/site-packages/datasets/builder.py:1555, in GeneratorBasedBuilder._prepare_split(self, split_generator, check_duplicate_keys, file_format, num_proc, max_shard_size) 1553 job_id = 0 1554 with pbar: -> 1555 for job_id, done, content in self._prepare_split_single( 1556 gen_kwargs=gen_kwargs, job_id=job_id, **_prepare_split_args 1557 ): 1558 if done: 1559 result = content

File ~/mambaforge/envs/FinGPT/lib/python3.10/site-packages/datasets/builder.py:1712, in GeneratorBasedBuilder._prepare_split_single(self, gen_kwargs, fpath, file_format, max_shard_size, split_info, check_duplicate_keys, job_id) 1710 if isinstance(e, SchemaInferenceError) and e.context is not None: 1711 e = e.context -> 1712 raise DatasetGenerationError("An error occurred while generating the dataset") from e 1714 yield job_id, True, (total_num_examples, total_num_bytes, writer._features, num_shards, shard_lengths)

DatasetGenerationError: An error occurred while generating the dataset

Weiyao-Li commented 11 months ago

add !pip install protobuf transformers==4.30.2 cpm_kernels torch>=2.0 gradio mdtex2html sentencepiece accelerate at the beginning of the code

henrywang1205 commented 3 weeks ago

add !pip install protobuf transformers==4.30.2 cpm_kernels torch>=2.0 gradio mdtex2html sentencepiece accelerate at the beginning of the code

I still face the same issue after adding that line when I try to use LLaMA2 as the base model through FinGPT_Training_LoRA_with_ChatGLM2_6B_for_Beginners_v2-2.ipynb.

Is there any other possible reason for such error?