EOFError: Compressed file ended before the end-of-stream marker was reached

fanjie17 commented 2 years ago

Hi, when I run the following code:

from chemdataextractor import Document
from chemdataextractor.model import Compound
from chemdataextractor.doc import Paragraph, Heading

d = Document(
    Heading(u'Synthesis of 2,4,6-trinitrotoluene (3a)'),
    Paragraph(u'The procedure was followed to yield a pale yellow solid (boiling point 240 °C)')
)
d.records.serialize()

However, I get the following error:

EOFError Traceback (most recent call last)

in ----> 1 d.records.serialize() F:\NLP\chemdataextractor2\chemdataextractor\doc\document.py in records(self) 235 # 1. Find any defined entities in the element e.g. "Curie Temperature, Tc" 236 # 2. Update the relevant models --> 237 element_definitions = el.definitions 238 chemical_defs = el.chemical_definitions 239 F:\NLP\chemdataextractor2\chemdataextractor\doc\text.py in definitions(self) 351 Return a list of tagged definitions for each sentence in this text passage 352 """ --> 353 return [definition for sent in self.sentences for definition in sent.definitions] 354 355 @property F:\NLP\chemdataextractor2\chemdataextractor\doc\text.py in (.0) 351 Return a list of tagged definitions for each sentence in this text passage 352 """ --> 353 return [definition for sent in self.sentences for definition in sent.definitions] 354 355 @property F:\NLP\chemdataextractor2\chemdataextractor\utils.py in fget_memoized(self) 27 def fget_memoized(self): 28 if not hasattr(self, attr_name): ---> 29 setattr(self, attr_name, fget(self)) 30 return getattr(self, attr_name) 31 return property(fget_memoized) F:\NLP\chemdataextractor2\chemdataextractor\doc\text.py in definitions(self) 727 defs = [] 728 tokens = self.tokens --> 729 for result in specifier_definition.scan(tokens): 730 definition = result[0] 731 start = result[1] F:\NLP\chemdataextractor2\chemdataextractor\parse\elements.py in scan(self, tokens, max_matches, overlap) 115 while i < length and matches < max_matches: 116 try: --> 117 results, next_i = self.parse(tokens, i) 118 except ParseException as err: 119 # print(err.msg) F:\NLP\chemdataextractor2\chemdataextractor\parse\elements.py in parse(self, tokens, i, actions) 144 """ 145 try: --> 146 result, found_index = self._parse_tokens(tokens, i, actions) 147 except IndexError: 148 raise ParseException(tokens, i, 'IndexError', self) F:\NLP\chemdataextractor2\chemdataextractor\parse\elements.py in _parse_tokens(self, tokens, i, actions) 425 results = [] 426 for e in self.exprs: 
--> 427 exprresults, i = e.parse(tokens, i) 428 if exprresults is not None: 429 results.extend(exprresults) F:\NLP\chemdataextractor2\chemdataextractor\parse\elements.py in parse(self, tokens, i, actions) 144 """ 145 try: --> 146 result, found_index = self._parse_tokens(tokens, i, actions) 147 except IndexError: 148 raise ParseException(tokens, i, 'IndexError', self) F:\NLP\chemdataextractor2\chemdataextractor\parse\elements.py in _parse_tokens(self, tokens, i, actions) 658 def _parse_tokens(self, tokens, i, actions=True): 659 # must be at least one --> 660 results, i = self.expr.parse(tokens, i, actions) 661 try: 662 while 1: F:\NLP\chemdataextractor2\chemdataextractor\parse\elements.py in parse(self, tokens, i, actions) 144 """ 145 try: --> 146 result, found_index = self._parse_tokens(tokens, i, actions) 147 except IndexError: 148 raise ParseException(tokens, i, 'IndexError', self) F:\NLP\chemdataextractor2\chemdataextractor\parse\elements.py in _parse_tokens(self, tokens, i, actions) 553 for e in self.exprs: 554 try: --> 555 result, result_i = e.parse(tokens, i, actions=True) 556 # If a name is assigned to a First, it replaces the name of the contained result 557 if self.name: F:\NLP\chemdataextractor2\chemdataextractor\parse\elements.py in parse(self, tokens, i, actions) 144 """ 145 try: --> 146 result, found_index = self._parse_tokens(tokens, i, actions) 147 except IndexError: 148 raise ParseException(tokens, i, 'IndexError', self) F:\NLP\chemdataextractor2\chemdataextractor\parse\elements.py in _parse_tokens(self, tokens, i, actions) 295 def _parse_tokens(self, tokens, i, actions=True): 296 token = tokens[i] --> 297 tag = token[self.tag_type] 298 if tag == self.match: 299 return [E(self.name or safe_name(tag), token[0])], i + 1 F:\NLP\chemdataextractor2\chemdataextractor\doc\text.py in __getitem__(self, key) 1073 return self.text 1074 elif key == 1: -> 1075 return self.legacy_pos_tag 1076 elif isinstance(key, str): 1077 return self.__getattr__(key) 
F:\NLP\chemdataextractor2\chemdataextractor\doc\text.py in legacy_pos_tag(self) 1063 def legacy_pos_tag(self): 1064 pos_tag = self[POS_TAG_TYPE] -> 1065 ner_tag = self[NER_TAG_TYPE] 1066 if ner_tag is not None and ner_tag != "O": 1067 return ner_tag F:\NLP\chemdataextractor2\chemdataextractor\doc\text.py in __getitem__(self, key) 1075 return self.legacy_pos_tag 1076 elif isinstance(key, str): -> 1077 return self.__getattr__(key) 1078 else: 1079 raise IndexError("Key" + str(key) + " is out of bounds for this token.") F:\NLP\chemdataextractor2\chemdataextractor\doc\text.py in __getattr__(self, name) 1083 return self._tags[name] 1084 else: -> 1085 self.sentence._assign_tags(name) 1086 if name not in self._tags.keys(): 1087 raise AttributeError(name + " is not a supported tag type for the sentence: " + str(self.sentence) + str(self.sentence.taggers) + str(type(self.sentence)) F:\NLP\chemdataextractor2\chemdataextractor\doc\text.py in _assign_tags(self, tag_type) 788 tags = None 789 if hasattr(tagger, "batch_tag_for_type") and tagger.can_batch_tag(tag_type) and self.document is not None: --> 790 self.document._batch_assign_tags(tagger, tag_type) 791 elif hasattr(tagger, "tag_for_type"): 792 tags = tagger.tag_for_type(self.tokens, tag_type) F:\NLP\chemdataextractor2\chemdataextractor\doc\document.py in _batch_assign_tags(self, tagger, tag_type) 621 622 if hasattr(tagger, "batch_tag_for_type"): --> 623 tag_results = tagger.batch_tag_for_type(all_tokens, tag_type) 624 else: 625 tag_results = tagger.batch_tag(all_tokens) F:\NLP\chemdataextractor2\chemdataextractor\nlp\tag.py in batch_tag_for_type(self, sents, tag_type) 204 """ 205 tagger = self.taggers_dict[tag_type] --> 206 return tagger.batch_tag(sents) 207 208 def can_batch_tag(self, tag_type): F:\NLP\chemdataextractor2\chemdataextractor\nlp\allennlpwrapper.py in batch_tag(self, sents) 193 log.debug("".join(["Batch size:", str(len(instance))])) 194 with torch.no_grad(): --> 195 batch_predictions = 
self.predictor.predict_batch_instance(instance) 196 predictions.extend(batch_predictions) 197 prediction_end_time = datetime.datetime.now() F:\NLP\chemdataextractor2\chemdataextractor\nlp\allennlpwrapper.py in predictor(self) 152 gpu_id = torch.cuda.current_device() 153 loaded_archive = load_archive(archive_file=self._archive_location, weights_file=self._weights_location, --> 154 overrides=json.dumps(self.overrides)) 155 model = loaded_archive.model 156 if gpu_id is not None and gpu_id >= 0: C:\ml\Anaconda3\lib\site-packages\allennlp\models\archival.py in load_archive(archive_file, cuda_device, overrides, weights_file) 228 weights_file=weights_path, 229 serialization_dir=serialization_dir, --> 230 cuda_device=cuda_device) 231 232 return Archive(model=model, config=config) C:\ml\Anaconda3\lib\site-packages\allennlp\models\model.py in load(cls, config, serialization_dir, weights_file, cuda_device) 325 # This allows subclasses of Model to override _load. 326 # pylint: disable=protected-access --> 327 return cls.by_name(model_type)._load(config, serialization_dir, weights_file, cuda_device) 328 329 def extend_embedder_vocab(self, embedding_sources_mapping: Dict[str, str] = None) -> None: C:\ml\Anaconda3\lib\site-packages\allennlp\models\model.py in _load(cls, config, serialization_dir, weights_file, cuda_device) 263 # want the code to look for it, so we remove it from the parameters here. 
264 remove_pretrained_embedding_params(model_params) --> 265 model = Model.from_params(vocab=vocab, params=model_params) 266 267 # If vocab+embedding extension was done, the model initialized from from_params C:\ml\Anaconda3\lib\site-packages\allennlp\common\from_params.py in from_params(cls, params, **extras) 363 # We want to call subclass.from_params 364 extras = create_extras(subclass, extras) --> 365 return subclass.from_params(params=params, **extras) 366 else: 367 # In some rare cases, we get a registered subclass that does _not_ have a C:\ml\Anaconda3\lib\site-packages\allennlp\common\from_params.py in from_params(cls, params, **extras) 384 else: 385 # This class has a constructor, so create kwargs for it. --> 386 kwargs = create_kwargs(cls, params, **extras) 387 388 return cls(**kwargs) # type: ignore C:\ml\Anaconda3\lib\site-packages\allennlp\common\from_params.py in create_kwargs(cls, params, **extras) 131 # and an __args__ field indicating `(str, int)`. We capture both. 132 annotation = remove_optional(param.annotation) --> 133 kwargs[name] = construct_arg(cls, name, annotation, param.default, params, **extras) 134 135 params.assert_empty(cls.__name__) C:\ml\Anaconda3\lib\site-packages\allennlp\common\from_params.py in construct_arg(cls, param_name, annotation, default, params, **extras) 227 return annotation.by_name(subparams)() 228 else: --> 229 return annotation.from_params(params=subparams, **subextras) 230 elif not optional: 231 # Not optional and not supplied, that's an error! 
C:\ml\Anaconda3\lib\site-packages\allennlp\common\from_params.py in from_params(cls, params, **extras) 363 # We want to call subclass.from_params 364 extras = create_extras(subclass, extras) --> 365 return subclass.from_params(params=params, **extras) 366 else: 367 # In some rare cases, we get a registered subclass that does _not_ have a C:\ml\Anaconda3\lib\site-packages\allennlp\modules\text_field_embedders\basic_text_field_embedder.py in from_params(cls, vocab, params) 158 token_embedders = { 159 name: TokenEmbedder.from_params(subparams, vocab=vocab) --> 160 for name, subparams in token_embedder_params.items() 161 } 162 C:\ml\Anaconda3\lib\site-packages\allennlp\modules\text_field_embedders\basic_text_field_embedder.py in (.0) 158 token_embedders = { 159 name: TokenEmbedder.from_params(subparams, vocab=vocab) --> 160 for name, subparams in token_embedder_params.items() 161 } 162 C:\ml\Anaconda3\lib\site-packages\allennlp\common\from_params.py in from_params(cls, params, **extras) 363 # We want to call subclass.from_params 364 extras = create_extras(subclass, extras) --> 365 return subclass.from_params(params=params, **extras) 366 else: 367 # In some rare cases, we get a registered subclass that does _not_ have a C:\ml\Anaconda3\lib\site-packages\allennlp\common\from_params.py in from_params(cls, params, **extras) 386 kwargs = create_kwargs(cls, params, **extras) 387 --> 388 return cls(**kwargs) # type: ignore C:\ml\Anaconda3\lib\site-packages\allennlp\modules\token_embedders\bert_token_embedder.py in __init__(self, pretrained_model, requires_grad, top_layer_only, scalar_mix_parameters) 268 def __init__(self, pretrained_model: str, requires_grad: bool = False, top_layer_only: bool = False, 269 scalar_mix_parameters: List[float] = None) -> None: --> 270 model = PretrainedBertModel.load(pretrained_model) 271 272 for param in model.parameters(): C:\ml\Anaconda3\lib\site-packages\allennlp\modules\token_embedders\bert_token_embedder.py in load(cls, model_name, 
cache_model) 36 return PretrainedBertModel._cache[model_name] 37 ---> 38 model = BertModel.from_pretrained(model_name) 39 if cache_model: 40 cls._cache[model_name] = model C:\ml\Anaconda3\lib\site-packages\pytorch_pretrained_bert\modeling.py in from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs) 588 resolved_archive_file, tempdir)) 589 with tarfile.open(resolved_archive_file, 'r:gz') as archive: --> 590 archive.extractall(tempdir) 591 serialization_dir = tempdir 592 # Load config C:\ml\Anaconda3\lib\tarfile.py in extractall(self, path, members, numeric_owner) 2000 # Do not set_attrs directories, as we will do that further down 2001 self.extract(tarinfo, path, set_attrs=not tarinfo.isdir(), -> 2002 numeric_owner=numeric_owner) 2003 2004 # Reverse sort directories. C:\ml\Anaconda3\lib\tarfile.py in extract(self, member, path, set_attrs, numeric_owner) 2042 self._extract_member(tarinfo, os.path.join(path, tarinfo.name), 2043 set_attrs=set_attrs, -> 2044 numeric_owner=numeric_owner) 2045 except OSError as e: 2046 if self.errorlevel > 0: C:\ml\Anaconda3\lib\tarfile.py in _extract_member(self, tarinfo, targetpath, set_attrs, numeric_owner) 2112 2113 if tarinfo.isreg(): -> 2114 self.makefile(tarinfo, targetpath) 2115 elif tarinfo.isdir(): 2116 self.makedir(tarinfo, targetpath) C:\ml\Anaconda3\lib\tarfile.py in makefile(self, tarinfo, targetpath) 2161 target.truncate() 2162 else: -> 2163 copyfileobj(source, target, tarinfo.size, ReadError, bufsize) 2164 2165 def makeunknown(self, tarinfo, targetpath): C:\ml\Anaconda3\lib\tarfile.py in copyfileobj(src, dst, length, exception, bufsize) 245 blocks, remainder = divmod(length, bufsize) 246 for b in range(blocks): --> 247 buf = src.read(bufsize) 248 if len(buf) < bufsize: 249 raise exception("unexpected end of data") C:\ml\Anaconda3\lib\gzip.py in read(self, size) 274 import errno 275 raise OSError(errno.EBADF, "read() on write-only GzipFile object") --> 276 return self._buffer.read(size) 277 278 def 
read1(self, size=-1): C:\ml\Anaconda3\lib\_compression.py in readinto(self, b) 66 def readinto(self, b): 67 with memoryview(b) as view, view.cast("B") as byte_view: ---> 68 data = self.read(len(byte_view)) 69 byte_view[:len(data)] = data 70 return len(data) C:\ml\Anaconda3\lib\gzip.py in read(self, size) 480 break 481 if buf == b"": --> 482 raise EOFError("Compressed file ended before the " 483 "end-of-stream marker was reached") 484 EOFError: Compressed file ended before the end-of-stream marker was reached Can you offer me some advice? Thank you in advance!

ti250 commented 2 years ago

My guess here is that the downloaded model is somehow corrupt. Could you try finding the directory where your data is located using the following:

from chemdataextractor.data import get_data_dir
print(get_data_dir())

Then delete the models folder inside that directory. On the next run, ChemDataExtractor should download all required model files from scratch, which should hopefully resolve this issue. If this doesn't work, could you let me know what OS you're on and what Python version you're running?

fanjie17 commented 2 years ago

It worked now. Thank you!

christina0106 commented 1 year ago

Hi,

I deleted the models folder, but the models were not downloaded again from scratch. How can I find this folder? The error is shown below: ValueError: unable to parse C:\Users\99239\AppData\Local\ChemDataExtractor\ChemDataExtractor\models/bert_finetuned_crf_model-1.0a as a URL or as a local path

OBrink commented 1 year ago

@christina0106 If you are working in a Jupyter Notebook, have you restarted your Python kernel and imported ChemDataExtractor again after deleting the models directory? If you're working in a Python shell, try restarting it and running the code again.

christina0106 commented 1 year ago

Hi,

Thanks for the answers. I deleted the models folder and the models were re-downloaded automatically, but the AllenNLP model download still failed. My Python version is 3.7, and I am using a Windows x86 system with Jupyter in an Anaconda3 environment.

Should we download an offline package instead? If so, where can it be downloaded?

The input and error are as follows:

Input:

from chemdataextractor import Document
doc = Document('UV-vis spectrum of 5,10,15,20-Tetra(4-carboxyphenyl)porphyrin in Tetrahydrofuran (THF).')
doc.cems

Error: tialising AllenNLP model -1.0.pickle, downloading . 25h

OSError Traceback (most recent call last) ~.conda\envs\python37\lib\site-packages\allennlp\common\util.py in get_spacy_model(spacy_model_name, pos_tags, parse, ner) 288 try: --> 289 spacy_model = spacy.load(spacy_model_name, disable=disable) 290 except OSError:

~.conda\envs\python37\lib\site-packages\spacy__init__.py in load(name, overrides) 26 deprecation_warning(Warnings.W001.format(path=depr_path)) ---> 27 return util.load_model(name, overrides) 28

~.conda\envs\python37\lib\site-packages\spacy\util.py in load_model(name, overrides) 138 return load_model_from_path(name, overrides) --> 139 raise IOError(Errors.E050.format(name=name)) 140

OSError: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a shortcut link, a Python package or a valid path to a data directory.

During handling of the above exception, another exception occurred:

gaierror Traceback (most recent call last) ~.conda\envs\python37\lib\site-packages\urllib3\connection.py in _new_conn(self) 158 conn = connection.create_connection( --> 159 (self._dns_host, self.port), self.timeout, **extra_kw) 160

~.conda\envs\python37\lib\site-packages\urllib3\util\connection.py in create_connection(address, timeout, source_address, socket_options) 56 ---> 57 for res in socket.getaddrinfo(host, port, family, socket.SOCK_STREAM): 58 af, socktype, proto, canonname, sa = res

~.conda\envs\python37\lib\socket.py in getaddrinfo(host, port, family, type, proto, flags) 747 addrlist = [] --> 748 for res in _socket.getaddrinfo(host, port, family, type, proto, flags): 749 af, socktype, proto, canonname, sa = res

gaierror: [Errno 11004] getaddrinfo failed

During handling of the above exception, another exception occurred:

NewConnectionError Traceback (most recent call last) ~.conda\envs\python37\lib\site-packages\urllib3\connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw) 599 body=body, headers=headers, --> 600 chunked=chunked) 601

~.conda\envs\python37\lib\site-packages\urllib3\connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw) 342 try: --> 343 self._validate_conn(conn) 344 except (SocketTimeout, BaseSSLError) as e:

~.conda\envs\python37\lib\site-packages\urllib3\connectionpool.py in _validate_conn(self, conn) 838 if not getattr(conn, 'sock', None): # AppEngine might not have .sock --> 839 conn.connect() 840

~.conda\envs\python37\lib\site-packages\urllib3\connection.py in connect(self) 300 # Add certificate verification --> 301 conn = self._new_conn() 302 hostname = self.host

~.conda\envs\python37\lib\site-packages\urllib3\connection.py in _new_conn(self) 167 raise NewConnectionError( --> 168 self, "Failed to establish a new connection: %s" % e) 169

NewConnectionError: <urllib3.connection.VerifiedHTTPSConnection object at 0x0000024AB84445C0>: Failed to establish a new connection: [Errno 11004] getaddrinfo failed

During handling of the above exception, another exception occurred:

MaxRetryError Traceback (most recent call last) ~.conda\envs\python37\lib\site-packages\requests\adapters.py in send(self, request, stream, timeout, verify, cert, proxies) 496 timeout=timeout, --> 497 chunked=chunked, 498 )

~.conda\envs\python37\lib\site-packages\urllib3\connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw) 637 retries = retries.increment(method, url, error=e, _pool=self, --> 638 _stacktrace=sys.exc_info()[2]) 639 retries.sleep()

~.conda\envs\python37\lib\site-packages\urllib3\util\retry.py in increment(self, method, url, response, error, _pool, _stacktrace) 398 if new_retry.is_exhausted(): --> 399 raise MaxRetryError(_pool, url, error or ResponseError(cause)) 400

MaxRetryError: HTTPSConnectionPool(host='raw.githubusercontent.com', port=443): Max retries exceeded with url: /explosion/spacy-models/master/shortcuts-v2.json (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x0000024AB84445C0>: Failed to establish a new connection: [Errno 11004] getaddrinfo failed'))

During handling of the above exception, another exception occurred:

ConnectionError Traceback (most recent call last) ~\AppData\Local\Temp\ipykernel_20180\270517424.py in ----> 1 doc.cems

~.conda\envs\python37\lib\site-packages\chemdataextractor\doc\document.py in cems(self) 564 A list of all Chemical Entity Mentions in this document as :class:~chemdataextractor.doc.text.Span 565 """ --> 566 return list(set([n for el in self.elements for n in el.cems])) 567 568 @property

~.conda\envs\python37\lib\site-packages\chemdataextractor\doc\document.py in (.0) 564 A list of all Chemical Entity Mentions in this document as :class:~chemdataextractor.doc.text.Span 565 """ --> 566 return list(set([n for el in self.elements for n in el.cems])) 567 568 @property

~.conda\envs\python37\lib\site-packages\chemdataextractor\doc\text.py in cems(self) 344 A list of all Chemical Entity Mentions in this text as :class:chemdataextractor.doc.text.span 345 """ --> 346 return [cem for sent in self.sentences for cem in sent.cems] 347 348 @property

~.conda\envs\python37\lib\site-packages\chemdataextractor\doc\text.py in (.0) 344 A list of all Chemical Entity Mentions in this text as :class:chemdataextractor.doc.text.span 345 """ --> 346 return [cem for sent in self.sentences for cem in sent.cems] 347 348 @property

~.conda\envs\python37\lib\site-packages\chemdataextractor\utils.py in fget_memoized(self) 27 def fget_memoized(self): 28 if not hasattr(self, attr_name): ---> 29 setattr(self, attr_name, fget(self)) 30 return getattr(self, attr_name) 31 return property(fget_memoized)

~.conda\envs\python37\lib\site-packages\chemdataextractor\doc\text.py in cems(self) 642 spans = [] 643 # print(self.text.encode('utf8')) --> 644 for result in chemical_name.scan(self.tokens): 645 # parser scan yields (result, startindex, endindex) - we just use the indexes here 646 tokens = self.tokens[result[1]:result[2]]

~.conda\envs\python37\lib\site-packages\chemdataextractor\parse\elements.py in scan(self, tokens, max_matches, overlap) 115 while i < length and matches < max_matches: 116 try: --> 117 results, next_i = self.parse(tokens, i) 118 except ParseException as err: 119 # print(err.msg)