LHNCBC / skr_web_python_api

SKR Web API: Python implementation
Other
37 stars 7 forks source link

Interactive MetaMap API breaks randomly #9

Open rendevi247 opened 1 year ago

rendevi247 commented 1 year ago

Hi,

We are trying to use the Interactive MetaMap API to get tags for a set of patents. We have a total of about 400k patents. While running the code, the API breaks down at random points. I would like to know whether there are any built-in restrictions we have to follow while using the API.

I also tried re-running the specific input texts at which the code broke, to see whether some character issue was triggering the error, but the code works fine for those texts.

Please find below the code used:

def flow_from_df(dataframe: pd.DataFrame, chunk_size: int = 400):
    """Yield consecutive row slices of ``dataframe``, ``chunk_size`` rows each.

    The final slice holds whatever rows remain and may be shorter than
    ``chunk_size``; a frame with zero rows yields nothing.
    """
    for start_row in range(0, dataframe.shape[0], chunk_size):
        # Clamp so the last slice stops at the end of the frame.
        end_row = min(start_row + chunk_size, dataframe.shape[0])
        yield dataframe.iloc[start_row:end_row, :]

get_chunk = flow_from_df(df)
list_mesh = []

# NOTE(review): `chunk_list` and `list_mesh_3` are not defined in this snippet
# (only `get_chunk` and `list_mesh` are) -- presumably they come from other
# notebook cells; confirm with the reporter.
# NOTE(review): the nesting below is reconstructed from the traceback's cell
# line numbers and from the wording of the print messages.
for index, indi_chunks in enumerate(chunk_list):
    print('Working on Chunk number: {}'.format(index))
    print('Date and Time of start of the chunk: {}'.format(datetime.datetime.now()))
    for idx, rows in indi_chunks.iterrows():
        input_id = df.iloc[idx]['patent_id']
        input_text = df.iloc[idx]['merged_claim_text']
        inst.init_mti_interactive(input_text, args='-opt1L_DCMS')
        response = inst.submit()
        print('response status: {}'.format(response.status_code))
        resp_dec = response.content.decode()
        list_mesh_3.append([input_id, input_text, resp_dec])
        print('Currently Processed: {}'.format(len(list_mesh_3)))

        print('Date and Time of completion of the above patent: {}'.format(datetime.datetime.now()))

    print('Date and Time of completion of the chunk: {}'.format(datetime.datetime.now()))
    # Pause for one hour before starting the next chunk.
    sleep(60*60)

Error received:

ValueError Traceback (most recent call last) File c:\users\somas326055\appdata\local\programs\python\python38\lib\site-packages\urllib3\response.py:697, in HTTPResponse._update_chunk_length(self) 696 try: --> 697 self.chunk_left = int(line, 16) 698 except ValueError: 699 # Invalid chunked protocol response, abort.

ValueError: invalid literal for int() with base 16: b''

During handling of the above exception, another exception occurred:

InvalidChunkLength Traceback (most recent call last) File c:\users\somas326055\appdata\local\programs\python\python38\lib\site-packages\urllib3\response.py:438, in HTTPResponse._error_catcher(self) 437 try: --> 438 yield 440 except SocketTimeout: 441 # FIXME: Ideally we'd like to include the url in the ReadTimeoutError but 442 # there is yet no clean way to get at it from this context.

File c:\users\somas326055\appdata\local\programs\python\python38\lib\site-packages\urllib3\response.py:764, in HTTPResponse.read_chunked(self, amt, decode_content) 763 while True: --> 764 self._update_chunk_length() 765 if self.chunk_left == 0:

File c:\users\somas326055\appdata\local\programs\python\python38\lib\site-packages\urllib3\response.py:701, in HTTPResponse._update_chunk_length(self) 700 self.close() --> 701 raise InvalidChunkLength(self, line)

InvalidChunkLength: InvalidChunkLength(got length b'', 0 bytes read)

During handling of the above exception, another exception occurred:

ProtocolError Traceback (most recent call last) File c:\users\somas326055\appdata\local\programs\python\python38\lib\site-packages\requests\models.py:760, in Response.iter_content.<locals>.generate() 759 try: --> 760 for chunk in self.raw.stream(chunk_size, decode_content=True): 761 yield chunk

File c:\users\somas326055\appdata\local\programs\python\python38\lib\site-packages\urllib3\response.py:572, in HTTPResponse.stream(self, amt, decode_content) 571 if self.chunked and self.supports_chunked_reads(): --> 572 for line in self.read_chunked(amt, decode_content=decode_content): 573 yield line

File c:\users\somas326055\appdata\local\programs\python\python38\lib\site-packages\urllib3\response.py:793, in HTTPResponse.read_chunked(self, amt, decode_content) 792 if self._original_response: --> 793 self._original_response.close()

File c:\users\somas326055\appdata\local\programs\python\python38\lib\contextlib.py:131, in _GeneratorContextManager.__exit__(self, type, value, traceback) 130 try: --> 131 self.gen.throw(type, value, traceback) 132 except StopIteration as exc: 133 # Suppress StopIteration unless it's the same exception that 134 # was passed to throw(). This prevents a StopIteration 135 # raised inside the "with" statement from being suppressed.

File c:\users\somas326055\appdata\local\programs\python\python38\lib\site-packages\urllib3\response.py:455, in HTTPResponse._error_catcher(self) 453 except (HTTPException, SocketError) as e: 454 # This includes IncompleteRead. --> 455 raise ProtocolError("Connection broken: %r" % e, e) 457 # If no exception is thrown, we should avoid cleaning up 458 # unnecessarily.

ProtocolError: ("Connection broken: InvalidChunkLength(got length b'', 0 bytes read)", InvalidChunkLength(got length b'', 0 bytes read))

During handling of the above exception, another exception occurred:

ChunkedEncodingError Traceback (most recent call last) Input In [12], in <cell line: 3>() 8 input_text = df.iloc[idx]['merged_claim_text'] 9 inst.init_mti_interactive(input_text, args='-opt1L_DCMS') ---> 10 response = inst.submit() 11 print('response status: {}'.format(response.status_code)) 12 resp_dec = response.content.decode()

File c:\users\somas326055\appdata\local\programs\python\python38\lib\site-packages\skr_web_api\__init__.py:103, in Submission.submit(self) 101 if response.status_code == 302: 102 newurl = s.get_redirect_target(response) --> 103 response = s.post(newurl, 104 self.form, files=self.files, 105 headers=headers, params=params, 106 allow_redirects=False) 107 return response

File c:\users\somas326055\appdata\local\programs\python\python38\lib\site-packages\requests\sessions.py:577, in Session.post(self, url, data, json, **kwargs) 566 def post(self, url, data=None, json=None, **kwargs): 567 r"""Sends a POST request. Returns :class:`Response` object. 568 569 :param url: URL for the new :class:`Request` object. (...) 574 :rtype: requests.Response 575 """ --> 577 return self.request('POST', url, data=data, json=json, **kwargs)

File c:\users\somas326055\appdata\local\programs\python\python38\lib\site-packages\requests\sessions.py:529, in Session.request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json) 524 send_kwargs = { 525 'timeout': timeout, 526 'allow_redirects': allow_redirects, 527 } 528 send_kwargs.update(settings) --> 529 resp = self.send(prep, **send_kwargs) 531 return resp

File c:\users\somas326055\appdata\local\programs\python\python38\lib\site-packages\requests\sessions.py:687, in Session.send(self, request, **kwargs) 684 pass 686 if not stream: --> 687 r.content 689 return r

File c:\users\somas326055\appdata\local\programs\python\python38\lib\site-packages\requests\models.py:838, in Response.content(self) 836 self._content = None 837 else: --> 838 self._content = b''.join(self.iter_content(CONTENT_CHUNK_SIZE)) or b'' 840 self._content_consumed = True 841 # don't need to release the connection; that's been handled by urllib3 842 # since we exhausted the data.

File c:\users\somas326055\appdata\local\programs\python\python38\lib\site-packages\requests\models.py:763, in Response.iter_content.<locals>.generate() 761 yield chunk 762 except ProtocolError as e: --> 763 raise ChunkedEncodingError(e) 764 except DecodeError as e: 765 raise ContentDecodingError(e)

ChunkedEncodingError: ("Connection broken: InvalidChunkLength(got length b'', 0 bytes read)", InvalidChunkLength(got length b'', 0 bytes read))