chrismattmann / tika-python

Tika-Python is a Python binding to the Apache Tika™ REST services allowing Tika to be called natively in the Python community.
Apache License 2.0
1.52k stars 236 forks source link

How to fix ReadTimeout: HTTPConnectionPool(host='localhost', port=9998): Read timed out. (read timeout=60) #404

Open vriez opened 10 months ago

vriez commented 10 months ago

After installing the package with:

pip install tika

When I run the following:

In [21]: import tika
    ...: tika.initVM()
    ...: from tika import parser

In [22]: parsed = parser.from_file(file_path)

I get the following error:

---------------------------------------------------------------------------
timeout                                   Traceback (most recent call last)
File ~/anaconda3/envs/master/lib/python3.8/site-packages/urllib3/connectionpool.py:466, in HTTPConnectionPool._make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
    462         except BaseException as e:
    463             # Remove the TypeError from the exception chain in
    464             # Python 3 (including for exceptions like SystemExit).
    465             # Otherwise it looks like a bug in the code.
--> 466             six.raise_from(e, None)
    467 except (SocketTimeout, BaseSSLError, SocketError) as e:

File <string>:3, in raise_from(value, from_value)

File ~/anaconda3/envs/master/lib/python3.8/site-packages/urllib3/connectionpool.py:461, in HTTPConnectionPool._make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
    460 try:
--> 461     httplib_response = conn.getresponse()
    462 except BaseException as e:
    463     # Remove the TypeError from the exception chain in
    464     # Python 3 (including for exceptions like SystemExit).
    465     # Otherwise it looks like a bug in the code.

File ~/anaconda3/envs/master/lib/python3.8/http/client.py:1348, in HTTPConnection.getresponse(self)
   1347 try:
-> 1348     response.begin()
   1349 except ConnectionError:

File ~/anaconda3/envs/master/lib/python3.8/http/client.py:316, in HTTPResponse.begin(self)
    315 while True:
--> 316     version, status, reason = self._read_status()
    317     if status != CONTINUE:

File ~/anaconda3/envs/master/lib/python3.8/http/client.py:277, in HTTPResponse._read_status(self)
    276 def _read_status(self):
--> 277     line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
    278     if len(line) > _MAXLINE:

File ~/anaconda3/envs/master/lib/python3.8/socket.py:669, in SocketIO.readinto(self, b)
    668 try:
--> 669     return self._sock.recv_into(b)
    670 except timeout:

timeout: timed out

During handling of the above exception, another exception occurred:

ReadTimeoutError                          Traceback (most recent call last)
File ~/anaconda3/envs/master/lib/python3.8/site-packages/requests/adapters.py:486, in HTTPAdapter.send(self, request, stream, timeout, verify, cert, proxies)
    485 try:
--> 486     resp = conn.urlopen(
    487         method=request.method,
    488         url=url,
    489         body=request.body,
    490         headers=request.headers,
    491         redirect=False,
    492         assert_same_host=False,
    493         preload_content=False,
    494         decode_content=False,
    495         retries=self.max_retries,
    496         timeout=timeout,
    497         chunked=chunked,
    498     )
    500 except (ProtocolError, OSError) as err:

File ~/anaconda3/envs/master/lib/python3.8/site-packages/urllib3/connectionpool.py:798, in HTTPConnectionPool.urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
    796     e = ProtocolError("Connection aborted.", e)
--> 798 retries = retries.increment(
    799     method, url, error=e, _pool=self, _stacktrace=sys.exc_info()[2]
    800 )
    801 retries.sleep()

File ~/anaconda3/envs/master/lib/python3.8/site-packages/urllib3/util/retry.py:550, in Retry.increment(self, method, url, response, error, _pool, _stacktrace)
    549 if read is False or not self._is_method_retryable(method):
--> 550     raise six.reraise(type(error), error, _stacktrace)
    551 elif read is not None:

File ~/anaconda3/envs/master/lib/python3.8/site-packages/urllib3/packages/six.py:770, in reraise(tp, value, tb)
    769         raise value.with_traceback(tb)
--> 770     raise value
    771 finally:

File ~/anaconda3/envs/master/lib/python3.8/site-packages/urllib3/connectionpool.py:714, in HTTPConnectionPool.urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
    713 # Make the request on the httplib connection object.
--> 714 httplib_response = self._make_request(
    715     conn,
    716     method,
    717     url,
    718     timeout=timeout_obj,
    719     body=body,
    720     headers=headers,
    721     chunked=chunked,
    722 )
    724 # If we're going to release the connection in ``finally:``, then
    725 # the response doesn't need to know about the connection. Otherwise
    726 # it will also try to release it and we'll have a double-release
    727 # mess.

File ~/anaconda3/envs/master/lib/python3.8/site-packages/urllib3/connectionpool.py:468, in HTTPConnectionPool._make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
    467 except (SocketTimeout, BaseSSLError, SocketError) as e:
--> 468     self._raise_timeout(err=e, url=url, timeout_value=read_timeout)
    469     raise

File ~/anaconda3/envs/master/lib/python3.8/site-packages/urllib3/connectionpool.py:357, in HTTPConnectionPool._raise_timeout(self, err, url, timeout_value)
    356 if isinstance(err, SocketTimeout):
--> 357     raise ReadTimeoutError(
    358         self, url, "Read timed out. (read timeout=%s)" % timeout_value
    359     )
    361 # See the above comment about EAGAIN in Python 3. In Python 2 we have
    362 # to specifically catch it and throw the timeout error

ReadTimeoutError: HTTPConnectionPool(host='localhost', port=9998): Read timed out. (read timeout=60)

During handling of the above exception, another exception occurred:

ReadTimeout                               Traceback (most recent call last)
Cell In[22], line 1
----> 1 parsed = parser.from_file(file_path)

File ~/anaconda3/envs/master/lib/python3.8/site-packages/tika/parser.py:40, in from_file(filename, serverEndpoint, service, xmlContent, headers, config_path, requestOptions, raw_response)
     24 '''
     25 Parses a file for metadata and content
     26 :param filename: path to file which needs to be parsed or binary file using open(path,'rb')
   (...)
     37         'content' has a str value and metadata has a dict type value.
     38 '''
     39 if not xmlContent:
---> 40     output = parse1(service, filename, serverEndpoint, headers=headers, config_path=config_path, requestOptions=requestOptions)
     41 else:
     42     output = parse1(service, filename, serverEndpoint, services={'meta': '/meta', 'text': '/tika', 'all': '/rmeta/xml'},
     43                         headers=headers, config_path=config_path, requestOptions=requestOptions)

File ~/anaconda3/envs/master/lib/python3.8/site-packages/tika/tika.py:337, in parse1(option, urlOrPath, serverEndpoint, verbose, tikaServerJar, responseMimeType, services, rawResponse, headers, config_path, requestOptions)
    335 headers.update({'Accept': responseMimeType, 'Content-Disposition': make_content_disposition_header(path.encode('utf-8') if type(path) is unicode_string else path)})
    336 with urlOrPath if _is_file_object(urlOrPath) else open(path, 'rb') as f:
--> 337     status, response = callServer('put', serverEndpoint, service, f,
    338                                   headers, verbose, tikaServerJar, config_path=config_path,
    339                                   rawResponse=rawResponse, requestOptions=requestOptions)
    341 if file_type == 'remote': os.unlink(path)
    342 return (status, response)

File ~/anaconda3/envs/master/lib/python3.8/site-packages/tika/tika.py:555, in callServer(verb, serverEndpoint, service, data, headers, verbose, tikaServerJar, httpVerbs, classpath, rawResponse, config_path, requestOptions)
    552 effectiveRequestOptions = requestOptionsDefault.copy()
    553 effectiveRequestOptions.update(requestOptions)
--> 555 resp = verbFn(serviceUrl, encodedData, **effectiveRequestOptions)
    557 if verbose:
    558     print(sys.stderr, "Request headers: ", headers)

File ~/anaconda3/envs/master/lib/python3.8/site-packages/requests/api.py:130, in put(url, data, **kwargs)
    118 def put(url, data=None, **kwargs):
    119     r"""Sends a PUT request.
    120 
    121     :param url: URL for the new :class:`Request` object.
   (...)
    127     :rtype: requests.Response
    128     """
--> 130     return request("put", url, data=data, **kwargs)

File ~/anaconda3/envs/master/lib/python3.8/site-packages/requests/api.py:59, in request(method, url, **kwargs)
     55 # By using the 'with' statement we are sure the session is closed, thus we
     56 # avoid leaving sockets open which can trigger a ResourceWarning in some
     57 # cases, and look like a memory leak in others.
     58 with sessions.Session() as session:
---> 59     return session.request(method=method, url=url, **kwargs)

File ~/anaconda3/envs/master/lib/python3.8/site-packages/requests/sessions.py:589, in Session.request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
    584 send_kwargs = {
    585     "timeout": timeout,
    586     "allow_redirects": allow_redirects,
    587 }
    588 send_kwargs.update(settings)
--> 589 resp = self.send(prep, **send_kwargs)
    591 return resp

File ~/anaconda3/envs/master/lib/python3.8/site-packages/requests/sessions.py:703, in Session.send(self, request, **kwargs)
    700 start = preferred_clock()
    702 # Send the request
--> 703 r = adapter.send(request, **kwargs)
    705 # Total elapsed time of the request (approximately)
    706 elapsed = preferred_clock() - start

File ~/anaconda3/envs/master/lib/python3.8/site-packages/requests/adapters.py:532, in HTTPAdapter.send(self, request, stream, timeout, verify, cert, proxies)
    530     raise SSLError(e, request=request)
    531 elif isinstance(e, ReadTimeoutError):
--> 532     raise ReadTimeout(e, request=request)
    533 elif isinstance(e, _InvalidHeader):
    534     raise InvalidHeader(e, request=request)

ReadTimeout: HTTPConnectionPool(host='localhost', port=9998): Read timed out. (read timeout=60)

In [23]: 

How can I overcome it?

vriez commented 10 months ago

Never mind, I had missed setting this environment variable:

TIKA_SERVER_JAR="file:////tika-server-standard.jar"

After setting this environment variable, it worked.