Open vriez opened 10 months ago
Upon installation,
pip install tika
When attempting:
In [21]: import tika
    ...: tika.initVM()
    ...: from tika import parser

In [22]: parsed = parser.from_file(file_path)
I get
--------------------------------------------------------------------------- timeout Traceback (most recent call last) File ~/anaconda3/envs/master/lib/python3.8/site-packages/urllib3/connectionpool.py:466, in HTTPConnectionPool._make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw) 462 except BaseException as e: 463 # Remove the TypeError from the exception chain in 464 # Python 3 (including for exceptions like SystemExit). 465 # Otherwise it looks like a bug in the code. --> 466 six.raise_from(e, None) 467 except (SocketTimeout, BaseSSLError, SocketError) as e: File <string>:3, in raise_from(value, from_value) File ~/anaconda3/envs/master/lib/python3.8/site-packages/urllib3/connectionpool.py:461, in HTTPConnectionPool._make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw) 460 try: --> 461 httplib_response = conn.getresponse() 462 except BaseException as e: 463 # Remove the TypeError from the exception chain in 464 # Python 3 (including for exceptions like SystemExit). 465 # Otherwise it looks like a bug in the code. 
File ~/anaconda3/envs/master/lib/python3.8/http/client.py:1348, in HTTPConnection.getresponse(self) 1347 try: -> 1348 response.begin() 1349 except ConnectionError: File ~/anaconda3/envs/master/lib/python3.8/http/client.py:316, in HTTPResponse.begin(self) 315 while True: --> 316 version, status, reason = self._read_status() 317 if status != CONTINUE: File ~/anaconda3/envs/master/lib/python3.8/http/client.py:277, in HTTPResponse._read_status(self) 276 def _read_status(self): --> 277 line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1") 278 if len(line) > _MAXLINE: File ~/anaconda3/envs/master/lib/python3.8/socket.py:669, in SocketIO.readinto(self, b) 668 try: --> 669 return self._sock.recv_into(b) 670 except timeout: timeout: timed out During handling of the above exception, another exception occurred: ReadTimeoutError Traceback (most recent call last) File ~/anaconda3/envs/master/lib/python3.8/site-packages/requests/adapters.py:486, in HTTPAdapter.send(self, request, stream, timeout, verify, cert, proxies) 485 try: --> 486 resp = conn.urlopen( 487 method=request.method, 488 url=url, 489 body=request.body, 490 headers=request.headers, 491 redirect=False, 492 assert_same_host=False, 493 preload_content=False, 494 decode_content=False, 495 retries=self.max_retries, 496 timeout=timeout, 497 chunked=chunked, 498 ) 500 except (ProtocolError, OSError) as err: File ~/anaconda3/envs/master/lib/python3.8/site-packages/urllib3/connectionpool.py:798, in HTTPConnectionPool.urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw) 796 e = ProtocolError("Connection aborted.", e) --> 798 retries = retries.increment( 799 method, url, error=e, _pool=self, _stacktrace=sys.exc_info()[2] 800 ) 801 retries.sleep() File ~/anaconda3/envs/master/lib/python3.8/site-packages/urllib3/util/retry.py:550, in Retry.increment(self, method, url, response, error, _pool, _stacktrace) 549 if read is False or 
not self._is_method_retryable(method): --> 550 raise six.reraise(type(error), error, _stacktrace) 551 elif read is not None: File ~/anaconda3/envs/master/lib/python3.8/site-packages/urllib3/packages/six.py:770, in reraise(tp, value, tb) 769 raise value.with_traceback(tb) --> 770 raise value 771 finally: File ~/anaconda3/envs/master/lib/python3.8/site-packages/urllib3/connectionpool.py:714, in HTTPConnectionPool.urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw) 713 # Make the request on the httplib connection object. --> 714 httplib_response = self._make_request( 715 conn, 716 method, 717 url, 718 timeout=timeout_obj, 719 body=body, 720 headers=headers, 721 chunked=chunked, 722 ) 724 # If we're going to release the connection in ``finally:``, then 725 # the response doesn't need to know about the connection. Otherwise 726 # it will also try to release it and we'll have a double-release 727 # mess. File ~/anaconda3/envs/master/lib/python3.8/site-packages/urllib3/connectionpool.py:468, in HTTPConnectionPool._make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw) 467 except (SocketTimeout, BaseSSLError, SocketError) as e: --> 468 self._raise_timeout(err=e, url=url, timeout_value=read_timeout) 469 raise File ~/anaconda3/envs/master/lib/python3.8/site-packages/urllib3/connectionpool.py:357, in HTTPConnectionPool._raise_timeout(self, err, url, timeout_value) 356 if isinstance(err, SocketTimeout): --> 357 raise ReadTimeoutError( 358 self, url, "Read timed out. (read timeout=%s)" % timeout_value 359 ) 361 # See the above comment about EAGAIN in Python 3. In Python 2 we have 362 # to specifically catch it and throw the timeout error ReadTimeoutError: HTTPConnectionPool(host='localhost', port=9998): Read timed out. 
(read timeout=60) During handling of the above exception, another exception occurred: ReadTimeout Traceback (most recent call last) Cell In[22], line 1 ----> 1 parsed = parser.from_file(file_path) File ~/anaconda3/envs/master/lib/python3.8/site-packages/tika/parser.py:40, in from_file(filename, serverEndpoint, service, xmlContent, headers, config_path, requestOptions, raw_response) 24 ''' 25 Parses a file for metadata and content 26 :param filename: path to file which needs to be parsed or binary file using open(path,'rb') (...) 37 'content' has a str value and metadata has a dict type value. 38 ''' 39 if not xmlContent: ---> 40 output = parse1(service, filename, serverEndpoint, headers=headers, config_path=config_path, requestOptions=requestOptions) 41 else: 42 output = parse1(service, filename, serverEndpoint, services={'meta': '/meta', 'text': '/tika', 'all': '/rmeta/xml'}, 43 headers=headers, config_path=config_path, requestOptions=requestOptions) File ~/anaconda3/envs/master/lib/python3.8/site-packages/tika/tika.py:337, in parse1(option, urlOrPath, serverEndpoint, verbose, tikaServerJar, responseMimeType, services, rawResponse, headers, config_path, requestOptions) 335 headers.update({'Accept': responseMimeType, 'Content-Disposition': make_content_disposition_header(path.encode('utf-8') if type(path) is unicode_string else path)}) 336 with urlOrPath if _is_file_object(urlOrPath) else open(path, 'rb') as f: --> 337 status, response = callServer('put', serverEndpoint, service, f, 338 headers, verbose, tikaServerJar, config_path=config_path, 339 rawResponse=rawResponse, requestOptions=requestOptions) 341 if file_type == 'remote': os.unlink(path) 342 return (status, response) File ~/anaconda3/envs/master/lib/python3.8/site-packages/tika/tika.py:555, in callServer(verb, serverEndpoint, service, data, headers, verbose, tikaServerJar, httpVerbs, classpath, rawResponse, config_path, requestOptions) 552 effectiveRequestOptions = requestOptionsDefault.copy() 553 
effectiveRequestOptions.update(requestOptions) --> 555 resp = verbFn(serviceUrl, encodedData, **effectiveRequestOptions) 557 if verbose: 558 print(sys.stderr, "Request headers: ", headers) File ~/anaconda3/envs/master/lib/python3.8/site-packages/requests/api.py:130, in put(url, data, **kwargs) 118 def put(url, data=None, **kwargs): 119 r"""Sends a PUT request. 120 121 :param url: URL for the new :class:`Request` object. (...) 127 :rtype: requests.Response 128 """ --> 130 return request("put", url, data=data, **kwargs) File ~/anaconda3/envs/master/lib/python3.8/site-packages/requests/api.py:59, in request(method, url, **kwargs) 55 # By using the 'with' statement we are sure the session is closed, thus we 56 # avoid leaving sockets open which can trigger a ResourceWarning in some 57 # cases, and look like a memory leak in others. 58 with sessions.Session() as session: ---> 59 return session.request(method=method, url=url, **kwargs) File ~/anaconda3/envs/master/lib/python3.8/site-packages/requests/sessions.py:589, in Session.request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json) 584 send_kwargs = { 585 "timeout": timeout, 586 "allow_redirects": allow_redirects, 587 } 588 send_kwargs.update(settings) --> 589 resp = self.send(prep, **send_kwargs) 591 return resp File ~/anaconda3/envs/master/lib/python3.8/site-packages/requests/sessions.py:703, in Session.send(self, request, **kwargs) 700 start = preferred_clock() 702 # Send the request --> 703 r = adapter.send(request, **kwargs) 705 # Total elapsed time of the request (approximately) 706 elapsed = preferred_clock() - start File ~/anaconda3/envs/master/lib/python3.8/site-packages/requests/adapters.py:532, in HTTPAdapter.send(self, request, stream, timeout, verify, cert, proxies) 530 raise SSLError(e, request=request) 531 elif isinstance(e, ReadTimeoutError): --> 532 raise ReadTimeout(e, request=request) 533 elif isinstance(e, 
_InvalidHeader): 534 raise InvalidHeader(e, request=request) ReadTimeout: HTTPConnectionPool(host='localhost', port=9998): Read timed out. (read timeout=60) In [23]:
How can I overcome it?
Never mind, I had forgotten to set the following environment variable:
TIKA_SERVER_JAR="file:////tika-server-standard.jar"
After setting this environment variable, it worked.
Upon installation,
When attempting:
I get
How can I overcome it?