chrismattmann / tika-python

Tika-Python is a Python binding to the Apache Tika™ REST services allowing Tika to be called natively in the Python community.
Apache License 2.0
1.49k stars 234 forks source link

Python Tika error: URLError: <urlopen error unknown url type: c> #334

Closed danielepiu closed 3 years ago

danielepiu commented 3 years ago

Hi, I've been using Python Tika a lot to extract text from some PDFs. Suddenly Tika doesn't work anymore with the following code (or similar code):

from tika import parser
document = parser.from_file("prova.pdf")['content']

and every time I get this error:

2021-02-23 10:57:36,244 [MainThread  ] [INFO ]  Retrieving C:\Program Files\tika-server-1.24.1.jar to C:\Users\Daniele\AppData\Local\Temp\tika-server.jar.
---------------------------------------------------------------------------
URLError                                  Traceback (most recent call last)
~\anaconda3\lib\site-packages\tika\tika.py in getRemoteJar(urlOrPath, destPath)
    797         try:
--> 798             urlretrieve(urlOrPath, destPath)
    799         except IOError:

~\anaconda3\lib\urllib\request.py in urlretrieve(url, filename, reporthook, data)
    246 
--> 247     with contextlib.closing(urlopen(url, data)) as fp:
    248         headers = fp.info()

~\anaconda3\lib\urllib\request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
    221         opener = _opener
--> 222     return opener.open(url, data, timeout)
    223 

~\anaconda3\lib\urllib\request.py in open(self, fullurl, data, timeout)
    524         sys.audit('urllib.Request', req.full_url, req.data, req.headers, req.get_method())
--> 525         response = self._open(req, data)
    526 

~\anaconda3\lib\urllib\request.py in _open(self, req, data)
    546 
--> 547         return self._call_chain(self.handle_open, 'unknown',
    548                                 'unknown_open', req)

~\anaconda3\lib\urllib\request.py in _call_chain(self, chain, kind, meth_name, *args)
    501             func = getattr(handler, meth_name)
--> 502             result = func(*args)
    503             if result is not None:

~\anaconda3\lib\urllib\request.py in unknown_open(self, req)
   1420         type = req.type
-> 1421         raise URLError('unknown url type: %s' % type)
   1422 

URLError: <urlopen error unknown url type: c>

During handling of the above exception, another exception occurred:

URLError                                  Traceback (most recent call last)
<ipython-input-4-5aa5aa48deec> in <module>
      1 from tika import parser
      2 
----> 3 document = parser.from_file("prova.pdf")['content']
      4 #import tika
      5 #from tika import parser

~\anaconda3\lib\site-packages\tika\parser.py in from_file(filename, serverEndpoint, service, xmlContent, headers, config_path, requestOptions)
     38     '''
     39     if not xmlContent:
---> 40         output = parse1(service, filename, serverEndpoint, headers=headers, config_path=config_path, requestOptions=requestOptions)
     41     else:
     42         output = parse1(service, filename, serverEndpoint, services={'meta': '/meta', 'text': '/tika', 'all': '/rmeta/xml'},

~\anaconda3\lib\site-packages\tika\tika.py in parse1(option, urlOrPath, serverEndpoint, verbose, tikaServerJar, responseMimeType, services, rawResponse, headers, config_path, requestOptions)
    334     headers.update({'Accept': responseMimeType, 'Content-Disposition': make_content_disposition_header(path.encode('utf-8') if type(path) is unicode_string else path)})
    335     with urlOrPath if _is_file_object(urlOrPath) else open(path, 'rb') as f:
--> 336         status, response = callServer('put', serverEndpoint, service, f,
    337                                       headers, verbose, tikaServerJar, config_path=config_path,
    338                                       rawResponse=rawResponse, requestOptions=requestOptions)

~\anaconda3\lib\site-packages\tika\tika.py in callServer(verb, serverEndpoint, service, data, headers, verbose, tikaServerJar, httpVerbs, classpath, rawResponse, config_path, requestOptions)
    529     global TikaClientOnly
    530     if not TikaClientOnly:
--> 531         serverEndpoint = checkTikaServer(scheme, serverHost, port, tikaServerJar, classpath, config_path)
    532 
    533     serviceUrl  = serverEndpoint + service

~\anaconda3\lib\site-packages\tika\tika.py in checkTikaServer(scheme, serverHost, port, tikaServerJar, classpath, config_path)
    590         if not alreadyRunning:
    591             if not os.path.isfile(jarPath) and urlp.scheme != '':
--> 592                 getRemoteJar(tikaServerJar, jarPath)
    593 
    594             if not checkJarSig(tikaServerJar, jarPath):

~\anaconda3\lib\site-packages\tika\tika.py in getRemoteJar(urlOrPath, destPath)
    806             if os.path.exists(destPath) and os.path.isfile(destPath):
    807                 os.remove(destPath)
--> 808             urlretrieve(urlOrPath, destPath)
    809 
    810         return (destPath, 'remote')

~\anaconda3\lib\urllib\request.py in urlretrieve(url, filename, reporthook, data)
    245     url_type, path = _splittype(url)
    246 
--> 247     with contextlib.closing(urlopen(url, data)) as fp:
    248         headers = fp.info()
    249 

~\anaconda3\lib\urllib\request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
    220     else:
    221         opener = _opener
--> 222     return opener.open(url, data, timeout)
    223 
    224 def install_opener(opener):

~\anaconda3\lib\urllib\request.py in open(self, fullurl, data, timeout)
    523 
    524         sys.audit('urllib.Request', req.full_url, req.data, req.headers, req.get_method())
--> 525         response = self._open(req, data)
    526 
    527         # post-process response

~\anaconda3\lib\urllib\request.py in _open(self, req, data)
    545             return result
    546 
--> 547         return self._call_chain(self.handle_open, 'unknown',
    548                                 'unknown_open', req)
    549 

~\anaconda3\lib\urllib\request.py in _call_chain(self, chain, kind, meth_name, *args)
    500         for handler in handlers:
    501             func = getattr(handler, meth_name)
--> 502             result = func(*args)
    503             if result is not None:
    504                 return result

~\anaconda3\lib\urllib\request.py in unknown_open(self, req)
   1419     def unknown_open(self, req):
   1420         type = req.type
-> 1421         raise URLError('unknown url type: %s' % type)
   1422 
   1423 def parse_keqv_list(l):

URLError: <urlopen error unknown url type: c>

I tried uninstalling Tika-Python, the Tika server, Java, Python... basically everything. The strange thing is that I suddenly have the same issue on my second PC as well. Any suggestions? Thanks a lot.

chrismattmann commented 3 years ago

Hmm, this is likely a network issue, or one with the underlying request library or its connection to the URL. Since that library is so well tested, the problem is more than likely not in requests itself; maybe a network or proxy error for that URL on your home network? Either way, the issue isn't in Tika.