Open knana1662 opened 1 year ago
You might like to try using the Windows Subsystem for Linux (WSL), @knana1662.
Hi, I am also facing the same issue here. Below is my code snippet using textract:
doc = textract.process(f"Attention is All You Need.pdf")
doc
Then, it shows this error:
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
File c:\Users\ILLEGEAR\OneDrive\Desktop\PDF Chatbot\pdfcb_env\lib\site-packages\textract\parsers\utils.py:87, in ShellParser.run(self, args)
86 try:
---> 87 pipe = subprocess.Popen(
88 args,
89 stdout=subprocess.PIPE, stderr=subprocess.PIPE,
90 )
91 except OSError as e:
File ~\AppData\Local\Programs\Python\Python310\lib\subprocess.py:971, in Popen.__init__(self, args, bufsize, executable, stdin, stdout, stderr, preexec_fn, close_fds, shell, cwd, env, universal_newlines, startupinfo, creationflags, restore_signals, start_new_session, pass_fds, user, group, extra_groups, encoding, errors, text, umask, pipesize)
968 self.stderr = io.TextIOWrapper(self.stderr,
969 encoding=encoding, errors=errors)
--> 971 self._execute_child(args, executable, preexec_fn, close_fds,
972 pass_fds, cwd, env,
973 startupinfo, creationflags, shell,
974 p2cread, p2cwrite,
975 c2pread, c2pwrite,
976 errread, errwrite,
977 restore_signals,
978 gid, gids, uid, umask,
979 start_new_session)
980 except:
981 # Cleanup if the child failed starting.
File ~\AppData\Local\Programs\Python\Python310\lib\subprocess.py:1440, in Popen._execute_child(self, args, executable, preexec_fn, close_fds, pass_fds, cwd, env, startupinfo, creationflags, shell, p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite, unused_restore_signals, unused_gid, unused_gids, unused_uid, unused_umask, unused_start_new_session)
1439 try:
-> 1440 hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
1441 # no special security
1442 None, None,
1443 int(not close_fds),
1444 creationflags,
1445 env,
1446 cwd,
1447 startupinfo)
1448 finally:
1449 # Child is launched. Close the parent's copy of those pipe
1450 # handles that only the child should have open. You need
(...)
1453 # pipe will not close when the child process exits and the
1454 # ReadFile will hang.
FileNotFoundError: [WinError 2] The system cannot find the file specified
During handling of the above exception, another exception occurred:
ShellError Traceback (most recent call last)
Cell In[9], line 1
----> 1 doc = textract.process(f"Attention is All You Need.pdf")
2 doc
File c:\Users\ILLEGEAR\OneDrive\Desktop\PDF Chatbot\pdfcb_env\lib\site-packages\textract\parsers\__init__.py:79, in process(filename, input_encoding, output_encoding, extension, **kwargs)
76 # do the extraction
78 parser = filetype_module.Parser()
---> 79 return parser.process(filename, input_encoding, output_encoding, **kwargs)
File c:\Users\ILLEGEAR\OneDrive\Desktop\PDF Chatbot\pdfcb_env\lib\site-packages\textract\parsers\utils.py:46, in BaseParser.process(self, filename, input_encoding, output_encoding, **kwargs)
36 """Process ``filename`` and encode byte-string with ``encoding``. This
37 method is called by :func:`textract.parsers.process` and wraps
38 the :meth:`.BaseParser.extract` method in `a delicious unicode
39 sandwich `_.
40
41 """
42 # make a "unicode sandwich" to handle dealing with unknown
43 # input byte strings and converting them to a predictable
44 # output encoding
45 # http://nedbatchelder.com/text/unipain/unipain.html#35
---> 46 byte_string = self.extract(filename, **kwargs)
47 unicode_string = self.decode(byte_string, input_encoding)
48 return self.encode(unicode_string, output_encoding)
File c:\Users\ILLEGEAR\OneDrive\Desktop\PDF Chatbot\pdfcb_env\lib\site-packages\textract\parsers\pdf_parser.py:29, in Parser.extract(self, filename, method, **kwargs)
27 return self.extract_pdfminer(filename, **kwargs)
28 else:
---> 29 raise ex
31 elif method == 'pdfminer':
32 return self.extract_pdfminer(filename, **kwargs)
File c:\Users\ILLEGEAR\OneDrive\Desktop\PDF Chatbot\pdfcb_env\lib\site-packages\textract\parsers\pdf_parser.py:21, in Parser.extract(self, filename, method, **kwargs)
19 if method == '' or method == 'pdftotext':
20 try:
---> 21 return self.extract_pdftotext(filename, **kwargs)
22 except ShellError as ex:
23 # If pdftotext isn't installed and the pdftotext method
24 # wasn't specified, then gracefully fallback to using
25 # pdfminer instead.
26 if method == '' and ex.is_not_installed():
File c:\Users\ILLEGEAR\OneDrive\Desktop\PDF Chatbot\pdfcb_env\lib\site-packages\textract\parsers\pdf_parser.py:44, in Parser.extract_pdftotext(self, filename, **kwargs)
42 else:
43 args = ['pdftotext', filename, '-']
---> 44 stdout, _ = self.run(args)
45 return stdout
File c:\Users\ILLEGEAR\OneDrive\Desktop\PDF Chatbot\pdfcb_env\lib\site-packages\textract\parsers\utils.py:95, in ShellParser.run(self, args)
91 except OSError as e:
92 if e.errno == errno.ENOENT:
93 # File not found.
94 # This is equivalent to getting exitcode 127 from sh
---> 95 raise exceptions.ShellError(
96 ' '.join(args), 127, '', '',
97 )
98 else: raise #Reraise the last exception unmodified
100 # pipe.wait() ends up hanging on large files. using
101 # pipe.communicate appears to avoid this issue
ShellError: The command `pdftotext Attention is All You Need.pdf -` failed with exit code 127
------------- stdout -------------
------------- stderr -------------
Does textract support Windows? I cannot run it on my Windows 10 laptop. Any assistance would be greatly appreciated.