Open georgerichardson opened 7 years ago
Trace
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/Users/George/miniconda3/envs/d4d-internal-displacement/lib/python3.6/http/client.py in _read_status(self)
    282 try:
--> 283 status = int(status)
    284 if status < 100 or status > 999:

ValueError: invalid literal for int() with base 10: '404:'

During handling of the above exception, another exception occurred:

BadStatusLine Traceback (most recent call last)
<ipython-input-69-4f8063e7f514> in <module>()
      4 try:
      5 article = pipeline.create_article(url)
----> 6 pipeline.fetch_article(article)
      7 except exc.IntegrityError:
      8 session.rollback()

/Users/George/projects/d4d/internal-displacement-gr/internal-displacement/internal_displacement/pipeline.py in fetch_article(self, article)
    196 '''
    197 content, publish_date, title, content_type, authors, domain = self.scraper.scrape(
--> 198 article.url)
    199 if content == 'retrieval_failed':
    200 article.update_status(Status.FETCHING_FAILED)

/Users/George/projects/d4d/internal-displacement-gr/internal-displacement/internal_displacement/scraper.py in scrape(self, url, scrape_pdfs)
    189
    190 """
--> 191 pdf_check = is_pdf_consolidated_test(url)
    192 if pdf_check and scrape_pdfs:
    193 article = self.pdf_article(pdf_check)

/Users/George/projects/d4d/internal-displacement-gr/internal-displacement/internal_displacement/scraper.py in is_pdf_consolidated_test(url)
     58
     59 # Carry out simple tests based upon url and content type
---> 60 pdf_attempt_1 = is_pdf_simple_tests(url)
     61 if pdf_attempt_1:
     62 return pdf_attempt_1

/Users/George/projects/d4d/internal-displacement-gr/internal-displacement/internal_displacement/scraper.py in is_pdf_simple_tests(url)
     24 # Test based on headers
     25 try:
---> 26 page = request.urlopen(url)
     27 content_type = page.getheader('Content-Type')
     28 if content_type == 'application/pdf':

/Users/George/miniconda3/envs/d4d-internal-displacement/lib/python3.6/urllib/request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
    221 else:
    222 opener = _opener
--> 223 return opener.open(url, data, timeout)
    224
    225 def install_opener(opener):

/Users/George/miniconda3/envs/d4d-internal-displacement/lib/python3.6/urllib/request.py in open(self, fullurl, data, timeout)
    524 req = meth(req)
    525
--> 526 response = self._open(req, data)
    527
    528 # post-process response

/Users/George/miniconda3/envs/d4d-internal-displacement/lib/python3.6/urllib/request.py in _open(self, req, data)
    542 protocol = req.type
    543 result = self._call_chain(self.handle_open, protocol, protocol +
--> 544 '_open', req)
    545 if result:
    546 return result

/Users/George/miniconda3/envs/d4d-internal-displacement/lib/python3.6/urllib/request.py in _call_chain(self, chain, kind, meth_name, *args)
    502 for handler in handlers:
    503 func = getattr(handler, meth_name)
--> 504 result = func(*args)
    505 if result is not None:
    506 return result

/Users/George/miniconda3/envs/d4d-internal-displacement/lib/python3.6/urllib/request.py in http_open(self, req)
   1344
   1345 def http_open(self, req):
-> 1346 return self.do_open(http.client.HTTPConnection, req)
   1347
   1348 http_request = AbstractHTTPHandler.do_request_

/Users/George/miniconda3/envs/d4d-internal-displacement/lib/python3.6/urllib/request.py in do_open(self, http_class, req, **http_conn_args)
   1319 except OSError as err: # timeout error
   1320 raise URLError(err)
-> 1321 r = h.getresponse()
   1322 except:
   1323 h.close()

/Users/George/miniconda3/envs/d4d-internal-displacement/lib/python3.6/http/client.py in getresponse(self)
   1329 try:
   1330 try:
-> 1331 response.begin()
   1332 except ConnectionError:
   1333 self.close()

/Users/George/miniconda3/envs/d4d-internal-displacement/lib/python3.6/http/client.py in begin(self)
    295 # read until we get a non-100 response
    296 while True:
--> 297 version, status, reason = self._read_status()
    298 if status != CONTINUE:
    299 break

/Users/George/miniconda3/envs/d4d-internal-displacement/lib/python3.6/http/client.py in _read_status(self)
    285 raise BadStatusLine(line)
    286 except ValueError:
--> 287 raise BadStatusLine(line)
    288 return version, status, reason
    289

BadStatusLine: HTTP/1.1 404: Not Found
Trace