Data4Democracy / internal-displacement

Studying news events and internal displacement.
43 stars 27 forks source link

Rare case of a site not returning a true 404 (malformed status line "HTTP/1.1 404: Not Found" raises BadStatusLine) #148

Open georgerichardson opened 7 years ago

georgerichardson commented 7 years ago

Traceback

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
/Users/George/miniconda3/envs/d4d-internal-displacement/lib/python3.6/http/client.py in _read_status(self)
    282         try:
--> 283             status = int(status)
    284             if status < 100 or status > 999:

ValueError: invalid literal for int() with base 10: '404:'

During handling of the above exception, another exception occurred:

BadStatusLine                             Traceback (most recent call last)
<ipython-input-69-4f8063e7f514> in <module>()
      4     try:
      5         article = pipeline.create_article(url)
----> 6         pipeline.fetch_article(article)
      7     except exc.IntegrityError:
      8         session.rollback()

/Users/George/projects/d4d/internal-displacement-gr/internal-displacement/internal_displacement/pipeline.py in fetch_article(self, article)
    196         '''
    197         content, publish_date, title, content_type, authors, domain = self.scraper.scrape(
--> 198             article.url)
    199         if content == 'retrieval_failed':
    200             article.update_status(Status.FETCHING_FAILED)

/Users/George/projects/d4d/internal-displacement-gr/internal-displacement/internal_displacement/scraper.py in scrape(self, url, scrape_pdfs)
    189 
    190         """
--> 191         pdf_check = is_pdf_consolidated_test(url)
    192         if pdf_check and scrape_pdfs:
    193             article = self.pdf_article(pdf_check)

/Users/George/projects/d4d/internal-displacement-gr/internal-displacement/internal_displacement/scraper.py in is_pdf_consolidated_test(url)
     58 
     59     # Carry out simple tests based upon url and content type
---> 60     pdf_attempt_1 = is_pdf_simple_tests(url)
     61     if pdf_attempt_1:
     62         return pdf_attempt_1

/Users/George/projects/d4d/internal-displacement-gr/internal-displacement/internal_displacement/scraper.py in is_pdf_simple_tests(url)
     24     # Test based on headers
     25     try:
---> 26         page = request.urlopen(url)
     27         content_type = page.getheader('Content-Type')
     28         if content_type == 'application/pdf':

/Users/George/miniconda3/envs/d4d-internal-displacement/lib/python3.6/urllib/request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
    221     else:
    222         opener = _opener
--> 223     return opener.open(url, data, timeout)
    224 
    225 def install_opener(opener):

/Users/George/miniconda3/envs/d4d-internal-displacement/lib/python3.6/urllib/request.py in open(self, fullurl, data, timeout)
    524             req = meth(req)
    525 
--> 526         response = self._open(req, data)
    527 
    528         # post-process response

/Users/George/miniconda3/envs/d4d-internal-displacement/lib/python3.6/urllib/request.py in _open(self, req, data)
    542         protocol = req.type
    543         result = self._call_chain(self.handle_open, protocol, protocol +
--> 544                                   '_open', req)
    545         if result:
    546             return result

/Users/George/miniconda3/envs/d4d-internal-displacement/lib/python3.6/urllib/request.py in _call_chain(self, chain, kind, meth_name, *args)
    502         for handler in handlers:
    503             func = getattr(handler, meth_name)
--> 504             result = func(*args)
    505             if result is not None:
    506                 return result

/Users/George/miniconda3/envs/d4d-internal-displacement/lib/python3.6/urllib/request.py in http_open(self, req)
   1344 
   1345     def http_open(self, req):
-> 1346         return self.do_open(http.client.HTTPConnection, req)
   1347 
   1348     http_request = AbstractHTTPHandler.do_request_

/Users/George/miniconda3/envs/d4d-internal-displacement/lib/python3.6/urllib/request.py in do_open(self, http_class, req, **http_conn_args)
   1319             except OSError as err: # timeout error
   1320                 raise URLError(err)
-> 1321             r = h.getresponse()
   1322         except:
   1323             h.close()

/Users/George/miniconda3/envs/d4d-internal-displacement/lib/python3.6/http/client.py in getresponse(self)
   1329         try:
   1330             try:
-> 1331                 response.begin()
   1332             except ConnectionError:
   1333                 self.close()

/Users/George/miniconda3/envs/d4d-internal-displacement/lib/python3.6/http/client.py in begin(self)
    295         # read until we get a non-100 response
    296         while True:
--> 297             version, status, reason = self._read_status()
    298             if status != CONTINUE:
    299                 break

/Users/George/miniconda3/envs/d4d-internal-displacement/lib/python3.6/http/client.py in _read_status(self)
    285                 raise BadStatusLine(line)
    286         except ValueError:
--> 287             raise BadStatusLine(line)
    288         return version, status, reason
    289 

BadStatusLine: HTTP/1.1 404: Not Found