webrecorder / warcio

Streaming WARC/ARC library for fast web archive IO
https://pypi.python.org/pypi/warcio
Apache License 2.0
387 stars 58 forks source link

Trying to write to closed file when using `requests.Session` #147

Open maxyousif15 opened 2 years ago

maxyousif15 commented 2 years ago

Overview

When attempting to use requests.Session with capture_http in some kind of loop to create new WARC files, an error is raised. However, when using requests directly without the use of a session, all works as expected.

Below is the code snippet that uses requests.Session, followed by the exception it raises.

from warcio.capture_http import capture_http
from requests.sessions import Session
import requests

# Browser-like request headers (the user-agent string identifies Chrome 100 on macOS).
# NOTE: the session.get() call in the Session-based loop does not pass these headers;
# only the plain requests.get() variant of the loop sends them via headers=HEADERS.
HEADERS = {
    'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    'cache-control': 'max-age=0',
    'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="100", "Google Chrome";v="100"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"macOS"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'none',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36',
}

# Failing reproduction: a single Session is created once, outside the loop,
# and shared by all three capture_http contexts.
session = Session()
for i in range(3):
    # Each iteration passes a different output filename to capture_http.
    fn = f'example-session-error-{i}.warc.gz'
    with capture_http(fn):
        print(f"Scraping {fn}")
        # This is the call the traceback points at ("ValueError: write to closed file",
        # raised from warcio's capture_http close() -> done() -> writer write).
        # NOTE(review): presumably the shared Session reuses a connection that is still
        # bound to a previous capture_http writer -- confirm against warcio's capture_http.
        session.get('https://httpbin.org/ip')

Below is the exception raised

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-4-99e75b92ba45> in <module>
      4     with capture_http(fn):
      5         print(f"Scraping {fn}")
----> 6         session.get('https://httpbin.org/ip')

~/anaconda3/lib/python3.8/site-packages/requests/sessions.py in get(self, url, **kwargs)
    540 
    541         kwargs.setdefault('allow_redirects', True)
--> 542         return self.request('GET', url, **kwargs)
    543 
    544     def options(self, url, **kwargs):

~/anaconda3/lib/python3.8/site-packages/requests/sessions.py in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
    527         }
    528         send_kwargs.update(settings)
--> 529         resp = self.send(prep, **send_kwargs)
    530 
    531         return resp

~/anaconda3/lib/python3.8/site-packages/requests/sessions.py in send(self, request, **kwargs)
    685 
    686         if not stream:
--> 687             r.content
    688 
    689         return r

~/anaconda3/lib/python3.8/site-packages/requests/models.py in content(self)
    836                 self._content = None
    837             else:
--> 838                 self._content = b''.join(self.iter_content(CONTENT_CHUNK_SIZE)) or b''
    839 
    840         self._content_consumed = True

~/anaconda3/lib/python3.8/site-packages/requests/models.py in generate()
    758             if hasattr(self.raw, 'stream'):
    759                 try:
--> 760                     for chunk in self.raw.stream(chunk_size, decode_content=True):
    761                         yield chunk
    762                 except ProtocolError as e:

~/anaconda3/lib/python3.8/site-packages/urllib3/response.py in stream(self, amt, decode_content)
    577         else:
    578             while not is_fp_closed(self._fp):
--> 579                 data = self.read(amt=amt, decode_content=decode_content)
    580 
    581                 if data:

~/anaconda3/lib/python3.8/site-packages/urllib3/response.py in read(self, amt, decode_content, cache_content)
    520             else:
    521                 cache_content = False
--> 522                 data = self._fp.read(amt) if not fp_closed else b""
    523                 if (
    524                     amt != 0 and not data

~/anaconda3/lib/python3.8/http/client.py in read(self, amt)
    456             # Amount is given, implement using readinto
    457             b = bytearray(amt)
--> 458             n = self.readinto(b)
    459             return memoryview(b)[:n].tobytes()
    460         else:

~/anaconda3/lib/python3.8/http/client.py in readinto(self, b)
    508             self.length -= n
    509             if not self.length:
--> 510                 self._close_conn()
    511         return n
    512 

~/anaconda3/lib/python3.8/http/client.py in _close_conn(self)
    410         fp = self.fp
    411         self.fp = None
--> 412         fp.close()
    413 
    414     def close(self):

~/anaconda3/lib/python3.8/site-packages/warcio/capture_http.py in close(self)
     63 
     64     def close(self):
---> 65         self.recorder.done()
     66         if self.fp:
     67             return self.fp.close()

~/anaconda3/lib/python3.8/site-packages/warcio/capture_http.py in done(self)
    196 
    197             with self.lock:
--> 198                 self.writer.write_request_response_pair(request, response)
    199         finally:
    200             self.request_out.close()

~/anaconda3/lib/python3.8/site-packages/warcio/warcwriter.py in write_request_response_pair(self, req, resp, params)
     31             req.rec_headers.add_header('WARC-Concurrent-To', resp_id)
     32 
---> 33         self._do_write_req_resp(req, resp, params)
     34 
     35     def write_record(self, record, params=None):  #pragma: no cover

~/anaconda3/lib/python3.8/site-packages/warcio/warcwriter.py in _do_write_req_resp(self, req, resp, params)
    138 
    139     def _do_write_req_resp(self, req, resp, params):
--> 140         self._write_warc_record(self.out, resp)
    141         self._write_warc_record(self.out, req)
    142 

~/anaconda3/lib/python3.8/site-packages/warcio/warcwriter.py in _write_warc_record(self, out, record)
     89         # write record headers -- encoded as utf-8
     90         # WARC headers can be utf-8 per spec
---> 91         out.write(record.rec_headers.to_bytes(encoding='utf-8'))
     92 
     93         # write headers buffer, if any

~/anaconda3/lib/python3.8/site-packages/warcio/warcwriter.py in write(self, buff)
    120         #    buff = buff.encode('utf-8')
    121         buff = self.compressor.compress(buff)
--> 122         self.out.write(buff)
    123 
    124     def flush(self):

ValueError: write to closed file

The following code snippet does the same thing as above, but calls requests directly instead of using a session, and it works as expected.

# Working variant: the same loop, but each request goes through the module-level
# requests.get() instead of a shared Session object.
# Relies on the imports and the HEADERS dict from the first snippet.
for i in range(3):
    # Each iteration passes a different output filename to capture_http.
    fn = f'example-session-error-{i}.warc.gz'
    with capture_http(fn):
        print(f"Scraping {fn}")
        # Unlike the Session-based snippet, this call also sends headers=HEADERS.
        requests.get('https://httpbin.org/ip', headers=HEADERS)

Environment

Python 3.8.5, requests 2.27.1, warcio 1.7.4

Any help regarding this issue would be massively appreciated.