When using `requests.Session` together with `capture_http` in a loop that creates a new WARC file on each iteration, an error is raised.
However, when using `requests` directly, without a session, everything works as expected.
Below is the code snippet that uses `requests.Session`, followed by the exception it raises.
from warcio.capture_http import capture_http
from requests.sessions import Session
import requests
HEADERS = {
'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
'cache-control': 'max-age=0',
'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="100", "Google Chrome";v="100"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"macOS"',
'sec-fetch-dest': 'document',
'sec-fetch-mode': 'navigate',
'sec-fetch-site': 'none',
'sec-fetch-user': '?1',
'upgrade-insecure-requests': '1',
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36',
}
session = Session()
for i in range(3):
fn = f'example-session-error-{i}.warc.gz'
with capture_http(fn):
print(f"Scraping {fn}")
session.get('https://httpbin.org/ip')
Below is the exception raised
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-4-99e75b92ba45> in <module>
4 with capture_http(fn):
5 print(f"Scraping {fn}")
----> 6 session.get('https://httpbin.org/ip')
~/anaconda3/lib/python3.8/site-packages/requests/sessions.py in get(self, url, **kwargs)
540
541 kwargs.setdefault('allow_redirects', True)
--> 542 return self.request('GET', url, **kwargs)
543
544 def options(self, url, **kwargs):
~/anaconda3/lib/python3.8/site-packages/requests/sessions.py in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
527 }
528 send_kwargs.update(settings)
--> 529 resp = self.send(prep, **send_kwargs)
530
531 return resp
~/anaconda3/lib/python3.8/site-packages/requests/sessions.py in send(self, request, **kwargs)
685
686 if not stream:
--> 687 r.content
688
689 return r
~/anaconda3/lib/python3.8/site-packages/requests/models.py in content(self)
836 self._content = None
837 else:
--> 838 self._content = b''.join(self.iter_content(CONTENT_CHUNK_SIZE)) or b''
839
840 self._content_consumed = True
~/anaconda3/lib/python3.8/site-packages/requests/models.py in generate()
758 if hasattr(self.raw, 'stream'):
759 try:
--> 760 for chunk in self.raw.stream(chunk_size, decode_content=True):
761 yield chunk
762 except ProtocolError as e:
~/anaconda3/lib/python3.8/site-packages/urllib3/response.py in stream(self, amt, decode_content)
577 else:
578 while not is_fp_closed(self._fp):
--> 579 data = self.read(amt=amt, decode_content=decode_content)
580
581 if data:
~/anaconda3/lib/python3.8/site-packages/urllib3/response.py in read(self, amt, decode_content, cache_content)
520 else:
521 cache_content = False
--> 522 data = self._fp.read(amt) if not fp_closed else b""
523 if (
524 amt != 0 and not data
~/anaconda3/lib/python3.8/http/client.py in read(self, amt)
456 # Amount is given, implement using readinto
457 b = bytearray(amt)
--> 458 n = self.readinto(b)
459 return memoryview(b)[:n].tobytes()
460 else:
~/anaconda3/lib/python3.8/http/client.py in readinto(self, b)
508 self.length -= n
509 if not self.length:
--> 510 self._close_conn()
511 return n
512
~/anaconda3/lib/python3.8/http/client.py in _close_conn(self)
410 fp = self.fp
411 self.fp = None
--> 412 fp.close()
413
414 def close(self):
~/anaconda3/lib/python3.8/site-packages/warcio/capture_http.py in close(self)
63
64 def close(self):
---> 65 self.recorder.done()
66 if self.fp:
67 return self.fp.close()
~/anaconda3/lib/python3.8/site-packages/warcio/capture_http.py in done(self)
196
197 with self.lock:
--> 198 self.writer.write_request_response_pair(request, response)
199 finally:
200 self.request_out.close()
~/anaconda3/lib/python3.8/site-packages/warcio/warcwriter.py in write_request_response_pair(self, req, resp, params)
31 req.rec_headers.add_header('WARC-Concurrent-To', resp_id)
32
---> 33 self._do_write_req_resp(req, resp, params)
34
35 def write_record(self, record, params=None): #pragma: no cover
~/anaconda3/lib/python3.8/site-packages/warcio/warcwriter.py in _do_write_req_resp(self, req, resp, params)
138
139 def _do_write_req_resp(self, req, resp, params):
--> 140 self._write_warc_record(self.out, resp)
141 self._write_warc_record(self.out, req)
142
~/anaconda3/lib/python3.8/site-packages/warcio/warcwriter.py in _write_warc_record(self, out, record)
89 # write record headers -- encoded as utf-8
90 # WARC headers can be utf-8 per spec
---> 91 out.write(record.rec_headers.to_bytes(encoding='utf-8'))
92
93 # write headers buffer, if any
~/anaconda3/lib/python3.8/site-packages/warcio/warcwriter.py in write(self, buff)
120 # buff = buff.encode('utf-8')
121 buff = self.compressor.compress(buff)
--> 122 self.out.write(buff)
123
124 def flush(self):
ValueError: write to closed file
The following code snippet is intended to do the same thing as above without using a session, and it does work:
for i in range(3):
fn = f'example-session-error-{i}.warc.gz'
with capture_http(fn):
print(f"Scraping {fn}")
requests.get('https://httpbin.org/ip', headers=HEADERS)
Environment
Python - 3.8.5
requests - 2.27.1
warcio - 1.7.4
Any help regarding this issue would be massively appreciated.
Overview
When using `requests.Session` with `capture_http` in a loop to create new WARC files, an error is raised. However, when using `requests` directly, without a session, everything works as expected.
Below is the code snippet using `requests.Session`, alongside the exception raised.
Below is the exception raised.
The following code snippet is intended to do the same thing as above without using a session, and it does work.
Environment
Python - 3.8.5
requests - 2.27.1
warcio - 1.7.4
Any help regarding this issue would be massively appreciated.