Open yangyang0202 opened 1 year ago
When process_wet_file.py sets WET_URL_ROOT = "https://ds5q9oxwqwsfj.cloudfront.net", the following error occurs:
Traceback (most recent call last):
File "/home/nlp/.local/lib/python3.8/site-packages/submitit/core/submission.py", line 54, in process_job
result = delayed.result()
File "/home/nlp/.local/lib/python3.8/site-packages/submitit/core/utils.py", line 122, in result
self._result = self.function(*self.args, **self.kwargs)
File "/home/nlp/newpan/project/yxn/cc_net/cc_net/mine.py", line 286, in _hashes_shard
jsonql.run_pipes(
File "/home/nlp/newpan/project/yxn/cc_net/cc_net/jsonql.py", line 455, in run_pipes
write_jsons(data, output)
File "/home/nlp/newpan/project/yxn/cc_net/cc_net/jsonql.py", line 496, in write_jsons
for res in source:
File "/home/nlp/newpan/project/yxn/cc_net/cc_net/jsonql.py", line 284, in map
for x in source:
File "/home/nlp/newpan/project/yxn/cc_net/cc_net/process_wet_file.py", line 210, in __iter__
for doc in parse_warc_file(self.open_segment(segment), self.min_len):
File "/home/nlp/newpan/project/yxn/cc_net/cc_net/process_wet_file.py", line 203, in open_segment
return jsonql.open_remote_file(url, cache=file)
File "/home/nlp/newpan/project/yxn/cc_net/cc_net/jsonql.py", line 1124, in open_remote_file
raw_bytes = request_get_content(url)
File "/home/nlp/newpan/project/yxn/cc_net/cc_net/jsonql.py", line 1094, in request_get_content
r = _session().get(url)
File "/home/nlp/anaconda3/envs/yxn_LASER/lib/python3.8/site-packages/requests/sessions.py", line 602, in get
return self.request("GET", url, **kwargs)
File "/home/nlp/anaconda3/envs/yxn_LASER/lib/python3.8/site-packages/requests/sessions.py", line 589, in request
resp = self.send(prep, **send_kwargs)
File "/home/nlp/anaconda3/envs/yxn_LASER/lib/python3.8/site-packages/requests/sessions.py", line 747, in send
r.content
File "/home/nlp/anaconda3/envs/yxn_LASER/lib/python3.8/site-packages/requests/models.py", line 899, in content
self._content = b"".join(self.iter_content(CONTENT_CHUNK_SIZE)) or b""
File "/home/nlp/anaconda3/envs/yxn_LASER/lib/python3.8/site-packages/requests/models.py", line 816, in generate
yield from self.raw.stream(chunk_size, decode_content=True)
File "/home/nlp/.local/lib/python3.8/site-packages/urllib3/response.py", line 579, in stream
data = self.read(amt=amt, decode_content=decode_content)
File "/home/nlp/.local/lib/python3.8/site-packages/urllib3/response.py", line 522, in read
data = self._fp.read(amt) if not fp_closed else b""
File "/home/nlp/anaconda3/envs/yxn_LASER/lib/python3.8/http/client.py", line 454, in read
n = self.readinto(b)
File "/home/nlp/anaconda3/envs/yxn_LASER/lib/python3.8/http/client.py", line 498, in readinto
n = self.fp.readinto(b)
File "/home/nlp/anaconda3/envs/yxn_LASER/lib/python3.8/socket.py", line 669, in readinto
return self._sock.recv_into(b)
File "/home/nlp/anaconda3/envs/yxn_LASER/lib/python3.8/ssl.py", line 1241, in recv_into
return self.read(nbytes, buffer)
File "/home/nlp/anaconda3/envs/yxn_LASER/lib/python3.8/ssl.py", line 1099, in read
return self._sslobj.read(len, buffer)
File "/home/nlp/.local/lib/python3.8/site-packages/submitit/core/job_environment.py", line 211, in checkpoint_and_try_requeue
raise utils.UncompletedJobError(message)
submitit.core.utils.UncompletedJobError: Job not requeued because: timed-out and not checkpointable.
How can this be solved?
When I run
python -m cc_net
to download and extract the data, the connection cannot be opened and the following error is raised: requests.exceptions.HTTPError: 503 Server Error: Service Unavailable for url: https://data.commoncrawl.org/crawl-data/CC-MAIN-2022-05/segments/1642320299852.23/wet/CC-MAIN-20220116093137-20220116123137-00540.warc.wet.gz
In this case, process_wet_file.py sets WET_URL_ROOT = "https://data.commoncrawl.org". How can this problem be solved?