When I run the command `bash run/vqa_finetune.bash 2 vqa_lxr955_tiny --tiny`, no errors are reported, but the process usually gets stuck and training is very slow.
0%|▏ | 1385472/407873900 [5:13:53<1534:53:56, 73.56B/s]
18%|██████████ | 72417280/407873900 [4:47:09<22:10:10, 4203.15B/s]
I have to cancel it when the process gets stuck, and the traceback is as follows:
` File "/home1/hli/anaconda3/envs/nnUnet/lib/python3.7/site-packages/urllib3/response.py", line 438, in _error_catcher
yield
File "/home1/hli/anaconda3/envs/nnUnet/lib/python3.7/site-packages/urllib3/response.py", line 519, in read
data = self._fp.read(amt) if not fp_closed else b""
File "/home1/hli/anaconda3/envs/nnUnet/lib/python3.7/http/client.py", line 461, in read
n = self.readinto(b)
File "/home1/hli/anaconda3/envs/nnUnet/lib/python3.7/http/client.py", line 505, in readinto
n = self.fp.readinto(b)
File "/home1/hli/anaconda3/envs/nnUnet/lib/python3.7/socket.py", line 589, in readinto
return self._sock.recv_into(b)
File "/home1/hli/anaconda3/envs/nnUnet/lib/python3.7/ssl.py", line 1071, in recv_into
return self.read(nbytes, buffer)
File "/home1/hli/anaconda3/envs/nnUnet/lib/python3.7/ssl.py", line 929, in read
return self._sslobj.read(len, buffer)
ConnectionResetError: [Errno 104] Connection reset by peer
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home1/hli/anaconda3/envs/nnUnet/lib/python3.7/site-packages/requests/models.py", line 758, in generate
for chunk in self.raw.stream(chunk_size, decode_content=True):
File "/home1/hli/anaconda3/envs/nnUnet/lib/python3.7/site-packages/urllib3/response.py", line 576, in stream
data = self.read(amt=amt, decode_content=decode_content)
File "/home1/hli/anaconda3/envs/nnUnet/lib/python3.7/site-packages/urllib3/response.py", line 541, in read
raise IncompleteRead(self._fp_bytes_read, self.length_remaining)
File "/home1/hli/anaconda3/envs/nnUnet/lib/python3.7/contextlib.py", line 130, in __exit__
self.gen.throw(type, value, traceback)
File "/home1/hli/anaconda3/envs/nnUnet/lib/python3.7/site-packages/urllib3/response.py", line 455, in _error_catcher
raise ProtocolError("Connection broken: %r" % e, e)
urllib3.exceptions.ProtocolError: ("Connection broken: ConnectionResetError(104, 'Connection reset by peer')", ConnectionResetError(104, 'Connection reset by peer'))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "src/tasks/gqa.py", line 178, in <module>
gqa = GQA()
File "src/tasks/gqa.py", line 48, in __init__
self.model = GQAModel(self.train_tuple.dataset.num_answers)
File "/home1/hli/project_301/code/lxmert/src/tasks/gqa_model.py", line 19, in __init__
max_seq_length=MAX_GQA_LENGTH
File "/home1/hli/project_301/code/lxmert/src/lxrt/entry.py", line 95, in __init__
mode=mode
File "/home1/hli/project_301/code/lxmert/src/lxrt/modeling.py", line 743, in from_pretrained
resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir)
File "/home1/hli/project_301/code/lxmert/src/lxrt/file_utils.py", line 99, in cached_path
return get_from_cache(url_or_filename, cache_dir)
File "/home1/hli/project_301/code/lxmert/src/lxrt/file_utils.py", line 210, in get_from_cache
http_get(url, temp_file)
File "/home1/hli/project_301/code/lxmert/src/lxrt/file_utils.py", line 165, in http_get
for chunk in req.iter_content(chunk_size=1024):
File "/home1/hli/anaconda3/envs/nnUnet/lib/python3.7/site-packages/requests/models.py", line 758, in generate
for chunk in self.raw.stream(chunk_size, decode_content=True):
File "/home1/hli/anaconda3/envs/nnUnet/lib/python3.7/site-packages/urllib3/response.py", line 576, in stream
data = self.read(amt=amt, decode_content=decode_content)
File "/home1/hli/anaconda3/envs/nnUnet/lib/python3.7/site-packages/urllib3/response.py", line 519, in read
data = self._fp.read(amt) if not fp_closed else b""
File "/home1/hli/anaconda3/envs/nnUnet/lib/python3.7/http/client.py", line 461, in read
n = self.readinto(b)
File "/home1/hli/anaconda3/envs/nnUnet/lib/python3.7/http/client.py", line 505, in readinto
n = self.fp.readinto(b)
File "/home1/hli/anaconda3/envs/nnUnet/lib/python3.7/socket.py", line 589, in readinto
return self._sock.recv_into(b)
File "/home1/hli/anaconda3/envs/nnUnet/lib/python3.7/ssl.py", line 1071, in recv_into
return self.read(nbytes, buffer)
File "/home1/hli/anaconda3/envs/nnUnet/lib/python3.7/ssl.py", line 929, in read
return self._sslobj.read(len, buffer)
`
Any help would be appreciated.
When I run the command `bash run/vqa_finetune.bash 2 vqa_lxr955_tiny --tiny`, no errors are reported, but the process usually gets stuck and training is very slow.
0%|▏ | 1385472/407873900 [5:13:53<1534:53:56, 73.56B/s]
18%|██████████ | 72417280/407873900 [4:47:09<22:10:10, 4203.15B/s]
I have to cancel it when the process gets stuck, and the traceback is as follows:
` File "/home1/hli/anaconda3/envs/nnUnet/lib/python3.7/site-packages/urllib3/response.py", line 438, in _error_catcher yield File "/home1/hli/anaconda3/envs/nnUnet/lib/python3.7/site-packages/urllib3/response.py", line 519, in read data = self._fp.read(amt) if not fp_closed else b"" File "/home1/hli/anaconda3/envs/nnUnet/lib/python3.7/http/client.py", line 461, in read n = self.readinto(b) File "/home1/hli/anaconda3/envs/nnUnet/lib/python3.7/http/client.py", line 505, in readinto n = self.fp.readinto(b) File "/home1/hli/anaconda3/envs/nnUnet/lib/python3.7/socket.py", line 589, in readinto return self._sock.recv_into(b) File "/home1/hli/anaconda3/envs/nnUnet/lib/python3.7/ssl.py", line 1071, in recv_into return self.read(nbytes, buffer) File "/home1/hli/anaconda3/envs/nnUnet/lib/python3.7/ssl.py", line 929, in read return self._sslobj.read(len, buffer) ConnectionResetError: [Errno 104] Connection reset by peer
During handling of the above exception, another exception occurred:
Traceback (most recent call last): File "/home1/hli/anaconda3/envs/nnUnet/lib/python3.7/site-packages/requests/models.py", line 758, in generate for chunk in self.raw.stream(chunk_size, decode_content=True): File "/home1/hli/anaconda3/envs/nnUnet/lib/python3.7/site-packages/urllib3/response.py", line 576, in stream data = self.read(amt=amt, decode_content=decode_content) File "/home1/hli/anaconda3/envs/nnUnet/lib/python3.7/site-packages/urllib3/response.py", line 541, in read raise IncompleteRead(self._fp_bytes_read, self.length_remaining) File "/home1/hli/anaconda3/envs/nnUnet/lib/python3.7/contextlib.py", line 130, in __exit__ self.gen.throw(type, value, traceback) File "/home1/hli/anaconda3/envs/nnUnet/lib/python3.7/site-packages/urllib3/response.py", line 455, in _error_catcher raise ProtocolError("Connection broken: %r" % e, e) urllib3.exceptions.ProtocolError: ("Connection broken: ConnectionResetError(104, 'Connection reset by peer')", ConnectionResetError(104, 'Connection reset by peer'))
During handling of the above exception, another exception occurred: Traceback (most recent call last): File "src/tasks/gqa.py", line 178, in <module>
gqa = GQA()
File "src/tasks/gqa.py", line 48, in __init__
self.model = GQAModel(self.train_tuple.dataset.num_answers)
File "/home1/hli/project_301/code/lxmert/src/tasks/gqa_model.py", line 19, in __init__
max_seq_length=MAX_GQA_LENGTH
File "/home1/hli/project_301/code/lxmert/src/lxrt/entry.py", line 95, in __init__
mode=mode
File "/home1/hli/project_301/code/lxmert/src/lxrt/modeling.py", line 743, in from_pretrained
resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir)
File "/home1/hli/project_301/code/lxmert/src/lxrt/file_utils.py", line 99, in cached_path
return get_from_cache(url_or_filename, cache_dir)
File "/home1/hli/project_301/code/lxmert/src/lxrt/file_utils.py", line 210, in get_from_cache
http_get(url, temp_file)
File "/home1/hli/project_301/code/lxmert/src/lxrt/file_utils.py", line 165, in http_get
for chunk in req.iter_content(chunk_size=1024):
File "/home1/hli/anaconda3/envs/nnUnet/lib/python3.7/site-packages/requests/models.py", line 758, in generate
for chunk in self.raw.stream(chunk_size, decode_content=True):
File "/home1/hli/anaconda3/envs/nnUnet/lib/python3.7/site-packages/urllib3/response.py", line 576, in stream
data = self.read(amt=amt, decode_content=decode_content)
File "/home1/hli/anaconda3/envs/nnUnet/lib/python3.7/site-packages/urllib3/response.py", line 519, in read
data = self._fp.read(amt) if not fp_closed else b""
File "/home1/hli/anaconda3/envs/nnUnet/lib/python3.7/http/client.py", line 461, in read
n = self.readinto(b)
File "/home1/hli/anaconda3/envs/nnUnet/lib/python3.7/http/client.py", line 505, in readinto
n = self.fp.readinto(b)
File "/home1/hli/anaconda3/envs/nnUnet/lib/python3.7/socket.py", line 589, in readinto
return self._sock.recv_into(b)
File "/home1/hli/anaconda3/envs/nnUnet/lib/python3.7/ssl.py", line 1071, in recv_into
return self.read(nbytes, buffer)
File "/home1/hli/anaconda3/envs/nnUnet/lib/python3.7/ssl.py", line 929, in read
return self._sslobj.read(len, buffer)
`
Any help would be appreciated.