ARCC-RACE / deepracer-for-dummies

A quick way to get up and running with a local DeepRacer training environment.
66 stars 28 forks source link

Connection closed before response received #23

Closed Michael-Equi closed 5 years ago

Michael-Equi commented 5 years ago

Checkpoint> Saving in path=['./checkpoint/14_Step-10846.ckpt'] {"simapp_exception": {"version": "1.0", "date": "2019-08-04 04:38:10.380303", "function": "save_to_store", "message": "Exception [Connection was closed before we received a valid response from endpoint URL: \"http://minio:9000/bucket/rl-deepracer-sagemaker/model/14_Step-10846.ckpt.data-00000-of-00001?uploadId=27a99f19-659f-4098-8917-02e4ba72e85f&partNumber=16\".] occured while uploading files on S3 for checkpoint", "exceptionType": "s3_datastore.exceptions", "eventType": "system_error", "errorCode": "500"}} {"simapp_exception": {"version": "1.0", "date": "2019-08-04 04:38:10.384121", "function": "training_worker", "message": "An error occured while training: Connection was closed before we received a valid response from endpoint URL: \"http://minio:9000/bucket/rl-deepracer-sagemaker/model/14_Step-10846.ckpt.data-00000-of-00001?uploadId=27a99f19-659f-4098-8917-02e4ba72e85f&partNumber=16\".. Job failed!.", "exceptionType": "training_worker.exceptions", "eventType": "system_error", "errorCode": "503"}} Traceback (most recent call last): File "/usr/local/lib/python3.6/dist-packages/urllib3/connectionpool.py", line 600, in urlopen chunked=chunked) File "/usr/local/lib/python3.6/dist-packages/urllib3/connectionpool.py", line 354, in _make_request conn.request(method, url, *httplib_request_kw) File "/usr/lib/python3.6/http/client.py", line 1239, in request self._send_request(method, url, body, headers, encode_chunked) File "/usr/local/lib/python3.6/dist-packages/botocore/awsrequest.py", line 125, in _send_request method, url, body, headers, args, **kwargs) File "/usr/lib/python3.6/http/client.py", line 1285, in _send_request self.endheaders(body, encode_chunked=encode_chunked) File "/usr/lib/python3.6/http/client.py", line 1234, in endheaders self._send_output(message_body, encode_chunked=encode_chunked) File "/usr/local/lib/python3.6/dist-packages/botocore/awsrequest.py", line 176, in _send_output 
self.send(message_body) File "/usr/local/lib/python3.6/dist-packages/botocore/awsrequest.py", line 236, in send return super(AWSConnection, self).send(str) File "/usr/lib/python3.6/http/client.py", line 983, in send self.sock.sendall(datablock) ConnectionResetError: [Errno 104] Connection reset by peer

During handling of the above exception, another exception occurred:

Traceback (most recent call last): File "/usr/local/lib/python3.6/dist-packages/botocore/httpsession.py", line 258, in send decode_content=False, File "/usr/local/lib/python3.6/dist-packages/urllib3/connectionpool.py", line 638, in urlopen _stacktrace=sys.exc_info()[2]) File "/usr/local/lib/python3.6/dist-packages/urllib3/util/retry.py", line 344, in increment raise six.reraise(type(error), error, _stacktrace) File "/usr/local/lib/python3.6/dist-packages/urllib3/packages/six.py", line 685, in reraise raise value.with_traceback(tb) File "/usr/local/lib/python3.6/dist-packages/urllib3/connectionpool.py", line 600, in urlopen chunked=chunked) File "/usr/local/lib/python3.6/dist-packages/urllib3/connectionpool.py", line 354, in _make_request conn.request(method, url, *httplib_request_kw) File "/usr/lib/python3.6/http/client.py", line 1239, in request self._send_request(method, url, body, headers, encode_chunked) File "/usr/local/lib/python3.6/dist-packages/botocore/awsrequest.py", line 125, in _send_request method, url, body, headers, args, **kwargs) File "/usr/lib/python3.6/http/client.py", line 1285, in _send_request self.endheaders(body, encode_chunked=encode_chunked) File "/usr/lib/python3.6/http/client.py", line 1234, in endheaders self._send_output(message_body, encode_chunked=encode_chunked) File "/usr/local/lib/python3.6/dist-packages/botocore/awsrequest.py", line 176, in _send_output self.send(message_body) File "/usr/local/lib/python3.6/dist-packages/botocore/awsrequest.py", line 236, in send return super(AWSConnection, self).send(str) File "/usr/lib/python3.6/http/client.py", line 983, in send self.sock.sendall(datablock) urllib3.exceptions.ProtocolError: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))

During handling of the above exception, another exception occurred:

Traceback (most recent call last): File "training_worker.py", line 108, in training_worker graph_manager.save_checkpoint() File "/usr/local/lib/python3.6/dist-packages/rl_coach/graph_managers/graph_manager.py", line 624, in save_checkpoint data_store.save_to_store() File "/opt/ml/code/markov/s3_boto_data_store.py", line 144, in save_to_store raise e File "/opt/ml/code/markov/s3_boto_data_store.py", line 100, in save_to_store Key=self._get_s3_key(rel_name)) File "/usr/local/lib/python3.6/dist-packages/boto3/s3/inject.py", line 131, in upload_file extra_args=ExtraArgs, callback=Callback) File "/usr/local/lib/python3.6/dist-packages/boto3/s3/transfer.py", line 279, in upload_file future.result() File "/usr/local/lib/python3.6/dist-packages/s3transfer/futures.py", line 73, in result return self._coordinator.result() File "/usr/local/lib/python3.6/dist-packages/s3transfer/futures.py", line 233, in result raise self._exception File "/usr/local/lib/python3.6/dist-packages/s3transfer/tasks.py", line 126, in call return self._execute_main(kwargs) File "/usr/local/lib/python3.6/dist-packages/s3transfer/tasks.py", line 150, in _execute_main return_value = self._main(kwargs) File "/usr/local/lib/python3.6/dist-packages/s3transfer/upload.py", line 722, in _main Body=body, extra_args) File "/usr/local/lib/python3.6/dist-packages/botocore/client.py", line 357, in _api_call return self._make_api_call(operation_name, kwargs) File "/usr/local/lib/python3.6/dist-packages/botocore/client.py", line 648, in _make_api_call operation_model, request_dict, request_context) File "/usr/local/lib/python3.6/dist-packages/botocore/client.py", line 667, in _make_request return self._endpoint.make_request(operation_model, request_dict) File "/usr/local/lib/python3.6/dist-packages/botocore/endpoint.py", line 102, in make_request return self._send_request(request_dict, operation_model) File "/usr/local/lib/python3.6/dist-packages/botocore/endpoint.py", line 137, in _send_request success_response, 
exception): File "/usr/local/lib/python3.6/dist-packages/botocore/endpoint.py", line 231, in _needs_retry caught_exception=caught_exception, request_dict=request_dict) File "/usr/local/lib/python3.6/dist-packages/botocore/hooks.py", line 356, in emit return self._emitter.emit(aliased_event_name, kwargs) File "/usr/local/lib/python3.6/dist-packages/botocore/hooks.py", line 228, in emit return self._emit(event_name, kwargs) File "/usr/local/lib/python3.6/dist-packages/botocore/hooks.py", line 211, in _emit response = handler(kwargs) File "/usr/local/lib/python3.6/dist-packages/botocore/retryhandler.py", line 183, in call if self._checker(attempts, response, caught_exception): File "/usr/local/lib/python3.6/dist-packages/botocore/retryhandler.py", line 251, in call caught_exception) File "/usr/local/lib/python3.6/dist-packages/botocore/retryhandler.py", line 277, in _should_retry return self._checker(attempt_number, response, caught_exception) File "/usr/local/lib/python3.6/dist-packages/botocore/retryhandler.py", line 317, in call caught_exception) File "/usr/local/lib/python3.6/dist-packages/botocore/retryhandler.py", line 223, in call attempt_number, caught_exception) File "/usr/local/lib/python3.6/dist-packages/botocore/retryhandler.py", line 359, in _check_caught_exception raise caught_exception File "/usr/local/lib/python3.6/dist-packages/botocore/endpoint.py", line 200, in _do_get_response http_response = self._send(request) File "/usr/local/lib/python3.6/dist-packages/botocore/endpoint.py", line 244, in _send return self.http_session.send(request) File "/usr/local/lib/python3.6/dist-packages/botocore/httpsession.py", line 289, in send endpoint_url=request.url botocore.exceptions.ConnectionClosedError: Connection was closed before we received a valid response from endpoint URL: "http://minio:9000/bucket/rl-deepracer-sagemaker/model/14_Step-10846.ckpt.data-00000-of-00001?uploadId=27a99f19-659f-4098-8917-02e4ba72e85f&partNumber=16".