ian-whitestone / pyspark-vs-dask

[WIP] Comparing pyspark and dask for speed, memory/CPU usage, and ease of use
2 stars · 1 fork · source link

S3 API Errors #6

Open ian-whitestone opened 5 years ago

ian-whitestone commented 5 years ago

Some errors I have run into at work while reading a large number of files from S3 in parallel (using dask with s3fs):

2018/11/02 10:00:08|INFO|camp.onelake_extraction: Processing 1439 files
Traceback (most recent call last):
  File "core.py", line 150, in <module>
    main(args)
  File "core.py", line 88, in main
    dataset, start_time, end_time, databricks, run_extraction)
  File "/home/ubuntu/camp-app/camp/onelake_extraction.py", line 599, in run
    get(dsk_graph, keys)
  File "/home/ubuntu/miniconda3/envs/camp/lib/python3.6/site-packages/dask/threaded.py", line 75, in get
    pack_exception=pack_exception, **kwargs)
  File "/home/ubuntu/miniconda3/envs/camp/lib/python3.6/site-packages/dask/local.py", line 521, in get_async
    raise_exception(exc, tb)
  File "/home/ubuntu/miniconda3/envs/camp/lib/python3.6/site-packages/dask/compatibility.py", line 67, in reraise
    raise exc
  File "/home/ubuntu/miniconda3/envs/camp/lib/python3.6/site-packages/dask/local.py", line 290, in execute_task
    result = _execute_task(task, data)
  File "/home/ubuntu/miniconda3/envs/camp/lib/python3.6/site-packages/dask/local.py", line 271, in _execute_task
    return func(*args2)
  File "/home/ubuntu/camp-app/camp/utils.py", line 276, in load_avro
    reader = fastavro.reader(fo)
  File "fastavro/_read.pyx", line 655, in fastavro._read.reader.__init__
  File "fastavro/_read.pyx", line 536, in fastavro._read._read_data
  File "fastavro/_read.pyx", line 432, in fastavro._read.read_record
  File "fastavro/_read.pyx", line 454, in fastavro._read.read_record
  File "fastavro/_read.pyx", line 536, in fastavro._read._read_data
  File "fastavro/_read.pyx", line 312, in fastavro._read.read_fixed
  File "fastavro/_read.pyx", line 315, in fastavro._read.read_fixed
  File "fastavro/_read.pyx", line 580, in fastavro._read.FileObjectReader.read
  File "/home/ubuntu/miniconda3/envs/camp/lib/python3.6/site-packages/s3fs/core.py", line 1270, in read
    self._fetch(self.loc, self.loc + length)
  File "/home/ubuntu/miniconda3/envs/camp/lib/python3.6/site-packages/s3fs/core.py", line 1234, in _fetch
    req_kw=self.s3.req_kw)
  File "/home/ubuntu/miniconda3/envs/camp/lib/python3.6/site-packages/s3fs/core.py", line 1438, in _fetch_range
    **kwargs)
  File "/home/ubuntu/miniconda3/envs/camp/lib/python3.6/site-packages/botocore/client.py", line 320, in _api_call
    return self._make_api_call(operation_name, kwargs)
  File "/home/ubuntu/miniconda3/envs/camp/lib/python3.6/site-packages/botocore/client.py", line 623, in _make_api_call
    raise error_class(parsed_response, operation_name)
botocore.exceptions.ClientError: An error occurred (SlowDown) when calling the GetObject operation (reached max retries: 4): Please reduce your request rate.
Traceback (most recent call last):
  File "/home/ubuntu/miniconda3/envs/dask/lib/python3.6/site-packages/urllib3/response.py", line 331, in _error_catcher
    yield
  File "/home/ubuntu/miniconda3/envs/dask/lib/python3.6/site-packages/urllib3/response.py", line 409, in read
    data = self._fp.read()
  File "/home/ubuntu/miniconda3/envs/dask/lib/python3.6/http/client.py", line 462, in read
    s = self._safe_read(self.length)
  File "/home/ubuntu/miniconda3/envs/dask/lib/python3.6/http/client.py", line 614, in _safe_read
    raise IncompleteRead(b''.join(s), amt)
http.client.IncompleteRead: IncompleteRead(8427 bytes read, 8202 more expected)

During handling of the above exception, another exception occurred:
....

  File "/home/ubuntu/miniconda3/envs/dask/lib/python3.6/site-packages/dask/local.py", line 252, in <listcomp>
    args2 = [_execute_task(a, cache) for a in args]
  File "/home/ubuntu/miniconda3/envs/dask/lib/python3.6/site-packages/dask/local.py", line 249, in _execute_task
    return [_execute_task(a, cache) for a in arg]
  File "/home/ubuntu/miniconda3/envs/dask/lib/python3.6/site-packages/dask/local.py", line 249, in <listcomp>
    return [_execute_task(a, cache) for a in arg]
  File "/home/ubuntu/miniconda3/envs/dask/lib/python3.6/site-packages/dask/local.py", line 252, in _execute_task
    args2 = [_execute_task(a, cache) for a in args]
  File "/home/ubuntu/miniconda3/envs/dask/lib/python3.6/site-packages/dask/local.py", line 252, in <listcomp>
    args2 = [_execute_task(a, cache) for a in args]
  File "/home/ubuntu/miniconda3/envs/dask/lib/python3.6/site-packages/dask/local.py", line 253, in _execute_task
    return func(*args2)
  File "/home/ubuntu/miniconda3/envs/dask/lib/python3.6/site-packages/dask/bag/avro.py", line 149, in read_file
    return list(fastavro.iter_avro(f))
  File "fastavro/_read.pyx", line 688, in fastavro._read.reader.__init__
  File "fastavro/_read.pyx", line 654, in fastavro._read.file_reader.__init__
  File "fastavro/_read.pyx", line 539, in fastavro._read._read_data
  File "fastavro/_read.pyx", line 451, in fastavro._read.read_record
  File "fastavro/_read.pyx", line 529, in fastavro._read._read_data
  File "fastavro/_read.pyx", line 312, in fastavro._read.read_fixed
  File "/home/ubuntu/miniconda3/envs/dask/lib/python3.6/site-packages/s3fs/core.py", line 1311, in read
    self._fetch(self.loc, self.loc + length)
  File "/home/ubuntu/miniconda3/envs/dask/lib/python3.6/site-packages/s3fs/core.py", line 1275, in _fetch
    req_kw=self.s3.req_kw)
  File "/home/ubuntu/miniconda3/envs/dask/lib/python3.6/site-packages/s3fs/core.py", line 1496, in _fetch_range
    return resp['Body'].read()
  File "/home/ubuntu/miniconda3/envs/dask/lib/python3.6/site-packages/botocore/response.py", line 78, in read
    chunk = self._raw_stream.read(amt)
  File "/home/ubuntu/miniconda3/envs/dask/lib/python3.6/site-packages/urllib3/response.py", line 430, in read
    raise IncompleteRead(self._fp_bytes_read, self.length_remaining)
  File "/home/ubuntu/miniconda3/envs/dask/lib/python3.6/contextlib.py", line 99, in __exit__
    self.gen.throw(type, value, traceback)
  File "/home/ubuntu/miniconda3/envs/dask/lib/python3.6/site-packages/urllib3/response.py", line 349, in _error_catcher
    raise ProtocolError('Connection broken: %r' % e, e)
urllib3.exceptions.ProtocolError: ('Connection broken: IncompleteRead(8427 bytes read, 8202 more expected)', IncompleteRead(8427 bytes read, 8202 more expected))
  File "/home/ubuntu/miniconda3/envs/dask/lib/python3.6/site-packages/s3fs/core.py", line 1495, in _fetch_range
    **kwargs)
  File "/home/ubuntu/miniconda3/envs/dask/lib/python3.6/site-packages/botocore/client.py", line 320, in _api_call
    return self._make_api_call(operation_name, kwargs)
  File "/home/ubuntu/miniconda3/envs/dask/lib/python3.6/site-packages/botocore/client.py", line 623, in _make_api_call
    raise error_class(parsed_response, operation_name)
botocore.exceptions.ClientError: An error occurred (InternalError) when calling the GetObject operation (reached max retries: 4): We encountered an internal error. Please try again.
Traceback (most recent call last):
  File "/home/ubuntu/miniconda3/envs/dask/lib/python3.6/site-packages/urllib3/response.py", line 331, in _error_catcher
    yield
  File "/home/ubuntu/miniconda3/envs/dask/lib/python3.6/site-packages/urllib3/response.py", line 409, in read
    data = self._fp.read()
  File "/home/ubuntu/miniconda3/envs/dask/lib/python3.6/http/client.py", line 462, in read
    s = self._safe_read(self.length)
  File "/home/ubuntu/miniconda3/envs/dask/lib/python3.6/http/client.py", line 612, in _safe_read
    chunk = self.fp.read(min(amt, MAXAMOUNT))
  File "/home/ubuntu/miniconda3/envs/dask/lib/python3.6/socket.py", line 586, in readinto
    return self._sock.recv_into(b)
  File "/home/ubuntu/miniconda3/envs/dask/lib/python3.6/ssl.py", line 1009, in recv_into
    return self.read(nbytes, buffer)
  File "/home/ubuntu/miniconda3/envs/dask/lib/python3.6/ssl.py", line 871, in read
    return self._sslobj.read(len, buffer)
  File "/home/ubuntu/miniconda3/envs/dask/lib/python3.6/ssl.py", line 631, in read
    v = self._sslobj.read(len, buffer)
ConnectionResetError: [Errno 104] Connection reset by peer

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "dsk_filter_cnt.py", line 73, in <module>
    cnt = df.payload.count().compute(scheduler='threads', num_workers=num_workers)
  File "/home/ubuntu/miniconda3/envs/dask/lib/python3.6/site-packages/dask/base.py", line 156, in compute
    (result,) = compute(self, traverse=False, **kwargs)
  File "/home/ubuntu/miniconda3/envs/dask/lib/python3.6/site-packages/dask/base.py", line 395, in compute
    results = schedule(dsk, keys, **kwargs)
  File "/home/ubuntu/miniconda3/envs/dask/lib/python3.6/site-packages/dask/threaded.py", line 75, in get
    pack_exception=pack_exception, **kwargs)
  File "/home/ubuntu/miniconda3/envs/dask/lib/python3.6/site-packages/dask/local.py", line 501, in get_async
    raise_exception(exc, tb)
  File "/home/ubuntu/miniconda3/envs/dask/lib/python3.6/site-packages/dask/compatibility.py", line 112, in reraise
    raise exc
  File "/home/ubuntu/miniconda3/envs/dask/lib/python3.6/site-packages/dask/local.py", line 272, in execute_task
    result = _execute_task(task, data)
  File "/home/ubuntu/miniconda3/envs/dask/lib/python3.6/site-packages/dask/local.py", line 252, in _execute_task
    args2 = [_execute_task(a, cache) for a in args]
  File "/home/ubuntu/miniconda3/envs/dask/lib/python3.6/site-packages/dask/local.py", line 252, in <listcomp>
    args2 = [_execute_task(a, cache) for a in args]
  File "/home/ubuntu/miniconda3/envs/dask/lib/python3.6/site-packages/dask/local.py", line 249, in _execute_task
    return [_execute_task(a, cache) for a in arg]
  File "/home/ubuntu/miniconda3/envs/dask/lib/python3.6/site-packages/dask/local.py", line 249, in <listcomp>
    return [_execute_task(a, cache) for a in arg]
  File "/home/ubuntu/miniconda3/envs/dask/lib/python3.6/site-packages/dask/local.py", line 252, in _execute_task
    args2 = [_execute_task(a, cache) for a in args]
  File "/home/ubuntu/miniconda3/envs/dask/lib/python3.6/site-packages/dask/local.py", line 252, in <listcomp>
    args2 = [_execute_task(a, cache) for a in args]
  File "/home/ubuntu/miniconda3/envs/dask/lib/python3.6/site-packages/dask/local.py", line 252, in _execute_task
    args2 = [_execute_task(a, cache) for a in args]
  File "/home/ubuntu/miniconda3/envs/dask/lib/python3.6/site-packages/dask/local.py", line 252, in <listcomp>
    args2 = [_execute_task(a, cache) for a in args]
  File "/home/ubuntu/miniconda3/envs/dask/lib/python3.6/site-packages/dask/local.py", line 252, in _execute_task
    args2 = [_execute_task(a, cache) for a in args]
  File "/home/ubuntu/miniconda3/envs/dask/lib/python3.6/site-packages/dask/local.py", line 252, in <listcomp>
    args2 = [_execute_task(a, cache) for a in args]
  File "/home/ubuntu/miniconda3/envs/dask/lib/python3.6/site-packages/dask/local.py", line 252, in _execute_task
    args2 = [_execute_task(a, cache) for a in args]
  File "/home/ubuntu/miniconda3/envs/dask/lib/python3.6/site-packages/dask/local.py", line 252, in <listcomp>
    args2 = [_execute_task(a, cache) for a in args]
  File "/home/ubuntu/miniconda3/envs/dask/lib/python3.6/site-packages/dask/local.py", line 253, in _execute_task
    return func(*args2)
  File "/home/ubuntu/miniconda3/envs/dask/lib/python3.6/site-packages/dask/bag/avro.py", line 149, in read_file
    return list(fastavro.iter_avro(f))
  File "fastavro/_read.pyx", line 688, in fastavro._read.reader.__init__
  File "fastavro/_read.pyx", line 654, in fastavro._read.file_reader.__init__
  File "fastavro/_read.pyx", line 539, in fastavro._read._read_data
  File "fastavro/_read.pyx", line 451, in fastavro._read.read_record
  File "fastavro/_read.pyx", line 529, in fastavro._read._read_data
  File "fastavro/_read.pyx", line 312, in fastavro._read.read_fixed
  File "/home/ubuntu/miniconda3/envs/dask/lib/python3.6/site-packages/s3fs/core.py", line 1311, in read
    self._fetch(self.loc, self.loc + length)
  File "/home/ubuntu/miniconda3/envs/dask/lib/python3.6/site-packages/s3fs/core.py", line 1275, in _fetch
    req_kw=self.s3.req_kw)
  File "/home/ubuntu/miniconda3/envs/dask/lib/python3.6/site-packages/s3fs/core.py", line 1496, in _fetch_range
    return resp['Body'].read()
  File "/home/ubuntu/miniconda3/envs/dask/lib/python3.6/site-packages/botocore/response.py", line 78, in read
    chunk = self._raw_stream.read(amt)
  File "/home/ubuntu/miniconda3/envs/dask/lib/python3.6/site-packages/urllib3/response.py", line 430, in read
    raise IncompleteRead(self._fp_bytes_read, self.length_remaining)
  File "/home/ubuntu/miniconda3/envs/dask/lib/python3.6/contextlib.py", line 99, in __exit__
    self.gen.throw(type, value, traceback)
  File "/home/ubuntu/miniconda3/envs/dask/lib/python3.6/site-packages/urllib3/response.py", line 349, in _error_catcher
    raise ProtocolError('Connection broken: %r' % e, e)
urllib3.exceptions.ProtocolError: ("Connection broken: ConnectionResetError(104, 'Connection reset by peer')", ConnectionResetError(104, 'Connection reset by peer'))