apache / iceberg-python

Apache PyIceberg
https://py.iceberg.apache.org/
Apache License 2.0

Nessie Iceberg REST catalog and writing to localstack raises `OSError: When initiating multiple part upload` #1087

Closed. PetrasTYR closed this issue 2 months ago.

PetrasTYR commented 2 months ago

Apache Iceberg version

0.7.1 (latest release)

Please describe the bug 🐞

Hello!

I am trying to use Nessie as an Iceberg catalog, specifically the REST catalog that the Nessie server provides, and to write to an S3 bucket in LocalStack, but I hit this error:

OSError: When initiating multiple part upload for key 'rpmd/performance_68d55afc-e4d7-4055-932c-25f52e0e2f74/data/00000-0-b33665ff-ccbf-426f-bbd1-94353801131a.parquet' in bucket 'warehouse': AWS Error NETWORK_CONNECTION during CreateMultipartUpload operation: Encountered network error when sending http request

docker-compose.yaml:

# version: '3.9'

services:
  nessie:
    image: ghcr.io/projectnessie/nessie:0.95.0
    container_name: nessie
    ports:
      - 19120:19120
      - 9001:9000
    environment:
      - NESSIE_SERVER_DEFAULT-BRANCH=main
      - NESSIE_VERSION_STORE_TYPE=DYNAMODB
      - QUARKUS_DYNAMODB_AWS_REGION=ap-southeast-1
      - QUARKUS_DYNAMODB_ENDPOINT-OVERRIDE=http://localstack:4566
      - NESSIE_CATALOG_WAREHOUSEDEFAULTS_DEFAULT-WAREHOUSE=warehouse
      - NESSIE_CATALOG_DEFAULT-WAREHOUSE=warehouse
      - NESSIE_CATALOG_WAREHOUSES_WAREHOUSE_LOCATION=s3://warehouse
      - NESSIE_CATALOG_SERVICE_S3_DEFAULT-OPTIONS_REGION=ap-southeast-1
      - NESSIE_CATALOG_SERVICE_S3_DEFAULT-OPTIONS_ACCESS-KEY_NAME=fake
      - NESSIE_CATALOG_SERVICE_S3_DEFAULT-OPTIONS_ACCESS-KEY_SECRET=password
      - NESSIE_CATALOG_SERVICE_S3_DEFAULT-OPTIONS_ENDPOINT=http://localstack:4566
      - NESSIE_CATALOG_SERVICE_S3_DEFAULT-OPTIONS_PATH-STYLE-ACCESS=true
      - NESSIE_SERVER_AUTHENTICATION_ENABLED=false
      - AWS_ACCESS_KEY_ID=fake
      - AWS_SECRET_ACCESS_KEY=password
      - AWS_REGION=ap-southeast-1
    networks:
      - nessie-network

  trino:
    image: trinodb/trino:453
    container_name: trino
    ports:
      - 8080:8080
    networks:
      - nessie-network
    volumes:
      - "./iceberg.properties:/etc/trino/catalog/iceberg.properties"
      - "./iceberg.properties:/etc/catalog/iceberg.properties"
      - "./config.properties:/etc/trino/config.properties"

  trino-worker:
    image: trinodb/trino:453
    container_name: trino-worker
    ports:
      - 8081:8080
    networks:
      - nessie-network
    volumes:
      - "./iceberg.properties:/etc/trino/catalog/iceberg.properties"
      - "./worker-config.properties:/etc/trino/config.properties"

  localstack:
    image: localstack/localstack:latest
    container_name: localstack
    ports:
      - 4566:4566
    networks:
      - nessie-network
    environment:
      - DEBUG=${DEBUG-}
      - DOCKER_HOST=unix:///var/run/docker.sock
      - SKIP_SSL_CERT_DOWNLOAD=1
      - EDGE_PORT=4566

networks:
  nessie-network:
    driver: bridge

my python file:

from pyiceberg.catalog import load_rest
from pyiceberg.schema import Schema
from pyiceberg.types import StringType, NestedField, DoubleType
from pyiceberg.partitioning import PartitionSpec, PartitionField
import numpy as np
import pandas as pd

rows = 10**1
ncols = 10
countries = ["US", "CA", "UK"]
attr2 = ["OPEN", "CLOSE", "LOW", "HIGH"]
dates = pd.date_range("2020-01-01", "2020-12-31")
data_orig = pd.DataFrame(
    [
        {
            "countries": countries[i % len(countries)],
            "status": attr2[i % len(attr2)],
            "return_index": np.random.rand(),
        }
        for i in range(rows)
    ]
)

schema = Schema(
    NestedField(field_id=1, name="countries", field_type=StringType(), required=False),
    NestedField(field_id=2, name="status", field_type=StringType(), required=False),
    NestedField(
        field_id=3, name="return_index", field_type=DoubleType(), required=False
    ),
)
partition_spec = PartitionSpec(
    fields=[
        PartitionField(
            source_id=1, field_id=1000, name="countries", transform="identity"
        ),
        PartitionField(source_id=2, field_id=1001, name="status", transform="identity"),
    ]
)
catalog = load_rest(
    "rest",
    conf={
        "uri": "http://localhost:19120/iceberg",
        "s3.endpoint": "http://localhost:4566",
        "s3.access-key-id": "fake",
        "s3.secret-access-key": "password",
    },
)
catalog.create_namespace_if_not_exists("rpmd")
tables = catalog.list_tables(namespace="rpmd")
print("tables")
print(tables)

table = catalog.create_table_if_not_exists(
    identifier="rpmd.performance",
    schema=schema,
    # partition_spec=partition_spec
)
import pyarrow as pa
import pyarrow.parquet as pq

tbl = pa.Table.from_pandas(data_orig)
# pq.write_table()
table.append(tbl)

The stacktrace:

Traceback (most recent call last):
  File "C:\Users\USER\AppData\Local\pypoetry\Cache\virtualenvs\python-scripts-USencysj-py3.11\Lib\site-packages\pyiceberg\table\__init__.py", line 509, in append
    for data_file in data_files:
  File "C:\Users\USER\AppData\Local\pypoetry\Cache\virtualenvs\python-scripts-USencysj-py3.11\Lib\site-packages\pyiceberg\io\pyarrow.py", line 2354, in _dataframe_to_data_files
    yield from write_file(
  File "C:\Users\USER\AppData\Local\Programs\Python\Python311\Lib\concurrent\futures\_base.py", line 619, in result_iterator
    yield _result_or_cancel(fs.pop())
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\USER\AppData\Local\Programs\Python\Python311\Lib\concurrent\futures\_base.py", line 317, in _result_or_cancel
    return fut.result(timeout)
           ^^^^^^^^^^^^^^^^^^^
  File "C:\Users\USER\AppData\Local\Programs\Python\Python311\Lib\concurrent\futures\_base.py", line 456, in result
    return self.__get_result()
           ^^^^^^^^^^^^^^^^^^^
  File "C:\Users\USER\AppData\Local\Programs\Python\Python311\Lib\concurrent\futures\_base.py", line 401, in __get_result
    raise self._exception
  File "C:\Users\USER\AppData\Local\Programs\Python\Python311\Lib\concurrent\futures\thread.py", line 58, in run
    result = self.fn(*self.args, **self.kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\USER\AppData\Local\pypoetry\Cache\virtualenvs\python-scripts-USencysj-py3.11\Lib\site-packages\pyiceberg\io\pyarrow.py", line 2174, in write_parquet
    with fo.create(overwrite=True) as fos:
         ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\USER\AppData\Local\pypoetry\Cache\virtualenvs\python-scripts-USencysj-py3.11\Lib\site-packages\pyiceberg\io\pyarrow.py", line 307, in create
    output_file = self._filesystem.open_output_stream(self._path, buffer_size=self._buffer_size)
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "pyarrow\\_fs.pyx", line 887, in pyarrow._fs.FileSystem.open_output_stream
  File "pyarrow\\error.pxi", line 155, in pyarrow.lib.pyarrow_internal_check_status
  File "pyarrow\\error.pxi", line 92, in pyarrow.lib.check_status
OSError: When initiating multiple part upload for key 'rpmd/performance_68d55afc-e4d7-4055-932c-25f52e0e2f74/data/00000-0-b33665ff-ccbf-426f-bbd1-94353801131a.parquet' in bucket 'warehouse': AWS Error NETWORK_CONNECTION during CreateMultipartUpload operation: Encountered network error when sending http request

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\USER\Documents\Code\Playground\python-scripts\iceberg.py", line 60, in <module>
    table.append(tbl)
  File "C:\Users\USER\AppData\Local\pypoetry\Cache\virtualenvs\python-scripts-USencysj-py3.11\Lib\site-packages\pyiceberg\table\__init__.py", line 1578, in append
    tx.append(df=df, snapshot_properties=snapshot_properties)
  File "C:\Users\USER\AppData\Local\pypoetry\Cache\virtualenvs\python-scripts-USencysj-py3.11\Lib\site-packages\pyiceberg\table\__init__.py", line 503, in append
    with append_method() as append_files:
  File "C:\Users\USER\AppData\Local\pypoetry\Cache\virtualenvs\python-scripts-USencysj-py3.11\Lib\site-packages\pyiceberg\table\__init__.py", line 2094, in __exit__
    self.commit()
  File "C:\Users\USER\AppData\Local\pypoetry\Cache\virtualenvs\python-scripts-USencysj-py3.11\Lib\site-packages\pyiceberg\table\__init__.py", line 2090, in commit
    self._transaction._apply(*self._commit())
                              ^^^^^^^^^^^^^^
  File "C:\Users\USER\AppData\Local\pypoetry\Cache\virtualenvs\python-scripts-USencysj-py3.11\Lib\site-packages\pyiceberg\table\__init__.py", line 3218, in _commit
    with write_manifest_list(
  File "C:\Users\USER\AppData\Local\pypoetry\Cache\virtualenvs\python-scripts-USencysj-py3.11\Lib\site-packages\pyiceberg\manifest.py", line 924, in __enter__
    self._writer.__enter__()
  File "C:\Users\USER\AppData\Local\pypoetry\Cache\virtualenvs\python-scripts-USencysj-py3.11\Lib\site-packages\pyiceberg\avro\file.py", line 258, in __enter__
    self.output_stream = self.output_file.create(overwrite=True)
                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\USER\AppData\Local\pypoetry\Cache\virtualenvs\python-scripts-USencysj-py3.11\Lib\site-packages\pyiceberg\io\pyarrow.py", line 307, in create
    output_file = self._filesystem.open_output_stream(self._path, buffer_size=self._buffer_size)
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "pyarrow\\_fs.pyx", line 887, in pyarrow._fs.FileSystem.open_output_stream
  File "pyarrow\\error.pxi", line 155, in pyarrow.lib.pyarrow_internal_check_status
  File "pyarrow\\error.pxi", line 92, in pyarrow.lib.check_status
OSError: When initiating multiple part upload for key 'rpmd/performance_68d55afc-e4d7-4055-932c-25f52e0e2f74/metadata/snap-2642215192225696858-0-b33665ff-ccbf-426f-bbd1-94353801131a.avro' in bucket 'warehouse': AWS Error NETWORK_CONNECTION during CreateMultipartUpload operation: Encountered network error when sending http request

Package versions:

python = "^3.11"
pyarrow = "^17.0.0"
pyiceberg = "^0.7.1"
pandas = "^2.2.2"

I've tried the solution in issue #974, but it still does not work. I'd greatly appreciate it if someone could try this out. Thanks!

Edited to update the actual load_rest configs and package versions

PetrasTYR commented 2 months ago

I updated my docker-compose.yaml to use extra_hosts and it worked. Closing this issue.
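
For anyone looking for the general shape of that change: the exact edit was not posted, so the snippet below is only a sketch of what an extra_hosts entry in docker-compose.yaml can look like. The service it is attached to and the hostname mapping are assumptions, not the author's actual fix.

services:
  your-service:  # hypothetical service name; attach this to the container that cannot resolve the hostname
    extra_hosts:
      # "host-gateway" is a special value that Docker resolves to the host's gateway IP,
      # so the hostname on the left becomes reachable from inside the container.
      - "localstack:host-gateway"

With an entry like this, requests to http://localstack:4566 from inside that container reach port 4566 published on the Docker host, assuming the port is published as in the compose file above.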

allilou commented 1 month ago

> I updated my docker-compose.yaml to use extra_hosts and it worked. Closing this issue.

I'm facing the same error. Could you please share a snippet showing how you added the extra_hosts?

Thanks

smsmith97 commented 1 month ago

Yeah, that would be great @PetrasTYR. I am facing the same issue.

allilou commented 3 weeks ago


Finally, it was an error on my side. I had used environment variables to set the uri and the s3.endpoint, and that is where the mistake was.
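
For anyone who hits the same class of mistake: the values that end up in the load_rest config are worth printing before use. A minimal sketch, assuming hypothetical environment variable names NESSIE_URI and S3_ENDPOINT (not the actual setup above), with fallbacks taken from the script earlier in this thread:

from pyiceberg.catalog import load_rest
import os

# Hypothetical variable names; adjust to your own environment.
nessie_uri = os.environ.get("NESSIE_URI", "http://localhost:19120/iceberg")
s3_endpoint = os.environ.get("S3_ENDPOINT", "http://localhost:4566")

# Printing the resolved values makes a typo or an unset variable obvious up front,
# instead of surfacing later as a network error during the S3 upload.
print("uri:", nessie_uri, "s3.endpoint:", s3_endpoint)

catalog = load_rest(
    "rest",
    conf={
        "uri": nessie_uri,
        "s3.endpoint": s3_endpoint,
        "s3.access-key-id": os.environ.get("AWS_ACCESS_KEY_ID", "fake"),
        "s3.secret-access-key": os.environ.get("AWS_SECRET_ACCESS_KEY", "password"),
    },
)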