superduper-io / superduper

Superduper: Build end-to-end AI applications and agent workflows on your existing data infrastructure and preferred tools - without migrating your data.
https://superduper.io
Apache License 2.0
4.81k stars 464 forks source link

[BUG]: MongoDataBackend got unexpected keyword argument 'conn' #2260

Closed Jesse-jApps closed 4 months ago

Jesse-jApps commented 4 months ago

Contact Details [Optional]

No response

System Information

{
  "cfg": {
    "data_backend": "mongodb://mongodb:27017/test_db",
    "lance_home": ".superduperdb/vector_indices",
    "artifact_store": null,
    "metadata_store": null,
    "cluster": {
      "compute": {
        "uri": null,
        "compute_kwargs": {}
      },
      "crontab": {
        "uri": null
      },
      "vector_search": {
        "uri": null,
        "type": "in_memory",
        "backfill_batch_size": 100
      },
      "rest": {
        "uri": null,
        "config": null
      },
      "cdc": {
        "uri": null,
        "strategy": {
          "type": "incremental",
          "auto_increment_field": null,
          "frequency": "30"
        }
      }
    },
    "retries": {
      "stop_after_attempt": 2,
      "wait_max": 10.0,
      "wait_min": 4.0,
      "wait_multiplier": 1.0
    },
    "downloads": {
      "folder": null,
      "n_workers": 0,
      "headers": {
        "User-Agent": "me"
      },
      "timeout": null
    },
    "fold_probability": 0.05,
    "log_level": "INFO",
    "logging_type": "SYSTEM",
    "bytes_encoding": "Bytes",
    "auto_schema": true
  },
  "cwd": "/affili",
  "freeze": [
    "aiohttp==3.9.5",
    "aiohttp-cors==0.7.0",
    "aiosignal==1.3.1",
    "amqp==5.2.0",
    "annotated-types==0.7.0",
    "anyio==4.4.0",
    "APScheduler==3.10.4",
    "asgiref==3.8.1",
    "atpublic==4.1.0",
    "attrs==23.2.0",
    "bidict==0.23.1",
    "billiard==4.2.0",
    "boto3==1.34.140",
    "botocore==1.34.140",
    "cachetools==5.3.3",
    "celery==5.4.0",
    "certifi==2024.7.4",
    "charset-normalizer==3.3.2",
    "click==8.1.7",
    "click-didyoumean==0.3.1",
    "click-plugins==1.1.1",
    "click-repl==0.3.0",
    "colorful==0.5.6",
    "dill==0.3.8",
    "distlib==0.3.8",
    "distro==1.9.0",
    "Django==5.0.6",
    "django-chartjs==2.3.0",
    "django-redis==5.4.0",
    "dnspython==2.6.1",
    "email_validator==2.2.0",
    "fastapi==0.111.0",
    "fastapi-cli==0.0.4",
    "filelock==3.15.4",
    "flower==2.0.1",
    "frozenlist==1.4.1",
    "fsspec==2024.6.1",
    "google-api-core==2.19.1",
    "google-auth==2.31.0",
    "googleapis-common-protos==1.63.2",
    "greenlet==3.0.3",
    "grpcio==1.64.1",
    "gunicorn==22.0.0",
    "h11==0.14.0",
    "httpcore==1.0.5",
    "httptools==0.6.1",
    "httpx==0.27.0",
    "huggingface-hub==0.23.4",
    "humanize==4.9.0",
    "ibis-framework==9.1.0",
    "idna==3.7",
    "imageio==2.34.2",
    "Jinja2==3.1.4",
    "jmespath==1.0.1",
    "joblib==1.4.2",
    "jsonschema==4.22.0",
    "jsonschema-specifications==2023.12.1",
    "kombu==5.3.7",
    "lazy_loader==0.4",
    "linkify-it-py==2.0.3",
    "loguru==0.7.2",
    "loki-logger-handler==0.1.4",
    "markdown-it-py==3.0.0",
    "markdown2==2.4.13",
    "MarkupSafe==2.1.5",
    "mdit-py-plugins==0.4.1",
    "mdurl==0.1.2",
    "memray==1.13.3",
    "mongomock==4.1.2",
    "mpmath==1.3.0",
    "msgpack==1.0.8",
    "multidict==6.0.5",
    "mysqlclient==2.2.4",
    "networkx==3.3",
    "numpy==1.26.4",
    "openai==1.35.10",
    "opencensus==0.11.4",
    "opencensus-context==0.1.3",
    "opencv-python-headless==4.10.0.84",
    "orjson==3.10.6",
    "overrides==7.7.0",
    "packaging==24.1",
    "pandas==2.2.2",
    "parsy==2.1",
    "pillow==10.4.0",
    "pip==24.1.1",
    "platformdirs==4.2.2",
    "prettytable==3.10.0",
    "prometheus_client==0.20.0",
    "prompt_toolkit==3.0.47",
    "proto-plus==1.24.0",
    "protobuf==5.27.2",
    "psycopg2-binary==2.9.9",
    "py-spy==0.3.14",
    "pyarrow==16.1.0",
    "pyarrow-hotfix==0.6",
    "pyasn1==0.6.0",
    "pyasn1_modules==0.4.0",
    "pydantic==2.8.2",
    "pydantic_core==2.20.1",
    "Pygments==2.18.0",
    "pylance==0.8.14",
    "pymemcache==4.0.0",
    "pymongo==4.8.0",
    "pystache==0.6.5",
    "python-dateutil==2.9.0.post0",
    "python-dotenv==1.0.1",
    "python-magic==0.4.27",
    "python-multipart==0.0.9",
    "python-slugify==8.0.4",
    "python3-memcached==1.51",
    "pytz==2024.1",
    "PyYAML==6.0.1",
    "ray==2.31.0",
    "redis==5.0.7",
    "referencing==0.35.1",
    "regex==2024.5.15",
    "requests==2.32.3",
    "rich==13.7.1",
    "rpds-py==0.18.1",
    "rsa==4.9",
    "ruamel.yaml==0.18.6",
    "ruamel.yaml.clib==0.2.8",
    "s3transfer==0.10.2",
    "safetensors==0.4.3",
    "scikit-image==0.24.0",
    "scikit-learn==1.5.1",
    "scipy==1.14.0",
    "sentence-transformers==3.0.1",
    "sentinels==1.0.0",
    "setuptools==70.2.0",
    "shellingham==1.5.4",
    "six==1.16.0",
    "smart-open==7.0.4",
    "sniffio==1.3.1",
    "sorl-thumbnail==12.10.0",
    "SQLAlchemy==2.0.31",
    "sqlglot==25.1.0",
    "sqlparse==0.5.0",
    "starlette==0.37.2",
    "superduperdb==0.2.0",
    "sympy==1.12.1",
    "tenacity==8.2.3",
    "text-unidecode==1.3",
    "textual==0.71.0",
    "threadpoolctl==3.5.0",
    "tifffile==2024.7.2",
    "tokenizers==0.19.1",
    "toolz==0.12.1",
    "torch==2.3.1",
    "tornado==6.4.1",
    "tqdm==4.66.4",
    "transformers==4.42.3",
    "typer==0.12.3",
    "typing_extensions==4.12.2",
    "tzdata==2024.1",
    "tzlocal==5.2",
    "uc-micro-py==1.0.3",
    "ujson==5.10.0",
    "Unidecode==1.3.8",
    "urllib3==2.2.2",
    "uvicorn==0.30.1",
    "uvloop==0.19.0",
    "vine==5.1.0",
    "virtualenv==20.26.3",
    "watchfiles==0.22.0",
    "wcwidth==0.2.13",
    "websockets==12.0",
    "wheel==0.43.0",
    "wrapt==1.16.0",
    "yarl==1.9.4"
  ],
  "hostname": "b944d1a0fd23",
  "os_uname": [
    "Linux",
    "b944d1a0fd23",
    "6.4.16-linuxkit",
    "#1 SMP PREEMPT Thu Nov 16 10:49:20 UTC 2023",
    "aarch64",
    ""
  ],
  "package_versions": {},
  "platform": {
    "platform": "Linux-6.4.16-linuxkit-aarch64-with-glibc2.36",
    "python_version": "3.12.4"
  },
  "startup_time": "2024-07-07 02:40:25.632441",
  "superduper_db_root": "/usr/local/lib/python3.12/site-packages",
  "sys": {
    "argv": [
      "/usr/local/lib/python3.12/site-packages/superduperdb/__main__.py",
      "info"
    ],
    "path": [
      "/affili",
      "/usr/local/lib/python312.zip",
      "/usr/local/lib/python3.12",
      "/usr/local/lib/python3.12/lib-dynload",
      "/usr/local/lib/python3.12/site-packages"
    ]
  }
}

What happened?

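Creating a Mongo databackend through MongoDbTyper.create calls MongoDataBackend with the keyword arguments 'conn' and 'name', but MongoDataBackend.__init__ does not accept them: its signature takes 'uri' and 'flavour', even though the class docstring still documents 'conn' and 'name'.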

databackend = MongoDataBackend(conn=item.client, name=item.name)
class MongoDataBackend(BaseDataBackend):
    """
    Data backend for MongoDB.

    :param conn: MongoDB client connection
    :param name: Name of database to host filesystem
    """

    db_type = DBType.MONGODB

    id_field = '_id'

    def __init__(self, uri: str, flavour: t.Optional[str] = None):
        self.connection_callback = lambda: _connection_callback(uri, flavour)
        super().__init__(uri, flavour=flavour)
        self.conn, self.name = _connection_callback(uri, flavour)

        self._db = self.conn[self.name]

This yields the error:

  File "/usr/local/lib/python3.12/site-packages/superduperdb/base/superduper.py", line 24, in superduper
    return _DuckTyper.run(item, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/site-packages/superduperdb/base/superduper.py", line 71, in run
    return dts[0].create(item, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/site-packages/superduperdb/base/superduper.py", line 144, in create
    databackend = MongoDataBackend(conn=item.client, name=item.name)
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
TypeError: MongoDataBackend.__init__() got an unexpected keyword argument 'conn'

Steps to reproduce

1. 2. 3. ...

Relevant log output

No response

jieguangzhou commented 4 months ago

@Jesse-jApps Thank you for reporting this. It seems that there is an error in the docstrings, which will be fixed later.

Additionally, we recommend connecting with db = superduper("mongodb://host:port/database_name"), i.e. passing the MongoDB URI directly.

Then, you can access it via db.databackend.