superduper-io / superduper

Superduper: Integrate AI models and machine learning workflows with your database to implement custom AI applications, without moving your data, including streaming inference, scalable model hosting, training and vector search.
https://superduper.io
Apache License 2.0

[BUG]: ValueError: Couldn't get shape of model outputs from model encoder #1914

Closed: makkarss929 closed this issue 6 months ago

makkarss929 commented 6 months ago

Contact Details [Optional]

makkarss929@gmail.com

System Information

{
  "cfg": {
    "data_backend": "mongodb://localhost:27017/test_db",
    "lance_home": ".superduperdb/vector_indices",
    "artifact_store": null,
    "metadata_store": null,
    "cluster": {
      "compute": {
        "uri": null,
        "compute_kwargs": {}
      },
      "vector_search": {
        "uri": null,
        "type": "in_memory",
        "backfill_batch_size": 100
      },
      "cdc": {
        "uri": null,
        "strategy": null
      }
    },
    "retries": {
      "stop_after_attempt": 2,
      "wait_max": 10.0,
      "wait_min": 4.0,
      "wait_multiplier": 1.0
    },
    "downloads": {
      "folder": null,
      "n_workers": 0,
      "headers": {
        "User-Agent": "me"
      },
      "timeout": null
    },
    "fold_probability": 0.05,
    "log_level": "INFO",
    "logging_type": "SYSTEM",
    "bytes_encoding": "Bytes"
  },
  "cwd": "/Users/tarun/Desktop/superduperDB/superduperdb/examples",
  "freeze": [
    "aiohttp==3.9.3",
    "aiohttp-cors==0.7.0",
    "aiosignal==1.3.1",
    "annotated-types==0.6.0",
    "anyio==4.3.0",
    "appnope==0.1.4",
    "argon2-cffi==23.1.0",
    "argon2-cffi-bindings==21.2.0",
    "arrow==1.3.0",
    "asn1crypto==1.5.1",
    "asttokens==2.4.1",
    "async-lru==2.0.4",
    "atpublic==4.0",
    "attrs==23.2.0",
    "Babel==2.14.0",
    "beautifulsoup4==4.12.3",
    "bidict==0.23.1",
    "bleach==6.1.0",
    "boto3==1.34.69",
    "botocore==1.34.69",
    "build==1.1.1",
    "cachetools==5.3.3",
    "certifi==2024.2.2",
    "cffi==1.16.0",
    "charset-normalizer==3.3.2",
    "click==8.1.7",
    "cloudpickle==3.0.0",
    "colorful==0.5.6",
    "comm==0.2.2",
    "cryptography==42.0.5",
    "dask==2024.3.1",
    "debugpy==1.8.1",
    "decorator==5.1.1",
    "defusedxml==0.7.1",
    "dill==0.3.8",
    "distlib==0.3.8",
    "distributed==2024.3.1",
    "dnspython==2.6.1",
    "duckdb==0.10.1",
    "duckdb_engine==0.11.2",
    "executing==2.0.1",
    "fastapi==0.110.0",
    "fastjsonschema==2.19.1",
    "filelock==3.13.2",
    "fqdn==1.5.1",
    "frozenlist==1.4.1",
    "fsspec==2024.3.1",
    "ftfy==6.2.0",
    "google-api-core==2.18.0",
    "google-auth==2.29.0",
    "googleapis-common-protos==1.63.0",
    "greenlet==3.0.3",
    "grpcio==1.62.1",
    "h11==0.14.0",
    "httpcore==1.0.4",
    "httpx==0.27.0",
    "huggingface-hub==0.22.0",
    "ibis==3.3.0",
    "ibis-framework==8.0.0",
    "idna==3.6",
    "importlib_metadata==7.1.0",
    "ipykernel==6.29.3",
    "ipython==8.22.2",
    "ipython-genutils==0.2.0",
    "ipywidgets==8.1.2",
    "isoduration==20.11.0",
    "jedi==0.19.1",
    "Jinja2==3.1.3",
    "jmespath==1.0.1",
    "joblib==1.3.2",
    "json5==0.9.24",
    "jsonpointer==2.4",
    "jsonschema==4.21.1",
    "jsonschema-specifications==2023.12.1",
    "jupyter==1.0.0",
    "jupyter-console==6.6.3",
    "jupyter-events==0.10.0",
    "jupyter-lsp==2.2.4",
    "jupyter_client==8.6.1",
    "jupyter_core==5.7.2",
    "jupyter_server==2.13.0",
    "jupyter_server_terminals==0.5.3",
    "jupyterlab==4.1.5",
    "jupyterlab_pygments==0.3.0",
    "jupyterlab_server==2.25.4",
    "jupyterlab_widgets==3.0.10",
    "locket==1.0.0",
    "loguru==0.7.2",
    "loki-logger-handler==0.1.3",
    "markdown-it-py==3.0.0",
    "MarkupSafe==2.1.5",
    "matplotlib-inline==0.1.6",
    "mdurl==0.1.2",
    "mistune==3.0.2",
    "mongomock==4.1.2",
    "mpmath==1.3.0",
    "msgpack==1.0.8",
    "multidict==6.0.5",
    "multipledispatch==1.0.0",
    "nbclient==0.10.0",
    "nbconvert==7.16.3",
    "nbformat==5.10.3",
    "nest-asyncio==1.6.0",
    "networkx==3.2.1",
    "notebook==6.1.5",
    "notebook_shim==0.2.4",
    "numpy==1.24.4",
    "openai-clip==1.0.1",
    "opencensus==0.11.4",
    "opencensus-context==0.1.3",
    "overrides==7.7.0",
    "packaging==23.2",
    "pandas==2.2.1",
    "pandocfilters==1.5.1",
    "parso==0.8.3",
    "parsy==2.1",
    "partd==1.4.1",
    "pexpect==4.9.0",
    "pillow==10.2.0",
    "pip==23.2.1",
    "pip-tools==7.4.1",
    "platformdirs==3.11.0",
    "prettytable==3.10.0",
    "prometheus_client==0.20.0",
    "prompt-toolkit==3.0.43",
    "proto-plus==1.23.0",
    "protobuf==4.25.3",
    "psutil==5.9.8",
    "psycopg2==2.9.9",
    "ptyprocess==0.7.0",
    "pure-eval==0.2.2",
    "py-spy==0.3.14",
    "pyarrow==15.0.2",
    "pyarrow-hotfix==0.6",
    "pyasn1==0.5.1",
    "pyasn1-modules==0.3.0",
    "pycparser==2.21",
    "pydantic==2.6.4",
    "pydantic_core==2.16.3",
    "Pygments==2.17.2",
    "PyJWT==2.8.0",
    "pylance==0.8.14",
    "pymongo==4.6.2",
    "pyOpenSSL==24.1.0",
    "pyperclip==1.8.2",
    "pyproject_hooks==1.0.0",
    "python-dateutil==2.9.0.post0",
    "python-dotenv==1.0.1",
    "python-json-logger==2.0.7",
    "pytz==2024.1",
    "PyYAML==6.0.1",
    "pyzmq==25.1.2",
    "qtconsole==5.5.1",
    "QtPy==2.4.1",
    "ray==2.10.0",
    "readerwriterlock==1.0.9",
    "referencing==0.34.0",
    "regex==2023.12.25",
    "requests==2.31.0",
    "rfc3339-validator==0.1.4",
    "rfc3986-validator==0.1.1",
    "rich==13.7.1",
    "rpds-py==0.18.0",
    "rsa==4.9",
    "s3transfer==0.10.1",
    "safetensors==0.4.2",
    "scikit-learn==1.4.1.post1",
    "scipy==1.12.0",
    "Send2Trash==1.8.2",
    "sentence-transformers==2.6.0",
    "sentinels==1.0.0",
    "setuptools==65.5.0",
    "six==1.16.0",
    "smart-open==7.0.3",
    "sniffio==1.3.1",
    "snowflake-connector-python==3.7.1",
    "snowflake-sqlalchemy==1.5.1",
    "sortedcontainers==2.4.0",
    "soupsieve==2.5",
    "SQLAlchemy==2.0.29",
    "sqlalchemy-views==0.3.2",
    "sqlglot==20.10.0",
    "stack-data==0.6.3",
    "starlette==0.36.3",
    "-e git+https://github.com/makkarss929/superduperdb.git@c078ac4f4f7439c7b882d09d064af3ee801b7c4d#egg=superduperdb",
    "sympy==1.12",
    "tblib==3.0.0",
    "tenacity==8.2.3",
    "terminado==0.18.1",
    "threadpoolctl==3.4.0",
    "tinycss2==1.2.1",
    "tokenizers==0.15.2",
    "tomlkit==0.12.4",
    "toolz==0.12.1",
    "torch==2.2.1",
    "torchvision==0.17.1",
    "tornado==6.4",
    "tqdm==4.66.2",
    "traitlets==5.14.2",
    "transformers==4.39.1",
    "typer==0.10.0",
    "types-python-dateutil==2.9.0.20240316",
    "typing_extensions==4.10.0",
    "tzdata==2024.1",
    "uri-template==1.3.0",
    "urllib3==2.2.1",
    "uvicorn==0.29.0",
    "virtualenv==20.25.1",
    "wcwidth==0.2.13",
    "webcolors==1.13",
    "webencodings==0.5.1",
    "websocket-client==1.7.0",
    "wheel==0.43.0",
    "widgetsnbextension==4.0.10",
    "wrapt==1.16.0",
    "yarl==1.9.4",
    "zict==3.0.0",
    "zipp==3.18.1"
  ],
  "hostname": "Taruns-Laptop.local",
  "os_uname": [
    "Darwin",
    "Taruns-Laptop.local",
    "22.1.0",
    "Darwin Kernel Version 22.1.0: Sun Oct  9 20:15:09 PDT 2022; root:xnu-8792.41.9~2/RELEASE_ARM64_T6000",
    "x86_64"
  ],
  "package_versions": {},
  "platform": {
    "platform": "macOS-10.16-x86_64-i386-64bit",
    "python_version": "3.11.5"
  },
  "startup_time": "2024-03-29 11:18:17.551752",
  "superduper_db_root": "/Users/tarun/Desktop/superduperDB/superduperdb",
  "sys": {
    "argv": [
      "/Users/tarun/Desktop/superduperDB/superduperdb/superduperdb/__main__.py",
      "info"
    ],
    "path": [
      "/Users/tarun/Desktop/superduperDB/superduperdb/examples",
      "/Users/tarun/miniconda3/lib/python311.zip",
      "/Users/tarun/miniconda3/lib/python3.11",
      "/Users/tarun/miniconda3/lib/python3.11/lib-dynload",
      "/Users/tarun/Desktop/superduperDB/superduperdb/.venv/lib/python3.11/site-packages",
      "__editable__.superduperdb-0.1.1.finder.__path_hook__"
    ]
  }
}

What happened?

ValueError                                Traceback (most recent call last)
Cell In[10], line 5
      2 from superduperdb import VectorIndex
      4 # Add a VectorIndex to the SuperDuperDB database with the specified identifier and indexing listener
----> 5 _ = db.add(
      6     VectorIndex(
      7         identifier='my-index',          # Unique identifier for the VectorIndex
      8         indexing_listener=listener      # Listener to be used for indexing documents
      9     )
     10 )

File ~/Desktop/superduperDB/superduperdb/superduperdb/base/datalayer.py:478, in Datalayer.add(self, object, dependencies)
    470     return type(object)(
    471         self._add(
    472             object=component,
   (...)
    475         for component in object
    476     )
    477 elif isinstance(object, Component):
--> 478     return self._add(object=object, dependencies=dependencies), object
    479 else:
    480     return self._add(superduper(object)), object

File ~/Desktop/superduperDB/superduperdb/superduperdb/base/datalayer.py:861, in Datalayer._add(self, object, dependencies, parent)
    859 object.post_create(self)
    860 self._add_component_to_cache(object)
--> 861 these_jobs = object.schedule_jobs(self, dependencies=dependencies)
    862 jobs.extend(these_jobs)
    863 return jobs

File ~/Desktop/superduperDB/superduperdb/superduperdb/components/vector_index.py:198, in VectorIndex.schedule_jobs(self, db, dependencies)
    188 if not db.cdc.running:
    189     job = FunctionJob(
    190         callable=copy_vectors,
    191         args=[],
   (...)
    196         },
    197     )
--> 198     job(db, dependencies=dependencies)
    199     return [job]
    200 return []

File ~/Desktop/superduperDB/superduperdb/superduperdb/jobs/job.py:146, in FunctionJob.__call__(self, db, dependencies)
    143 self.db = db
    144 db.metadata.create_job(self.dict())
--> 146 self.submit(dependencies=dependencies)
    147 return self

File ~/Desktop/superduperDB/superduperdb/superduperdb/jobs/job.py:124, in FunctionJob.submit(self, dependencies)
    118 def submit(self, dependencies=()):
    119     """
    120     Submit job for execution
    121
    122     :param dependencies: list of dependencies
    123     """
--> 124     self.future = self.db.compute.submit(
    125         callable_job,
    126         cfg=s.CFG.dict(),
    127         function_to_call=self.callable,
    128         job_id=self.identifier,
    129         args=self.args,
    130         kwargs=self.kwargs,
    131         dependencies=dependencies,
    132         db=self.db if self.db.compute.type == 'local' else None,
    133     )
    135     return

File ~/Desktop/superduperDB/superduperdb/superduperdb/backends/local/compute.py:35, in LocalComputeBackend.submit(self, function, compute_kwargs, *args, **kwargs)
     29 """
     30 Submits a function for local execution.
     31
     32 :param function: The function to be executed.
     33 """
     34 logging.info(f"Submitting job. function:{function}")
---> 35 future = function(*args, **kwargs)
     37 future_key = str(uuid.uuid4())
     38 self.__outputs[future_key] = future

File ~/Desktop/superduperDB/superduperdb/superduperdb/jobs/tasks.py:107, in callable_job(cfg, function_to_call, args, kwargs, job_id, dependencies, db)
    105     db.metadata.update_job(job_id, 'status', 'failed')
    106     db.metadata.update_job(job_id, 'msg', tb)
--> 107     raise e
    108 else:
    109     db.metadata.update_job(job_id, 'status', 'success')

File ~/Desktop/superduperDB/superduperdb/superduperdb/jobs/tasks.py:102, in callable_job(cfg, function_to_call, args, kwargs, job_id, dependencies, db)
    100 output = None
    101 try:
--> 102     output = function_to_call(*args, db=db, **kwargs)
    103 except Exception as e:
    104     tb = traceback.format_exc()

File ~/Desktop/superduperDB/superduperdb/superduperdb/vector_search/update_tasks.py:85, in copy_vectors(vector_index, query, ids, db)
     82     r['vector'] = r['vector'].numpy()
     84 if vectors:
---> 85     db.fast_vector_searchers[vi.identifier].add(
     86         [VectorItem(**vector) for vector in vectors]
     87     )

File ~/Desktop/superduperDB/superduperdb/superduperdb/base/datalayer.py:1067, in LoadDict.__missing__(self, key)
   1065 msg = f'callable is None for {key}'
   1066 assert self.callable is not None, msg
-> 1067 value = self[key] = self.callable(key)
   1068 return value

File ~/Desktop/superduperDB/superduperdb/superduperdb/base/datalayer.py:132, in Datalayer.initialize_vector_searcher(self, identifier, searcher_type, backfill)
    129 clt = vi.indexing_listener.select.table_or_collection
    131 vector_search_cls = vector_searcher_implementations[searcher_type]
--> 132 vector_comparison = vector_search_cls.from_component(vi)
    134 assert isinstance(clt.identifier, str), 'clt.identifier must be a string'
    136 self.backfill_vector_search(vi, vector_comparison)

File ~/Desktop/superduperDB/superduperdb/superduperdb/vector_search/base.py:19, in BaseVectorSearcher.from_component(cls, vi)
     16 @classmethod
     17 def from_component(cls, vi: 'VectorIndex'):
     18     return cls(
---> 19         identifier=vi.identifier, dimensions=vi.dimensions, measure=vi.measure
     20     )

File ~/Desktop/superduperDB/superduperdb/superduperdb/components/vector_index.py:173, in VectorIndex.dimensions(self)
    171 if shape := getattr(self.indexing_listener.model.datatype, 'shape', None):
    172     return shape[-1]
--> 173 raise ValueError('Couldn\'t get shape of model outputs from model encoder')

ValueError: Couldn't get shape of model outputs from model encoder
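
The final frame shows what the error means: VectorIndex.dimensions looks for a shape attribute on the indexing listener's model datatype and raises if none is found. A minimal sketch of that check, assuming the vector helper from superduperdb 0.1.x that is also used later in this thread:

# Sketch of the check that fails, reconstructed from the traceback above.
# If the model was created without a datatype (or with one that has no shape),
# getattr() returns None and VectorIndex.dimensions raises the ValueError.
from superduperdb import vector

datatype = vector(shape=(1024,))          # a datatype that carries an output shape
shape = getattr(datatype, 'shape', None)  # what VectorIndex.dimensions looks for
assert shape is not None
dimensions = shape[-1]                    # -> 1024, used to build the vector searcher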

Steps to reproduce

  1. db = mongodb+local
  2. ...

Relevant log output

No response

makkarss929 commented 6 months ago
Use both datatype and encoder, and set them to the same value: encoder = datatype = vector(shape=(1024,)).

import sentence_transformers
from superduperdb import Model, ObjectModel, vector

model = Model(
    identifier='embedding',
    object=sentence_transformers.SentenceTransformer('BAAI/bge-large-en-v1.5'),
    encoder=vector(shape=(1024,)),        # encoder carries the output shape
    predict_method='encode',              # use SentenceTransformer.encode for predictions
    postprocess=lambda x: x.tolist(),     # convert the numpy output to plain lists
    batch_predict=True,                   # generate predictions for a batch of inputs at once
    datatype=vector(shape=(1024,))        # datatype must match the encoder
)
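
For completeness, this model can then be wired into the listener and VectorIndex from the failing cell. This is only a sketch assuming the Listener and Collection APIs from superduperdb 0.1.x; the 'documents' collection and the 'text' key are placeholders and need to match your own data:

from superduperdb import Listener, VectorIndex
from superduperdb.backends.mongodb import Collection

collection = Collection('documents')   # placeholder collection name

listener = Listener(
    model=model,                       # the Model defined above, with datatype/encoder set
    key='text',                        # placeholder: field of each document to embed
    select=collection.find(),          # documents to index
)

_ = db.add(
    VectorIndex(
        identifier='my-index',         # same identifier as in the failing cell
        indexing_listener=listener,
    )
)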