laminlabs / lamindb

A data framework for biology.
https://docs.lamin.ai
Apache License 2.0
129 stars 12 forks source link

Curator saving records with a different version than the saved records results in duplicate key errors #2165

Open Zethson opened 1 week ago

Zethson commented 1 week ago

Report

A minimal reproducible script is still to come; I'll add it later, but here is an outline of the steps:

  1. New instance
  2. Import Gene from source (will use ENSEMBL 112 by default)
  3. Attempt to curate a new dataset but have the Gene version pinned to 110. This happens when using the cellxgene Curator.
  4. Be met with errors like:
{
    "name": "IntegrityError",
    "message": "duplicate key value violates unique constraint \"bionty_gene_uid_key\"
DETAIL:  Key (uid)=(3BEZYDTfG7t4) already exists.
",
    "stack": "---------------------------------------------------------------------------
UniqueViolation                           Traceback (most recent call last)
File ~/miniconda3/envs/lamindb/lib/python3.11/site-packages/django/db/backends/utils.py:105, in CursorWrapper._execute(self, sql, params, *ignored_wrapper_args)
    104 else:
--> 105     return self.cursor.execute(sql, params)

UniqueViolation: duplicate key value violates unique constraint \"bionty_gene_uid_key\"
DETAIL:  Key (uid)=(3BEZYDTfG7t4) already exists.

The above exception was the direct cause of the following exception:

IntegrityError                            Traceback (most recent call last)
Cell In[8], line 4
      1 curator = pts.PerturbationCurator(
      2     adata, using_key=THIS_INSTANCE
      3 )
----> 4 curator.validate()

File ~/miniconda3/envs/lamindb/lib/python3.11/site-packages/pertpy_datasets/perturbation_curator.py:98, in PerturbationCurator.validate(self)
     96 def validate(self) -> bool:
     97     \"\"\"Validates the AnnData object against cellxgene and pertpy's requirements.\"\"\"
---> 98     return super().validate()

File ~/miniconda3/envs/lamindb/lib/python3.11/site-packages/cellxgene_lamin/curate.py:285, in Curator.validate(self)
    280 if len(matching_keys) == 0:
    281     raise ValueError(
    282         \"Unable to find an embedding key. Please calculate an embedding.\"
    283     )
--> 285 return super().validate(organism=self.organism)

File ~/PycharmProjects/lamindb/lamindb/_curate.py:553, in AnnDataCurator.validate(self, organism)
    548     logger.important(
    549         f\"validating metadata using registries of instance {colors.italic(self._using_key)}\"
    550     )
    552 # add all validated records to the current instance
--> 553 self._update_registry_all()
    555 validated_var, non_validated_var = validate_categories(
    556     self._adata.var.index,
    557     field=self._var_field,
   (...)
    563     **self._kwargs,  # type: ignore
    564 )
    565 validated_obs, non_validated_obs = validate_categories_in_df(
    566     self._adata.obs,
    567     fields=self.categoricals,
   (...)
    571     **self._kwargs,
    572 )

File ~/PycharmProjects/lamindb/lamindb/_curate.py:523, in AnnDataCurator._update_registry_all(self, validated_only, **kwargs)
    521 def _update_registry_all(self, validated_only: bool = True, **kwargs):
    522     \"\"\"Save labels for all features.\"\"\"
--> 523     self._save_from_var_index(validated_only=validated_only, **self._kwargs)
    524     for name in self._obs_fields.keys():
    525         self._update_registry(name, validated_only=validated_only, **self._kwargs)

File ~/PycharmProjects/lamindb/lamindb/_curate.py:509, in AnnDataCurator._save_from_var_index(self, validated_only, organism)
    505 def _save_from_var_index(
    506     self, validated_only: bool = True, organism: str | None = None
    507 ):
    508     \"\"\"Save variable records.\"\"\"
--> 509     update_registry(
    510         values=list(self._adata.var.index),
    511         field=self.var_index,
    512         key=\"var_index\",
    513         save_function=\".add_new_from_var_index()\",
    514         using_key=self._using_key,
    515         validated_only=validated_only,
    516         organism=organism,
    517         source=self._sources.get(\"var_index\"),
    518         exclude=self._exclude.get(\"var_index\"),
    519     )

File ~/PycharmProjects/lamindb/lamindb/_curate.py:1522, in update_registry(values, field, key, save_function, using_key, validated_only, df, organism, dtype, source, standardize, warning, exclude, **kwargs)
   1520     logger.info(f\"saving validated records of '{key}'\")
   1521     settings.verbosity = \"error\"
-> 1522 ln_save(public_records)
   1523 labels_saved[\"from public\"] = [
   1524     getattr(r, field.field.name) for r in public_records
   1525 ]
   1526 non_public_labels = [i for i in values if i not in labels_saved[\"from public\"]]

File ~/PycharmProjects/lamindb/lamindb/_save.py:83, in save(records, ignore_conflicts)
     79 if non_artifacts:
     80     non_artifacts_old, non_artifacts_new = partition(
     81         lambda r: r._state.adding or r.pk is None, non_artifacts
     82     )
---> 83     bulk_create(non_artifacts_new, ignore_conflicts=ignore_conflicts)
     84     if non_artifacts_old:
     85         bulk_update(non_artifacts_old)

File ~/PycharmProjects/lamindb/lamindb/_save.py:114, in bulk_create(records, ignore_conflicts)
    112     records_by_orm[record.__class__].append(record)
    113 for registry, records in records_by_orm.items():
--> 114     registry.objects.bulk_create(records, ignore_conflicts=ignore_conflicts)

File ~/miniconda3/envs/lamindb/lib/python3.11/site-packages/django/db/models/manager.py:87, in BaseManager._get_queryset_methods.<locals>.create_method.<locals>.manager_method(self, *args, **kwargs)
     85 @wraps(method)
     86 def manager_method(self, *args, **kwargs):
---> 87     return getattr(self.get_queryset(), name)(*args, **kwargs)

File ~/miniconda3/envs/lamindb/lib/python3.11/site-packages/django/db/models/query.py:835, in QuerySet.bulk_create(self, objs, batch_size, ignore_conflicts, update_conflicts, update_fields, unique_fields)
    833 if objs_without_pk:
    834     fields = [f for f in fields if not isinstance(f, AutoField)]
--> 835     returned_columns = self._batched_insert(
    836         objs_without_pk,
    837         fields,
    838         batch_size,
    839         on_conflict=on_conflict,
    840         update_fields=update_fields,
    841         unique_fields=unique_fields,
    842     )
    843     connection = connections[self.db]
    844     if (
    845         connection.features.can_return_rows_from_bulk_insert
    846         and on_conflict is None
    847     ):

File ~/miniconda3/envs/lamindb/lib/python3.11/site-packages/django/db/models/query.py:1875, in QuerySet._batched_insert(self, objs, fields, batch_size, on_conflict, update_fields, unique_fields)
   1870 for item in [objs[i : i + batch_size] for i in range(0, len(objs), batch_size)]:
   1871     if bulk_return and (
   1872         on_conflict is None or on_conflict == OnConflict.UPDATE
   1873     ):
   1874         inserted_rows.extend(
-> 1875             self._insert(
   1876                 item,
   1877                 fields=fields,
   1878                 using=self.db,
   1879                 on_conflict=on_conflict,
   1880                 update_fields=update_fields,
   1881                 unique_fields=unique_fields,
   1882                 returning_fields=self.model._meta.db_returning_fields,
   1883             )
   1884         )
   1885     else:
   1886         self._insert(
   1887             item,
   1888             fields=fields,
   (...)
   1892             unique_fields=unique_fields,
   1893         )

File ~/miniconda3/envs/lamindb/lib/python3.11/site-packages/django/db/models/query.py:1847, in QuerySet._insert(self, objs, fields, returning_fields, raw, using, on_conflict, update_fields, unique_fields)
   1840 query = sql.InsertQuery(
   1841     self.model,
   1842     on_conflict=on_conflict,
   1843     update_fields=update_fields,
   1844     unique_fields=unique_fields,
   1845 )
   1846 query.insert_values(fields, objs, raw=raw)
-> 1847 return query.get_compiler(using=using).execute_sql(returning_fields)

File ~/miniconda3/envs/lamindb/lib/python3.11/site-packages/django/db/models/sql/compiler.py:1836, in SQLInsertCompiler.execute_sql(self, returning_fields)
   1834 with self.connection.cursor() as cursor:
   1835     for sql, params in self.as_sql():
-> 1836         cursor.execute(sql, params)
   1837     if not self.returning_fields:
   1838         return []

File ~/miniconda3/envs/lamindb/lib/python3.11/site-packages/django/db/backends/utils.py:79, in CursorWrapper.execute(self, sql, params)
     78 def execute(self, sql, params=None):
---> 79     return self._execute_with_wrappers(
     80         sql, params, many=False, executor=self._execute
     81     )

File ~/miniconda3/envs/lamindb/lib/python3.11/site-packages/django/db/backends/utils.py:92, in CursorWrapper._execute_with_wrappers(self, sql, params, many, executor)
     90 for wrapper in reversed(self.db.execute_wrappers):
     91     executor = functools.partial(wrapper, executor)
---> 92 return executor(sql, params, many, context)

File ~/miniconda3/envs/lamindb/lib/python3.11/site-packages/django/db/backends/utils.py:100, in CursorWrapper._execute(self, sql, params, *ignored_wrapper_args)
     98     warnings.warn(self.APPS_NOT_READY_WARNING_MSG, category=RuntimeWarning)
     99 self.db.validate_no_broken_transaction()
--> 100 with self.db.wrap_database_errors:
    101     if params is None:
    102         # params default might be backend specific.
    103         return self.cursor.execute(sql)

File ~/miniconda3/envs/lamindb/lib/python3.11/site-packages/django/db/utils.py:91, in DatabaseErrorWrapper.__exit__(self, exc_type, exc_value, traceback)
     89 if dj_exc_type not in (DataError, IntegrityError):
     90     self.wrapper.errors_occurred = True
---> 91 raise dj_exc_value.with_traceback(traceback) from exc_value

File ~/miniconda3/envs/lamindb/lib/python3.11/site-packages/django/db/backends/utils.py:105, in CursorWrapper._execute(self, sql, params, *ignored_wrapper_args)
    103     return self.cursor.execute(sql)
    104 else:
--> 105     return self.cursor.execute(sql, params)

IntegrityError: duplicate key value violates unique constraint \"bionty_gene_uid_key\"
DETAIL:  Key (uid)=(3BEZYDTfG7t4) already exists.
"
}

I am not sure what the expected outcome here should be.

  1. Should we somehow allow for duplicated Gene records with different versions? I think that we currently don't support that and it's not easy, right?
  2. Should this throw a more informative error and state that there are already gene records with a newer version? I think that's not useful because in this case the user wants to map against a specific version.
  3. Should we say "sorry but Genes are already at version 112 so we map against 112 instead"? That would also be unexpected behavior.

Version information

No response