laminlabs / lamindb

A data framework for biology.
https://docs.lamin.ai
Apache License 2.0
129 stars 12 forks source link

IntegrityError: UNIQUE constraint failed: bionty_gene.ensembl_gene_id when repeatedly validating example dataset #2137

Open Zethson opened 2 weeks ago

Zethson commented 2 weeks ago

Report

!lamin init --storage ./run-tests --name run-tests --schema bionty

import lamindb as ln
import bionty as bt

adata = ln.core.datasets.anndata_pbmc68k_reduced()

curator = ln.Curator.from_anndata(adata, var_index=bt.Gene.ensembl_gene_id, organism="human")
curator.validate()
curator.validate()

The second call to `curator.validate()` leads to:

{
    "name": "IntegrityError",
    "message": "UNIQUE constraint failed: bionty_gene.ensembl_gene_id",
    "stack": "---------------------------------------------------------------------------
IntegrityError                            Traceback (most recent call last)
File ~/miniconda3/envs/lamindb/lib/python3.11/site-packages/django/db/backends/utils.py:105, in CursorWrapper._execute(self, sql, params, *ignored_wrapper_args)
    104 else:
--> 105     return self.cursor.execute(sql, params)

File ~/miniconda3/envs/lamindb/lib/python3.11/site-packages/django/db/backends/sqlite3/base.py:354, in SQLiteCursorWrapper.execute(self, query, params)
    353 query = self.convert_query(query, param_names=param_names)
--> 354 return super().execute(query, params)

IntegrityError: UNIQUE constraint failed: bionty_gene.ensembl_gene_id

The above exception was the direct cause of the following exception:

IntegrityError                            Traceback (most recent call last)
Cell In[3], line 1
----> 1 curator.validate()

File ~/PycharmProjects/lamindb/lamindb/_curate.py:548, in AnnDataCurator.validate(self, organism)
    543     logger.important(
    544         f\"validating metadata using registries of instance {colors.italic(self._using_key)}\"
    545     )
    547 # add all validated records to the current instance
--> 548 self._update_registry_all()
    550 validated_var, non_validated_var = validate_categories(
    551     self._adata.var.index,
    552     field=self._var_field,
   (...)
    558     **self._kwargs,  # type: ignore
    559 )
    560 validated_obs, non_validated_obs = validate_categories_in_df(
    561     self._adata.obs,
    562     fields=self.categoricals,
   (...)
    566     **self._kwargs,
    567 )

File ~/PycharmProjects/lamindb/lamindb/_curate.py:517, in AnnDataCurator._update_registry_all(self, validated_only, **kwargs)
    515 \"\"\"Save labels for all features.\"\"\"
    516 logger.info(\"saving validated records of 'var_index'\")
--> 517 self._save_from_var_index(validated_only=validated_only, **self._kwargs)
    518 for name in self._obs_fields.keys():
    519     logger.info(f\"saving validated terms of '{name}'\")

File ~/PycharmProjects/lamindb/lamindb/_curate.py:502, in AnnDataCurator._save_from_var_index(self, validated_only, organism)
    498 def _save_from_var_index(
    499     self, validated_only: bool = True, organism: str | None = None
    500 ):
    501     \"\"\"Save variable records.\"\"\"
--> 502     update_registry(
    503         values=list(self._adata.var.index),
    504         field=self.var_index,
    505         key=\"var_index\",
    506         save_function=\".add_new_from_var_index()\",
    507         using_key=self._using_key,
    508         validated_only=validated_only,
    509         organism=organism,
    510         source=self._sources.get(\"var_index\"),
    511         exclude=self._exclude.get(\"var_index\"),
    512     )

File ~/PycharmProjects/lamindb/lamindb/_curate.py:1512, in update_registry(values, field, key, save_function, using_key, validated_only, df, organism, dtype, source, standardize, warning, exclude, **kwargs)
   1510 if source:
   1511     public_records = [r for r in public_records if r.source.uid == source.uid]
-> 1512 ln_save(public_records)
   1513 labels_saved[\"from public\"] = [
   1514     getattr(r, field.field.name) for r in public_records
   1515 ]
   1516 non_public_labels = [i for i in values if i not in labels_saved[\"from public\"]]

File ~/PycharmProjects/lamindb/lamindb/_save.py:83, in save(records, ignore_conflicts)
     79 if non_artifacts:
     80     non_artifacts_old, non_artifacts_new = partition(
     81         lambda r: r._state.adding or r.pk is None, non_artifacts
     82     )
---> 83     bulk_create(non_artifacts_new, ignore_conflicts=ignore_conflicts)
     84     if non_artifacts_old:
     85         bulk_update(non_artifacts_old)

File ~/PycharmProjects/lamindb/lamindb/_save.py:114, in bulk_create(records, ignore_conflicts)
    112     records_by_orm[record.__class__].append(record)
    113 for registry, records in records_by_orm.items():
--> 114     registry.objects.bulk_create(records, ignore_conflicts=ignore_conflicts)

File ~/miniconda3/envs/lamindb/lib/python3.11/site-packages/django/db/models/manager.py:87, in BaseManager._get_queryset_methods.<locals>.create_method.<locals>.manager_method(self, *args, **kwargs)
     85 @wraps(method)
     86 def manager_method(self, *args, **kwargs):
---> 87     return getattr(self.get_queryset(), name)(*args, **kwargs)

File ~/miniconda3/envs/lamindb/lib/python3.11/site-packages/django/db/models/query.py:835, in QuerySet.bulk_create(self, objs, batch_size, ignore_conflicts, update_conflicts, update_fields, unique_fields)
    833 if objs_without_pk:
    834     fields = [f for f in fields if not isinstance(f, AutoField)]
--> 835     returned_columns = self._batched_insert(
    836         objs_without_pk,
    837         fields,
    838         batch_size,
    839         on_conflict=on_conflict,
    840         update_fields=update_fields,
    841         unique_fields=unique_fields,
    842     )
    843     connection = connections[self.db]
    844     if (
    845         connection.features.can_return_rows_from_bulk_insert
    846         and on_conflict is None
    847     ):

File ~/miniconda3/envs/lamindb/lib/python3.11/site-packages/django/db/models/query.py:1875, in QuerySet._batched_insert(self, objs, fields, batch_size, on_conflict, update_fields, unique_fields)
   1870 for item in [objs[i : i + batch_size] for i in range(0, len(objs), batch_size)]:
   1871     if bulk_return and (
   1872         on_conflict is None or on_conflict == OnConflict.UPDATE
   1873     ):
   1874         inserted_rows.extend(
-> 1875             self._insert(
   1876                 item,
   1877                 fields=fields,
   1878                 using=self.db,
   1879                 on_conflict=on_conflict,
   1880                 update_fields=update_fields,
   1881                 unique_fields=unique_fields,
   1882                 returning_fields=self.model._meta.db_returning_fields,
   1883             )
   1884         )
   1885     else:
   1886         self._insert(
   1887             item,
   1888             fields=fields,
   (...)
   1892             unique_fields=unique_fields,
   1893         )

File ~/miniconda3/envs/lamindb/lib/python3.11/site-packages/django/db/models/query.py:1847, in QuerySet._insert(self, objs, fields, returning_fields, raw, using, on_conflict, update_fields, unique_fields)
   1840 query = sql.InsertQuery(
   1841     self.model,
   1842     on_conflict=on_conflict,
   1843     update_fields=update_fields,
   1844     unique_fields=unique_fields,
   1845 )
   1846 query.insert_values(fields, objs, raw=raw)
-> 1847 return query.get_compiler(using=using).execute_sql(returning_fields)

File ~/miniconda3/envs/lamindb/lib/python3.11/site-packages/django/db/models/sql/compiler.py:1836, in SQLInsertCompiler.execute_sql(self, returning_fields)
   1834 with self.connection.cursor() as cursor:
   1835     for sql, params in self.as_sql():
-> 1836         cursor.execute(sql, params)
   1837     if not self.returning_fields:
   1838         return []

File ~/miniconda3/envs/lamindb/lib/python3.11/site-packages/django/db/backends/utils.py:79, in CursorWrapper.execute(self, sql, params)
     78 def execute(self, sql, params=None):
---> 79     return self._execute_with_wrappers(
     80         sql, params, many=False, executor=self._execute
     81     )

File ~/miniconda3/envs/lamindb/lib/python3.11/site-packages/django/db/backends/utils.py:92, in CursorWrapper._execute_with_wrappers(self, sql, params, many, executor)
     90 for wrapper in reversed(self.db.execute_wrappers):
     91     executor = functools.partial(wrapper, executor)
---> 92 return executor(sql, params, many, context)

File ~/miniconda3/envs/lamindb/lib/python3.11/site-packages/django/db/backends/utils.py:100, in CursorWrapper._execute(self, sql, params, *ignored_wrapper_args)
     98     warnings.warn(self.APPS_NOT_READY_WARNING_MSG, category=RuntimeWarning)
     99 self.db.validate_no_broken_transaction()
--> 100 with self.db.wrap_database_errors:
    101     if params is None:
    102         # params default might be backend specific.
    103         return self.cursor.execute(sql)

File ~/miniconda3/envs/lamindb/lib/python3.11/site-packages/django/db/utils.py:91, in DatabaseErrorWrapper.__exit__(self, exc_type, exc_value, traceback)
     89 if dj_exc_type not in (DataError, IntegrityError):
     90     self.wrapper.errors_occurred = True
---> 91 raise dj_exc_value.with_traceback(traceback) from exc_value

File ~/miniconda3/envs/lamindb/lib/python3.11/site-packages/django/db/backends/utils.py:105, in CursorWrapper._execute(self, sql, params, *ignored_wrapper_args)
    103     return self.cursor.execute(sql)
    104 else:
--> 105     return self.cursor.execute(sql, params)

File ~/miniconda3/envs/lamindb/lib/python3.11/site-packages/django/db/backends/sqlite3/base.py:354, in SQLiteCursorWrapper.execute(self, query, params)
    352 param_names = list(params) if isinstance(params, Mapping) else None
    353 query = self.convert_query(query, param_names=param_names)
--> 354 return super().execute(query, params)

IntegrityError: UNIQUE constraint failed: bionty_gene.ensembl_gene_id"
}

Version information

No response

Zethson commented 2 weeks ago

I don't seem to have this issue with synthetic example data.