laminlabs / lamindb

A data framework for biology.
https://docs.lamin.ai
Apache License 2.0
127 stars 10 forks source link

integrity error when trying to save genes after they've been saved already #2090

Closed Zethson closed 4 days ago

Zethson commented 1 week ago

Report

I first ran

curator.add_new_from("all")

and then because I had some leftover code, I also ran

curator.add_new_from_var_index()

to be met with

{
    "name": "IntegrityError",
    "message": "UNIQUE constraint failed: bionty_gene.ensembl_gene_id",
    "stack": "---------------------------------------------------------------------------
IntegrityError                            Traceback (most recent call last)
File ~/miniconda3/envs/pertpy/lib/python3.12/site-packages/django/db/backends/utils.py:105, in CursorWrapper._execute(self, sql, params, *ignored_wrapper_args)
    104 else:
--> 105     return self.cursor.execute(sql, params)

File ~/miniconda3/envs/pertpy/lib/python3.12/site-packages/django/db/backends/sqlite3/base.py:354, in SQLiteCursorWrapper.execute(self, query, params)
    353 query = self.convert_query(query, param_names=param_names)
--> 354 return super().execute(query, params)

IntegrityError: UNIQUE constraint failed: bionty_gene.ensembl_gene_id

The above exception was the direct cause of the following exception:

IntegrityError                            Traceback (most recent call last)
Cell In[9], line 1
----> 1 curator.add_new_from_var_index()

File ~/PycharmProjects/lamindb/lamindb/_curate.py:529, in AnnDataCurator.add_new_from_var_index(self, organism, **kwargs)
    522 \"\"\"Update variable records.
    523 
    524 Args:
    525     organism: The organism name.
    526     **kwargs: Additional keyword arguments to pass to the registry model.
    527 \"\"\"
    528 self._kwargs.update({\"organism\": organism} if organism else {})
--> 529 self._save_from_var_index(validated_only=False, **self._kwargs, **kwargs)

File ~/PycharmProjects/lamindb/lamindb/_curate.py:501, in AnnDataCurator._save_from_var_index(self, validated_only, organism)
    497 def _save_from_var_index(
    498     self, validated_only: bool = True, organism: str | None = None
    499 ):
    500     \"\"\"Save variable records.\"\"\"
--> 501     update_registry(
    502         values=list(self._adata.var.index),
    503         field=self.var_index,
    504         key=\"var_index\",
    505         save_function=\".add_new_from_var_index()\",
    506         using_key=self._using_key,
    507         validated_only=validated_only,
    508         organism=organism,
    509         source=self._sources.get(\"var_index\"),
    510         exclude=self._exclude.get(\"var_index\"),
    511     )

File ~/PycharmProjects/lamindb/lamindb/_curate.py:1557, in update_registry(values, field, key, save_function, using_key, validated_only, df, organism, dtype, source, standardize, warning, exclude, **kwargs)
   1549                 init_kwargs[\"dtype\"] = \"cat\" if dtype is None else dtype
   1550             non_validated_records.append(
   1551                 registry(
   1552                     **init_kwargs,
   (...)
   1555                 )
   1556             )
-> 1557     ln_save(non_validated_records)
   1559 # save parent labels for ulabels
   1560 if registry == ULabel and field.field.name == \"name\":

File ~/PycharmProjects/lamindb/lamindb/_save.py:83, in save(records, ignore_conflicts)
     79 if non_artifacts:
     80     non_artifacts_old, non_artifacts_new = partition(
     81         lambda r: r._state.adding or r.pk is None, non_artifacts
     82     )
---> 83     bulk_create(non_artifacts_new, ignore_conflicts=ignore_conflicts)
     84     if non_artifacts_old:
     85         bulk_update(non_artifacts_old)

File ~/PycharmProjects/lamindb/lamindb/_save.py:114, in bulk_create(records, ignore_conflicts)
    112     records_by_orm[record.__class__].append(record)
    113 for registry, records in records_by_orm.items():
--> 114     registry.objects.bulk_create(records, ignore_conflicts=ignore_conflicts)

File ~/miniconda3/envs/pertpy/lib/python3.12/site-packages/django/db/models/manager.py:87, in BaseManager._get_queryset_methods.<locals>.create_method.<locals>.manager_method(self, *args, **kwargs)
     85 @wraps(method)
     86 def manager_method(self, *args, **kwargs):
---> 87     return getattr(self.get_queryset(), name)(*args, **kwargs)

File ~/miniconda3/envs/pertpy/lib/python3.12/site-packages/django/db/models/query.py:835, in QuerySet.bulk_create(self, objs, batch_size, ignore_conflicts, update_conflicts, update_fields, unique_fields)
    833 if objs_without_pk:
    834     fields = [f for f in fields if not isinstance(f, AutoField)]
--> 835     returned_columns = self._batched_insert(
    836         objs_without_pk,
    837         fields,
    838         batch_size,
    839         on_conflict=on_conflict,
    840         update_fields=update_fields,
    841         unique_fields=unique_fields,
    842     )
    843     connection = connections[self.db]
    844     if (
    845         connection.features.can_return_rows_from_bulk_insert
    846         and on_conflict is None
    847     ):

File ~/miniconda3/envs/pertpy/lib/python3.12/site-packages/django/db/models/query.py:1875, in QuerySet._batched_insert(self, objs, fields, batch_size, on_conflict, update_fields, unique_fields)
   1870 for item in [objs[i : i + batch_size] for i in range(0, len(objs), batch_size)]:
   1871     if bulk_return and (
   1872         on_conflict is None or on_conflict == OnConflict.UPDATE
   1873     ):
   1874         inserted_rows.extend(
-> 1875             self._insert(
   1876                 item,
   1877                 fields=fields,
   1878                 using=self.db,
   1879                 on_conflict=on_conflict,
   1880                 update_fields=update_fields,
   1881                 unique_fields=unique_fields,
   1882                 returning_fields=self.model._meta.db_returning_fields,
   1883             )
   1884         )
   1885     else:
   1886         self._insert(
   1887             item,
   1888             fields=fields,
   (...)
   1892             unique_fields=unique_fields,
   1893         )

File ~/miniconda3/envs/pertpy/lib/python3.12/site-packages/django/db/models/query.py:1847, in QuerySet._insert(self, objs, fields, returning_fields, raw, using, on_conflict, update_fields, unique_fields)
   1840 query = sql.InsertQuery(
   1841     self.model,
   1842     on_conflict=on_conflict,
   1843     update_fields=update_fields,
   1844     unique_fields=unique_fields,
   1845 )
   1846 query.insert_values(fields, objs, raw=raw)
-> 1847 return query.get_compiler(using=using).execute_sql(returning_fields)

File ~/miniconda3/envs/pertpy/lib/python3.12/site-packages/django/db/models/sql/compiler.py:1836, in SQLInsertCompiler.execute_sql(self, returning_fields)
   1834 with self.connection.cursor() as cursor:
   1835     for sql, params in self.as_sql():
-> 1836         cursor.execute(sql, params)
   1837     if not self.returning_fields:
   1838         return []

File ~/miniconda3/envs/pertpy/lib/python3.12/site-packages/django/db/backends/utils.py:79, in CursorWrapper.execute(self, sql, params)
     78 def execute(self, sql, params=None):
---> 79     return self._execute_with_wrappers(
     80         sql, params, many=False, executor=self._execute
     81     )

File ~/miniconda3/envs/pertpy/lib/python3.12/site-packages/django/db/backends/utils.py:92, in CursorWrapper._execute_with_wrappers(self, sql, params, many, executor)
     90 for wrapper in reversed(self.db.execute_wrappers):
     91     executor = functools.partial(wrapper, executor)
---> 92 return executor(sql, params, many, context)

File ~/miniconda3/envs/pertpy/lib/python3.12/site-packages/django/db/backends/utils.py:100, in CursorWrapper._execute(self, sql, params, *ignored_wrapper_args)
     98     warnings.warn(self.APPS_NOT_READY_WARNING_MSG, category=RuntimeWarning)
     99 self.db.validate_no_broken_transaction()
--> 100 with self.db.wrap_database_errors:
    101     if params is None:
    102         # params default might be backend specific.
    103         return self.cursor.execute(sql)

File ~/miniconda3/envs/pertpy/lib/python3.12/site-packages/django/db/utils.py:91, in DatabaseErrorWrapper.__exit__(self, exc_type, exc_value, traceback)
     89 if dj_exc_type not in (DataError, IntegrityError):
     90     self.wrapper.errors_occurred = True
---> 91 raise dj_exc_value.with_traceback(traceback) from exc_value

File ~/miniconda3/envs/pertpy/lib/python3.12/site-packages/django/db/backends/utils.py:105, in CursorWrapper._execute(self, sql, params, *ignored_wrapper_args)
    103     return self.cursor.execute(sql)
    104 else:
--> 105     return self.cursor.execute(sql, params)

File ~/miniconda3/envs/pertpy/lib/python3.12/site-packages/django/db/backends/sqlite3/base.py:354, in SQLiteCursorWrapper.execute(self, query, params)
    352 param_names = list(params) if isinstance(params, Mapping) else None
    353 query = self.convert_query(query, param_names=param_names)
--> 354 return super().execute(query, params)

IntegrityError: UNIQUE constraint failed: bionty_gene.ensembl_gene_id"
}

Why does it not simply skip these genes, given that they have already been registered? This error is maximally unhelpful and yet another Django error of death.

Version information

No response

felix0097 commented 1 week ago

I'm getting a very similar error as well when using the `curator.add_validated_from` method:

---------------------------------------------------------------------------
UniqueViolation                           Traceback (most recent call last)
File [/vol/data/miniconda3/envs/similarity-lamin/lib/python3.10/site-packages/django/db/backends/utils.py:105](http://localhost:8888/vol/data/miniconda3/envs/similarity-lamin/lib/python3.10/site-packages/django/db/backends/utils.py#line=104), in CursorWrapper._execute(self, sql, params, *ignored_wrapper_args)
    104 else:
--> 105     return self.cursor.execute(sql, params)

UniqueViolation: duplicate key value violates unique constraint "lnschema_bionty_celltype_uid_key"
DETAIL:  Key (uid)=(1uF1evnz) already exists.

The above exception was the direct cause of the following exception:

IntegrityError                            Traceback (most recent call last)
Cell In[19], line 26
     24 curator.add_validated_from_var_index()
     25 curator.add_new_from_var_index()
---> 26 curator.add_validated_from("cell_type_author")
     27 curator.add_new_from("cell_type_author")
     28 curator.add_validated_from("tissue")

File [/vol/data/miniconda3/envs/similarity-lamin/lib/python3.10/site-packages/lamindb/_curate.py:256](http://localhost:8888/vol/data/miniconda3/envs/similarity-lamin/lib/python3.10/site-packages/lamindb/_curate.py#line=255), in DataFrameCurator.add_validated_from(self, key, organism)
    249 """Add validated categories.
    250 
    251 Args:
    252     key: The key referencing the slot in the DataFrame.
    253     organism: The organism name.
    254 """
    255 self._kwargs.update({"organism": organism} if organism else {})
--> 256 self._update_registry(key, validated_only=True, **self._kwargs)

File [/vol/data/miniconda3/envs/similarity-lamin/lib/python3.10/site-packages/lamindb/_curate.py:289](http://localhost:8888/vol/data/miniconda3/envs/similarity-lamin/lib/python3.10/site-packages/lamindb/_curate.py#line=288), in DataFrameCurator._update_registry(self, categorical, validated_only, **kwargs)
    287 if categorical not in self.fields:
    288     raise ValueError(f"Feature {categorical} is not part of the fields!")
--> 289 update_registry(
    290     values=self._df[categorical].unique().tolist(),
    291     field=self.fields[categorical],
    292     key=categorical,
    293     using_key=self._using_key,
    294     validated_only=validated_only,
    295     source=self._sources.get(categorical),
    296     exclude=self._exclude.get(categorical),
    297     **kwargs,
    298 )

File [/vol/data/miniconda3/envs/similarity-lamin/lib/python3.10/site-packages/lamindb/_curate.py:1395](http://localhost:8888/vol/data/miniconda3/envs/similarity-lamin/lib/python3.10/site-packages/lamindb/_curate.py#line=1394), in update_registry(values, field, key, save_function, using_key, validated_only, df, organism, dtype, source, standardize, warning, exclude, **kwargs)
   1393 if source:
   1394     public_records = [r for r in public_records if r.source.uid == source.uid]
-> 1395 ln_save(public_records)
   1396 labels_saved["from public"] = [
   1397     getattr(r, field.field.name) for r in public_records
   1398 ]
   1399 non_public_labels = [i for i in values if i not in labels_saved["from public"]]

File [/vol/data/miniconda3/envs/similarity-lamin/lib/python3.10/site-packages/lamindb/_save.py:83](http://localhost:8888/vol/data/miniconda3/envs/similarity-lamin/lib/python3.10/site-packages/lamindb/_save.py#line=82), in save(records, ignore_conflicts)
     79 if non_artifacts:
     80     non_artifacts_old, non_artifacts_new = partition(
     81         lambda r: r._state.adding or r.pk is None, non_artifacts
     82     )
---> 83     bulk_create(non_artifacts_new, ignore_conflicts=ignore_conflicts)
     84     if non_artifacts_old:
     85         bulk_update(non_artifacts_old)

File [/vol/data/miniconda3/envs/similarity-lamin/lib/python3.10/site-packages/lamindb/_save.py:114](http://localhost:8888/vol/data/miniconda3/envs/similarity-lamin/lib/python3.10/site-packages/lamindb/_save.py#line=113), in bulk_create(records, ignore_conflicts)
    112     records_by_orm[record.__class__].append(record)
    113 for registry, records in records_by_orm.items():
--> 114     registry.objects.bulk_create(records, ignore_conflicts=ignore_conflicts)

File [/vol/data/miniconda3/envs/similarity-lamin/lib/python3.10/site-packages/django/db/models/manager.py:87](http://localhost:8888/vol/data/miniconda3/envs/similarity-lamin/lib/python3.10/site-packages/django/db/models/manager.py#line=86), in BaseManager._get_queryset_methods.<locals>.create_method.<locals>.manager_method(self, *args, **kwargs)
     85 @wraps(method)
     86 def manager_method(self, *args, **kwargs):
---> 87     return getattr(self.get_queryset(), name)(*args, **kwargs)

File [/vol/data/miniconda3/envs/similarity-lamin/lib/python3.10/site-packages/django/db/models/query.py:835](http://localhost:8888/vol/data/miniconda3/envs/similarity-lamin/lib/python3.10/site-packages/django/db/models/query.py#line=834), in QuerySet.bulk_create(self, objs, batch_size, ignore_conflicts, update_conflicts, update_fields, unique_fields)
    833 if objs_without_pk:
    834     fields = [f for f in fields if not isinstance(f, AutoField)]
--> 835     returned_columns = self._batched_insert(
    836         objs_without_pk,
    837         fields,
    838         batch_size,
    839         on_conflict=on_conflict,
    840         update_fields=update_fields,
    841         unique_fields=unique_fields,
    842     )
    843     connection = connections[self.db]
    844     if (
    845         connection.features.can_return_rows_from_bulk_insert
    846         and on_conflict is None
    847     ):

File [/vol/data/miniconda3/envs/similarity-lamin/lib/python3.10/site-packages/django/db/models/query.py:1875](http://localhost:8888/vol/data/miniconda3/envs/similarity-lamin/lib/python3.10/site-packages/django/db/models/query.py#line=1874), in QuerySet._batched_insert(self, objs, fields, batch_size, on_conflict, update_fields, unique_fields)
   1870 for item in [objs[i : i + batch_size] for i in range(0, len(objs), batch_size)]:
   1871     if bulk_return and (
   1872         on_conflict is None or on_conflict == OnConflict.UPDATE
   1873     ):
   1874         inserted_rows.extend(
-> 1875             self._insert(
   1876                 item,
   1877                 fields=fields,
   1878                 using=self.db,
   1879                 on_conflict=on_conflict,
   1880                 update_fields=update_fields,
   1881                 unique_fields=unique_fields,
   1882                 returning_fields=self.model._meta.db_returning_fields,
   1883             )
   1884         )
   1885     else:
   1886         self._insert(
   1887             item,
   1888             fields=fields,
   (...)
   1892             unique_fields=unique_fields,
   1893         )

File [/vol/data/miniconda3/envs/similarity-lamin/lib/python3.10/site-packages/django/db/models/query.py:1847](http://localhost:8888/vol/data/miniconda3/envs/similarity-lamin/lib/python3.10/site-packages/django/db/models/query.py#line=1846), in QuerySet._insert(self, objs, fields, returning_fields, raw, using, on_conflict, update_fields, unique_fields)
   1840 query = sql.InsertQuery(
   1841     self.model,
   1842     on_conflict=on_conflict,
   1843     update_fields=update_fields,
   1844     unique_fields=unique_fields,
   1845 )
   1846 query.insert_values(fields, objs, raw=raw)
-> 1847 return query.get_compiler(using=using).execute_sql(returning_fields)

File [/vol/data/miniconda3/envs/similarity-lamin/lib/python3.10/site-packages/django/db/models/sql/compiler.py:1836](http://localhost:8888/vol/data/miniconda3/envs/similarity-lamin/lib/python3.10/site-packages/django/db/models/sql/compiler.py#line=1835), in SQLInsertCompiler.execute_sql(self, returning_fields)
   1834 with self.connection.cursor() as cursor:
   1835     for sql, params in self.as_sql():
-> 1836         cursor.execute(sql, params)
   1837     if not self.returning_fields:
   1838         return []

File [/vol/data/miniconda3/envs/similarity-lamin/lib/python3.10/site-packages/django/db/backends/utils.py:79](http://localhost:8888/vol/data/miniconda3/envs/similarity-lamin/lib/python3.10/site-packages/django/db/backends/utils.py#line=78), in CursorWrapper.execute(self, sql, params)
     78 def execute(self, sql, params=None):
---> 79     return self._execute_with_wrappers(
     80         sql, params, many=False, executor=self._execute
     81     )

File [/vol/data/miniconda3/envs/similarity-lamin/lib/python3.10/site-packages/django/db/backends/utils.py:92](http://localhost:8888/vol/data/miniconda3/envs/similarity-lamin/lib/python3.10/site-packages/django/db/backends/utils.py#line=91), in CursorWrapper._execute_with_wrappers(self, sql, params, many, executor)
     90 for wrapper in reversed(self.db.execute_wrappers):
     91     executor = functools.partial(wrapper, executor)
---> 92 return executor(sql, params, many, context)

File [/vol/data/miniconda3/envs/similarity-lamin/lib/python3.10/site-packages/django/db/backends/utils.py:100](http://localhost:8888/vol/data/miniconda3/envs/similarity-lamin/lib/python3.10/site-packages/django/db/backends/utils.py#line=99), in CursorWrapper._execute(self, sql, params, *ignored_wrapper_args)
     98     warnings.warn(self.APPS_NOT_READY_WARNING_MSG, category=RuntimeWarning)
     99 self.db.validate_no_broken_transaction()
--> 100 with self.db.wrap_database_errors:
    101     if params is None:
    102         # params default might be backend specific.
    103         return self.cursor.execute(sql)

File [/vol/data/miniconda3/envs/similarity-lamin/lib/python3.10/site-packages/django/db/utils.py:91](http://localhost:8888/vol/data/miniconda3/envs/similarity-lamin/lib/python3.10/site-packages/django/db/utils.py#line=90), in DatabaseErrorWrapper.__exit__(self, exc_type, exc_value, traceback)
     89 if dj_exc_type not in (DataError, IntegrityError):
     90     self.wrapper.errors_occurred = True
---> 91 raise dj_exc_value.with_traceback(traceback) from exc_value

File [/vol/data/miniconda3/envs/similarity-lamin/lib/python3.10/site-packages/django/db/backends/utils.py:105](http://localhost:8888/vol/data/miniconda3/envs/similarity-lamin/lib/python3.10/site-packages/django/db/backends/utils.py#line=104), in CursorWrapper._execute(self, sql, params, *ignored_wrapper_args)
    103     return self.cursor.execute(sql)
    104 else:
--> 105     return self.cursor.execute(sql, params)

IntegrityError: duplicate key value violates unique constraint "lnschema_bionty_celltype_uid_key"
DETAIL:  Key (uid)=(1uF1evnz) already exists.

Zethson commented 4 days ago

I cannot get a nice reproducible example but have an idea of where this might be coming from. I'll open a new issue. @felix0097 we need a small reproducible example for your issue and I'm not sure whether it's the same one.