ocean-data-factory-sweden / kso

Notebooks to upload/download marine footage, connect to a citizen science project, train machine learning models and publish marine biological observations.
GNU General Public License v3.0
4 stars 12 forks source link

Tutorial 8: Shark_life project issue #262

Closed Bergylta closed 6 months ago

Bergylta commented 10 months ago

Input

pp.process_zoo_classifications()

Output


KeyError                                  Traceback (most recent call last)
Cell In[76], line 1
----> 1 pp.process_zoo_classifications()

File /usr/src/app/kso/kso_utils/kso_utils/project.py:746, in ProjectProcessor.process_zoo_classifications(self, test)
    742     workflow_checks = self.workflow_widget.checks
    744 # Retrieve a subset of the subjects from the workflows of interest and
    745 # populate the sql subjects table
--> 746 selected_zoo_workflows = zoo_utils.sample_subjects_from_workflows(
    747     project=self.project,
    748     server_connection=self.server_connection,
    749     db_connection=self.db_connection,
    750     workflow_widget_checks=workflow_checks,
    751     workflows_df=self.zoo_info["workflows"],
    752     subjects_df=self.zoo_info["subjects"],
    753 )
    755 # Make sure all the classifications have existing subjects,
    756 # Flatten the classifications provided the cit. scientists
    757 self.processed_zoo_classifications = zoo_utils.process_zoo_classifications(
    758     project=self.project,
    759     db_connection=self.db_connection,
   (...)
    763     selected_zoo_workflows=selected_zoo_workflows,
    764 )

File /usr/src/app/kso/kso_utils/kso_utils/zooniverse_utils.py:1294, in sample_subjects_from_workflows(project, server_connection, db_connection, workflow_widget_checks, workflows_df, subjects_df)
   1290 drop_table(conn=db_connection, table_name="subjects")
   1292 if len(subjects_series) > 0:
   1293     # Fill or re-fill subjects table
-> 1294     populate_subjects(project, server_connection, db_connection, subjects_series)
   1295 else:
   1296     logging.error("No subjects to populate database from the workflows selected.")

File /usr/src/app/kso/kso_utils/kso_utils/zooniverse_utils.py:1147, in populate_subjects(project, server_connection, db_connection, subjects)
   1144     movies_df = movies_df.rename(columns={"id": "movie_id"})
   1146     # Reference the movienames with the id movies table
-> 1147     subjects = pd.merge(subjects, movies_df, how="left", on="filename")
   1149 if subjects["subject_type"].value_counts().idxmax() == "clip":
   1150     # Calculate the clip_end_time
   1151     subjects["clip_end_time"] = (
   1152         subjects["clip_start_time"] + subjects["clip_length"]
   1153     )

File /usr/local/lib/python3.8/dist-packages/pandas/core/reshape/merge.py:107, in merge(left, right, how, on, left_on, right_on, left_index, right_index, sort, suffixes, copy, indicator, validate)
     90 @Substitution("\nleft : DataFrame or named Series")
     91 @Appender(_merge_doc, indents=0)
     92 def merge(
   (...)
    105     validate: str | None = None,
    106 ) -> DataFrame:
--> 107     op = _MergeOperation(
    108         left,
    109         right,
    110         how=how,
    111         on=on,
    112         left_on=left_on,
    113         right_on=right_on,
    114         left_index=left_index,
    115         right_index=right_index,
    116         sort=sort,
    117         suffixes=suffixes,
    118         copy=copy,
    119         indicator=indicator,
    120         validate=validate,
    121     )
    122     return op.get_result()

File /usr/local/lib/python3.8/dist-packages/pandas/core/reshape/merge.py:700, in _MergeOperation.__init__(self, left, right, how, on, left_on, right_on, axis, left_index, right_index, sort, suffixes, copy, indicator, validate)
    693 self._cross = cross_col
    695 # note this function has side effects
    696 (
    697     self.left_join_keys,
    698     self.right_join_keys,
    699     self.join_names,
--> 700 ) = self._get_merge_keys()
    702 # validate the merge keys dtypes. We may need to coerce
    703 # to avoid incompatible dtypes
    704 self._maybe_coerce_merge_keys()

File /usr/local/lib/python3.8/dist-packages/pandas/core/reshape/merge.py:1110, in _MergeOperation._get_merge_keys(self)
   1108     right_keys.append(rk)
   1109 if lk is not None:
-> 1110     left_keys.append(left._get_label_or_level_values(lk))
   1111     join_names.append(lk)
   1112 else:
   1113     # work-around for merge_asof(left_index=True)

File /usr/local/lib/python3.8/dist-packages/pandas/core/generic.py:1848, in NDFrame._get_label_or_level_values(self, key, axis)
   1846     values = self.axes[axis].get_level_values(key)._values
   1847 else:
-> 1848     raise KeyError(key)
   1850 # Check for duplicates
   1851 if values.ndim > 1:

KeyError: 'filename'
victor-wildlife commented 9 months ago

@jannesgg @Bergylta does either of you remember how the images of the Shark_life project were uploaded to Zooniverse? I am afraid they don't have "subject_type" information...

Image

jannesgg commented 9 months ago

@victor-wildlife It could be that they managed to get uploaded without that subject metadata (as some subject sets were at the time). Maybe we should add a step that fills in this subject_type info when it is missing, since we can check the file extension of the subject on ZU, right? I think this will help us overcome these weird cases.

victor-wildlife commented 8 months ago

@jannesgg is there a solution to this issue implemented in the cloudina-fix-migration branch, or is this something still pending?

jannesgg commented 6 months ago

Moved to issue #336