LorenFrankLab / spyglass

Neuroscience data analysis framework for reproducible research built by Loren Frank Lab at UCSF
https://lorenfranklab.github.io/spyglass/
MIT License
81 stars 40 forks source link

Remove unused tables #976

Open CBroz1 opened 1 month ago

CBroz1 commented 1 month ago

Process

Using a sql command, I fetched tables with no rows, as recorded in the info schema. Some weren't actually empty, so I narrowed down the list by fetching with dj, and then looked for which items weren't represented in the package

Script ```python import inspect import pkgutil import sys import datajoint as dj from datajoint.user_tables import TableMeta from datajoint.utils import from_camel_case from tqdm import tqdm import spyglass from spyglass.utils.database_settings import SHARED_MODULES def empty_in_shared_module(full_table_name): if full_table_name.split(".")[0].split("_")[0] not in SHARED_MODULES: return False # private prefix if full_table_name.split(".")[1][0] == "~": # hidden table return False return len(dj.FreeTable(dj.conn(), full_table_name)) == 0 class_cache = dict() # Cache results to avoid re-importing modules sql_tbls = dict() # Cache for schema.list_tables() def load_cache(): for importer, modname, ispkg in tqdm( pkgutil.walk_packages(spyglass.__path__), desc="Loading cache", total=300, ): module = importer.find_module(modname).load_module(modname) if not hasattr(module, "schema"): continue database = module.schema.database if not class_cache.get(database): class_cache[database] = set() sql_tbls[database] = set() schema_list = module.schema.list_tables() sql_tbls[database].update(schema_list) for attr_name in getattr(module, "__dict__", []): if not isinstance(getattr(module, attr_name), TableMeta): continue if attr_name.startswith("_"): continue snake_case_name = from_camel_case(attr_name) if snake_case_name in schema_list: class_cache[database].add(from_camel_case(attr_name)) def class_exists_in_package(full_table_name): if not class_cache: load_cache() schema_name, table_name = full_table_name.split(".") schema_tables = class_cache.get(schema_name, []) return table_name in schema_tables my_query = """ select table_schema as database_name, table_name from information_schema.tables where table_type = 'BASE TABLE' and table_rows = 0 and table_schema not in('information_schema', 'sys', 'performance_schema', 'mysql') order by table_schema, table_name; """ mysql_empty = [".".join(t) for t in dj.conn().query(my_query).fetchall()] no_content = [name for name in mysql_empty if empty_in_shared_module(name)] no_class = [t for t in mysql_empty if not class_exists_in_package(t)] neither = list(set(no_content).intersection(set(no_class))) no_content_has_class = list(set(no_content).difference(set(no_class))) ```
Resulting Lists First pass with an older version of the script above, will update soon ```python no_content = [ "common_behav._head_dir", "common_behav._lin_pos", "common_behav._speed", "common_behav.merged_position", "common_behav.merged_position__method_two_position", "common_behav.merged_position__raw_position", "common_behav.method_two_position", "common_curation.__automatic_curation_sorting", "common_curation.__curated_spike_sorting", "common_curation.__curated_spike_sorting__unit", "common_curation.__selected_units", "common_curation.automatic_curation_parameters", "common_curation.automatic_curation_selection", "common_curation.curated_spike_sorting_selection", "common_curation.selected_units_parameters", "common_ephys.electrode_brain_region", "common_lab.__analysis_nwbfile_kachery", "common_lab.__nwbfile_kachery", "common_metrics.__quality_metrics", "common_metrics.metric_selection", "common_nwbfile.__analysis_nwbfile_kachery", "common_nwbfile.__nwbfile_kachery", "common_nwbfile.analysis_nwbfile_kachery", "common_nwbfile.analysis_nwbfile_kachery_selection", "common_nwbfile.nwbfile_kachery", "common_nwbfile.nwbfile_kachery_selection", "common_position.__position_video", "common_ripple.l_f_p_selection", "common_ripple.l_f_p_selection__l_f_p_electrode", "common_ripple.ripple_artifact_detection_selection", "common_session._experimenter_list", "common_session._experimenter_list__experimenter", "common_session.session_group_session", "common_sortingview.__sortingview_workspace__sortings", "common_spikesorting.__automatic_curation", "common_spikesorting.__curated_spike_sorting", "common_spikesorting.__curated_spike_sorting__unit", "common_spikesorting.__modify_sorting", "common_spikesorting.__spike_sorting", "common_spikesorting.__spike_sorting_workspace", "common_spikesorting.automatic_curation_selection", "common_spikesorting.curated_spike_sorting_selection", "common_spikesorting.modify_sorting_parameters", "common_spikesorting.modify_sorting_selection", "common_spikesorting.modify_sorting_selection__sortings_i_ds", "common_spikesorting.sorting", "common_spikesorting.sorting_i_d", "common_spikesorting.sorting_list", "common_spikesorting.sortings", "common_spikesorting.spike_sorting_artifact_parameters", "common_spikesorting.spike_sorting_filter_parameters", "common_task.apparatus", "common_usage.insert_error", "common_waveforms.__waveforms", "common_waveforms.waveform_selection", "decoding._decode_spikes_selection", "decoding._decode_spikes_selection__decode_spikes", "decoding.decode_spikes_selection", "decoding.decode_spikes_selection__decode_spikes", "decoding_clusterless.__multiunit_firing_rate", "decoding_clusterless.__multiunit_high_synchrony_events", "decoding_clusterless.mark_indicator_parameters", "decoding_sortedspikes.__my_sorted_spikes_indicator", "decoding_sortedspikes.__sorted_spikes_results", "lfp_imported._imported_l_f_p", "lfp_merge.l_f_p_output__imported_l_f_p", "lfp_v1.__l_f_p_band_artifact_detection", "lfp_v1._imported_l_f_p_v1", "lfp_v1.l_f_p_band_artifact_detection_parameters", "lfp_v1.l_f_p_band_artifact_detection_selection", "lfp_v1.l_f_p_band_artifact_removed_interval_list", "position_dlc_model.__d_l_c_model_evaluation", "position_dlc_selection.__d_l_c_pos_video", "position_linearization_merge.linearized_position_output__linearized_position_v0", "position_merge.__position_video", "position_merge.position_video_selection", "position_position.__position_video", "position_position.final_position__common_pos", "position_position.final_position__trodes_pos_v1", "position_position.position_video_selection", "position_trodes_position.__trodes_pos_v1", "position_v1_dlc_model.__d_l_c_model_evaluation", "position_v1_dlc_selection.__d_l_c_pos_video", "position_v1_trodes_position.__trodes_pos_video", "spikesorting_artifact.artifact_detection_parameter", "spikesorting_curation.__units", "spikesorting_curation.unit_inclusion_parameters", "spikesorting_group_v1.sorted_spikes_group__sort_group", "spikesorting_merge.unit_inclusion", "spikesorting_recording.__sort_group_targeted_location", "spikesorting_recording.electrode_brain_location", "spikesorting_recording.spike_sorting_preprocessing_parameter", "spikesorting_sorting.spike_sorter_parameter", "spikesorting_v1_unit_inclusion.imported_unit_inclusion_v1", ] no_class = [ "common_behav._head_dir", "common_behav._lin_pos", "common_behav._speed", "common_behav.merged_position", "common_behav.merged_position__method_two_position", "common_behav.merged_position__raw_position", "common_behav.method_two_position", "common_curation.__automatic_curation_sorting", "common_curation.__curated_spike_sorting__unit", "common_curation.__selected_units", "common_curation.selected_units_parameters", "common_lab.__nwbfile_kachery", "common_nwbfile.__nwbfile_kachery", "common_nwbfile.nwbfile_kachery", "common_nwbfile.nwbfile_kachery_selection", "common_ripple.l_f_p_selection__l_f_p_electrode", "common_ripple.ripple_artifact_detection_selection", "common_session._experimenter_list", "common_session._experimenter_list__experimenter", "common_sortingview.__sortingview_workspace__sortings", "common_spikesorting.__curated_spike_sorting__unit", "common_spikesorting.__modify_sorting", "common_spikesorting.__spike_sorting_workspace", "common_spikesorting.modify_sorting_parameters", "common_spikesorting.modify_sorting_selection", "common_spikesorting.modify_sorting_selection__sortings_i_ds", "common_spikesorting.sorting", "common_spikesorting.sorting_i_d", "common_spikesorting.sorting_list", "common_spikesorting.sortings", "common_spikesorting.spike_sorting_artifact_parameters", "common_spikesorting.spike_sorting_filter_parameters", "common_task.apparatus", "decoding._decode_spikes_selection", "decoding._decode_spikes_selection__decode_spikes", "decoding.decode_spikes_selection", "decoding.decode_spikes_selection__decode_spikes", "decoding_clusterless.__multiunit_firing_rate", "decoding_clusterless.__multiunit_high_synchrony_events", "decoding_clusterless.mark_indicator_parameters", "decoding_sortedspikes.__my_sorted_spikes_indicator", "decoding_sortedspikes.__sorted_spikes_results", "lfp_merge.l_f_p_output__imported_l_f_p", "lfp_v1.__l_f_p_band_artifact_detection", "lfp_v1._imported_l_f_p_v1", "lfp_v1.l_f_p_band_artifact_detection_parameters", "lfp_v1.l_f_p_band_artifact_detection_selection", "lfp_v1.l_f_p_band_artifact_removed_interval_list", "position_linearization_merge.linearized_position_output__linearized_position_v0", "position_position.final_position__common_pos", "position_position.final_position__trodes_pos_v1", "spikesorting_artifact.artifact_detection_parameter", "spikesorting_curation.__units", "spikesorting_group_v1.sorted_spikes_group__sort_group", "spikesorting_merge.unit_inclusion", "spikesorting_recording.__sort_group_targeted_location", "spikesorting_recording.electrode_brain_location", "spikesorting_recording.spike_sorting_preprocessing_parameter", "spikesorting_sorting.spike_sorter_parameter", "spikesorting_v1_unit_inclusion.imported_unit_inclusion_v1", ] neither = [ "common_spikesorting.sortings", "common_behav._head_dir", "decoding.decode_spikes_selection__decode_spikes", "common_curation.curated_spike_sorting_selection", "position_position.final_position__common_pos", "position_merge.__position_video", "lfp_v1.l_f_p_band_artifact_detection_selection", "spikesorting_group_v1.sorted_spikes_group__sort_group", "spikesorting_artifact.artifact_detection_parameter", "common_session._experimenter_list", "common_behav.merged_position__method_two_position", "decoding._decode_spikes_selection__decode_spikes", "common_nwbfile.analysis_nwbfile_kachery_selection", "common_spikesorting.__spike_sorting_workspace", "decoding.decode_spikes_selection", "spikesorting_curation.__units", "common_position.__position_video", "common_waveforms.__waveforms", "decoding_sortedspikes.__sorted_spikes_results", "common_spikesorting.sorting", "common_nwbfile.nwbfile_kachery_selection", "position_position.__position_video", "common_curation.__automatic_curation_sorting", "common_curation.automatic_curation_parameters", "spikesorting_recording.spike_sorting_preprocessing_parameter", "position_v1_dlc_selection.__d_l_c_pos_video", "common_spikesorting.__curated_spike_sorting", "common_spikesorting.sorting_i_d", "position_v1_dlc_model.__d_l_c_model_evaluation", "common_metrics.metric_selection", "common_nwbfile.nwbfile_kachery", "position_dlc_selection.__d_l_c_pos_video", "common_curation.automatic_curation_selection", "spikesorting_sorting.spike_sorter_parameter", "common_ripple.ripple_artifact_detection_selection", "common_spikesorting.__curated_spike_sorting__unit", "common_behav.merged_position__raw_position", "common_spikesorting.__modify_sorting", "common_spikesorting.__spike_sorting", "common_curation.selected_units_parameters", "common_nwbfile.__nwbfile_kachery", "lfp_v1.l_f_p_band_artifact_removed_interval_list", "common_behav.merged_position", "position_position.position_video_selection", "lfp_v1.l_f_p_band_artifact_detection_parameters", "common_spikesorting.sorting_list", "spikesorting_v1_unit_inclusion.imported_unit_inclusion_v1", "common_curation.__curated_spike_sorting", "decoding_clusterless.mark_indicator_parameters", "common_curation.__selected_units", "position_dlc_model.__d_l_c_model_evaluation", "spikesorting_recording.__sort_group_targeted_location", "common_spikesorting.modify_sorting_selection__sortings_i_ds", "common_metrics.__quality_metrics", "lfp_v1._imported_l_f_p_v1", "position_position.final_position__trodes_pos_v1", "position_linearization_merge.linearized_position_output__linearized_position_v0", "common_ripple.l_f_p_selection__l_f_p_electrode", "decoding_clusterless.__multiunit_firing_rate", "spikesorting_recording.electrode_brain_location", "decoding_clusterless.__multiunit_high_synchrony_events", "common_spikesorting.spike_sorting_artifact_parameters", "common_sortingview.__sortingview_workspace__sortings", "lfp_v1.__l_f_p_band_artifact_detection", "common_behav._lin_pos", "position_trodes_position.__trodes_pos_v1", "common_ripple.l_f_p_selection", "common_curation.__curated_spike_sorting__unit", "common_waveforms.waveform_selection", "common_behav.method_two_position", "common_spikesorting.__automatic_curation", "common_task.apparatus", "common_spikesorting.automatic_curation_selection", "position_v1_trodes_position.__trodes_pos_video", "spikesorting_merge.unit_inclusion", "common_lab.__analysis_nwbfile_kachery", "common_behav._speed", "common_spikesorting.spike_sorting_filter_parameters", "common_nwbfile.__analysis_nwbfile_kachery", "lfp_imported._imported_l_f_p", "common_lab.__nwbfile_kachery", "decoding_sortedspikes.__my_sorted_spikes_indicator", "decoding._decode_spikes_selection", "common_nwbfile.analysis_nwbfile_kachery", "lfp_merge.l_f_p_output__imported_l_f_p", "common_session._experimenter_list__experimenter", "common_spikesorting.curated_spike_sorting_selection", "common_spikesorting.modify_sorting_parameters", "common_spikesorting.modify_sorting_selection", ] no_content_has_class = [ "spikesorting_curation.unit_inclusion_parameters", "common_session.session_group_session", "common_ephys.electrode_brain_region", "common_usage.insert_error", "position_merge.position_video_selection", ] ```

Result

There are ....

  1. Schemas that do not currently or no longer have a Spyglass schema
    • common_analytic_signal: last used by Xulu 05/23
    • common_artifact: last used by Alison 11/22
    • common_backup: last used by Kyu 02/22
    • common_curation: last used by Eric 02/22
    • common_metrics: last used by Jen 03/22
    • common_sortingview: last used by Alison 11/22
    • common_waveform: last used by Jen 03/22
    • decoding: last used by Eric 05/23
    • position_position: last used by Daniel 04/23
  2. Tables on used schemas that do not have a Spyglass class (e.g., common_behav._head_dir). See 'neither' in list above.
  3. Tables on used schemas that have a Spyglass class, declared some time ago that remain unused
    • spikesorting_curation.unit_inclusion_parameters: declared 07/22
    • common_session.session_group_session: declared 03/22
    • common_ephys.electrode_brain_region: declared 05/22
    • position_merge.position_video_selection: 04/23
    • common_usage.insert_error: declared 02/24 - worth preserving for outside-lab insert errors?

Proposed

  1. Tables and schemas on shared prefixes without a python representation are scheduled for deletion after a team-wide two-week warning.
  2. Empty tables with Spyglass classes that have been up >3mo are marked for deprecation in version 0.6.0. Inserts into these tables can be overwritten to temporarily launch a deprecation warning and then log in common_usage to let us know they're being used.
edeno commented 1 month ago

All these would be good to clean up as far as I know.

CBroz1 commented 1 day ago

At @samuelbray32 's suggestion, I wrote a script to look at tables on shared prefixes, without a Spyglass class, with contents. They are listed at the end of the script below.

Script ```python import importlib import inspect import pkgutil import datajoint as dj from datajoint.user_tables import TableMeta from tqdm import tqdm from spyglass.utils.database_settings import SHARED_MODULES class SearchPkg: def __init__(self, package_name="spyglass"): self.package = importlib.import_module(package_name) self.class_cache = set() self.load_cache() def find_subclasses(self, module, parent_class): for name, obj in inspect.getmembers(parent_class, inspect.isclass): if isinstance(obj, TableMeta) and obj.__module__ == module.__name__: name = obj.full_table_name.replace("`", "") if name in self.class_cache: continue print(f"Found class: {name}") self.class_cache.add(obj.full_table_name.replace("`", "")) self.find_subclasses(module, obj) def load_cache(self, package_name="spyglass"): package = self.package for loader, module_name, is_pkg in tqdm( pkgutil.walk_packages(package.__path__, package.__name__ + "."), desc="Loading cache", total=64, ): module = importlib.import_module(module_name) if not getattr(module, "schema", None): continue print(f"Checking module: {module_name}") self.find_subclasses(module, module) my_query = """ select table_schema as database_name, table_name from information_schema.tables where table_type = 'BASE TABLE' and table_schema not in('information_schema', 'sys', 'performance_schema', 'mysql') order by table_schema, table_name; """ mysql_tbls = [ ".".join(t) for t in dj.conn().query(my_query).fetchall() if t[0].split("_")[0] in SHARED_MODULES and t[1][0] != "~" ] class_cache = SearchPkg().class_cache no_class = set(mysql_tbls).difference(class_cache) len_cache = dict() for t in no_class: len_cache[t] = len(dj.FreeTable(dj.conn(), t)) non_empty = {k: v for k, v in len_cache.items() if v > 0} non_empty = { "common_analytic_signal.__analytic_signal": 1, "common_analytic_signal.analytic_signal_parameters": 1, "common_analytic_signal.analytic_signal_selection": 1, "common_artifact.__artifact_detection": 1, "common_artifact.artifact_detection_parameters": 3, "common_artifact.artifact_detection_selection": 1, "common_artifact.artifact_removed_interval_list": 1, "common_backup.curated_spike_sorting_back_up": 413, "common_backup.spike_sorting_back_up": 2810, "common_filter.fir_filter": 3, "common_interval.sort_interval": 52, "common_lab.analysis_nwbfile": 5, "common_lab.nwbfile": 4, "common_metrics.metric_parameters": 1, "common_nwbfile.kachery_channel": 1, "common_ripple.ripple_artifact_detection_parameters": 2, "common_ripple.ripple_artifact_removed_interval_list": 2, "common_sortingview.__sortingview_workspace": 1, "common_spikesorting.__spike_sorting_recording": 1, "common_spikesorting.automatic_curation_parameters": 3, "common_spikesorting.sort_group": 1665, "common_spikesorting.sort_group__sort_group_electrode": 7427, "common_spikesorting.spike_sorter": 16, "common_spikesorting.spike_sorter_parameters": 20, "common_spikesorting.spike_sorting_artifact_detection_parameters": 1, "common_spikesorting.spike_sorting_metric_parameters": 6, "common_spikesorting.spike_sorting_metrics": 3, "common_spikesorting.spike_sorting_preprocessing_parameters": 2, "common_spikesorting.spike_sorting_recording_selection": 1, "common_spikesorting.spike_sorting_selection": 1, "common_spikesorting.spike_sorting_waveform_parameters": 1, "common_spikesorting.unit_inclusion_parameters": 2, "common_temp.temp": 2, "common_waveforms.waveform_parameters": 1, "decoding_clusterless.classifier_parameters": 3, "decoding_clusterless.multiunit_high_synchrony_events_parameters": 1, "decoding_clusterless.sorted_spikes_classifier_parameters": 4, "decoding_sortedspikes.#my_sorted_spikes_indicator_selection": 13, "lfp_v1.__l_f_p": 81, "lfp_v1.__l_f_p_band": 4, "lfp_v1.l_f_p_band_selection": 5, "lfp_v1.l_f_p_band_selection__l_f_p_band_electrode": 76, "lfp_v1.l_f_p_electrode_group": 65, "lfp_v1.l_f_p_electrode_group__l_f_p_electrode": 3248, "lfp_v1.l_f_p_output": 80, "lfp_v1.l_f_p_output__l_f_p": 80, "position_dlc_centroid.__d_l_c_centroid": 52, "position_dlc_centroid.d_l_c_centroid_params": 6, "position_dlc_centroid.d_l_c_centroid_selection": 60, "position_dlc_cohort.__d_l_c_smooth_interp_cohort": 52, "position_dlc_cohort.__d_l_c_smooth_interp_cohort__body_part": 206, "position_dlc_cohort.d_l_c_smooth_interp_cohort_selection": 52, "position_dlc_model.__d_l_c_model": 3, "position_dlc_model.__d_l_c_model__body_part": 15, "position_dlc_model.d_l_c_model_input": 3, "position_dlc_model.d_l_c_model_params": 1, "position_dlc_model.d_l_c_model_selection": 4, "position_dlc_model.d_l_c_model_source": 5, "position_dlc_model.d_l_c_model_source__from_import": 3, "position_dlc_model.d_l_c_model_source__from_upstream": 2, "position_dlc_orient.__d_l_c_orientation": 52, "position_dlc_orient.d_l_c_orientation_params": 2, "position_dlc_orient.d_l_c_orientation_selection": 60, "position_dlc_pose_estimation.__d_l_c_pose_estimation": 64, "position_dlc_pose_estimation.__d_l_c_pose_estimation__body_part": 320, "position_dlc_pose_estimation.d_l_c_pose_estimation_selection": 307, "position_dlc_position.__d_l_c_smooth_interp": 206, "position_dlc_position.d_l_c_smooth_interp_params": 5, "position_dlc_position.d_l_c_smooth_interp_selection": 242, "position_dlc_project.body_part": 7, "position_dlc_project.d_l_c_project": 8, "position_dlc_project.d_l_c_project__body_part": 37, "position_dlc_project.d_l_c_project__file": 55, "position_dlc_selection.__d_l_c_pos_v1": 36, "position_dlc_selection.d_l_c_pos_selection": 52, "position_dlc_selection.d_l_c_pos_video_params": 10, "position_dlc_selection.d_l_c_pos_video_selection": 26, "position_dlc_training.#d_l_c_model_training_params": 2, "position_dlc_training.__d_l_c_model_training": 2, "position_dlc_training.d_l_c_model_training_selection": 3, "position_merge.pose_output": 1, "position_merge.pose_output__d_l_c_pose_estimation": 1, "position_position.final_position": 35, "position_position.final_position__d_l_c_pos_v1": 26, "waveform_features.#waveform_features_params": 4, "waveform_features.unit_waveform_features_selection": 327, } ```

These seem to primarily be first drafts of tables that were later renamed. Unless there are objections, I plan to drop these tables as well.