BlueBrain / morphoclass

Neuronal morphology preparation and classification using Machine Learning.
https://morphoclass.readthedocs.io
Apache License 2.0

Remove unused experimentation datasets #58

Closed · FrancescoCasalegno closed this 2 years ago

FrancescoCasalegno commented 2 years ago

Fixes #53.

Description

The following datasets are removed (see the #53 description for details); a quick sanity-check sketch follows the table.

| CSV Dataset File | Features in `extract-features/` | Checkpoints in `training/` |
| --- | --- | --- |
| `IN_data_no_NBC.csv` | `lida-in-merged-no-nbc/` | `checkpoints-lida-alt-neurites/in-merged-no-nbc-*/`<br>`checkpoints-lida/in-merged-no-nbc/` |
| `interneurons-merged-layers.csv` | `in-merged-layers/` | `checkpoints-merged-interneurons/` |
| `pc-L2-merged.csv` | `pc-L2-merged/` | `checkpoints-pc-merged-classes/` |
| `pc-L5-merged.csv` | `pc-L5-merged/` | `checkpoints-pc-merged-classes/` |
| `pc-L6-merged.csv` | `pc-L6-merged/` | `checkpoints-pc-merged-classes/` |
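
As a coarse cross-check, one can scan this branch's DVC files for any leftover reference to the names above (i.e. before following the testing steps below, which temporarily restore the old files from `main`). This is only a sketch, not part of the PR: it assumes the `dvc/dvc.yaml` and `dvc/dvc.lock` locations used in the testing section and relies on plain substring matching, so it may over-report.

```python
from pathlib import Path

# Names taken from the table above (CSV files, feature dirs, checkpoint dirs).
# Substring matching only: a hit means "still mentioned somewhere", not an exact path match.
REMOVED_NAMES = [
    "IN_data_no_NBC.csv",
    "interneurons-merged-layers.csv",
    "lida-in-merged-no-nbc",
    "in-merged-layers",
    "pc-L2-merged",
    "pc-L5-merged",
    "pc-L6-merged",
    "checkpoints-merged-interneurons",
    "checkpoints-pc-merged-classes",
]

for dvc_file in [Path("dvc/dvc.yaml"), Path("dvc/dvc.lock")]:
    text = dvc_file.read_text()
    leftovers = [name for name in REMOVED_NAMES if name in text]
    print(dvc_file, "->", leftovers if leftovers else "no removed names found")
```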

How to test?

First, let's get both the old and new versions of the DVC files:

```sh
# Keep this branch's (new) DVC files under a temporary name...
cp dvc/dvc.yaml dvc/dvc_new.yaml
cp dvc/dvc.lock dvc/dvc_new.lock
# ...and restore the old versions from main.
git checkout main -- dvc/dvc.*
```

Then run the following Python script (from the repository root, since it looks for the files under `dvc/`):

```python
from __future__ import annotations

from pathlib import Path
from typing import Iterable
import warnings

import yaml

EXTRACT_FEATURES = [
    "extract-features/lida-in-merged-no-nbc/",
    "extract-features/in-merged-layers/",
    "extract-features/pc-L2-merged/",
    "extract-features/pc-L5-merged/",
    "extract-features/pc-L6-merged/",
]

CHECKPOINTS = [
    "training/checkpoints-lida-alt-neurites/in-merged-no-nbc-",
    "training/checkpoints-lida/in-merged-no-nbc",
    "training/checkpoints-merged-interneurons",
    "training/checkpoints-pc-merged-classes",
]

def check_contents(
    d: dict,
    keys: Iterable[str],
    extract_features: Iterable[str],
    checkpoints: Iterable[str],
    check_type: str,
):
    def _found_in_checkpoints(el: str):
        if isinstance(el, dict):
            el = el["path"]
        return any(el.startswith(cp) for cp in checkpoints)

    def _found_in_features(el: str):
        if isinstance(el, dict):
            el = el["path"]
        return any(el.startswith(ff) for ff in extract_features)

    for k in keys:
        d_k = d[k]

        if "do" in d_k:
            d_k = d_k["do"]

        if "deps" in d_k:
            deps: list[str] = d_k["deps"]

        if "outs" in d_k:
            outs: list[str] = d_k["outs"]

        if k.startswith("train"):
            if check_type == "presence":
                if not any(_found_in_features(dep) for dep in deps):
                    warnings.warn(f"For stage {k}: found deps {deps}")
                if not any(_found_in_checkpoints(out) for out in outs):
                    warnings.warn(f"For stage {k}: found outs {outs[0]}")
            elif check_type == "absence":
                if any(_found_in_features(dep) for dep in deps):
                    warnings.warn(f"For stage {k}: found deps {deps}")
                if any(_found_in_checkpoints(out) for out in outs):
                    warnings.warn(f"For stage {k}: found outs {outs[0]}")

        elif k.startswith("eval"):
            if check_type == "presence":
                if not any(_found_in_checkpoints(dep) for dep in deps):
                    warnings.warn(f"For stage {k}: found outs {deps[0]}")
            elif check_type == "absence":
                if any(_found_in_checkpoints(dep) for dep in deps):
                    warnings.warn(f"For stage {k}: found outs {deps[0]}")

        elif k.startswith("features-"):
            if len(outs) != 1:
                warnings.warn(f"For stage {k}: found outs {outs}")
            if check_type == "presence":
                if not any(_found_in_features(out) for out in outs):
                    warnings.warn(f"For stage {k}: found output {outs[0]}")
            elif check_type == "absence":
                if any(_found_in_features(out) for out in outs):
                    warnings.warn(f"For stage {k}: found output {outs[0]}")
        else:
            warnings.warn(f"Found key that will not be checked: {k}")

if __name__ == "__main__":
    yaml_path = Path.cwd() / "dvc"
    p_old = yaml_path / "dvc.yaml"
    p_new = yaml_path / "dvc_new.yaml"

    for suffix in [".yaml", ".lock"]:
        p_old = p_old.with_suffix(suffix)
        p_new = p_new.with_suffix(suffix)

        with p_old.open() as f:
            d_old = yaml.safe_load(f)["stages"]
        with p_new.open() as f:
            d_new = yaml.safe_load(f)["stages"]

        assert not set(d_new) - set(d_old)
        keys_diff = set(d_old) - set(d_new)
        print(f"Found {len(keys_diff):,d} differing keys.")

        print("Checking diff contents...")
        check_contents(d_old, keys_diff, EXTRACT_FEATURES, CHECKPOINTS, "presence")
        print("Checking new contents...")
        check_contents(d_new, d_new.keys(), EXTRACT_FEATURES, CHECKPOINTS, "absence")
        print("Done!")

Checklist