from __future__ import annotations
from pathlib import Path
from typing import Iterable
import warnings
import yaml
# Path prefixes of feature-extraction outputs. The script entry point passes
# this list to ``check_contents`` as ``extract_features``, where each stage
# entry is compared against these prefixes with ``str.startswith``.
# NOTE(review): presumably these are the datasets being removed from the DVC
# pipeline -- confirm against the corresponding issue.
EXTRACT_FEATURES = [
    "extract-features/lida-in-merged-no-nbc/",
    "extract-features/in-merged-layers/",
    "extract-features/pc-L2-merged/",
    "extract-features/pc-L5-merged/",
    "extract-features/pc-L6-merged/",
]
# Path prefixes of training checkpoint locations. The script entry point
# passes this list to ``check_contents`` as ``checkpoints``; matching is done
# with ``str.startswith``, so an entry without a trailing slash (or with a
# trailing dash) matches every path that merely begins with that text.
# NOTE(review): presumably these belong to the datasets being removed --
# confirm against the corresponding issue.
CHECKPOINTS = [
    "training/checkpoints-lida-alt-neurites/in-merged-no-nbc-",
    "training/checkpoints-lida/in-merged-no-nbc",
    "training/checkpoints-merged-interneurons",
    "training/checkpoints-pc-merged-classes",
]
def check_contents(
    d: dict,
    keys: Iterable[str],
    extract_features: Iterable[str],
    checkpoints: Iterable[str],
    check_type: str,
):
    """Warn about stages whose deps/outs do (not) reference given prefixes.

    For every stage name in ``keys`` the stage definition is looked up in
    ``d`` (unwrapping a ``"do"`` sub-mapping when present) and its ``deps``
    and ``outs`` lists are compared against the path prefixes:

    * ``train*`` stages: ``deps`` against ``extract_features`` and ``outs``
      against ``checkpoints``;
    * ``eval*`` stages: ``deps`` against ``checkpoints``;
    * ``features-*`` stages: ``outs`` against ``extract_features`` (a warning
      is also issued when there is not exactly one output);
    * any other stage name only triggers a "will not be checked" warning.

    Parameters
    ----------
    d
        Mapping from stage name to stage definition.
    keys
        Names of the stages in ``d`` to check.
    extract_features
        Path prefixes of feature-extraction outputs.
    checkpoints
        Path prefixes of checkpoint locations.
    check_type
        ``"presence"``: warn when *no* entry of the inspected list starts
        with one of the prefixes. ``"absence"``: warn when *any* entry does.

    Raises
    ------
    ValueError
        If ``check_type`` is neither ``"presence"`` nor ``"absence"``.
    KeyError
        If a name in ``keys`` is missing from ``d``.
    """
    if check_type not in ("presence", "absence"):
        # An unknown mode used to be skipped silently, which made the whole
        # check look successful without verifying anything.
        raise ValueError(
            f"check_type must be 'presence' or 'absence', got {check_type!r}"
        )
    expect_match = check_type == "presence"

    # Materialise once: the prefixes are consulted for every entry, and a
    # one-shot iterator would be exhausted after the first one. A tuple can
    # also be handed directly to ``str.startswith``.
    feature_prefixes = tuple(extract_features)
    checkpoint_prefixes = tuple(checkpoints)

    def _entry_path(el) -> str:
        """Return the path of a deps/outs entry given as a str or a dict."""
        if isinstance(el, dict):
            if "path" in el:
                # Lock-file form: {"path": ..., "md5": ..., ...}
                return el["path"]
            # Pipeline-file form: {"<path>": {<flags>}} -- the key is the path.
            return next(iter(el), "")
        return el

    def _unexpected(entries, prefixes) -> bool:
        """Tell whether ``entries`` violate the requested presence/absence."""
        matched = any(_entry_path(el).startswith(prefixes) for el in entries)
        return matched != expect_match

    for k in keys:
        d_k = d[k]
        if "do" in d_k:
            d_k = d_k["do"]
        # Default to empty lists so that a stage without deps/outs neither
        # raises nor inherits the lists of the previously visited stage.
        deps = d_k.get("deps") or []
        outs = d_k.get("outs") or []

        # Messages show the full list: the first element need not be the
        # offending one, and the list may be empty.
        if k.startswith("train"):
            if _unexpected(deps, feature_prefixes):
                warnings.warn(f"For stage {k}: found deps {deps}")
            if _unexpected(outs, checkpoint_prefixes):
                warnings.warn(f"For stage {k}: found outs {outs}")
        elif k.startswith("eval"):
            if _unexpected(deps, checkpoint_prefixes):
                warnings.warn(f"For stage {k}: found deps {deps}")
        elif k.startswith("features-"):
            if len(outs) != 1:
                warnings.warn(f"For stage {k}: found outs {outs}")
            if _unexpected(outs, feature_prefixes):
                warnings.warn(f"For stage {k}: found output {outs}")
        else:
            warnings.warn(f"Found key that will not be checked: {k}")
if __name__ == "__main__":
    # Compare the old and new pipeline definitions found in ./dvc, once for
    # the ".yaml" files and once for the ".lock" files.
    dvc_dir = Path.cwd() / "dvc"
    old_base = dvc_dir / "dvc.yaml"
    new_base = dvc_dir / "dvc_new.yaml"

    for ext in (".yaml", ".lock"):
        old_file = old_base.with_suffix(ext)
        new_file = new_base.with_suffix(ext)

        with old_file.open() as fh_old:
            d_old = yaml.safe_load(fh_old)["stages"]
        with new_file.open() as fh_new:
            d_new = yaml.safe_load(fh_new)["stages"]

        old_names = set(d_old)
        new_names = set(d_new)

        # The new file may only drop stages, never introduce new ones.
        assert not new_names - old_names

        keys_diff = old_names - new_names
        print(f"Found {len(keys_diff):,d} differing keys.")

        # Stages that disappeared are expected to reference the listed paths.
        print("Checking diff contents...")
        check_contents(d_old, keys_diff, EXTRACT_FEATURES, CHECKPOINTS, "presence")

        # Stages that remain are expected not to reference them any more.
        print("Checking new contents...")
        check_contents(d_new, d_new.keys(), EXTRACT_FEATURES, CHECKPOINTS, "absence")

    print("Done!")
Checklist
- [x] This PR refers to an issue from the issue tracker
  (if this is not the case, please create an issue first).
Fixes #53.
Description
The following datasets are removed (see #53 description for details).
training/
How to test?
First, get both the old and the new versions of the DVC files:
Then run the following Python script:
Checklist