Open roomrys opened 1 year ago
Potential workarounds:
`SingleImageVideo` instead of individual ones for each image?
`ImgStoreVideo`?
Currently following up with Dan Butler about this.
The culprit ended up being that we cache frames in `SingleImageVideo` to help with switching frames in the GUI (for high-resolution images). However, with 400k labeled frames being cached (not once, but twice: once for `SingleImageVideo.test_frame_`, and another time in `SingleImageVideo.__data`), we experience excessive memory growth during training.
The culprits: https://github.com/talmolab/sleap/blob/5093f6992e6214c0d528b7240331b99d0a89a62f/sleap/io/video.py#L847-L861 https://github.com/talmolab/sleap/blob/5093f6992e6214c0d528b7240331b99d0a89a62f/sleap/io/video.py#L967-L980
`SingleImageVideo`s in the first place?
Our current implementation to import COCO datasets creates one `SingleImageVideo` per image (to handle mismatched image sizes in training, #1024). As a secondary PR, we could modify this to create one `SingleImageVideo` per image size, but we would still be left with the caching problem if many images are of different sizes.
Remove default caching for `SingleImageVideo` and instead allow users to pass an argument (through the GUI) to enable caching. Disable caching during training.
Has this been resolved, @roomrys? I just ran into the same issue :)
Quick update: #1243 partially fixes this.
By disabling `SingleImageVideo` caching, we can open projects with 10^5+ images pretty quickly now.
It doesn't solve some other issues related to annotating those projects or to the downstream training, which will still try to cache the frames and do other unnecessary deserialization/serialization steps.
Bug description
Expected behaviour
SLEAP trains smoothly.
Actual behaviour
SLEAP freezes or just takes forever to run the `make_base_pipeline` command.
Your personal set up
training config (single_instance_no_preload.json)
``` { "data": { "labels": { "training_labels": null, "validation_labels": null, "validation_fraction": 0.1, "test_labels": null, "split_by_inds": false, "training_inds": null, "validation_inds": null, "test_inds": null, "search_path_hints": [], "skeletons": [] }, "preprocessing": { "ensure_rgb": true, "ensure_grayscale": false, "imagenet_mode": null, "input_scaling": 1.0, "pad_to_stride": null, "resize_and_pad_to_target": true, "target_height": null, "target_width": null }, "instance_cropping": { "center_on_part": null, "crop_size": null, "crop_size_detection_padding": 16 } }, "model": { "backbone": { "leap": null, "unet": { "stem_stride": null, "max_stride": 32, "output_stride": 4, "filters": 32, "filters_rate": 1.5, "middle_block": true, "up_interpolate": true, "stacks": 1 }, "hourglass": null, "resnet": null, "pretrained_encoder": null }, "heads": { "single_instance": { "part_names": null, "sigma": 5.0, "output_stride": 4, "offset_refinement": false }, "centroid": null, "centered_instance": null, "multi_instance": null } }, "optimization": { "preload_data": false, "augmentation_config": { "rotate": true, "rotation_min_angle": -180.0, "rotation_max_angle": 180.0, "translate": false, "translate_min": -5, "translate_max": 5, "scale": true, "scale_min": 0.9, "scale_max": 1.1, "uniform_noise": true, "uniform_noise_min_val": 0.0, "uniform_noise_max_val": 10.0, "gaussian_noise": true, "gaussian_noise_mean": 5.0, "gaussian_noise_stddev": 1.0, "contrast": true, "contrast_min_gamma": 0.5, "contrast_max_gamma": 2.0, "brightness": true, "brightness_min_val": 0.0, "brightness_max_val": 10.0, "random_crop": false, "random_crop_height": 256, "random_crop_width": 256, "random_flip": false, "flip_horizontal": false }, "online_shuffling": true, "shuffle_buffer_size": 128, "prefetch": true, "batch_size": 4, "batches_per_epoch": null, "min_batches_per_epoch": 200, "val_batches_per_epoch": null, "min_val_batches_per_epoch": 10, "epochs": 200, "optimizer": "adam", 
"initial_learning_rate": 0.0001, "learning_rate_schedule": { "reduce_on_plateau": true, "reduction_factor": 0.5, "plateau_min_delta": 1e-06, "plateau_patience": 5, "plateau_cooldown": 3, "min_learning_rate": 1e-08 }, "hard_keypoint_mining": { "online_mining": false, "hard_to_easy_ratio": 2.0, "min_hard_keypoints": 2, "max_hard_keypoints": null, "loss_scale": 5.0 }, "early_stopping": { "stop_training_on_plateau": true, "plateau_min_delta": 1e-06, "plateau_patience": 10 } }, "outputs": { "save_outputs": true, "run_name": "221027_161513", "run_name_prefix": "", "run_name_suffix": ".single_instance", "runs_folder": "", "tags": [ "" ], "save_visualizations": true, "delete_viz_images": true, "zip_outputs": false, "log_to_csv": true, "checkpointing": { "initial_model": false, "best_model": true, "every_epoch": false, "latest_model": false, "final_model": false }, "tensorboard": { "write_logs": false, "loss_frequency": "epoch", "architecture_graph": false, "profile_graph": false, "visualizations": true }, "zmq": { "subscribe_to_controller": false, "controller_address": "tcp://127.0.0.1:9000", "controller_polling_timeout": 10, "publish_updates": false, "publish_address": "tcp://127.0.0.1:9001" } }, "name": "", "description": "", "sleap_version": "1.1.5", "filename": "single_instance.json" }
```
How to reproduce
single_instance_no_preload.json
provided above