Open amblypatty opened 1 year ago
For others experiencing this issue:
This issue seems to appear only for the "single instance" pipeline. We will need to make some changes to how we handle converting to grayscale for the "single instance" pipeline. Until then (or long-term, depending on performance), could you try using the top-down pipeline? This should help save on memory if your animal is relatively small compared to the entire video (we find centroids first, then crop around the centroid to find all body parts).
Thanks, Liezl
Bug description
Hi,
I am attempting to adjust the single_instance parameters in the training configuration and to decrease the time taken per epoch during remote training in Google Colab. So, in the Training configuration, I set Convert Image To: grayscale, and then (because I have HD video files as input) I set the input scale to 0.50 to control the receptive field size without increasing the number of down blocks with max stride (which also increases the receptive field size). By doing so, the input shape of each frame is reduced from (1080, 1920, 3) to (544, 960, 1). However, when I run the training on Google Colab, I get an error:
Attached is the training configuration file for reference. If I have incorrectly adjusted parameters, then a message should pop up when attempting to save the training configuration to tell me that this combination of parameters is incompatible.
Thank you!
Expected behaviour
Actual behaviour
Your personal set up
Environment packages
``` # paste output of `pip freeze` or `conda list` here ``` [Training_Config5.zip](https://github.com/talmolab/sleap/files/10282877/Training_Config5.zip)
Logs
``` INFO:numexpr.utils:Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8. INFO:numexpr.utils:NumExpr defaulting to 8 threads. INFO:sleap.nn.training:Versions: SLEAP: 1.2.9 TensorFlow: 2.8.4 Numpy: 1.21.6 Python: 3.8.16 OS: Linux-5.10.133+-x86_64-with-glibc2.27 INFO:sleap.nn.training:Training labels file: BPKmorn22M_test_0-G.pkg.slp INFO:sleap.nn.training:Training profile: single_instance.json INFO:sleap.nn.training: INFO:sleap.nn.training:Arguments: INFO:sleap.nn.training:{ "training_job_path": "single_instance.json", "labels_path": "BPKmorn22M_test_0-G.pkg.slp", "video_paths": [ "" ], "val_labels": null, "test_labels": null, "tensorboard": false, "save_viz": false, "zmq": false, "run_name": "", "prefix": "", "suffix": "", "cpu": false, "first_gpu": false, "last_gpu": false, "gpu": "auto" } INFO:sleap.nn.training: INFO:sleap.nn.training:Training job: INFO:sleap.nn.training:{ "data": { "labels": { "training_labels": null, "validation_labels": null, "validation_fraction": 0.15, "test_labels": null, "split_by_inds": false, "training_inds": null, "validation_inds": null, "test_inds": null, "search_path_hints": [], "skeletons": [] }, "preprocessing": { "ensure_rgb": false, "ensure_grayscale": true, "imagenet_mode": null, "input_scaling": 0.5, "pad_to_stride": null, "resize_and_pad_to_target": true, "target_height": null, "target_width": null }, "instance_cropping": { "center_on_part": null, "crop_size": null, "crop_size_detection_padding": 16 } }, "model": { "backbone": { "leap": null, "unet": { "stem_stride": null, "max_stride": 32, "output_stride": 4, "filters": 32, "filters_rate": 2.0, "middle_block": true, "up_interpolate": true, "stacks": 1 }, "hourglass": null, "resnet": null, "pretrained_encoder": null }, "heads": { "single_instance": { "part_names": null, "sigma": 2.5, "output_stride": 4, "loss_weight": 1.0, "offset_refinement": false }, "centroid": null, "centered_instance": null, "multi_instance": null, 
"multi_class_bottomup": null, "multi_class_topdown": null } }, "optimization": { "preload_data": true, "augmentation_config": { "rotate": true, "rotation_min_angle": -15.0, "rotation_max_angle": 15.0, "translate": false, "translate_min": -5, "translate_max": 5, "scale": true, "scale_min": 0.9, "scale_max": 1.1, "uniform_noise": false, "uniform_noise_min_val": 0.0, "uniform_noise_max_val": 10.0, "gaussian_noise": false, "gaussian_noise_mean": 5.0, "gaussian_noise_stddev": 1.0, "contrast": false, "contrast_min_gamma": 0.5, "contrast_max_gamma": 2.0, "brightness": false, "brightness_min_val": 0.0, "brightness_max_val": 10.0, "random_crop": false, "random_crop_height": 256, "random_crop_width": 256, "random_flip": false, "flip_horizontal": true }, "online_shuffling": true, "shuffle_buffer_size": 128, "prefetch": true, "batch_size": 16, "batches_per_epoch": null, "min_batches_per_epoch": 200, "val_batches_per_epoch": null, "min_val_batches_per_epoch": 10, "epochs": 200, "optimizer": "adam", "initial_learning_rate": 0.0001, "learning_rate_schedule": { "reduce_on_plateau": true, "reduction_factor": 0.5, "plateau_min_delta": 1e-06, "plateau_patience": 5, "plateau_cooldown": 3, "min_learning_rate": 1e-08 }, "hard_keypoint_mining": { "online_mining": false, "hard_to_easy_ratio": 2.0, "min_hard_keypoints": 2, "max_hard_keypoints": null, "loss_scale": 5.0 }, "early_stopping": { "stop_training_on_plateau": true, "plateau_min_delta": 1e-08, "plateau_patience": 10 } }, "outputs": { "save_outputs": true, "run_name": "221221_193459", "run_name_prefix": "", "run_name_suffix": ".single_instance", "runs_folder": "models", "tags": [ "" ], "save_visualizations": true, "delete_viz_images": true, "zip_outputs": false, "log_to_csv": true, "checkpointing": { "initial_model": false, "best_model": true, "every_epoch": false, "latest_model": false, "final_model": false }, "tensorboard": { "write_logs": false, "loss_frequency": "epoch", "architecture_graph": false, "profile_graph": false, 
"visualizations": true }, "zmq": { "subscribe_to_controller": false, "controller_address": "tcp://127.0.0.1:9000", "controller_polling_timeout": 10, "publish_updates": false, "publish_address": "tcp://127.0.0.1:9001" } }, "name": "", "description": "", "sleap_version": "1.2.9", "filename": "single_instance.json" } INFO:sleap.nn.training: INFO:sleap.nn.training:Auto-selected GPU 0 with 40533 MiB of free memory. INFO:sleap.nn.training:Using GPU 0 for acceleration. INFO:sleap.nn.training:Disabled GPU memory pre-allocation. INFO:sleap.nn.training:System: GPUs: 1/1 available Device: /physical_device:GPU:0 Available: True Initalized: False Memory growth: True INFO:sleap.nn.training: INFO:sleap.nn.training:Initializing trainer... INFO:sleap.nn.training:Loading training labels from: BPKmorn22M_test_0-G.pkg.slp INFO:sleap.nn.training:Creating training and validation splits from validation fraction: 0.15 INFO:sleap.nn.training: Splits: Training = 17 / Validation = 3. INFO:sleap.nn.training:Setting up for training... INFO:sleap.nn.training:Setting up pipeline builders... INFO:sleap.nn.training:Setting up model... INFO:sleap.nn.training:Building test pipeline... INFO:sleap.nn.training:Loaded test example. [2.513s] INFO:sleap.nn.training: Input shape: (544, 960, 1) INFO:sleap.nn.training:Created Keras model. 
INFO:sleap.nn.training: Backbone: UNet(stacks=1, filters=32, filters_rate=2.0, kernel_size=3, stem_kernel_size=7, convs_per_block=2, stem_blocks=0, down_blocks=5, middle_block=True, up_blocks=3, up_interpolate=True, block_contraction=False) INFO:sleap.nn.training: Max stride: 32 INFO:sleap.nn.training: Parameters: 31,261,051 INFO:sleap.nn.training: Heads: INFO:sleap.nn.training: [0] = SingleInstanceConfmapsHead(part_names=['prosoma', 'pedicel', 'opisthosoma', 'pedipalpR1', 'pedipalpL1', 'antlegR1', 'antlegR2', 'antlegL1', 'antlegL2', 'forelegR1', 'forelegR2', 'forelegL1', 'forelegL2', 'midlegR1', 'midlegR2', 'midlegL1', 'midlegL2', 'hindlegR1', 'hindlegR2', 'hindlegL1', 'hindlegL2', 'pedipalpR2', 'pedipalpL2', 'antlegR3', 'antlegR4', 'antlegL3', 'antlegL4'], sigma=2.5, output_stride=4, loss_weight=1.0) INFO:sleap.nn.training: Outputs: INFO:sleap.nn.training: [0] = KerasTensor(type_spec=TensorSpec(shape=(None, 136, 240, 27), dtype=tf.float32, name=None), name='SingleInstanceConfmapsHead/BiasAdd:0', description="created by layer 'SingleInstanceConfmapsHead'") INFO:sleap.nn.training:Setting up data pipelines... INFO:sleap.nn.training:Training set: n = 17 INFO:sleap.nn.training:Validation set: n = 3 INFO:sleap.nn.training:Setting up optimization... INFO:sleap.nn.training: Learning rate schedule: LearningRateScheduleConfig(reduce_on_plateau=True, reduction_factor=0.5, plateau_min_delta=1e-06, plateau_patience=5, plateau_cooldown=3, min_learning_rate=1e-08) INFO:sleap.nn.training: Early stopping: EarlyStoppingConfig(stop_training_on_plateau=True, plateau_min_delta=1e-08, plateau_patience=10) INFO:sleap.nn.training:Setting up outputs... INFO:sleap.nn.training:Created run path: models/221221_193459.single_instance INFO:sleap.nn.training:Setting up visualization... Unable to use Qt backend for matplotlib. This probably means Qt is running headless. INFO:sleap.nn.training:Finished trainer set up. 
[3.4s] INFO:sleap.nn.training:Creating tf.data.Datasets for training data generation... INFO:sleap.nn.training:Finished creating training datasets. [3.5s] INFO:sleap.nn.training:Starting training loop... ```
Screenshots
How to reproduce