Closed KennyTC closed 1 year ago
Hi @KennyTC,
Do you mind sharing the training_config.json
that's generated in the model folder when you try to train?
Alternatively, it's also printed in the terminal a few lines above the logs you pasted.
Thanks!
Talmo
Possibly related: #548
Here are my logs, from startup until the error appears.
Saving config: C:\Users/.sleap/1.2.8/preferences.yaml
Restoring GUI state...
Software versions:
SLEAP: 1.2.8
TensorFlow: 2.8.3
Numpy: 1.22.4
Python: 3.8.5
OS: Windows-10-10.0.18362-SP0
Happy SLEAPing! :)
Resetting monitor window.
Polling: C:/Users/Projects/09_animalpose/sleap\models\221104_091657.single_instance.n=1502\viz\validation.*.png
Start training single_instance...
['sleap-train', 'C:\\Users\\AppData\\Local\\Temp\\tmpoc2nr8f1\\221104_091657_training_job.json', 'C:/Users/Projects/09_animalpose/sleap/coco_train01.slp', '--zmq', '--save_viz']
INFO:sleap.nn.training:Versions:
SLEAP: 1.2.8
TensorFlow: 2.8.3
Numpy: 1.22.4
Python: 3.8.5
OS: Windows-10-10.0.18362-SP0
INFO:sleap.nn.training:Training labels file: C:/Users/Projects/09_animalpose/sleap/coco_train01.slp
INFO:sleap.nn.training:Training profile: C:\Users\AppData\Local\Temp\tmpoc2nr8f1\221104_091657_training_job.json
INFO:sleap.nn.training:
INFO:sleap.nn.training:Arguments:
INFO:sleap.nn.training:{
"training_job_path": "C:\\Users\\AppData\\Local\\Temp\\tmpoc2nr8f1\\221104_091657_training_job.json",
"labels_path": "C:/Users/Projects/09_animalpose/sleap/coco_train01.slp",
"video_paths": [
""
],
"val_labels": null,
"test_labels": null,
"tensorboard": false,
"save_viz": true,
"zmq": true,
"run_name": "",
"prefix": "",
"suffix": "",
"cpu": false,
"first_gpu": false,
"last_gpu": false,
"gpu": "auto"
}
INFO:sleap.nn.training:
INFO:sleap.nn.training:Training job:
INFO:sleap.nn.training:{
"data": {
"labels": {
"training_labels": null,
"validation_labels": null,
"validation_fraction": 0.1,
"test_labels": null,
"split_by_inds": false,
"training_inds": null,
"validation_inds": null,
"test_inds": null,
"search_path_hints": [],
"skeletons": []
},
"preprocessing": {
"ensure_rgb": false,
"ensure_grayscale": false,
"imagenet_mode": null,
"input_scaling": 1.0,
"pad_to_stride": null,
"resize_and_pad_to_target": true,
"target_height": null,
"target_width": null
},
"instance_cropping": {
"center_on_part": null,
"crop_size": null,
"crop_size_detection_padding": 16
}
},
"model": {
"backbone": {
"leap": null,
"unet": {
"stem_stride": null,
"max_stride": 16,
"output_stride": 2,
"filters": 16,
"filters_rate": 2.0,
"middle_block": true,
"up_interpolate": true,
"stacks": 1
},
"hourglass": null,
"resnet": null,
"pretrained_encoder": null
},
"heads": {
"single_instance": {
"part_names": null,
"sigma": 2.5,
"output_stride": 2,
"loss_weight": 1.0,
"offset_refinement": false
},
"centroid": null,
"centered_instance": null,
"multi_instance": null,
"multi_class_bottomup": null,
"multi_class_topdown": null
}
},
"optimization": {
"preload_data": true,
"augmentation_config": {
"rotate": true,
"rotation_min_angle": -15.0,
"rotation_max_angle": 15.0,
"translate": false,
"translate_min": -5,
"translate_max": 5,
"scale": false,
"scale_min": 0.9,
"scale_max": 1.1,
"uniform_noise": false,
"uniform_noise_min_val": 0.0,
"uniform_noise_max_val": 10.0,
"gaussian_noise": false,
"gaussian_noise_mean": 5.0,
"gaussian_noise_stddev": 1.0,
"contrast": false,
"contrast_min_gamma": 0.5,
"contrast_max_gamma": 2.0,
"brightness": false,
"brightness_min_val": 0.0,
"brightness_max_val": 10.0,
"random_crop": false,
"random_crop_height": 256,
"random_crop_width": 256,
"random_flip": false,
"flip_horizontal": true
},
"online_shuffling": true,
"shuffle_buffer_size": 128,
"prefetch": true,
"batch_size": 64,
"batches_per_epoch": null,
"min_batches_per_epoch": 200,
"val_batches_per_epoch": null,
"min_val_batches_per_epoch": 10,
"epochs": 10,
"optimizer": "adam",
"initial_learning_rate": 0.0001,
"learning_rate_schedule": {
"reduce_on_plateau": true,
"reduction_factor": 0.5,
"plateau_min_delta": 1e-06,
"plateau_patience": 5,
"plateau_cooldown": 3,
"min_learning_rate": 1e-08
},
"hard_keypoint_mining": {
"online_mining": false,
"hard_to_easy_ratio": 2.0,
"min_hard_keypoints": 2,
"max_hard_keypoints": null,
"loss_scale": 5.0
},
"early_stopping": {
"stop_training_on_plateau": true,
"plateau_min_delta": 1e-08,
"plateau_patience": 10
}
},
"outputs": {
"save_outputs": true,
"run_name": "221104_091657.single_instance.n=1502",
"run_name_prefix": "",
"run_name_suffix": "",
"runs_folder": "C:/Users/Projects/09_animalpose/sleap\\models",
"tags": [
""
],
"save_visualizations": true,
"delete_viz_images": true,
"zip_outputs": false,
"log_to_csv": true,
"checkpointing": {
"initial_model": false,
"best_model": true,
"every_epoch": false,
"latest_model": false,
"final_model": false
},
"tensorboard": {
"write_logs": false,
"loss_frequency": "epoch",
"architecture_graph": false,
"profile_graph": false,
"visualizations": true
},
"zmq": {
"subscribe_to_controller": true,
"controller_address": "tcp://127.0.0.1:9000",
"controller_polling_timeout": 10,
"publish_updates": true,
"publish_address": "tcp://127.0.0.1:9001"
}
},
"name": "",
"description": "",
"sleap_version": "1.2.8",
"filename": "C:\\Users\\AppData\\Local\\Temp\\tmpoc2nr8f1\\221104_091657_training_job.json"
}
INFO:sleap.nn.training:
INFO:sleap.nn.training:Auto-selected GPU 0 with [31808] MiB of free memory.
INFO:sleap.nn.training:Using GPU 0 for acceleration.
INFO:sleap.nn.training:Disabled GPU memory pre-allocation.
INFO:sleap.nn.training:System:
GPUs: 1/1 available
Device: /physical_device:GPU:0
Available: True
Initalized: False
Memory growth: True
INFO:sleap.nn.training:
INFO:sleap.nn.training:Initializing trainer...
INFO:sleap.nn.training:Loading training labels from: C:/Users/Projects/09_animalpose/sleap/coco_train01.slp
INFO:sleap.nn.training:Creating training and validation splits from validation fraction: 0.1
INFO:sleap.nn.training: Splits: Training = 1352 / Validation = 150.
INFO:sleap.nn.training:Setting up for training...
INFO:sleap.nn.training:Setting up pipeline builders...
INFO:sleap.nn.training:Setting up model...
INFO:sleap.nn.training:Building test pipeline...
2022-11-04 09:17:28.397728: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX AVX2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-04 09:17:29.666307: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 26210 MB memory: -> device: 0, name: Quadro GV100, pci bus id: 0000:b3:00.0, compute capability: 7.0
INFO:sleap.nn.training:Loaded test example. [3.015s]
INFO:sleap.nn.training: Input shape: (384, 512, 3)
INFO:sleap.nn.training:Created Keras model.
INFO:sleap.nn.training: Backbone: UNet(stacks=1, filters=16, filters_rate=2.0, kernel_size=3, stem_kernel_size=7, convs_per_block=2, stem_blocks=0, down_blocks=4, middle_block=True, up_blocks=3, up_interpolate=True, block_contraction=False)
INFO:sleap.nn.training: Max stride: 16
INFO:sleap.nn.training: Parameters: 1,954,020
INFO:sleap.nn.training: Heads:
INFO:sleap.nn.training: [0] = SingleInstanceConfmapsHead(part_names=['left_eye', 'right_eye', 'nose', 'left_ear', 'right_ear', 'left_front_elbow', 'right_front_elbow', 'left_back_elbow', 'right_back_elbow', 'left_front_knee', 'right_front_knee', 'left_back_knee', 'right_back_knee', 'left_front_paw', 'right_front_paw', 'left_back_paw', 'right_back_paw', 'throat', 'withers', 'tailbase'], sigma=2.5, output_stride=2, loss_weight=1.0)
INFO:sleap.nn.training: Outputs:
INFO:sleap.nn.training: [0] = KerasTensor(type_spec=TensorSpec(shape=(None, 192, 256, 20), dtype=tf.float32, name=None), name='SingleInstanceConfmapsHead/BiasAdd:0', description="created by layer 'SingleInstanceConfmapsHead'")
INFO:sleap.nn.training:Setting up data pipelines...
INFO:sleap.nn.training:Training set: n = 1352
INFO:sleap.nn.training:Validation set: n = 150
INFO:sleap.nn.training:Setting up optimization...
INFO:sleap.nn.training: Learning rate schedule: LearningRateScheduleConfig(reduce_on_plateau=True, reduction_factor=0.5, plateau_min_delta=1e-06, plateau_patience=5, plateau_cooldown=3, min_learning_rate=1e-08)
INFO:sleap.nn.training: Early stopping: EarlyStoppingConfig(stop_training_on_plateau=True, plateau_min_delta=1e-08, plateau_patience=10)
INFO:sleap.nn.training:Setting up outputs...
INFO:sleap.nn.callbacks:Training controller subscribed to: tcp://127.0.0.1:9000 (topic: )
INFO:sleap.nn.training: ZMQ controller subcribed to: tcp://127.0.0.1:9000
INFO:sleap.nn.callbacks:Progress reporter publishing on: tcp://127.0.0.1:9001 for: not_set
INFO:sleap.nn.training: ZMQ progress reporter publish on: tcp://127.0.0.1:9001
INFO:sleap.nn.training:Created run path: C:/Users/Projects/09_animalpose/sleap\models\221104_091657.single_instance.n=1502
INFO:sleap.nn.training:Setting up visualization...
INFO:sleap.nn.training:Finished trainer set up. [4.1s]
INFO:sleap.nn.training:Creating tf.data.Datasets for training data generation...
Traceback (most recent call last):
File "C:\Users\Anaconda3\lib\runpy.py", line 194, in _run_module_as_main
return _run_code(code, main_globals, None,
File "C:\Users\Anaconda3\lib\runpy.py", line 87, in _run_code
exec(code, run_globals)
File "C:\Users\.virtualenvs\sleap-vuWYjmAg\Scripts\sleap-train.exe\__main__.py", line 7, in <module>
File "C:\Users\.virtualenvs\sleap-vuWYjmAg\lib\site-packages\sleap\nn\training.py", line 1981, in main
trainer.train()
File "C:\Users\.virtualenvs\sleap-vuWYjmAg\lib\site-packages\sleap\nn\training.py", line 914, in train
training_ds = self.training_pipeline.make_dataset()
File "C:\Users\.virtualenvs\sleap-vuWYjmAg\lib\site-packages\sleap\nn\data\pipelines.py", line 287, in make_dataset
ds = transformer.transform_dataset(ds)
File "C:\Users\.virtualenvs\sleap-vuWYjmAg\lib\site-packages\sleap\nn\data\dataset_ops.py", line 318, in transform_dataset
self.examples = list(iter(ds))
File "C:\Users\.virtualenvs\sleap-vuWYjmAg\lib\site-packages\tensorflow\python\data\ops\iterator_ops.py", line 836, in __next__
return self._next_internal()
File "C:\Users\.virtualenvs\sleap-vuWYjmAg\lib\site-packages\tensorflow\python\data\ops\iterator_ops.py", line 819, in _next_internal
ret = gen_dataset_ops.iterator_get_next(
File "C:\Users\.virtualenvs\sleap-vuWYjmAg\lib\site-packages\tensorflow\python\ops\gen_dataset_ops.py", line 2922, in iterator_get_next
_ops.raise_from_not_ok_status(e, name)
File "C:\Users\.virtualenvs\sleap-vuWYjmAg\lib\site-packages\tensorflow\python\framework\ops.py", line 7186, in raise_from_not_ok_status
raise core._status_to_exception(e) from None # pylint: disable=protected-access
tensorflow.python.framework.errors_impl.InvalidArgumentError: Shape of tensor EagerPyFunc [375,500,3] is not compatible with expected shape [150,300,3].
[[{{node EnsureShape}}]] [Op:IteratorGetNext]
2022-11-04 09:17:32.818151: W tensorflow/core/kernels/data/cache_dataset_ops.cc:768] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.
INFO:sleap.nn.callbacks:Closing the reporter controller/context.
INFO:sleap.nn.callbacks:Closing the training controller socket/context.
Run Path: C:/Users/Projects/09_animalpose/sleap\models\221104_091657.single_instance.n=1502
If you have any findings, please share with us!! @talmo
The dataset contains a mix of different size images. I was unable to load the dataset using our Import from Coco...
and will need to make some modifications to the .json
to do so.
However, I wonder if the problem is that the SingleImageVideo
backend expects all images in that "video" to be of the same size. Hence, when we create a data provider for training, it uses a single example frame stored in SingleImageVideo._test_image
to determine the shape of ALL images in the SingleImageVideo
- which would be incorrect for an instance of SingleImageVideo
containing images of all different sizes.
This hypothesis would make even more sense if the resize_and_pad
only applies to different "videos" - i.e. we do not resize and pad for individual images all contained in a single instance of SingleImageVideo
, but will resize and pad for separate instances of SingleImageVideo
(or any other backend).
Again, I will need to make some modifications to be able to import and verify this hypothesis.
Hi @KennyTC,
Do you mind sharing the .slp
with us or the code used to generate the SLEAP project?
Thanks, Liezl
Hi @roomrys 1) Thanks for your help. The dataset is originally from here: https://sites.google.com/view/animal-pose/ Part I. Keypoint-labeled animal data (4,000+ images, five categories): [Google Drive]. I extracted only the dog images, so the total is around 1502 images.
2) What surprised me is that when I used the above 1502 images plus around 500 images that I created on my own, I could run single-animal training without any error (even though the training loss was very low, around 0.001 after only 2 or 3 epochs, which is weird). But at least there was no error.
Thanks,
Closing this issue because it seems the error was in how the dataset was loaded into SLEAP - i.e., not through SLEAP's COCO import adaptor. If using a single SingleImageVideo
object to hold multiple images, these images should all have the same size. Our COCO adaptor loads each image into its own SingleImageVideo
object. The provided dataset had differently sized images inside a single SingleImageVideo
class.
During training, I got the following error.
I have read the issue here https://github.com/talmolab/sleap/issues/735 but my issue seems different because the shapes of my tensors are all 3.
The data I used for training are here: https://drive.google.com/drive/folders/1xxm6ZjfsDSmv6C9JvbgiGrmHktrUjV5x?usp=sharing
Please give me some hints!