talmolab / sleap

A deep learning framework for multi-animal pose tracking.
https://sleap.ai
Other
425 stars 97 forks source link

Error when training new model with corrected frames of previous model's predictions #517

Closed Xiaoyu-Tong closed 3 years ago

Xiaoyu-Tong commented 3 years ago

Hi,

I successfully trained a model with SLEAP. And I really like the interactive GUI, thank you for developing that!

However, when I tried to train a new model with both the annotated frames I used for the first model and some corrected frames from the first model's predictions, I got the error shown below.

Specifically, this is what I did:

  1. Load the prediction.slp in SLEAP GUI.
  2. Double-click instances to activate them, then correct the labels.
  3. Click "Add current frame" to add them one by one to the training set.
  4. Merge this SLEAP project (predictions with correction) to the SLEAP project I used to train the first model.
  5. Export the training package.
  6. Train it on Colab by following your notebook.

It may be worth mentioning that the videos I am using are grayscale videos with only one channel. But the first model was trained very smoothly.

The output message (error) I got: 2021-03-17 18:30:04.104526: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1 INFO:sleap.nn.training:Versions: SLEAP: 1.1.0 TensorFlow: 2.3.1 Numpy: 1.18.5 Python: 3.7.10 OS: Linux-4.19.112+-x86_64-with-Ubuntu-18.04-bionic INFO:sleap.nn.training:Training labels file: 2BMv2.pkg.slp INFO:sleap.nn.training:Training profile: /usr/local/lib/python3.7/dist-packages/sleap/training_profiles/baseline_medium_rf.bottomup.json INFO:sleap.nn.training: INFO:sleap.nn.training:Arguments: INFO:sleap.nn.training:{ "training_job_path": "baseline_medium_rf.bottomup.json", "labels_path": "2BMv2.pkg.slp", "video_paths": "", "val_labels": null, "test_labels": null, "tensorboard": false, "save_viz": false, "zmq": false, "run_name": "2BMv2.bottomup", "prefix": "", "suffix": "" } INFO:sleap.nn.training: INFO:sleap.nn.training:Training job: INFO:sleap.nn.training:{ "data": { "labels": { "training_labels": null, "validation_labels": null, "validation_fraction": 0.1, "test_labels": null, "split_by_inds": false, "training_inds": null, "validation_inds": null, "test_inds": null, "search_path_hints": [], "skeletons": [] }, "preprocessing": { "ensure_rgb": false, "ensure_grayscale": false, "imagenet_mode": null, "input_scaling": 1.0, "pad_to_stride": null, "resize_and_pad_to_target": true, "target_height": null, "target_width": null }, "instance_cropping": { "center_on_part": null, "crop_size": null, "crop_size_detection_padding": 16 } }, "model": { "backbone": { "leap": null, "unet": { "stem_stride": null, "max_stride": 32, "output_stride": 4, "filters": 16, "filters_rate": 2.0, "middle_block": true, "up_interpolate": true, "stacks": 1 }, "hourglass": null, "resnet": null, "pretrained_encoder": null }, "heads": { "single_instance": null, "centroid": null, "centered_instance": null, "multi_instance": { "confmaps": { "part_names": null, "sigma": 2.5, "output_stride": 4, "loss_weight": 1.0, 
"offset_refinement": false }, "pafs": { "edges": null, "sigma": 75.0, "output_stride": 8, "loss_weight": 1.0 } } } }, "optimization": { "preload_data": true, "augmentation_config": { "rotate": true, "rotation_min_angle": -180.0, "rotation_max_angle": 180.0, "translate": false, "translate_min": -5, "translate_max": 5, "scale": false, "scale_min": 0.9, "scale_max": 1.1, "uniform_noise": false, "uniform_noise_min_val": 0.0, "uniform_noise_max_val": 10.0, "gaussian_noise": false, "gaussian_noise_mean": 5.0, "gaussian_noise_stddev": 1.0, "contrast": false, "contrast_min_gamma": 0.5, "contrast_max_gamma": 2.0, "brightness": false, "brightness_min_val": 0.0, "brightness_max_val": 10.0, "random_crop": false, "random_crop_height": 256, "random_crop_width": 256, "random_flip": false, "flip_horizontal": true }, "online_shuffling": true, "shuffle_buffer_size": 128, "prefetch": true, "batch_size": 4, "batches_per_epoch": null, "min_batches_per_epoch": 200, "val_batches_per_epoch": null, "min_val_batches_per_epoch": 10, "epochs": 200, "optimizer": "adam", "initial_learning_rate": 0.0001, "learning_rate_schedule": { "reduce_on_plateau": true, "reduction_factor": 0.5, "plateau_min_delta": 1e-06, "plateau_patience": 8, "plateau_cooldown": 3, "min_learning_rate": 1e-08 }, "hard_keypoint_mining": { "online_mining": false, "hard_to_easy_ratio": 2.0, "min_hard_keypoints": 2, "max_hard_keypoints": null, "loss_scale": 5.0 }, "early_stopping": { "stop_training_on_plateau": true, "plateau_min_delta": 1e-06, "plateau_patience": 10 } }, "outputs": { "save_outputs": true, "run_name": "2BMv2.bottomup", "run_name_prefix": "", "run_name_suffix": null, "runs_folder": "models", "tags": [], "save_visualizations": true, "delete_viz_images": true, "zip_outputs": false, "log_to_csv": true, "checkpointing": { "initial_model": false, "best_model": true, "every_epoch": false, "latest_model": false, "final_model": false }, "tensorboard": { "write_logs": false, "loss_frequency": "epoch", 
"architecture_graph": false, "profile_graph": false, "visualizations": true }, "zmq": { "subscribe_to_controller": false, "controller_address": "tcp://127.0.0.1:9000", "controller_polling_timeout": 10, "publish_updates": false, "publish_address": "tcp://127.0.0.1:9001" } }, "name": "", "description": "", "sleap_version": "1.1.0", "filename": "/usr/local/lib/python3.7/dist-packages/sleap/training_profiles/baseline_medium_rf.bottomup.json" } INFO:sleap.nn.training: INFO:sleap.nn.training:System: 2021-03-17 18:30:05.743608: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcuda.so.1 2021-03-17 18:30:05.768257: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-17 18:30:05.768897: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1716] Found device 0 with properties: pciBusID: 0000:00:04.0 name: Tesla V100-SXM2-16GB computeCapability: 7.0 coreClock: 1.53GHz coreCount: 80 deviceMemorySize: 15.78GiB deviceMemoryBandwidth: 836.37GiB/s 2021-03-17 18:30:05.768964: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1 2021-03-17 18:30:05.952332: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcublas.so.10 2021-03-17 18:30:05.957281: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcufft.so.10 2021-03-17 18:30:05.965076: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcurand.so.10 2021-03-17 18:30:05.981527: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcusolver.so.10 2021-03-17 18:30:06.001245: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library 
libcusparse.so.10 2021-03-17 18:30:06.371466: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudnn.so.7 2021-03-17 18:30:06.371733: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-17 18:30:06.372497: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-17 18:30:06.373034: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1858] Adding visible gpu devices: 0 GPUs: 1/1 available Device: /physical_device:GPU:0 Available: True Initalized: False Memory growth: True INFO:sleap.nn.training: INFO:sleap.nn.training:Initializing trainer... INFO:sleap.nn.training:Loading training labels from: 2BMv2.pkg.slp INFO:sleap.nn.training:Creating training and validation splits from validation fraction: 0.1 INFO:sleap.nn.training: Splits: Training = 581 / Validation = 64. INFO:sleap.nn.training:Setting up for training... INFO:sleap.nn.training:Setting up pipeline builders... INFO:sleap.nn.training:Setting up model... INFO:sleap.nn.training:Building test pipeline... 2021-03-17 18:30:09.964350: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN)to use the following CPU instructions in performance-critical operations: AVX2 AVX512F FMA To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. 2021-03-17 18:30:09.973359: I tensorflow/core/platform/profile_utils/cpu_utils.cc:104] CPU Frequency: 2000134999 Hz 2021-03-17 18:30:09.973646: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x55e08cb94fc0 initialized for platform Host (this does not guarantee that XLA will be used). 
Devices: 2021-03-17 18:30:09.973678: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Host, Default Version 2021-03-17 18:30:10.097990: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-17 18:30:10.099198: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x55e08cb95dc0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices: 2021-03-17 18:30:10.099427: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Tesla V100-SXM2-16GB, Compute Capability 7.0 2021-03-17 18:30:10.099756: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-17 18:30:10.100637: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1716] Found device 0 with properties: pciBusID: 0000:00:04.0 name: Tesla V100-SXM2-16GB computeCapability: 7.0 coreClock: 1.53GHz coreCount: 80 deviceMemorySize: 15.78GiB deviceMemoryBandwidth: 836.37GiB/s 2021-03-17 18:30:10.100762: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1 2021-03-17 18:30:10.100870: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcublas.so.10 2021-03-17 18:30:10.100919: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcufft.so.10 2021-03-17 18:30:10.100960: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcurand.so.10 2021-03-17 18:30:10.100993: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcusolver.so.10 2021-03-17 18:30:10.101024: I 
tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcusparse.so.10 2021-03-17 18:30:10.101056: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudnn.so.7 2021-03-17 18:30:10.101167: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-17 18:30:10.102116: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-17 18:30:10.102969: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1858] Adding visible gpu devices: 0 2021-03-17 18:30:10.103090: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1 2021-03-17 18:30:10.723669: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1257] Device interconnect StreamExecutor with strength 1 edge matrix: 2021-03-17 18:30:10.723736: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1263] 0 2021-03-17 18:30:10.723748: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1276] 0: N 2021-03-17 18:30:10.724007: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-17 18:30:10.724747: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-17 18:30:10.725314: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1402] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 14764 MB memory) -> physical GPU (device: 0, name: Tesla V100-SXM2-16GB, pci bus id: 0000:00:04.0, compute 
capability: 7.0) INFO:sleap.nn.training:Loaded test example. [3.011s] INFO:sleap.nn.training: Input shape: (512, 672, 3) INFO:sleap.nn.training:Created Keras model. INFO:sleap.nn.training: Backbone: UNet(stacks=1, filters=16, filters_rate=2.0, kernel_size=3, stem_kernel_size=7, convs_per_block=2, stem_blocks=0, down_blocks=5, middle_block=True, up_blocks=3, up_interpolate=True, block_contraction=False) INFO:sleap.nn.training: Max stride: 32 INFO:sleap.nn.training: Parameters: 7,820,663 INFO:sleap.nn.training: Heads: INFO:sleap.nn.training: [0] = MultiInstanceConfmapsHead(part_names=['Ear_left', 'Ear_right', 'Nose', 'Head', 'Neck', 'Center', 'Lateral_left', 'Lateral_right', 'Tail_base'], sigma=2.5, output_stride=4, loss_weight=1.0) INFO:sleap.nn.training: [1] = PartAffinityFieldsHead(edges=[('Ear_left', 'Ear_right'), ('Ear_left', 'Nose'), ('Ear_left', 'Head'), ('Ear_right', 'Nose'), ('Ear_right', 'Head'), ('Nose', 'Head'), ('Head', 'Neck'), ('Neck', 'Center'), ('Neck', 'Lateral_left'), ('Neck', 'Lateral_right'), ('Center', 'Lateral_left'), ('Center', 'Lateral_right'), ('Center', 'Tail_base'), ('Lateral_left', 'Tail_base'), ('Lateral_right', 'Tail_base')], sigma=75.0, output_stride=8, loss_weight=1.0) INFO:sleap.nn.training: Outputs: INFO:sleap.nn.training: [0] = Tensor("MultiInstanceConfmapsHead_0/BiasAdd:0", shape=(None, 128, 168, 9), dtype=float32) INFO:sleap.nn.training: [1] = Tensor("PartAffinityFieldsHead_0/BiasAdd:0", shape=(None, 64, 84, 30), dtype=float32) INFO:sleap.nn.training:Setting up data pipelines... INFO:sleap.nn.training:Training set: n = 581 INFO:sleap.nn.training:Validation set: n = 64 INFO:sleap.nn.training:Setting up optimization... 
INFO:sleap.nn.training: Learning rate schedule: LearningRateScheduleConfig(reduce_on_plateau=True, reduction_factor=0.5, plateau_min_delta=1e-06, plateau_patience=8, plateau_cooldown=3, min_learning_rate=1e-08) INFO:sleap.nn.training: Early stopping: EarlyStoppingConfig(stop_training_on_plateau=True, plateau_min_delta=1e-06, plateau_patience=10) INFO:sleap.nn.training:Setting up outputs... INFO:sleap.nn.training:Created run path: models/2BMv2.bottomup_1 INFO:sleap.nn.training:Setting up visualization... Unable to use Qt backend for matplotlib. This probably means Qt is running headless. INFO:sleap.nn.training:Finished trainer set up. [6.7s] INFO:sleap.nn.training:Creating tf.data.Datasets for training data generation... Traceback (most recent call last): File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/eager/context.py", line 2102, in execution_mode yield File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/data/ops/iterator_ops.py", line 758, in _next_internal output_shapes=self._flat_output_shapes) File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/gen_dataset_ops.py", line 2610, in iterator_get_next _ops.raise_from_not_ok_status(e, name) File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/framework/ops.py", line 6843, in raise_from_not_ok_status six.raise_from(core._status_to_exception(e.code, message), None) File "", line 3, in raise_from tensorflow.python.framework.errors_impl.InvalidArgumentError: Shape of tensor EagerPyFunc [492,656,1] is not compatible with expected shape [492,656,3]. [[{{node EnsureShape}}]] [Op:IteratorGetNext]

During handling of the above exception, another exception occurred:

Traceback (most recent call last): File "/usr/local/bin/sleap-train", line 8, in sys.exit(main()) File "/usr/local/lib/python3.7/dist-packages/sleap/nn/training.py", line 1567, in main trainer.train() File "/usr/local/lib/python3.7/dist-packages/sleap/nn/training.py", line 879, in train training_ds = self.training_pipeline.make_dataset() File "/usr/local/lib/python3.7/dist-packages/sleap/nn/data/pipelines.py", line 282, in make_dataset ds = transformer.transform_dataset(ds) File "/usr/local/lib/python3.7/dist-packages/sleap/nn/data/dataset_ops.py", line 318, in transform_dataset self.examples = list(iter(ds)) File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/data/ops/iterator_ops.py", line 736, in next return self.next() File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/data/ops/iterator_ops.py", line 772, in next return self._next_internal() File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/data/ops/iterator_ops.py", line 764, in _next_internal return structure.from_compatible_tensor_list(self._element_spec, ret) File "/usr/lib/python3.7/contextlib.py", line 130, in exit self.gen.throw(type, value, traceback) File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/eager/context.py", line 2105, in execution_mode executor_new.wait() File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/eager/executor.py", line 67, in wait pywrap_tfe.TFE_ExecutorWaitForAllPendingNodes(self._handle) tensorflow.python.framework.errors_impl.InvalidArgumentError: Shape of tensor EagerPyFunc [492,656,1] is not compatible with expected shape [492,656,3]. [[{{node EnsureShape}}]] 2021-03-17 18:30:17.431635: W tensorflow/core/kernels/data/cache_dataset_ops.cc:798] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset will be discarded. This can happen if you have an input pipeline similar to dataset.cache().take(k).repeat(). 
You should use dataset.take(k).cache().repeat() instead.

Could you please help me work around this? Preferably in a way that lets me keep using the current training package, or at least the current labels. I have put a lot of work into correcting the predicted frames, and re-correcting them would be rather painful... Thank you in advance!

Xiaoyu

Xiaoyu-Tong commented 3 years ago

I tried setting "convert images to greyscale" before exporting the training package. I got a different error, shown below, when I tried to train the model:

2021-03-17 23:46:22.913327: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1 INFO:sleap.nn.training:Versions: SLEAP: 1.1.0 TensorFlow: 2.3.1 Numpy: 1.18.5 Python: 3.7.10 OS: Linux-4.19.112+-x86_64-with-Ubuntu-18.04-bionic INFO:sleap.nn.training:Training labels file: 2BMv2.pkg.slp INFO:sleap.nn.training:Training profile: /usr/local/lib/python3.7/dist-packages/sleap/training_profiles/baseline_medium_rf.bottomup.json INFO:sleap.nn.training: INFO:sleap.nn.training:Arguments: INFO:sleap.nn.training:{ "training_job_path": "baseline_medium_rf.bottomup.json", "labels_path": "2BMv2.pkg.slp", "video_paths": "", "val_labels": null, "test_labels": null, "tensorboard": false, "save_viz": false, "zmq": false, "run_name": "2BMv2.bottomup", "prefix": "", "suffix": "" } INFO:sleap.nn.training: INFO:sleap.nn.training:Training job: INFO:sleap.nn.training:{ "data": { "labels": { "training_labels": null, "validation_labels": null, "validation_fraction": 0.1, "test_labels": null, "split_by_inds": false, "training_inds": null, "validation_inds": null, "test_inds": null, "search_path_hints": [], "skeletons": [] }, "preprocessing": { "ensure_rgb": false, "ensure_grayscale": false, "imagenet_mode": null, "input_scaling": 1.0, "pad_to_stride": null, "resize_and_pad_to_target": true, "target_height": null, "target_width": null }, "instance_cropping": { "center_on_part": null, "crop_size": null, "crop_size_detection_padding": 16 } }, "model": { "backbone": { "leap": null, "unet": { "stem_stride": null, "max_stride": 32, "output_stride": 4, "filters": 16, "filters_rate": 2.0, "middle_block": true, "up_interpolate": true, "stacks": 1 }, "hourglass": null, "resnet": null, "pretrained_encoder": null }, "heads": { "single_instance": null, "centroid": null, "centered_instance": null, "multi_instance": { "confmaps": { "part_names": null, "sigma": 2.5, "output_stride": 4, "loss_weight": 1.0, "offset_refinement": false }, "pafs": { 
"edges": null, "sigma": 75.0, "output_stride": 8, "loss_weight": 1.0 } } } }, "optimization": { "preload_data": true, "augmentation_config": { "rotate": true, "rotation_min_angle": -180.0, "rotation_max_angle": 180.0, "translate": false, "translate_min": -5, "translate_max": 5, "scale": false, "scale_min": 0.9, "scale_max": 1.1, "uniform_noise": false, "uniform_noise_min_val": 0.0, "uniform_noise_max_val": 10.0, "gaussian_noise": false, "gaussian_noise_mean": 5.0, "gaussian_noise_stddev": 1.0, "contrast": false, "contrast_min_gamma": 0.5, "contrast_max_gamma": 2.0, "brightness": false, "brightness_min_val": 0.0, "brightness_max_val": 10.0, "random_crop": false, "random_crop_height": 256, "random_crop_width": 256, "random_flip": false, "flip_horizontal": true }, "online_shuffling": true, "shuffle_buffer_size": 128, "prefetch": true, "batch_size": 4, "batches_per_epoch": null, "min_batches_per_epoch": 200, "val_batches_per_epoch": null, "min_val_batches_per_epoch": 10, "epochs": 200, "optimizer": "adam", "initial_learning_rate": 0.0001, "learning_rate_schedule": { "reduce_on_plateau": true, "reduction_factor": 0.5, "plateau_min_delta": 1e-06, "plateau_patience": 8, "plateau_cooldown": 3, "min_learning_rate": 1e-08 }, "hard_keypoint_mining": { "online_mining": false, "hard_to_easy_ratio": 2.0, "min_hard_keypoints": 2, "max_hard_keypoints": null, "loss_scale": 5.0 }, "early_stopping": { "stop_training_on_plateau": true, "plateau_min_delta": 1e-06, "plateau_patience": 10 } }, "outputs": { "save_outputs": true, "run_name": "2BMv2.bottomup", "run_name_prefix": "", "run_name_suffix": null, "runs_folder": "models", "tags": [], "save_visualizations": true, "delete_viz_images": true, "zip_outputs": false, "log_to_csv": true, "checkpointing": { "initial_model": false, "best_model": true, "every_epoch": false, "latest_model": false, "final_model": false }, "tensorboard": { "write_logs": false, "loss_frequency": "epoch", "architecture_graph": false, "profile_graph": false, 
"visualizations": true }, "zmq": { "subscribe_to_controller": false, "controller_address": "tcp://127.0.0.1:9000", "controller_polling_timeout": 10, "publish_updates": false, "publish_address": "tcp://127.0.0.1:9001" } }, "name": "", "description": "", "sleap_version": "1.1.0", "filename": "/usr/local/lib/python3.7/dist-packages/sleap/training_profiles/baseline_medium_rf.bottomup.json" } INFO:sleap.nn.training: INFO:sleap.nn.training:System: 2021-03-17 23:46:24.561511: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcuda.so.1 2021-03-17 23:46:24.586686: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-17 23:46:24.587347: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1716] Found device 0 with properties: pciBusID: 0000:00:04.0 name: Tesla V100-SXM2-16GB computeCapability: 7.0 coreClock: 1.53GHz coreCount: 80 deviceMemorySize: 15.78GiB deviceMemoryBandwidth: 836.37GiB/s 2021-03-17 23:46:24.587422: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1 2021-03-17 23:46:24.763484: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcublas.so.10 2021-03-17 23:46:24.775344: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcufft.so.10 2021-03-17 23:46:24.780991: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcurand.so.10 2021-03-17 23:46:24.795277: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcusolver.so.10 2021-03-17 23:46:24.815002: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcusparse.so.10 2021-03-17 23:46:25.162332: I 
tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudnn.so.7 2021-03-17 23:46:25.162614: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-17 23:46:25.163372: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-17 23:46:25.163923: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1858] Adding visible gpu devices: 0 GPUs: 1/1 available Device: /physical_device:GPU:0 Available: True Initalized: False Memory growth: True INFO:sleap.nn.training: INFO:sleap.nn.training:Initializing trainer... INFO:sleap.nn.training:Loading training labels from: 2BMv2.pkg.slp INFO:sleap.nn.training:Creating training and validation splits from validation fraction: 0.1 INFO:sleap.nn.training: Splits: Training = 581 / Validation = 64. INFO:sleap.nn.training:Setting up for training... INFO:sleap.nn.training:Setting up pipeline builders... INFO:sleap.nn.training:Setting up model... INFO:sleap.nn.training:Building test pipeline... 2021-03-17 23:46:28.857347: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN)to use the following CPU instructions in performance-critical operations: AVX2 AVX512F FMA To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. 2021-03-17 23:46:28.862459: I tensorflow/core/platform/profile_utils/cpu_utils.cc:104] CPU Frequency: 2000129999 Hz 2021-03-17 23:46:28.862731: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x5611e8300fc0 initialized for platform Host (this does not guarantee that XLA will be used). 
Devices: 2021-03-17 23:46:28.862769: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Host, Default Version 2021-03-17 23:46:28.951354: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-17 23:46:28.952317: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x5611e8301dc0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices: 2021-03-17 23:46:28.952354: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Tesla V100-SXM2-16GB, Compute Capability 7.0 2021-03-17 23:46:28.952691: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-17 23:46:28.953349: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1716] Found device 0 with properties: pciBusID: 0000:00:04.0 name: Tesla V100-SXM2-16GB computeCapability: 7.0 coreClock: 1.53GHz coreCount: 80 deviceMemorySize: 15.78GiB deviceMemoryBandwidth: 836.37GiB/s 2021-03-17 23:46:28.953540: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1 2021-03-17 23:46:28.953618: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcublas.so.10 2021-03-17 23:46:28.953650: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcufft.so.10 2021-03-17 23:46:28.953674: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcurand.so.10 2021-03-17 23:46:28.953709: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcusolver.so.10 2021-03-17 23:46:28.953753: I 
tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcusparse.so.10 2021-03-17 23:46:28.953783: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudnn.so.7 2021-03-17 23:46:28.953916: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-17 23:46:28.954641: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-17 23:46:28.955223: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1858] Adding visible gpu devices: 0 2021-03-17 23:46:28.955316: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1 2021-03-17 23:46:29.639419: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1257] Device interconnect StreamExecutor with strength 1 edge matrix: 2021-03-17 23:46:29.639484: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1263] 0 2021-03-17 23:46:29.639508: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1276] 0: N 2021-03-17 23:46:29.639858: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-17 23:46:29.640884: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-17 23:46:29.641709: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1402] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 14764 MB memory) -> physical GPU (device: 0, name: Tesla V100-SXM2-16GB, pci bus id: 0000:00:04.0, compute 
capability: 7.0) INFO:sleap.nn.training:Loaded test example. [3.147s] INFO:sleap.nn.training: Input shape: (512, 672, 3) INFO:sleap.nn.training:Created Keras model. INFO:sleap.nn.training: Backbone: UNet(stacks=1, filters=16, filters_rate=2.0, kernel_size=3, stem_kernel_size=7, convs_per_block=2, stem_blocks=0, down_blocks=5, middle_block=True, up_blocks=3, up_interpolate=True, block_contraction=False) INFO:sleap.nn.training: Max stride: 32 INFO:sleap.nn.training: Parameters: 7,820,663 INFO:sleap.nn.training: Heads: INFO:sleap.nn.training: [0] = MultiInstanceConfmapsHead(part_names=['Ear_left', 'Ear_right', 'Nose', 'Head', 'Neck', 'Center', 'Lateral_left', 'Lateral_right', 'Tail_base'], sigma=2.5, output_stride=4, loss_weight=1.0) INFO:sleap.nn.training: [1] = PartAffinityFieldsHead(edges=[('Ear_left', 'Ear_right'), ('Ear_left', 'Nose'), ('Ear_left', 'Head'), ('Ear_right', 'Nose'), ('Ear_right', 'Head'), ('Nose', 'Head'), ('Head', 'Neck'), ('Neck', 'Center'), ('Neck', 'Lateral_left'), ('Neck', 'Lateral_right'), ('Center', 'Lateral_left'), ('Center', 'Lateral_right'), ('Center', 'Tail_base'), ('Lateral_left', 'Tail_base'), ('Lateral_right', 'Tail_base')], sigma=75.0, output_stride=8, loss_weight=1.0) INFO:sleap.nn.training: Outputs: INFO:sleap.nn.training: [0] = Tensor("MultiInstanceConfmapsHead_0/BiasAdd:0", shape=(None, 128, 168, 9), dtype=float32) INFO:sleap.nn.training: [1] = Tensor("PartAffinityFieldsHead_0/BiasAdd:0", shape=(None, 64, 84, 30), dtype=float32) INFO:sleap.nn.training:Setting up data pipelines... INFO:sleap.nn.training:Training set: n = 581 INFO:sleap.nn.training:Validation set: n = 64 INFO:sleap.nn.training:Setting up optimization... 
INFO:sleap.nn.training: Learning rate schedule: LearningRateScheduleConfig(reduce_on_plateau=True, reduction_factor=0.5, plateau_min_delta=1e-06, plateau_patience=8, plateau_cooldown=3, min_learning_rate=1e-08) INFO:sleap.nn.training: Early stopping: EarlyStoppingConfig(stop_training_on_plateau=True, plateau_min_delta=1e-06, plateau_patience=10) INFO:sleap.nn.training:Setting up outputs... INFO:sleap.nn.training:Created run path: models/2BMv2.bottomup INFO:sleap.nn.training:Setting up visualization... WARNING:tensorflow:Model was constructed with shape (None, 512, 672, 3) for input Tensor("input:0", shape=(None, 512, 672, 3), dtype=float32), but it was called on an input with incompatible shape (None, 512, 672, 1). Traceback (most recent call last): File "/usr/local/bin/sleap-train", line 8, in sys.exit(main()) File "/usr/local/lib/python3.7/dist-packages/sleap/nn/training.py", line 1567, in main trainer.train() File "/usr/local/lib/python3.7/dist-packages/sleap/nn/training.py", line 875, in train self.setup() File "/usr/local/lib/python3.7/dist-packages/sleap/nn/training.py", line 869, in setup self._setup_visualization() File "/usr/local/lib/python3.7/dist-packages/sleap/nn/training.py", line 1360, in _setup_visualization validation_viz_ds_iter = iter(self.validation_viz_pipeline.make_dataset()) File "/usr/local/lib/python3.7/dist-packages/sleap/nn/data/pipelines.py", line 282, in make_dataset ds = transformer.transform_dataset(ds) File "/usr/local/lib/python3.7/dist-packages/sleap/nn/data/inference.py", line 40, in transform_dataset keras_model = tf.keras.Model(input_layers, self.keras_model(input_layers)) File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/engine/base_layer.py", line 926, in call input_list) File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/engine/base_layer.py", line 1117, in _functional_construction_call outputs = call_fn(cast_inputs, *args, *kwargs) File 
"/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/engine/functional.py", line 386, in call inputs, training=training, mask=mask) File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/engine/functional.py", line 508, in _run_internal_graph outputs = node.layer(args, **kwargs) File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/engine/base_layer.py", line 926, in call input_list) File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/engine/base_layer.py", line 1092, in _functional_construction_call input_spec.assert_input_compatibility(self.input_spec, inputs, self.name) File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/engine/input_spec.py", line 216, in assert_input_compatibility ' but received input with shape ' + str(shape)) ValueError: Input 0 of layer stack0_enc0_conv0 is incompatible with the layer: expected axis -1 of input shape to have value 3 but received input with shape [None, 512, 672, 1]

Xiaoyu-Tong commented 3 years ago

Also, setting "convert images to RGB" gave me the same error as not specifying any "convert images to" option.

Xiaoyu-Tong commented 3 years ago

I did a little more investigation into this error. Based on the GUI, some of my videos have 3 channels and some have 1 channel. I want to use them together, and converting them all to either RGB or grayscale is fine with me. Based on the error message, I am guessing that your code is still trying to load RGB images even when I set "convert images to grayscale". I am not sure whether the mixture of video formats caused this problem. Thank you so much in advance for your help!

talmo commented 3 years ago

Hi @Xiaoyu-Tong,

Just wanted to give you a quick update that we're working on this issue.

While we do that, do you mind telling us a little more about the video formats? Are they all RGB or grayscale originally?

SLEAP should be able to handle this case anyway, but while we track it down, one potential workaround is to replace them with the same files by going to File -> Replace videos... and importing them all consistently as RGB or grayscale.

If you get a chance to try that, let me know if it worked and it'll help us find the source of the error.

Thanks!

Xiaoyu-Tong commented 3 years ago

Hi, @talmo

Thanks for the update! The videos are originally grayscale mp4 files. I tried to replace the one that is different from the others, but did not find a way to specify its channel number, so after the replacement the difference remains. It is probably worth mentioning that I imported most of my "videos" as images (from DLC), so SLEAP does not have access to the entire videos. These video files are originally grayscale, but the imported images are RGB. Anyway, it worked well before I added a full video to the project. The full video is grayscale and remains grayscale after import, and I did not find a way to specify its channel number. As I mentioned above, I have also tried converting them to RGB/grayscale before training, but with no luck.

Thanks!

talmo commented 3 years ago

Ah shoot, I thought we had added an option to set the number of channels from the replace video dialog. This is on our to-do list!

Again, it shouldn't have been an issue to begin with, but as a workaround open up python/ipython and run these lines:

import sleap
labels = sleap.load_file('2BMv2.slp')
for vid in labels.videos:
    vid.backend.grayscale = True
labels.save('2BMv2.gray.slp')

Replace '2BMv2.slp' with the filename of your labels file if that's not the right one. After running this, you'll get another copy of the same labels with the filename 2BMv2.gray.slp which you can then check to see if all the videos show up with 1 channel in the GUI.

Xiaoyu-Tong commented 3 years ago

Hi @talmo ,

Thanks for the update. I tried the above code and reloaded 2BMv2.gray.slp. However, based on what is shown in the GUI, the other videos still have 3 channels. Maybe I can send you the .slp file so you can examine what's wrong?

Thank you!

talmo commented 3 years ago

Hi @Xiaoyu-Tong,

Yup that works! You can send it to sleap@princeton.edu. Thanks!

Xiaoyu-Tong commented 3 years ago

File sent via email. Thank you!

Xiaoyu-Tong commented 3 years ago

Hi @talmo ,

I think the dialog to specify the channel number when importing videos does work. However, when replacing videos, as well as when making predictions, there is no such dialog, and by default the videos become grayscale. So basically there is no way for me to specify the channel number when I add new videos by making predictions. I think a quick fix may be to add a similar dialog when users replace videos (I am assuming that replacing videos will match labeled frames correctly).

talmo commented 3 years ago

Yup, I think I'd like to add it as an option in the GUI. Just FYI, I don't see the email to sleap@princeton.edu -- maybe it got spam filtered? You could try sending it as a Google Drive link? Or directly to talmo@princeton.edu?

Xiaoyu-Tong commented 3 years ago

Hi, I just forwarded the email to talmo@princeton.edu.

talmo commented 3 years ago

Hmm, I still haven't received it and I've gotten one from someone else in the meantime so I'm guessing it's not going to come through. What's your email? I can send you a link where you can upload it.

Xiaoyu-Tong commented 3 years ago

xiaoyu.tong@nyulangone.org

Thanks

Xiaoyu-Tong commented 3 years ago

> Hmm, I still haven't received it and I've gotten one from someone else in the meantime so I'm guessing it's not going to come through. What's your email? I can send you a link where you can upload it.

Uploaded. Please let me know if you cannot receive it.

talmo commented 3 years ago

Cool, got it! Sent you a (potentially) fixed version.

Xiaoyu-Tong commented 3 years ago

Hi @talmo ,

It seems that helped, and the model is now training! Thank you! How did you fix it?

talmo commented 3 years ago

Wonderful! Right so I basically did the same thing I had suggested in the code snippet earlier, but forced the video to be RGB instead of grayscale. The problem is that single image videos, like the ones you import from DLC, currently do not support automatic conversion to grayscale.

I think we overlooked this feature in the past because we assumed that the normalization would take care of it, but it seems we were mistaken! I'm going to keep this issue open until we either fix the normalization during training/inference, or allow for changing this setting per video in the GUI.

Thanks for the report and let me know if you run into more issues after training!

Xiaoyu-Tong commented 3 years ago

Hi,

When I tried to make predictions, I used this command (in Google Colab): !sleap-track 140630_SD5_behaviors_115680_131200.mp4 \ --tracking.tracker simple \ -m models/2BMv2.bottomup

But then it failed to predict and gave me the following error. Actually, yesterday I tried to make a brand new model by labeling 20 frames in the SLEAP GUI, and it gave me the same error. So I guess this is unrelated to the channel number issue. Any idea how to fix it? Thank you!

The error message

2021-03-19 17:55:07.391236: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1 Started inference at: 2021-03-19 17:55:09.043115 Args: data_path: 140630_SD5_behaviors_115680_131200.mp4 models: ['models/2BMv2.bottomup'] frames: None only_labeled_frames: False only_suggested_frames: False output: None no_empty_frames: False verbosity: rich video.dataset: None video.input_format: channels_last cpu: False first_gpu: False last_gpu: False gpu: 0 peak_threshold: 0.2 batch_size: 4 open_in_gui: False tracking.tracker: simple tracking.target_instance_count: None tracking.pre_cull_to_target: None tracking.pre_cull_iou_threshold: None tracking.post_connect_single_breaks: None tracking.clean_instance_count: None tracking.clean_iou_threshold: None tracking.similarity: None tracking.match: None tracking.track_window: None tracking.min_new_track_points: None tracking.min_match_points: None tracking.img_scale: None tracking.of_window_size: None tracking.of_max_levels: None tracking.kf_node_indices: None tracking.kf_init_frame_count: None 2021-03-19 17:55:09.045566: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcuda.so.1 2021-03-19 17:55:09.065636: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-19 17:55:09.066222: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1716] Found device 0 with properties: pciBusID: 0000:00:04.0 name: Tesla P100-PCIE-16GB computeCapability: 6.0 coreClock: 1.3285GHz coreCount: 56 deviceMemorySize: 15.90GiB deviceMemoryBandwidth: 681.88GiB/s 2021-03-19 17:55:09.066277: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1 2021-03-19 17:55:09.068061: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] 
Successfully opened dynamic library libcublas.so.10 2021-03-19 17:55:09.077751: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcufft.so.10 2021-03-19 17:55:09.078130: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcurand.so.10 2021-03-19 17:55:09.080076: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcusolver.so.10 2021-03-19 17:55:09.086896: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcusparse.so.10 2021-03-19 17:55:09.091941: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudnn.so.7 2021-03-19 17:55:09.092079: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-19 17:55:09.092701: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-19 17:55:09.093209: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1858] Adding visible gpu devices: 0 Versions: SLEAP: 1.1.1 TensorFlow: 2.3.1 Numpy: 1.18.5 Python: 3.7.10 OS: Linux-4.19.112+-x86_64-with-Ubuntu-18.04-bionic

System: GPUs: 1/1 available Device: /physical_device:GPU:0 Available: True Initalized: False Memory growth: True

2021-03-19 17:55:09.402452: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN)to use the following CPU instructions in performance-critical operations: AVX2 FMA To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. 2021-03-19 17:55:09.408680: I tensorflow/core/platform/profile_utils/cpu_utils.cc:104] CPU Frequency: 2299995000 Hz 2021-03-19 17:55:09.408909: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x555a7fe84f40 initialized for platform Host (this does not guarantee that XLA will be used). Devices: 2021-03-19 17:55:09.408945: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Host, Default Version 2021-03-19 17:55:09.504212: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-19 17:55:09.504937: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x555a7fe85100 initialized for platform CUDA (this does not guarantee that XLA will be used). 
Devices: 2021-03-19 17:55:09.504968: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0 2021-03-19 17:55:09.505190: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-19 17:55:09.505795: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1716] Found device 0 with properties: pciBusID: 0000:00:04.0 name: Tesla P100-PCIE-16GB computeCapability: 6.0 coreClock: 1.3285GHz coreCount: 56 deviceMemorySize: 15.90GiB deviceMemoryBandwidth: 681.88GiB/s 2021-03-19 17:55:09.505884: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1 2021-03-19 17:55:09.505936: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcublas.so.10 2021-03-19 17:55:09.505965: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcufft.so.10 2021-03-19 17:55:09.505997: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcurand.so.10 2021-03-19 17:55:09.506024: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcusolver.so.10 2021-03-19 17:55:09.506050: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcusparse.so.10 2021-03-19 17:55:09.506075: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudnn.so.7 2021-03-19 17:55:09.506166: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-19 17:55:09.506802: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful 
NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-19 17:55:09.507307: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1858] Adding visible gpu devices: 0 2021-03-19 17:55:09.507400: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1 2021-03-19 17:55:09.947734: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1257] Device interconnect StreamExecutor with strength 1 edge matrix: 2021-03-19 17:55:09.947800: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1263] 0 2021-03-19 17:55:09.947815: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1276] 0: N 2021-03-19 17:55:09.948073: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-19 17:55:09.948775: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-19 17:55:09.949342: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1402] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 14958 MB memory) -> physical GPU (device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0) Predicting... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0% ETA: -:--:-- ?2021-03-19 17:55:19.114744: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudnn.so.7 2021-03-19 17:55:19.862862: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcublas.so.10 Predicting... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0% ETA: -:--:-- ?2021-03-19 17:55:20.786519: W tensorflow/core/framework/op_kernel.cc:1755] Unknown: AssertionError: Traceback (most recent call last):

File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/script_ops.py", line 242, in call return func(device, token, args)

File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/script_ops.py", line 131, in call ret = self._func(*args)

File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/autograph/impl/api.py", line 302, in wrapper return func(*args, **kwargs)

File "/tmp/tmpqf6m3m4z.py", line 20, in _group_instances_sample retval__1 = ag__.converted_call(ag__.ld(group_instances_sample), (ag__.ld(peaks_sample), ag__.ld(peak_scores_sample), ag__.ld(peak_channel_inds_sample), ag__.ld(match_edge_inds_sample), ag__.ld(match_src_peak_inds_sample), ag__.ld(match_dst_peak_inds_sample), ag__.ld(match_line_scores_sample), ag__.ld(n_nodes), ag__.ld(n_edges), ag__.ld(edge_types), ag__.ld(min_instance_peaks)), dict(min_line_scores=ag__.ld(min_line_scores)), fscope_1)

File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/autograph/impl/api.py", line 461, in converted_call return _call_unconverted(f, args, kwargs, options, False)

File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/autograph/impl/api.py", line 339, in _call_unconverted return f(*args, **kwargs)

File "/usr/local/lib/python3.7/dist-packages/sleap/nn/paf_grouping.py", line 1046, in group_instances_sample ) = make_predicted_instances(peaks, peak_scores, connections, instance_assignments)

File "/usr/local/lib/python3.7/dist-packages/sleap/nn/paf_grouping.py", line 903, in make_predicted_instances assert instance_ind == instance_assignments[dst_peak_id]

AssertionError

Predicting... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0% ETA: -:--:-- ? Traceback (most recent call last): File "/usr/local/bin/sleap-track", line 8, in sys.exit(main()) File "/usr/local/lib/python3.7/dist-packages/sleap/nn/inference.py", line 3025, in main labels_pr = predictor.predict(provider) File "/usr/local/lib/python3.7/dist-packages/sleap/nn/inference.py", line 385, in predict self._make_labeled_frames_from_generator(generator, data) File "/usr/local/lib/python3.7/dist-packages/sleap/nn/inference.py", line 2518, in _make_labeled_frames_from_generator for ex in generator: File "/usr/local/lib/python3.7/dist-packages/sleap/nn/inference.py", line 295, in _predict_generator ex = process_batch(ex) File "/usr/local/lib/python3.7/dist-packages/sleap/nn/inference.py", line 266, in process_batch preds = self.inference_model.predict_on_batch(ex) File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/engine/training.py", line 1788, in predict_on_batch outputs = predict_function(iterator) File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/eager/def_function.py", line 780, in call result = self._call(*args, *kwds) File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/eager/def_function.py", line 814, in _call results = self._stateful_fn(args, **kwds) File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/eager/function.py", line 2829, in call return graph_function._filtered_call(args, kwargs) # pylint: disable=protected-access File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/eager/function.py", line 1848, in _filtered_call cancellation_manager=cancellation_manager) File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/eager/function.py", line 1924, in _call_flat ctx, args, cancellation_manager=cancellation_manager)) File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/eager/function.py", line 550, in call ctx=ctx) File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/eager/execute.py", 
line 60, in quick_execute inputs, attrs, num_outputs) tensorflow.python.framework.errors_impl.UnknownError: 2 root error(s) found. (0) Unknown: AssertionError: Traceback (most recent call last):

File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/script_ops.py", line 242, in call return func(device, token, args)

File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/script_ops.py", line 131, in call ret = self._func(*args)

File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/autograph/impl/api.py", line 302, in wrapper return func(*args, **kwargs)

File "/tmp/tmpqf6m3m4z.py", line 20, in _group_instances_sample retval__1 = ag__.converted_call(ag__.ld(group_instances_sample), (ag__.ld(peaks_sample), ag__.ld(peak_scores_sample), ag__.ld(peak_channel_inds_sample), ag__.ld(match_edge_inds_sample), ag__.ld(match_src_peak_inds_sample), ag__.ld(match_dst_peak_inds_sample), ag__.ld(match_line_scores_sample), ag__.ld(n_nodes), ag__.ld(n_edges), ag__.ld(edge_types), ag__.ld(min_instance_peaks)), dict(min_line_scores=ag__.ld(min_line_scores)), fscope_1)

File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/autograph/impl/api.py", line 461, in converted_call return _call_unconverted(f, args, kwargs, options, False)

File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/autograph/impl/api.py", line 339, in _call_unconverted return f(*args, **kwargs)

File "/usr/local/lib/python3.7/dist-packages/sleap/nn/paf_grouping.py", line 1046, in group_instances_sample ) = make_predicted_instances(peaks, peak_scores, connections, instance_assignments)

File "/usr/local/lib/python3.7/dist-packages/sleap/nn/paf_grouping.py", line 903, in make_predicted_instances assert instance_ind == instance_assignments[dst_peak_id]

AssertionError

 [[{{node bottom_up_inference_model/bottom_up_inference_layer/while_2/body/_211/bottom_up_inference_model/bottom_up_inference_layer/while_2/EagerPyFunc}}]]

(1) Unknown: AssertionError: Traceback (most recent call last):

File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/script_ops.py", line 242, in call return func(device, token, args)

File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/script_ops.py", line 131, in call ret = self._func(*args)

File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/autograph/impl/api.py", line 302, in wrapper return func(*args, **kwargs)

File "/tmp/tmpqf6m3m4z.py", line 20, in _group_instances_sample retval__1 = ag__.converted_call(ag__.ld(group_instances_sample), (ag__.ld(peaks_sample), ag__.ld(peak_scores_sample), ag__.ld(peak_channel_inds_sample), ag__.ld(match_edge_inds_sample), ag__.ld(match_src_peak_inds_sample), ag__.ld(match_dst_peak_inds_sample), ag__.ld(match_line_scores_sample), ag__.ld(n_nodes), ag__.ld(n_edges), ag__.ld(edge_types), ag__.ld(min_instance_peaks)), dict(min_line_scores=ag__.ld(min_line_scores)), fscope_1)

File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/autograph/impl/api.py", line 461, in converted_call return _call_unconverted(f, args, kwargs, options, False)

File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/autograph/impl/api.py", line 339, in _call_unconverted return f(*args, **kwargs)

File "/usr/local/lib/python3.7/dist-packages/sleap/nn/paf_grouping.py", line 1046, in group_instances_sample ) = make_predicted_instances(peaks, peak_scores, connections, instance_assignments)

File "/usr/local/lib/python3.7/dist-packages/sleap/nn/paf_grouping.py", line 903, in make_predicted_instances assert instance_ind == instance_assignments[dst_peak_id]

AssertionError

 [[{{node bottom_up_inference_model/bottom_up_inference_layer/while_2/body/_211/bottom_up_inference_model/bottom_up_inference_layer/while_2/EagerPyFunc}}]]
 [[bottom_up_inference_model/bottom_up_inference_layer/while_2/body/_211/bottom_up_inference_model/bottom_up_inference_layer/while_2/Shape/_926]]

0 successful operations. 0 derived errors ignored. [Op:__inference_predict_function_5426]

Function call stack: predict_function -> predict_function

talmo commented 3 years ago

Ah yes, this is an issue with the skeleton configuration. When training from within the GUI it should give you a warning that the skeleton does not form an arborescence, but maybe we should add something on the CLI side as well.

So basically, you want to make sure that your skeleton forms no cycle and no node has two parents. Right now, your skeleton has multiple of these, for example:

('Ear_left', 'Ear_right'), ('Ear_left', 'Nose'), ('Ear_left', 'Head'), ('Ear_right', 'Nose'), ('Ear_right', 'Head')

Ideally you'll just have:

('Head', 'Ear_left'), ('Head', 'Nose'), ('Head', 'Ear_right')

and so forth.

If you're coming from DLC, this is a common issue. I think they fundamentally misunderstand how the bottom-up (part affinity field) matching algorithm works -- by having multiple edges that point to the same child node, you can have conflicting groupings depending on which path you take, resulting in noisy part assignments when the animals are close together.

Try reducing your skeleton edges such that no node has two parents and you should be good to go. In the next version of SLEAP, we'll have this displayed more prominently in the skeleton editor.

Xiaoyu-Tong commented 3 years ago

Hi,

Thank you for the answer. I am trying the top-down model, as I prefer to keep the complexity of the skeleton. By the way, when I tried to do the same thing for the centroid model (!sleap-train baseline.centroid.json 2BMv2.pkg.slp --run_name "2BMv2.centroid"), it gave me an error that prevented training from starting. So I wonder if there is an issue with centroid model training?

The error: 2021-03-19 18:02:44.808179: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1 INFO:sleap.nn.training:Versions: SLEAP: 1.1.1 TensorFlow: 2.3.1 Numpy: 1.18.5 Python: 3.7.10 OS: Linux-4.19.112+-x86_64-with-Ubuntu-18.04-bionic INFO:sleap.nn.training:Training labels file: 2BMv2.pkg.slp INFO:sleap.nn.training:Training profile: /usr/local/lib/python3.7/dist-packages/sleap/training_profiles/baseline.centroid.json INFO:sleap.nn.training: INFO:sleap.nn.training:Arguments: INFO:sleap.nn.training:{ "training_job_path": "baseline.centroid.json", "labels_path": "2BMv2.pkg.slp", "video_paths": "", "val_labels": null, "test_labels": null, "tensorboard": false, "save_viz": false, "zmq": false, "run_name": "2BMv2.centroid", "prefix": "", "suffix": "" } INFO:sleap.nn.training: INFO:sleap.nn.training:Training job: INFO:sleap.nn.training:{ "data": { "labels": { "training_labels": null, "validation_labels": null, "validation_fraction": 0.1, "test_labels": null, "split_by_inds": false, "training_inds": null, "validation_inds": null, "test_inds": null, "search_path_hints": [], "skeletons": [] }, "preprocessing": { "ensure_rgb": false, "ensure_grayscale": false, "imagenet_mode": null, "input_scaling": 0.5, "pad_to_stride": null, "resize_and_pad_to_target": true, "target_height": null, "target_width": null }, "instance_cropping": { "center_on_part": null, "crop_size": null, "crop_size_detection_padding": 16 } }, "model": { "backbone": { "leap": null, "unet": { "stem_stride": null, "max_stride": 16, "output_stride": 2, "filters": 16, "filters_rate": 2.0, "middle_block": true, "up_interpolate": true, "stacks": 1 }, "hourglass": null, "resnet": null, "pretrained_encoder": null }, "heads": { "single_instance": null, "centroid": { "anchor_part": null, "sigma": 5.0, "output_stride": 2, "offset_refinement": false }, "centered_instance": null, "multi_instance": null } }, "optimization": { "preload_data": true, 
"augmentation_config": { "rotate": true, "rotation_min_angle": -180.0, "rotation_max_angle": 180.0, "translate": false, "translate_min": -5, "translate_max": 5, "scale": false, "scale_min": 0.9, "scale_max": 1.1, "uniform_noise": false, "uniform_noise_min_val": 0.0, "uniform_noise_max_val": 10.0, "gaussian_noise": false, "gaussian_noise_mean": 5.0, "gaussian_noise_stddev": 1.0, "contrast": false, "contrast_min_gamma": 0.5, "contrast_max_gamma": 2.0, "brightness": false, "brightness_min_val": 0.0, "brightness_max_val": 10.0, "random_crop": false, "random_crop_height": 256, "random_crop_width": 256, "random_flip": false, "flip_horizontal": true }, "online_shuffling": true, "shuffle_buffer_size": 128, "prefetch": true, "batch_size": 4, "batches_per_epoch": null, "min_batches_per_epoch": 200, "val_batches_per_epoch": null, "min_val_batches_per_epoch": 10, "epochs": 200, "optimizer": "adam", "initial_learning_rate": 0.0001, "learning_rate_schedule": { "reduce_on_plateau": true, "reduction_factor": 0.5, "plateau_min_delta": 1e-06, "plateau_patience": 5, "plateau_cooldown": 3, "min_learning_rate": 1e-08 }, "hard_keypoint_mining": { "online_mining": false, "hard_to_easy_ratio": 2.0, "min_hard_keypoints": 2, "max_hard_keypoints": null, "loss_scale": 5.0 }, "early_stopping": { "stop_training_on_plateau": true, "plateau_min_delta": 1e-06, "plateau_patience": 10 } }, "outputs": { "save_outputs": true, "run_name": "2BMv2.centroid", "run_name_prefix": "", "run_name_suffix": null, "runs_folder": "models", "tags": [], "save_visualizations": true, "delete_viz_images": true, "zip_outputs": false, "log_to_csv": true, "checkpointing": { "initial_model": false, "best_model": true, "every_epoch": false, "latest_model": false, "final_model": false }, "tensorboard": { "write_logs": false, "loss_frequency": "epoch", "architecture_graph": false, "profile_graph": false, "visualizations": true }, "zmq": { "subscribe_to_controller": false, "controller_address": "tcp://127.0.0.1:9000", 
"controller_polling_timeout": 10, "publish_updates": false, "publish_address": "tcp://127.0.0.1:9001" } }, "name": "", "description": "", "sleap_version": "1.1.1", "filename": "/usr/local/lib/python3.7/dist-packages/sleap/training_profiles/baseline.centroid.json" } INFO:sleap.nn.training: INFO:sleap.nn.training:System: 2021-03-19 18:02:46.291431: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcuda.so.1 2021-03-19 18:02:46.311568: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-19 18:02:46.312165: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1716] Found device 0 with properties: pciBusID: 0000:00:04.0 name: Tesla P100-PCIE-16GB computeCapability: 6.0 coreClock: 1.3285GHz coreCount: 56 deviceMemorySize: 15.90GiB deviceMemoryBandwidth: 681.88GiB/s 2021-03-19 18:02:46.312226: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1 2021-03-19 18:02:46.313971: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcublas.so.10 2021-03-19 18:02:46.315857: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcufft.so.10 2021-03-19 18:02:46.316204: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcurand.so.10 2021-03-19 18:02:46.318229: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcusolver.so.10 2021-03-19 18:02:46.319164: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcusparse.so.10 2021-03-19 18:02:46.323225: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudnn.so.7 2021-03-19 
18:02:46.323354: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-19 18:02:46.323960: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-19 18:02:46.324462: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1858] Adding visible gpu devices: 0 GPUs: 1/1 available Device: /physical_device:GPU:0 Available: True Initalized: False Memory growth: True INFO:sleap.nn.training: INFO:sleap.nn.training:Initializing trainer... INFO:sleap.nn.training:Loading training labels from: 2BMv2.pkg.slp INFO:sleap.nn.training:Creating training and validation splits from validation fraction: 0.1 INFO:sleap.nn.training: Splits: Training = 581 / Validation = 64. INFO:sleap.nn.training:Setting up for training... INFO:sleap.nn.training:Setting up pipeline builders... INFO:sleap.nn.training:Setting up model... INFO:sleap.nn.training:Building test pipeline... 2021-03-19 18:02:47.406442: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN)to use the following CPU instructions in performance-critical operations: AVX2 FMA To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. 2021-03-19 18:02:47.410827: I tensorflow/core/platform/profile_utils/cpu_utils.cc:104] CPU Frequency: 2299995000 Hz 2021-03-19 18:02:47.411050: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x56164c058fc0 initialized for platform Host (this does not guarantee that XLA will be used). 
Devices: 2021-03-19 18:02:47.411082: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Host, Default Version 2021-03-19 18:02:47.498989: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-19 18:02:47.499829: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x56164c059a40 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices: 2021-03-19 18:02:47.499866: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0 2021-03-19 18:02:47.500083: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-19 18:02:47.500649: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1716] Found device 0 with properties: pciBusID: 0000:00:04.0 name: Tesla P100-PCIE-16GB computeCapability: 6.0 coreClock: 1.3285GHz coreCount: 56 deviceMemorySize: 15.90GiB deviceMemoryBandwidth: 681.88GiB/s 2021-03-19 18:02:47.500734: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1 2021-03-19 18:02:47.500778: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcublas.so.10 2021-03-19 18:02:47.500807: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcufft.so.10 2021-03-19 18:02:47.500834: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcurand.so.10 2021-03-19 18:02:47.500865: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcusolver.so.10 2021-03-19 18:02:47.500889: I 
tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcusparse.so.10 2021-03-19 18:02:47.500915: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudnn.so.7 2021-03-19 18:02:47.501006: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-19 18:02:47.501577: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-19 18:02:47.502097: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1858] Adding visible gpu devices: 0 2021-03-19 18:02:47.502182: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1 2021-03-19 18:02:47.930706: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1257] Device interconnect StreamExecutor with strength 1 edge matrix: 2021-03-19 18:02:47.930764: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1263] 0 2021-03-19 18:02:47.930776: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1276] 0: N 2021-03-19 18:02:47.930987: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-19 18:02:47.931662: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-19 18:02:47.932202: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1402] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 14958 MB memory) -> physical GPU (device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute 
capability: 6.0) INFO:sleap.nn.training:Loaded test example. [2.012s] INFO:sleap.nn.training: Input shape: (256, 336, 3) INFO:sleap.nn.training:Created Keras model. INFO:sleap.nn.training: Backbone: UNet(stacks=1, filters=16, filters_rate=2.0, kernel_size=3, stem_kernel_size=7, convs_per_block=2, stem_blocks=0, down_blocks=4, middle_block=True, up_blocks=3, up_interpolate=True, block_contraction=False) INFO:sleap.nn.training: Max stride: 16 INFO:sleap.nn.training: Parameters: 1,953,393 INFO:sleap.nn.training: Heads: INFO:sleap.nn.training: [0] = CentroidConfmapsHead(anchor_part=None, sigma=5.0, output_stride=2, loss_weight=1.0) INFO:sleap.nn.training: Outputs: INFO:sleap.nn.training: [0] = Tensor("CentroidConfmapsHead_0/BiasAdd:0", shape=(None, 128, 168, 1), dtype=float32) INFO:sleap.nn.training:Setting up data pipelines... INFO:sleap.nn.training:Training set: n = 581 INFO:sleap.nn.training:Validation set: n = 64 INFO:sleap.nn.training:Setting up optimization... INFO:sleap.nn.training: Learning rate schedule: LearningRateScheduleConfig(reduce_on_plateau=True, reduction_factor=0.5, plateau_min_delta=1e-06, plateau_patience=5, plateau_cooldown=3, min_learning_rate=1e-08) INFO:sleap.nn.training: Early stopping: EarlyStoppingConfig(stop_training_on_plateau=True, plateau_min_delta=1e-06, plateau_patience=10) INFO:sleap.nn.training:Setting up outputs... INFO:sleap.nn.training:Created run path: models/2BMv2.centroid INFO:sleap.nn.training:Setting up visualization... WARNING:tensorflow:Model was constructed with shape (None, 256, 336, 3) for input Tensor("input:0", shape=(None, 256, 336, 3), dtype=float32), but it was called on an input with incompatible shape (None, 492, 656, 3). 
Traceback (most recent call last): File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/framework/ops.py", line 1812, in _create_c_op c_op = pywrap_tf_session.TF_FinishOperation(op_desc) tensorflow.python.framework.errors_impl.InvalidArgumentError: Dimension 1 in both shapes must be equal, but are 123 and 124. Shapes are [?,123,164] and [?,124,164]. for '{{node functional_1/stack0_dec1_s8_to_s4_skip_concat/concat}} = ConcatV2[N=2, T=DT_FLOAT, Tidx=DT_INT32](functional_1/stack0_enc2_act1_relu/Relu, functional_1/stack0_dec1_s8_to_s4_interp_bilinear/resize/ResizeBilinear, functional_1/stack0_dec1_s8_to_s4_skip_concat/concat/axis)' with input shapes: [?,123,164,64], [?,124,164,128], [] and with computed input tensors: input[2] = <3>.

During handling of the above exception, another exception occurred:

Traceback (most recent call last): File "/usr/local/bin/sleap-train", line 8, in sys.exit(main()) File "/usr/local/lib/python3.7/dist-packages/sleap/nn/training.py", line 1582, in main trainer.train() File "/usr/local/lib/python3.7/dist-packages/sleap/nn/training.py", line 875, in train self.setup() File "/usr/local/lib/python3.7/dist-packages/sleap/nn/training.py", line 869, in setup self._setup_visualization() File "/usr/local/lib/python3.7/dist-packages/sleap/nn/training.py", line 1142, in _setup_visualization training_viz_ds_iter = iter(self.training_viz_pipeline.make_dataset()) File "/usr/local/lib/python3.7/dist-packages/sleap/nn/data/pipelines.py", line 282, in make_dataset ds = transformer.transform_dataset(ds) File "/usr/local/lib/python3.7/dist-packages/sleap/nn/data/inference.py", line 40, in transform_dataset keras_model = tf.keras.Model(input_layers, self.keras_model(input_layers)) File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/engine/base_layer.py", line 926, in call input_list) File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/engine/base_layer.py", line 1117, in _functional_construction_call outputs = call_fn(cast_inputs, *args, kwargs) File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/engine/functional.py", line 386, in call inputs, training=training, mask=mask) File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/engine/functional.py", line 508, in _run_internal_graph outputs = node.layer(*args, *kwargs) File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/engine/base_layer.py", line 926, in call input_list) File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/engine/base_layer.py", line 1117, in _functional_construction_call outputs = call_fn(cast_inputs, args, kwargs) File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/layers/merge.py", line 183, in call return self._merge_function(inputs) File 
"/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/layers/merge.py", line 522, in _merge_function return K.concatenate(inputs, axis=self.axis) File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/util/dispatch.py", line 201, in wrapper return target(*args, *kwargs) File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/backend.py", line 2881, in concatenate return array_ops.concat([to_dense(x) for x in tensors], axis) File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/util/dispatch.py", line 201, in wrapper return target(args, **kwargs) File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/array_ops.py", line 1654, in concat return gen_array_ops.concat_v2(values=values, axis=axis, name=name) File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/gen_array_ops.py", line 1222, in concat_v2 "ConcatV2", values=values, axis=axis, name=name) File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/framework/op_def_library.py", line 744, in _apply_op_helper attrs=attr_protos, op_def=op_def) File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/framework/func_graph.py", line 593, in _create_op_internal compute_device) File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/framework/ops.py", line 3485, in _create_op_internal op_def=op_def) File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/framework/ops.py", line 1975, in init control_input_ops, op_def) File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/framework/ops.py", line 1815, in _create_c_op raise ValueError(str(e)) ValueError: Dimension 1 in both shapes must be equal, but are 123 and 124. Shapes are [?,123,164] and [?,124,164]. 
for '{{node functional_1/stack0_dec1_s8_to_s4_skip_concat/concat}} = ConcatV2[N=2, T=DT_FLOAT, Tidx=DT_INT32](functional_1/stack0_enc2_act1_relu/Relu, functional_1/stack0_dec1_s8_to_s4_interp_bilinear/resize/ResizeBilinear, functional_1/stack0_dec1_s8_to_s4_skip_concat/concat/axis)' with input shapes: [?,123,164,64], [?,124,164,128], [] and with computed input tensors: input[2] = <3>.

talmo commented 3 years ago

Eurgh, sorry about that -- looks like the same problem as #510.

We're working on that one in parallel. In the meantime, do you mind dropping the 2BMv2.pkg.slp file in the same Dropbox link I sent you? We'll make sure that the fix works on both of your datasets.

Thanks!!

Xiaoyu-Tong commented 3 years ago

OK, no worries! I have dropped the training package in the Dropbox for this issue. Thank you in advance for your help.

Do I also need to make sure the nodes do not form a cycle and that no node has two parents when using the top-down model? I ask because when I tried to use the top-down model to predict, it gave me the following error:

2021-03-19 18:21:41.665564: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1 Started inference at: 2021-03-19 18:21:43.123669 Args: data_path: 140630_SD5_behaviors_115680_131200.mp4 models: ['models/2BMv2.topdown_confmaps'] frames: None only_labeled_frames: False only_suggested_frames: False output: None no_empty_frames: False verbosity: rich video.dataset: None video.input_format: channels_last cpu: False first_gpu: False last_gpu: False gpu: 0 peak_threshold: 0.2 batch_size: 4 open_in_gui: False tracking.tracker: simple tracking.target_instance_count: None tracking.pre_cull_to_target: None tracking.pre_cull_iou_threshold: None tracking.post_connect_single_breaks: None tracking.clean_instance_count: None tracking.clean_iou_threshold: None tracking.similarity: None tracking.match: None tracking.track_window: None tracking.min_new_track_points: None tracking.min_match_points: None tracking.img_scale: None tracking.of_window_size: None tracking.of_max_levels: None tracking.kf_node_indices: None tracking.kf_init_frame_count: None 2021-03-19 18:21:43.126086: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcuda.so.1 2021-03-19 18:21:43.145928: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-19 18:21:43.146514: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1716] Found device 0 with properties: pciBusID: 0000:00:04.0 name: Tesla P100-PCIE-16GB computeCapability: 6.0 coreClock: 1.3285GHz coreCount: 56 deviceMemorySize: 15.90GiB deviceMemoryBandwidth: 681.88GiB/s 2021-03-19 18:21:43.146565: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1 2021-03-19 18:21:43.148523: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] 
Successfully opened dynamic library libcublas.so.10 2021-03-19 18:21:43.150439: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcufft.so.10 2021-03-19 18:21:43.150832: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcurand.so.10 2021-03-19 18:21:43.152573: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcusolver.so.10 2021-03-19 18:21:43.153817: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcusparse.so.10 2021-03-19 18:21:43.157753: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudnn.so.7 2021-03-19 18:21:43.157886: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-19 18:21:43.158472: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-19 18:21:43.158992: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1858] Adding visible gpu devices: 0 Versions: SLEAP: 1.1.1 TensorFlow: 2.3.1 Numpy: 1.18.5 Python: 3.7.10 OS: Linux-4.19.112+-x86_64-with-Ubuntu-18.04-bionic

System: GPUs: 1/1 available Device: /physical_device:GPU:0 Available: True Initalized: False Memory growth: True

2021-03-19 18:21:43.411734: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN)to use the following CPU instructions in performance-critical operations: AVX2 FMA To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. 2021-03-19 18:21:43.417823: I tensorflow/core/platform/profile_utils/cpu_utils.cc:104] CPU Frequency: 2299995000 Hz 2021-03-19 18:21:43.418048: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x55fb2bb10f40 initialized for platform Host (this does not guarantee that XLA will be used). Devices: 2021-03-19 18:21:43.418087: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Host, Default Version 2021-03-19 18:21:43.508961: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-19 18:21:43.509797: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x55fb2bb11100 initialized for platform CUDA (this does not guarantee that XLA will be used). 
Devices: 2021-03-19 18:21:43.509833: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0 2021-03-19 18:21:43.510060: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-19 18:21:43.510610: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1716] Found device 0 with properties: pciBusID: 0000:00:04.0 name: Tesla P100-PCIE-16GB computeCapability: 6.0 coreClock: 1.3285GHz coreCount: 56 deviceMemorySize: 15.90GiB deviceMemoryBandwidth: 681.88GiB/s 2021-03-19 18:21:43.510694: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1 2021-03-19 18:21:43.510749: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcublas.so.10 2021-03-19 18:21:43.510780: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcufft.so.10 2021-03-19 18:21:43.510813: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcurand.so.10 2021-03-19 18:21:43.510839: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcusolver.so.10 2021-03-19 18:21:43.510864: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcusparse.so.10 2021-03-19 18:21:43.510890: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudnn.so.7 2021-03-19 18:21:43.510982: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-19 18:21:43.511568: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful 
NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-19 18:21:43.512102: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1858] Adding visible gpu devices: 0 2021-03-19 18:21:43.512192: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1 2021-03-19 18:21:43.946276: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1257] Device interconnect StreamExecutor with strength 1 edge matrix: 2021-03-19 18:21:43.946345: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1263] 0 2021-03-19 18:21:43.946361: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1276] 0: N 2021-03-19 18:21:43.946618: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-19 18:21:43.947269: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-19 18:21:43.947858: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1402] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 14958 MB memory) -> physical GPU (device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0) Predicting... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0% ETA: -:--:-- ? 
Traceback (most recent call last): File "/usr/local/bin/sleap-track", line 8, in sys.exit(main()) File "/usr/local/lib/python3.7/dist-packages/sleap/nn/inference.py", line 3025, in main labels_pr = predictor.predict(provider) File "/usr/local/lib/python3.7/dist-packages/sleap/nn/inference.py", line 385, in predict self._make_labeled_frames_from_generator(generator, data) File "/usr/local/lib/python3.7/dist-packages/sleap/nn/inference.py", line 1996, in _make_labeled_frames_from_generator for ex in generator: File "/usr/local/lib/python3.7/dist-packages/sleap/nn/inference.py", line 294, in _predict_generator for ex in self.pipeline.make_dataset(): File "/usr/local/lib/python3.7/dist-packages/sleap/nn/data/pipelines.py", line 271, in make_dataset self.validate_pipeline() File "/usr/local/lib/python3.7/dist-packages/sleap/nn/data/pipelines.py", line 247, in validate_pipeline f"Missing required keys for transformer (index = {i}, " ValueError: Missing required keys for transformer (index = 1, type = <class 'sleap.nn.data.instance_centroids.InstanceCentroidFinder'>): ['instances']. Available: ['scale', 'raw_image_size', 'image', 'video_ind', 'frame_ind']

talmo commented 3 years ago

You do not, but you do need a centroid model (the one that hit the error when you tried to train it) in addition to the centered instance model to do top-down inference.

I'll get back to you asap when we fix this image size issue. Sorry about that again!

Xiaoyu-Tong commented 3 years ago

OK I see!

Xiaoyu-Tong commented 3 years ago

Hi @talmo ,

I also tried to revise my skeleton so that it can be used to train a bottom-up model. I edited it so that every node except the root has only one parent and no cycle is formed, as shown below. image However, when trying to make predictions, I still got the following error:

Any idea how to fix it? Thank you!

2021-03-19 20:30:12.804426: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1 Started inference at: 2021-03-19 20:30:14.341726 Args: data_path: 140630_SD5_behaviors_115680_131200.mp4 models: ['models/2BMv2_BottomUp.bottomup'] frames: None only_labeled_frames: False only_suggested_frames: False output: None no_empty_frames: False verbosity: rich video.dataset: None video.input_format: channels_last cpu: False first_gpu: False last_gpu: False gpu: 0 peak_threshold: 0.2 batch_size: 4 open_in_gui: False tracking.tracker: simple tracking.target_instance_count: None tracking.pre_cull_to_target: None tracking.pre_cull_iou_threshold: None tracking.post_connect_single_breaks: None tracking.clean_instance_count: None tracking.clean_iou_threshold: None tracking.similarity: None tracking.match: None tracking.track_window: None tracking.min_new_track_points: None tracking.min_match_points: None tracking.img_scale: None tracking.of_window_size: None tracking.of_max_levels: None tracking.kf_node_indices: None tracking.kf_init_frame_count: None 2021-03-19 20:30:14.346315: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcuda.so.1 2021-03-19 20:30:14.370451: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-19 20:30:14.371095: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1716] Found device 0 with properties: pciBusID: 0000:00:04.0 name: Tesla P100-PCIE-16GB computeCapability: 6.0 coreClock: 1.3285GHz coreCount: 56 deviceMemorySize: 15.90GiB deviceMemoryBandwidth: 681.88GiB/s 2021-03-19 20:30:14.371157: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1 2021-03-19 20:30:14.373666: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] 
Successfully opened dynamic library libcublas.so.10 2021-03-19 20:30:14.375685: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcufft.so.10 2021-03-19 20:30:14.376317: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcurand.so.10 2021-03-19 20:30:14.378399: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcusolver.so.10 2021-03-19 20:30:14.379826: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcusparse.so.10 2021-03-19 20:30:14.384641: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudnn.so.7 2021-03-19 20:30:14.384773: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-19 20:30:14.385372: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-19 20:30:14.385888: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1858] Adding visible gpu devices: 0 Versions: SLEAP: 1.1.1 TensorFlow: 2.3.1 Numpy: 1.18.5 Python: 3.7.10 OS: Linux-4.19.112+-x86_64-with-Ubuntu-18.04-bionic

System: GPUs: 1/1 available Device: /physical_device:GPU:0 Available: True Initalized: False Memory growth: True

2021-03-19 20:30:14.678050: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN)to use the following CPU instructions in performance-critical operations: AVX2 FMA To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. 2021-03-19 20:30:14.683679: I tensorflow/core/platform/profile_utils/cpu_utils.cc:104] CPU Frequency: 2299995000 Hz 2021-03-19 20:30:14.683930: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x55c6f7e08f40 initialized for platform Host (this does not guarantee that XLA will be used). Devices: 2021-03-19 20:30:14.683966: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Host, Default Version 2021-03-19 20:30:14.774451: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-19 20:30:14.775217: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x55c6f7e09100 initialized for platform CUDA (this does not guarantee that XLA will be used). 
Devices: 2021-03-19 20:30:14.775264: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0 2021-03-19 20:30:14.775512: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-19 20:30:14.776160: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1716] Found device 0 with properties: pciBusID: 0000:00:04.0 name: Tesla P100-PCIE-16GB computeCapability: 6.0 coreClock: 1.3285GHz coreCount: 56 deviceMemorySize: 15.90GiB deviceMemoryBandwidth: 681.88GiB/s 2021-03-19 20:30:14.776316: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1 2021-03-19 20:30:14.776413: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcublas.so.10 2021-03-19 20:30:14.776446: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcufft.so.10 2021-03-19 20:30:14.776474: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcurand.so.10 2021-03-19 20:30:14.776505: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcusolver.so.10 2021-03-19 20:30:14.776532: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcusparse.so.10 2021-03-19 20:30:14.776558: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudnn.so.7 2021-03-19 20:30:14.776776: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-19 20:30:14.777397: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful 
NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-19 20:30:14.777941: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1858] Adding visible gpu devices: 0 2021-03-19 20:30:14.778018: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1 2021-03-19 20:30:15.218348: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1257] Device interconnect StreamExecutor with strength 1 edge matrix: 2021-03-19 20:30:15.218402: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1263] 0 2021-03-19 20:30:15.218414: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1276] 0: N 2021-03-19 20:30:15.218664: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-19 20:30:15.219310: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-19 20:30:15.219872: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1402] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 14958 MB memory) -> physical GPU (device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0) Predicting... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0% ETA: -:--:-- ?2021-03-19 20:30:23.347639: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudnn.so.7 2021-03-19 20:30:24.119046: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcublas.so.10 Predicting... 
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0% ETA: 0:14:28 17.8 FPS2021-03-19 20:30:28.115905: W tensorflow/core/framework/op_kernel.cc:1767] OP_REQUIRES failed at list_kernels.h:357 : Invalid argument: PartialTensorShape: Incompatible ranks during merge: 2 vs. 1 2021-03-19 20:30:28.115905: W tensorflow/core/framework/op_kernel.cc:1767] OP_REQUIRES failed at list_kernels.h:357 : Invalid argument: PartialTensorShape: Incompatible ranks during merge: 2 vs. 1 Predicting... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0% ETA: 0:14:28 17.8 FPS Traceback (most recent call last): File "/usr/local/bin/sleap-track", line 8, in sys.exit(main()) File "/usr/local/lib/python3.7/dist-packages/sleap/nn/inference.py", line 3025, in main labels_pr = predictor.predict(provider) File "/usr/local/lib/python3.7/dist-packages/sleap/nn/inference.py", line 385, in predict self._make_labeled_frames_from_generator(generator, data) File "/usr/local/lib/python3.7/dist-packages/sleap/nn/inference.py", line 2518, in _make_labeled_frames_from_generator for ex in generator: File "/usr/local/lib/python3.7/dist-packages/sleap/nn/inference.py", line 295, in _predict_generator ex = process_batch(ex) File "/usr/local/lib/python3.7/dist-packages/sleap/nn/inference.py", line 266, in process_batch preds = self.inference_model.predict_on_batch(ex) File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/engine/training.py", line 1788, in predict_on_batch outputs = predict_function(iterator) File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/eager/def_function.py", line 780, in call result = self._call(*args, *kwds) File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/eager/def_function.py", line 814, in _call results = self._stateful_fn(args, **kwds) File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/eager/function.py", line 2829, in call return graph_function._filtered_call(args, kwargs) # pylint: disable=protected-access File 
"/usr/local/lib/python3.7/dist-packages/tensorflow/python/eager/function.py", line 1848, in _filtered_call cancellation_manager=cancellation_manager) File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/eager/function.py", line 1924, in _call_flat ctx, args, cancellation_manager=cancellation_manager)) File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/eager/function.py", line 550, in call ctx=ctx) File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/eager/execute.py", line 60, in quick_execute inputs, attrs, num_outputs) tensorflow.python.framework.errors_impl.InvalidArgumentError: 2 root error(s) found. (0) Invalid argument: PartialTensorShape: Incompatible ranks during merge: 2 vs. 1 [[{{node bottom_up_inference_model/bottom_up_inference_layer/while/body/_44/bottom_up_inference_model/bottom_up_inference_layer/while/TensorListConcatV2_1}}]] [[bottom_up_inference_model/bottom_up_inference_layer/while_1/LoopCond/_159/_802]] (1) Invalid argument: PartialTensorShape: Incompatible ranks during merge: 2 vs. 1 [[{{node bottom_up_inference_model/bottom_up_inference_layer/while/body/_44/bottom_up_inference_model/bottom_up_inference_layer/while/TensorListConcatV2_1}}]] 0 successful operations. 0 derived errors ignored. [Op:__inference_predict_function_4845]

Function call stack: predict_function -> predict_function

Xiaoyu-Tong commented 3 years ago

Hi @talmo, I just tried training the centroid/top-down model again, and it seems to be working! (It's being trained now. I haven't got any results yet, so I'm not sure whether training and prediction indeed work well.) Thank you very much for fixing the issue! Can you please take a look at the most recent update I made in this issue about the bottom-up model? (I edited the skeleton and retrained the model, but the bottom-up prediction is still not working.) Thank you!

Xiaoyu-Tong commented 3 years ago

Update:

I ran "!sleap-train baseline_medium_rf.topdown.json 2BMv2_ImplantBranch.pkg.slp", which went smoothly. But when I ran "!sleap-train baseline.centroid.json 2BMv2_ImplantBranch.pkg.slp", it still gave me the following error: 2021-03-22 19:50:40.383868: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1 INFO:sleap.nn.training:Versions: SLEAP: 1.1.1 TensorFlow: 2.3.1 Numpy: 1.18.5 Python: 3.7.10 OS: Linux-4.19.112+-x86_64-with-Ubuntu-18.04-bionic INFO:sleap.nn.training:Training labels file: 2BMv2_ImplantBranch.pkg.slp INFO:sleap.nn.training:Training profile: /usr/local/lib/python3.7/dist-packages/sleap/training_profiles/baseline.centroid.json INFO:sleap.nn.training: INFO:sleap.nn.training:Arguments: INFO:sleap.nn.training:{ "training_job_path": "baseline.centroid.json", "labels_path": "2BMv2_ImplantBranch.pkg.slp", "video_paths": "", "val_labels": null, "test_labels": null, "tensorboard": false, "save_viz": false, "zmq": false, "run_name": "2BMv2_ImplantBranch.centroid", "prefix": "", "suffix": "" } INFO:sleap.nn.training: INFO:sleap.nn.training:Training job: INFO:sleap.nn.training:{ "data": { "labels": { "training_labels": null, "validation_labels": null, "validation_fraction": 0.1, "test_labels": null, "split_by_inds": false, "training_inds": null, "validation_inds": null, "test_inds": null, "search_path_hints": [], "skeletons": [] }, "preprocessing": { "ensure_rgb": false, "ensure_grayscale": false, "imagenet_mode": null, "input_scaling": 0.5, "pad_to_stride": null, "resize_and_pad_to_target": true, "target_height": null, "target_width": null }, "instance_cropping": { "center_on_part": null, "crop_size": null, "crop_size_detection_padding": 16 } }, "model": { "backbone": { "leap": null, "unet": { "stem_stride": null, "max_stride": 16, "output_stride": 2, "filters": 16, "filters_rate": 2.0, "middle_block": true, "up_interpolate": true, "stacks": 1 }, "hourglass": null, "resnet": null, 
"pretrained_encoder": null }, "heads": { "single_instance": null, "centroid": { "anchor_part": null, "sigma": 5.0, "output_stride": 2, "offset_refinement": false }, "centered_instance": null, "multi_instance": null } }, "optimization": { "preload_data": true, "augmentation_config": { "rotate": true, "rotation_min_angle": -180.0, "rotation_max_angle": 180.0, "translate": false, "translate_min": -5, "translate_max": 5, "scale": false, "scale_min": 0.9, "scale_max": 1.1, "uniform_noise": false, "uniform_noise_min_val": 0.0, "uniform_noise_max_val": 10.0, "gaussian_noise": false, "gaussian_noise_mean": 5.0, "gaussian_noise_stddev": 1.0, "contrast": false, "contrast_min_gamma": 0.5, "contrast_max_gamma": 2.0, "brightness": false, "brightness_min_val": 0.0, "brightness_max_val": 10.0, "random_crop": false, "random_crop_height": 256, "random_crop_width": 256, "random_flip": false, "flip_horizontal": true }, "online_shuffling": true, "shuffle_buffer_size": 128, "prefetch": true, "batch_size": 4, "batches_per_epoch": null, "min_batches_per_epoch": 200, "val_batches_per_epoch": null, "min_val_batches_per_epoch": 10, "epochs": 200, "optimizer": "adam", "initial_learning_rate": 0.0001, "learning_rate_schedule": { "reduce_on_plateau": true, "reduction_factor": 0.5, "plateau_min_delta": 1e-06, "plateau_patience": 5, "plateau_cooldown": 3, "min_learning_rate": 1e-08 }, "hard_keypoint_mining": { "online_mining": false, "hard_to_easy_ratio": 2.0, "min_hard_keypoints": 2, "max_hard_keypoints": null, "loss_scale": 5.0 }, "early_stopping": { "stop_training_on_plateau": true, "plateau_min_delta": 1e-06, "plateau_patience": 10 } }, "outputs": { "save_outputs": true, "run_name": "2BMv2_ImplantBranch.centroid", "run_name_prefix": "", "run_name_suffix": null, "runs_folder": "models", "tags": [], "save_visualizations": true, "delete_viz_images": true, "zip_outputs": false, "log_to_csv": true, "checkpointing": { "initial_model": false, "best_model": true, "every_epoch": false, 
"latest_model": false, "final_model": false }, "tensorboard": { "write_logs": false, "loss_frequency": "epoch", "architecture_graph": false, "profile_graph": false, "visualizations": true }, "zmq": { "subscribe_to_controller": false, "controller_address": "tcp://127.0.0.1:9000", "controller_polling_timeout": 10, "publish_updates": false, "publish_address": "tcp://127.0.0.1:9001" } }, "name": "", "description": "", "sleap_version": "1.1.1", "filename": "/usr/local/lib/python3.7/dist-packages/sleap/training_profiles/baseline.centroid.json" } INFO:sleap.nn.training: INFO:sleap.nn.training:System: 2021-03-22 19:50:41.953850: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcuda.so.1 2021-03-22 19:50:41.979840: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-22 19:50:41.980522: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1716] Found device 0 with properties: pciBusID: 0000:00:04.0 name: Tesla V100-SXM2-16GB computeCapability: 7.0 coreClock: 1.53GHz coreCount: 80 deviceMemorySize: 15.78GiB deviceMemoryBandwidth: 836.37GiB/s 2021-03-22 19:50:41.980596: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1 2021-03-22 19:50:41.982335: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcublas.so.10 2021-03-22 19:50:41.984356: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcufft.so.10 2021-03-22 19:50:41.984717: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcurand.so.10 2021-03-22 19:50:41.986916: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcusolver.so.10 2021-03-22 19:50:41.987892: I 
tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcusparse.so.10 2021-03-22 19:50:41.991685: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudnn.so.7 2021-03-22 19:50:41.991873: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-22 19:50:41.992621: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-22 19:50:41.993266: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1858] Adding visible gpu devices: 0 GPUs: 1/1 available Device: /physical_device:GPU:0 Available: True Initalized: False Memory growth: True INFO:sleap.nn.training: INFO:sleap.nn.training:Initializing trainer... INFO:sleap.nn.training:Loading training labels from: 2BMv2_ImplantBranch.pkg.slp INFO:sleap.nn.training:Creating training and validation splits from validation fraction: 0.1 INFO:sleap.nn.training: Splits: Training = 581 / Validation = 64. INFO:sleap.nn.training:Setting up for training... INFO:sleap.nn.training:Setting up pipeline builders... INFO:sleap.nn.training:Setting up model... INFO:sleap.nn.training:Building test pipeline... 2021-03-22 19:50:43.125614: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN)to use the following CPU instructions in performance-critical operations: AVX2 AVX512F FMA To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. 
2021-03-22 19:50:43.130396: I tensorflow/core/platform/profile_utils/cpu_utils.cc:104] CPU Frequency: 2000160000 Hz 2021-03-22 19:50:43.130617: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x558c0eb1afc0 initialized for platform Host (this does not guarantee that XLA will be used). Devices: 2021-03-22 19:50:43.130653: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Host, Default Version 2021-03-22 19:50:43.225023: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-22 19:50:43.226184: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x558c0eb1bdc0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices: 2021-03-22 19:50:43.226222: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Tesla V100-SXM2-16GB, Compute Capability 7.0 2021-03-22 19:50:43.226449: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-22 19:50:43.227012: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1716] Found device 0 with properties: pciBusID: 0000:00:04.0 name: Tesla V100-SXM2-16GB computeCapability: 7.0 coreClock: 1.53GHz coreCount: 80 deviceMemorySize: 15.78GiB deviceMemoryBandwidth: 836.37GiB/s 2021-03-22 19:50:43.227119: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1 2021-03-22 19:50:43.227169: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcublas.so.10 2021-03-22 19:50:43.227205: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcufft.so.10 2021-03-22 19:50:43.227236: I 
tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcurand.so.10 2021-03-22 19:50:43.227264: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcusolver.so.10 2021-03-22 19:50:43.227294: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcusparse.so.10 2021-03-22 19:50:43.227325: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudnn.so.7 2021-03-22 19:50:43.227422: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-22 19:50:43.228041: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-22 19:50:43.228578: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1858] Adding visible gpu devices: 0 2021-03-22 19:50:43.228655: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1 2021-03-22 19:50:43.722938: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1257] Device interconnect StreamExecutor with strength 1 edge matrix: 2021-03-22 19:50:43.722997: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1263] 0 2021-03-22 19:50:43.723010: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1276] 0: N 2021-03-22 19:50:43.723254: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-22 19:50:43.723935: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning 
NUMA node zero 2021-03-22 19:50:43.724525: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1402] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 14764 MB memory) -> physical GPU (device: 0, name: Tesla V100-SXM2-16GB, pci bus id: 0000:00:04.0, compute capability: 7.0) INFO:sleap.nn.training:Loaded test example. [2.173s] INFO:sleap.nn.training: Input shape: (256, 336, 3) INFO:sleap.nn.training:Created Keras model. INFO:sleap.nn.training: Backbone: UNet(stacks=1, filters=16, filters_rate=2.0, kernel_size=3, stem_kernel_size=7, convs_per_block=2, stem_blocks=0, down_blocks=4, middle_block=True, up_blocks=3, up_interpolate=True, block_contraction=False) INFO:sleap.nn.training: Max stride: 16 INFO:sleap.nn.training: Parameters: 1,953,393 INFO:sleap.nn.training: Heads: INFO:sleap.nn.training: [0] = CentroidConfmapsHead(anchor_part=None, sigma=5.0, output_stride=2, loss_weight=1.0) INFO:sleap.nn.training: Outputs: INFO:sleap.nn.training: [0] = Tensor("CentroidConfmapsHead_0/BiasAdd:0", shape=(None, 128, 168, 1), dtype=float32) INFO:sleap.nn.training:Setting up data pipelines... INFO:sleap.nn.training:Training set: n = 581 INFO:sleap.nn.training:Validation set: n = 64 INFO:sleap.nn.training:Setting up optimization... INFO:sleap.nn.training: Learning rate schedule: LearningRateScheduleConfig(reduce_on_plateau=True, reduction_factor=0.5, plateau_min_delta=1e-06, plateau_patience=5, plateau_cooldown=3, min_learning_rate=1e-08) INFO:sleap.nn.training: Early stopping: EarlyStoppingConfig(stop_training_on_plateau=True, plateau_min_delta=1e-06, plateau_patience=10) INFO:sleap.nn.training:Setting up outputs... INFO:sleap.nn.training:Created run path: models/2BMv2_ImplantBranch.centroid INFO:sleap.nn.training:Setting up visualization... WARNING:tensorflow:Model was constructed with shape (None, 256, 336, 3) for input Tensor("input:0", shape=(None, 256, 336, 3), dtype=float32), but it was called on an input with incompatible shape (None, 492, 656, 3). 
Traceback (most recent call last): File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/framework/ops.py", line 1812, in _create_c_op c_op = pywrap_tf_session.TF_FinishOperation(op_desc) tensorflow.python.framework.errors_impl.InvalidArgumentError: Dimension 1 in both shapes must be equal, but are 123 and 124. Shapes are [?,123,164] and [?,124,164]. for '{{node functional_1/stack0_dec1_s8_to_s4_skip_concat/concat}} = ConcatV2[N=2, T=DT_FLOAT, Tidx=DT_INT32](functional_1/stack0_enc2_act1_relu/Relu, functional_1/stack0_dec1_s8_to_s4_interp_bilinear/resize/ResizeBilinear, functional_1/stack0_dec1_s8_to_s4_skip_concat/concat/axis)' with input shapes: [?,123,164,64], [?,124,164,128], [] and with computed input tensors: input[2] = <3>.

During handling of the above exception, another exception occurred:

Traceback (most recent call last): File "/usr/local/bin/sleap-train", line 8, in sys.exit(main()) File "/usr/local/lib/python3.7/dist-packages/sleap/nn/training.py", line 1582, in main trainer.train() File "/usr/local/lib/python3.7/dist-packages/sleap/nn/training.py", line 875, in train self.setup() File "/usr/local/lib/python3.7/dist-packages/sleap/nn/training.py", line 869, in setup self._setup_visualization() File "/usr/local/lib/python3.7/dist-packages/sleap/nn/training.py", line 1142, in _setup_visualization training_viz_ds_iter = iter(self.training_viz_pipeline.make_dataset()) File "/usr/local/lib/python3.7/dist-packages/sleap/nn/data/pipelines.py", line 282, in make_dataset ds = transformer.transform_dataset(ds) File "/usr/local/lib/python3.7/dist-packages/sleap/nn/data/inference.py", line 40, in transform_dataset keras_model = tf.keras.Model(input_layers, self.keras_model(input_layers)) File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/engine/base_layer.py", line 926, in call input_list) File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/engine/base_layer.py", line 1117, in _functional_construction_call outputs = call_fn(cast_inputs, *args, kwargs) File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/engine/functional.py", line 386, in call inputs, training=training, mask=mask) File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/engine/functional.py", line 508, in _run_internal_graph outputs = node.layer(*args, *kwargs) File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/engine/base_layer.py", line 926, in call input_list) File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/engine/base_layer.py", line 1117, in _functional_construction_call outputs = call_fn(cast_inputs, args, kwargs) File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/layers/merge.py", line 183, in call return self._merge_function(inputs) File 
"/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/layers/merge.py", line 522, in _merge_function return K.concatenate(inputs, axis=self.axis) File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/util/dispatch.py", line 201, in wrapper return target(*args, *kwargs) File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/backend.py", line 2881, in concatenate return array_ops.concat([to_dense(x) for x in tensors], axis) File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/util/dispatch.py", line 201, in wrapper return target(args, **kwargs) File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/array_ops.py", line 1654, in concat return gen_array_ops.concat_v2(values=values, axis=axis, name=name) File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/gen_array_ops.py", line 1222, in concat_v2 "ConcatV2", values=values, axis=axis, name=name) File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/framework/op_def_library.py", line 744, in _apply_op_helper attrs=attr_protos, op_def=op_def) File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/framework/func_graph.py", line 593, in _create_op_internal compute_device) File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/framework/ops.py", line 3485, in _create_op_internal op_def=op_def) File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/framework/ops.py", line 1975, in init control_input_ops, op_def) File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/framework/ops.py", line 1815, in _create_c_op raise ValueError(str(e)) ValueError: Dimension 1 in both shapes must be equal, but are 123 and 124. Shapes are [?,123,164] and [?,124,164]. 
for '{{node functional_1/stack0_dec1_s8_to_s4_skip_concat/concat}} = ConcatV2[N=2, T=DT_FLOAT, Tidx=DT_INT32](functional_1/stack0_enc2_act1_relu/Relu, functional_1/stack0_dec1_s8_to_s4_interp_bilinear/resize/ResizeBilinear, functional_1/stack0_dec1_s8_to_s4_skip_concat/concat/axis)' with input shapes: [?,123,164,64], [?,124,164,128], [] and with computed input tensors: input[2] = <3>.

talmo commented 3 years ago

Hey @Xiaoyu-Tong,

You're really finding all the bugs! We fixed that latest issue with the centroid, and it'll be in the next minor release (I'll update you here).

Still working on the bottom-up problem.

Thanks for the reports!

Xiaoyu-Tong commented 3 years ago

Sounds great. Thanks!

talmo commented 3 years ago

Hi @Xiaoyu-Tong,

You should now be able to train your top-down centroid models correctly in SLEAP v1.1.2. Feel free to re-open if this is not the case!

Do you mind opening another issue if you're still having problems with bottom-up with the refined skeleton in v1.1.2?

Thanks!!

Talmo