Closed Xiaoyu-Tong closed 3 years ago
I tried to set "convert images to greyscale" before exporting the training package. I got a different error, shown below, when I tried to train the model:
2021-03-17 23:46:22.913327: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1
INFO:sleap.nn.training:Versions:
SLEAP: 1.1.0
TensorFlow: 2.3.1
Numpy: 1.18.5
Python: 3.7.10
OS: Linux-4.19.112+-x86_64-with-Ubuntu-18.04-bionic
INFO:sleap.nn.training:Training labels file: 2BMv2.pkg.slp
INFO:sleap.nn.training:Training profile: /usr/local/lib/python3.7/dist-packages/sleap/training_profiles/baseline_medium_rf.bottomup.json
INFO:sleap.nn.training:
INFO:sleap.nn.training:Arguments:
INFO:sleap.nn.training:{
"training_job_path": "baseline_medium_rf.bottomup.json",
"labels_path": "2BMv2.pkg.slp",
"video_paths": "",
"val_labels": null,
"test_labels": null,
"tensorboard": false,
"save_viz": false,
"zmq": false,
"run_name": "2BMv2.bottomup",
"prefix": "",
"suffix": ""
}
INFO:sleap.nn.training:
INFO:sleap.nn.training:Training job:
INFO:sleap.nn.training:{
"data": {
"labels": {
"training_labels": null,
"validation_labels": null,
"validation_fraction": 0.1,
"test_labels": null,
"split_by_inds": false,
"training_inds": null,
"validation_inds": null,
"test_inds": null,
"search_path_hints": [],
"skeletons": []
},
"preprocessing": {
"ensure_rgb": false,
"ensure_grayscale": false,
"imagenet_mode": null,
"input_scaling": 1.0,
"pad_to_stride": null,
"resize_and_pad_to_target": true,
"target_height": null,
"target_width": null
},
"instance_cropping": {
"center_on_part": null,
"crop_size": null,
"crop_size_detection_padding": 16
}
},
"model": {
"backbone": {
"leap": null,
"unet": {
"stem_stride": null,
"max_stride": 32,
"output_stride": 4,
"filters": 16,
"filters_rate": 2.0,
"middle_block": true,
"up_interpolate": true,
"stacks": 1
},
"hourglass": null,
"resnet": null,
"pretrained_encoder": null
},
"heads": {
"single_instance": null,
"centroid": null,
"centered_instance": null,
"multi_instance": {
"confmaps": {
"part_names": null,
"sigma": 2.5,
"output_stride": 4,
"loss_weight": 1.0,
"offset_refinement": false
},
"pafs": {
"edges": null,
"sigma": 75.0,
"output_stride": 8,
"loss_weight": 1.0
}
}
}
},
"optimization": {
"preload_data": true,
"augmentation_config": {
"rotate": true,
"rotation_min_angle": -180.0,
"rotation_max_angle": 180.0,
"translate": false,
"translate_min": -5,
"translate_max": 5,
"scale": false,
"scale_min": 0.9,
"scale_max": 1.1,
"uniform_noise": false,
"uniform_noise_min_val": 0.0,
"uniform_noise_max_val": 10.0,
"gaussian_noise": false,
"gaussian_noise_mean": 5.0,
"gaussian_noise_stddev": 1.0,
"contrast": false,
"contrast_min_gamma": 0.5,
"contrast_max_gamma": 2.0,
"brightness": false,
"brightness_min_val": 0.0,
"brightness_max_val": 10.0,
"random_crop": false,
"random_crop_height": 256,
"random_crop_width": 256,
"random_flip": false,
"flip_horizontal": true
},
"online_shuffling": true,
"shuffle_buffer_size": 128,
"prefetch": true,
"batch_size": 4,
"batches_per_epoch": null,
"min_batches_per_epoch": 200,
"val_batches_per_epoch": null,
"min_val_batches_per_epoch": 10,
"epochs": 200,
"optimizer": "adam",
"initial_learning_rate": 0.0001,
"learning_rate_schedule": {
"reduce_on_plateau": true,
"reduction_factor": 0.5,
"plateau_min_delta": 1e-06,
"plateau_patience": 8,
"plateau_cooldown": 3,
"min_learning_rate": 1e-08
},
"hard_keypoint_mining": {
"online_mining": false,
"hard_to_easy_ratio": 2.0,
"min_hard_keypoints": 2,
"max_hard_keypoints": null,
"loss_scale": 5.0
},
"early_stopping": {
"stop_training_on_plateau": true,
"plateau_min_delta": 1e-06,
"plateau_patience": 10
}
},
"outputs": {
"save_outputs": true,
"run_name": "2BMv2.bottomup",
"run_name_prefix": "",
"run_name_suffix": null,
"runs_folder": "models",
"tags": [],
"save_visualizations": true,
"delete_viz_images": true,
"zip_outputs": false,
"log_to_csv": true,
"checkpointing": {
"initial_model": false,
"best_model": true,
"every_epoch": false,
"latest_model": false,
"final_model": false
},
"tensorboard": {
"write_logs": false,
"loss_frequency": "epoch",
"architecture_graph": false,
"profile_graph": false,
"visualizations": true
},
"zmq": {
"subscribe_to_controller": false,
"controller_address": "tcp://127.0.0.1:9000",
"controller_polling_timeout": 10,
"publish_updates": false,
"publish_address": "tcp://127.0.0.1:9001"
}
},
"name": "",
"description": "",
"sleap_version": "1.1.0",
"filename": "/usr/local/lib/python3.7/dist-packages/sleap/training_profiles/baseline_medium_rf.bottomup.json"
}
INFO:sleap.nn.training:
INFO:sleap.nn.training:System:
2021-03-17 23:46:24.561511: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcuda.so.1
2021-03-17 23:46:24.586686: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-03-17 23:46:24.587347: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1716] Found device 0 with properties:
pciBusID: 0000:00:04.0 name: Tesla V100-SXM2-16GB computeCapability: 7.0
coreClock: 1.53GHz coreCount: 80 deviceMemorySize: 15.78GiB deviceMemoryBandwidth: 836.37GiB/s
2021-03-17 23:46:24.587422: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1
2021-03-17 23:46:24.763484: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcublas.so.10
2021-03-17 23:46:24.775344: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcufft.so.10
2021-03-17 23:46:24.780991: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcurand.so.10
2021-03-17 23:46:24.795277: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcusolver.so.10
2021-03-17 23:46:24.815002: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcusparse.so.10
2021-03-17 23:46:25.162332: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudnn.so.7
2021-03-17 23:46:25.162614: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-03-17 23:46:25.163372: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-03-17 23:46:25.163923: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1858] Adding visible gpu devices: 0
GPUs: 1/1 available
Device: /physical_device:GPU:0
Available: True
Initalized: False
Memory growth: True
INFO:sleap.nn.training:
INFO:sleap.nn.training:Initializing trainer...
INFO:sleap.nn.training:Loading training labels from: 2BMv2.pkg.slp
INFO:sleap.nn.training:Creating training and validation splits from validation fraction: 0.1
INFO:sleap.nn.training: Splits: Training = 581 / Validation = 64.
INFO:sleap.nn.training:Setting up for training...
INFO:sleap.nn.training:Setting up pipeline builders...
INFO:sleap.nn.training:Setting up model...
INFO:sleap.nn.training:Building test pipeline...
2021-03-17 23:46:28.857347: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN)to use the following CPU instructions in performance-critical operations: AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-03-17 23:46:28.862459: I tensorflow/core/platform/profile_utils/cpu_utils.cc:104] CPU Frequency: 2000129999 Hz
2021-03-17 23:46:28.862731: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x5611e8300fc0 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2021-03-17 23:46:28.862769: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Host, Default Version
2021-03-17 23:46:28.951354: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-03-17 23:46:28.952317: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x5611e8301dc0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2021-03-17 23:46:28.952354: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Tesla V100-SXM2-16GB, Compute Capability 7.0
2021-03-17 23:46:28.952691: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-03-17 23:46:28.953349: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1716] Found device 0 with properties:
pciBusID: 0000:00:04.0 name: Tesla V100-SXM2-16GB computeCapability: 7.0
coreClock: 1.53GHz coreCount: 80 deviceMemorySize: 15.78GiB deviceMemoryBandwidth: 836.37GiB/s
2021-03-17 23:46:28.953540: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1
2021-03-17 23:46:28.953618: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcublas.so.10
2021-03-17 23:46:28.953650: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcufft.so.10
2021-03-17 23:46:28.953674: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcurand.so.10
2021-03-17 23:46:28.953709: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcusolver.so.10
2021-03-17 23:46:28.953753: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcusparse.so.10
2021-03-17 23:46:28.953783: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudnn.so.7
2021-03-17 23:46:28.953916: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-03-17 23:46:28.954641: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-03-17 23:46:28.955223: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1858] Adding visible gpu devices: 0
2021-03-17 23:46:28.955316: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1
2021-03-17 23:46:29.639419: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1257] Device interconnect StreamExecutor with strength 1 edge matrix:
2021-03-17 23:46:29.639484: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1263] 0
2021-03-17 23:46:29.639508: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1276] 0: N
2021-03-17 23:46:29.639858: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-03-17 23:46:29.640884: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-03-17 23:46:29.641709: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1402] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 14764 MB memory) -> physical GPU (device: 0, name: Tesla V100-SXM2-16GB, pci bus id: 0000:00:04.0, compute capability: 7.0)
INFO:sleap.nn.training:Loaded test example. [3.147s]
INFO:sleap.nn.training: Input shape: (512, 672, 3)
INFO:sleap.nn.training:Created Keras model.
INFO:sleap.nn.training: Backbone: UNet(stacks=1, filters=16, filters_rate=2.0, kernel_size=3, stem_kernel_size=7, convs_per_block=2, stem_blocks=0, down_blocks=5, middle_block=True, up_blocks=3, up_interpolate=True, block_contraction=False)
INFO:sleap.nn.training: Max stride: 32
INFO:sleap.nn.training: Parameters: 7,820,663
INFO:sleap.nn.training: Heads:
INFO:sleap.nn.training: [0] = MultiInstanceConfmapsHead(part_names=['Ear_left', 'Ear_right', 'Nose', 'Head', 'Neck', 'Center', 'Lateral_left', 'Lateral_right', 'Tail_base'], sigma=2.5, output_stride=4, loss_weight=1.0)
INFO:sleap.nn.training: [1] = PartAffinityFieldsHead(edges=[('Ear_left', 'Ear_right'), ('Ear_left', 'Nose'), ('Ear_left', 'Head'), ('Ear_right', 'Nose'), ('Ear_right', 'Head'), ('Nose', 'Head'), ('Head', 'Neck'), ('Neck', 'Center'), ('Neck', 'Lateral_left'), ('Neck', 'Lateral_right'), ('Center', 'Lateral_left'), ('Center', 'Lateral_right'), ('Center', 'Tail_base'), ('Lateral_left', 'Tail_base'), ('Lateral_right', 'Tail_base')], sigma=75.0, output_stride=8, loss_weight=1.0)
INFO:sleap.nn.training: Outputs:
INFO:sleap.nn.training: [0] = Tensor("MultiInstanceConfmapsHead_0/BiasAdd:0", shape=(None, 128, 168, 9), dtype=float32)
INFO:sleap.nn.training: [1] = Tensor("PartAffinityFieldsHead_0/BiasAdd:0", shape=(None, 64, 84, 30), dtype=float32)
INFO:sleap.nn.training:Setting up data pipelines...
INFO:sleap.nn.training:Training set: n = 581
INFO:sleap.nn.training:Validation set: n = 64
INFO:sleap.nn.training:Setting up optimization...
INFO:sleap.nn.training: Learning rate schedule: LearningRateScheduleConfig(reduce_on_plateau=True, reduction_factor=0.5, plateau_min_delta=1e-06, plateau_patience=8, plateau_cooldown=3, min_learning_rate=1e-08)
INFO:sleap.nn.training: Early stopping: EarlyStoppingConfig(stop_training_on_plateau=True, plateau_min_delta=1e-06, plateau_patience=10)
INFO:sleap.nn.training:Setting up outputs...
INFO:sleap.nn.training:Created run path: models/2BMv2.bottomup
INFO:sleap.nn.training:Setting up visualization...
WARNING:tensorflow:Model was constructed with shape (None, 512, 672, 3) for input Tensor("input:0", shape=(None, 512, 672, 3), dtype=float32), but it was called on an input with incompatible shape (None, 512, 672, 1).
Traceback (most recent call last):
File "/usr/local/bin/sleap-train", line 8, in
Setting "convert images to RGB" gave me the same error as not specifying any "convert images to" option.
I did a little more investigation into this error. Based on the GUI, some of my videos have 3 channels and some have 1 channel. I want to use them together, and converting them to either RGB or greyscale is fine with me. Based on the error message, I am guessing that your code is trying to load RGB images if I set "convert images to greyscale". I am not sure whether the mixture of video formats is what caused this problem. Thank you so much in advance for your help!
Hi @Xiaoyu-Tong,
Just wanted to give you a quick update that we're working on this issue.
While we do that, do you mind telling us a little more about the video formats? Are they all RGB or grayscale originally?
SLEAP should be able to handle this case anyway, but while we track it down, one potential workaround is to replace them with the same files by going to File -> Replace videos... and importing them all consistently as RGB or grayscale.
If you get a chance to try that, let me know if it worked and it'll help us find the source of the error.
Thanks!
Hi, @talmo
Thanks for the update! The videos are originally grayscale mp4 files. I tried to replace the one that is different from the others, but did not find a way to specify its channel number, so after the replacement the difference remains. It is probably worth mentioning that I imported most of my "videos" as figures (from DLC), so SLEAP does not have access to the entire videos. These video files are originally grayscale, but the imported images are RGB. In any case, it worked well before I added a full video to the project. The full video is grayscale and remains grayscale after import, and I did not find a way to specify its channel number. As I mentioned above, I have also tried to convert them to RGB/grayscale before training, but no luck.
Thanks!
Ah shoot, I thought we had added an option to set the number of channels from the replace video dialog. This is on our to-do list!
Again, it shouldn't have been an issue to begin with, but as a workaround open up python/ipython and run these lines:
import sleap
labels = sleap.load_file('2BMv2.slp')
for vid in labels.videos:
vid.backend.grayscale = True
labels.save('2BMv2.gray.slp')
Replace '2BMv2.slp' with the filename of your labels file if that's not the right one. After running this, you'll get another copy of the same labels with the filename 2BMv2.gray.slp, which you can then check to see if all the videos show up with 1 channel in the GUI.
Hi @talmo ,
Thanks for the update. I tried the above code and reloaded 2BMv2.gray.slp. However, based on what is shown in the GUI, the other videos still have 3 channels. Maybe I can send you the slp file so you can examine what's wrong?
Thank you!
Hi @Xiaoyu-Tong,
Yup, that works! You can send it to sleap@princeton.edu. Thanks!
File sent via email. Thank you!
Hi @talmo ,
I think the dialog to specify the channel number when importing videos does work. However, when replacing videos, as well as when making predictions, there is no such dialog, and by default videos become grayscale. So basically there is no way for me to specify the channel number when I add new videos by making predictions. I think a quick fix may be to add a similar dialog when users replace videos (I am assuming that replacing videos will match labeled frames correctly).
Yup, I think I'd like to add it as an option in the GUI. Just FYI, I don't see the email to sleap@princeton.edu -- maybe it got spam filtered? You could try sending it as a Google Drive link? Or directly to talmo@princeton.edu?
Hi, I just forwarded the email to talmo@princeton.edu.
Hmm, I still haven't received it and I've gotten one from someone else in the meantime so I'm guessing it's not going to come through. What's your email? I can send you a link where you can upload it.
xiaoyu.tong@nyulangone.org
Thanks
Hmm, I still haven't received it and I've gotten one from someone else in the meantime so I'm guessing it's not going to come through. What's your email? I can send you a link where you can upload it.
Uploaded. Please let me know if you cannot receive it.
Cool, got it! Sent you a (potentially) fixed version.
Hi @talmo ,
Seems it helps and the model is now being trained! Thank you! How did you fix it?
Wonderful! Right so I basically did the same thing I had suggested in the code snippet earlier, but forced the video to be RGB instead of grayscale. The problem is that single image videos, like the ones you import from DLC, currently do not support automatic conversion to grayscale.
I think we overlooked this feature in the past because we assumed that the normalization would take care of it, but it seems we were mistaken! I'm going to keep this issue open until we either fix the normalization during training/inference, or allow for changing this setting per video in the GUI.
Thanks for the report and let me know if you run into more issues after training!
Hi,
When I tried to make predictions, I used the code (in Google Colab): !sleap-track 140630_SD5_behaviors_115680_131200.mp4 \ --tracking.tracker simple \ -m models/2BMv2.bottomup
But then it failed to predict and gave me the following error. Actually, yesterday I tried to make a brand new model by labeling 20 frames in the SLEAP GUI, and it gave me the same error. So I guess this is unrelated to the channel number issue. Any idea how to fix it? Thank you!
The error message
2021-03-19 17:55:07.391236: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1 Started inference at: 2021-03-19 17:55:09.043115 Args: data_path: 140630_SD5_behaviors_115680_131200.mp4 models: ['models/2BMv2.bottomup'] frames: None only_labeled_frames: False only_suggested_frames: False output: None no_empty_frames: False verbosity: rich video.dataset: None video.input_format: channels_last cpu: False first_gpu: False last_gpu: False gpu: 0 peak_threshold: 0.2 batch_size: 4 open_in_gui: False tracking.tracker: simple tracking.target_instance_count: None tracking.pre_cull_to_target: None tracking.pre_cull_iou_threshold: None tracking.post_connect_single_breaks: None tracking.clean_instance_count: None tracking.clean_iou_threshold: None tracking.similarity: None tracking.match: None tracking.track_window: None tracking.min_new_track_points: None tracking.min_match_points: None tracking.img_scale: None tracking.of_window_size: None tracking.of_max_levels: None tracking.kf_node_indices: None tracking.kf_init_frame_count: None 2021-03-19 17:55:09.045566: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcuda.so.1 2021-03-19 17:55:09.065636: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-19 17:55:09.066222: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1716] Found device 0 with properties: pciBusID: 0000:00:04.0 name: Tesla P100-PCIE-16GB computeCapability: 6.0 coreClock: 1.3285GHz coreCount: 56 deviceMemorySize: 15.90GiB deviceMemoryBandwidth: 681.88GiB/s 2021-03-19 17:55:09.066277: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1 2021-03-19 17:55:09.068061: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] 
Successfully opened dynamic library libcublas.so.10 2021-03-19 17:55:09.077751: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcufft.so.10 2021-03-19 17:55:09.078130: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcurand.so.10 2021-03-19 17:55:09.080076: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcusolver.so.10 2021-03-19 17:55:09.086896: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcusparse.so.10 2021-03-19 17:55:09.091941: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudnn.so.7 2021-03-19 17:55:09.092079: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-19 17:55:09.092701: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-19 17:55:09.093209: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1858] Adding visible gpu devices: 0 Versions: SLEAP: 1.1.1 TensorFlow: 2.3.1 Numpy: 1.18.5 Python: 3.7.10 OS: Linux-4.19.112+-x86_64-with-Ubuntu-18.04-bionic
System: GPUs: 1/1 available Device: /physical_device:GPU:0 Available: True Initalized: False Memory growth: True
2021-03-19 17:55:09.402452: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN)to use the following CPU instructions in performance-critical operations: AVX2 FMA To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. 2021-03-19 17:55:09.408680: I tensorflow/core/platform/profile_utils/cpu_utils.cc:104] CPU Frequency: 2299995000 Hz 2021-03-19 17:55:09.408909: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x555a7fe84f40 initialized for platform Host (this does not guarantee that XLA will be used). Devices: 2021-03-19 17:55:09.408945: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Host, Default Version 2021-03-19 17:55:09.504212: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-19 17:55:09.504937: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x555a7fe85100 initialized for platform CUDA (this does not guarantee that XLA will be used). 
Devices: 2021-03-19 17:55:09.504968: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0 2021-03-19 17:55:09.505190: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-19 17:55:09.505795: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1716] Found device 0 with properties: pciBusID: 0000:00:04.0 name: Tesla P100-PCIE-16GB computeCapability: 6.0 coreClock: 1.3285GHz coreCount: 56 deviceMemorySize: 15.90GiB deviceMemoryBandwidth: 681.88GiB/s 2021-03-19 17:55:09.505884: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1 2021-03-19 17:55:09.505936: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcublas.so.10 2021-03-19 17:55:09.505965: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcufft.so.10 2021-03-19 17:55:09.505997: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcurand.so.10 2021-03-19 17:55:09.506024: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcusolver.so.10 2021-03-19 17:55:09.506050: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcusparse.so.10 2021-03-19 17:55:09.506075: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudnn.so.7 2021-03-19 17:55:09.506166: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-19 17:55:09.506802: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful 
NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-19 17:55:09.507307: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1858] Adding visible gpu devices: 0 2021-03-19 17:55:09.507400: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1 2021-03-19 17:55:09.947734: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1257] Device interconnect StreamExecutor with strength 1 edge matrix: 2021-03-19 17:55:09.947800: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1263] 0 2021-03-19 17:55:09.947815: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1276] 0: N 2021-03-19 17:55:09.948073: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-19 17:55:09.948775: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-19 17:55:09.949342: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1402] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 14958 MB memory) -> physical GPU (device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0) Predicting... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0% ETA: -:--:-- ?2021-03-19 17:55:19.114744: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudnn.so.7 2021-03-19 17:55:19.862862: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcublas.so.10 Predicting... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0% ETA: -:--:-- ?2021-03-19 17:55:20.786519: W tensorflow/core/framework/op_kernel.cc:1755] Unknown: AssertionError: Traceback (most recent call last):
File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/script_ops.py", line 242, in call return func(device, token, args)
File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/script_ops.py", line 131, in call ret = self._func(*args)
File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/autograph/impl/api.py", line 302, in wrapper return func(*args, **kwargs)
File "/tmp/tmpqf6m3m4z.py", line 20, in _group_instances_sample retval1 = ag.converted_call(ag.ld(group_instances_sample), (ag.ld(peaks_sample), ag.ld(peak_scores_sample), ag.ld(peak_channel_inds_sample), ag.ld(match_edge_inds_sample), ag.ld(match_src_peak_inds_sample), ag.ld(match_dst_peak_inds_sample), ag__.ld(match_line_scores_sample), ag.ld(n_nodes), ag.ld(n_edges), ag__.ld(edge_types), ag.ld(min_instance_peaks)), dict(min_line_scores=ag__.ld(min_line_scores)), fscope_1)
File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/autograph/impl/api.py", line 461, in converted_call return _call_unconverted(f, args, kwargs, options, False)
File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/autograph/impl/api.py", line 339, in _call_unconverted return f(*args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/sleap/nn/paf_grouping.py", line 1046, in group_instances_sample ) = make_predicted_instances(peaks, peak_scores, connections, instance_assignments)
File "/usr/local/lib/python3.7/dist-packages/sleap/nn/paf_grouping.py", line 903, in make_predicted_instances assert instance_ind == instance_assignments[dst_peak_id]
AssertionError
Predicting... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0% ETA: -:--:-- ?
Traceback (most recent call last):
File "/usr/local/bin/sleap-track", line 8, in
File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/script_ops.py", line 242, in call return func(device, token, args)
File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/script_ops.py", line 131, in call ret = self._func(*args)
File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/autograph/impl/api.py", line 302, in wrapper return func(*args, **kwargs)
File "/tmp/tmpqf6m3m4z.py", line 20, in _group_instances_sample retval1 = ag.converted_call(ag.ld(group_instances_sample), (ag.ld(peaks_sample), ag.ld(peak_scores_sample), ag.ld(peak_channel_inds_sample), ag.ld(match_edge_inds_sample), ag.ld(match_src_peak_inds_sample), ag.ld(match_dst_peak_inds_sample), ag__.ld(match_line_scores_sample), ag.ld(n_nodes), ag.ld(n_edges), ag__.ld(edge_types), ag.ld(min_instance_peaks)), dict(min_line_scores=ag__.ld(min_line_scores)), fscope_1)
File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/autograph/impl/api.py", line 461, in converted_call return _call_unconverted(f, args, kwargs, options, False)
File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/autograph/impl/api.py", line 339, in _call_unconverted return f(*args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/sleap/nn/paf_grouping.py", line 1046, in group_instances_sample ) = make_predicted_instances(peaks, peak_scores, connections, instance_assignments)
File "/usr/local/lib/python3.7/dist-packages/sleap/nn/paf_grouping.py", line 903, in make_predicted_instances assert instance_ind == instance_assignments[dst_peak_id]
AssertionError
[[{{node bottom_up_inference_model/bottom_up_inference_layer/while_2/body/_211/bottom_up_inference_model/bottom_up_inference_layer/while_2/EagerPyFunc}}]]
(1) Unknown: AssertionError: Traceback (most recent call last):
File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/script_ops.py", line 242, in call return func(device, token, args)
File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/script_ops.py", line 131, in call ret = self._func(*args)
File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/autograph/impl/api.py", line 302, in wrapper return func(*args, **kwargs)
File "/tmp/tmpqf6m3m4z.py", line 20, in _group_instances_sample retval1 = ag.converted_call(ag.ld(group_instances_sample), (ag.ld(peaks_sample), ag.ld(peak_scores_sample), ag.ld(peak_channel_inds_sample), ag.ld(match_edge_inds_sample), ag.ld(match_src_peak_inds_sample), ag.ld(match_dst_peak_inds_sample), ag__.ld(match_line_scores_sample), ag.ld(n_nodes), ag.ld(n_edges), ag__.ld(edge_types), ag.ld(min_instance_peaks)), dict(min_line_scores=ag__.ld(min_line_scores)), fscope_1)
File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/autograph/impl/api.py", line 461, in converted_call return _call_unconverted(f, args, kwargs, options, False)
File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/autograph/impl/api.py", line 339, in _call_unconverted return f(*args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/sleap/nn/paf_grouping.py", line 1046, in group_instances_sample ) = make_predicted_instances(peaks, peak_scores, connections, instance_assignments)
File "/usr/local/lib/python3.7/dist-packages/sleap/nn/paf_grouping.py", line 903, in make_predicted_instances assert instance_ind == instance_assignments[dst_peak_id]
AssertionError
[[{{node bottom_up_inference_model/bottom_up_inference_layer/while_2/body/_211/bottom_up_inference_model/bottom_up_inference_layer/while_2/EagerPyFunc}}]]
[[bottom_up_inference_model/bottom_up_inference_layer/while_2/body/_211/bottom_up_inference_model/bottom_up_inference_layer/while_2/Shape/_926]]
0 successful operations. 0 derived errors ignored. [Op:__inference_predict_function_5426]
Function call stack: predict_function -> predict_function
Ah yes, this is an issue with the skeleton configuration. When training from within the GUI it should give you a warning that the skeleton does not form an arborescence, but maybe we should add something on the CLI side as well.
So basically, you want to make sure that your skeleton contains no cycles and that no node has more than one parent. Right now, your skeleton violates this in multiple places, for example:
('Ear_left', 'Ear_right'), ('Ear_left', 'Nose'), ('Ear_left', 'Head'), ('Ear_right', 'Nose'), ('Ear_right', 'Head')
Ideally you'll just have:
('Head', 'Ear_left'), ('Head', 'Nose'), ('Head', 'Ear_right')
and so forth.
If you're coming from DLC, this is a common issue. I think they fundamentally misunderstand how the bottom-up (part affinity field) matching algorithm works -- by having multiple edges that point to the same child node, you can have conflicting groupings depending on which path you take, resulting in noisy part assignments when the animals are close together.
Try reducing your skeleton edges such that no node has two parents and you should be good to go. In the next version of SLEAP, we'll have this displayed more prominently in the skeleton editor.
Hi,
Thank you for the answer. I am trying the top-down model as I prefer to keep the complexity of the skeleton. By the way, when I tried to do the same thing for the centroid model (`!sleap-train baseline.centroid.json 2BMv2.pkg.slp --run_name "2BMv2.centroid"`), it gave me an error which prevented the training from happening. So I wonder if there is an issue with training the centroid model?
The error: 2021-03-19 18:02:44.808179: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1 INFO:sleap.nn.training:Versions: SLEAP: 1.1.1 TensorFlow: 2.3.1 Numpy: 1.18.5 Python: 3.7.10 OS: Linux-4.19.112+-x86_64-with-Ubuntu-18.04-bionic INFO:sleap.nn.training:Training labels file: 2BMv2.pkg.slp INFO:sleap.nn.training:Training profile: /usr/local/lib/python3.7/dist-packages/sleap/training_profiles/baseline.centroid.json INFO:sleap.nn.training: INFO:sleap.nn.training:Arguments: INFO:sleap.nn.training:{ "training_job_path": "baseline.centroid.json", "labels_path": "2BMv2.pkg.slp", "video_paths": "", "val_labels": null, "test_labels": null, "tensorboard": false, "save_viz": false, "zmq": false, "run_name": "2BMv2.centroid", "prefix": "", "suffix": "" } INFO:sleap.nn.training: INFO:sleap.nn.training:Training job: INFO:sleap.nn.training:{ "data": { "labels": { "training_labels": null, "validation_labels": null, "validation_fraction": 0.1, "test_labels": null, "split_by_inds": false, "training_inds": null, "validation_inds": null, "test_inds": null, "search_path_hints": [], "skeletons": [] }, "preprocessing": { "ensure_rgb": false, "ensure_grayscale": false, "imagenet_mode": null, "input_scaling": 0.5, "pad_to_stride": null, "resize_and_pad_to_target": true, "target_height": null, "target_width": null }, "instance_cropping": { "center_on_part": null, "crop_size": null, "crop_size_detection_padding": 16 } }, "model": { "backbone": { "leap": null, "unet": { "stem_stride": null, "max_stride": 16, "output_stride": 2, "filters": 16, "filters_rate": 2.0, "middle_block": true, "up_interpolate": true, "stacks": 1 }, "hourglass": null, "resnet": null, "pretrained_encoder": null }, "heads": { "single_instance": null, "centroid": { "anchor_part": null, "sigma": 5.0, "output_stride": 2, "offset_refinement": false }, "centered_instance": null, "multi_instance": null } }, "optimization": { "preload_data": true, 
"augmentation_config": { "rotate": true, "rotation_min_angle": -180.0, "rotation_max_angle": 180.0, "translate": false, "translate_min": -5, "translate_max": 5, "scale": false, "scale_min": 0.9, "scale_max": 1.1, "uniform_noise": false, "uniform_noise_min_val": 0.0, "uniform_noise_max_val": 10.0, "gaussian_noise": false, "gaussian_noise_mean": 5.0, "gaussian_noise_stddev": 1.0, "contrast": false, "contrast_min_gamma": 0.5, "contrast_max_gamma": 2.0, "brightness": false, "brightness_min_val": 0.0, "brightness_max_val": 10.0, "random_crop": false, "random_crop_height": 256, "random_crop_width": 256, "random_flip": false, "flip_horizontal": true }, "online_shuffling": true, "shuffle_buffer_size": 128, "prefetch": true, "batch_size": 4, "batches_per_epoch": null, "min_batches_per_epoch": 200, "val_batches_per_epoch": null, "min_val_batches_per_epoch": 10, "epochs": 200, "optimizer": "adam", "initial_learning_rate": 0.0001, "learning_rate_schedule": { "reduce_on_plateau": true, "reduction_factor": 0.5, "plateau_min_delta": 1e-06, "plateau_patience": 5, "plateau_cooldown": 3, "min_learning_rate": 1e-08 }, "hard_keypoint_mining": { "online_mining": false, "hard_to_easy_ratio": 2.0, "min_hard_keypoints": 2, "max_hard_keypoints": null, "loss_scale": 5.0 }, "early_stopping": { "stop_training_on_plateau": true, "plateau_min_delta": 1e-06, "plateau_patience": 10 } }, "outputs": { "save_outputs": true, "run_name": "2BMv2.centroid", "run_name_prefix": "", "run_name_suffix": null, "runs_folder": "models", "tags": [], "save_visualizations": true, "delete_viz_images": true, "zip_outputs": false, "log_to_csv": true, "checkpointing": { "initial_model": false, "best_model": true, "every_epoch": false, "latest_model": false, "final_model": false }, "tensorboard": { "write_logs": false, "loss_frequency": "epoch", "architecture_graph": false, "profile_graph": false, "visualizations": true }, "zmq": { "subscribe_to_controller": false, "controller_address": "tcp://127.0.0.1:9000", 
"controller_polling_timeout": 10, "publish_updates": false, "publish_address": "tcp://127.0.0.1:9001" } }, "name": "", "description": "", "sleap_version": "1.1.1", "filename": "/usr/local/lib/python3.7/dist-packages/sleap/training_profiles/baseline.centroid.json" } INFO:sleap.nn.training: INFO:sleap.nn.training:System: 2021-03-19 18:02:46.291431: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcuda.so.1 2021-03-19 18:02:46.311568: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-19 18:02:46.312165: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1716] Found device 0 with properties: pciBusID: 0000:00:04.0 name: Tesla P100-PCIE-16GB computeCapability: 6.0 coreClock: 1.3285GHz coreCount: 56 deviceMemorySize: 15.90GiB deviceMemoryBandwidth: 681.88GiB/s 2021-03-19 18:02:46.312226: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1 2021-03-19 18:02:46.313971: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcublas.so.10 2021-03-19 18:02:46.315857: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcufft.so.10 2021-03-19 18:02:46.316204: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcurand.so.10 2021-03-19 18:02:46.318229: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcusolver.so.10 2021-03-19 18:02:46.319164: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcusparse.so.10 2021-03-19 18:02:46.323225: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudnn.so.7 2021-03-19 
18:02:46.323354: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-19 18:02:46.323960: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-19 18:02:46.324462: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1858] Adding visible gpu devices: 0 GPUs: 1/1 available Device: /physical_device:GPU:0 Available: True Initalized: False Memory growth: True INFO:sleap.nn.training: INFO:sleap.nn.training:Initializing trainer... INFO:sleap.nn.training:Loading training labels from: 2BMv2.pkg.slp INFO:sleap.nn.training:Creating training and validation splits from validation fraction: 0.1 INFO:sleap.nn.training: Splits: Training = 581 / Validation = 64. INFO:sleap.nn.training:Setting up for training... INFO:sleap.nn.training:Setting up pipeline builders... INFO:sleap.nn.training:Setting up model... INFO:sleap.nn.training:Building test pipeline... 2021-03-19 18:02:47.406442: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN)to use the following CPU instructions in performance-critical operations: AVX2 FMA To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. 2021-03-19 18:02:47.410827: I tensorflow/core/platform/profile_utils/cpu_utils.cc:104] CPU Frequency: 2299995000 Hz 2021-03-19 18:02:47.411050: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x56164c058fc0 initialized for platform Host (this does not guarantee that XLA will be used). 
Devices: 2021-03-19 18:02:47.411082: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Host, Default Version 2021-03-19 18:02:47.498989: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-19 18:02:47.499829: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x56164c059a40 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices: 2021-03-19 18:02:47.499866: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0 2021-03-19 18:02:47.500083: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-19 18:02:47.500649: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1716] Found device 0 with properties: pciBusID: 0000:00:04.0 name: Tesla P100-PCIE-16GB computeCapability: 6.0 coreClock: 1.3285GHz coreCount: 56 deviceMemorySize: 15.90GiB deviceMemoryBandwidth: 681.88GiB/s 2021-03-19 18:02:47.500734: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1 2021-03-19 18:02:47.500778: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcublas.so.10 2021-03-19 18:02:47.500807: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcufft.so.10 2021-03-19 18:02:47.500834: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcurand.so.10 2021-03-19 18:02:47.500865: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcusolver.so.10 2021-03-19 18:02:47.500889: I 
tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcusparse.so.10 2021-03-19 18:02:47.500915: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudnn.so.7 2021-03-19 18:02:47.501006: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-19 18:02:47.501577: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-19 18:02:47.502097: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1858] Adding visible gpu devices: 0 2021-03-19 18:02:47.502182: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1 2021-03-19 18:02:47.930706: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1257] Device interconnect StreamExecutor with strength 1 edge matrix: 2021-03-19 18:02:47.930764: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1263] 0 2021-03-19 18:02:47.930776: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1276] 0: N 2021-03-19 18:02:47.930987: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-19 18:02:47.931662: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-19 18:02:47.932202: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1402] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 14958 MB memory) -> physical GPU (device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute 
capability: 6.0) INFO:sleap.nn.training:Loaded test example. [2.012s] INFO:sleap.nn.training: Input shape: (256, 336, 3) INFO:sleap.nn.training:Created Keras model. INFO:sleap.nn.training: Backbone: UNet(stacks=1, filters=16, filters_rate=2.0, kernel_size=3, stem_kernel_size=7, convs_per_block=2, stem_blocks=0, down_blocks=4, middle_block=True, up_blocks=3, up_interpolate=True, block_contraction=False) INFO:sleap.nn.training: Max stride: 16 INFO:sleap.nn.training: Parameters: 1,953,393 INFO:sleap.nn.training: Heads: INFO:sleap.nn.training: [0] = CentroidConfmapsHead(anchor_part=None, sigma=5.0, output_stride=2, loss_weight=1.0) INFO:sleap.nn.training: Outputs: INFO:sleap.nn.training: [0] = Tensor("CentroidConfmapsHead_0/BiasAdd:0", shape=(None, 128, 168, 1), dtype=float32) INFO:sleap.nn.training:Setting up data pipelines... INFO:sleap.nn.training:Training set: n = 581 INFO:sleap.nn.training:Validation set: n = 64 INFO:sleap.nn.training:Setting up optimization... INFO:sleap.nn.training: Learning rate schedule: LearningRateScheduleConfig(reduce_on_plateau=True, reduction_factor=0.5, plateau_min_delta=1e-06, plateau_patience=5, plateau_cooldown=3, min_learning_rate=1e-08) INFO:sleap.nn.training: Early stopping: EarlyStoppingConfig(stop_training_on_plateau=True, plateau_min_delta=1e-06, plateau_patience=10) INFO:sleap.nn.training:Setting up outputs... INFO:sleap.nn.training:Created run path: models/2BMv2.centroid INFO:sleap.nn.training:Setting up visualization... WARNING:tensorflow:Model was constructed with shape (None, 256, 336, 3) for input Tensor("input:0", shape=(None, 256, 336, 3), dtype=float32), but it was called on an input with incompatible shape (None, 492, 656, 3). 
Traceback (most recent call last): File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/framework/ops.py", line 1812, in _create_c_op c_op = pywrap_tf_session.TF_FinishOperation(op_desc) tensorflow.python.framework.errors_impl.InvalidArgumentError: Dimension 1 in both shapes must be equal, but are 123 and 124. Shapes are [?,123,164] and [?,124,164]. for '{{node functional_1/stack0_dec1_s8_to_s4_skip_concat/concat}} = ConcatV2[N=2, T=DT_FLOAT, Tidx=DT_INT32](functional_1/stack0_enc2_act1_relu/Relu, functional_1/stack0_dec1_s8_to_s4_interp_bilinear/resize/ResizeBilinear, functional_1/stack0_dec1_s8_to_s4_skip_concat/concat/axis)' with input shapes: [?,123,164,64], [?,124,164,128], [] and with computed input tensors: input[2] = <3>.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/local/bin/sleap-train", line 8, in
Eurgh, sorry about that -- looks like the same problem as #510.
We're working on that one in parallel. In the meantime, do you mind dropping the 2BMv2.pkg.slp
in the same Dropbox link I sent you? We'll make sure that the fix works on both of your datasets.
Thanks!!
OK, no worries! I have dropped the training package in the Dropbox for this issue. Thank you in advance for your help.
Do I also need to make sure the nodes do not form a cycle and that no node has two parents when using the top-down model? I ask because when I tried to use the top-down model to predict, it gave me the following error:
2021-03-19 18:21:41.665564: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1 Started inference at: 2021-03-19 18:21:43.123669 Args: data_path: 140630_SD5_behaviors_115680_131200.mp4 models: ['models/2BMv2.topdown_confmaps'] frames: None only_labeled_frames: False only_suggested_frames: False output: None no_empty_frames: False verbosity: rich video.dataset: None video.input_format: channels_last cpu: False first_gpu: False last_gpu: False gpu: 0 peak_threshold: 0.2 batch_size: 4 open_in_gui: False tracking.tracker: simple tracking.target_instance_count: None tracking.pre_cull_to_target: None tracking.pre_cull_iou_threshold: None tracking.post_connect_single_breaks: None tracking.clean_instance_count: None tracking.clean_iou_threshold: None tracking.similarity: None tracking.match: None tracking.track_window: None tracking.min_new_track_points: None tracking.min_match_points: None tracking.img_scale: None tracking.of_window_size: None tracking.of_max_levels: None tracking.kf_node_indices: None tracking.kf_init_frame_count: None 2021-03-19 18:21:43.126086: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcuda.so.1 2021-03-19 18:21:43.145928: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-19 18:21:43.146514: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1716] Found device 0 with properties: pciBusID: 0000:00:04.0 name: Tesla P100-PCIE-16GB computeCapability: 6.0 coreClock: 1.3285GHz coreCount: 56 deviceMemorySize: 15.90GiB deviceMemoryBandwidth: 681.88GiB/s 2021-03-19 18:21:43.146565: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1 2021-03-19 18:21:43.148523: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] 
Successfully opened dynamic library libcublas.so.10 2021-03-19 18:21:43.150439: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcufft.so.10 2021-03-19 18:21:43.150832: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcurand.so.10 2021-03-19 18:21:43.152573: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcusolver.so.10 2021-03-19 18:21:43.153817: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcusparse.so.10 2021-03-19 18:21:43.157753: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudnn.so.7 2021-03-19 18:21:43.157886: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-19 18:21:43.158472: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-19 18:21:43.158992: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1858] Adding visible gpu devices: 0 Versions: SLEAP: 1.1.1 TensorFlow: 2.3.1 Numpy: 1.18.5 Python: 3.7.10 OS: Linux-4.19.112+-x86_64-with-Ubuntu-18.04-bionic
System: GPUs: 1/1 available Device: /physical_device:GPU:0 Available: True Initalized: False Memory growth: True
2021-03-19 18:21:43.411734: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN)to use the following CPU instructions in performance-critical operations: AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-03-19 18:21:43.417823: I tensorflow/core/platform/profile_utils/cpu_utils.cc:104] CPU Frequency: 2299995000 Hz
2021-03-19 18:21:43.418048: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x55fb2bb10f40 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2021-03-19 18:21:43.418087: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Host, Default Version
2021-03-19 18:21:43.508961: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-03-19 18:21:43.509797: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x55fb2bb11100 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2021-03-19 18:21:43.509833: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0
2021-03-19 18:21:43.510060: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-03-19 18:21:43.510610: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1716] Found device 0 with properties:
pciBusID: 0000:00:04.0 name: Tesla P100-PCIE-16GB computeCapability: 6.0
coreClock: 1.3285GHz coreCount: 56 deviceMemorySize: 15.90GiB deviceMemoryBandwidth: 681.88GiB/s
2021-03-19 18:21:43.510694: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1
2021-03-19 18:21:43.510749: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcublas.so.10
2021-03-19 18:21:43.510780: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcufft.so.10
2021-03-19 18:21:43.510813: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcurand.so.10
2021-03-19 18:21:43.510839: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcusolver.so.10
2021-03-19 18:21:43.510864: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcusparse.so.10
2021-03-19 18:21:43.510890: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudnn.so.7
2021-03-19 18:21:43.510982: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-03-19 18:21:43.511568: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-03-19 18:21:43.512102: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1858] Adding visible gpu devices: 0
2021-03-19 18:21:43.512192: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1
2021-03-19 18:21:43.946276: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1257] Device interconnect StreamExecutor with strength 1 edge matrix:
2021-03-19 18:21:43.946345: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1263] 0
2021-03-19 18:21:43.946361: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1276] 0: N
2021-03-19 18:21:43.946618: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-03-19 18:21:43.947269: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-03-19 18:21:43.947858: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1402] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 14958 MB memory) -> physical GPU (device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0)
Predicting... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0% ETA: -:--:-- ?
Traceback (most recent call last):
File "/usr/local/bin/sleap-track", line 8, in
You do not, but you do need a centroid model (the one that gave you an error when you tried to train it) in addition to the centered instance model to do top-down inference.
I'll get back to you asap when we fix this image size issue. Sorry about that again!
OK I see!
Hi @talmo ,
I also tried to revise my skeleton so that it can be used to train a bottom-up model. I edited it so that every node except the root has exactly one parent and no cycles are formed, as shown below. However, when trying to make predictions, I still got the following error:
Any idea how to fix it? Thank you!
2021-03-19 20:30:12.804426: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1 Started inference at: 2021-03-19 20:30:14.341726 Args: data_path: 140630_SD5_behaviors_115680_131200.mp4 models: ['models/2BMv2_BottomUp.bottomup'] frames: None only_labeled_frames: False only_suggested_frames: False output: None no_empty_frames: False verbosity: rich video.dataset: None video.input_format: channels_last cpu: False first_gpu: False last_gpu: False gpu: 0 peak_threshold: 0.2 batch_size: 4 open_in_gui: False tracking.tracker: simple tracking.target_instance_count: None tracking.pre_cull_to_target: None tracking.pre_cull_iou_threshold: None tracking.post_connect_single_breaks: None tracking.clean_instance_count: None tracking.clean_iou_threshold: None tracking.similarity: None tracking.match: None tracking.track_window: None tracking.min_new_track_points: None tracking.min_match_points: None tracking.img_scale: None tracking.of_window_size: None tracking.of_max_levels: None tracking.kf_node_indices: None tracking.kf_init_frame_count: None 2021-03-19 20:30:14.346315: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcuda.so.1 2021-03-19 20:30:14.370451: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-19 20:30:14.371095: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1716] Found device 0 with properties: pciBusID: 0000:00:04.0 name: Tesla P100-PCIE-16GB computeCapability: 6.0 coreClock: 1.3285GHz coreCount: 56 deviceMemorySize: 15.90GiB deviceMemoryBandwidth: 681.88GiB/s 2021-03-19 20:30:14.371157: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1 2021-03-19 20:30:14.373666: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] 
Successfully opened dynamic library libcublas.so.10 2021-03-19 20:30:14.375685: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcufft.so.10 2021-03-19 20:30:14.376317: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcurand.so.10 2021-03-19 20:30:14.378399: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcusolver.so.10 2021-03-19 20:30:14.379826: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcusparse.so.10 2021-03-19 20:30:14.384641: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudnn.so.7 2021-03-19 20:30:14.384773: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-19 20:30:14.385372: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-19 20:30:14.385888: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1858] Adding visible gpu devices: 0 Versions: SLEAP: 1.1.1 TensorFlow: 2.3.1 Numpy: 1.18.5 Python: 3.7.10 OS: Linux-4.19.112+-x86_64-with-Ubuntu-18.04-bionic
System: GPUs: 1/1 available Device: /physical_device:GPU:0 Available: True Initalized: False Memory growth: True
2021-03-19 20:30:14.678050: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN)to use the following CPU instructions in performance-critical operations: AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-03-19 20:30:14.683679: I tensorflow/core/platform/profile_utils/cpu_utils.cc:104] CPU Frequency: 2299995000 Hz
2021-03-19 20:30:14.683930: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x55c6f7e08f40 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2021-03-19 20:30:14.683966: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Host, Default Version
2021-03-19 20:30:14.774451: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-03-19 20:30:14.775217: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x55c6f7e09100 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2021-03-19 20:30:14.775264: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0
2021-03-19 20:30:14.775512: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-03-19 20:30:14.776160: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1716] Found device 0 with properties:
pciBusID: 0000:00:04.0 name: Tesla P100-PCIE-16GB computeCapability: 6.0
coreClock: 1.3285GHz coreCount: 56 deviceMemorySize: 15.90GiB deviceMemoryBandwidth: 681.88GiB/s
2021-03-19 20:30:14.776316: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1
2021-03-19 20:30:14.776413: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcublas.so.10
2021-03-19 20:30:14.776446: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcufft.so.10
2021-03-19 20:30:14.776474: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcurand.so.10
2021-03-19 20:30:14.776505: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcusolver.so.10
2021-03-19 20:30:14.776532: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcusparse.so.10
2021-03-19 20:30:14.776558: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudnn.so.7
2021-03-19 20:30:14.776776: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-03-19 20:30:14.777397: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-03-19 20:30:14.777941: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1858] Adding visible gpu devices: 0
2021-03-19 20:30:14.778018: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1
2021-03-19 20:30:15.218348: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1257] Device interconnect StreamExecutor with strength 1 edge matrix:
2021-03-19 20:30:15.218402: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1263] 0
2021-03-19 20:30:15.218414: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1276] 0: N
2021-03-19 20:30:15.218664: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-03-19 20:30:15.219310: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-03-19 20:30:15.219872: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1402] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 14958 MB memory) -> physical GPU (device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0)
Predicting... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0% ETA: -:--:-- ?2021-03-19 20:30:23.347639: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudnn.so.7
2021-03-19 20:30:24.119046: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcublas.so.10
Predicting... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0% ETA: 0:14:28 17.8 FPS2021-03-19 20:30:28.115905: W tensorflow/core/framework/op_kernel.cc:1767] OP_REQUIRES failed at list_kernels.h:357 : Invalid argument: PartialTensorShape: Incompatible ranks during merge: 2 vs. 1
2021-03-19 20:30:28.115905: W tensorflow/core/framework/op_kernel.cc:1767] OP_REQUIRES failed at list_kernels.h:357 : Invalid argument: PartialTensorShape: Incompatible ranks during merge: 2 vs. 1
Predicting... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0% ETA: 0:14:28 17.8 FPS
Traceback (most recent call last):
File "/usr/local/bin/sleap-track", line 8, in
Function call stack: predict_function -> predict_function
Hi @talmo, I just tried training the centroid/top-down model again, and it seems to be working! (It is training now. I haven't gotten any results yet, so I'm not sure whether training and prediction actually work well.) Thank you very much for fixing the issue! Could you please take a look at the most recent update I made in this issue about the bottom-up model? (I edited the skeleton and retrained the model, but the bottom-up prediction is still not working.) Thank you!
Update:
I ran "!sleap-train baseline_medium_rf.topdown.json 2BMv2_ImplantBranch.pkg.slp", which went smoothly. But when I ran "!sleap-train baseline.centroid.json 2BMv2_ImplantBranch.pkg.slp", it still gave me the following error: 2021-03-22 19:50:40.383868: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1 INFO:sleap.nn.training:Versions: SLEAP: 1.1.1 TensorFlow: 2.3.1 Numpy: 1.18.5 Python: 3.7.10 OS: Linux-4.19.112+-x86_64-with-Ubuntu-18.04-bionic INFO:sleap.nn.training:Training labels file: 2BMv2_ImplantBranch.pkg.slp INFO:sleap.nn.training:Training profile: /usr/local/lib/python3.7/dist-packages/sleap/training_profiles/baseline.centroid.json INFO:sleap.nn.training: INFO:sleap.nn.training:Arguments: INFO:sleap.nn.training:{ "training_job_path": "baseline.centroid.json", "labels_path": "2BMv2_ImplantBranch.pkg.slp", "video_paths": "", "val_labels": null, "test_labels": null, "tensorboard": false, "save_viz": false, "zmq": false, "run_name": "2BMv2_ImplantBranch.centroid", "prefix": "", "suffix": "" } INFO:sleap.nn.training: INFO:sleap.nn.training:Training job: INFO:sleap.nn.training:{ "data": { "labels": { "training_labels": null, "validation_labels": null, "validation_fraction": 0.1, "test_labels": null, "split_by_inds": false, "training_inds": null, "validation_inds": null, "test_inds": null, "search_path_hints": [], "skeletons": [] }, "preprocessing": { "ensure_rgb": false, "ensure_grayscale": false, "imagenet_mode": null, "input_scaling": 0.5, "pad_to_stride": null, "resize_and_pad_to_target": true, "target_height": null, "target_width": null }, "instance_cropping": { "center_on_part": null, "crop_size": null, "crop_size_detection_padding": 16 } }, "model": { "backbone": { "leap": null, "unet": { "stem_stride": null, "max_stride": 16, "output_stride": 2, "filters": 16, "filters_rate": 2.0, "middle_block": true, "up_interpolate": true, "stacks": 1 }, "hourglass": null, "resnet": null, 
"pretrained_encoder": null }, "heads": { "single_instance": null, "centroid": { "anchor_part": null, "sigma": 5.0, "output_stride": 2, "offset_refinement": false }, "centered_instance": null, "multi_instance": null } }, "optimization": { "preload_data": true, "augmentation_config": { "rotate": true, "rotation_min_angle": -180.0, "rotation_max_angle": 180.0, "translate": false, "translate_min": -5, "translate_max": 5, "scale": false, "scale_min": 0.9, "scale_max": 1.1, "uniform_noise": false, "uniform_noise_min_val": 0.0, "uniform_noise_max_val": 10.0, "gaussian_noise": false, "gaussian_noise_mean": 5.0, "gaussian_noise_stddev": 1.0, "contrast": false, "contrast_min_gamma": 0.5, "contrast_max_gamma": 2.0, "brightness": false, "brightness_min_val": 0.0, "brightness_max_val": 10.0, "random_crop": false, "random_crop_height": 256, "random_crop_width": 256, "random_flip": false, "flip_horizontal": true }, "online_shuffling": true, "shuffle_buffer_size": 128, "prefetch": true, "batch_size": 4, "batches_per_epoch": null, "min_batches_per_epoch": 200, "val_batches_per_epoch": null, "min_val_batches_per_epoch": 10, "epochs": 200, "optimizer": "adam", "initial_learning_rate": 0.0001, "learning_rate_schedule": { "reduce_on_plateau": true, "reduction_factor": 0.5, "plateau_min_delta": 1e-06, "plateau_patience": 5, "plateau_cooldown": 3, "min_learning_rate": 1e-08 }, "hard_keypoint_mining": { "online_mining": false, "hard_to_easy_ratio": 2.0, "min_hard_keypoints": 2, "max_hard_keypoints": null, "loss_scale": 5.0 }, "early_stopping": { "stop_training_on_plateau": true, "plateau_min_delta": 1e-06, "plateau_patience": 10 } }, "outputs": { "save_outputs": true, "run_name": "2BMv2_ImplantBranch.centroid", "run_name_prefix": "", "run_name_suffix": null, "runs_folder": "models", "tags": [], "save_visualizations": true, "delete_viz_images": true, "zip_outputs": false, "log_to_csv": true, "checkpointing": { "initial_model": false, "best_model": true, "every_epoch": false, 
"latest_model": false, "final_model": false }, "tensorboard": { "write_logs": false, "loss_frequency": "epoch", "architecture_graph": false, "profile_graph": false, "visualizations": true }, "zmq": { "subscribe_to_controller": false, "controller_address": "tcp://127.0.0.1:9000", "controller_polling_timeout": 10, "publish_updates": false, "publish_address": "tcp://127.0.0.1:9001" } }, "name": "", "description": "", "sleap_version": "1.1.1", "filename": "/usr/local/lib/python3.7/dist-packages/sleap/training_profiles/baseline.centroid.json" } INFO:sleap.nn.training: INFO:sleap.nn.training:System: 2021-03-22 19:50:41.953850: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcuda.so.1 2021-03-22 19:50:41.979840: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-22 19:50:41.980522: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1716] Found device 0 with properties: pciBusID: 0000:00:04.0 name: Tesla V100-SXM2-16GB computeCapability: 7.0 coreClock: 1.53GHz coreCount: 80 deviceMemorySize: 15.78GiB deviceMemoryBandwidth: 836.37GiB/s 2021-03-22 19:50:41.980596: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1 2021-03-22 19:50:41.982335: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcublas.so.10 2021-03-22 19:50:41.984356: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcufft.so.10 2021-03-22 19:50:41.984717: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcurand.so.10 2021-03-22 19:50:41.986916: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcusolver.so.10 2021-03-22 19:50:41.987892: I 
tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcusparse.so.10 2021-03-22 19:50:41.991685: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudnn.so.7 2021-03-22 19:50:41.991873: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-22 19:50:41.992621: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-22 19:50:41.993266: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1858] Adding visible gpu devices: 0 GPUs: 1/1 available Device: /physical_device:GPU:0 Available: True Initalized: False Memory growth: True INFO:sleap.nn.training: INFO:sleap.nn.training:Initializing trainer... INFO:sleap.nn.training:Loading training labels from: 2BMv2_ImplantBranch.pkg.slp INFO:sleap.nn.training:Creating training and validation splits from validation fraction: 0.1 INFO:sleap.nn.training: Splits: Training = 581 / Validation = 64. INFO:sleap.nn.training:Setting up for training... INFO:sleap.nn.training:Setting up pipeline builders... INFO:sleap.nn.training:Setting up model... INFO:sleap.nn.training:Building test pipeline... 2021-03-22 19:50:43.125614: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN)to use the following CPU instructions in performance-critical operations: AVX2 AVX512F FMA To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. 
2021-03-22 19:50:43.130396: I tensorflow/core/platform/profile_utils/cpu_utils.cc:104] CPU Frequency: 2000160000 Hz 2021-03-22 19:50:43.130617: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x558c0eb1afc0 initialized for platform Host (this does not guarantee that XLA will be used). Devices: 2021-03-22 19:50:43.130653: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Host, Default Version 2021-03-22 19:50:43.225023: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-22 19:50:43.226184: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x558c0eb1bdc0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices: 2021-03-22 19:50:43.226222: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Tesla V100-SXM2-16GB, Compute Capability 7.0 2021-03-22 19:50:43.226449: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-22 19:50:43.227012: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1716] Found device 0 with properties: pciBusID: 0000:00:04.0 name: Tesla V100-SXM2-16GB computeCapability: 7.0 coreClock: 1.53GHz coreCount: 80 deviceMemorySize: 15.78GiB deviceMemoryBandwidth: 836.37GiB/s 2021-03-22 19:50:43.227119: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1 2021-03-22 19:50:43.227169: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcublas.so.10 2021-03-22 19:50:43.227205: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcufft.so.10 2021-03-22 19:50:43.227236: I 
tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcurand.so.10 2021-03-22 19:50:43.227264: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcusolver.so.10 2021-03-22 19:50:43.227294: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcusparse.so.10 2021-03-22 19:50:43.227325: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudnn.so.7 2021-03-22 19:50:43.227422: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-22 19:50:43.228041: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-22 19:50:43.228578: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1858] Adding visible gpu devices: 0 2021-03-22 19:50:43.228655: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1 2021-03-22 19:50:43.722938: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1257] Device interconnect StreamExecutor with strength 1 edge matrix: 2021-03-22 19:50:43.722997: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1263] 0 2021-03-22 19:50:43.723010: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1276] 0: N 2021-03-22 19:50:43.723254: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-22 19:50:43.723935: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning 
NUMA node zero 2021-03-22 19:50:43.724525: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1402] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 14764 MB memory) -> physical GPU (device: 0, name: Tesla V100-SXM2-16GB, pci bus id: 0000:00:04.0, compute capability: 7.0) INFO:sleap.nn.training:Loaded test example. [2.173s] INFO:sleap.nn.training: Input shape: (256, 336, 3) INFO:sleap.nn.training:Created Keras model. INFO:sleap.nn.training: Backbone: UNet(stacks=1, filters=16, filters_rate=2.0, kernel_size=3, stem_kernel_size=7, convs_per_block=2, stem_blocks=0, down_blocks=4, middle_block=True, up_blocks=3, up_interpolate=True, block_contraction=False) INFO:sleap.nn.training: Max stride: 16 INFO:sleap.nn.training: Parameters: 1,953,393 INFO:sleap.nn.training: Heads: INFO:sleap.nn.training: [0] = CentroidConfmapsHead(anchor_part=None, sigma=5.0, output_stride=2, loss_weight=1.0) INFO:sleap.nn.training: Outputs: INFO:sleap.nn.training: [0] = Tensor("CentroidConfmapsHead_0/BiasAdd:0", shape=(None, 128, 168, 1), dtype=float32) INFO:sleap.nn.training:Setting up data pipelines... INFO:sleap.nn.training:Training set: n = 581 INFO:sleap.nn.training:Validation set: n = 64 INFO:sleap.nn.training:Setting up optimization... INFO:sleap.nn.training: Learning rate schedule: LearningRateScheduleConfig(reduce_on_plateau=True, reduction_factor=0.5, plateau_min_delta=1e-06, plateau_patience=5, plateau_cooldown=3, min_learning_rate=1e-08) INFO:sleap.nn.training: Early stopping: EarlyStoppingConfig(stop_training_on_plateau=True, plateau_min_delta=1e-06, plateau_patience=10) INFO:sleap.nn.training:Setting up outputs... INFO:sleap.nn.training:Created run path: models/2BMv2_ImplantBranch.centroid INFO:sleap.nn.training:Setting up visualization... WARNING:tensorflow:Model was constructed with shape (None, 256, 336, 3) for input Tensor("input:0", shape=(None, 256, 336, 3), dtype=float32), but it was called on an input with incompatible shape (None, 492, 656, 3). 
Traceback (most recent call last): File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/framework/ops.py", line 1812, in _create_c_op c_op = pywrap_tf_session.TF_FinishOperation(op_desc) tensorflow.python.framework.errors_impl.InvalidArgumentError: Dimension 1 in both shapes must be equal, but are 123 and 124. Shapes are [?,123,164] and [?,124,164]. for '{{node functional_1/stack0_dec1_s8_to_s4_skip_concat/concat}} = ConcatV2[N=2, T=DT_FLOAT, Tidx=DT_INT32](functional_1/stack0_enc2_act1_relu/Relu, functional_1/stack0_dec1_s8_to_s4_interp_bilinear/resize/ResizeBilinear, functional_1/stack0_dec1_s8_to_s4_skip_concat/concat/axis)' with input shapes: [?,123,164,64], [?,124,164,128], [] and with computed input tensors: input[2] = <3>.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/local/bin/sleap-train", line 8, in
Hey @Xiaoyu-Tong,
You're really finding all the bugs! We fixed that latest issue with the centroid, and it'll be in the next minor release (I'll update you here).
Still working on the bottom-up problem.
Thanks for the reports!
Sounds great. Thanks!
Hi @Xiaoyu-Tong,
You should now be able to train your top-down centroid models correctly in SLEAP v1.1.2. Feel free to re-open if this is not the case!
Do you mind opening another issue if you're still having problems with bottom-up with the refined skeleton in v1.1.2?
Thanks!!
Talmo
Hi,
I successfully trained a model with SLEAP, and I really like the interactive GUI. Thank you for developing it!
However, when I tried to train a new model with both the annotated frames I used for the first model and some corrected frames from the first model's predictions, I got the error shown below.
Specifically, this is what I did:
It may be worth mentioning that the videos I am using are grayscale videos with only one channel, although the first model trained without any problems.
The output message (error) I got: 2021-03-17 18:30:04.104526: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1 INFO:sleap.nn.training:Versions: SLEAP: 1.1.0 TensorFlow: 2.3.1 Numpy: 1.18.5 Python: 3.7.10 OS: Linux-4.19.112+-x86_64-with-Ubuntu-18.04-bionic INFO:sleap.nn.training:Training labels file: 2BMv2.pkg.slp INFO:sleap.nn.training:Training profile: /usr/local/lib/python3.7/dist-packages/sleap/training_profiles/baseline_medium_rf.bottomup.json INFO:sleap.nn.training: INFO:sleap.nn.training:Arguments: INFO:sleap.nn.training:{ "training_job_path": "baseline_medium_rf.bottomup.json", "labels_path": "2BMv2.pkg.slp", "video_paths": "", "val_labels": null, "test_labels": null, "tensorboard": false, "save_viz": false, "zmq": false, "run_name": "2BMv2.bottomup", "prefix": "", "suffix": "" } INFO:sleap.nn.training: INFO:sleap.nn.training:Training job: INFO:sleap.nn.training:{ "data": { "labels": { "training_labels": null, "validation_labels": null, "validation_fraction": 0.1, "test_labels": null, "split_by_inds": false, "training_inds": null, "validation_inds": null, "test_inds": null, "search_path_hints": [], "skeletons": [] }, "preprocessing": { "ensure_rgb": false, "ensure_grayscale": false, "imagenet_mode": null, "input_scaling": 1.0, "pad_to_stride": null, "resize_and_pad_to_target": true, "target_height": null, "target_width": null }, "instance_cropping": { "center_on_part": null, "crop_size": null, "crop_size_detection_padding": 16 } }, "model": { "backbone": { "leap": null, "unet": { "stem_stride": null, "max_stride": 32, "output_stride": 4, "filters": 16, "filters_rate": 2.0, "middle_block": true, "up_interpolate": true, "stacks": 1 }, "hourglass": null, "resnet": null, "pretrained_encoder": null }, "heads": { "single_instance": null, "centroid": null, "centered_instance": null, "multi_instance": { "confmaps": { "part_names": null, "sigma": 2.5, "output_stride": 4, "loss_weight": 1.0, 
"offset_refinement": false }, "pafs": { "edges": null, "sigma": 75.0, "output_stride": 8, "loss_weight": 1.0 } } } }, "optimization": { "preload_data": true, "augmentation_config": { "rotate": true, "rotation_min_angle": -180.0, "rotation_max_angle": 180.0, "translate": false, "translate_min": -5, "translate_max": 5, "scale": false, "scale_min": 0.9, "scale_max": 1.1, "uniform_noise": false, "uniform_noise_min_val": 0.0, "uniform_noise_max_val": 10.0, "gaussian_noise": false, "gaussian_noise_mean": 5.0, "gaussian_noise_stddev": 1.0, "contrast": false, "contrast_min_gamma": 0.5, "contrast_max_gamma": 2.0, "brightness": false, "brightness_min_val": 0.0, "brightness_max_val": 10.0, "random_crop": false, "random_crop_height": 256, "random_crop_width": 256, "random_flip": false, "flip_horizontal": true }, "online_shuffling": true, "shuffle_buffer_size": 128, "prefetch": true, "batch_size": 4, "batches_per_epoch": null, "min_batches_per_epoch": 200, "val_batches_per_epoch": null, "min_val_batches_per_epoch": 10, "epochs": 200, "optimizer": "adam", "initial_learning_rate": 0.0001, "learning_rate_schedule": { "reduce_on_plateau": true, "reduction_factor": 0.5, "plateau_min_delta": 1e-06, "plateau_patience": 8, "plateau_cooldown": 3, "min_learning_rate": 1e-08 }, "hard_keypoint_mining": { "online_mining": false, "hard_to_easy_ratio": 2.0, "min_hard_keypoints": 2, "max_hard_keypoints": null, "loss_scale": 5.0 }, "early_stopping": { "stop_training_on_plateau": true, "plateau_min_delta": 1e-06, "plateau_patience": 10 } }, "outputs": { "save_outputs": true, "run_name": "2BMv2.bottomup", "run_name_prefix": "", "run_name_suffix": null, "runs_folder": "models", "tags": [], "save_visualizations": true, "delete_viz_images": true, "zip_outputs": false, "log_to_csv": true, "checkpointing": { "initial_model": false, "best_model": true, "every_epoch": false, "latest_model": false, "final_model": false }, "tensorboard": { "write_logs": false, "loss_frequency": "epoch", 
"architecture_graph": false, "profile_graph": false, "visualizations": true }, "zmq": { "subscribe_to_controller": false, "controller_address": "tcp://127.0.0.1:9000", "controller_polling_timeout": 10, "publish_updates": false, "publish_address": "tcp://127.0.0.1:9001" } }, "name": "", "description": "", "sleap_version": "1.1.0", "filename": "/usr/local/lib/python3.7/dist-packages/sleap/training_profiles/baseline_medium_rf.bottomup.json" } INFO:sleap.nn.training: INFO:sleap.nn.training:System: 2021-03-17 18:30:05.743608: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcuda.so.1 2021-03-17 18:30:05.768257: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-17 18:30:05.768897: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1716] Found device 0 with properties: pciBusID: 0000:00:04.0 name: Tesla V100-SXM2-16GB computeCapability: 7.0 coreClock: 1.53GHz coreCount: 80 deviceMemorySize: 15.78GiB deviceMemoryBandwidth: 836.37GiB/s 2021-03-17 18:30:05.768964: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1 2021-03-17 18:30:05.952332: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcublas.so.10 2021-03-17 18:30:05.957281: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcufft.so.10 2021-03-17 18:30:05.965076: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcurand.so.10 2021-03-17 18:30:05.981527: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcusolver.so.10 2021-03-17 18:30:06.001245: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library 
libcusparse.so.10 2021-03-17 18:30:06.371466: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudnn.so.7 2021-03-17 18:30:06.371733: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-17 18:30:06.372497: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-17 18:30:06.373034: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1858] Adding visible gpu devices: 0 GPUs: 1/1 available Device: /physical_device:GPU:0 Available: True Initalized: False Memory growth: True INFO:sleap.nn.training: INFO:sleap.nn.training:Initializing trainer... INFO:sleap.nn.training:Loading training labels from: 2BMv2.pkg.slp INFO:sleap.nn.training:Creating training and validation splits from validation fraction: 0.1 INFO:sleap.nn.training: Splits: Training = 581 / Validation = 64. INFO:sleap.nn.training:Setting up for training... INFO:sleap.nn.training:Setting up pipeline builders... INFO:sleap.nn.training:Setting up model... INFO:sleap.nn.training:Building test pipeline... 2021-03-17 18:30:09.964350: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN)to use the following CPU instructions in performance-critical operations: AVX2 AVX512F FMA To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. 2021-03-17 18:30:09.973359: I tensorflow/core/platform/profile_utils/cpu_utils.cc:104] CPU Frequency: 2000134999 Hz 2021-03-17 18:30:09.973646: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x55e08cb94fc0 initialized for platform Host (this does not guarantee that XLA will be used). 
Devices: 2021-03-17 18:30:09.973678: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Host, Default Version 2021-03-17 18:30:10.097990: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-17 18:30:10.099198: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x55e08cb95dc0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices: 2021-03-17 18:30:10.099427: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Tesla V100-SXM2-16GB, Compute Capability 7.0 2021-03-17 18:30:10.099756: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-17 18:30:10.100637: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1716] Found device 0 with properties: pciBusID: 0000:00:04.0 name: Tesla V100-SXM2-16GB computeCapability: 7.0 coreClock: 1.53GHz coreCount: 80 deviceMemorySize: 15.78GiB deviceMemoryBandwidth: 836.37GiB/s 2021-03-17 18:30:10.100762: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1 2021-03-17 18:30:10.100870: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcublas.so.10 2021-03-17 18:30:10.100919: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcufft.so.10 2021-03-17 18:30:10.100960: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcurand.so.10 2021-03-17 18:30:10.100993: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcusolver.so.10 2021-03-17 18:30:10.101024: I 
tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcusparse.so.10 2021-03-17 18:30:10.101056: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudnn.so.7 2021-03-17 18:30:10.101167: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-17 18:30:10.102116: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-17 18:30:10.102969: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1858] Adding visible gpu devices: 0 2021-03-17 18:30:10.103090: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1 2021-03-17 18:30:10.723669: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1257] Device interconnect StreamExecutor with strength 1 edge matrix: 2021-03-17 18:30:10.723736: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1263] 0 2021-03-17 18:30:10.723748: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1276] 0: N 2021-03-17 18:30:10.724007: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-17 18:30:10.724747: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-03-17 18:30:10.725314: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1402] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 14764 MB memory) -> physical GPU (device: 0, name: Tesla V100-SXM2-16GB, pci bus id: 0000:00:04.0, compute 
capability: 7.0) INFO:sleap.nn.training:Loaded test example. [3.011s] INFO:sleap.nn.training: Input shape: (512, 672, 3) INFO:sleap.nn.training:Created Keras model. INFO:sleap.nn.training: Backbone: UNet(stacks=1, filters=16, filters_rate=2.0, kernel_size=3, stem_kernel_size=7, convs_per_block=2, stem_blocks=0, down_blocks=5, middle_block=True, up_blocks=3, up_interpolate=True, block_contraction=False) INFO:sleap.nn.training: Max stride: 32 INFO:sleap.nn.training: Parameters: 7,820,663 INFO:sleap.nn.training: Heads: INFO:sleap.nn.training: [0] = MultiInstanceConfmapsHead(part_names=['Ear_left', 'Ear_right', 'Nose', 'Head', 'Neck', 'Center', 'Lateral_left', 'Lateral_right', 'Tail_base'], sigma=2.5, output_stride=4, loss_weight=1.0) INFO:sleap.nn.training: [1] = PartAffinityFieldsHead(edges=[('Ear_left', 'Ear_right'), ('Ear_left', 'Nose'), ('Ear_left', 'Head'), ('Ear_right', 'Nose'), ('Ear_right', 'Head'), ('Nose', 'Head'), ('Head', 'Neck'), ('Neck', 'Center'), ('Neck', 'Lateral_left'), ('Neck', 'Lateral_right'), ('Center', 'Lateral_left'), ('Center', 'Lateral_right'), ('Center', 'Tail_base'), ('Lateral_left', 'Tail_base'), ('Lateral_right', 'Tail_base')], sigma=75.0, output_stride=8, loss_weight=1.0) INFO:sleap.nn.training: Outputs: INFO:sleap.nn.training: [0] = Tensor("MultiInstanceConfmapsHead_0/BiasAdd:0", shape=(None, 128, 168, 9), dtype=float32) INFO:sleap.nn.training: [1] = Tensor("PartAffinityFieldsHead_0/BiasAdd:0", shape=(None, 64, 84, 30), dtype=float32) INFO:sleap.nn.training:Setting up data pipelines... INFO:sleap.nn.training:Training set: n = 581 INFO:sleap.nn.training:Validation set: n = 64 INFO:sleap.nn.training:Setting up optimization... 
INFO:sleap.nn.training: Learning rate schedule: LearningRateScheduleConfig(reduce_on_plateau=True, reduction_factor=0.5, plateau_min_delta=1e-06, plateau_patience=8, plateau_cooldown=3, min_learning_rate=1e-08) INFO:sleap.nn.training: Early stopping: EarlyStoppingConfig(stop_training_on_plateau=True, plateau_min_delta=1e-06, plateau_patience=10) INFO:sleap.nn.training:Setting up outputs... INFO:sleap.nn.training:Created run path: models/2BMv2.bottomup_1 INFO:sleap.nn.training:Setting up visualization... Unable to use Qt backend for matplotlib. This probably means Qt is running headless. INFO:sleap.nn.training:Finished trainer set up. [6.7s] INFO:sleap.nn.training:Creating tf.data.Datasets for training data generation... Traceback (most recent call last): File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/eager/context.py", line 2102, in execution_mode yield File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/data/ops/iterator_ops.py", line 758, in _next_internal output_shapes=self._flat_output_shapes) File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/gen_dataset_ops.py", line 2610, in iterator_get_next _ops.raise_from_not_ok_status(e, name) File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/framework/ops.py", line 6843, in raise_from_not_ok_status six.raise_from(core._status_to_exception(e.code, message), None) File "", line 3, in raise_from
tensorflow.python.framework.errors_impl.InvalidArgumentError: Shape of tensor EagerPyFunc [492,656,1] is not compatible with expected shape [492,656,3].
[[{{node EnsureShape}}]] [Op:IteratorGetNext]
During handling of the above exception, another exception occurred:
Traceback (most recent call last): File "/usr/local/bin/sleap-train", line 8, in <module>
sys.exit(main())
File "/usr/local/lib/python3.7/dist-packages/sleap/nn/training.py", line 1567, in main
trainer.train()
File "/usr/local/lib/python3.7/dist-packages/sleap/nn/training.py", line 879, in train
training_ds = self.training_pipeline.make_dataset()
File "/usr/local/lib/python3.7/dist-packages/sleap/nn/data/pipelines.py", line 282, in make_dataset
ds = transformer.transform_dataset(ds)
File "/usr/local/lib/python3.7/dist-packages/sleap/nn/data/dataset_ops.py", line 318, in transform_dataset
self.examples = list(iter(ds))
File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/data/ops/iterator_ops.py", line 736, in __next__
return self.next()
File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/data/ops/iterator_ops.py", line 772, in next
return self._next_internal()
File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/data/ops/iterator_ops.py", line 764, in _next_internal
return structure.from_compatible_tensor_list(self._element_spec, ret)
File "/usr/lib/python3.7/contextlib.py", line 130, in __exit__
self.gen.throw(type, value, traceback)
File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/eager/context.py", line 2105, in execution_mode
executor_new.wait()
File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/eager/executor.py", line 67, in wait
pywrap_tfe.TFE_ExecutorWaitForAllPendingNodes(self._handle)
tensorflow.python.framework.errors_impl.InvalidArgumentError: Shape of tensor EagerPyFunc [492,656,1] is not compatible with expected shape [492,656,3].
[[{{node EnsureShape}}]]
2021-03-17 18:30:17.431635: W tensorflow/core/kernels/data/cache_dataset_ops.cc:798] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.

Can you please help me work around this? Preferably in a way that lets me keep using the current training package, or at least the current labels. I have done a lot of work correcting predicted frames, and re-correcting them would be a little bit too painful... Thank you in advance!
Xiaoyu