Closed SamMinkowicz closed 10 months ago
Hi, we have now also encountered the same error when using the default 3 stacks in the Hourglass backbone.
Any ideas?
For future reference: this is due to some bugs in model construction for some legacy architectures. We will likely be reimplementing this once we finish our new neural network backend, which should address this and other issues.
Hi, I'm attempting to train an hourglass model and am getting the error below whenever I use more than one stack. Is there a workaround?
Thanks for your help! Sam
Training output
INFO:sleap.nn.training:Versions: SLEAP: 1.2.2 TensorFlow: 2.6.3 Numpy: 1.19.5 Python: 3.7.12 OS: Linux-5.4.0-96-generic-x86_64-with-debian-bullseye-sid INFO:sleap.nn.training:Training labels file: min_keypoints.pkg.slp INFO:sleap.nn.training:Training profile: hourglass.json INFO:sleap.nn.training: INFO:sleap.nn.training:Arguments: INFO:sleap.nn.training:{ "training_job_path": "hourglass.json", "labels_path": "min_keypoints.pkg.slp", "video_paths": [ "" ], "val_labels": null, "test_labels": null, "tensorboard": false, "save_viz": false, "zmq": false, "run_name": "", "prefix": "", "suffix": "", "cpu": false, "first_gpu": false, "last_gpu": false, "gpu": 0 } INFO:sleap.nn.training: INFO:sleap.nn.training:Training job: INFO:sleap.nn.training:{ "data": { "labels": { "training_labels": null, "validation_labels": null, "validation_fraction": 0.1, "test_labels": null, "split_by_inds": false, "training_inds": null, "validation_inds": null, "test_inds": null, "search_path_hints": [], "skeletons": [] }, "preprocessing": { "ensure_rgb": false, "ensure_grayscale": true, "imagenet_mode": null, "input_scaling": 1.0, "pad_to_stride": null, "resize_and_pad_to_target": true, "target_height": null, "target_width": null }, "instance_cropping": { "center_on_part": null, "crop_size": null, "crop_size_detection_padding": 16 } }, "model": { "backbone": { "leap": null, "unet": null, "hourglass": { "stem_stride": 4, "max_stride": 32, "output_stride": 2, "stem_filters": 128, "filters": 256, "filter_increase": 128, "stacks": 2 }, "resnet": null, "pretrained_encoder": null }, "heads": { "single_instance": null, "centroid": null, "centered_instance": { "anchor_part": null, "part_names": null, "sigma": 2.5, "output_stride": 4, "loss_weight": 1.0, "offset_refinement": false }, "multi_instance": null, "multi_class_bottomup": null, "multi_class_topdown": null } }, "optimization": { "preload_data": true, "augmentation_config": { "rotate": true, "rotation_min_angle": -2.5, "rotation_max_angle": 2.5, 
"translate": false, "translate_min": -5, "translate_max": 5, "scale": true, "scale_min": 0.9, "scale_max": 1.1, "uniform_noise": false, "uniform_noise_min_val": 0.0, "uniform_noise_max_val": 10.0, "gaussian_noise": false, "gaussian_noise_mean": 5.0, "gaussian_noise_stddev": 1.0, "contrast": false, "contrast_min_gamma": 0.5, "contrast_max_gamma": 2.0, "brightness": false, "brightness_min_val": 0.0, "brightness_max_val": 10.0, "random_crop": false, "random_crop_height": 256, "random_crop_width": 256, "random_flip": false, "flip_horizontal": true }, "online_shuffling": true, "shuffle_buffer_size": 128, "prefetch": true, "batch_size": 6, "batches_per_epoch": null, "min_batches_per_epoch": 200, "val_batches_per_epoch": null, "min_val_batches_per_epoch": 10, "epochs": 3000, "optimizer": "adam", "initial_learning_rate": 0.0001, "learning_rate_schedule": { "reduce_on_plateau": true, "reduction_factor": 0.5, "plateau_min_delta": 1e-06, "plateau_patience": 5, "plateau_cooldown": 3, "min_learning_rate": 1e-08 }, "hard_keypoint_mining": { "online_mining": false, "hard_to_easy_ratio": 2.0, "min_hard_keypoints": 2, "max_hard_keypoints": null, "loss_scale": 5.0 }, "early_stopping": { "stop_training_on_plateau": true, "plateau_min_delta": 1e-08, "plateau_patience": 10 } }, "outputs": { "save_outputs": true, "run_name": null, "run_name_prefix": "min_kps_hourglass", "run_name_suffix": ".centered_instance", "runs_folder": "models", "tags": [ "" ], "save_visualizations": true, "delete_viz_images": true, "zip_outputs": false, "log_to_csv": true, "checkpointing": { "initial_model": false, "best_model": true, "every_epoch": false, "latest_model": false, "final_model": true }, "tensorboard": { "write_logs": false, "loss_frequency": "epoch", "architecture_graph": false, "profile_graph": false, "visualizations": true }, "zmq": { "subscribe_to_controller": false, "controller_address": "tcp://127.0.0.1:9000", "controller_polling_timeout": 10, "publish_updates": false, "publish_address": 
"tcp://127.0.0.1:9001" } }, "name": "", "description": "", "sleap_version": "1.2.2", "filename": "hourglass.json" } INFO:sleap.nn.training: INFO:sleap.nn.training:Using GPU 0 for acceleration. INFO:sleap.nn.training:Disabled GPU memory pre-allocation. INFO:sleap.nn.training:System: GPUs: 1/2 available Device: /physical_device:GPU:0 Available: True Initalized: False Memory growth: True Device: /physical_device:GPU:1 Available: False Initalized: False Memory growth: None INFO:sleap.nn.training: INFO:sleap.nn.training:Initializing trainer... INFO:sleap.nn.training:Loading training labels from: min_keypoints.pkg.slp INFO:sleap.nn.training:Creating training and validation splits from validation fraction: 0.1 INFO:sleap.nn.training: Splits: Training = 1462 / Validation = 162. INFO:sleap.nn.training:Setting up for training... INFO:sleap.nn.training:Setting up pipeline builders... INFO:sleap.nn.training:Setting up model... INFO:sleap.nn.training:Building test pipeline... 2022-04-27 13:47:31.910299: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 AVX512F FMA To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. 
2022-04-27 13:47:32.363714: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 19623 MB memory: -> device: 0, name: Quadro RTX 6000, pci bus id: 0000:17:00.0, compute capability: 7.5 2022-04-27 13:47:32.735466: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2) 2022-04-27 13:47:34.178944: W tensorflow/core/grappler/costs/op_level_cost_estimator.cc:690] Error in PredictCost() for the op: op: "CropAndResize" attr { key: "T" value { type: DT_FLOAT } } attr { key: "extrapolation_value" value { f: 0 } } attr { key: "method" value { s: "bilinear" } } inputs { dtype: DT_FLOAT shape { dim { size: 1 } dim { size: 608 } dim { size: 1440 } dim { size: 1 } } } inputs { dtype: DT_FLOAT shape { dim { size: -2 } dim { size: 4 } } } inputs { dtype: DT_INT32 shape { dim { size: -2 } } } inputs { dtype: DT_INT32 shape { dim { size: 2 } } } device { type: "CPU" vendor: "GenuineIntel" model: "101" frequency: 3299 num_cores: 28 environment { key: "cpu_instruction_set" value: "AVX SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2" } environment { key: "eigen" value: "3.3.90" } l1_cache_size: 32768 l2_cache_size: 1048576 l3_cache_size: 20185088 memory_size: 268435456 } outputs { dtype: DT_FLOAT shape { dim { size: -2 } dim { size: 288 } dim { size: 288 } dim { size: 1 } } } INFO:sleap.nn.training:Loaded test example. [2.486s] INFO:sleap.nn.training: Input shape: (288, 288, 1) Traceback (most recent call last): File "/home/sam/miniconda3/envs/sleap/bin/sleap-train", line 33, in
sys.exit(load_entry_point('sleap==1.2.2', 'console_scripts', 'sleap-train')())
File "/home/sam/miniconda3/envs/sleap/lib/python3.7/site-packages/sleap/nn/training.py", line 1947, in main
trainer.train()
File "/home/sam/miniconda3/envs/sleap/lib/python3.7/site-packages/sleap/nn/training.py", line 906, in train
self.setup()
File "/home/sam/miniconda3/envs/sleap/lib/python3.7/site-packages/sleap/nn/training.py", line 892, in setup
self._setup_model()
File "/home/sam/miniconda3/envs/sleap/lib/python3.7/site-packages/sleap/nn/training.py", line 732, in _setup_model
self.model.make_model(input_shape)
File "/home/sam/miniconda3/envs/sleap/lib/python3.7/site-packages/sleap/nn/model.py", line 363, in make_model
self.keras_model = tf.keras.Model(inputs=x_in, outputs=x_outs)
File "/home/sam/miniconda3/envs/sleap/lib/python3.7/site-packages/tensorflow/python/training/tracking/base.py", line 530, in _method_wrapper
result = method(self, *args, **kwargs)
File "/home/sam/miniconda3/envs/sleap/lib/python3.7/site-packages/keras/engine/functional.py", line 109, in __init__
self._init_graph_network(inputs, outputs)
File "/home/sam/miniconda3/envs/sleap/lib/python3.7/site-packages/tensorflow/python/training/tracking/base.py", line 530, in _method_wrapper
result = method(self, *args, **kwargs)
File "/home/sam/miniconda3/envs/sleap/lib/python3.7/site-packages/keras/engine/functional.py", line 193, in _init_graph_network
self.inputs, self.outputs)
File "/home/sam/miniconda3/envs/sleap/lib/python3.7/site-packages/keras/engine/functional.py", line 995, in _map_graph_network
str(all_names.count(name)) + ' times in the model. '
ValueError: The name "CenteredInstanceConfmapsHead" is used 2 times in the model. All layer names should be unique.