talmolab / sleap

A deep learning framework for multi-animal pose tracking.
https://sleap.ai

W tensorflow/core/framework/op_kernel.cc:1692] OP_REQUIRES failed at conv_ops.cc:682 #1351

Closed · jverpeut closed this issue 1 year ago

jverpeut commented 1 year ago

This could be an error with my laptop, but I can't seem to get past it. I tried both an older version of SLEAP and the newest version and get the same error. I have an NVIDIA GeForce GTX 1050 GPU, an Intel Core i7-7700HQ CPU @ 2.5 GHz, and 31.86 GB of RAM. I can load the GUI and label frames, but training fails with an error I don't recognize. I have been able to train on this computer previously. Thank you

(base) C:\WINDOWS\system32>conda activate sleap130

(sleap130) C:\WINDOWS\system32>sleap-label
Saving config: C:\Users\jverpeut/.sleap/1.3.0/preferences.yaml
Restoring GUI state...

Software versions:
SLEAP: 1.3.0
TensorFlow: 2.6.3
Numpy: 1.19.5
Python: 3.7.12
OS: Windows-10-10.0.19041-SP0

Happy SLEAPing! :) Resetting monitor window. Polling: C:/Users/jverpeut/Desktop/SLEAP_workshop/Testdataset\models\230614_103343.single_instance.n=31\viz\validation.*.png Start training single_instance... ['sleap-train', 'C:\Users\jverpeut\AppData\Local\Temp\tmpcijlduej\230614_103343_training_job.json', 'C:/Users/jverpeut/Desktop/SLEAP_workshop/Testdataset/labels.v001.slp', '--zmq', '--save_viz'] INFO:sleap.nn.training:Versions: SLEAP: 1.3.0 TensorFlow: 2.6.3 Numpy: 1.19.5 Python: 3.7.12 OS: Windows-10-10.0.19041-SP0 INFO:sleap.nn.training:Training labels file: C:/Users/jverpeut/Desktop/SLEAP_workshop/Testdataset/labels.v001.slp INFO:sleap.nn.training:Training profile: C:\Users\jverpeut\AppData\Local\Temp\tmpcijlduej\230614_103343_training_job.json INFO:sleap.nn.training: INFO:sleap.nn.training:Arguments: INFO:sleap.nn.training:{ "training_job_path": "C:\Users\jverpeut\AppData\Local\Temp\tmpcijlduej\230614_103343_training_job.json", "labels_path": "C:/Users/jverpeut/Desktop/SLEAP_workshop/Testdataset/labels.v001.slp", "video_paths": [ "" ], "val_labels": null, "test_labels": null, "base_checkpoint": null, "tensorboard": false, "save_viz": true, "zmq": true, "run_name": "", "prefix": "", "suffix": "", "cpu": false, "first_gpu": false, "last_gpu": false, "gpu": "auto" } INFO:sleap.nn.training: INFO:sleap.nn.training:Training job: INFO:sleap.nn.training:{ "data": { "labels": { "training_labels": null, "validation_labels": null, "validation_fraction": 0.1, "test_labels": null, "split_by_inds": false, "training_inds": null, "validation_inds": null, "test_inds": null, "search_path_hints": [], "skeletons": [] }, "preprocessing": { "ensure_rgb": false, "ensure_grayscale": false, "imagenet_mode": null, "input_scaling": 1.75, "pad_to_stride": null, "resize_and_pad_to_target": true, "target_height": null, "target_width": null }, "instance_cropping": { "center_on_part": null, "crop_size": null, "crop_size_detection_padding": 16 } }, "model": { "backbone": { "leap": null, "unet": { "stem_stride": null, "max_stride": 64, "output_stride": 2, "filters": 16, "filters_rate": 2.0, "middle_block": true, "up_interpolate": true, "stacks": 1 }, "hourglass": null, "resnet": null, "pretrained_encoder": null }, "heads": { "single_instance": { "part_names": null, "sigma": 2.0, "output_stride": 2, "loss_weight": 1.0, "offset_refinement": false }, "centroid": null, "centered_instance": null, "multi_instance": null, "multi_class_bottomup": null, "multi_class_topdown": null }, "base_checkpoint": null }, "optimization": { "preload_data": true, "augmentation_config": { "rotate": true, "rotation_min_angle": -180.0, "rotation_max_angle": 180.0, "translate": false, "translate_min": -5, "translate_max": 5, "scale": false, "scale_min": 0.9, "scale_max": 1.1, "uniform_noise": false, "uniform_noise_min_val": 0.0, "uniform_noise_max_val": 10.0, "gaussian_noise": false, "gaussian_noise_mean": 5.0, "gaussian_noise_stddev": 1.0, "contrast": false, "contrast_min_gamma": 0.5, "contrast_max_gamma": 2.0, "brightness": false, "brightness_min_val": 0.0, "brightness_max_val": 10.0, "random_crop": false, "random_crop_height": 256, "random_crop_width": 256, "random_flip": true, "flip_horizontal": false }, "online_shuffling": true, "shuffle_buffer_size": 128, "prefetch": true, "batch_size": 4, "batches_per_epoch": null, "min_batches_per_epoch": 200, "val_batches_per_epoch": null, "min_val_batches_per_epoch": 10, "epochs": 200, "optimizer": "adam", "initial_learning_rate": 0.0001, "learning_rate_schedule": { "reduce_on_plateau": true, 
"reduction_factor": 0.5, "plateau_min_delta": 1e-06, "plateau_patience": 5, "plateau_cooldown": 3, "min_learning_rate": 1e-08 }, "hard_keypoint_mining": { "online_mining": false, "hard_to_easy_ratio": 2.0, "min_hard_keypoints": 2, "max_hard_keypoints": null, "loss_scale": 5.0 }, "early_stopping": { "stop_training_on_plateau": true, "plateau_min_delta": 1e-08, "plateau_patience": 10 } }, "outputs": { "save_outputs": true, "run_name": "230614_103343.single_instance.n=31", "run_name_prefix": "", "run_name_suffix": "", "runs_folder": "C:/Users/jverpeut/Desktop/SLEAP_workshop/Testdataset\models", "tags": [ "" ], "save_visualizations": true, "delete_viz_images": true, "zip_outputs": false, "log_to_csv": true, "checkpointing": { "initial_model": false, "best_model": true, "every_epoch": false, "latest_model": false, "final_model": false }, "tensorboard": { "write_logs": false, "loss_frequency": "epoch", "architecture_graph": false, "profile_graph": false, "visualizations": true }, "zmq": { "subscribe_to_controller": true, "controller_address": "tcp://127.0.0.1:9000", "controller_polling_timeout": 10, "publish_updates": true, "publish_address": "tcp://127.0.0.1:9001" } }, "name": "", "description": "", "sleap_version": "1.3.0", "filename": "C:\Users\jverpeut\AppData\Local\Temp\tmpcijlduej\230614_103343_training_job.json" } INFO:sleap.nn.training: INFO:sleap.nn.training:Auto-selected GPU 0 with 3950 MiB of free memory. INFO:sleap.nn.training:Using GPU 0 for acceleration. INFO:sleap.nn.training:Disabled GPU memory pre-allocation. INFO:sleap.nn.training:System: GPUs: 1/1 available Device: /physical_device:GPU:0 Available: True Initalized: False Memory growth: True INFO:sleap.nn.training: INFO:sleap.nn.training:Initializing trainer... INFO:sleap.nn.training:Loading training labels from: C:/Users/jverpeut/Desktop/SLEAP_workshop/Testdataset/labels.v001.slp INFO:sleap.nn.training:Creating training and validation splits from validation fraction: 0.1 INFO:sleap.nn.training: Splits: Training = 28 / Validation = 3. INFO:sleap.nn.training:Setting up for training... INFO:sleap.nn.training:Setting up pipeline builders... INFO:sleap.nn.training:Setting up model... INFO:sleap.nn.training:Building test pipeline... 2023-06-14 10:34:03.475499: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX AVX2 To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. 2023-06-14 10:34:04.484384: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 2777 MB memory: -> device: 0, name: NVIDIA GeForce GTX 1050, pci bus id: 0000:01:00.0, compute capability: 6.1 2023-06-14 10:34:06.057899: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2) INFO:sleap.nn.training:Loaded test example. [6.012s] INFO:sleap.nn.training: Input shape: (1920, 2560, 1) INFO:sleap.nn.training:Created Keras model. 
INFO:sleap.nn.training:  Backbone: UNet(stacks=1, filters=16, filters_rate=2.0, kernel_size=3, stem_kernel_size=7, convs_per_block=2, stem_blocks=0, down_blocks=6, middle_block=True, up_blocks=5, up_interpolate=True, block_contraction=False)
INFO:sleap.nn.training:  Max stride: 64
INFO:sleap.nn.training:  Parameters: 31,449,243
INFO:sleap.nn.training:  Heads:
INFO:sleap.nn.training:    [0] = SingleInstanceConfmapsHead(part_names=['nose1', 'neck1', 'earL1', 'earR1', 'forelegL1', 'forelegR1', 'tailstart1', 'hindlegL1', 'hindlegR1', 'tail1', 'tailend1'], sigma=2.0, output_stride=2, loss_weight=1.0)
INFO:sleap.nn.training:  Outputs:
INFO:sleap.nn.training:    [0] = KerasTensor(type_spec=TensorSpec(shape=(None, 960, 1280, 11), dtype=tf.float32, name=None), name='SingleInstanceConfmapsHead/BiasAdd:0', description="created by layer 'SingleInstanceConfmapsHead'")
INFO:sleap.nn.training:Training from scratch
INFO:sleap.nn.training:Setting up data pipelines...
INFO:sleap.nn.training:Training set: n = 28
INFO:sleap.nn.training:Validation set: n = 3
INFO:sleap.nn.training:Setting up optimization...
INFO:sleap.nn.training:  Learning rate schedule: LearningRateScheduleConfig(reduce_on_plateau=True, reduction_factor=0.5, plateau_min_delta=1e-06, plateau_patience=5, plateau_cooldown=3, min_learning_rate=1e-08)
INFO:sleap.nn.training:  Early stopping: EarlyStoppingConfig(stop_training_on_plateau=True, plateau_min_delta=1e-08, plateau_patience=10)
INFO:sleap.nn.training:Setting up outputs...
INFO:sleap.nn.callbacks:Training controller subscribed to: tcp://127.0.0.1:9000 (topic: )
INFO:sleap.nn.training:  ZMQ controller subcribed to: tcp://127.0.0.1:9000
INFO:sleap.nn.callbacks:Progress reporter publishing on: tcp://127.0.0.1:9001 for: not_set
INFO:sleap.nn.training:  ZMQ progress reporter publish on: tcp://127.0.0.1:9001
INFO:sleap.nn.training:Created run path: C:/Users/jverpeut/Desktop/SLEAP_workshop/Testdataset\models\230614_103343.single_instance.n=31
INFO:sleap.nn.training:Setting up visualization...
INFO:sleap.nn.training:Finished trainer set up. [8.5s]
INFO:sleap.nn.training:Creating tf.data.Datasets for training data generation...
INFO:sleap.nn.training:Finished creating training datasets. [12.5s]
INFO:sleap.nn.training:Starting training loop...
Epoch 1/200
2023-06-14 10:34:42.162874: W tensorflow/core/common_runtime/bfc_allocator.cc:457] Allocator (GPU_0_bfc) ran out of memory trying to allocate 1.17GiB (rounded to 1258291200)requested by op model/stack0_enc0_conv0/Conv2D If the cause is memory fragmentation maybe the environment variable 'TF_GPU_ALLOCATOR=cuda_malloc_async' will improve the situation. Current allocation summary follows.
2023-06-14 10:34:42.163374: I tensorflow/core/common_runtime/bfc_allocator.cc:1004] BFCAllocator dump for GPU_0_bfc
2023-06-14 10:34:42.170098: I tensorflow/core/common_runtime/bfc_allocator.cc:1011] Bin (256): Total Chunks: 104, Chunks in use: 103. 26.0KiB allocated for chunks. 25.8KiB in use in bin. 7.1KiB client-requested in use in bin.
2023-06-14 10:34:42.170942: I tensorflow/core/common_runtime/bfc_allocator.cc:1011] Bin (512): Total Chunks: 20, Chunks in use: 20. 11.0KiB allocated for chunks. 11.0KiB in use in bin. 10.2KiB client-requested in use in bin.
2023-06-14 10:34:42.171809: I tensorflow/core/common_runtime/bfc_allocator.cc:1011] Bin (1024): Total Chunks: 20, Chunks in use: 20. 21.8KiB allocated for chunks. 21.8KiB in use in bin. 21.1KiB client-requested in use in bin.
2023-06-14 10:34:42.172615: I tensorflow/core/common_runtime/bfc_allocator.cc:1011] Bin (2048): Total Chunks: 19, Chunks in use: 19. 41.5KiB allocated for chunks. 41.5KiB in use in bin. 40.9KiB client-requested in use in bin. 2023-06-14 10:34:42.173464: I tensorflow/core/common_runtime/bfc_allocator.cc:1011] Bin (4096): Total Chunks: 10, Chunks in use: 10. 43.0KiB allocated for chunks. 43.0KiB in use in bin. 42.0KiB client-requested in use in bin. 2023-06-14 10:34:42.174278: I tensorflow/core/common_runtime/bfc_allocator.cc:1011] Bin (8192): Total Chunks: 4, Chunks in use: 4. 40.5KiB allocated for chunks. 40.5KiB in use in bin. 36.0KiB client-requested in use in bin. 2023-06-14 10:34:42.175109: I tensorflow/core/common_runtime/bfc_allocator.cc:1011] Bin (16384): Total Chunks: 4, Chunks in use: 4. 72.0KiB allocated for chunks. 72.0KiB in use in bin. 72.0KiB client-requested in use in bin. 2023-06-14 10:34:42.175913: I tensorflow/core/common_runtime/bfc_allocator.cc:1011] Bin (32768): Total Chunks: 8, Chunks in use: 8. 306.0KiB allocated for chunks. 306.0KiB in use in bin. 288.0KiB client-requested in use in bin. 2023-06-14 10:34:42.176765: I tensorflow/core/common_runtime/bfc_allocator.cc:1011] Bin (65536): Total Chunks: 7, Chunks in use: 7. 648.0KiB allocated for chunks. 648.0KiB in use in bin. 612.0KiB client-requested in use in bin. 2023-06-14 10:34:42.177596: I tensorflow/core/common_runtime/bfc_allocator.cc:1011] Bin (131072): Total Chunks: 8, Chunks in use: 8. 1.16MiB allocated for chunks. 1.16MiB in use in bin. 1.09MiB client-requested in use in bin. 2023-06-14 10:34:42.178561: I tensorflow/core/common_runtime/bfc_allocator.cc:1011] Bin (262144): Total Chunks: 9, Chunks in use: 9. 3.07MiB allocated for chunks. 3.07MiB in use in bin. 2.95MiB client-requested in use in bin. 2023-06-14 10:34:42.193592: I tensorflow/core/common_runtime/bfc_allocator.cc:1011] Bin (524288): Total Chunks: 8, Chunks in use: 8. 4.64MiB allocated for chunks. 4.64MiB in use in bin. 4.50MiB client-requested in use in bin. 2023-06-14 10:34:42.194516: I tensorflow/core/common_runtime/bfc_allocator.cc:1011] Bin (1048576): Total Chunks: 10, Chunks in use: 10. 14.20MiB allocated for chunks. 14.20MiB in use in bin. 14.06MiB client-requested in use in bin. 2023-06-14 10:34:42.195270: I tensorflow/core/common_runtime/bfc_allocator.cc:1011] Bin (2097152): Total Chunks: 10, Chunks in use: 10. 23.81MiB allocated for chunks. 23.81MiB in use in bin. 21.19MiB client-requested in use in bin. 2023-06-14 10:34:42.196057: I tensorflow/core/common_runtime/bfc_allocator.cc:1011] Bin (4194304): Total Chunks: 8, Chunks in use: 8. 48.63MiB allocated for chunks. 48.63MiB in use in bin. 45.00MiB client-requested in use in bin. 2023-06-14 10:34:42.196791: I tensorflow/core/common_runtime/bfc_allocator.cc:1011] Bin (8388608): Total Chunks: 8, Chunks in use: 8. 81.25MiB allocated for chunks. 81.25MiB in use in bin. 72.00MiB client-requested in use in bin. 2023-06-14 10:34:42.197540: I tensorflow/core/common_runtime/bfc_allocator.cc:1011] Bin (16777216): Total Chunks: 7, Chunks in use: 7. 163.00MiB allocated for chunks. 163.00MiB in use in bin. 153.00MiB client-requested in use in bin. 2023-06-14 10:34:42.198285: I tensorflow/core/common_runtime/bfc_allocator.cc:1011] Bin (33554432): Total Chunks: 5, Chunks in use: 5. 190.02MiB allocated for chunks. 190.02MiB in use in bin. 171.00MiB client-requested in use in bin. 
2023-06-14 10:34:42.199021: I tensorflow/core/common_runtime/bfc_allocator.cc:1011] Bin (67108864): Total Chunks: 1, Chunks in use: 1. 75.00MiB allocated for chunks. 75.00MiB in use in bin. 75.00MiB client-requested in use in bin. 2023-06-14 10:34:42.199857: I tensorflow/core/common_runtime/bfc_allocator.cc:1011] Bin (134217728): Total Chunks: 2, Chunks in use: 1. 431.25MiB allocated for chunks. 206.25MiB in use in bin. 206.25MiB client-requested in use in bin. 2023-06-14 10:34:42.200611: I tensorflow/core/common_runtime/bfc_allocator.cc:1011] Bin (268435456): Total Chunks: 2, Chunks in use: 1. 1008.78MiB allocated for chunks. 266.03MiB in use in bin. 206.25MiB client-requested in use in bin. 2023-06-14 10:34:42.201359: I tensorflow/core/common_runtime/bfc_allocator.cc:1027] Bin for 1.17GiB was 256.00MiB, Chunk State: 2023-06-14 10:34:42.202124: I tensorflow/core/common_runtime/bfc_allocator.cc:1033] Size: 742.75MiB | Requested Size: 0B | in_use: 0 | bin_num: 20, prev: Size: 206.25MiB | Requested Size: 206.25MiB | in_use: 1 | bin_num: -1 2023-06-14 10:34:42.202918: I tensorflow/core/common_runtime/bfc_allocator.cc:1040] Next region of size 2097152 2023-06-14 10:34:42.203699: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02400000 of size 256 next 4 2023-06-14 10:34:42.204510: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02400100 of size 256 next 5 2023-06-14 10:34:42.205280: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02400200 of size 256 next 6 2023-06-14 10:34:42.206288: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02400300 of size 256 next 7 2023-06-14 10:34:42.207154: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02400400 of size 256 next 10 2023-06-14 10:34:42.207976: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02400500 of size 256 next 11 2023-06-14 10:34:42.209023: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02400600 of size 256 next 12 2023-06-14 10:34:42.210082: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02400700 of size 256 next 15 2023-06-14 10:34:42.227170: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02400800 of size 256 next 8 2023-06-14 10:34:42.227887: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02400900 of size 768 next 9 2023-06-14 10:34:42.228653: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02400c00 of size 256 next 16 2023-06-14 10:34:42.229468: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02400d00 of size 256 next 19 2023-06-14 10:34:42.230307: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02400e00 of size 256 next 20 2023-06-14 10:34:42.231209: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02400f00 of size 256 next 21 2023-06-14 10:34:42.231928: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02401000 of size 256 next 24 2023-06-14 10:34:42.232772: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02401100 of size 256 next 25 2023-06-14 10:34:42.233573: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02401200 of size 256 next 28 2023-06-14 10:34:42.234374: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02401300 of size 256 next 29 2023-06-14 10:34:42.235203: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02401400 of size 256 next 30 2023-06-14 10:34:42.236396: I 
tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02401500 of size 256 next 31 2023-06-14 10:34:42.237214: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02401600 of size 256 next 34 2023-06-14 10:34:42.238002: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02401700 of size 256 next 35 2023-06-14 10:34:42.238805: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02401800 of size 512 next 38 2023-06-14 10:34:42.239927: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02401a00 of size 256 next 39 2023-06-14 10:34:42.240988: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02401b00 of size 256 next 40 2023-06-14 10:34:42.241779: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02401c00 of size 512 next 43 2023-06-14 10:34:42.242569: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02401e00 of size 256 next 44 2023-06-14 10:34:42.243328: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02401f00 of size 256 next 45 2023-06-14 10:34:42.244102: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02402000 of size 1024 next 46 2023-06-14 10:34:42.244879: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02402400 of size 256 next 49 2023-06-14 10:34:42.245708: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02402500 of size 256 next 50 2023-06-14 10:34:42.246521: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02402600 of size 1024 next 53 2023-06-14 10:34:42.247407: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02402a00 of size 256 next 54 2023-06-14 10:34:42.264558: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02402b00 of size 256 next 55 2023-06-14 10:34:42.265297: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02402c00 of size 2048 next 57 2023-06-14 10:34:42.266113: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02403400 of size 256 next 58 2023-06-14 10:34:42.266872: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02403500 of size 256 next 60 2023-06-14 10:34:42.267661: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02403600 of size 2048 next 64 2023-06-14 10:34:42.268492: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02403e00 of size 256 next 65 2023-06-14 10:34:42.269236: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02403f00 of size 256 next 66 2023-06-14 10:34:42.270048: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02404000 of size 5120 next 14 2023-06-14 10:34:42.270797: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02405400 of size 9216 next 13 2023-06-14 10:34:42.271460: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02407800 of size 256 next 68 2023-06-14 10:34:42.272275: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02407900 of size 256 next 69 2023-06-14 10:34:42.273272: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02407a00 of size 4096 next 74 2023-06-14 10:34:42.274149: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02408a00 of size 2048 next 76 2023-06-14 10:34:42.275240: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02409200 of size 2048 next 77 2023-06-14 10:34:42.276161: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02409a00 of size 1024 next 79 
2023-06-14 10:34:42.276925: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02409e00 of size 1024 next 80 2023-06-14 10:34:42.277760: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b0240a200 of size 512 next 82 2023-06-14 10:34:42.278538: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b0240a400 of size 512 next 84 2023-06-14 10:34:42.292022: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b0240a600 of size 256 next 86 2023-06-14 10:34:42.292945: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b0240a700 of size 256 next 87 2023-06-14 10:34:42.293808: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b0240a800 of size 256 next 89 2023-06-14 10:34:42.294608: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b0240a900 of size 256 next 91 2023-06-14 10:34:42.295362: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b0240aa00 of size 256 next 93 2023-06-14 10:34:42.296070: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b0240ab00 of size 256 next 94 2023-06-14 10:34:42.296789: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b0240ac00 of size 256 next 95 2023-06-14 10:34:42.297568: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b0240ad00 of size 256 next 96 2023-06-14 10:34:42.298404: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b0240ae00 of size 256 next 98 2023-06-14 10:34:42.299234: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b0240af00 of size 256 next 99 2023-06-14 10:34:42.300070: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b0240b000 of size 256 next 100 2023-06-14 10:34:42.300919: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b0240b100 of size 256 next 103 2023-06-14 10:34:42.301744: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b0240b200 of size 256 next 104 2023-06-14 10:34:42.302559: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b0240b300 of size 256 next 105 2023-06-14 10:34:42.303370: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b0240b400 of size 256 next 109 2023-06-14 10:34:42.304191: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b0240b500 of size 256 next 111 2023-06-14 10:34:42.304881: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b0240b600 of size 256 next 112 2023-06-14 10:34:42.305746: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b0240b700 of size 256 next 115 2023-06-14 10:34:42.306623: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b0240b800 of size 256 next 97 2023-06-14 10:34:42.307605: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b0240b900 of size 2048 next 17 2023-06-14 10:34:42.308533: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b0240c100 of size 18432 next 18 2023-06-14 10:34:42.309292: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02410900 of size 18432 next 122 2023-06-14 10:34:42.310135: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02415100 of size 55296 next 23 2023-06-14 10:34:42.326842: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02422900 of size 36864 next 22 2023-06-14 10:34:42.327578: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b0242b900 of size 36864 next 92 2023-06-14 10:34:42.328436: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse 
at b02434900 of size 256 next 116 2023-06-14 10:34:42.329193: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02434a00 of size 256 next 117 2023-06-14 10:34:42.330008: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02434b00 of size 256 next 118 2023-06-14 10:34:42.330777: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02434c00 of size 768 next 119 2023-06-14 10:34:42.331586: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02434f00 of size 256 next 120 2023-06-14 10:34:42.332421: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02435000 of size 256 next 121 2023-06-14 10:34:42.333248: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02435100 of size 256 next 123 2023-06-14 10:34:42.333966: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02435200 of size 256 next 124 2023-06-14 10:34:42.334939: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02435300 of size 256 next 125 2023-06-14 10:34:42.335927: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02435400 of size 256 next 126 2023-06-14 10:34:42.336730: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02435500 of size 512 next 128 2023-06-14 10:34:42.337558: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02435700 of size 512 next 129 2023-06-14 10:34:42.338280: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02435900 of size 1024 next 106 2023-06-14 10:34:42.339223: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02435d00 of size 5120 next 107 2023-06-14 10:34:42.340168: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02437100 of size 3840 next 108 2023-06-14 10:34:42.341052: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02438000 of size 5120 next 113 2023-06-14 10:34:42.341807: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02439400 of size 3840 next 114 2023-06-14 10:34:42.342631: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b0243a300 of size 13824 next 27 2023-06-14 10:34:42.343467: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b0243d900 of size 73728 next 26 2023-06-14 10:34:42.344264: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b0244f900 of size 110592 next 90 2023-06-14 10:34:42.345087: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b0246a900 of size 184320 next 33 2023-06-14 10:34:42.345915: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02497900 of size 147456 next 32 2023-06-14 10:34:42.346751: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b024bb900 of size 147456 next 88 2023-06-14 10:34:42.347658: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b024df900 of size 147456 next 37 2023-06-14 10:34:42.364611: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02503900 of size 294912 next 36 2023-06-14 10:34:42.365318: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b0254b900 of size 739072 next 18446744073709551615 2023-06-14 10:34:42.366165: I tensorflow/core/common_runtime/bfc_allocator.cc:1040] Next region of size 4194304 2023-06-14 10:34:42.366940: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02600000 of size 1280 next 2 2023-06-14 10:34:42.367943: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02600500 of size 256 next 3 2023-06-14 
10:34:42.368952: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02600600 of size 294912 next 127 2023-06-14 10:34:42.370014: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02648600 of size 1024 next 130 2023-06-14 10:34:42.370757: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02648a00 of size 2048 next 132 2023-06-14 10:34:42.371618: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02649200 of size 2048 next 134 2023-06-14 10:34:42.372898: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02649a00 of size 4096 next 136 2023-06-14 10:34:42.374175: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b0264aa00 of size 4096 next 137 2023-06-14 10:34:42.374718: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b0264ba00 of size 2048 next 140 2023-06-14 10:34:42.376726: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b0264c200 of size 2048 next 141 2023-06-14 10:34:42.377473: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b0264ca00 of size 1024 next 143 2023-06-14 10:34:42.378104: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b0264ce00 of size 1024 next 145 2023-06-14 10:34:42.379012: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b0264d200 of size 512 next 146 2023-06-14 10:34:42.392826: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b0264d400 of size 512 next 148 2023-06-14 10:34:42.393609: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b0264d600 of size 256 next 150 2023-06-14 10:34:42.394485: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b0264d700 of size 274176 next 42 2023-06-14 10:34:42.395373: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02690600 of size 589824 next 41 2023-06-14 10:34:42.396146: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02720600 of size 3013120 next 18446744073709551615 2023-06-14 10:34:42.396957: I tensorflow/core/common_runtime/bfc_allocator.cc:1040] Next region of size 8388608 2023-06-14 10:34:42.397796: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02a00000 of size 1179648 next 48 2023-06-14 10:34:42.398624: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02b20000 of size 589824 next 83 2023-06-14 10:34:42.399375: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02bb0000 of size 442368 next 85 2023-06-14 10:34:42.400128: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02c1c000 of size 1327104 next 52 2023-06-14 10:34:42.400973: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02d60000 of size 2359296 next 51 2023-06-14 10:34:42.401804: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b02fa0000 of size 2490368 next 18446744073709551615 2023-06-14 10:34:42.402557: I tensorflow/core/common_runtime/bfc_allocator.cc:1040] Next region of size 16777216 2023-06-14 10:34:42.403385: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b03200000 of size 1769472 next 81 2023-06-14 10:34:42.404140: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b033b0000 of size 1572608 next 101 2023-06-14 10:34:42.404826: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b0352ff00 of size 1572608 next 102 2023-06-14 10:34:42.405724: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b036afe00 of size 1572608 next 110 2023-06-14 10:34:42.406757: I 
tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b0382fd00 of size 2949888 next 59 2023-06-14 10:34:42.407593: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b03b00000 of size 7340032 next 18446744073709551615 2023-06-14 10:34:42.408632: I tensorflow/core/common_runtime/bfc_allocator.cc:1040] Next region of size 33554432 2023-06-14 10:34:42.409452: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b04200000 of size 4718592 next 131 2023-06-14 10:34:42.410288: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b04680000 of size 2359296 next 144 2023-06-14 10:34:42.427835: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b048c0000 of size 2359296 next 63 2023-06-14 10:34:42.428621: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b04b00000 of size 9437184 next 61 2023-06-14 10:34:42.429382: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b05400000 of size 14680064 next 18446744073709551615 2023-06-14 10:34:42.430172: I tensorflow/core/common_runtime/bfc_allocator.cc:1040] Next region of size 67108864 2023-06-14 10:34:42.430967: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b06200000 of size 18874368 next 135 2023-06-14 10:34:42.431815: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b07400000 of size 7077888 next 142 2023-06-14 10:34:42.432618: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b07ac0000 of size 589824 next 147 2023-06-14 10:34:42.433348: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b07b50000 of size 442368 next 149 2023-06-14 10:34:42.434102: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b07bbc000 of size 256 next 151 2023-06-14 10:34:42.434857: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b07bbc100 of size 110592 next 152 2023-06-14 10:34:42.435607: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b07bd7100 of size 256 next 153 2023-06-14 10:34:42.436375: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b07bd7200 of size 36864 next 154 2023-06-14 10:34:42.437193: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b07be0200 of size 256 next 155 2023-06-14 10:34:42.437905: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b07be0300 of size 1536 next 156 2023-06-14 10:34:42.438798: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b07be0900 of size 256 next 157 2023-06-14 10:34:42.439935: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b07be0a00 of size 768 next 158 2023-06-14 10:34:42.440808: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b07be0d00 of size 256 next 159 2023-06-14 10:34:42.441669: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b07be0e00 of size 9216 next 160 2023-06-14 10:34:42.442480: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b07be3200 of size 256 next 161 2023-06-14 10:34:42.443302: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b07be3300 of size 18432 next 162 2023-06-14 10:34:42.444104: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b07be7b00 of size 256 next 163 2023-06-14 10:34:42.444939: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b07be7c00 of size 36864 next 164 2023-06-14 10:34:42.445680: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b07bf0c00 of size 256 next 165 2023-06-14 10:34:42.446444: I 
tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b07bf0d00 of size 73728 next 166 2023-06-14 10:34:42.447268: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b07c02d00 of size 256 next 167 2023-06-14 10:34:42.448452: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b07c02e00 of size 147456 next 168 2023-06-14 10:34:42.466516: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b07c26e00 of size 256 next 169 2023-06-14 10:34:42.467471: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b07c26f00 of size 294912 next 170 2023-06-14 10:34:42.468450: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b07c6ef00 of size 512 next 171 2023-06-14 10:34:42.469212: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b07c6f100 of size 589824 next 172 2023-06-14 10:34:42.469984: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b07cff100 of size 512 next 173 2023-06-14 10:34:42.470803: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b07cff300 of size 1179648 next 174 2023-06-14 10:34:42.471621: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b07e1f300 of size 1024 next 175 2023-06-14 10:34:42.472530: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b07e1f700 of size 2359296 next 176 2023-06-14 10:34:42.473533: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b0805f700 of size 1024 next 177 2023-06-14 10:34:42.474415: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b0805fb00 of size 5899520 next 70 2023-06-14 10:34:42.475163: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b08600000 of size 29360128 next 18446744073709551615 2023-06-14 10:34:42.475905: I tensorflow/core/common_runtime/bfc_allocator.cc:1040] Next region of size 134217728 2023-06-14 10:34:42.476691: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b0a200000 of size 37748736 next 73 2023-06-14 10:34:42.477513: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b0c600000 of size 37748736 next 71 2023-06-14 10:34:42.478332: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b0ea00000 of size 28311552 next 75 2023-06-14 10:34:42.479142: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b10500000 of size 7077888 next 78 2023-06-14 10:34:42.492186: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b10bc0000 of size 9437184 next 133 2023-06-14 10:34:42.493222: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b114c0000 of size 13893632 next 18446744073709551615 2023-06-14 10:34:42.494097: I tensorflow/core/common_runtime/bfc_allocator.cc:1040] Next region of size 268435456 2023-06-14 10:34:42.495726: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b12200000 of size 28311552 next 139 2023-06-14 10:34:42.496604: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b13d00000 of size 2048 next 178 2023-06-14 10:34:42.497845: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b13d00800 of size 9437184 next 179 2023-06-14 10:34:42.498307: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b14600800 of size 2048 next 180 2023-06-14 10:34:42.499004: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b14601000 of size 18874368 next 181 2023-06-14 10:34:42.499815: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b15801000 of size 4096 next 182 2023-06-14 10:34:42.500642: 
I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b15802000 of size 37748736 next 183 2023-06-14 10:34:42.501469: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b17c02000 of size 4096 next 184 2023-06-14 10:34:42.502276: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b17c03000 of size 28311552 next 185 2023-06-14 10:34:42.503019: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b19703000 of size 2048 next 186 2023-06-14 10:34:42.503853: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b19703800 of size 9437184 next 187 2023-06-14 10:34:42.504742: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b1a003800 of size 2048 next 188 2023-06-14 10:34:42.505605: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b1a004000 of size 7077888 next 189 2023-06-14 10:34:42.506461: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b1a6c4000 of size 1024 next 190 2023-06-14 10:34:42.507645: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b1a6c4400 of size 2359296 next 191 2023-06-14 10:34:42.508645: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b1a904400 of size 1024 next 192 2023-06-14 10:34:42.509629: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b1a904800 of size 1769472 next 193 2023-06-14 10:34:42.510792: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b1aab4800 of size 512 next 194 2023-06-14 10:34:42.527177: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b1aab4a00 of size 589824 next 195 2023-06-14 10:34:42.527963: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b1ab44a00 of size 512 next 196 2023-06-14 10:34:42.528782: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b1ab44c00 of size 442368 next 197 2023-06-14 10:34:42.529543: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b1abb0c00 of size 256 next 198 2023-06-14 10:34:42.530345: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b1abb0d00 of size 147456 next 199 2023-06-14 10:34:42.531189: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b1abd4d00 of size 256 next 200 2023-06-14 10:34:42.531953: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b1abd4e00 of size 110592 next 201 2023-06-14 10:34:42.532739: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b1abefe00 of size 256 next 202 2023-06-14 10:34:42.533511: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b1abeff00 of size 36864 next 203 2023-06-14 10:34:42.534328: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b1abf8f00 of size 256 next 204 2023-06-14 10:34:42.535060: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b1abf9000 of size 1536 next 205 2023-06-14 10:34:42.535868: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b1abf9600 of size 256 next 206 2023-06-14 10:34:42.536637: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b1abf9700 of size 768 next 207 2023-06-14 10:34:42.537671: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b1abf9a00 of size 256 next 208 2023-06-14 10:34:42.538614: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b1abf9b00 of size 9216 next 209 2023-06-14 10:34:42.539543: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b1abfbf00 of size 256 next 210 2023-06-14 10:34:42.540545: I 
tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b1abfc000 of size 18432 next 211 2023-06-14 10:34:42.541536: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b1ac00800 of size 256 next 212 2023-06-14 10:34:42.542445: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b1ac00900 of size 36864 next 213 2023-06-14 10:34:42.543321: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b1ac09900 of size 256 next 214 2023-06-14 10:34:42.544081: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b1ac09a00 of size 73728 next 215 2023-06-14 10:34:42.544908: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b1ac1ba00 of size 256 next 216 2023-06-14 10:34:42.545656: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b1ac1bb00 of size 147456 next 217 2023-06-14 10:34:42.546477: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b1ac3fb00 of size 256 next 218 2023-06-14 10:34:42.547321: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b1ac3fc00 of size 294912 next 219 2023-06-14 10:34:42.548144: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b1ac87c00 of size 512 next 220 2023-06-14 10:34:42.564746: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b1ac87e00 of size 589824 next 221 2023-06-14 10:34:42.565501: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b1ad17e00 of size 512 next 222 2023-06-14 10:34:42.566347: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b1ad18000 of size 1179648 next 223 2023-06-14 10:34:42.567199: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b1ae38000 of size 1024 next 224 2023-06-14 10:34:42.567939: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b1ae38400 of size 2359296 next 225 2023-06-14 10:34:42.568758: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b1b078400 of size 1024 next 226 2023-06-14 10:34:42.569575: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b1b078800 of size 4718592 next 227 2023-06-14 10:34:42.570393: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b1b4f8800 of size 2048 next 228 2023-06-14 10:34:42.571212: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b1b4f9000 of size 9437184 next 229 2023-06-14 10:34:42.571993: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b1bdf9000 of size 2048 next 230 2023-06-14 10:34:42.573051: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b1bdf9800 of size 18874368 next 231 2023-06-14 10:34:42.574225: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b1cff9800 of size 4096 next 232 2023-06-14 10:34:42.575177: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b1cffa800 of size 37748736 next 233 2023-06-14 10:34:42.575989: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b1f3fa800 of size 4096 next 234 2023-06-14 10:34:42.576815: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b1f3fb800 of size 48252928 next 18446744073709551615 2023-06-14 10:34:42.577665: I tensorflow/core/common_runtime/bfc_allocator.cc:1040] Next region of size 536870912 2023-06-14 10:34:42.578453: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b22200000 of size 2048 next 236 2023-06-14 10:34:42.579296: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b22200800 of size 9437184 next 237 2023-06-14 10:34:42.593538: I 
tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b22b00800 of size 2048 next 238 2023-06-14 10:34:42.594239: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b22b01000 of size 7077888 next 239 2023-06-14 10:34:42.595060: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b231c1000 of size 1024 next 240 2023-06-14 10:34:42.595966: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b231c1400 of size 2359296 next 241 2023-06-14 10:34:42.596939: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b23401400 of size 1024 next 242 2023-06-14 10:34:42.597602: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b23401800 of size 1769472 next 243 2023-06-14 10:34:42.598322: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b235b1800 of size 512 next 244 2023-06-14 10:34:42.599274: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b235b1a00 of size 589824 next 245 2023-06-14 10:34:42.600257: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b23641a00 of size 512 next 246 2023-06-14 10:34:42.601274: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b23641c00 of size 442368 next 247 2023-06-14 10:34:42.602196: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b236adc00 of size 256 next 248 2023-06-14 10:34:42.603058: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b236add00 of size 147456 next 249 2023-06-14 10:34:42.603787: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b236d1d00 of size 256 next 250 2023-06-14 10:34:42.604539: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b236d1e00 of size 110592 next 251 2023-06-14 10:34:42.605391: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b236ece00 of size 256 next 252 2023-06-14 10:34:42.606241: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b236ecf00 of size 36864 next 253 2023-06-14 10:34:42.611615: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b236f5f00 of size 256 next 254 2023-06-14 10:34:42.624909: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b236f6000 of size 1536 next 255 2023-06-14 10:34:42.626164: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b236f6600 of size 256 next 256 2023-06-14 10:34:42.626935: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b236f6700 of size 256 next 257 2023-06-14 10:34:42.627725: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b236f6800 of size 256 next 258 2023-06-14 10:34:42.628565: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b236f6900 of size 256 next 259 2023-06-14 10:34:42.629342: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b236f6a00 of size 256 next 260 2023-06-14 10:34:42.630170: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b236f6b00 of size 256 next 261 2023-06-14 10:34:42.630994: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b236f6c00 of size 256 next 262 2023-06-14 10:34:42.631809: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b236f6d00 of size 256 next 263 2023-06-14 10:34:42.632663: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b236f6e00 of size 256 next 264 2023-06-14 10:34:42.633342: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b236f6f00 of size 256 next 265 2023-06-14 10:34:42.634269: I 
tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b236f7000 of size 256 next 269 2023-06-14 10:34:42.635071: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b236f7100 of size 256 next 266 2023-06-14 10:34:42.635916: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] Free at b236f7200 of size 256 next 270 2023-06-14 10:34:42.636728: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b236f7300 of size 256 next 273 2023-06-14 10:34:42.637566: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] Free at b236f7400 of size 235928832 next 268 2023-06-14 10:34:42.638576: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b317f7100 of size 278957824 next 18446744073709551615 2023-06-14 10:34:42.639642: I tensorflow/core/common_runtime/bfc_allocator.cc:1040] Next region of size 1073741824 2023-06-14 10:34:42.640472: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b42200000 of size 78643200 next 271 2023-06-14 10:34:42.641471: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] InUse at b46d00000 of size 216268800 next 272 2023-06-14 10:34:42.642427: I tensorflow/core/common_runtime/bfc_allocator.cc:1060] Free at b53b40000 of size 778829824 next 18446744073709551615 2023-06-14 10:34:42.643294: I tensorflow/core/common_runtime/bfc_allocator.cc:1065] Summary of in-use Chunks by size: 2023-06-14 10:34:42.644113: I tensorflow/core/common_runtime/bfc_allocator.cc:1068] 103 Chunks of size 256 totalling 25.8KiB 2023-06-14 10:34:42.644911: I tensorflow/core/common_runtime/bfc_allocator.cc:1068] 16 Chunks of size 512 totalling 8.0KiB 2023-06-14 10:34:42.645733: I tensorflow/core/common_runtime/bfc_allocator.cc:1068] 4 Chunks of size 768 totalling 3.0KiB 2023-06-14 10:34:42.646602: I tensorflow/core/common_runtime/bfc_allocator.cc:1068] 16 Chunks of size 1024 totalling 16.0KiB 2023-06-14 10:34:42.647459: I tensorflow/core/common_runtime/bfc_allocator.cc:1068] 1 Chunks of size 1280 totalling 1.2KiB 2023-06-14 10:34:42.648313: I tensorflow/core/common_runtime/bfc_allocator.cc:1068] 3 Chunks of size 1536 totalling 4.5KiB 2023-06-14 10:34:42.670989: I tensorflow/core/common_runtime/bfc_allocator.cc:1068] 17 Chunks of size 2048 totalling 34.0KiB 2023-06-14 10:34:42.672316: I tensorflow/core/common_runtime/bfc_allocator.cc:1068] 2 Chunks of size 3840 totalling 7.5KiB 2023-06-14 10:34:42.673253: I tensorflow/core/common_runtime/bfc_allocator.cc:1068] 7 Chunks of size 4096 totalling 28.0KiB 2023-06-14 10:34:42.674115: I tensorflow/core/common_runtime/bfc_allocator.cc:1068] 3 Chunks of size 5120 totalling 15.0KiB 2023-06-14 10:34:42.674987: I tensorflow/core/common_runtime/bfc_allocator.cc:1068] 3 Chunks of size 9216 totalling 27.0KiB 2023-06-14 10:34:42.675785: I tensorflow/core/common_runtime/bfc_allocator.cc:1068] 1 Chunks of size 13824 totalling 13.5KiB 2023-06-14 10:34:42.676590: I tensorflow/core/common_runtime/bfc_allocator.cc:1068] 4 Chunks of size 18432 totalling 72.0KiB 2023-06-14 10:34:42.677435: I tensorflow/core/common_runtime/bfc_allocator.cc:1068] 7 Chunks of size 36864 totalling 252.0KiB 2023-06-14 10:34:42.678269: I tensorflow/core/common_runtime/bfc_allocator.cc:1068] 1 Chunks of size 55296 totalling 54.0KiB 2023-06-14 10:34:42.679095: I tensorflow/core/common_runtime/bfc_allocator.cc:1068] 3 Chunks of size 73728 totalling 216.0KiB 2023-06-14 10:34:42.679861: I tensorflow/core/common_runtime/bfc_allocator.cc:1068] 4 Chunks of size 110592 totalling 432.0KiB 2023-06-14 10:34:42.696583: I 
tensorflow/core/common_runtime/bfc_allocator.cc:1068] 7 Chunks of size 147456 totalling 1008.0KiB 2023-06-14 10:34:42.697808: I tensorflow/core/common_runtime/bfc_allocator.cc:1068] 1 Chunks of size 184320 totalling 180.0KiB 2023-06-14 10:34:42.699269: I tensorflow/core/common_runtime/bfc_allocator.cc:1068] 1 Chunks of size 274176 totalling 267.8KiB 2023-06-14 10:34:42.700739: I tensorflow/core/common_runtime/bfc_allocator.cc:1068] 4 Chunks of size 294912 totalling 1.12MiB 2023-06-14 10:34:42.701499: I tensorflow/core/common_runtime/bfc_allocator.cc:1068] 4 Chunks of size 442368 totalling 1.69MiB 2023-06-14 10:34:42.702322: I tensorflow/core/common_runtime/bfc_allocator.cc:1068] 7 Chunks of size 589824 totalling 3.94MiB 2023-06-14 10:34:42.703082: I tensorflow/core/common_runtime/bfc_allocator.cc:1068] 1 Chunks of size 739072 totalling 721.8KiB 2023-06-14 10:34:42.703868: I tensorflow/core/common_runtime/bfc_allocator.cc:1068] 3 Chunks of size 1179648 totalling 3.38MiB 2023-06-14 10:34:42.704652: I tensorflow/core/common_runtime/bfc_allocator.cc:1068] 1 Chunks of size 1327104 totalling 1.27MiB 2023-06-14 10:34:42.705518: I tensorflow/core/common_runtime/bfc_allocator.cc:1068] 3 Chunks of size 1572608 totalling 4.50MiB 2023-06-14 10:34:42.706269: I tensorflow/core/common_runtime/bfc_allocator.cc:1068] 3 Chunks of size 1769472 totalling 5.06MiB 2023-06-14 10:34:42.707230: I tensorflow/core/common_runtime/bfc_allocator.cc:1068] 7 Chunks of size 2359296 totalling 15.75MiB 2023-06-14 10:34:42.708071: I tensorflow/core/common_runtime/bfc_allocator.cc:1068] 1 Chunks of size 2490368 totalling 2.38MiB 2023-06-14 10:34:42.708868: I tensorflow/core/common_runtime/bfc_allocator.cc:1068] 1 Chunks of size 2949888 totalling 2.81MiB 2023-06-14 10:34:42.709615: I tensorflow/core/common_runtime/bfc_allocator.cc:1068] 1 Chunks of size 3013120 totalling 2.87MiB 2023-06-14 10:34:42.710373: I tensorflow/core/common_runtime/bfc_allocator.cc:1068] 2 Chunks of size 4718592 totalling 9.00MiB 2023-06-14 10:34:42.711300: I tensorflow/core/common_runtime/bfc_allocator.cc:1068] 1 Chunks of size 5899520 totalling 5.63MiB 2023-06-14 10:34:42.724082: I tensorflow/core/common_runtime/bfc_allocator.cc:1068] 4 Chunks of size 7077888 totalling 27.00MiB 2023-06-14 10:34:42.724898: I tensorflow/core/common_runtime/bfc_allocator.cc:1068] 1 Chunks of size 7340032 totalling 7.00MiB 2023-06-14 10:34:42.725848: I tensorflow/core/common_runtime/bfc_allocator.cc:1068] 6 Chunks of size 9437184 totalling 54.00MiB 2023-06-14 10:34:42.726698: I tensorflow/core/common_runtime/bfc_allocator.cc:1068] 1 Chunks of size 13893632 totalling 13.25MiB 2023-06-14 10:34:42.728118: I tensorflow/core/common_runtime/bfc_allocator.cc:1068] 1 Chunks of size 14680064 totalling 14.00MiB 2023-06-14 10:34:42.728935: I tensorflow/core/common_runtime/bfc_allocator.cc:1068] 3 Chunks of size 18874368 totalling 54.00MiB 2023-06-14 10:34:42.729744: I tensorflow/core/common_runtime/bfc_allocator.cc:1068] 3 Chunks of size 28311552 totalling 81.00MiB 2023-06-14 10:34:42.730566: I tensorflow/core/common_runtime/bfc_allocator.cc:1068] 1 Chunks of size 29360128 totalling 28.00MiB 2023-06-14 10:34:42.731314: I tensorflow/core/common_runtime/bfc_allocator.cc:1068] 4 Chunks of size 37748736 totalling 144.00MiB 2023-06-14 10:34:42.732135: I tensorflow/core/common_runtime/bfc_allocator.cc:1068] 1 Chunks of size 48252928 totalling 46.02MiB 2023-06-14 10:34:42.732930: I tensorflow/core/common_runtime/bfc_allocator.cc:1068] 1 Chunks of size 78643200 totalling 75.00MiB 2023-06-14 
10:34:42.733754: I tensorflow/core/common_runtime/bfc_allocator.cc:1068] 1 Chunks of size 216268800 totalling 206.25MiB 2023-06-14 10:34:42.734592: I tensorflow/core/common_runtime/bfc_allocator.cc:1068] 1 Chunks of size 278957824 totalling 266.03MiB 2023-06-14 10:34:42.735323: I tensorflow/core/common_runtime/bfc_allocator.cc:1072] Sum Total of in-use chunks: 1.05GiB 2023-06-14 10:34:42.736141: I tensorflow/core/common_runtime/bfc_allocator.cc:1074] total_region_allocated_bytes_: 2145386496 memory_limit_: 2911895552 available bytes: 766509056 curr_region_allocation_bytes_: 2147483648 2023-06-14 10:34:42.736961: I tensorflow/core/common_runtime/bfc_allocator.cc:1080] Stats: Limit: 2911895552 InUse: 1130627584 MaxInUse: 1366556672 NumAllocs: 657 MaxAllocSize: 278957824 Reserved: 0 PeakReserved: 0 LargestFreeBlock: 0

2023-06-14 10:34:42.737936: W tensorflow/core/common_runtime/bfc_allocator.cc:468] **__****x*****____ 2023-06-14 10:34:42.738649: W tensorflow/core/framework/op_kernel.cc:1692] OP_REQUIRES failed at conv_ops.cc:682 : Resource exhausted: OOM when allocating tensor with shape[4,16,1920,2560] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc Traceback (most recent call last): File "C:\ProgramData\Anaconda3\envs\sleap130\Scripts\sleap-train-script.py", line 33, in sys.exit(load_entry_point('sleap==1.3.0', 'console_scripts', 'sleap-train')()) File "C:\ProgramData\Anaconda3\envs\sleap130\lib\site-packages\sleap\nn\training.py", line 2014, in main trainer.train() File "C:\ProgramData\Anaconda3\envs\sleap130\lib\site-packages\sleap\nn\training.py", line 943, in train verbose=2, File "C:\ProgramData\Anaconda3\envs\sleap130\lib\site-packages\keras\engine\training.py", line 1184, in fit tmp_logs = self.train_function(iterator) File "C:\ProgramData\Anaconda3\envs\sleap130\lib\site-packages\tensorflow\python\eager\def_function.py", line 885, in call result = self._call(*args, kwds) File "C:\ProgramData\Anaconda3\envs\sleap130\lib\site-packages\tensorflow\python\eager\def_function.py", line 950, in _call return self._stateless_fn(*args, **kwds) File "C:\ProgramData\Anaconda3\envs\sleap130\lib\site-packages\tensorflow\python\eager\function.py", line 3040, in call filtered_flat_args, captured_inputs=graph_function.captured_inputs) # pylint: disable=protected-access File "C:\ProgramData\Anaconda3\envs\sleap130\lib\site-packages\tensorflow\python\eager\function.py", line 1964, in _call_flat ctx, args, cancellation_manager=cancellation_manager)) File "C:\ProgramData\Anaconda3\envs\sleap130\lib\site-packages\tensorflow\python\eager\function.py", line 596, in call ctx=ctx) File "C:\ProgramData\Anaconda3\envs\sleap130\lib\site-packages\tensorflow\python\eager\execute.py", line 60, in quick_execute inputs, attrs, num_outputs) tensorflow.python.framework.errors_impl.ResourceExhaustedError: OOM when allocating tensor with shape[4,16,1920,2560] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc [[node model/stack0_enc0_conv0/Conv2D (defined at \lib\site-packages\sleap\nn\training.py:943) ]] Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode. [Op:__inference_train_function_7558]

Function call stack: train_function

INFO:sleap.nn.callbacks:Closing the reporter controller/context. INFO:sleap.nn.callbacks:Closing the training controller socket/context. Run Path: C:/Users/jverpeut/Desktop/SLEAP_workshop/Testdataset\models\230614_103343.single_instance.n=31

jverpeut commented 1 year ago

I just found that the error persists in Google Colab:

INFO:numexpr.utils:NumExpr defaulting to 2 threads. INFO:sleap.nn.training:Versions: SLEAP: 1.3.1 TensorFlow: 2.8.4 Numpy: 1.22.4 Python: 3.10.12 OS: Linux-5.15.107+-x86_64-with-glibc2.31 INFO:sleap.nn.training:Training labels file: labels.v001.pkg.slp INFO:sleap.nn.training:Training profile: single_instance.json INFO:sleap.nn.training: INFO:sleap.nn.training:Arguments: INFO:sleap.nn.training:{ "training_job_path": "single_instance.json", "labels_path": "labels.v001.pkg.slp", "video_paths": [ "" ], "val_labels": null, "test_labels": null, "base_checkpoint": null, "tensorboard": false, "save_viz": false, "zmq": false, "run_name": "", "prefix": "", "suffix": "", "cpu": false, "first_gpu": false, "last_gpu": false, "gpu": "auto" } INFO:sleap.nn.training: INFO:sleap.nn.training:Training job: INFO:sleap.nn.training:{ "data": { "labels": { "training_labels": null, "validation_labels": null, "validation_fraction": 0.1, "test_labels": null, "split_by_inds": false, "training_inds": null, "validation_inds": null, "test_inds": null, "search_path_hints": [], "skeletons": [] }, "preprocessing": { "ensure_rgb": false, "ensure_grayscale": false, "imagenet_mode": null, "input_scaling": 1.75, "pad_to_stride": null, "resize_and_pad_to_target": true, "target_height": null, "target_width": null }, "instance_cropping": { "center_on_part": null, "crop_size": null, "crop_size_detection_padding": 16 } }, "model": { "backbone": { "leap": null, "unet": { "stem_stride": null, "max_stride": 64, "output_stride": 2, "filters": 16, "filters_rate": 2.0, "middle_block": true, "up_interpolate": true, "stacks": 1 }, "hourglass": null, "resnet": null, "pretrained_encoder": null }, "heads": { "single_instance": { "part_names": null, "sigma": 2.5, "output_stride": 2, "loss_weight": 1.0, "offset_refinement": false }, "centroid": null, "centered_instance": null, "multi_instance": null, "multi_class_bottomup": null, "multi_class_topdown": null }, "base_checkpoint": null }, "optimization": { "preload_data": true, "augmentation_config": { "rotate": true, "rotation_min_angle": -180.0, "rotation_max_angle": 180.0, "translate": false, "translate_min": -5, "translate_max": 5, "scale": false, "scale_min": 0.9, "scale_max": 1.1, "uniform_noise": false, "uniform_noise_min_val": 0.0, "uniform_noise_max_val": 10.0, "gaussian_noise": false, "gaussian_noise_mean": 5.0, "gaussian_noise_stddev": 1.0, "contrast": true, "contrast_min_gamma": 0.5, "contrast_max_gamma": 2.0, "brightness": true, "brightness_min_val": 0.0, "brightness_max_val": 10.0, "random_crop": false, "random_crop_height": 256, "random_crop_width": 256, "random_flip": true, "flip_horizontal": false }, "online_shuffling": true, "shuffle_buffer_size": 128, "prefetch": true, "batch_size": 4, "batches_per_epoch": null, "min_batches_per_epoch": 200, "val_batches_per_epoch": null, "min_val_batches_per_epoch": 10, "epochs": 200, "optimizer": "adam", "initial_learning_rate": 0.0001, "learning_rate_schedule": { "reduce_on_plateau": true, "reduction_factor": 0.5, "plateau_min_delta": 1e-06, "plateau_patience": 5, "plateau_cooldown": 3, "min_learning_rate": 1e-08 }, "hard_keypoint_mining": { "online_mining": false, "hard_to_easy_ratio": 2.0, "min_hard_keypoints": 2, "max_hard_keypoints": null, "loss_scale": 5.0 }, "early_stopping": { "stop_training_on_plateau": true, "plateau_min_delta": 1e-08, "plateau_patience": 10 } }, "outputs": { "save_outputs": true, "run_name": "230614_111042", "run_name_prefix": "", "run_name_suffix": ".single_instance", "runs_folder": 
"C:/Users/jverpeut/Desktop/SLEAP_workshop/Testdataset\models", "tags": [ "" ], "save_visualizations": true, "delete_viz_images": true, "zip_outputs": false, "log_to_csv": true, "checkpointing": { "initial_model": false, "best_model": true, "every_epoch": false, "latest_model": false, "final_model": false }, "tensorboard": { "write_logs": false, "loss_frequency": "epoch", "architecture_graph": false, "profile_graph": false, "visualizations": true }, "zmq": { "subscribe_to_controller": false, "controller_address": "tcp://127.0.0.1:9000", "controller_polling_timeout": 10, "publish_updates": false, "publish_address": "tcp://127.0.0.1:9001" } }, "name": "", "description": "", "sleap_version": "1.3.0", "filename": "single_instance.json" } INFO:sleap.nn.training: INFO:sleap.nn.training:Auto-selected GPU 0 with 15098 MiB of free memory. INFO:sleap.nn.training:Using GPU 0 for acceleration. INFO:sleap.nn.training:Disabled GPU memory pre-allocation. INFO:sleap.nn.training:System: GPUs: 1/1 available Device: /physical_device:GPU:0 Available: True Initalized: False Memory growth: True INFO:sleap.nn.training: INFO:sleap.nn.training:Initializing trainer... INFO:sleap.nn.training:Loading training labels from: labels.v001.pkg.slp INFO:sleap.nn.training:Creating training and validation splits from validation fraction: 0.1 INFO:sleap.nn.training: Splits: Training = 36 / Validation = 4. INFO:sleap.nn.training:Setting up for training... INFO:sleap.nn.training:Setting up pipeline builders... INFO:sleap.nn.training:Setting up model... INFO:sleap.nn.training:Building test pipeline... INFO:sleap.nn.training:Loaded test example. [3.588s] INFO:sleap.nn.training: Input shape: (1920, 2560, 1) INFO:sleap.nn.training:Created Keras model. INFO:sleap.nn.training: Backbone: UNet(stacks=1, filters=16, filters_rate=2.0, kernel_size=3, stem_kernel_size=7, convs_per_block=2, stem_blocks=0, down_blocks=6, middle_block=True, up_blocks=5, up_interpolate=True, block_contraction=False) INFO:sleap.nn.training: Max stride: 64 INFO:sleap.nn.training: Parameters: 31,449,243 INFO:sleap.nn.training: Heads: INFO:sleap.nn.training: [0] = SingleInstanceConfmapsHead(part_names=['nose1', 'neck1', 'earL1', 'earR1', 'forelegL1', 'forelegR1', 'tailstart1', 'hindlegL1', 'hindlegR1', 'tail1', 'tailend1'], sigma=2.5, output_stride=2, loss_weight=1.0) INFO:sleap.nn.training: Outputs: INFO:sleap.nn.training: [0] = KerasTensor(type_spec=TensorSpec(shape=(None, 960, 1280, 11), dtype=tf.float32, name=None), name='SingleInstanceConfmapsHead/BiasAdd:0', description="created by layer 'SingleInstanceConfmapsHead'") INFO:sleap.nn.training:Training from scratch INFO:sleap.nn.training:Setting up data pipelines... INFO:sleap.nn.training:Training set: n = 36 INFO:sleap.nn.training:Validation set: n = 4 INFO:sleap.nn.training:Setting up optimization... INFO:sleap.nn.training: Learning rate schedule: LearningRateScheduleConfig(reduce_on_plateau=True, reduction_factor=0.5, plateau_min_delta=1e-06, plateau_patience=5, plateau_cooldown=3, min_learning_rate=1e-08) INFO:sleap.nn.training: Early stopping: EarlyStoppingConfig(stop_training_on_plateau=True, plateau_min_delta=1e-08, plateau_patience=10) INFO:sleap.nn.training:Setting up outputs... INFO:sleap.nn.training:Created run path: C:/Users/jverpeut/Desktop/SLEAP_workshop/Testdataset\models/230614_111042.single_instance INFO:sleap.nn.training:Setting up visualization... Unable to use Qt backend for matplotlib. This probably means Qt is running headless. Unable to use Qt backend for matplotlib. 
This probably means Qt is running headless. INFO:sleap.nn.training:Finished trainer set up. [4.7s] INFO:sleap.nn.training:Creating tf.data.Datasets for training data generation... INFO:sleap.nn.training:Finished creating training datasets. [3.5s] INFO:sleap.nn.training:Starting training loop... Epoch 1/200 2023-06-14 18:23:44.077325: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 216268800 exceeds 10% of free system memory. 2023-06-14 18:23:44.220492: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 54067200 exceeds 10% of free system memory. 2023-06-14 18:23:44.476186: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 216268800 exceeds 10% of free system memory. 2023-06-14 18:23:44.593199: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 54067200 exceeds 10% of free system memory. 2023-06-14 18:23:44.805005: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 39321608 exceeds 10% of free system memory. 2023-06-14 18:24:19.514425: W tensorflow/core/common_runtime/bfc_allocator.cc:462] Allocator (GPU_0_bfc) ran out of memory trying to allocate 1.76GiB (rounded to 1887436800)requested by op model/stack0_dec4_s4_to_s2_skip_concat/concat If the cause is memory fragmentation maybe the environment variable 'TF_GPU_ALLOCATOR=cuda_malloc_async' will improve the situation. Current allocation summary follows. Current allocation summary follows. 2023-06-14 18:24:19.514842: W tensorflow/core/common_runtime/bfc_allocator.cc:474] *******____**___**** 2023-06-14 18:24:19.514907: W tensorflow/core/framework/op_kernel.cc:1745] OP_REQUIRES failed at concat_op.cc:159 : RESOURCE_EXHAUSTED: OOM when allocating tensor with shape[4,96,960,1280] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc Traceback (most recent call last): File "/usr/local/bin/sleap-train", line 8, in sys.exit(main()) File "/usr/local/lib/python3.10/dist-packages/sleap/nn/training.py", line 2016, in main trainer.train() File "/usr/local/lib/python3.10/dist-packages/sleap/nn/training.py", line 936, in train self.keras_model.fit( File "/usr/local/lib/python3.10/dist-packages/keras/utils/traceback_utils.py", line 67, in error_handler raise e.with_traceback(filtered_tb) from None File "/usr/local/lib/python3.10/dist-packages/tensorflow/python/eager/execute.py", line 54, in quick_execute tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name, tensorflow.python.framework.errors_impl.ResourceExhaustedError: Graph execution error:

Detected at node 'model/stack0_dec4_s4_to_s2_skip_concat/concat' defined at (most recent call last): File "/usr/local/bin/sleap-train", line 8, in sys.exit(main()) File "/usr/local/lib/python3.10/dist-packages/sleap/nn/training.py", line 2016, in main trainer.train() File "/usr/local/lib/python3.10/dist-packages/sleap/nn/training.py", line 936, in train self.keras_model.fit( File "/usr/local/lib/python3.10/dist-packages/keras/utils/traceback_utils.py", line 64, in error_handler return fn(*args, kwargs) File "/usr/local/lib/python3.10/dist-packages/keras/engine/training.py", line 1384, in fit tmp_logs = self.train_function(iterator) File "/usr/local/lib/python3.10/dist-packages/keras/engine/training.py", line 1021, in train_function return step_function(self, iterator) File "/usr/local/lib/python3.10/dist-packages/keras/engine/training.py", line 1010, in step_function outputs = model.distribute_strategy.run(run_step, args=(data,)) File "/usr/local/lib/python3.10/dist-packages/keras/engine/training.py", line 1000, in run_step outputs = model.train_step(data) File "/usr/local/lib/python3.10/dist-packages/keras/engine/training.py", line 859, in train_step y_pred = self(x, training=True) File "/usr/local/lib/python3.10/dist-packages/keras/utils/traceback_utils.py", line 64, in error_handler return fn(*args, *kwargs) File "/usr/local/lib/python3.10/dist-packages/keras/engine/base_layer.py", line 1096, in call outputs = call_fn(inputs, args, kwargs) File "/usr/local/lib/python3.10/dist-packages/keras/utils/traceback_utils.py", line 92, in error_handler return fn(*args, kwargs) File "/usr/local/lib/python3.10/dist-packages/keras/engine/functional.py", line 451, in call return self._run_internal_graph( File "/usr/local/lib/python3.10/dist-packages/keras/engine/functional.py", line 589, in _run_internal_graph outputs = node.layer(*args, *kwargs) File "/usr/local/lib/python3.10/dist-packages/keras/utils/traceback_utils.py", line 64, in error_handler return fn(args, kwargs) File "/usr/local/lib/python3.10/dist-packages/keras/engine/base_layer.py", line 1096, in call outputs = call_fn(inputs, *args, *kwargs) File "/usr/local/lib/python3.10/dist-packages/keras/utils/traceback_utils.py", line 92, in error_handler return fn(args, **kwargs) File "/usr/local/lib/python3.10/dist-packages/keras/layers/merge.py", line 183, in call return self._merge_function(inputs) File "/usr/local/lib/python3.10/dist-packages/keras/layers/merge.py", line 531, in _merge_function return backend.concatenate(inputs, axis=self.axis) File "/usr/local/lib/python3.10/dist-packages/keras/backend.py", line 3313, in concatenate return tf.concat([to_dense(x) for x in tensors], axis) Node: 'model/stack0_dec4_s4_to_s2_skip_concat/concat' OOM when allocating tensor with shape[4,96,960,1280] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc [[{{node model/stack0_dec4_s4_to_s2_skip_concat/concat}}]] Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode. [Op:__inference_train_function_8004]
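For reference, the allocator warning above suggests trying `TF_GPU_ALLOCATOR=cuda_malloc_async`. A minimal sketch of how that could be set from a Colab cell before training, together with an optional (purely illustrative) batch-size reduction in the same training profile used above:

```python
import json
import os

# Must be set before TensorFlow initializes the GPU, i.e. at the top of the
# notebook, per the hint printed in the bfc_allocator warning.
os.environ["TF_GPU_ALLOCATOR"] = "cuda_malloc_async"

# Optionally lower the batch size in the training profile to reduce peak memory.
# "single_instance.json" is the profile name from this run; 2 is just an example.
with open("single_instance.json") as f:
    cfg = json.load(f)
cfg["optimization"]["batch_size"] = 2  # down from 4
with open("single_instance.json", "w") as f:
    json.dump(cfg, f, indent=2)
```

Training would then be launched the same way as above (`sleap-train single_instance.json labels.v001.pkg.slp`).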

jverpeut commented 1 year ago

I tried running training on the cluster and spoke with my IT about the errors:

"I see you have a process running right now, but you seem to have 3+ cores worth of work running on a single core. I can confirm you have requested and were allocated 4 cores. So for 4 cores' work to be on a single core (in practice), likely leads to the idea that your job is unaware that it has been allocated 4 cores.

Is there a configuration or command-line parameter meant to indicate that the job is supposed to be spread out across X cores? It appears that your sleap program is starting many subprocesses, but they are each running at less than full capacity (100.0%)."
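I am not sure whether SLEAP exposes such a flag; the following is only a hypothetical sketch of how TensorFlow's CPU thread pools could be pinned to the allocated core count, assuming the scheduler exposes it through an environment variable such as SLURM_CPUS_PER_TASK (an assumption about the cluster):

```python
import os
import tensorflow as tf

# Hypothetical: read the core count allocated by the scheduler
# (SLURM_CPUS_PER_TASK is assumed here; adjust for your cluster), default 4.
n_cores = int(os.environ.get("SLURM_CPUS_PER_TASK", "4"))

# Pin TensorFlow's CPU thread pools to that count. These calls must run
# before TensorFlow executes any ops.
tf.config.threading.set_intra_op_parallelism_threads(n_cores)
tf.config.threading.set_inter_op_parallelism_threads(n_cores)
```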

I created the SLEAP project on my laptop with a new environment and a fresh download of sleap-1.3.0, then labeled 50 frames. I didn't see any errors while labeling, but the .slp file seems compromised. I can send it over email if anyone can troubleshoot.

roomrys commented 1 year ago

Hi @jverpeut,

It seems this is an out-of-memory (OOM) issue:

OOM when allocating tensor with shape[4,96,960,1280] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc

I see your image sizes are HUGE: shape[4,96,960,1280] (a batch of 4 with 96 channels at 960 x 1280), due to having an input_scaling greater than 1.

"input_scaling": 1.75

I had mentioned in a previous issue that for the centered instance model (used in top-down), the input scaling should be equal to 1, but for other models (including the single instance model you are training), the input scaling should always be less than or equal to 1. Having an input scaling greater than 1 provides no new information and makes training take longer (and/or causes memory issues).

If your goal was to set the receptive field size, try adjusting the max stride instead, and decrease the input scaling to <= 1. If you have small features, these should still be visible after decreasing the input scaling (e.g. antennae that are 1 px without scaling would require the input scaling to remain at 1.0, but paws at 6 px could be down-sampled without accidentally removing the feature entirely).
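As a concrete illustration (the filename and values here are only examples, not a recommendation for your data), the relevant fields in the exported training profile are `data.preprocessing.input_scaling` and `model.backbone.unet.max_stride`:

```python
import json

# Load the exported training profile (filename is illustrative; use the JSON
# saved by the GUI or passed to sleap-train).
with open("single_instance.json") as f:
    cfg = json.load(f)

# Keep input scaling at or below 1; e.g. 0.5 halves the resolution fed to the
# network.
cfg["data"]["preprocessing"]["input_scaling"] = 0.5

# If a larger receptive field is the goal, adjust the backbone's max stride
# (cfg["model"]["backbone"]["unet"]["max_stride"]) rather than scaling the
# input above 1.

with open("single_instance.json", "w") as f:
    json.dump(cfg, f, indent=2)
```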

Thanks, Liezl

jverpeut commented 1 year ago

Liezl,

Thank you for your quick reply! That was absolutely the problem, as I had not changed that setting back. This is now running and training well using a top-down model:

Working logs ``` (base) C:\Users\jverpeut>conda activate sleap130 (sleap130) C:\Users\jverpeut>sleap-label Saving config: C:\Users\jverpeut/.sleap/1.3.0/preferences.yaml Restoring GUI state... Software versions: SLEAP: 1.3.0 TensorFlow: 2.6.3 Numpy: 1.19.5 Python: 3.7.12 OS: Windows-10-10.0.19041-SP0 Happy SLEAPing! :) Resetting monitor window. Polling: C:/Users/jverpeut/Desktop/SLEAP_workshop/Testdataset\models\230614_183811.centroid.n=50\viz\validation.*.png Start training centroid... ['sleap-train', 'C:\\Users\\jverpeut\\AppData\\Local\\Temp\\tmpyzw0jo64\\230614_183811_training_job.json', 'C:/Users/jverpeut/Desktop/SLEAP_workshop/Testdataset/labels.v001.slp', '--zmq', '--save_viz'] INFO:sleap.nn.training:Versions: SLEAP: 1.3.0 TensorFlow: 2.6.3 Numpy: 1.19.5 Python: 3.7.12 OS: Windows-10-10.0.19041-SP0 INFO:sleap.nn.training:Training labels file: C:/Users/jverpeut/Desktop/SLEAP_workshop/Testdataset/labels.v001.slp INFO:sleap.nn.training:Training profile: C:\Users\jverpeut\AppData\Local\Temp\tmpyzw0jo64\230614_183811_training_job.json INFO:sleap.nn.training: INFO:sleap.nn.training:Arguments: INFO:sleap.nn.training:{ "training_job_path": "C:\\Users\\jverpeut\\AppData\\Local\\Temp\\tmpyzw0jo64\\230614_183811_training_job.json", "labels_path": "C:/Users/jverpeut/Desktop/SLEAP_workshop/Testdataset/labels.v001.slp", "video_paths": [ "" ], "val_labels": null, "test_labels": null, "base_checkpoint": null, "tensorboard": false, "save_viz": true, "zmq": true, "run_name": "", "prefix": "", "suffix": "", "cpu": false, "first_gpu": false, "last_gpu": false, "gpu": "auto" } INFO:sleap.nn.training: INFO:sleap.nn.training:Training job: INFO:sleap.nn.training:{ "data": { "labels": { "training_labels": null, "validation_labels": null, "validation_fraction": 0.1, "test_labels": null, "split_by_inds": false, "training_inds": null, "validation_inds": null, "test_inds": null, "search_path_hints": [], "skeletons": [] }, "preprocessing": { "ensure_rgb": false, "ensure_grayscale": false, "imagenet_mode": null, "input_scaling": 0.5, "pad_to_stride": null, "resize_and_pad_to_target": true, "target_height": null, "target_width": null }, "instance_cropping": { "center_on_part": null, "crop_size": null, "crop_size_detection_padding": 16 } }, "model": { "backbone": { "leap": null, "unet": { "stem_stride": null, "max_stride": 16, "output_stride": 2, "filters": 16, "filters_rate": 2.0, "middle_block": true, "up_interpolate": true, "stacks": 1 }, "hourglass": null, "resnet": null, "pretrained_encoder": null }, "heads": { "single_instance": null, "centroid": { "anchor_part": null, "sigma": 2.5, "output_stride": 2, "loss_weight": 1.0, "offset_refinement": false }, "centered_instance": null, "multi_instance": null, "multi_class_bottomup": null, "multi_class_topdown": null }, "base_checkpoint": null }, "optimization": { "preload_data": true, "augmentation_config": { "rotate": true, "rotation_min_angle": -15.0, "rotation_max_angle": 15.0, "translate": false, "translate_min": -5, "translate_max": 5, "scale": false, "scale_min": 0.9, "scale_max": 1.1, "uniform_noise": false, "uniform_noise_min_val": 0.0, "uniform_noise_max_val": 10.0, "gaussian_noise": false, "gaussian_noise_mean": 5.0, "gaussian_noise_stddev": 1.0, "contrast": false, "contrast_min_gamma": 0.5, "contrast_max_gamma": 2.0, "brightness": false, "brightness_min_val": 0.0, "brightness_max_val": 10.0, "random_crop": false, "random_crop_height": 256, "random_crop_width": 256, "random_flip": true, "flip_horizontal": false }, "online_shuffling": true, 
"shuffle_buffer_size": 128, "prefetch": true, "batch_size": 4, "batches_per_epoch": null, "min_batches_per_epoch": 200, "val_batches_per_epoch": null, "min_val_batches_per_epoch": 10, "epochs": 200, "optimizer": "adam", "initial_learning_rate": 0.0001, "learning_rate_schedule": { "reduce_on_plateau": true, "reduction_factor": 0.5, "plateau_min_delta": 1e-06, "plateau_patience": 5, "plateau_cooldown": 3, "min_learning_rate": 1e-08 }, "hard_keypoint_mining": { "online_mining": false, "hard_to_easy_ratio": 2.0, "min_hard_keypoints": 2, "max_hard_keypoints": null, "loss_scale": 5.0 }, "early_stopping": { "stop_training_on_plateau": true, "plateau_min_delta": 1e-08, "plateau_patience": 20 } }, "outputs": { "save_outputs": true, "run_name": "230614_183811.centroid.n=50", "run_name_prefix": "", "run_name_suffix": "", "runs_folder": "C:/Users/jverpeut/Desktop/SLEAP_workshop/Testdataset\\models", "tags": [ "" ], "save_visualizations": true, "delete_viz_images": true, "zip_outputs": false, "log_to_csv": true, "checkpointing": { "initial_model": false, "best_model": true, "every_epoch": false, "latest_model": false, "final_model": false }, "tensorboard": { "write_logs": false, "loss_frequency": "epoch", "architecture_graph": false, "profile_graph": false, "visualizations": true }, "zmq": { "subscribe_to_controller": true, "controller_address": "tcp://127.0.0.1:9000", "controller_polling_timeout": 10, "publish_updates": true, "publish_address": "tcp://127.0.0.1:9001" } }, "name": "", "description": "", "sleap_version": "1.3.0", "filename": "C:\\Users\\jverpeut\\AppData\\Local\\Temp\\tmpyzw0jo64\\230614_183811_training_job.json" } INFO:sleap.nn.training: INFO:sleap.nn.training:Auto-selected GPU 0 with 4020 MiB of free memory. INFO:sleap.nn.training:Using GPU 0 for acceleration. INFO:sleap.nn.training:Disabled GPU memory pre-allocation. INFO:sleap.nn.training:System: GPUs: 1/1 available Device: /physical_device:GPU:0 Available: True Initalized: False Memory growth: True INFO:sleap.nn.training: INFO:sleap.nn.training:Initializing trainer... INFO:sleap.nn.training:Loading training labels from: C:/Users/jverpeut/Desktop/SLEAP_workshop/Testdataset/labels.v001.slp INFO:sleap.nn.training:Creating training and validation splits from validation fraction: 0.1 INFO:sleap.nn.training: Splits: Training = 45 / Validation = 5. INFO:sleap.nn.training:Setting up for training... INFO:sleap.nn.training:Setting up pipeline builders... INFO:sleap.nn.training:Setting up model... INFO:sleap.nn.training:Building test pipeline... 2023-06-14 18:38:33.466655: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX AVX2 To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. 2023-06-14 18:38:34.658497: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 2777 MB memory: -> device: 0, name: NVIDIA GeForce GTX 1050, pci bus id: 0000:01:00.0, compute capability: 6.1 2023-06-14 18:38:36.220243: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2) INFO:sleap.nn.training:Loaded test example. [7.235s] INFO:sleap.nn.training: Input shape: (544, 736, 3) INFO:sleap.nn.training:Created Keras model. 
INFO:sleap.nn.training: Backbone: UNet(stacks=1, filters=16, filters_rate=2.0, kernel_size=3, stem_kernel_size=7, convs_per_block=2, stem_blocks=0, down_blocks=4, middle_block=True, up_blocks=3, up_interpolate=True, block_contraction=False) INFO:sleap.nn.training: Max stride: 16 INFO:sleap.nn.training: Parameters: 1,953,393 INFO:sleap.nn.training: Heads: INFO:sleap.nn.training: [0] = CentroidConfmapsHead(anchor_part=None, sigma=2.5, output_stride=2, loss_weight=1.0) INFO:sleap.nn.training: Outputs: INFO:sleap.nn.training: [0] = KerasTensor(type_spec=TensorSpec(shape=(None, 272, 368, 1), dtype=tf.float32, name=None), name='CentroidConfmapsHead/BiasAdd:0', description="created by layer 'CentroidConfmapsHead'") INFO:sleap.nn.training:Training from scratch INFO:sleap.nn.training:Setting up data pipelines... INFO:sleap.nn.training:Training set: n = 45 INFO:sleap.nn.training:Validation set: n = 5 INFO:sleap.nn.training:Setting up optimization... INFO:sleap.nn.training: Learning rate schedule: LearningRateScheduleConfig(reduce_on_plateau=True, reduction_factor=0.5, plateau_min_delta=1e-06, plateau_patience=5, plateau_cooldown=3, min_learning_rate=1e-08) INFO:sleap.nn.training: Early stopping: EarlyStoppingConfig(stop_training_on_plateau=True, plateau_min_delta=1e-08, plateau_patience=20) INFO:sleap.nn.training:Setting up outputs... INFO:sleap.nn.callbacks:Training controller subscribed to: tcp://127.0.0.1:9000 (topic: ) INFO:sleap.nn.training: ZMQ controller subcribed to: tcp://127.0.0.1:9000 INFO:sleap.nn.callbacks:Progress reporter publishing on: tcp://127.0.0.1:9001 for: not_set INFO:sleap.nn.training: ZMQ progress reporter publish on: tcp://127.0.0.1:9001 INFO:sleap.nn.training:Created run path: C:/Users/jverpeut/Desktop/SLEAP_workshop/Testdataset\models\230614_183811.centroid.n=50 INFO:sleap.nn.training:Setting up visualization... 
2023-06-14 18:38:49.489744: W tensorflow/core/grappler/costs/op_level_cost_estimator.cc:690] Error in PredictCost() for the op: op: "CropAndResize" attr { key: "T" value { type: DT_FLOAT } } attr { key: "extrapolation_value" value { f: 0 } } attr { key: "method" value { s: "bilinear" } } inputs { dtype: DT_FLOAT shape { dim { size: -34 } dim { size: -35 } dim { size: -36 } dim { size: 1 } } } inputs { dtype: DT_FLOAT shape { dim { size: -2 } dim { size: 4 } } } inputs { dtype: DT_INT32 shape { dim { size: -2 } } } inputs { dtype: DT_INT32 shape { dim { size: 2 } } } device { type: "GPU" vendor: "NVIDIA" model: "NVIDIA GeForce GTX 1050" frequency: 1493 num_cores: 5 environment { key: "architecture" value: "6.1" } environment { key: "cuda" value: "11020" } environment { key: "cudnn" value: "8100" } num_registers: 65536 l1_cache_size: 24576 l2_cache_size: 524288 shared_memory_size_per_multiprocessor: 98304 memory_size: 2911895552 bandwidth: 112128000 } outputs { dtype: DT_FLOAT shape { dim { size: -2 } dim { size: -37 } dim { size: -38 } dim { size: 1 } } } 2023-06-14 18:38:52.942854: W tensorflow/core/grappler/costs/op_level_cost_estimator.cc:690] Error in PredictCost() for the op: op: "CropAndResize" attr { key: "T" value { type: DT_FLOAT } } attr { key: "extrapolation_value" value { f: 0 } } attr { key: "method" value { s: "bilinear" } } inputs { dtype: DT_FLOAT shape { dim { size: -34 } dim { size: -35 } dim { size: -36 } dim { size: 1 } } } inputs { dtype: DT_FLOAT shape { dim { size: -2 } dim { size: 4 } } } inputs { dtype: DT_INT32 shape { dim { size: -2 } } } inputs { dtype: DT_INT32 shape { dim { size: 2 } } } device { type: "GPU" vendor: "NVIDIA" model: "NVIDIA GeForce GTX 1050" frequency: 1493 num_cores: 5 environment { key: "architecture" value: "6.1" } environment { key: "cuda" value: "11020" } environment { key: "cudnn" value: "8100" } num_registers: 65536 l1_cache_size: 24576 l2_cache_size: 524288 shared_memory_size_per_multiprocessor: 98304 memory_size: 2911895552 bandwidth: 112128000 } outputs { dtype: DT_FLOAT shape { dim { size: -2 } dim { size: -37 } dim { size: -38 } dim { size: 1 } } } INFO:sleap.nn.training:Finished trainer set up. [20.1s] INFO:sleap.nn.training:Creating tf.data.Datasets for training data generation... INFO:sleap.nn.training:Finished creating training datasets. [18.2s] INFO:sleap.nn.training:Starting training loop... Epoch 1/200 2023-06-14 18:39:15.966637: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8201 2023-06-14 18:39:18.921651: W tensorflow/core/common_runtime/bfc_allocator.cc:338] Garbage collection: deallocate free memory regions (i.e., allocations) so that we can re-allocate a larger region to avoid OOM due to memory fragmentation. If you see this message frequently, you are running near the threshold of the available device memory and re-allocation may incur great performance overhead. You may try smaller batch sizes to observe the performance impact. Set TF_ENABLE_GPU_GARBAGE_COLLECTION=false if you'd like to disable this feature. 2023-06-14 18:39:19.849494: W tensorflow/core/common_runtime/bfc_allocator.cc:272] Allocator (GPU_0_bfc) ran out of memory trying to allocate 2.16GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available. 2023-06-14 18:39:20.191465: W tensorflow/core/common_runtime/bfc_allocator.cc:272] Allocator (GPU_0_bfc) ran out of memory trying to allocate 2.11GiB with freed_by_count=0. 
The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available. 2023-06-14 18:39:20.535565: W tensorflow/core/common_runtime/bfc_allocator.cc:272] Allocator (GPU_0_bfc) ran out of memory trying to allocate 2.11GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available. 2023-06-14 18:39:22.161518: W tensorflow/core/common_runtime/bfc_allocator.cc:272] Allocator (GPU_0_bfc) ran out of memory trying to allocate 1.73GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available. 2023-06-14 18:39:23.023134: W tensorflow/core/common_runtime/bfc_allocator.cc:272] Allocator (GPU_0_bfc) ran out of memory trying to allocate 1.73GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available. 2023-06-14 18:39:23.165612: W tensorflow/core/common_runtime/bfc_allocator.cc:272] Allocator (GPU_0_bfc) ran out of memory trying to allocate 2.05GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available. 2023-06-14 18:39:23.213737: W tensorflow/core/common_runtime/bfc_allocator.cc:272] Allocator (GPU_0_bfc) ran out of memory trying to allocate 2.16GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available. 2023-06-14 18:39:23.584359: W tensorflow/core/common_runtime/bfc_allocator.cc:272] Allocator (GPU_0_bfc) ran out of memory trying to allocate 2.06GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available. 2023-06-14 18:39:23.621365: W tensorflow/core/common_runtime/bfc_allocator.cc:272] Allocator (GPU_0_bfc) ran out of memory trying to allocate 2.11GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available. 2023-06-14 18:39:23.917032: W tensorflow/core/common_runtime/bfc_allocator.cc:272] Allocator (GPU_0_bfc) ran out of memory trying to allocate 2.08GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available. WARNING:tensorflow:Callback method `on_train_batch_end` is slow compared to the batch time (batch time: 0.1168s vs `on_train_batch_end` time: 0.2201s). Check your callbacks. 200/200 - 122s - loss: 1.9263e-04 - val_loss: 1.8861e-04 2023-06-14 18:41:15.873328: W tensorflow/core/common_runtime/bfc_allocator.cc:338] Garbage collection: deallocate free memory regions (i.e., allocations) so that we can re-allocate a larger region to avoid OOM due to memory fragmentation. If you see this message frequently, you are running near the threshold of the available device memory and re-allocation may incur great performance overhead. You may try smaller batch sizes to observe the performance impact. Set TF_ENABLE_GPU_GARBAGE_COLLECTION=false if you'd like to disable this feature. 
Epoch 2/200 Polling: C:/Users/jverpeut/Desktop/SLEAP_workshop/Testdataset\models\230614_183811.centroid.n=50\viz\validation.*.png 200/200 - 109s - loss: 1.8734e-04 - val_loss: 1.8607e-04 Epoch 3/200 Polling: C:/Users/jverpeut/Desktop/SLEAP_workshop/Testdataset\models\230614_183811.centroid.n=50\viz\validation.*.png 200/200 - 131s - loss: 1.8307e-04 - val_loss: 1.8374e-04 Polling: C:/Users/jverpeut/Desktop/SLEAP_workshop/Testdataset\models\230614_183811.centroid.n=50\viz\validation.*.png Epoch 4/200 200/200 - 122s - loss: 1.8261e-04 - val_loss: 1.8236e-04 Epoch 5/200 Polling: C:/Users/jverpeut/Desktop/SLEAP_workshop/Testdataset\models\230614_183811.centroid.n=50\viz\validation.*.png 200/200 - 116s - loss: 1.6985e-04 - val_loss: 1.6286e-04 Epoch 6/200 Polling: C:/Users/jverpeut/Desktop/SLEAP_workshop/Testdataset\models\230614_183811.centroid.n=50\viz\validation.*.png 200/200 - 127s - loss: 1.7169e-04 - val_loss: 1.4733e-04 Polling: C:/Users/jverpeut/Desktop/SLEAP_workshop/Testdataset\models\230614_183811.centroid.n=50\viz\validation.*.png Epoch 7/200 200/200 - 115s - loss: 1.3785e-04 - val_loss: 1.6617e-04 Epoch 8/200 Polling: C:/Users/jverpeut/Desktop/SLEAP_workshop/Testdataset\models\230614_183811.centroid.n=50\viz\validation.*.png 200/200 - 143s - loss: 1.2597e-04 - val_loss: 1.5156e-04 Polling: C:/Users/jverpeut/Desktop/SLEAP_workshop/Testdataset\models\230614_183811.centroid.n=50\viz\validation.*.png Epoch 9/200 200/200 - 107s - loss: 1.0186e-04 - val_loss: 1.7897e-04 Epoch 10/200 Polling: C:/Users/jverpeut/Desktop/SLEAP_workshop/Testdataset\models\230614_183811.centroid.n=50\viz\validation.*.png 200/200 - 145s - loss: 8.5757e-05 - val_loss: 1.7263e-04 Epoch 11/200 Polling: C:/Users/jverpeut/Desktop/SLEAP_workshop/Testdataset\models\230614_183811.centroid.n=50\viz\validation.*.png 200/200 - 111s - loss: 7.1866e-05 - val_loss: 1.6712e-04 Epoch 00011: ReduceLROnPlateau reducing learning rate to 4.999999873689376e-05. Epoch 12/200 Polling: C:/Users/jverpeut/Desktop/SLEAP_workshop/Testdataset\models\230614_183811.centroid.n=50\viz\validation.*.png 200/200 - 146s - loss: 4.6386e-05 - val_loss: 1.6608e-04 Epoch 13/200 Polling: C:/Users/jverpeut/Desktop/SLEAP_workshop/Testdataset\models\230614_183811.centroid.n=50\viz\validation.*.png 200/200 - 111s - loss: 3.6698e-05 - val_loss: 1.8564e-04 Epoch 14/200 Polling: C:/Users/jverpeut/Desktop/SLEAP_workshop/Testdataset\models\230614_183811.centroid.n=50\viz\validation.*.png Polling: C:/Users/jverpeut/Desktop/SLEAP_workshop/Testdataset\models\230614_183811.centroid.n=50\viz\train.*.png Polling: C:/Users/jverpeut/Desktop/SLEAP_workshop/Testdataset\models\230614_183811.centroid.n=50\viz\validation.*.png 200/200 - 147s - loss: 3.1395e-05 - val_loss: 1.6000e-04 Epoch 15/200 Polling: C:/Users/jverpeut/Desktop/SLEAP_workshop/Testdataset\models\230614_183811.centroid.n=50\viz\validation.*.png ```
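The repeated bfc_allocator warnings during epoch 1 mean training is running close to the GTX 1050's available memory (~2.8 GB per the log above). A minimal sketch for keeping an eye on GPU memory between epochs, using TensorFlow's experimental memory-info API (the device string `GPU:0` is an assumption about the setup):

```python
import tensorflow as tf

# Report current and peak GPU memory use (in bytes) for the first GPU.
# Available in TensorFlow >= 2.5; could be called e.g. from a Keras callback
# at the end of each epoch.
info = tf.config.experimental.get_memory_info("GPU:0")
print(f"current: {info['current'] / 1e6:.0f} MB, peak: {info['peak'] / 1e6:.0f} MB")
```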