FIRST-Tech-Challenge / fmltc

FIRST Machine Learning Toolchain
Other
39 stars 14 forks source link

Use a custom image to run training and evaluation jobs with GPU. #222

Status: Closed (lizlooney closed this 2 years ago)

lizlooney commented 2 years ago

This fixes the problem where jobs had 0% GPU utilization.

@cmacfarl I didn't touch the terraform files.

lizlooney commented 2 years ago

This is ready for review now. Training works on either TPU or GPU.

github-actions[bot] commented 2 years ago

Terraform plan Succeeded for Workspace: default

Show Output

```diff
An execution plan has been generated and is shown below.
Resource actions are indicated with the following symbols:
+ create
! update in-place
- destroy
-/+ destroy and then create replacement

Terraform will perform the following actions:

  # module.dev.google_app_engine_standard_app_version.fmltc-app-v1 will be updated in-place
! resource "google_app_engine_standard_app_version" "fmltc-app-v1" {
      id = "apps/ftc-ml-firstinspires-dev/services/default/versions/v1"
      name = "apps/ftc-ml-firstinspires-dev/services/default/versions/v1"
      # (9 unchanged attributes hidden)

!     automatic_scaling {
!         min_pending_latency = "0.100s" -> "0.1s"
          # (4 unchanged attributes hidden)
          # (1 unchanged block hidden)
      }

!     deployment {
!         zip {
!             source_url = "https://storage.googleapis.com/ftc-ml-firstinspires-dev-gae-source/3d3145cb335a1e31b2b0ac65bdd5cee9.zip" -> "https://storage.googleapis.com/ftc-ml-firstinspires-dev-gae-source/ec1f11e44e9155995c7bc2d47d628cd7.zip"
              # (1 unchanged attribute hidden)
          }
      }

-     handlers {
-         auth_fail_action = "AUTH_FAIL_ACTION_REDIRECT" -> null
-         login = "LOGIN_OPTIONAL" -> null
-         security_level = "SECURE_OPTIONAL" -> null
-         url_regex = ".*" -> null

-         script {
-             script_path = "auto" -> null
          }
      }
      # (8 unchanged blocks hidden)
  }

  # module.dev.google_cloudfunctions_function.perform-action will be updated in-place
! resource "google_cloudfunctions_function" "perform-action" {
      id = "projects/ftc-ml-firstinspires-dev/locations/us-central1/functions/perform_action"
      name = "perform_action"
!     source_archive_object = "1dd5fee327f8353c423fe48a20ac4de6.zip" -> "5d6dc8abab7d69eca68b1ac0fd6a3066.zip"
      # (13 unchanged attributes hidden)
      # (2 unchanged blocks hidden)
  }

  # module.dev.google_storage_bucket_object.app-server-archive must be replaced
-/+ resource "google_storage_bucket_object" "app-server-archive" {
!     content_type = "application/zip" -> (known after apply)
!     crc32c = "vo4EKw==" -> (known after apply)
!     detect_md5hash = "PTFFyzNaHjGysKxlvdXO6Q==" -> "different hash" # forces replacement
-     event_based_hold = false -> null
!     id = "ftc-ml-firstinspires-dev-gae-source-3d3145cb335a1e31b2b0ac65bdd5cee9.zip" -> (known after apply)
+     kms_key_name = (known after apply)
!     md5hash = "PTFFyzNaHjGysKxlvdXO6Q==" -> (known after apply)
!     media_link = "https://storage.googleapis.com/download/storage/v1/b/ftc-ml-firstinspires-dev-gae-source/o/3d3145cb335a1e31b2b0ac65bdd5cee9.zip?generation=1637955360419030&alt=media" -> (known after apply)
-     metadata = {} -> null
!     name = "3d3145cb335a1e31b2b0ac65bdd5cee9.zip" -> "ec1f11e44e9155995c7bc2d47d628cd7.zip" # forces replacement
!     output_name = "3d3145cb335a1e31b2b0ac65bdd5cee9.zip" -> (known after apply)
!     self_link = "https://www.googleapis.com/storage/v1/b/ftc-ml-firstinspires-dev-gae-source/o/3d3145cb335a1e31b2b0ac65bdd5cee9.zip" -> (known after apply)
!     storage_class = "STANDARD" -> (known after apply)
-     temporary_hold = false -> null
      # (2 unchanged attributes hidden)
  }

  # module.dev.google_storage_bucket_object.cloud-function-archive must be replaced
-/+ resource "google_storage_bucket_object" "cloud-function-archive" {
!     content_type = "application/zip" -> (known after apply)
!     crc32c = "AFup5w==" -> (known after apply)
!     detect_md5hash = "HdX+4yf4NTxCP+SKIKxN5g==" -> "different hash" # forces replacement
-     event_based_hold = false -> null
!     id = "ftc-ml-firstinspires-dev-gcf-source-1dd5fee327f8353c423fe48a20ac4de6.zip" -> (known after apply)
+     kms_key_name = (known after apply)
!     md5hash = "HdX+4yf4NTxCP+SKIKxN5g==" -> (known after apply)
!     media_link = "https://storage.googleapis.com/download/storage/v1/b/ftc-ml-firstinspires-dev-gcf-source/o/1dd5fee327f8353c423fe48a20ac4de6.zip?generation=1637955360420320&alt=media" -> (known after apply)
-     metadata = {} -> null
!     name = "1dd5fee327f8353c423fe48a20ac4de6.zip" -> "5d6dc8abab7d69eca68b1ac0fd6a3066.zip" # forces replacement
!     output_name = "1dd5fee327f8353c423fe48a20ac4de6.zip" -> (known after apply)
!     self_link = "https://www.googleapis.com/storage/v1/b/ftc-ml-firstinspires-dev-gcf-source/o/1dd5fee327f8353c423fe48a20ac4de6.zip" -> (known after apply)
!     storage_class = "STANDARD" -> (known after apply)
-     temporary_hold = false -> null
      # (2 unchanged attributes hidden)
  }

  # module.dev.google_storage_bucket_object.models["object_detection-0.1.tar.gz"] will be created
+ resource "google_storage_bucket_object" "models" {
+     bucket = "ftc-ml-firstinspires-dev"
+     content_type = (known after apply)
+     crc32c = (known after apply)
+     detect_md5hash = "different hash"
+     id = (known after apply)
+     kms_key_name = (known after apply)
+     md5hash = (known after apply)
+     media_link = (known after apply)
+     name = "static/training/object_detection-0.1.tar.gz"
+     output_name = (known after apply)
+     self_link = (known after apply)
+     source = "./../../server/static/training/object_detection-0.1.tar.gz"
+     storage_class = (known after apply)
  }

  # module.dev.google_storage_bucket_object.models["object_detection-0.1_2.5.0.tar.gz"] will be destroyed
- resource "google_storage_bucket_object" "models" {
-     bucket = "ftc-ml-firstinspires-dev" -> null
-     content_type = "application/x-gzip" -> null
-     crc32c = "mSdacQ==" -> null
-     detect_md5hash = "kWJMZ5l24QiI2v8Z4AjtZA==" -> null
-     event_based_hold = false -> null
-     id = "ftc-ml-firstinspires-dev-static/training/object_detection-0.1_2.5.0.tar.gz" -> null
-     md5hash = "kWJMZ5l24QiI2v8Z4AjtZA==" -> null
-     media_link = "https://storage.googleapis.com/download/storage/v1/b/ftc-ml-firstinspires-dev/o/static%2Ftraining%2Fobject_detection-0.1_2.5.0.tar.gz?generation=1632861115972481&alt=media" -> null
-     metadata = {} -> null
-     name = "static/training/object_detection-0.1_2.5.0.tar.gz" -> null
-     output_name = "static/training/object_detection-0.1_2.5.0.tar.gz" -> null
-     self_link = "https://www.googleapis.com/storage/v1/b/ftc-ml-firstinspires-dev/o/static%2Ftraining%2Fobject_detection-0.1_2.5.0.tar.gz" -> null
-     source = "./../../server/static/training/object_detection-0.1_2.5.0.tar.gz" -> null
-     storage_class = "STANDARD" -> null
-     temporary_hold = false -> null
  }

  # module.dev.google_storage_bucket_object.models["pycocotools-2.0.tar.gz"] will be destroyed
- resource "google_storage_bucket_object" "models" {
-     bucket = "ftc-ml-firstinspires-dev" -> null
-     content_type = "application/x-gzip" -> null
-     crc32c = "OSiaLw==" -> null
-     detect_md5hash = "S+jVWSmKSKBiUovY5Op2cQ==" -> null
-     event_based_hold = false -> null
-     id = "ftc-ml-firstinspires-dev-static/training/pycocotools-2.0.tar.gz" -> null
-     md5hash = "S+jVWSmKSKBiUovY5Op2cQ==" -> null
-     media_link = "https://storage.googleapis.com/download/storage/v1/b/ftc-ml-firstinspires-dev/o/static%2Ftraining%2Fpycocotools-2.0.tar.gz?generation=1630097745467239&alt=media" -> null
-     metadata = {} -> null
-     name = "static/training/pycocotools-2.0.tar.gz" -> null
-     output_name = "static/training/pycocotools-2.0.tar.gz" -> null
-     self_link = "https://www.googleapis.com/storage/v1/b/ftc-ml-firstinspires-dev/o/static%2Ftraining%2Fpycocotools-2.0.tar.gz" -> null
-     source = "./../../server/static/training/pycocotools-2.0.tar.gz" -> null
-     storage_class = "STANDARD" -> null
-     temporary_hold = false -> null
  }

  # module.dev.google_storage_bucket_object.models["slim-0.1.tar.gz"] will be destroyed
- resource "google_storage_bucket_object" "models" {
-     bucket = "ftc-ml-firstinspires-dev" -> null
-     content_type = "application/x-gzip" -> null
-     crc32c = "TJqw6A==" -> null
-     detect_md5hash = "ozH+uwOsKjN7oFloa46CKg==" -> null
-     event_based_hold = false -> null
-     id = "ftc-ml-firstinspires-dev-static/training/slim-0.1.tar.gz" -> null
-     md5hash = "ozH+uwOsKjN7oFloa46CKg==" -> null
-     media_link = "https://storage.googleapis.com/download/storage/v1/b/ftc-ml-firstinspires-dev/o/static%2Ftraining%2Fslim-0.1.tar.gz?generation=1630097726755061&alt=media" -> null
-     metadata = {} -> null
-     name = "static/training/slim-0.1.tar.gz" -> null
-     output_name = "static/training/slim-0.1.tar.gz" -> null
-     self_link = "https://www.googleapis.com/storage/v1/b/ftc-ml-firstinspires-dev/o/static%2Ftraining%2Fslim-0.1.tar.gz" -> null
-     source = "./../../server/static/training/slim-0.1.tar.gz" -> null
-     storage_class = "STANDARD" -> null
-     temporary_hold = false -> null
  }

Plan: 3 to add, 2 to change, 5 to destroy.
```