iterative / terraform-provider-iterative

☁️ Terraform plugin for machine learning workloads: spot instance recovery & auto-termination | AWS, GCP, Azure, Kubernetes
https://registry.terraform.io/providers/iterative/iterative/latest/docs
Apache License 2.0
287 stars 27 forks source link

K8s support for specifying an existing persistent volume claim #661

Closed tasdomas closed 1 year ago

tasdomas commented 1 year ago

This can be tested with the following Terraform config (`main.tf`):

# Declare the provider plugin required by the iterative_task resource below.
terraform {
  required_providers {
    iterative = {
      source = "github.com/iterative/iterative"
    }
  }
}

# Kubernetes provider, used for the PVC and the inspection pod below.
# Reads credentials from the local kubeconfig and selects the "minikube"
# context; change config_context to target a different cluster.
provider "kubernetes" {
  config_path    = "~/.kube/config"
  config_context = "minikube"
}

# A pre-allocated PVC.
# Requests 1Gi with ReadWriteMany access. It is mounted by the "pvsee" pod
# and its name is passed to the iterative_task storage block further down.
# No storage class is set here, so the cluster picks one
# (NOTE(review): confirm the cluster's default class supports ReadWriteMany).
resource "kubernetes_persistent_volume_claim" "example" {
  metadata {
    name = "pvc"
  }
  spec {
    access_modes = ["ReadWriteMany"]
    resources {
      requests = {
        storage = "1Gi"
      }
    }
  }
}

# A busybox pod to let us inspect the contents of the PVC.
# The container idles forever (tail -f /dev/null) so that it stays available
# for `kubectl exec`; the PVC is mounted at /data.
resource "kubernetes_pod" "pvsee" {
  metadata {
    name = "pvsee"
  }

  spec {
    container {
      image   = "busybox"
      name    = "bb"
      command = ["tail", "-f", "/dev/null"]
      volume_mount {
        name       = "data"
        mount_path = "/data"
      }
    }
    volume {
      name = "data"
      persistent_volume_claim {
        # Direct reference rather than an interpolation-only string ("${...}"),
        # which Terraform 0.12+ reports as deprecated.
        claim_name = kubernetes_persistent_volume_claim.example.metadata[0].name
      }
    }
  }
}

# Iterative provider; no provider-level settings are given in this config.
provider "iterative" {}

# A task run on Kubernetes whose storage is backed by the pre-allocated PVC
# declared above instead of a volume created by the task itself.
resource "iterative_task" "example" {
  cloud     = "k8s"     # or any of: aws, gcp, az
  region    = "us-east"
  machine   = "m"       # medium. Or any of: l, xl, m+k80, xl+v100, ...
  spot      = -1        # disabled (default). Use 0 for auto-price, or >0 for hourly USD limit
  disk_size = -1        # GB. Default -1 for automatic
  storage {
    workdir = "./"      # default blank (don't upload)
    output  = "results" # default blank (don't download). Relative to workdir
    # Name of the existing PVC to use as the storage container. Written as a
    # direct reference rather than a deprecated interpolation-only string.
    container      = kubernetes_persistent_volume_claim.example.metadata[0].name
    container_path = "tpi-run" # presumably a subdirectory within the PVC; confirm against provider docs
  }
  # Note: "$${" is Terraform's escape for a literal "${" inside a heredoc.
  script = <<-END
    #!/bin/bash

    # create output directory if needed
    mkdir -p results
    # read last result (in case of spot/preemptible instance recovery)
    if test -f results/epoch.txt; then EPOCH="$(cat results/epoch.txt)"; fi
    EPOCH=$${EPOCH:-1}  # start from 1 if last result not found

    echo "(re)starting training loop from $EPOCH up to 1337 epochs"
    for epoch in $(seq $EPOCH 1337); do
      sleep 1
      echo "$epoch" | tee results/epoch.txt
    done
  END
}

Once the config is applied, run `kubectl exec -it pvsee -- sh` and then `ls /data` to inspect the contents of the PVC.