I am not sure whether the problem is the `no on-demand pricing found` error. If so, I believe the root cause is the currency selection in the pricing code: the `currency` variable will NOT be changed to "CNY" since my region is cn-northwest-1, yet the pricing API returns something like `"pricePerUnit":{"CNY":"12.4770000000"}`, so L296-L298 skip all returned prices because "USD" cannot be found. The problem could be fixed by replacing `if p.region == "cn-north-1"` with `if strings.HasPrefix(p.region, "cn-")`.
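For reference, a minimal, self-contained sketch of the proposed change (the `currencyForRegion` helper is hypothetical; in the actual pricing provider the check is inline):

```go
package main

import (
	"fmt"
	"strings"
)

// currencyForRegion sketches the proposed fix. The AWS Pricing API quotes
// China-partition prices in CNY, so the check must cover every cn-* region
// (cn-north-1 AND cn-northwest-1), not only cn-north-1.
func currencyForRegion(region string) string {
	currency := "USD"
	if strings.HasPrefix(region, "cn-") { // was: region == "cn-north-1"
		currency = "CNY"
	}
	return currency
}

func main() {
	// With the old equality check, cn-northwest-1 resolved to "USD";
	// pricePerUnit["USD"] was then missing and every price was skipped.
	fmt.Println(currencyForRegion("cn-northwest-1")) // CNY
	fmt.Println(currencyForRegion("us-east-1"))      // USD
}
```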
Based on what you shared, it seems like Karpenter is failing to get a NodeClass from your NodePool. Can you share the NodePool that you are using with Karpenter?
Sure, Karpenter was deployed with the following Terraform code:
resource "kubectl_manifest" "karpenter_node_class" {
yaml_body = yamlencode({
apiVersion = "karpenter.k8s.aws/v1beta1"
kind = "EC2NodeClass"
metadata = {
name = "default"
namespace = helm_release.karpenter.metadata[0].namespace
labels = merge(local.k8s_labels, {
"helm-release-revision/karpenter" = tostring(helm_release.karpenter.metadata[0].revision)
})
}
spec = {
amiFamily = var.bottlerocket ? "Bottlerocket" : "AL2"
subnetSelectorTerms = [for subnet_id in data.aws_subnets.private.ids: {
id = subnet_id
}]
securityGroupSelectorTerms = [{
tags = {
"kubernetes.io/cluster/${module.eks.cluster_name}" = "owned"
}
}]
role = module.karpenter.node_iam_role_name
tags = {
"EKS" = module.eks.cluster_name
"owner" = local.owner
}
}
})
depends_on = [
helm_release.karpenter
]
}
resource "kubectl_manifest" "karpenter_on_demand_node_pool" {
yaml_body = yamlencode({
apiVersion = "karpenter.sh/v1beta1"
kind = "NodePool"
metadata = {
name = "on-demand-default"
namespace = helm_release.karpenter.metadata[0].namespace
labels = merge(local.k8s_labels, {
"helm-release-revision/karpenter" = tostring(helm_release.karpenter.metadata[0].revision)
})
}
template = {
metadata = {
labels = {
"karpenter-node-pool" = "on-demand-default"
}
}
spec = {
nodeClassRef = {
apiVersion = kubectl_manifest.karpenter_node_class.api_version
kind = kubectl_manifest.karpenter_node_class.kind
name = kubectl_manifest.karpenter_node_class.name
}
taints = [
{
key = "autoscaler"
value = "Karpenter"
effect = "NoSchedule"
}
]
requirements = [
{
key = "kubernetes.io/os"
operator = "In"
values = ["linux"]
},
{
key = "topology.kubernetes.io/zone"
operator = "In"
values = var.karpenter_on_demand_instance_zone
},
{
key = "karpenter.k8s.aws/instance-family"
operator = "In"
values = var.karpenter_on_demand_instance_families
},
{
key = "karpenter.k8s.aws/instance-size"
operator = "In"
values = var.karpenter_on_demand_instance_sizes
},
{
key = "karpenter.sh/capacity-type"
operator = "In"
values = ["on-demand"]
},
{
key = "karpenter.k8s.aws/instance-gpu-count"
operator = "DoesNotExist"
},
{
key = "karpenter.k8s.aws/instance-accelerator-count"
operator = "DoesNotExist"
}
]
disruption = {
consolidationPolicy = "WhenEmpty"
consolidateAfter = "10s"
expireAfter = "24h"
}
weight = 10
}
}
})
depends_on = [
helm_release.karpenter,
kubectl_manifest.karpenter_node_class
]
}
resource "kubectl_manifest" "karpenter_spot_node_pool" {
yaml_body = yamlencode({
apiVersion = "karpenter.sh/v1beta1"
kind = "NodePool"
metadata = {
name = "spot-default"
namespace = helm_release.karpenter.metadata[0].namespace
labels = merge(local.k8s_labels, {
"helm-release-revision/karpenter" = tostring(helm_release.karpenter.metadata[0].revision)
})
}
template = {
metadata = {
labels = {
"karpenter-node-pool" = "spot-default"
}
}
spec = {
nodeClassRef = {
apiVersion = kubectl_manifest.karpenter_node_class.api_version
kind = kubectl_manifest.karpenter_node_class.kind
name = kubectl_manifest.karpenter_node_class.name
}
taints = [
{
key = "autoscaler"
value = "Karpenter"
effect = "NoSchedule"
}
]
requirements = [
{
key = "kubernetes.io/os"
operator = "In"
values = ["linux"]
},
{
key = "topology.kubernetes.io/zone"
operator = "In"
values = var.karpenter_spot_instance_zone
},
{
key = "karpenter.k8s.aws/instance-family"
operator = "In"
values = var.karpenter_spot_instance_families
},
{
key = "karpenter.k8s.aws/instance-size"
operator = "In"
values = var.karpenter_spot_instance_sizes
},
{
key = "karpenter.sh/capacity-type"
operator = "In"
values = ["spot"]
},
{
key = "karpenter.k8s.aws/instance-gpu-count"
operator = "DoesNotExist"
},
{
key = "karpenter.k8s.aws/instance-accelerator-count"
operator = "DoesNotExist"
}
]
disruption = {
consolidationPolicy = "WhenEmpty"
consolidateAfter = "10s"
expireAfter = "24h"
}
weight = 20
}
}
})
depends_on = [
helm_release.karpenter,
kubectl_manifest.karpenter_node_class
]
}
The output of `k get ec2nodeclass -o yaml`:
apiVersion: v1
items:
- apiVersion: karpenter.k8s.aws/v1beta1
kind: EC2NodeClass
metadata:
annotations:
karpenter.k8s.aws/ec2nodeclass-hash: "16028390140742089221"
kubectl.kubernetes.io/last-applied-configuration: |
{"apiVersion":"karpenter.k8s.aws/v1beta1","kind":"EC2NodeClass","metadata":{"annotations":{},"labels":{"EKS":"xxx-dev-13","Terraform":"true","helm-release-revision/karpenter":"2"},"name":"default"},"spec":{"amiFamily":"Bottlerocket","role":"xxx-dev-13-karpenter-node-role","securityGroupSelectorTerms":[{"tags":{"kubernetes.io/cluster/xxx-dev-13":"owned"}}],"subnetSelectorTerms":[{"id":"subnet-xxx"},{"id":"subnet-xxx"},{"id":"subnet-xxx"}],"tags":{"EKS":"xxx-dev-13","owner":"xxx"}}}
creationTimestamp: "2024-02-19T06:43:10Z"
finalizers:
- karpenter.k8s.aws/termination
generation: 1
labels:
EKS: xxx-dev-13
Terraform: "true"
helm-release-revision/karpenter: "2"
name: default
resourceVersion: "362365"
uid: fa764a78-8e7a-4039-98d3-d34607a85aeb
spec:
amiFamily: Bottlerocket
metadataOptions:
httpEndpoint: enabled
httpProtocolIPv6: disabled
httpPutResponseHopLimit: 2
httpTokens: required
role: xxx-dev-13-karpenter-node-role
securityGroupSelectorTerms:
- tags:
kubernetes.io/cluster/xxx-dev-13: owned
subnetSelectorTerms:
- id: subnet-xxx
- id: subnet-xxx
- id: subnet-xxx
tags:
EKS: xxx-dev-13
owner: xxx
status:
amis:
- id: ami-0d1bdeec409b7bd69
name: bottlerocket-aws-k8s-1.29-aarch64-v1.19.1-c325a08b
requirements:
- key: kubernetes.io/arch
operator: In
values:
- arm64
- key: karpenter.k8s.aws/instance-gpu-count
operator: DoesNotExist
- key: karpenter.k8s.aws/instance-accelerator-count
operator: DoesNotExist
- id: ami-0fadcb605ee965289
name: bottlerocket-aws-k8s-1.29-nvidia-x86_64-v1.19.1-c325a08b
requirements:
- key: kubernetes.io/arch
operator: In
values:
- amd64
- key: karpenter.k8s.aws/instance-gpu-count
operator: Exists
- id: ami-0fadcb605ee965289
name: bottlerocket-aws-k8s-1.29-nvidia-x86_64-v1.19.1-c325a08b
requirements:
- key: kubernetes.io/arch
operator: In
values:
- amd64
- key: karpenter.k8s.aws/instance-accelerator-count
operator: Exists
- id: ami-0d26a80b71908b194
name: bottlerocket-aws-k8s-1.29-x86_64-v1.19.1-c325a08b
requirements:
- key: kubernetes.io/arch
operator: In
values:
- amd64
- key: karpenter.k8s.aws/instance-gpu-count
operator: DoesNotExist
- key: karpenter.k8s.aws/instance-accelerator-count
operator: DoesNotExist
instanceProfile: xxx-dev-13_18241233644497187194
securityGroups:
- id: sg-01f0cbf4488911dd0
name: xxx-dev-13-node-20240219063220192100000001
- id: sg-031d1d1f77c1dd325
name: eks-cluster-sg-xxx-dev-13-1444326569
- id: sg-093cadda468d0333c
name: xxx-dev-13-cluster-20240219063220727000000007
subnets:
- id: subnet-xxx
zone: cn-northwest-1a
- id: subnet-xxx
zone: cn-northwest-1b
- id: subnet-xxx
zone: cn-northwest-1c
kind: List
metadata:
resourceVersion: ""
The output of `k get nodepool -o yaml`:
apiVersion: v1
items:
- apiVersion: karpenter.sh/v1beta1
kind: NodePool
metadata:
annotations:
karpenter.sh/nodepool-hash: "17307646073972311821"
kubectl.kubernetes.io/last-applied-configuration: |
{"apiVersion":"karpenter.sh/v1beta1","kind":"NodePool","metadata":{"annotations":{},"labels":{"EKS":"xxx-dev-13","Terraform":"true","helm-release-revision/karpenter":"2"},"name":"on-demand-default"},"template":{"metadata":{"labels":{"karpenter-node-pool":"on-demand-default"}},"spec":{"disruption":{"consolidateAfter":"10s","consolidationPolicy":"WhenEmpty","expireAfter":"24h"},"nodeClassRef":{"apiVersion":"karpenter.k8s.aws/v1beta1","kind":"EC2NodeClass","name":"default"},"requirements":[{"key":"kubernetes.io/os","operator":"In","values":["linux"]},{"key":"topology.kubernetes.io/zone","operator":"In","values":["cn-northwest-1a"]},{"key":"karpenter.k8s.aws/instance-family","operator":"In","values":["t4g","c7g","t3a"]},{"key":"karpenter.k8s.aws/instance-size","operator":"In","values":["nano","micro","small","medium","large","xlarge","2xlarge"]},{"key":"karpenter.sh/capacity-type","operator":"In","values":["on-demand"]},{"key":"karpenter.k8s.aws/instance-gpu-count","operator":"DoesNotExist"},{"key":"karpenter.k8s.aws/instance-accelerator-count","operator":"DoesNotExist"}],"taints":[{"effect":"NoSchedule","key":"autoscaler","value":"Karpenter"}],"weight":10}}}
creationTimestamp: "2024-02-19T06:43:15Z"
generation: 1
labels:
EKS: xxx-dev-13
Terraform: "true"
helm-release-revision/karpenter: "2"
name: on-demand-default
resourceVersion: "32182"
uid: a67c8072-bc38-434f-b4db-54ace5900f5c
- apiVersion: karpenter.sh/v1beta1
kind: NodePool
metadata:
annotations:
karpenter.sh/nodepool-hash: "17307646073972311821"
kubectl.kubernetes.io/last-applied-configuration: |
{"apiVersion":"karpenter.sh/v1beta1","kind":"NodePool","metadata":{"annotations":{},"labels":{"EKS":"xxx-dev-13","Terraform":"true","helm-release-revision/karpenter":"2"},"name":"spot-default"},"template":{"metadata":{"labels":{"karpenter-node-pool":"spot-default"}},"spec":{"disruption":{"consolidateAfter":"10s","consolidationPolicy":"WhenEmpty","expireAfter":"24h"},"nodeClassRef":{"apiVersion":"karpenter.k8s.aws/v1beta1","kind":"EC2NodeClass","name":"default"},"requirements":[{"key":"kubernetes.io/os","operator":"In","values":["linux"]},{"key":"topology.kubernetes.io/zone","operator":"In","values":["cn-northwest-1b","cn-northwest-1c"]},{"key":"karpenter.k8s.aws/instance-family","operator":"In","values":["t4g","c7g","m7g","r7g","m6g","t3a","t3","m5a","m5","m6i"]},{"key":"karpenter.k8s.aws/instance-size","operator":"In","values":["medium","large","xlarge","2xlarge"]},{"key":"karpenter.sh/capacity-type","operator":"In","values":["spot"]},{"key":"karpenter.k8s.aws/instance-gpu-count","operator":"DoesNotExist"},{"key":"karpenter.k8s.aws/instance-accelerator-count","operator":"DoesNotExist"}],"taints":[{"effect":"NoSchedule","key":"autoscaler","value":"Karpenter"}],"weight":20}}}
creationTimestamp: "2024-02-19T06:43:14Z"
generation: 1
labels:
EKS: xxx-dev-13
Terraform: "true"
helm-release-revision/karpenter: "2"
name: spot-default
resourceVersion: "32183"
uid: 48dc4cc1-c181-4f00-85ce-b0606055a338
kind: List
metadata:
resourceVersion: ""
Hope this helps.
By the way, should I open a new issue for the `no on-demand pricing found` error?
Interesting, I see the `nodeClassRef` in the `last-applied-configuration` annotation but not on the `NodePool` itself. Is there anything that could have potentially modified the `NodePool` after it was set up by Terraform? Could you try manually applying the `NodePool` via kubectl and see if the result changes?
@jmdeal Great findings! It turns out that the NodePool YAML was incorrect: the `template` block was placed at the top level instead of under `spec`:
apiVersion: karpenter.sh/v1beta1
kind: NodePool
metadata:
name: on-demand-test
template:
metadata:
labels:
karpenter-node-pool: on-demand-test
spec:
disruption:
consolidateAfter: "10s"
consolidationPolicy: "WhenEmpty"
expireAfter: "24h"
nodeClassRef:
apiVersion: karpenter.k8s.aws/v1beta1
kind: EC2NodeClass
name: default
requirements:
- key: kubernetes.io/os
operator: In
values:
- linux
- key: topology.kubernetes.io/zone
operator: In
values:
- cn-northwest-1a
- key: karpenter.k8s.aws/instance-family
operator: In
values:
- t4g
- c7g
- t3a
- key: karpenter.k8s.aws/instance-size
operator: In
values:
- nano
- micro
- small
- medium
- large
- xlarge
- 2xlarge
- key: karpenter.sh/capacity-type
operator: In
values:
- on-demand
- key: karpenter.k8s.aws/instance-gpu-count
operator: DoesNotExist
- key: karpenter.k8s.aws/instance-accelerator-count
operator: DoesNotExist
taints:
- effect: NoSchedule
key: autoscaler
value: Karpenter
weight: 10
$ k apply -f nodepool.yaml
Error from server (BadRequest): error when creating "nodepool.yaml": NodePool in version "v1beta1" cannot be handled as a NodePool: strict decoding error: unknown field "template"
Correcting the YAML makes the controller start properly:
apiVersion: karpenter.sh/v1beta1
kind: NodePool
metadata:
name: on-demand-test
spec:
template:
metadata:
labels:
karpenter-node-pool: on-demand-test
spec:
nodeClassRef:
apiVersion: karpenter.k8s.aws/v1beta1
kind: EC2NodeClass
name: default
requirements:
- key: kubernetes.io/os
operator: In
values:
- linux
- key: topology.kubernetes.io/zone
operator: In
values:
- cn-northwest-1a
- key: karpenter.k8s.aws/instance-family
operator: In
values:
- t4g
- c7g
- t3a
- key: karpenter.k8s.aws/instance-size
operator: In
values:
- nano
- micro
- small
- medium
- large
- xlarge
- 2xlarge
- key: karpenter.sh/capacity-type
operator: In
values:
- on-demand
- key: karpenter.k8s.aws/instance-gpu-count
operator: DoesNotExist
- key: karpenter.k8s.aws/instance-accelerator-count
operator: DoesNotExist
taints:
- effect: NoSchedule
key: autoscaler
value: Karpenter
disruption:
consolidateAfter: "10s"
consolidationPolicy: "WhenEmpty"
expireAfter: "24h"
weight: 10
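For anyone hitting the same thing with Terraform: the fix on the Terraform side is to nest `template` under `spec`, and to move `disruption` and `weight` up to the NodePool's `spec`. A sketch based on the corrected YAML above (requirements abbreviated; the full list is unchanged from the original resource):

```hcl
resource "kubectl_manifest" "karpenter_on_demand_node_pool" {
  yaml_body = yamlencode({
    apiVersion = "karpenter.sh/v1beta1"
    kind       = "NodePool"
    metadata = {
      name = "on-demand-default"
    }
    # Everything below now nests under `spec`; the original resource put
    # `template` at the top level, so the API server dropped it on apply.
    spec = {
      template = {
        metadata = {
          labels = { "karpenter-node-pool" = "on-demand-default" }
        }
        spec = {
          nodeClassRef = {
            apiVersion = kubectl_manifest.karpenter_node_class.api_version
            kind       = kubectl_manifest.karpenter_node_class.kind
            name       = kubectl_manifest.karpenter_node_class.name
          }
          taints = [{
            key    = "autoscaler"
            value  = "Karpenter"
            effect = "NoSchedule"
          }]
          requirements = [
            { key = "karpenter.sh/capacity-type", operator = "In", values = ["on-demand"] },
            # ...remaining requirements as in the original resource
          ]
        }
      }
      # `disruption` and `weight` belong to the NodePool spec, not to
      # template.spec where the original resource placed them.
      disruption = {
        consolidationPolicy = "WhenEmpty"
        consolidateAfter    = "10s"
        expireAfter         = "24h"
      }
      weight = 10
    }
  })
  depends_on = [helm_release.karpenter, kubectl_manifest.karpenter_node_class]
}
```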
$ k get pods -n karpenter
NAME READY STATUS RESTARTS AGE
karpenter-b5cb5ff9-5fd77 1/1 Running 0 22s
karpenter-b5cb5ff9-bbbfp 1/1 Running 246 (13m ago) 21h
{"level":"DEBUG","time":"2024-02-20T06:03:13.762Z","logger":"controller.pricing","message":"updated spot pricing with instance types and offerings","commit":"17d6c05","instance-type-count":762,"offering-count":829}
{"level":"DEBUG","time":"2024-02-20T06:03:13.793Z","logger":"controller.nodeclass","message":"discovered subnets","commit":"17d6c05","ec2nodeclass":"default","subnets":["subnet-xxx (cn-northwest-1c)","subnet-xxx (cn-northwest-1a)","subnet-xxx (cn-northwest-1b)"]}
{"level":"DEBUG","time":"2024-02-20T06:03:13.809Z","logger":"controller.disruption","message":"waiting on cluster sync","commit":"17d6c05"}
{"level":"DEBUG","time":"2024-02-20T06:03:13.871Z","logger":"controller.nodeclass","message":"discovered security groups","commit":"17d6c05","ec2nodeclass":"default","security-groups":["sg-xxx","sg-xxx","sg-xxx"]}
{"level":"DEBUG","time":"2024-02-20T06:03:13.873Z","logger":"controller.nodeclass","message":"discovered kubernetes version","commit":"17d6c05","ec2nodeclass":"default","version":"1.29"}
{"level":"DEBUG","time":"2024-02-20T06:03:13.928Z","logger":"controller","message":"hydrated launch template cache","commit":"17d6c05","tag-key":"karpenter.k8s.aws/cluster","tag-value":"xxx-dev-13","count":0}
{"level":"ERROR","time":"2024-02-20T06:03:14.002Z","logger":"controller.nodeclass","message":"discovering amis from ssm, getting ssm parameter \"/aws/service/bottlerocket/aws-k8s-1.29-nvidia/arm64/latest/image_id\", ParameterNotFound: ","commit":"17d6c05","ec2nodeclass":"default","query":"/aws/service/bottlerocket/aws-k8s-1.29-nvidia/arm64/latest/image_id"}
{"level":"ERROR","time":"2024-02-20T06:03:14.017Z","logger":"controller.nodeclass","message":"discovering amis from ssm, getting ssm parameter \"/aws/service/bottlerocket/aws-k8s-1.29-nvidia/arm64/latest/image_id\", ParameterNotFound: ","commit":"17d6c05","ec2nodeclass":"default","query":"/aws/service/bottlerocket/aws-k8s-1.29-nvidia/arm64/latest/image_id"}
{"level":"DEBUG","time":"2024-02-20T06:03:14.069Z","logger":"controller.nodeclass","message":"discovered amis","commit":"17d6c05","ec2nodeclass":"default","ids":"ami-0d1bdeec409b7bd69, ami-0fadcb605ee965289, ami-0fadcb605ee965289, ami-0d26a80b71908b194","count":4}
{"level":"ERROR","time":"2024-02-20T06:03:14.122Z","logger":"controller.pricing","message":"no on-demand pricing found","commit":"17d6c05"}
{"level":"ERROR","time":"2024-02-20T06:03:14.558Z","logger":"controller.pricing","message":"no on-demand pricing found","commit":"17d6c05"}
{"level":"ERROR","time":"2024-02-20T06:03:15.068Z","logger":"controller.pricing","message":"no on-demand pricing found","commit":"17d6c05"}
{"level":"DEBUG","time":"2024-02-20T06:03:15.260Z","logger":"controller.disruption","message":"discovered instance types","commit":"17d6c05","count":297}
{"level":"DEBUG","time":"2024-02-20T06:03:15.317Z","logger":"controller.disruption","message":"discovered offerings for instance types","commit":"17d6c05","instance-type-count":297}
{"level":"DEBUG","time":"2024-02-20T06:03:15.318Z","logger":"controller.disruption","message":"discovered zones","commit":"17d6c05","zones":["cn-northwest-1c","cn-northwest-1b","cn-northwest-1a"]}
{"level":"ERROR","time":"2024-02-20T06:03:15.537Z","logger":"controller.pricing","message":"no on-demand pricing found","commit":"17d6c05"}
{"level":"ERROR","time":"2024-02-20T06:03:15.979Z","logger":"controller.pricing","message":"no on-demand pricing found","commit":"17d6c05"}
{"level":"ERROR","time":"2024-02-20T06:03:16.443Z","logger":"controller.pricing","message":"no on-demand pricing found","commit":"17d6c05"}
{"level":"ERROR","time":"2024-02-20T06:03:16.923Z","logger":"controller.pricing","message":"no on-demand pricing found","commit":"17d6c05"}
Thanks to both of you for your help!
Yeah, I'd normally expect the apply of the manifests to fail here if you don't specify the `nodeClassRef`. Seems like if you don't specify a `spec` at all, it won't catch the fact that the `spec` and the `nodeClassRef` are required here.
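That matches what the cluster showed above: the Terraform-applied NodePools were admitted with no `spec` at all. A minimal manifest like the following would presumably be accepted too, with Karpenter only failing at runtime:

```yaml
# Accepted by the API server despite defining no spec (and thus no
# nodeClassRef), since the schema does not mark them as required.
apiVersion: karpenter.sh/v1beta1
kind: NodePool
metadata:
  name: missing-spec-test
```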
Description
Observed Behavior:
Expected Behavior:
Start successfully.
Reproduction Steps (Please include YAML):
Use Helm to deploy Karpenter to a newly created EKS cluster in AWS China (cn-northwest-1). I do this in Terraform:
Versions:
Kubernetes Version (`kubectl version`):