Open Fluder-Paradyne opened 3 months ago
That's strange. Could you please check the failed pod and share the output of kubectl get pod $name -o json
and kubectl logs -p $name
Also, please make sure your node has enough resources.
I just did it again.
Get pod output
kubectl get pod $name -o json
{
"apiVersion": "v1",
"items": [
{
"apiVersion": "v1",
"kind": "Pod",
"metadata": {
"creationTimestamp": "2024-06-11T03:58:41Z",
"generateName": "emqx-core-75ff7c75f9-",
"labels": {
"apps.emqx.io/db-role": "core",
"apps.emqx.io/instance": "emqx",
"apps.emqx.io/managed-by": "emqx-operator",
"apps.emqx.io/pod-template-hash": "75ff7c75f9",
"apps.kubernetes.io/pod-index": "0",
"controller-revision-hash": "emqx-core-75ff7c75f9-df4bcf689",
"statefulset.kubernetes.io/pod-name": "emqx-core-75ff7c75f9-0"
},
"name": "emqx-core-75ff7c75f9-0",
"namespace": "emqx",
"ownerReferences": [
{
"apiVersion": "apps/v1",
"blockOwnerDeletion": true,
"controller": true,
"kind": "StatefulSet",
"name": "emqx-core-75ff7c75f9",
"uid": "19197aa0-ba43-4a3b-ab12-8c203107ff1f"
}
],
"resourceVersion": "32867086",
"uid": "63b85614-e6f1-4b46-8006-44bf1cf0dcb3"
},
"spec": {
"containers": [
{
"env": [
{
"name": "EMQX_DASHBOARD__LISTENERS__HTTP__BIND",
"value": "18083"
},
{
"name": "POD_NAME",
"valueFrom": {
"fieldRef": {
"apiVersion": "v1",
"fieldPath": "metadata.name"
}
}
},
{
"name": "EMQX_CLUSTER__DISCOVERY_STRATEGY",
"value": "dns"
},
{
"name": "EMQX_CLUSTER__DNS__RECORD_TYPE",
"value": "srv"
},
{
"name": "EMQX_CLUSTER__DNS__NAME",
"value": "emqx-headless.emqx.svc.cluster.local"
},
{
"name": "EMQX_HOST",
"value": "$(POD_NAME).$(EMQX_CLUSTER__DNS__NAME)"
},
{
"name": "EMQX_NODE__DATA_DIR",
"value": "data"
},
{
"name": "EMQX_NODE__ROLE",
"value": "core"
},
{
"name": "EMQX_NODE__COOKIE",
"valueFrom": {
"secretKeyRef": {
"key": "node_cookie",
"name": "emqx-node-cookie"
}
}
},
{
"name": "EMQX_API_KEY__BOOTSTRAP_FILE",
"value": "\"/opt/emqx/data/bootstrap_api_key\""
}
],
"image": "emqx:5",
"imagePullPolicy": "IfNotPresent",
"livenessProbe": {
"failureThreshold": 3,
"httpGet": {
"path": "/status",
"port": "dashboard",
"scheme": "HTTP"
},
"initialDelaySeconds": 60,
"periodSeconds": 30,
"successThreshold": 1,
"timeoutSeconds": 1
},
"name": "emqx",
"ports": [
{
"containerPort": 18083,
"name": "dashboard",
"protocol": "TCP"
}
],
"readinessProbe": {
"failureThreshold": 12,
"httpGet": {
"path": "/status",
"port": "dashboard",
"scheme": "HTTP"
},
"initialDelaySeconds": 10,
"periodSeconds": 5,
"successThreshold": 1,
"timeoutSeconds": 1
},
"resources": {
"limits": {
"cpu": "1",
"memory": "1Gi"
},
"requests": {
"cpu": "1",
"memory": "512Mi"
}
},
"securityContext": {
"runAsGroup": 1000,
"runAsNonRoot": true,
"runAsUser": 1000
},
"terminationMessagePath": "/dev/termination-log",
"terminationMessagePolicy": "File",
"volumeMounts": [
{
"mountPath": "/opt/emqx/data/bootstrap_api_key",
"name": "bootstrap-api-key",
"readOnly": true,
"subPath": "bootstrap_api_key"
},
{
"mountPath": "/opt/emqx/etc/emqx.conf",
"name": "bootstrap-config",
"readOnly": true,
"subPath": "emqx.conf"
},
{
"mountPath": "/opt/emqx/log",
"name": "emqx-core-log"
},
{
"mountPath": "/opt/emqx/data",
"name": "emqx-core-data"
},
{
"mountPath": "/var/run/secrets/kubernetes.io/serviceaccount",
"name": "kube-api-access-5lp9j",
"readOnly": true
}
]
}
],
"dnsPolicy": "ClusterFirst",
"enableServiceLinks": true,
"hostname": "emqx-core-75ff7c75f9-0",
"nodeName": "ip-10-16-253-98.ec2.internal",
"preemptionPolicy": "PreemptLowerPriority",
"priority": 0,
"readinessGates": [
{
"conditionType": "apps.emqx.io/on-serving"
}
],
"restartPolicy": "Always",
"schedulerName": "default-scheduler",
"securityContext": {
"fsGroup": 1000,
"fsGroupChangePolicy": "Always",
"runAsGroup": 1000,
"runAsUser": 1000,
"supplementalGroups": [
1000
]
},
"serviceAccount": "default",
"serviceAccountName": "default",
"subdomain": "emqx-headless",
"terminationGracePeriodSeconds": 30,
"tolerations": [
{
"effect": "NoExecute",
"key": "node.kubernetes.io/not-ready",
"operator": "Exists",
"tolerationSeconds": 300
},
{
"effect": "NoExecute",
"key": "node.kubernetes.io/unreachable",
"operator": "Exists",
"tolerationSeconds": 300
}
],
"volumes": [
{
"name": "emqx-core-data",
"persistentVolumeClaim": {
"claimName": "emqx-core-data-emqx-core-75ff7c75f9-0"
}
},
{
"name": "bootstrap-api-key",
"secret": {
"defaultMode": 420,
"secretName": "emqx-bootstrap-api-key"
}
},
{
"configMap": {
"defaultMode": 420,
"name": "emqx-configs"
},
"name": "bootstrap-config"
},
{
"emptyDir": {},
"name": "emqx-core-log"
},
{
"name": "kube-api-access-5lp9j",
"projected": {
"defaultMode": 420,
"sources": [
{
"serviceAccountToken": {
"expirationSeconds": 3607,
"path": "token"
}
},
{
"configMap": {
"items": [
{
"key": "ca.crt",
"path": "ca.crt"
}
],
"name": "kube-root-ca.crt"
}
},
{
"downwardAPI": {
"items": [
{
"fieldRef": {
"apiVersion": "v1",
"fieldPath": "metadata.namespace"
},
"path": "namespace"
}
]
}
}
]
}
}
]
},
"status": {
"conditions": [
{
"lastProbeTime": null,
"lastTransitionTime": null,
"status": "",
"type": "apps.emqx.io/on-serving"
},
{
"lastProbeTime": null,
"lastTransitionTime": "2024-06-11T03:58:45Z",
"status": "True",
"type": "PodReadyToStartContainers"
},
{
"lastProbeTime": null,
"lastTransitionTime": "2024-06-11T03:58:41Z",
"status": "True",
"type": "Initialized"
},
{
"lastProbeTime": null,
"lastTransitionTime": "2024-06-11T03:58:41Z",
"message": "containers with unready status: [emqx]",
"reason": "ContainersNotReady",
"status": "False",
"type": "Ready"
},
{
"lastProbeTime": null,
"lastTransitionTime": "2024-06-11T03:58:41Z",
"message": "containers with unready status: [emqx]",
"reason": "ContainersNotReady",
"status": "False",
"type": "ContainersReady"
},
{
"lastProbeTime": null,
"lastTransitionTime": "2024-06-11T03:58:41Z",
"status": "True",
"type": "PodScheduled"
}
],
"containerStatuses": [
{
"containerID": "containerd://48de18c9bb87de65ae106acc273547ae180526e876f3fef214a8b04e6ed219b3",
"image": "docker.io/library/emqx:5",
"imageID": "docker.io/library/emqx@sha256:4750557065e94e53e0ec96f9a804c1bb4bc3c6001a5ef517f9b873eea886e0ae",
"lastState": {
"terminated": {
"containerID": "containerd://48de18c9bb87de65ae106acc273547ae180526e876f3fef214a8b04e6ed219b3",
"exitCode": 137,
"finishedAt": "2024-06-11T03:59:02Z",
"reason": "OOMKilled",
"startedAt": "2024-06-11T03:59:01Z"
}
},
"name": "emqx",
"ready": false,
"restartCount": 2,
"started": false,
"state": {
"waiting": {
"message": "back-off 20s restarting failed container=emqx pod=emqx-core-75ff7c75f9-0_emqx(63b85614-e6f1-4b46-8006-44bf1cf0dcb3)",
"reason": "CrashLoopBackOff"
}
}
}
],
"hostIP": "10.16.253.98",
"hostIPs": [
{
"ip": "10.16.253.98"
}
],
"phase": "Running",
"podIP": "10.16.253.84",
"podIPs": [
{
"ip": "10.16.253.84"
}
],
"qosClass": "Burstable",
"startTime": "2024-06-11T03:58:41Z"
}
},
{
"apiVersion": "v1",
"kind": "Pod",
"metadata": {
"creationTimestamp": "2024-06-11T03:58:41Z",
"generateName": "emqx-core-75ff7c75f9-",
"labels": {
"apps.emqx.io/db-role": "core",
"apps.emqx.io/instance": "emqx",
"apps.emqx.io/managed-by": "emqx-operator",
"apps.emqx.io/pod-template-hash": "75ff7c75f9",
"apps.kubernetes.io/pod-index": "1",
"controller-revision-hash": "emqx-core-75ff7c75f9-df4bcf689",
"statefulset.kubernetes.io/pod-name": "emqx-core-75ff7c75f9-1"
},
"name": "emqx-core-75ff7c75f9-1",
"namespace": "emqx",
"ownerReferences": [
{
"apiVersion": "apps/v1",
"blockOwnerDeletion": true,
"controller": true,
"kind": "StatefulSet",
"name": "emqx-core-75ff7c75f9",
"uid": "19197aa0-ba43-4a3b-ab12-8c203107ff1f"
}
],
"resourceVersion": "32867077",
"uid": "7d5d46db-4937-49b8-87b7-0130fe4f614b"
},
"spec": {
"containers": [
{
"env": [
{
"name": "EMQX_DASHBOARD__LISTENERS__HTTP__BIND",
"value": "18083"
},
{
"name": "POD_NAME",
"valueFrom": {
"fieldRef": {
"apiVersion": "v1",
"fieldPath": "metadata.name"
}
}
},
{
"name": "EMQX_CLUSTER__DISCOVERY_STRATEGY",
"value": "dns"
},
{
"name": "EMQX_CLUSTER__DNS__RECORD_TYPE",
"value": "srv"
},
{
"name": "EMQX_CLUSTER__DNS__NAME",
"value": "emqx-headless.emqx.svc.cluster.local"
},
{
"name": "EMQX_HOST",
"value": "$(POD_NAME).$(EMQX_CLUSTER__DNS__NAME)"
},
{
"name": "EMQX_NODE__DATA_DIR",
"value": "data"
},
{
"name": "EMQX_NODE__ROLE",
"value": "core"
},
{
"name": "EMQX_NODE__COOKIE",
"valueFrom": {
"secretKeyRef": {
"key": "node_cookie",
"name": "emqx-node-cookie"
}
}
},
{
"name": "EMQX_API_KEY__BOOTSTRAP_FILE",
"value": "\"/opt/emqx/data/bootstrap_api_key\""
}
],
"image": "emqx:5",
"imagePullPolicy": "IfNotPresent",
"livenessProbe": {
"failureThreshold": 3,
"httpGet": {
"path": "/status",
"port": "dashboard",
"scheme": "HTTP"
},
"initialDelaySeconds": 60,
"periodSeconds": 30,
"successThreshold": 1,
"timeoutSeconds": 1
},
"name": "emqx",
"ports": [
{
"containerPort": 18083,
"name": "dashboard",
"protocol": "TCP"
}
],
"readinessProbe": {
"failureThreshold": 12,
"httpGet": {
"path": "/status",
"port": "dashboard",
"scheme": "HTTP"
},
"initialDelaySeconds": 10,
"periodSeconds": 5,
"successThreshold": 1,
"timeoutSeconds": 1
},
"resources": {
"limits": {
"cpu": "1",
"memory": "1Gi"
},
"requests": {
"cpu": "1",
"memory": "512Mi"
}
},
"securityContext": {
"runAsGroup": 1000,
"runAsNonRoot": true,
"runAsUser": 1000
},
"terminationMessagePath": "/dev/termination-log",
"terminationMessagePolicy": "File",
"volumeMounts": [
{
"mountPath": "/opt/emqx/data/bootstrap_api_key",
"name": "bootstrap-api-key",
"readOnly": true,
"subPath": "bootstrap_api_key"
},
{
"mountPath": "/opt/emqx/etc/emqx.conf",
"name": "bootstrap-config",
"readOnly": true,
"subPath": "emqx.conf"
},
{
"mountPath": "/opt/emqx/log",
"name": "emqx-core-log"
},
{
"mountPath": "/opt/emqx/data",
"name": "emqx-core-data"
},
{
"mountPath": "/var/run/secrets/kubernetes.io/serviceaccount",
"name": "kube-api-access-j6xpw",
"readOnly": true
}
]
}
],
"dnsPolicy": "ClusterFirst",
"enableServiceLinks": true,
"hostname": "emqx-core-75ff7c75f9-1",
"nodeName": "ip-10-16-253-98.ec2.internal",
"preemptionPolicy": "PreemptLowerPriority",
"priority": 0,
"readinessGates": [
{
"conditionType": "apps.emqx.io/on-serving"
}
],
"restartPolicy": "Always",
"schedulerName": "default-scheduler",
"securityContext": {
"fsGroup": 1000,
"fsGroupChangePolicy": "Always",
"runAsGroup": 1000,
"runAsUser": 1000,
"supplementalGroups": [
1000
]
},
"serviceAccount": "default",
"serviceAccountName": "default",
"subdomain": "emqx-headless",
"terminationGracePeriodSeconds": 30,
"tolerations": [
{
"effect": "NoExecute",
"key": "node.kubernetes.io/not-ready",
"operator": "Exists",
"tolerationSeconds": 300
},
{
"effect": "NoExecute",
"key": "node.kubernetes.io/unreachable",
"operator": "Exists",
"tolerationSeconds": 300
}
],
"volumes": [
{
"name": "emqx-core-data",
"persistentVolumeClaim": {
"claimName": "emqx-core-data-emqx-core-75ff7c75f9-1"
}
},
{
"name": "bootstrap-api-key",
"secret": {
"defaultMode": 420,
"secretName": "emqx-bootstrap-api-key"
}
},
{
"configMap": {
"defaultMode": 420,
"name": "emqx-configs"
},
"name": "bootstrap-config"
},
{
"emptyDir": {},
"name": "emqx-core-log"
},
{
"name": "kube-api-access-j6xpw",
"projected": {
"defaultMode": 420,
"sources": [
{
"serviceAccountToken": {
"expirationSeconds": 3607,
"path": "token"
}
},
{
"configMap": {
"items": [
{
"key": "ca.crt",
"path": "ca.crt"
}
],
"name": "kube-root-ca.crt"
}
},
{
"downwardAPI": {
"items": [
{
"fieldRef": {
"apiVersion": "v1",
"fieldPath": "metadata.namespace"
},
"path": "namespace"
}
]
}
}
]
}
}
]
},
"status": {
"conditions": [
{
"lastProbeTime": null,
"lastTransitionTime": null,
"status": "",
"type": "apps.emqx.io/on-serving"
},
{
"lastProbeTime": null,
"lastTransitionTime": "2024-06-11T03:58:45Z",
"status": "True",
"type": "PodReadyToStartContainers"
},
{
"lastProbeTime": null,
"lastTransitionTime": "2024-06-11T03:58:41Z",
"status": "True",
"type": "Initialized"
},
{
"lastProbeTime": null,
"lastTransitionTime": "2024-06-11T03:58:41Z",
"message": "containers with unready status: [emqx]",
"reason": "ContainersNotReady",
"status": "False",
"type": "Ready"
},
{
"lastProbeTime": null,
"lastTransitionTime": "2024-06-11T03:58:41Z",
"message": "containers with unready status: [emqx]",
"reason": "ContainersNotReady",
"status": "False",
"type": "ContainersReady"
},
{
"lastProbeTime": null,
"lastTransitionTime": "2024-06-11T03:58:41Z",
"status": "True",
"type": "PodScheduled"
}
],
"containerStatuses": [
{
"containerID": "containerd://082125f05b7dca038e6a1910f72366e5852609065f415a82861ee38fad656dea",
"image": "docker.io/library/emqx:5",
"imageID": "docker.io/library/emqx@sha256:4750557065e94e53e0ec96f9a804c1bb4bc3c6001a5ef517f9b873eea886e0ae",
"lastState": {
"terminated": {
"containerID": "containerd://082125f05b7dca038e6a1910f72366e5852609065f415a82861ee38fad656dea",
"exitCode": 137,
"finishedAt": "2024-06-11T03:59:00Z",
"reason": "OOMKilled",
"startedAt": "2024-06-11T03:59:00Z"
}
},
"name": "emqx",
"ready": false,
"restartCount": 2,
"started": false,
"state": {
"waiting": {
"message": "back-off 20s restarting failed container=emqx pod=emqx-core-75ff7c75f9-1_emqx(7d5d46db-4937-49b8-87b7-0130fe4f614b)",
"reason": "CrashLoopBackOff"
}
}
}
],
"hostIP": "10.16.253.98",
"hostIPs": [
{
"ip": "10.16.253.98"
}
],
"phase": "Running",
"podIP": "10.16.255.30",
"podIPs": [
{
"ip": "10.16.255.30"
}
],
"qosClass": "Burstable",
"startTime": "2024-06-11T03:58:41Z"
}
}
],
"kind": "List",
"metadata": {
"resourceVersion": ""
}
}
Yeah, it would have been great to have some logs; that's why it is very strange. I tried changing the log level to debug, but got no results.
I also increased the limit to 1024Mi.
I am able to run the docker image on my laptop, and when I monitored its resource consumption it didn't exceed 250 MB.
stats of the node where the pod is scheduled
I'm sorry, I have no idea. Could you please remove the resource limit of emqx and retry? Let's watch how much memory it uses, and maybe we can get some emqx logs. I think that would be helpful for us.
Hi, it worked when I removed the limit, but the memory consumption is low.
To test this theory, I put the limit back and it started crashing again.
This is the file I applied; hopefully it helps you figure this out.
Normally, when the limit is lower than the request, it won't get applied; maybe the default is too low.
apiVersion: apps.emqx.io/v2beta1
kind: EMQX
metadata:
name: emqx
spec:
image: emqx:5
config:
data: |
log.console.level = debug
coreTemplate:
spec:
resources:
requests:
cpu: 2
memory: 1024Mi
limits:
cpu: 2
memory: 1024Mi
## EMQX custom resources do not support updating this field at runtime
volumeClaimTemplates:
## More content: https://docs.aws.amazon.com/eks/latest/userguide/storage-classes.html
## Please manage the Amazon EBS CSI driver as an Amazon EKS add-on.
## For more documentation please refer to: https://docs.aws.amazon.com/zh_cn/eks/latest/userguide/managing-ebs-csi.html
storageClassName: gp2
resources:
requests:
storage: 10Gi
accessModes:
- ReadWriteOnce
dashboardServiceTemplate:
metadata:
## More content: https://kubernetes-sigs.github.io/aws-load-balancer-controller/v2.4/guide/service/annotations/
annotations:
## Specifies whether the NLB is Internet-facing or internal. If not specified, defaults to internal.
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
## Specify the availability zone to which the NLB will route traffic. Specify at least one subnet, either subnetID or subnetName (subnet name label) can be used.
service.beta.kubernetes.io/aws-load-balancer-subnets: subnet-xxx1,subnet-xxx2
spec:
type: LoadBalancer
## More content: https://kubernetes-sigs.github.io/aws-load-balancer-controller/v2.4/guide/service/nlb/
loadBalancerClass: service.k8s.aws/nlb
listenersServiceTemplate:
metadata:
## More content: https://kubernetes-sigs.github.io/aws-load-balancer-controller/v2.4/guide/service/annotations/
annotations:
## Specifies whether the NLB is Internet-facing or internal. If not specified, defaults to internal.
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
## Specify the availability zone to which the NLB will route traffic. Specify at least one subnet, either subnetID or subnetName (subnet name label) can be used.
service.beta.kubernetes.io/aws-load-balancer-subnets: subnet-xxx1,subnet-xxx2
spec:
type: LoadBalancer
## More content: https://kubernetes-sigs.github.io/aws-load-balancer-controller/v2.4/guide/service/nlb/
loadBalancerClass: service.k8s.aws/nlb
@zmstone @id Do we have any suggestions regarding emqx memory? Could you please take a look?
Regular docker image starts just fine with these limits
docker run -d --name emqx --memory=512m --kernel-memory=1024m --cpus=1 emqx:5
Perhaps there is a memory spike when operator applies config via API? Can we get core dump to see what exactly caused it?
I am not sure how to take a core dump, if there are some commands that I can run, will replicate and send it.
Can we get core dump to see what exactly caused it?
It looks like emqx didn't print any logs to the console. Is there any way to print the core dump log to the console? @id
Also, it looks like emqx didn't print any logs at all, which is very strange.
Describe the bug A clear and concise description of what the bug is. I installed cert-manager, then the operator, then applied the getting-started YAML from the docs. The pods immediately fail with OOMKilled, and there are no logs.
I have tried other example deployment files, same issue with this file
Am I making some mistake ?
I have tried giving it more resources like this, but same issue
To Reproduce Steps to reproduce the behavior: Install the operator using helm (version: 2.2.23), then wait for the operator's conditions to be met. Then apply this
Expected behavior The pods should run and cluster start
Anything else we need to know?: emqx operator logs
Environment details: