# Issue closed by AmitKumarDas 2 years ago.
# https://sysdig.com/blog/debug-kubernetes-crashloopbackoff/
#
# Kubernetes events
# Kubernetes event reasons, grouped by the kind of object they are listed under.
# Each trailing comment gives a short description and the event severity.
# NOTE(review): the trailing `*` presumably marks events that are collected by
# default - confirm against the linked Sysdig post.
kubernetes:
  node:
    - TerminatedAllPods  # Terminated All Pods (information)
    - RegisteredNode  # Node Registered (information)*
    - RemovingNode  # Removing Node (information)*
    - DeletingNode  # Deleting Node (information)*
    - DeletingAllPods  # Deleting All Pods (information)
    - TerminatingEvictedPod  # Terminating Evicted Pod (information)*
    - NodeReady  # Node Ready (information)*
    - NodeNotReady  # Node not Ready (information)*
    - NodeSchedulable  # Node is Schedulable (information)*
    - NodeNotSchedulable  # Node is not Schedulable (information)*
    - CIDRNotAvailable  # CIDR not Available (information)*
    - CIDRAssignmentFailed  # CIDR Assignment Failed (information)*
    - Starting  # Starting Kubelet (information)*
    - KubeletSetupFailed  # Kubelet Setup Failed (warning)*
    - FailedMount  # Volume Mount Failed (warning)*
    - NodeSelectorMismatching  # Node Selector Mismatch (warning)*
    - InsufficientFreeCPU  # Insufficient Free CPU (warning)*
    - InsufficientFreeMemory  # Insufficient Free Mem (warning)*
    - OutOfDisk  # Out of Disk (information)*
    - HostNetworkNotSupported  # Host Ntw not Supported (warning)*
    - NilShaper  # Undefined Shaper (warning)*
    - Rebooted  # Node Rebooted (warning)*
    - NodeHasSufficientDisk  # Node Has Sufficient Disk (information)*
    - NodeOutOfDisk  # Node Out of Disk Space (information)*
    - InvalidDiskCapacity  # Invalid Disk Capacity (warning)*
    - FreeDiskSpaceFailed  # Free Disk Space Failed (warning)*
  pod:
    - Pulling  # Pulling Container Image (information)
    - Pulled  # Ctr Img Pulled (information)
    - Failed  # Ctr Img Pull/Create/Start Fail (warning)*
    - InspectFailed  # Ctr Img Inspect Failed (warning)*
    - ErrImageNeverPull  # Ctr Img NeverPull Policy Violate (warning)*
    - BackOff  # Back Off Ctr Start, Image Pull (warning)
    - Created  # Container Created (information)
    - Started  # Container Started (information)
    - Killing  # Killing Container (information)*
    - Unhealthy  # Container Unhealthy (warning)
    - FailedSync  # Pod Sync Failed (warning)
    - FailedValidation  # Failed Pod Config Validation (warning)
    - OutOfDisk  # Out of Disk (information)*
    - HostPortConflict  # Host/Port Conflict (warning)*
  replicationController:
    - SuccessfulCreate  # Pod Created (information)*
    - FailedCreate  # Pod Create Failed (warning)*
    - SuccessfulDelete  # Pod Deleted (information)*
    - FailedDelete  # Pod Delete Failed (warning)*
# https://containersolutions.github.io/runbooks/posts/kubernetes/crashloopbackoff/
# https://docs.microsoft.com/en-us/answers/questions/328469/understanding-aks-crashloopbackoff.html
#
# Use `kubectl describe pod <pod-name>` to get more data on the pod failure.
# A pod can also be in CrashLoopBackOff if its container has completed and it is configured to keep
# restarting (even with exit code 0). A good example is deploying a busybox image without any
# arguments: it starts, executes, and finishes, and then it is restarted again and again.
# Status of a container that exits cleanly (exit code 0) yet keeps being
# restarted. Each `Reason` sits under the state it explains: the container is
# currently waiting in CrashLoopBackOff, and its previous run terminated as
# Completed.
Containers:
  busybox:
    State:
      Waiting:
        Reason: CrashLoopBackOff
    LastState:
      Terminated:
        Reason: Completed
        ExitCode: 0
    Ready: false
    Restart Count: 2
# Check `lastState.terminated.message` in the container status.
# https://kubernetes.io/docs/tasks/debug-application-cluster/determine-reason-pod-failure/
#
# Customizing the termination message
#
# Kubernetes retrieves termination messages from the message file specified in the
# terminationMessagePath field of a Container, which has a default value of /dev/termination-log.
# By customizing this field, you can tell Kubernetes to use a different file. Kubernetes uses the
# contents of the specified file to populate the Container's status message on both success and failure.
#
# The termination message is intended to be a brief final status, such as an assertion failure message.
# The kubelet truncates messages that are longer than 4096 bytes. The total message length across
# all containers is limited to 12KiB. The default termination message path is /dev/termination-log.
# You cannot set the termination message path after a Pod is launched.
#
# See also: https://kubernetes.io/docs/concepts/cluster-administration/logging/
# https://zerokspot.com/weblog/2019/11/25/testing-prometheus-alerts/
#
# E.g. if some app has been unreachable for more than 5 minutes,
# I want to get notified. To achieve that, I have added the following rule:
# Rule group with a single alert: it fires once the `up` series of the
# zerokspot-sys job has been 0 for 5 minutes.
groups:
  - name: zerokspot
    rules:
      - alert: zerokspot--sys--down
        expr: up{job="zerokspot-sys"} == 0
        for: 5m
---
# https://www.robustperception.io/unit-testing-alerts-with-prometheus
#
# Separate YAML document, so this `groups` key does not collide with any other
# rule file kept in the same notes.
# MyAlert fires with severity `page` when the average of `up`, aggregated
# without the `instance` label, stays below 0.75 for 2 minutes.
groups:
  - name: example
    rules:
      - alert: MyAlert
        expr: avg without(instance)(up) < 0.75
        for: 2m
        labels:
          severity: page
        annotations:
          description: 'Only {{$value}} of {{$labels.job}} job is up'
# Prometheus Blackbox Exporter