Skip to content

kubectl Debug Cheatsheet


Terminal window
# Check pod status and events
kubectl get pods -n <ns> -o wide
kubectl describe pod <pod> -n <ns>
# Common statuses and what they mean:
# Pending → scheduling issue (resources, node selector, affinity)
# ContainerCreating → stuck mounting a volume/secret/configmap, or network (CNI) setup issue
# CrashLoopBackOff → container starts then crashes (check logs)
# ImagePullBackOff → can't pull image (wrong name, no auth, registry down)
# Init:Error → init container failed
# Init:CrashLoopBackOff → init container keeps crashing
# Check container logs (current)
kubectl logs <pod> -n <ns> -c <container>
# Check container logs (previous crash)
kubectl logs <pod> -n <ns> -c <container> --previous
# Check init container logs
kubectl logs <pod> -n <ns> -c <init-container-name>
# Check events (sorted by time)
kubectl get events -n <ns> --sort-by='.lastTimestamp'
# Check events for a specific pod
kubectl get events -n <ns> --field-selector involvedObject.name=<pod>
Terminal window
# Get the exit code
kubectl describe pod <pod> -n <ns> | grep -A5 "Last State"
# Exit code 0 = container exited normally (check if command is wrong)
# Exit code 1 = application error (check logs)
# Exit code 137 = SIGKILL (128+9) — usually OOMKilled (memory limit too low)
# Exit code 139 = SIGSEGV (128+11) — segfault
# Exit code 143 = SIGTERM (128+15) — container exited after receiving SIGTERM
# Check if OOMKilled
kubectl describe pod <pod> -n <ns> | grep -i oom
kubectl get pod <pod> -n <ns> -o jsonpath='{.status.containerStatuses[0].lastState}'
# Check resource limits vs actual usage
kubectl top pod <pod> -n <ns> --containers
# Live debug with ephemeral container (K8s 1.25+)
kubectl debug <pod> -n <ns> -it --image=busybox --target=<container>
Terminal window
# Check why scheduler can't place the pod
kubectl describe pod <pod> -n <ns> | grep -A10 Events
# Common reasons:
# "Insufficient cpu" → node doesn't have enough CPU
# "Insufficient memory" → node doesn't have enough memory
# "node(s) had taint" → missing toleration
# "node selector" → no nodes match nodeSelector
# "unbound PVC" → PVC can't bind to a PV
# Check node resources
kubectl describe nodes | grep -A5 "Allocated resources"
kubectl top nodes
# Check if node has taints
kubectl describe node <node> | grep Taints
# Check ResourceQuota
kubectl get resourcequota -n <ns>
kubectl describe resourcequota -n <ns>
# Check LimitRange
kubectl get limitrange -n <ns>
kubectl describe limitrange -n <ns>
Terminal window
# Check for finalizers blocking deletion
kubectl get pod <pod> -n <ns> -o jsonpath='{.metadata.finalizers}'
# Check if pod has a long terminationGracePeriodSeconds
kubectl get pod <pod> -n <ns> -o jsonpath='{.spec.terminationGracePeriodSeconds}'
# Force delete (use cautiously)
kubectl delete pod <pod> -n <ns> --grace-period=0 --force
# Check if node is unreachable (pod stuck on a dead node)
kubectl get node <node> -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}'

Terminal window
# List all nodes with status
kubectl get nodes -o wide
# Check node conditions
kubectl describe node <node> | grep -A20 Conditions
# Ready=False → kubelet is down or node is unhealthy
# MemoryPressure=True → node running out of memory
# DiskPressure=True → node running out of disk
# PIDPressure=True → too many processes
# NetworkUnavailable → CNI plugin issue
# Check node resource usage
kubectl top nodes
kubectl describe node <node> | grep -A10 "Allocated resources"
# Check node taints
kubectl get nodes -o json | jq '.items[] | {name: .metadata.name, taints: .spec.taints}'
# Check pods on a specific node
kubectl get pods --all-namespaces --field-selector spec.nodeName=<node> -o wide
# Cordon a node (prevent new scheduling)
kubectl cordon <node>
# Drain a node (evict pods safely)
kubectl drain <node> --ignore-daemonsets --delete-emptydir-data
# Uncordon after maintenance
kubectl uncordon <node>
# Check kubelet logs (SSH to node first)
journalctl -u kubelet --no-pager -n 100   # last 100 lines; use 'journalctl -u kubelet -f' to follow live

Terminal window
# Check service exists and has endpoints
kubectl get svc <service> -n <ns>
kubectl get endpoints <service> -n <ns>
# If endpoints is empty → no pods match the service selector
# Check service selector matches pod labels
kubectl get svc <service> -n <ns> -o jsonpath='{.spec.selector}'
kubectl get pods -n <ns> -l <key>=<value>
# Test DNS resolution from within a pod
kubectl run debug --rm -it --image=busybox --restart=Never -- nslookup <service>.<ns>.svc.cluster.local
kubectl run debug --rm -it --image=busybox --restart=Never -- nslookup <service>
# Test connectivity from within a pod
kubectl run debug --rm -it --image=curlimages/curl --restart=Never -- curl -v http://<service>.<ns>:<port>/health
# Check CoreDNS is running
kubectl get pods -n kube-system -l k8s-app=kube-dns
kubectl logs -n kube-system -l k8s-app=kube-dns
# Test external DNS resolution
kubectl run debug --rm -it --image=busybox --restart=Never -- nslookup google.com
Terminal window
# List network policies in a namespace
kubectl get networkpolicy -n <ns>
# Describe network policy rules
kubectl describe networkpolicy <policy> -n <ns>
# Check if a default-deny policy exists
kubectl get networkpolicy -n <ns> -o yaml | grep -A5 "policyTypes"
# Test connectivity between pods
kubectl exec <pod-a> -n <ns> -- wget -qO- --timeout=5 http://<pod-b-ip>:<port>
kubectl exec <pod-a> -n <ns> -- nc -zv <pod-b-ip> <port>
Terminal window
# Check ingress resources
kubectl get ingress -n <ns>
kubectl describe ingress <ingress> -n <ns>
# Check ingress controller logs
kubectl logs -n ingress-nginx -l app.kubernetes.io/name=ingress-nginx
# Check if external LB was created
kubectl get svc -n ingress-nginx
# EXTERNAL-IP should not be <pending>
# Check Gateway API resources
kubectl get gateway -A
kubectl get httproute -A
kubectl describe httproute <route> -n <ns>
# Check ALB Ingress Controller (EKS)
kubectl logs -n kube-system -l app.kubernetes.io/name=aws-load-balancer-controller
# Check GKE Gateway Controller
kubectl logs -n gke-managed-system -l app=gke-gateway-controller

Terminal window
# Check PVC status
kubectl get pvc -n <ns>
# Pending → StorageClass issue, no available PV, or AZ mismatch
# Bound → healthy
# Check PV
kubectl get pv
kubectl describe pv <pv-name>
# Check StorageClass
kubectl get storageclass
kubectl describe storageclass <sc>
# Check if CSI driver is installed
kubectl get csidrivers
# Debug PVC stuck in Pending
kubectl describe pvc <pvc> -n <ns>
# "waiting for first consumer" → WaitForFirstConsumer binding mode (normal)
# "no persistent volumes available" → need to create PV or check StorageClass
# "exceeded quota" → ResourceQuota limit reached
# Check volume attachments
kubectl get volumeattachments
# Check disk usage inside a pod
kubectl exec <pod> -n <ns> -- df -h
# Force detach a stuck volume (careful!)
kubectl delete volumeattachment <name>

Terminal window
# Check if a user/SA can perform an action
kubectl auth can-i create pods -n <ns> --as=system:serviceaccount:<ns>:<sa>
kubectl auth can-i get secrets -n <ns> --as=user@example.com
kubectl auth can-i '*' '*' --as=system:serviceaccount:kube-system:admin # cluster-admin check
# List all roles and bindings in a namespace
kubectl get roles,rolebindings -n <ns>
kubectl get clusterroles,clusterrolebindings
# Describe a role to see its permissions
kubectl describe role <role> -n <ns>
kubectl describe clusterrole <clusterrole>
# Check who has what access
kubectl get rolebinding -n <ns> -o json | jq '.items[] | {name: .metadata.name, subjects: .subjects, role: .roleRef.name}'
# Check service account exists and has token
kubectl get sa <sa> -n <ns>
kubectl get secrets -n <ns> | grep <sa>
# Debug IRSA (EKS) — check SA annotation
kubectl get sa <sa> -n <ns> -o jsonpath='{.metadata.annotations.eks\.amazonaws\.com/role-arn}'
# Debug Workload Identity (GKE) — check SA annotation
kubectl get sa <sa> -n <ns> -o jsonpath='{.metadata.annotations.iam\.gke\.io/gcp-service-account}'
# Check API server logs for RBAC denials (note: audit logs are a separately configured feature)
kubectl logs -n kube-system -l component=kube-apiserver | grep "Forbidden"

Terminal window
# Check deployment status
kubectl get deploy -n <ns>
kubectl describe deploy <deploy> -n <ns>
# Check rollout status
kubectl rollout status deploy/<deploy> -n <ns>
# View rollout history
kubectl rollout history deploy/<deploy> -n <ns>
# Rollback to previous version
kubectl rollout undo deploy/<deploy> -n <ns>
# Rollback to specific revision
kubectl rollout undo deploy/<deploy> -n <ns> --to-revision=3
# Check ReplicaSets (shows old and new)
kubectl get rs -n <ns> -l app=<app>
# Watch rolling update progress
kubectl get pods -n <ns> -l app=<app> -w
# Check HPA status
kubectl get hpa -n <ns>
kubectl describe hpa <hpa> -n <ns>
# "unable to get metrics" → metrics-server or Prometheus adapter issue
# Check PDB (Pod Disruption Budget)
kubectl get pdb -n <ns>
kubectl describe pdb <pdb> -n <ns>

Terminal window
# All events in a namespace (sorted by time)
kubectl get events -n <ns> --sort-by='.lastTimestamp'
# Warning events only
kubectl get events -n <ns> --field-selector type=Warning
# Cluster-wide events
kubectl get events --all-namespaces --sort-by='.lastTimestamp' | tail -50
# Check component status (deprecated but useful)
kubectl get componentstatuses
# Check API server health
kubectl get --raw='/healthz'
kubectl get --raw='/readyz'
# Check etcd health (if accessible)
kubectl get --raw='/healthz/etcd'
# Check cluster resource usage summary
kubectl top nodes
kubectl top pods --all-namespaces --sort-by=memory | head -20
kubectl top pods --all-namespaces --sort-by=cpu | head -20

kubectl debug flowchart — diagnosis tree for Pending, CrashLoopBackOff, ImagePullBackOff, not ready, and not reachable pods