Skip to content

kubectl Debug Cheatsheet


Terminal window
# Check pod status and events
kubectl get pods -n <ns> -o wide
kubectl describe pod <pod> -n <ns>
# Common statuses and what they mean:
# Pending → scheduling issue (resources, node selector, affinity)
# ContainerCreating → stuck mounting a volume/secret/configmap, or network (CNI) setup issue
# CrashLoopBackOff → container starts then crashes (check logs)
# ImagePullBackOff → can't pull image (wrong name, no auth, registry down)
# Init:Error → init container failed
# Init:CrashLoopBackOff → init container keeps crashing
# Check container logs (current)
kubectl logs <pod> -n <ns> -c <container>
# Check container logs (previous crash)
kubectl logs <pod> -n <ns> -c <container> --previous
# Check init container logs
kubectl logs <pod> -n <ns> -c <init-container-name>
# Check events (sorted by time)
kubectl get events -n <ns> --sort-by='.lastTimestamp'
# Check events for a specific pod
kubectl get events -n <ns> --field-selector involvedObject.name=<pod>
Terminal window
# Get the exit code
kubectl describe pod <pod> -n <ns> | grep -A5 "Last State"
# Exit code 0 = container exited normally (check if command is wrong)
# Exit code 1 = application error (check logs)
# Exit code 137 = SIGKILL (128+9) — usually OOMKilled (memory limit too low)
# Exit code 139 = SIGSEGV (128+11) — segfault
# Exit code 143 = SIGTERM (128+15) — container exited after receiving SIGTERM
# Check if OOMKilled
kubectl describe pod <pod> -n <ns> | grep -i oom
kubectl get pod <pod> -n <ns> -o jsonpath='{.status.containerStatuses[0].lastState}'
# Check resource limits vs actual usage
kubectl top pod <pod> -n <ns> --containers
# Live debug with ephemeral container (K8s 1.25+)
kubectl debug <pod> -n <ns> -it --image=busybox --target=<container>
Terminal window
# Check why scheduler can't place the pod
kubectl describe pod <pod> -n <ns> | grep -A10 Events
# Common reasons:
# "Insufficient cpu" → node doesn't have enough CPU
# "Insufficient memory" → node doesn't have enough memory
# "node(s) had taint" → missing toleration
# "node selector" → no nodes match nodeSelector
# "unbound PVC" → PVC can't bind to a PV
# Check node resources
kubectl describe nodes | grep -A5 "Allocated resources"
kubectl top nodes
# Check if node has taints
kubectl describe node <node> | grep Taints
# Check ResourceQuota
kubectl get resourcequota -n <ns>
kubectl describe resourcequota -n <ns>
# Check LimitRange
kubectl get limitrange -n <ns>
kubectl describe limitrange -n <ns>
Terminal window
# Check for finalizers blocking deletion
kubectl get pod <pod> -n <ns> -o jsonpath='{.metadata.finalizers}'
# Check if pod has a long terminationGracePeriodSeconds
kubectl get pod <pod> -n <ns> -o jsonpath='{.spec.terminationGracePeriodSeconds}'
# Force delete (use cautiously)
kubectl delete pod <pod> -n <ns> --grace-period=0 --force
# Check if node is unreachable (pod stuck on a dead node)
kubectl get node <node> -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}'

Terminal window
# List all nodes with status
kubectl get nodes -o wide
# Check node conditions
kubectl describe node <node> | grep -A20 Conditions
# Ready=False → kubelet is down or node is unhealthy
# MemoryPressure=True → node running out of memory
# DiskPressure=True → node running out of disk
# PIDPressure=True → too many processes
# NetworkUnavailable → CNI plugin issue
# Check node resource usage
kubectl top nodes
kubectl describe node <node> | grep -A10 "Allocated resources"
# Check node taints
kubectl get nodes -o json | jq '.items[] | {name: .metadata.name, taints: .spec.taints}'
# Check pods on a specific node
kubectl get pods --all-namespaces --field-selector spec.nodeName=<node> -o wide
# Cordon a node (prevent new scheduling)
kubectl cordon <node>
# Drain a node (evict pods safely)
kubectl drain <node> --ignore-daemonsets --delete-emptydir-data
# Uncordon after maintenance
kubectl uncordon <node>
# Check kubelet logs (SSH to node first)
journalctl -u kubelet --no-pager -n 100   # last 100 lines; use 'journalctl -u kubelet -f' to follow live

Terminal window
# Check service exists and has endpoints
kubectl get svc <service> -n <ns>
kubectl get endpoints <service> -n <ns>
# If endpoints is empty → no pods match the service selector
# Check service selector matches pod labels
kubectl get svc <service> -n <ns> -o jsonpath='{.spec.selector}'
kubectl get pods -n <ns> -l <key>=<value>
# Test DNS resolution from within a pod
kubectl run debug --rm -it --image=busybox --restart=Never -- nslookup <service>.<ns>.svc.cluster.local
kubectl run debug --rm -it --image=busybox --restart=Never -- nslookup <service>
# Test connectivity from within a pod
kubectl run debug --rm -it --image=curlimages/curl --restart=Never -- curl -v http://<service>.<ns>:<port>/health
# Check CoreDNS is running
kubectl get pods -n kube-system -l k8s-app=kube-dns
kubectl logs -n kube-system -l k8s-app=kube-dns
# Test external DNS resolution
kubectl run debug --rm -it --image=busybox --restart=Never -- nslookup google.com
Terminal window
# List network policies in a namespace
kubectl get networkpolicy -n <ns>
# Describe network policy rules
kubectl describe networkpolicy <policy> -n <ns>
# Check if a default-deny policy exists
kubectl get networkpolicy -n <ns> -o yaml | grep -A5 "policyTypes"
# Test connectivity between pods
kubectl exec <pod-a> -n <ns> -- wget -qO- --timeout=5 http://<pod-b-ip>:<port>
kubectl exec <pod-a> -n <ns> -- nc -zv <pod-b-ip> <port>
Terminal window
# Check ingress resources
kubectl get ingress -n <ns>
kubectl describe ingress <ingress> -n <ns>
# Check ingress controller logs
kubectl logs -n ingress-nginx -l app.kubernetes.io/name=ingress-nginx
# Check if external LB was created
kubectl get svc -n ingress-nginx
# EXTERNAL-IP should not be <pending>
# Check Gateway API resources
kubectl get gateway -A
kubectl get httproute -A
kubectl describe httproute <route> -n <ns>
# Check ALB Ingress Controller (EKS)
kubectl logs -n kube-system -l app.kubernetes.io/name=aws-load-balancer-controller
# Check GKE Gateway Controller
kubectl logs -n gke-managed-system -l app=gke-gateway-controller

Terminal window
# Check PVC status
kubectl get pvc -n <ns>
# Pending → StorageClass issue, no available PV, or AZ mismatch
# Bound → healthy
# Check PV
kubectl get pv
kubectl describe pv <pv-name>
# Check StorageClass
kubectl get storageclass
kubectl describe storageclass <sc>
# Check if CSI driver is installed
kubectl get csidrivers
# Debug PVC stuck in Pending
kubectl describe pvc <pvc> -n <ns>
# "waiting for first consumer" → WaitForFirstConsumer binding mode (normal)
# "no persistent volumes available" → need to create PV or check StorageClass
# "exceeded quota" → ResourceQuota limit reached
# Check volume attachments
kubectl get volumeattachments
# Check disk usage inside a pod
kubectl exec <pod> -n <ns> -- df -h
# Force detach a stuck volume (careful!)
kubectl delete volumeattachment <name>

Terminal window
# Check if a user/SA can perform an action
kubectl auth can-i create pods -n <ns> --as=system:serviceaccount:<ns>:<sa>
kubectl auth can-i get secrets -n <ns> --as=user@example.com
kubectl auth can-i '*' '*' --as=system:serviceaccount:kube-system:admin # cluster-admin check
# List all roles and bindings in a namespace
kubectl get roles,rolebindings -n <ns>
kubectl get clusterroles,clusterrolebindings
# Describe a role to see its permissions
kubectl describe role <role> -n <ns>
kubectl describe clusterrole <clusterrole>
# Check who has what access
kubectl get rolebinding -n <ns> -o json | jq '.items[] | {name: .metadata.name, subjects: .subjects, role: .roleRef.name}'
# Check service account exists and has token
kubectl get sa <sa> -n <ns>
kubectl get secrets -n <ns> | grep <sa>
# Debug IRSA (EKS) — check SA annotation
kubectl get sa <sa> -n <ns> -o jsonpath='{.metadata.annotations.eks\.amazonaws\.com/role-arn}'
# Debug Workload Identity (GKE) — check SA annotation
kubectl get sa <sa> -n <ns> -o jsonpath='{.metadata.annotations.iam\.gke\.io/gcp-service-account}'
# Check API server logs for RBAC denials (note: audit logs are a separately configured feature)
kubectl logs -n kube-system -l component=kube-apiserver | grep "Forbidden"

Terminal window
# Check deployment status
kubectl get deploy -n <ns>
kubectl describe deploy <deploy> -n <ns>
# Check rollout status
kubectl rollout status deploy/<deploy> -n <ns>
# View rollout history
kubectl rollout history deploy/<deploy> -n <ns>
# Rollback to previous version
kubectl rollout undo deploy/<deploy> -n <ns>
# Rollback to specific revision
kubectl rollout undo deploy/<deploy> -n <ns> --to-revision=3
# Check ReplicaSets (shows old and new)
kubectl get rs -n <ns> -l app=<app>
# Watch rolling update progress
kubectl get pods -n <ns> -l app=<app> -w
# Check HPA status
kubectl get hpa -n <ns>
kubectl describe hpa <hpa> -n <ns>
# "unable to get metrics" → metrics-server or Prometheus adapter issue
# Check PDB (Pod Disruption Budget)
kubectl get pdb -n <ns>
kubectl describe pdb <pdb> -n <ns>

Terminal window
# All events in a namespace (sorted by time)
kubectl get events -n <ns> --sort-by='.lastTimestamp'
# Warning events only
kubectl get events -n <ns> --field-selector type=Warning
# Cluster-wide events
kubectl get events --all-namespaces --sort-by='.lastTimestamp' | tail -50
# Check component status (deprecated but useful)
kubectl get componentstatuses
# Check API server health
kubectl get --raw='/healthz'
kubectl get --raw='/readyz'
# Check etcd health (if accessible)
kubectl get --raw='/healthz/etcd'
# Check cluster resource usage summary
kubectl top nodes
kubectl top pods --all-namespaces --sort-by=memory | head -20
kubectl top pods --all-namespaces --sort-by=cpu | head -20

kubectl debug flowchart — diagnosis tree for Pending, CrashLoopBackOff, ImagePullBackOff, not ready, and not reachable pods