Skip to content

GCP Troubleshooting - Street-Level Ops

Real-world GCP debugging workflows for production incidents.

First move: Who am I and what project?

# Show the credentialed accounts; the row marked "*" is the active one.
gcloud auth list
# ACTIVE  ACCOUNT
# *       deploy-sa@my-project.iam.gserviceaccount.com

# Print the project currently set in the gcloud configuration.
gcloud config get-value project
# my-project

# Print the default Compute region set in the gcloud configuration.
gcloud config get-value compute/region
# us-central1

IAM Access Denied — systematic debug

# Step 1 — ask the IAM policy troubleshooter whether the principal holds the
# permission on the resource (this is what the tool is built for).
gcloud policy-troubleshoot iam \
  //cloudresourcemanager.googleapis.com/projects/my-project \
  --principal-email=my-sa@my-project.iam.gserviceaccount.com \
  --permission=storage.objects.get
# Access: DENIED
# Reason: No binding with role containing permission

# Step 2 — list the roles bound to the service account in the project policy.
gcloud projects get-iam-policy my-project \
  --format="table(bindings.role)" \
  --flatten="bindings[].members" \
  --filter="bindings.members:serviceAccount:my-sa@my-project.iam.gserviceaccount.com"
# ROLE
# roles/viewer    ← missing roles/storage.objectViewer

# Step 3 — list organization policy constraints set on the project, in case
# one of them is what blocks access.
gcloud resource-manager org-policies list --project=my-project

# Step 4 — Workload Identity: confirm the Kubernetes service account (KSA) is
# annotated with the intended Google service account (GSA)...
kubectl -n production get serviceaccount my-ksa -o yaml | grep -A2 annotations
# iam.gke.io/gcp-service-account: my-gsa@my-project.iam.gserviceaccount.com

# ...then show the workloadIdentityUser bindings on that GSA.
gcloud iam service-accounts get-iam-policy \
  my-gsa@my-project.iam.gserviceaccount.com --format=json \
  | jq '.bindings[] | select(.role == "roles/iam.workloadIdentityUser")'

Firewall rules — traffic not reaching instance

# List all firewall rules for the VPC.
# firewall_rule() renders each allowed entry as protocol:port and list()
# joins the entries with commas. gcloud has no flat() transform, so the
# previous projection was rejected before any rule was printed.
gcloud compute firewall-rules list \
  --filter="network:my-vpc" \
  --format="table(name,direction,allowed[].map().firewall_rule().list(),sourceRanges[],targetTags[])"

# Print the instance's network tags and compare them with the targetTags of
# the firewall rule that is supposed to admit the traffic.
gcloud compute instances describe web-01 \
  --format="value(tags.items)" \
  --zone=us-central1-a
# http-server,https-server

# Create a connectivity test (built-in network diagnostic) from web-01 to
# db-01 on TCP/5432.
gcloud network-management connectivity-tests create test-web-to-db \
  --protocol=TCP --destination-port=5432 \
  --source-instance=projects/my-project/zones/us-central1-a/instances/web-01 \
  --destination-instance=projects/my-project/zones/us-central1-a/instances/db-01

# Read the verdict of the test created above.
gcloud network-management connectivity-tests describe test-web-to-db
# result: REACHABLE / UNREACHABLE (with explanation)

GKE pod issues

# Fetch kubeconfig credentials for the cluster so kubectl can talk to it.
gcloud container clusters get-credentials my-cluster --zone=us-central1-a

# Pods stuck Pending — show the last 20 lines of the namespace's events,
# sorted by lastTimestamp.
kubectl -n production get events --sort-by=.lastTimestamp | tail -20
# FailedScheduling: Insufficient cpu

# Check node pool capacity and autoscaler
gcloud container node-pools list --cluster=my-cluster --zone us-central1-a

# Cluster autoscaler decisions (scale-up, scale-down, no-scale reasons) are
# written to the cluster-autoscaler-visibility log under the k8s_cluster
# resource type. The legacy gke_cluster resource type matches nothing on
# current clusters, so the old filter silently returned no entries.
gcloud logging read \
  'resource.type="k8s_cluster"
   AND resource.labels.cluster_name="my-cluster"
   AND logName:"cluster-autoscaler-visibility"' \
  --limit=10

# A namespace ResourceQuota can block new pods; compare Used against Hard.
kubectl -n production describe resourcequota
# Used: cpu 3800m, memory 14Gi
# Hard: cpu 4000m, memory 16Gi  ← almost full

# Node not ready — print every node condition as "type  status  reason".
# A fixed `grep -A5 Conditions` window shows only the first few condition
# rows, and GKE nodes typically report many more (node-problem-detector
# adds its own), so the Ready row can be cut off. Read all of them from
# the node object instead.
kubectl get node gke-my-cluster-default-pool-abc123 \
  -o jsonpath='{range .status.conditions[*]}{.type}{"\t"}{.status}{"\t"}{.reason}{"\n"}{end}'

Load balancer returning 502

# Ask the load balancer how it sees its backends.
gcloud compute backend-services get-health my-backend-service --global
# status: UNHEALTHY
# healthCheckFirewallState: MISCONFIGURED

# Look for a firewall rule that admits the GCP health-check probes.
gcloud compute firewall-rules list --filter="name~health"
# (empty — that is the problem)

# Add the missing ingress rule: allow the health-check source ranges to
# reach the serving port on instances tagged http-server.
gcloud compute firewall-rules create allow-health-check \
  --network=my-vpc \
  --direction=ingress \
  --action=allow \
  --rules=tcp:8080 \
  --source-ranges=35.191.0.0/16,130.211.0.0/22 \
  --target-tags=http-server

# Confirm the health check probes the path and port the service listens on.
gcloud compute health-checks describe my-health-check
# httpHealthCheck: path=/health, port=8080

Cloud Logging — find errors fast

# Errors from a specific container in the last hour.
# Plain-text log lines arrive as textPayload; containers that emit
# structured (JSON) logs populate jsonPayload instead, so fall back to it
# rather than printing null for those entries.
gcloud logging read \
  'resource.type="k8s_container"
   AND resource.labels.namespace_name="production"
   AND resource.labels.container_name="api-server"
   AND severity>=ERROR' \
  --limit=20 --freshness=1h --format=json | jq '.[] | .textPayload // .jsonPayload'

# Who changed IAM? Pull the latest SetIamPolicy audit entries and reduce
# each one to the calling principal and the method name.
gcloud logging read --format=json --limit=10 \
  'logName:"cloudaudit.googleapis.com"
   AND protoPayload.methodName:"SetIamPolicy"' \
  | jq '.[].protoPayload | {caller: .authenticationInfo.principalEmail, method: .methodName}'

# Tail logs in real time.
# Live tailing is documented under the alpha command group
# (`gcloud alpha logging tail`); the first run may prompt to install the
# alpha components.
gcloud alpha logging tail 'resource.type="k8s_container" AND resource.labels.namespace_name="production"'

Instance boot diagnostics

# Last 50 lines of the serial console (POST, GRUB, kernel messages).
gcloud compute instances get-serial-port-output my-instance --zone=us-central1-a | tail -50

# SSH via an IAP tunnel, so the instance needs no public IP.
gcloud compute ssh my-instance --tunnel-through-iap --zone=us-central1-a

# Forward local port 8080 to port 8080 on the instance through IAP.
gcloud compute start-iap-tunnel my-instance 8080 \
  --zone=us-central1-a --local-host-port=localhost:8080

Quota limits causing silent failures

# Check compute quotas (resource creation fails silently at quota).
# --flatten="quotas[]" emits one table row per quota; without it the whole
# quota list lands in a single row and grep cannot pick out individual
# metrics. The pattern includes "address" because "ip" does not occur in
# IN_USE_ADDRESSES.
# NOTE(review): per-region quotas such as CPUS and SSD_TOTAL_GB are also
# reported by `gcloud compute regions describe REGION` — confirm which scope
# applies to the failure being investigated.
gcloud compute project-info describe --project my-project \
  --flatten="quotas[]" \
  --format="table(quotas.metric,quotas.limit,quotas.usage)" | grep -iE "cpu|ssd|ip|address"
# CPUS               96     82     ← almost full
# SSD_TOTAL_GB       4096   3800
# IN_USE_ADDRESSES   20     18

# The ten most recently inserted operations that have finished, newest
# first — scan the output for the ones that ended in an error.
gcloud compute operations list --sort-by=~insertTime --limit=10 --filter="status=DONE"

Quick resource inventory

# Every Compute Engine instance in the project, regardless of zone.
gcloud asset search-all-resources \
  --asset-types="compute.googleapis.com/Instance" \
  --scope=projects/my-project \
  --format="table(name,location,state)"

# Up to 100 resources of any type in the project.
gcloud asset search-all-resources --limit=100 --scope=projects/my-project