Skip to content

AWS Troubleshooting - Street-Level Ops

Real-world AWS debugging workflows for production incidents.

First move: Who am I and where am I?

# Print the account, ARN, and user ID of the credentials the CLI is actually
# using. Run this before any other debugging step so that every later result
# is interpreted against the right account and role.
aws sts get-caller-identity
# Example output:
# {
#   "Account": "123456789012",
#   "Arn": "arn:aws:iam::123456789012:role/deploy-role",
#   "UserId": "AROA..."
# }

# Show the effective profile and region, plus where each value was picked up
# from (see the Type and Location columns in the example below).
aws configure list
# Example output:
# Name                    Value             Type    Location
# profile                 prod              env     AWS_PROFILE
# region                  us-east-1         env     AWS_DEFAULT_REGION

IAM Access Denied — systematic debug

# Systematic IAM "AccessDenied" triage: confirm the caller, inspect its
# identity policies, simulate the call, decode the error, then check the
# resource-side policies. Replace the quoted '<placeholders>' before running;
# they are quoted so the shell does not treat < and > as redirections.

# 1. Confirm identity
aws sts get-caller-identity

# 2. Check policies on the role (managed policies first, then inline ones)
aws iam list-attached-role-policies --role-name deploy-role
aws iam list-role-policies --role-name deploy-role

# 3. Simulate the permission
aws iam simulate-principal-policy \
  --policy-source-arn arn:aws:iam::123456789012:role/deploy-role \
  --action-names s3:GetObject \
  --resource-arns arn:aws:s3:::my-bucket/config.yaml
# EvalDecision: implicitDeny

# 4. Decode the cryptic authorization failure message
aws sts decode-authorization-message --encoded-message '<msg>' | \
  jq '.DecodedMessage | fromjson'

# 5. Check resource-side policies (bucket policy, KMS key policy)
aws s3api get-bucket-policy --bucket my-bucket | jq '.Policy | fromjson'
# The KMS subcommand is get-key-policy (there is no describe-key-policy).
# Like the bucket policy, the document comes back as a JSON string in .Policy.
aws kms get-key-policy --key-id '<key-id>' --policy-name default | \
  jq '.Policy | fromjson'

EC2 instance won't start or is unreachable

# Check instance and system status.
# --include-all-instances is needed here: by default only *running* instances
# are reported, so an instance that will not start would return an empty list.
aws ec2 describe-instance-status --instance-ids i-abc123 --include-all-instances
# "SystemStatus": "impaired" = hardware problem, AWS's fault
# "InstanceStatus": "impaired" = OS/config problem, your fault

# Get serial console output (boot errors, kernel panics)
aws ec2 get-console-output --instance-id i-abc123 --output text | tail -n 50

# Cannot SSH? Check security group.
# Show every ingress rule that *covers* TCP port 22: exact matches, port
# ranges that include 22, and all-protocol rules (IpProtocol "-1", which have
# no FromPort/ToPort at all). An empty result means nothing allows SSH.
aws ec2 describe-security-groups --group-ids sg-abc123 | \
  jq '.SecurityGroups[].IpPermissions[]
      | select(.IpProtocol == "-1"
               or (.IpProtocol == "tcp" and .FromPort <= 22 and .ToPort >= 22))'

# Check the instance is in the right subnet with a route to you
aws ec2 describe-instances --instance-ids i-abc123 \
  --query 'Reservations[].Instances[].[SubnetId,PublicIpAddress,PrivateIpAddress]'

# Instance stuck "stopping": force the stop (skips the graceful OS shutdown)
aws ec2 stop-instances --instance-ids i-abc123 --force

Load balancer targets unhealthy

# Target group under investigation. Defined once so the health query and the
# configuration query are guaranteed to look at the same target group.
TG_ARN='arn:aws:elasticloadbalancing:us-east-1:123456789012:targetgroup/my-tg/abc123'

# Check target health
aws elbv2 describe-target-health --target-group-arn "${TG_ARN}"
# "State": "unhealthy",
# "Reason": "Target.ResponseCodeMismatch",
# "Description": "Health checks failed with these codes: [503]"

# Check health check config. The timeout, thresholds, and Matcher (expected
# status codes) are included because they decide whether a response counts as
# healthy, which is what the manual check below has to be compared against.
aws elbv2 describe-target-groups --target-group-arns "${TG_ARN}" | \
  jq '.TargetGroups[] | {HealthCheckPath, HealthCheckPort, HealthCheckProtocol, HealthCheckIntervalSeconds, HealthCheckTimeoutSeconds, HealthyThresholdCount, UnhealthyThresholdCount, Matcher}'

# Verify from the instance
ssh ec2-user@10.0.1.5 'curl -v http://localhost:8080/health'
# Is it returning 200 within the timeout?

VPC flow logs — find blocked traffic

# Query flow logs for REJECTs in the last hour.
# The aggregate is aliased ("as hits") so that the sort stage has a real field
# name to order by. The query ID is captured for the result fetch below.
# NOTE: `date -d` is GNU date syntax (BSD/macOS equivalent: date -v-1H +%s).
query_id=$(aws logs start-query \
  --log-group-name /vpc/flow-logs \
  --start-time "$(date -d '1 hour ago' +%s)" \
  --end-time "$(date +%s)" \
  --query-string 'filter action = "REJECT" | stats count(*) as hits by srcAddr, dstAddr, dstPort | sort hits desc | limit 20' \
  --query 'queryId' --output text)

# Get query results. Queries run asynchronously, so poll until the status is
# no longer Scheduled/Running; any other status (including an empty one after
# a CLI error) ends the loop and the final call prints the outcome.
while :; do
  status=$(aws logs get-query-results --query-id "${query_id}" \
    --query 'status' --output text)
  [[ "${status}" == "Scheduled" || "${status}" == "Running" ]] || break
  sleep 1
done
aws logs get-query-results --query-id "${query_id}"
# dstPort=5432, srcAddr=10.0.1.5, dstAddr=10.0.2.10, hits=847
# ← SG or NACL is blocking database traffic

CloudTrail — who changed what?

# Find who stopped an instance
aws cloudtrail lookup-events \
  --lookup-attributes AttributeKey=EventName,AttributeValue=StopInstances \
  --max-results 5 | jq '.Events[] | {user: .Username, time: .EventTime, resources: .Resources}'

# Find all changes to a specific resource
aws cloudtrail lookup-events \
  --lookup-attributes AttributeKey=ResourceName,AttributeValue=i-abc123 \
  --start-time 2024-03-13T00:00:00Z

# Find failed console logins.
# lookup-events items have no top-level error field; the outcome lives inside
# the CloudTrailEvent JSON string, so decode it and test the login result.
# The filter runs after the fetch, so request the per-call maximum (50) to
# avoid missing failures hidden behind recent successful logins.
aws cloudtrail lookup-events \
  --lookup-attributes AttributeKey=EventName,AttributeValue=ConsoleLogin \
  --max-results 50 | \
  jq '.Events[]
      | (.CloudTrailEvent | fromjson)
      | select(.responseElements.ConsoleLogin == "Failure")
      | {time: .eventTime, user: (.userIdentity.userName // .userIdentity.arn), ip: .sourceIPAddress, error: .errorMessage}'

S3 access denied checklist

# Bucket and object being investigated by every check below.
bucket='my-bucket'
object_key='config.yaml'

# Resource policy attached to the bucket (the embedded JSON string is decoded)
aws s3api get-bucket-policy --bucket "${bucket}" \
  | jq '.Policy | fromjson'

# Public access block settings, a frequent cause of unexpected denials
aws s3api get-public-access-block --bucket "${bucket}"

# Default encryption; with KMS the key policy may be what denies your role
aws s3api get-bucket-encryption --bucket "${bucket}"

# Probe a single object without downloading it
aws s3api head-object --bucket "${bucket}" --key "${object_key}"
# AccessDenied here means: go back over IAM + bucket policy + KMS

Find resources across all regions

# Count EC2 instances in every region returned by describe-regions and print
# only the regions that have at least one.
# Word-splitting the region list is intentional (one region name per word).
for region in $(aws ec2 describe-regions --query 'Regions[].RegionName' --output text); do
  # --output json makes the CLI evaluate --query once over the complete
  # result; with --output text it is evaluated per page and can print several
  # numbers. Errors are silenced on purpose so one inaccessible region does
  # not abort the sweep; such a region yields an empty count, treated as 0.
  count=$(aws ec2 describe-instances --region "${region}" \
    --query 'length(Reservations[].Instances[])' --output json 2>/dev/null)
  # `if` (rather than `[[ ]] && echo`) keeps the loop's exit status at 0 when
  # the last region happens to be empty.
  if [[ "${count:-0}" -gt 0 ]]; then
    echo "${region}: ${count} instances"
  fi
done
# us-east-1: 42 instances
# eu-west-1: 12 instances

Quick cost spike investigation

# Daily blended cost per service over the trailing seven days, as a table.
# The window bounds are computed first so the request itself stays readable.
week_ago=$(date -d '-7 days' +%Y-%m-%d)
today=$(date +%Y-%m-%d)

aws ce get-cost-and-usage \
  --time-period "Start=${week_ago},End=${today}" \
  --granularity DAILY \
  --metrics BlendedCost \
  --group-by Type=DIMENSION,Key=SERVICE \
  --output table

Security group audit for an instance

# Collect the IDs of every security group attached to the instance
# (whitespace-separated in text output).
INSTANCE_SGS=$(aws ec2 describe-instances --instance-ids i-abc123 \
  --query 'Reservations[].Instances[].SecurityGroups[].GroupId' --output text)

# Print the ingress rules of each group. Besides the start port and IPv4
# CIDRs, the projection includes the protocol, the end of the port range, and
# the non-IPv4 source kinds (IPv6 ranges, other security groups, prefix
# lists); without them such rules show up with empty Sources.
# shellcheck disable=SC2086 — splitting the ID list into words is intentional
for sg in ${INSTANCE_SGS}; do
  echo "=== ${sg} ==="
  aws ec2 describe-security-groups --group-ids "${sg}" \
    --query 'SecurityGroups[].{Name:GroupName,Ingress:IpPermissions[].{Protocol:IpProtocol,Port:FromPort,ToPort:ToPort,Sources:IpRanges[].CidrIp,SourcesV6:Ipv6Ranges[].CidrIpv6,SourceGroups:UserIdGroupPairs[].GroupId,PrefixLists:PrefixListIds[].PrefixListId}}' \
    --output table
done

Quick Reference