# AWS Troubleshooting - Street-Level Ops
Real-world AWS debugging workflows for production incidents.
## First move: Who am I and where am I?
# Print the account, ARN and user ID of the credentials the CLI is actually
# using. Run this before anything else.
aws sts get-caller-identity
# Example output:
# {
# "Account": "123456789012",
# "Arn": "arn:aws:iam::123456789012:role/deploy-role",
# "UserId": "AROA..."
# }
# Show the active profile and region, and where each value comes from
# (the Type and Location columns).
aws configure list
# Example output:
# Name Value Type Location
# profile prod env AWS_PROFILE
# region us-east-1 env AWS_DEFAULT_REGION
## IAM Access Denied — systematic debug
# Systematic walk-through for an IAM AccessDenied error.
# The role, bucket and object names are examples. "<msg>" and "<key-id>" are
# placeholders to replace before running; they are quoted so the shell does
# not treat the angle brackets as redirections.

# 1. Confirm identity
aws sts get-caller-identity

# 2. Check policies on the role (attached policies, then inline policies)
aws iam list-attached-role-policies --role-name deploy-role
aws iam list-role-policies --role-name deploy-role

# 3. Simulate the permission
aws iam simulate-principal-policy \
  --policy-source-arn arn:aws:iam::123456789012:role/deploy-role \
  --action-names s3:GetObject \
  --resource-arns arn:aws:s3:::my-bucket/config.yaml
# EvalDecision: implicitDeny

# 4. Decode the cryptic authorization failure message
aws sts decode-authorization-message --encoded-message "<msg>" \
  | jq '.DecodedMessage | fromjson'

# 5. Check resource-side policies (bucket policy, KMS key policy)
aws s3api get-bucket-policy --bucket my-bucket | jq '.Policy | fromjson'
# The KMS operation is get-key-policy. Like the bucket policy, the document
# comes back as a JSON string under "Policy", so decode it the same way.
aws kms get-key-policy --key-id "<key-id>" --policy-name default \
  | jq '.Policy | fromjson'
## EC2 instance won't start or is unreachable
# Triage for an EC2 instance that will not start or cannot be reached.
# i-abc123 and sg-abc123 are example IDs.

# Check instance and system status.
# --include-all-instances is needed because the default output only lists
# running instances, so a stopped or stuck instance would not appear.
aws ec2 describe-instance-status --instance-ids i-abc123 --include-all-instances
# "SystemStatus": "impaired" = hardware problem, AWS's fault
# "InstanceStatus": "impaired" = OS/config problem, your fault

# Get serial console output (boot errors, kernel panics)
aws ec2 get-console-output --instance-id i-abc123 --output text | tail -n 50

# Cannot SSH? Check security group.
# Match every ingress rule that covers TCP port 22: an exact rule, a port
# range that contains 22, or an all-protocol rule (IpProtocol "-1", which
# carries no FromPort/ToPort).
aws ec2 describe-security-groups --group-ids sg-abc123 \
  | jq '.SecurityGroups[].IpPermissions[]
        | select(.IpProtocol == "-1"
                 or (.IpProtocol == "tcp" and .FromPort <= 22 and .ToPort >= 22))'

# Check the instance is in the right subnet with a route to you
aws ec2 describe-instances --instance-ids i-abc123 \
  --query 'Reservations[].Instances[].[SubnetId,PublicIpAddress,PrivateIpAddress]'

# Instance stuck "stopping"
aws ec2 stop-instances --instance-ids i-abc123 --force
## Load balancer targets unhealthy
# Find out why load balancer targets are reported unhealthy.
# The ARNs, host and port are examples.

# Check target health
aws elbv2 describe-target-health \
  --target-group-arn arn:aws:elasticloadbalancing:us-east-1:123456789012:targetgroup/my-tg/abc123
# "State": "unhealthy",
# "Reason": "Target.ResponseCodeMismatch",
# "Description": "Health checks failed with these codes: [503]"

# Check health check config.
# Includes the timeout, thresholds and Matcher (expected response codes):
# those are the values a ResponseCodeMismatch or timeout is judged against.
aws elbv2 describe-target-groups \
  --target-group-arns arn:aws:elasticloadbalancing:... \
  | jq '.TargetGroups[] | {
      HealthCheckPath,
      HealthCheckPort,
      HealthCheckProtocol,
      HealthCheckIntervalSeconds,
      HealthCheckTimeoutSeconds,
      HealthyThresholdCount,
      UnhealthyThresholdCount,
      Matcher
    }'

# Verify from the instance.
# -w prints the status code and total time so they can be compared with
# Matcher and HealthCheckTimeoutSeconds above.
ssh ec2-user@10.0.1.5 \
  "curl -v -w '\nHTTP %{http_code} in %{time_total}s\n' http://localhost:8080/health"
# Is it returning 200 within the timeout?
## VPC flow logs — find blocked traffic
# Query flow logs for REJECTs in the last hour.
# /vpc/flow-logs is an example log group name.

# Epoch-second time window. GNU date understands -d; the fallback is the
# BSD/macOS -v form.
start_time=$(date -d '1 hour ago' +%s 2>/dev/null || date -v-1H +%s)
end_time=$(date +%s)

# The aggregate is aliased ("as rejects") so the sort stage has a real field
# to order by; an unaliased count cannot be referenced as "count".
# start-query only returns an ID, which is captured for the next call.
query_id=$(aws logs start-query \
  --log-group-name /vpc/flow-logs \
  --start-time "${start_time}" \
  --end-time "${end_time}" \
  --query-string 'filter action = "REJECT" | stats count(*) as rejects by srcAddr, dstAddr, dstPort | sort rejects desc | limit 20' \
  --query 'queryId' --output text)

# Get query results (re-run until "status" is "Complete")
aws logs get-query-results --query-id "${query_id}"
# dstPort=5432, srcAddr=10.0.1.5, dstAddr=10.0.2.10, rejects=847
# ← SG or NACL is blocking database traffic
## CloudTrail — who changed what?
# CloudTrail lookups for "who changed what". i-abc123 and the start time are
# example values.

# Find who stopped an instance
aws cloudtrail lookup-events \
  --lookup-attributes AttributeKey=EventName,AttributeValue=StopInstances \
  --max-results 5 \
  | jq '.Events[] | {user: .Username, time: .EventTime, resources: .Resources}'

# Find all changes to a specific resource
aws cloudtrail lookup-events \
  --lookup-attributes AttributeKey=ResourceName,AttributeValue=i-abc123 \
  --start-time 2024-03-13T00:00:00Z

# Find failed console logins.
# lookup-events items carry no top-level error field; the full record is a
# JSON string under CloudTrailEvent. Decode it and keep the sign-in events
# whose response says "Failure".
# The filter runs after --max-results, so only the 20 returned ConsoleLogin
# events are inspected.
aws cloudtrail lookup-events \
  --lookup-attributes AttributeKey=EventName,AttributeValue=ConsoleLogin \
  --max-results 20 \
  | jq '.Events[]
        | .CloudTrailEvent
        | fromjson
        | select(.responseElements.ConsoleLogin == "Failure")'
## S3 access denied checklist
# Work through the layers that can deny S3 access.
# my-bucket and config.yaml are example names.

# Bucket policy: pull the policy string with --query, then pretty-print it
aws s3api get-bucket-policy --bucket my-bucket \
  --query Policy --output text \
  | jq .

# Public access block settings (often the culprit)
aws s3api get-public-access-block --bucket my-bucket

# Default encryption on the bucket (a KMS key policy might deny your role)
aws s3api get-bucket-encryption --bucket my-bucket

# Probe a single object
aws s3api head-object --bucket my-bucket --key config.yaml
# If this fails with AccessDenied, check IAM + bucket policy + KMS
## Find resources across all regions
# Print the EC2 instance count for every region that has at least one.
# Word-splitting of the region list is intentional: the text output is a
# whitespace-separated list of region names.
for region in $(aws ec2 describe-regions --query 'Regions[].RegionName' --output text); do
  # List instance IDs and count words rather than using JMESPath length():
  # with --output text the CLI evaluates --query once per result page, so
  # length() would print several numbers for a multi-page region.
  # Errors are discarded on purpose: a region that cannot be queried counts
  # as 0 and is skipped.
  count=$(aws ec2 describe-instances --region "${region}" \
    --query 'Reservations[].Instances[].InstanceId' --output text 2>/dev/null | wc -w)
  # An explicit if keeps the loop's exit status at 0 when the last region is
  # empty. $(( count )) strips the padding some wc implementations emit.
  if (( count > 0 )); then
    echo "${region}: $(( count )) instances"
  fi
done
# us-east-1: 42 instances
# eu-west-1: 12 instances
## Quick cost spike investigation
# Daily cost per service for the last 7 days, as a table.

# GNU date understands -d; the fallback is the BSD/macOS -v form.
start_date=$(date -d '-7 days' +%Y-%m-%d 2>/dev/null || date -v-7d +%Y-%m-%d)
# The End date of a Cost Explorer time period is exclusive, so today's
# partial data is not included.
end_date=$(date +%Y-%m-%d)

aws ce get-cost-and-usage \
  --time-period "Start=${start_date},End=${end_date}" \
  --granularity DAILY \
  --metrics BlendedCost \
  --group-by Type=DIMENSION,Key=SERVICE \
  --output table
## Security group audit for an instance
# Dump the ingress rules of every security group attached to an instance.
# i-abc123 is an example instance ID.

# IDs of all groups on the instance, as whitespace-separated text
INSTANCE_SGS=$(
  aws ec2 describe-instances \
    --instance-ids i-abc123 \
    --query 'Reservations[].Instances[].SecurityGroups[].GroupId' \
    --output text
)

# Projection per group: its name, plus the port and source CIDRs of each rule
rule_view='SecurityGroups[].{Name:GroupName,Ingress:IpPermissions[].{Port:FromPort,Sources:IpRanges[].CidrIp}}'

# Left unquoted deliberately so the ID list splits into one word per group
for group_id in ${INSTANCE_SGS}; do
  printf '=== %s ===\n' "${group_id}"
  aws ec2 describe-security-groups \
    --group-ids "${group_id}" \
    --query "${rule_view}" \
    --output table
done
## Quick Reference
- Deep Dive: AWS VPC Internals