SQL Fundamentals — Street-Level Ops¶
Quick Diagnosis Commands¶
# Connect to PostgreSQL and check basic health
psql -h localhost -U postgres -c "SELECT version();"
psql -h localhost -U postgres -c "SELECT pg_is_in_recovery();"
# true = replica, false = primary
# Check active connections and what they are doing
psql -c "SELECT pid, state, query_start, LEFT(query, 80) AS query
FROM pg_stat_activity
WHERE state != 'idle'
ORDER BY query_start;"
# Find long-running queries (> 60 seconds)
psql -c "SELECT pid, now() - query_start AS duration, LEFT(query, 100) AS query
FROM pg_stat_activity
WHERE state = 'active' AND now() - query_start > INTERVAL '60 seconds'
ORDER BY duration DESC;"
# Kill a stuck query
psql -c "SELECT pg_cancel_backend(12345);" -- graceful (SIGINT)
psql -c "SELECT pg_terminate_backend(12345);" -- hard kill (SIGTERM)
# Check database size
psql -c "SELECT pg_database.datname,
pg_size_pretty(pg_database_size(pg_database.datname))
FROM pg_database ORDER BY pg_database_size(pg_database.datname) DESC;"
# Check table sizes (including indexes)
psql -d mydb -c "SELECT relname, pg_size_pretty(pg_total_relation_size(relid))
FROM pg_catalog.pg_statio_user_tables
ORDER BY pg_total_relation_size(relid) DESC LIMIT 10;"
# Check for blocking locks
psql -c "SELECT blocked_locks.pid AS blocked_pid,
blocking_locks.pid AS blocking_pid,
LEFT(blocked_activity.query, 60) AS blocked_query,
LEFT(blocking_activity.query, 60) AS blocking_query
FROM pg_catalog.pg_locks blocked_locks
JOIN pg_catalog.pg_locks blocking_locks
ON blocking_locks.locktype = blocked_locks.locktype
AND blocking_locks.relation = blocked_locks.relation
AND blocking_locks.pid != blocked_locks.pid
JOIN pg_catalog.pg_stat_activity blocked_activity
ON blocked_activity.pid = blocked_locks.pid
JOIN pg_catalog.pg_stat_activity blocking_activity
ON blocking_activity.pid = blocking_locks.pid
WHERE NOT blocked_locks.granted;"
# MySQL equivalents
mysql -e "SHOW PROCESSLIST;"
mysql -e "SELECT * FROM sys.innodb_lock_waits\G"   # MySQL 8.0+
mysql -e "SELECT * FROM information_schema.innodb_lock_waits\G"   # MySQL 5.7 and earlier (removed in 8.0)
Gotcha: Slow Query — How to Diagnose¶
Symptom: A query that used to take 100ms now takes 30 seconds. No code changes.
Rule: Always start with EXPLAIN ANALYZE. It executes the query and shows the actual execution plan with actual row counts side by side with the planner's estimates — the mismatch between the two is usually the diagnosis.
Debug clue: Read `EXPLAIN ANALYZE` output bottom-up. The most common smoking gun: "actual rows=500000" next to "rows=1" — this means the planner's row estimate was wildly wrong, likely due to stale statistics. Run `ANALYZE <table>` and re-check.
-- PostgreSQL: show the plan with actual execution times
EXPLAIN ANALYZE
SELECT o.id, o.total, c.name
FROM orders o
JOIN customers c ON o.customer_id = c.id
WHERE o.created_at > NOW() - INTERVAL '1 hour'
AND o.status = 'pending';
-- Read bottom-up. Look for:
-- 1. Seq Scan where you expected Index Scan → missing index
-- 2. "actual rows" much higher than "rows" → stale statistics
-- 3. Nested Loop with high row count → missing join index
-- 4. Sort with "external merge" → not enough work_mem
-- If statistics are stale, refresh them:
ANALYZE orders;
-- MySQL equivalent:
EXPLAIN ANALYZE
SELECT o.id, o.total, c.name
FROM orders o
JOIN customers c ON o.customer_id = c.id
WHERE o.created_at > NOW() - INTERVAL 1 HOUR
AND o.status = 'pending';
Pattern: Incident Query Toolkit¶
These queries answer common incident questions fast.
-- Top 10 error paths in the last hour (access log table)
SELECT path, status, COUNT(*) AS errors
FROM access_log
WHERE status >= 500
AND timestamp > NOW() - INTERVAL '1 hour'
GROUP BY path, status
ORDER BY errors DESC
LIMIT 10;
-- Request rate per minute (detect traffic spikes)
SELECT DATE_TRUNC('minute', timestamp) AS minute,
COUNT(*) AS requests
FROM access_log
WHERE timestamp > NOW() - INTERVAL '1 hour'
GROUP BY 1
ORDER BY 1;
-- Percentile response times (find latency spikes)
SELECT path,
PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY duration_ms) AS p50,
PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY duration_ms) AS p95,
PERCENTILE_CONT(0.99) WITHIN GROUP (ORDER BY duration_ms) AS p99,
COUNT(*) AS total
FROM access_log
WHERE timestamp > NOW() - INTERVAL '1 hour'
GROUP BY path
HAVING COUNT(*) > 10
ORDER BY p99 DESC;
-- Find the worst N queries by total execution time (pg_stat_statements)
SELECT LEFT(query, 80) AS query,
calls,
total_exec_time::numeric(12,2) AS total_ms,
mean_exec_time::numeric(12,2) AS mean_ms,
rows
FROM pg_stat_statements
ORDER BY total_exec_time DESC
LIMIT 10;
Pattern: Safe Schema Migrations¶
Schema changes on production databases are high-risk. Follow this pattern.
-- RULE: Never run ALTER TABLE on a large table without checking lock behavior
-- Step 1: Check table size (know what you are touching)
SELECT pg_size_pretty(pg_total_relation_size('orders'));
-- Step 2: Add columns as NULL first (no table rewrite needed)
ALTER TABLE orders ADD COLUMN priority INTEGER;
-- This is fast — no default value means no table rewrite
-- DANGEROUS: Adding a column with a DEFAULT rewrites the entire table
-- on PostgreSQL < 11. On PG 11+, simple defaults are metadata-only.
ALTER TABLE orders ADD COLUMN priority INTEGER DEFAULT 0;
-- Step 3: Create indexes CONCURRENTLY (does not block writes)
CREATE INDEX CONCURRENTLY idx_orders_priority ON orders(priority);
-- Note: CONCURRENTLY takes longer but does not lock the table
> **Gotcha:** `CREATE INDEX CONCURRENTLY` cannot run inside a transaction block. If your migration tool wraps everything in a transaction, the concurrent index build will fail. Django, Alembic, and Flyway all have specific flags to handle this — check your tool's docs.
-- Step 4: For column type changes, use a staged approach:
-- 1. Add new column with new type
-- 2. Backfill in batches (not one giant UPDATE)
-- 3. Switch application to read from new column
-- 4. Drop old column
-- Batch backfill pattern (avoid giant transactions):
UPDATE orders SET priority_new = priority
WHERE id IN (SELECT id FROM orders WHERE priority_new IS NULL LIMIT 10000);
-- Repeat in a loop until no rows remain
Pattern: Debugging Connection Exhaustion¶
-- Check current connection count vs limit
SELECT COUNT(*) AS current,
(SELECT setting FROM pg_settings WHERE name = 'max_connections') AS max
FROM pg_stat_activity;
-- Who is consuming connections?
SELECT usename, client_addr, state, COUNT(*)
FROM pg_stat_activity
GROUP BY usename, client_addr, state
ORDER BY COUNT(*) DESC;
-- Find idle connections that should be closed
-- (query_start is when the last query *began*; for precise time-idle, use
--  state_change, which records when the session entered the 'idle' state)
SELECT pid, usename, state, query_start,
NOW() - query_start AS idle_time
FROM pg_stat_activity
WHERE state = 'idle'
AND NOW() - query_start > INTERVAL '10 minutes'
ORDER BY idle_time DESC;
-- Kill idle connections older than 10 minutes
SELECT pg_terminate_backend(pid)
FROM pg_stat_activity
WHERE state = 'idle'
AND NOW() - query_start > INTERVAL '10 minutes'
AND pid != pg_backend_pid();
Pattern: Safe Data Investigation During Incidents¶
-- RULE: Always use transactions with ROLLBACK for incident investigation
-- This prevents accidental writes
BEGIN;
-- Your investigation queries here
SELECT * FROM orders WHERE status = 'stuck' LIMIT 10;
-- When done:
ROLLBACK; -- NOT COMMIT
> **Remember:** During incidents, always wrap investigation queries in `BEGIN` / `ROLLBACK`. One accidental `UPDATE` without a `WHERE` clause during a 3 AM incident turns a partial outage into a data disaster.
-- RULE: Always add LIMIT when exploring unfamiliar tables
SELECT * FROM audit_log ORDER BY created_at DESC LIMIT 20;
-- Without LIMIT on a 100M row table → your query runs for minutes
-- and your terminal hangs
-- RULE: Use CTEs for complex incident queries (readable, debuggable)
WITH recent_errors AS (
SELECT * FROM events
WHERE severity = 'ERROR'
AND timestamp > NOW() - INTERVAL '1 hour'
),
error_counts AS (
SELECT source, COUNT(*) AS cnt
FROM recent_errors
GROUP BY source
)
SELECT * FROM error_counts ORDER BY cnt DESC LIMIT 10;
Useful One-Liners¶
# Export query results to CSV
psql -c "COPY (SELECT * FROM orders WHERE status='failed')
TO STDOUT WITH CSV HEADER" > failed_orders.csv
# Quick row count estimate (fast, uses stats, not exact)
psql -c "SELECT reltuples::bigint FROM pg_class WHERE relname = 'orders';"
> **One-liner:** `SELECT count(*)` on a large table does a full sequential scan in PostgreSQL — it can take minutes on 100M+ row tables. Use `reltuples` from `pg_class` for instant estimates. The number is updated by `ANALYZE` and autovacuum, so it is usually close enough for operational decisions.
# Check replication lag (PostgreSQL — run this on the replica; it returns NULL on a
# primary, and on a quiet cluster the value grows even with zero real lag)
psql -c "SELECT now() - pg_last_xact_replay_timestamp() AS replication_lag;"
# Show running autovacuum processes
psql -c "SELECT * FROM pg_stat_activity WHERE query LIKE 'autovacuum%';"
# Check index usage (find unused indexes)
psql -d mydb -c "SELECT indexrelname, idx_scan, pg_size_pretty(pg_relation_size(indexrelid))
FROM pg_stat_user_indexes
WHERE idx_scan = 0
ORDER BY pg_relation_size(indexrelid) DESC LIMIT 10;"
# Check for table bloat (dead tuples needing vacuum)
psql -d mydb -c "SELECT relname, n_dead_tup, n_live_tup,
ROUND(n_dead_tup::numeric / NULLIF(n_live_tup, 0) * 100, 2) AS dead_pct
FROM pg_stat_user_tables
WHERE n_dead_tup > 1000
ORDER BY n_dead_tup DESC LIMIT 10;"