SQL Fundamentals — Street-Level Ops¶
Quick Diagnosis Commands¶
# Connect to PostgreSQL and check basic health
psql -h localhost -U postgres -c "SELECT version();"
psql -h localhost -U postgres -c "SELECT pg_is_in_recovery();"
# true = replica, false = primary
# Check active connections and what they are doing
psql -c "SELECT pid, state, query_start, LEFT(query, 80) AS query
FROM pg_stat_activity
WHERE state != 'idle'
ORDER BY query_start;"
# Find long-running queries (> 60 seconds)
psql -c "SELECT pid, now() - query_start AS duration, LEFT(query, 100) AS query
FROM pg_stat_activity
WHERE state = 'active' AND now() - query_start > INTERVAL '60 seconds'
ORDER BY duration DESC;"
# Kill a stuck query
psql -c "SELECT pg_cancel_backend(12345);" -- graceful (SIGINT)
psql -c "SELECT pg_terminate_backend(12345);" -- hard kill (SIGTERM)
# Check database size
psql -c "SELECT pg_database.datname,
pg_size_pretty(pg_database_size(pg_database.datname))
FROM pg_database ORDER BY pg_database_size(pg_database.datname) DESC;"
# Check table sizes (including indexes)
psql -d mydb -c "SELECT relname, pg_size_pretty(pg_total_relation_size(relid))
FROM pg_catalog.pg_statio_user_tables
ORDER BY pg_total_relation_size(relid) DESC LIMIT 10;"
# Check for blocking locks
psql -c "SELECT blocked_locks.pid AS blocked_pid,
blocking_locks.pid AS blocking_pid,
LEFT(blocked_activity.query, 60) AS blocked_query,
LEFT(blocking_activity.query, 60) AS blocking_query
FROM pg_catalog.pg_locks blocked_locks
JOIN pg_catalog.pg_locks blocking_locks
ON blocking_locks.locktype = blocked_locks.locktype
AND blocking_locks.relation = blocked_locks.relation
AND blocking_locks.pid != blocked_locks.pid
JOIN pg_catalog.pg_stat_activity blocked_activity
ON blocked_activity.pid = blocked_locks.pid
JOIN pg_catalog.pg_stat_activity blocking_activity
ON blocking_activity.pid = blocking_locks.pid
WHERE NOT blocked_locks.granted;"
# MySQL equivalents
mysql -e "SHOW PROCESSLIST;"
mysql -e "SELECT * FROM sys.innodb_lock_waits\G"   # MySQL 8.0+
mysql -e "SELECT * FROM information_schema.innodb_lock_waits\G"   # MySQL 5.7 and earlier (removed in 8.0)
Gotcha: Slow Query — How to Diagnose¶
Symptom: A query that used to take 100ms now takes 30 seconds. No code changes.
Rule: Always start with EXPLAIN ANALYZE. It executes the query and shows the actual execution plan with actual row counts side by side with the planner's estimates — the mismatch between the two is usually the diagnosis.
Debug clue: Read `EXPLAIN ANALYZE` output bottom-up. The most common smoking gun: "actual rows=500000" next to "rows=1" — this means the planner's row estimate was wildly wrong, likely due to stale statistics. Run `ANALYZE <table>` and re-check.
-- PostgreSQL: show the plan with actual execution times
EXPLAIN ANALYZE
SELECT o.id, o.total, c.name
FROM orders o
JOIN customers c ON o.customer_id = c.id
WHERE o.created_at > NOW() - INTERVAL '1 hour'
AND o.status = 'pending';
-- Read bottom-up. Look for:
-- 1. Seq Scan where you expected Index Scan → missing index
-- 2. "actual rows" much higher than "rows" → stale statistics
-- 3. Nested Loop with high row count → missing join index
-- 4. Sort with "external merge" → not enough work_mem
-- If statistics are stale, refresh them:
ANALYZE orders;
-- MySQL equivalent:
EXPLAIN ANALYZE
SELECT o.id, o.total, c.name
FROM orders o
JOIN customers c ON o.customer_id = c.id
WHERE o.created_at > NOW() - INTERVAL 1 HOUR
AND o.status = 'pending';
Pattern: Incident Query Toolkit¶
These queries answer common incident questions fast.
-- Top 10 error paths in the last hour (access log table)
SELECT path, status, COUNT(*) AS errors
FROM access_log
WHERE status >= 500
AND timestamp > NOW() - INTERVAL '1 hour'
GROUP BY path, status
ORDER BY errors DESC
LIMIT 10;
-- Request rate per minute (detect traffic spikes)
SELECT DATE_TRUNC('minute', timestamp) AS minute,
COUNT(*) AS requests
FROM access_log
WHERE timestamp > NOW() - INTERVAL '1 hour'
GROUP BY 1
ORDER BY 1;
-- Percentile response times (find latency spikes)
SELECT path,
PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY duration_ms) AS p50,
PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY duration_ms) AS p95,
PERCENTILE_CONT(0.99) WITHIN GROUP (ORDER BY duration_ms) AS p99,
COUNT(*) AS total
FROM access_log
WHERE timestamp > NOW() - INTERVAL '1 hour'
GROUP BY path
HAVING COUNT(*) > 10
ORDER BY p99 DESC;
-- Find the worst N queries by total execution time (pg_stat_statements)
SELECT LEFT(query, 80) AS query,
calls,
total_exec_time::numeric(12,2) AS total_ms,
mean_exec_time::numeric(12,2) AS mean_ms,
rows
FROM pg_stat_statements
ORDER BY total_exec_time DESC
LIMIT 10;
Pattern: Safe Schema Migrations¶
Schema changes on production databases are high-risk. Follow this pattern.
-- RULE: Never run ALTER TABLE on a large table without checking lock behavior
-- Step 1: Check table size (know what you are touching)
SELECT pg_size_pretty(pg_total_relation_size('orders'));
-- Step 2: Add columns as NULL first (no table rewrite needed)
ALTER TABLE orders ADD COLUMN priority INTEGER;
-- This is fast — no default value means no table rewrite
-- DANGEROUS: Adding a column with a DEFAULT rewrites the entire table
-- on PostgreSQL < 11. On PG 11+, simple defaults are metadata-only.
ALTER TABLE orders ADD COLUMN priority INTEGER DEFAULT 0;
-- Step 3: Create indexes CONCURRENTLY (does not block writes)
CREATE INDEX CONCURRENTLY idx_orders_priority ON orders(priority);
-- Note: CONCURRENTLY takes longer but does not lock the table
> **Gotcha:** `CREATE INDEX CONCURRENTLY` cannot run inside a transaction block. If your migration tool wraps everything in a transaction, the concurrent index build will fail. Django, Alembic, and Flyway all have specific flags to handle this — check your tool's docs.
-- Step 4: For column type changes, use a staged approach:
-- 1. Add new column with new type
-- 2. Backfill in batches (not one giant UPDATE)
-- 3. Switch application to read from new column
-- 4. Drop old column
-- Batch backfill pattern (avoid giant transactions):
UPDATE orders SET priority_new = priority
WHERE id IN (SELECT id FROM orders WHERE priority_new IS NULL LIMIT 10000);
-- Repeat in a loop until no rows remain
Pattern: Debugging Connection Exhaustion¶
-- Check current connection count vs limit
SELECT COUNT(*) AS current,
(SELECT setting FROM pg_settings WHERE name = 'max_connections') AS max
FROM pg_stat_activity;
-- Who is consuming connections?
SELECT usename, client_addr, state, COUNT(*)
FROM pg_stat_activity
GROUP BY usename, client_addr, state
ORDER BY COUNT(*) DESC;
-- Find idle connections that should be closed
-- (query_start is when the last query *began*; for precise time-idle, use
--  state_change, which records when the session entered the 'idle' state)
SELECT pid, usename, state, query_start,
NOW() - query_start AS idle_time
FROM pg_stat_activity
WHERE state = 'idle'
AND NOW() - query_start > INTERVAL '10 minutes'
ORDER BY idle_time DESC;
-- Kill idle connections older than 10 minutes
SELECT pg_terminate_backend(pid)
FROM pg_stat_activity
WHERE state = 'idle'
AND NOW() - query_start > INTERVAL '10 minutes'
AND pid != pg_backend_pid();
Pattern: Safe Data Investigation During Incidents¶
-- RULE: Always use transactions with ROLLBACK for incident investigation
-- This prevents accidental writes
BEGIN;
-- Your investigation queries here
SELECT * FROM orders WHERE status = 'stuck' LIMIT 10;
-- When done:
ROLLBACK; -- NOT COMMIT
> **Remember:** During incidents, always wrap investigation queries in `BEGIN` / `ROLLBACK`. One accidental `UPDATE` without a `WHERE` clause during a 3 AM incident turns a partial outage into a data disaster.
-- RULE: Always add LIMIT when exploring unfamiliar tables
SELECT * FROM audit_log ORDER BY created_at DESC LIMIT 20;
-- Without LIMIT on a 100M row table → your query runs for minutes
-- and your terminal hangs
-- RULE: Use CTEs for complex incident queries (readable, debuggable)
WITH recent_errors AS (
SELECT * FROM events
WHERE severity = 'ERROR'
AND timestamp > NOW() - INTERVAL '1 hour'
),
error_counts AS (
SELECT source, COUNT(*) AS cnt
FROM recent_errors
GROUP BY source
)
SELECT * FROM error_counts ORDER BY cnt DESC LIMIT 10;
Useful One-Liners¶
# Export query results to CSV
psql -c "COPY (SELECT * FROM orders WHERE status='failed')
TO STDOUT WITH CSV HEADER" > failed_orders.csv
# Quick row count estimate (fast, uses stats, not exact)
psql -c "SELECT reltuples::bigint FROM pg_class WHERE relname = 'orders';"
> **One-liner:** `SELECT count(*)` on a large table does a full sequential scan in PostgreSQL — it can take minutes on 100M+ row tables. Use `reltuples` from `pg_class` for instant estimates. The number is updated by `ANALYZE` and autovacuum, so it is usually close enough for operational decisions.
# Check replication lag (PostgreSQL — run this on the replica; it returns NULL on a
# primary, and on a quiet cluster the value grows even with zero real lag)
psql -c "SELECT now() - pg_last_xact_replay_timestamp() AS replication_lag;"
# Show running autovacuum processes
psql -c "SELECT * FROM pg_stat_activity WHERE query LIKE 'autovacuum%';"
# Check index usage (find unused indexes)
psql -d mydb -c "SELECT indexrelname, idx_scan, pg_size_pretty(pg_relation_size(indexrelid))
FROM pg_stat_user_indexes
WHERE idx_scan = 0
ORDER BY pg_relation_size(indexrelid) DESC LIMIT 10;"
# Check for table bloat (dead tuples needing vacuum)
psql -d mydb -c "SELECT relname, n_dead_tup, n_live_tup,
ROUND(n_dead_tup::numeric / NULLIF(n_live_tup, 0) * 100, 2) AS dead_pct
FROM pg_stat_user_tables
WHERE n_dead_tup > 1000
ORDER BY n_dead_tup DESC LIMIT 10;"