Load Testing — Street-Level Ops¶
Quick Diagnosis Commands¶
# Run a k6 test and watch metrics live
k6 run script.js
# Run with specific VUs and duration (override script options)
k6 run -u 50 -d 5m script.js
# Run with verbose output (show request/response details)
k6 run --http-debug=full script.js
# Run and write results to JSON for analysis
k6 run --out json=results.json script.js
# Parse results with jq
cat results.json | jq 'select(.type == "Point" and .metric == "http_req_duration") | .data.value' | sort -n | awk '{a[NR]=$1} END{print "p95:", a[int(NR*0.95)]}'
# Check if test passed thresholds (exit code 0 = pass, 99 = threshold failed)
k6 run script.js; echo "Exit: $?"
# View k6 summary output only (suppress request logs)
k6 run script.js 2>/dev/null
# Locust headless run
locust -f locustfile.py --headless -u 100 -r 10 --run-time 5m --host https://api.example.com
# Get system CPU/net stats during a test (run in parallel)
watch -n 1 "ss -s && cat /proc/net/softnet_stat | awk '{print \$2}' | paste -sd+ | bc"
Gotcha: Checks Don't Fail the Test¶
Rule: A failed check is not a test failure. If all your checks fail but you have no thresholds, k6 exits 0 (success). Always pair checks with thresholds.
// WRONG: check fails silently, test passes
// Deliberately wrong example: check() only records a pass/fail rate metric.
// With no thresholds defined, k6 exits 0 even if every check fails.
export default function () {
const res = http.get(url); // NOTE(review): `http` and `url` come from the surrounding script — not shown here
check(res, { 'status 200': (r) => r.status === 200 }); // recorded in the `checks` metric only; does not fail the run
}
// RIGHT: threshold makes the test fail if check rate drops
// Pairing the built-in `checks` metric with a threshold makes check
// failures fail the whole run (k6 exits 99 when a threshold is breached).
export const options = {
thresholds: {
'checks': ['rate>0.99'], // fails the run when the check pass rate drops to 99% or below (i.e. 1%+ of checks fail)
},
};
Gotcha: sleep() Hides Real Latency¶
Rule: sleep(1) in a VU-based test artificially caps throughput. Under load, VUs spend most time sleeping, not testing.
// BAD: 50 VUs with 1s sleep = ~50 RPS max, regardless of server capacity
// Deliberately bad example: a closed (VU-based) model where think-time
// dominates. Each iteration = 1 request + 1s idle, so throughput is capped
// at ~1 RPS per VU no matter how fast the server responds.
export default function () {
http.get(url); // one request per iteration
sleep(1);      // full second idle — 50 VUs can never exceed ~50 RPS
}
// BETTER: use arrival rate to model actual traffic
// Open model: k6 starts new iterations at a fixed rate regardless of how
// long responses take, so slow servers show up as latency, not lower RPS.
export const options = {
scenarios: {
load: {
executor: 'constant-arrival-rate',
rate: 100, // exactly 100 RPS
timeUnit: '1s',
duration: '5m',
preAllocatedVUs: 50, // NOTE(review): enough only if responses stay under ~500ms at 100 RPS; add maxVUs headroom for slower responses — confirm
},
},
};
Pattern: Progressive Stress Test¶
Find your breaking point by incrementally increasing load until the service degrades.
// Step-load profile: each level is held for 5m so metrics stabilize before
// the next ramp. abortOnFail stops the test at the breaking point instead
// of hammering an already-degraded service.
export const options = {
stages: [
{ duration: '2m', target: 50 }, // warm up
{ duration: '5m', target: 50 }, // baseline
{ duration: '2m', target: 100 }, // ramp
{ duration: '5m', target: 100 }, // hold
{ duration: '2m', target: 200 }, // push
{ duration: '5m', target: 200 }, // hold
{ duration: '2m', target: 400 }, // stress
{ duration: '5m', target: 400 }, // hold
{ duration: '5m', target: 0 }, // ramp down
],
thresholds: {
http_req_duration: [
// delayAbortEval gives p95 60s to settle after each ramp before the abort is evaluated
{ threshold: 'p(95)<1000', abortOnFail: true, delayAbortEval: '60s' },
],
http_req_failed: [
{ threshold: 'rate<0.05', abortOnFail: true }, // stop once >5% of requests fail
],
},
};
Watch for the stage where p95 starts climbing — that's where you're saturating a resource.
Debug clue: If p95 spikes but p50 stays flat, you have a tail-latency problem (likely GC pauses, connection pool exhaustion, or a slow downstream dependency for some requests). If both p50 and p95 climb together, you are saturating CPU or hitting a global bottleneck like a database lock.
Pattern: Soak Test Setup¶
// Soak profile: moderate, constant load held for hours. The goal is not
// peak throughput but surfacing slow leaks (memory, connections, FDs).
export const options = {
stages: [
{ duration: '5m', target: 50 }, // ramp up
{ duration: '4h', target: 50 }, // soak — watch for memory leak, conn leak
{ duration: '5m', target: 0 }, // ramp down
],
thresholds: {
http_req_duration: ['p(95)<500'],
http_req_failed: ['rate<0.01'],
},
};
War story: A soak test caught a connection pool leak that only manifested after 3 hours. The application opened a new database connection for each request inside a
`try` block but only closed it in the `finally` of a different code path. Unit tests passed (one connection, one close). Under sustained load, the pool slowly grew until PostgreSQL hit `max_connections` and refused all traffic.
What to watch during a soak test:
- RSS memory of the application process (growing = leak)
- Open file descriptors: lsof -p <pid> | wc -l
- Database connection count: SELECT count(*) FROM pg_stat_activity
- Event loop lag (Node.js): process.hrtime() comparison
Pattern: Test Data Management¶
import { SharedArray } from 'k6/data';
// SharedArray: loaded once, shared across all VUs (efficient)
// SharedArray runs the init function once and shares the (read-only) result
// across all VUs, instead of re-parsing the file per VU.
const users = new SharedArray('users', function () {
return JSON.parse(open('./data/users.json'));
});
// NOTE(review): naive CSV parsing — split(',') breaks if any field contains
// a quoted comma; fine for simple fixture files only.
const products = new SharedArray('products', function () {
return open('./data/products.csv').trim().split('\n')
.slice(1) // skip header
.map(line => {
const [id, name, price] = line.split(',');
return { id, name, price: parseFloat(price) };
});
});
export default function () {
// __VU is 1-based; modulo wraps so each VU deterministically maps to one
// user record even when there are more VUs than users.
const user = users[__VU % users.length];
const product = products[Math.floor(Math.random() * products.length)];
// FIX: k6 has no implicit base URL — http.post() requires an absolute URL.
// The original relative '/api/orders' fails at runtime with
// "unsupported protocol scheme".
const res = http.post(`${__ENV.BASE_URL}/api/orders`, JSON.stringify({
user_id: user.id,
product_id: product.id,
}), { headers: { 'Content-Type': 'application/json' } });
check(res, { 'order created': (r) => r.status === 201 });
}
Scenario: API Baseline Before a Deploy¶
Use this to verify you're not introducing a regression.
#!/bin/bash
# pre-deploy-check.sh
# Gate a deploy on a staging load test: k6 exits non-zero (99) when a
# threshold fails, so $? -ne 0 below means the performance SLOs were missed.
# Run load test against staging (same config as prod)
k6 run \
-e BASE_URL=https://staging.api.example.com \
-e DURATION=5m \
--out json=pre-deploy-results.json \
tests/load/api-baseline.js
if [ $? -ne 0 ]; then
echo "FAIL: Pre-deploy load test failed. Blocking deploy."
exit 1
fi
echo "PASS: Staging meets performance requirements."
// tests/load/api-baseline.js
// Baseline scenario: fixed arrival rate so runs are directly comparable
// before and after a deploy.
export const options = {
scenarios: {
baseline: {
executor: 'constant-arrival-rate',
rate: parseInt(__ENV.RATE || '50', 10), // FIX: always pass radix 10 to parseInt
timeUnit: '1s',
duration: __ENV.DURATION || '5m',
preAllocatedVUs: 100,
},
},
thresholds: {
http_req_duration: ['p(95)<300', 'p(99)<500'], // latency regression gates
http_req_failed: ['rate<0.001'], // at most 0.1% failed requests
},
};
const BASE_URL = __ENV.BASE_URL || 'http://localhost:8000';
// NOTE(review): this snippet relies on `http` and `check` imports from
// 'k6/http' / 'k6' that are not shown here — confirm they exist in the file.
export default function () {
// http.batch() issues the requests concurrently within one VU iteration
const responses = http.batch([
['GET', `${BASE_URL}/products`],
['GET', `${BASE_URL}/products/123`],
['GET', `${BASE_URL}/health`],
]);
responses.forEach(res => {
check(res, { 'ok': (r) => r.status === 200 });
});
}
Emergency: Service Is Slow in Production — Reproduce with Load Test¶
# 1. Check what production traffic looks like right now
# (from your access logs or APM tool)
# e.g., 200 RPS, p95 = 800ms, 2% error rate
# 2. Replicate that pattern in staging
cat > /tmp/incident-repro.js << 'EOF'
import http from 'k6/http';
import { check } from 'k6';
// Replay the observed production traffic shape: a fixed 200 RPS arrival
// rate held for 10 minutes against the suspect endpoint.
export const options = {
scenarios: {
repro: {
executor: 'constant-arrival-rate',
rate: 200, // match production RPS
timeUnit: '1s',
duration: '10m',
preAllocatedVUs: 200,
maxVUs: 500, // headroom: k6 adds VUs up to this cap if responses slow down
},
},
thresholds: {
http_req_duration: ['p(95)<200'], // what SHOULD happen
},
};
export default function () {
// Focus on the endpoint that's slow
const res = http.get(`${__ENV.BASE_URL}/api/v2/products`);
check(res, { 'ok': (r) => r.status < 500 }); // only 5xx counts as failure during repro
}
EOF
k6 run -e BASE_URL=https://staging.example.com /tmp/incident-repro.js
# 3. While running, check database
# Slow queries?
psql -c "SELECT query, mean_exec_time, calls FROM pg_stat_statements ORDER BY mean_exec_time DESC LIMIT 10;"
# Connection saturation?
psql -c "SELECT count(*), state FROM pg_stat_activity GROUP BY state;"
# Lock waits?
psql -c "SELECT pid, wait_event_type, wait_event, query FROM pg_stat_activity WHERE wait_event IS NOT NULL;"
Emergency: Stop a Running Test¶
# k6: Ctrl+C in terminal, or send SIGINT
kill -SIGINT $(pgrep k6)
# k6: graceful stop (runs teardown, lets in-flight requests finish) — SIGINT or SIGTERM;
# a second SIGINT force-kills immediately
kill -SIGTERM $(pgrep k6)
# Locust: Ctrl+C in terminal, or
kill -SIGTERM $(pgrep locust)
# If load test is hammering production from CI:
# Cancel the CI job — k6 process dies with the runner
Useful One-Liners¶
# k6: run a quick sanity check (10 VUs, 30s, single URL)
k6 run --vus 10 --duration 30s -e URL=https://api.example.com/health \
<(echo 'import http from "k6/http"; export default ()=>{ http.get(__ENV.URL) }')
# Parse k6 JSON output to get p95 (k6 writes NDJSON, one object per line — slurp with -s)
jq -s '[.[] | select(.type=="Point" and .metric=="http_req_duration") | .data.value] | sort | .[(length * 0.95 | floor)]' results.json
# Count errors from k6 JSON output (NDJSON — slurp with -s)
jq -s '[.[] | select(.type=="Point" and .metric=="http_req_failed" and .data.value==1)] | length' results.json
# Locust: get stats as JSON
curl -s http://localhost:8089/stats/requests | jq '.stats[] | {name: .name, p95: .response_times."95", rps: .current_rps}'
# Calculate per-second request counts (RPS) from access log (chars 1-21 = "[dd/Mon/yyyy:HH:MM:SS")
awk '{print $4}' /var/log/nginx/access.log | cut -c1-21 | sort | uniq -c | sort -rn | head
# Watch live HTTP connection count during a test
watch -n 1 "ss -tn state established '( dport = :443 or dport = :80 )' | wc -l"
# Watch application memory during soak test
watch -n 5 "ps aux --sort=-%mem | grep 'gunicorn\|uvicorn\|node' | head -5"
# k6 with environment variables from .env
set -a && source .env && set +a && k6 run script.js