Portal | Level: L2: Operations | Topics: etcd | Domain: Kubernetes

Runbook: etcd Backup & Restore

Symptoms

  • Cluster data loss or corruption
  • Need to recover from catastrophic failure
  • Accidental deletion of critical resources
  • etcd database space exceeded

Fast Triage

# Check etcd health
ETCDCTL_API=3 etcdctl endpoint health \
  --endpoints=https://127.0.0.1:2379 \
  --cacert=/etc/kubernetes/pki/etcd/ca.crt \
  --cert=/etc/kubernetes/pki/etcd/healthcheck-client.crt \
  --key=/etc/kubernetes/pki/etcd/healthcheck-client.key

# Check DB size
ETCDCTL_API=3 etcdctl endpoint status --write-out=table \
  --endpoints=https://127.0.0.1:2379 \
  --cacert=/etc/kubernetes/pki/etcd/ca.crt \
  --cert=/etc/kubernetes/pki/etcd/healthcheck-client.crt \
  --key=/etc/kubernetes/pki/etcd/healthcheck-client.key

# Check latest backup age
ls -la /var/backups/etcd/ | tail -5
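
If the DB size looks close to its quota, also list active alarms; etcd raises a NOSPACE alarm when the space quota is exceeded (same TLS flags as above):

# List active etcd alarms (NOSPACE = space quota exceeded)
ETCDCTL_API=3 etcdctl alarm list \
  --endpoints=https://127.0.0.1:2379 \
  --cacert=/etc/kubernetes/pki/etcd/ca.crt \
  --cert=/etc/kubernetes/pki/etcd/healthcheck-client.crt \
  --key=/etc/kubernetes/pki/etcd/healthcheck-client.key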

Backup Procedure

Manual Snapshot

ETCDCTL_API=3 etcdctl snapshot save /var/backups/etcd/snapshot-$(date +%Y%m%d-%H%M%S).db \
  --endpoints=https://127.0.0.1:2379 \
  --cacert=/etc/kubernetes/pki/etcd/ca.crt \
  --cert=/etc/kubernetes/pki/etcd/healthcheck-client.crt \
  --key=/etc/kubernetes/pki/etcd/healthcheck-client.key

# Verify the snapshot just taken (snapshot status accepts a single file,
# so pick the newest rather than passing a glob)
SNAPSHOT=$(ls -t /var/backups/etcd/snapshot-*.db | head -1)
ETCDCTL_API=3 etcdctl snapshot status "$SNAPSHOT" --write-out=table

Automated Backup (CronJob)

apiVersion: batch/v1
kind: CronJob
metadata:
  name: etcd-backup
  namespace: kube-system
spec:
  schedule: "0 */6 * * *"  # Every 6 hours
  jobTemplate:
    spec:
      template:
        spec:
          hostNetwork: true
          # Host PKI keys are root-only; the bitnami image runs as
          # non-root by default, so run this pod as root to read them
          securityContext:
            runAsUser: 0
          nodeSelector:
            node-role.kubernetes.io/control-plane: ""
          tolerations:
            - key: node-role.kubernetes.io/control-plane
              operator: Exists
              effect: NoSchedule
          containers:
            - name: backup
              image: bitnami/etcd:3.5
              command:
                - /bin/sh
                - -c
                - |
                  set -e  # do not prune old snapshots if the new one failed
                  etcdctl snapshot save /backup/snapshot-$(date +%Y%m%d-%H%M%S).db \
                    --endpoints=https://127.0.0.1:2379 \
                    --cacert=/etc/kubernetes/pki/etcd/ca.crt \
                    --cert=/etc/kubernetes/pki/etcd/healthcheck-client.crt \
                    --key=/etc/kubernetes/pki/etcd/healthcheck-client.key
                  # Retention: delete snapshots older than 7 days
                  find /backup -name "*.db" -mtime +7 -delete
              volumeMounts:
                - name: etcd-certs
                  mountPath: /etc/kubernetes/pki/etcd
                  readOnly: true
                - name: backup
                  mountPath: /backup
          volumes:
            - name: etcd-certs
              hostPath:
                path: /etc/kubernetes/pki/etcd
            - name: backup
              hostPath:
                path: /var/backups/etcd
          restartPolicy: OnFailure
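
To confirm the CronJob works end to end, apply it and trigger a one-off run (the manifest filename here is illustrative):

kubectl apply -f etcd-backup-cronjob.yaml
kubectl create job --from=cronjob/etcd-backup etcd-backup-manual -n kube-system
kubectl logs -n kube-system job/etcd-backup-manual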

Restore Procedure

WARNING: Restore replaces ALL cluster data with the snapshot. This is destructive. The steps below assume a single-member etcd (the kubeadm default with one control-plane node); a multi-member cluster must be restored on every member with a matching --initial-cluster.

Step 1: Stop kube-apiserver

# kubeadm clusters: remove the static pod manifest (kubelet stops the pod)
mv /etc/kubernetes/manifests/kube-apiserver.yaml /tmp/

# k3s:
systemctl stop k3s

Step 2: Stop etcd

# kubeadm:
mv /etc/kubernetes/manifests/etcd.yaml /tmp/

# Standalone:
systemctl stop etcd
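
Before restoring, confirm the static pods are actually gone; kubelet can take a few seconds to tear them down:

# kubeadm: expect no output from either command
crictl ps --name kube-apiserver
crictl ps --name etcd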

Step 3: Restore snapshot

# Note: --data-dir must not already exist; restore creates it
ETCDCTL_API=3 etcdctl snapshot restore /var/backups/etcd/snapshot-YYYYMMDD.db \
  --data-dir=/var/lib/etcd-restored \
  --name=$(hostname) \
  --initial-cluster=$(hostname)=https://$(hostname):2380 \
  --initial-advertise-peer-urls=https://$(hostname):2380
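
On etcd 3.5 and later, etcdctl snapshot restore is deprecated in favor of etcdutl; the equivalent invocation is:

etcdutl snapshot restore /var/backups/etcd/snapshot-YYYYMMDD.db \
  --data-dir=/var/lib/etcd-restored \
  --name=$(hostname) \
  --initial-cluster=$(hostname)=https://$(hostname):2380 \
  --initial-advertise-peer-urls=https://$(hostname):2380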

Step 4: Replace data directory

mv /var/lib/etcd /var/lib/etcd-old-$(date +%Y%m%d)
mv /var/lib/etcd-restored /var/lib/etcd
chown -R etcd:etcd /var/lib/etcd  # standalone systemd etcd only; kubeadm static pods run as root

Step 5: Restart services

# kubeadm:
mv /tmp/etcd.yaml /etc/kubernetes/manifests/
# Wait for etcd to come up (or poll health; see the sketch below)
sleep 30
mv /tmp/kube-apiserver.yaml /etc/kubernetes/manifests/

# k3s:
systemctl start k3s
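
Instead of a fixed sleep, you can poll until etcd answers (same TLS flags as Fast Triage):

until ETCDCTL_API=3 etcdctl endpoint health \
    --endpoints=https://127.0.0.1:2379 \
    --cacert=/etc/kubernetes/pki/etcd/ca.crt \
    --cert=/etc/kubernetes/pki/etcd/healthcheck-client.crt \
    --key=/etc/kubernetes/pki/etcd/healthcheck-client.key; do
  sleep 5
done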

Step 6: Verify

# etcd health (same TLS flags as Fast Triage)
ETCDCTL_API=3 etcdctl endpoint health \
  --endpoints=https://127.0.0.1:2379 \
  --cacert=/etc/kubernetes/pki/etcd/ca.crt \
  --cert=/etc/kubernetes/pki/etcd/healthcheck-client.crt \
  --key=/etc/kubernetes/pki/etcd/healthcheck-client.key

# API server responding
kubectl get nodes
kubectl get pods -A

# Resources created after the snapshot was taken are lost;
# re-apply them from manifests or your GitOps source.
# Upstream guidance also recommends restarting components that may hold
# stale state (kube-scheduler, kube-controller-manager, kubelet).

Verification

# etcd healthy
ETCDCTL_API=3 etcdctl endpoint health \
  --endpoints=https://127.0.0.1:2379 \
  --cacert=/etc/kubernetes/pki/etcd/ca.crt \
  --cert=/etc/kubernetes/pki/etcd/healthcheck-client.crt \
  --key=/etc/kubernetes/pki/etcd/healthcheck-client.key
# Expected: is healthy

# All nodes ready
kubectl get nodes
# Expected: all Ready

# Workloads recovered
kubectl get pods -A | grep -v Running | grep -v Completed
# Expected: no unexpected states

Monitoring

# Alert: etcd backup too old
- alert: EtcdBackupTooOld
  expr: time() - etcd_last_backup_timestamp_seconds > 86400
  labels:
    severity: warning
  annotations:
    summary: "etcd backup is older than 24 hours"

# Alert: etcd backup failed (kube_job_status_failed requires kube-state-metrics)
- alert: EtcdBackupFailed
  expr: kube_job_status_failed{job_name=~"etcd-backup.*"} > 0
  labels:
    severity: critical
  annotations:
    summary: "etcd backup job failed"
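
A minimal sketch of exporting the timestamp metric from the backup script, assuming a Prometheus Pushgateway reachable at PUSHGATEWAY_URL (the URL variable and metric name are conventions for this runbook, not built-ins):

# After a successful snapshot, record the backup time in the Pushgateway
cat <<EOF | curl --data-binary @- "${PUSHGATEWAY_URL}/metrics/job/etcd-backup"
# TYPE etcd_last_backup_timestamp_seconds gauge
etcd_last_backup_timestamp_seconds $(date +%s)
EOF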
