Kubernetes Recovery from Master Failure with Kubeadm
# Backup Kubernetes PKI
#!/bin/sh
KUBE_CERT_DIR="/etc/kubernetes/pki"
BACKUP_CERT_DIR="/backup"
if [ ! -d $BACKUP_CERT_DIR ]
then
mkdir -p $BACKUP_CERT_DIR
fi
cp -a $KUBE_CERT_DIR $BACKUP_CERT_DIR
# Backup Kubernetes Etcd database with a CronJob object
The CronJob must be scheduled to Kubernetes Master node.
apiVersion: batch/v1beta1
kind: CronJob
metadata:
name: etcd-snapshot
namespace: kube-system
spec:
schedule: "59 23 * * *"
jobTemplate:
spec:
template:
spec:
containers:
- name: backup
# Same image as in /etc/kubernetes/manifests/etcd.yaml
image: k8s.gcr.io/etcd:3.3.10
env:
- name: ETCDCTL_API
value: "3"
command: ["/bin/sh"]
args: ["-c", "etcdctl --endpoints=https://127.0.0.1:2379 --cacert=/etc/kubernetes/pki/etcd/ca.crt --cert=/etc/kubernetes/pki/etcd/healthcheck-client.crt --key=/etc/kubernetes/pki/etcd/healthcheck-client.key snapshot save /backup/etcd/snapshot-$(date +%Y-%m-%d_%H-%M-%S_%Z).db"]
volumeMounts:
- mountPath: /etc/kubernetes/pki/etcd
name: etcd-certs
readOnly: true
- mountPath: /backup/etcd
name: backup
hostNetwork: true
restartPolicy: OnFailure
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: node-role.kubernetes.io/master
operator: Exists
tolerations:
- effect: NoSchedule
operator: Exists
volumes:
- name: etcd-certs
hostPath:
path: /etc/kubernetes/pki/etcd
type: Directory
- name: backup
hostPath:
path: /backup/etcd
type: DirectoryOrCreate
# Recovery from Master Failure with Kubeadm
#/bin/sh
#####################################
# Recovery from Master Failure #
#####################################
if kubectl cluster-info > /dev/null 2>&1; then
kubectl cluster-info
exit 0
fi
# Restore Kubernetes certificates
KUBE_CERT_DIR="/etc/kubernetes"
BACKUP_CERT_DIR="/backup/pki"
mkdir -p $KUBE_CERT_DIR
cp -a $BACKUP_CERT_DIR $KUBE_CERT_DIR
# Restore etcd data
ETCD_IMAGE="k8s.gcr.io/etcd:3.3.10"
# ETCD_SNAPSHOT="snapshot-2019-05-21_06-48-06_UTC.db"
ETCD_SNAPSHOT="snapshot-2019-05-21_09-05-11_UTC.db"
mkdir -p /var/lib/etcd
docker run --rm \
-v '/backup:/backup' \
-v '/var/lib/etcd:/var/lib/etcd' \
--env ETCDCTL_API=3 \
"$ETCD_IMAGE" \
/bin/sh -c "etcdctl snapshot restore '/backup/etcd/$ETCD_SNAPSHOT' ; mv /default.etcd/member/ /var/lib/etcd/"
# Init Kubernetes with Kubeadm
kubeadm init \
--ignore-preflight-errors=DirAvailable--var-lib-etcd