mirror of https://github.com/easzlab/kubeasz.git

Adjust the cluster backup/restore scripts and docs

parent 45587b76a1
commit 2a35d3e7ac
@@ -3,49 +3,34 @@
 - hosts:
   - etcd
-  roles:
-  - cluster-backup
+  tasks:
+  - name: run the etcd data backup
+    shell: "mkdir -p /etcd_backup && cd /etcd_backup && \
+            ETCDCTL_API=3 {{ bin_dir }}/etcdctl snapshot save snapshot.db"
+    args:
+      warn: false
+
+  - name: fetch the etcd data backup
+    fetch:
+      src: /etcd_backup/snapshot.db
+      dest: "{{ base_dir }}/.cluster/backup/"
+      flat: yes
+    run_once: true

 - hosts:
-  - deploy
+  - localhost
   tasks:
-  - name: Creating backup dirs
-    file: name={{ item }} state=directory
-    with_items:
-    - "{{ base_dir }}/roles/cluster-backup/files/ca"
-    - "{{ base_dir }}/roles/cluster-backup/files/hosts"
-    - "{{ base_dir }}/roles/cluster-backup/files/snapshot"
-
-  - name: Backing up CA sth
-    copy:
-      src: "{{ ca_dir }}/{{ item }}"
-      dest: "{{ base_dir }}/roles/cluster-backup/files/ca/{{ item }}"
-    with_items:
-    - ca.pem
-    - ca-key.pem
-    - ca.csr
-    - ca-csr.json
-    - ca-config.json
-
   - name: Backing up ansible hosts-1
     copy:
       src: "{{ base_dir }}/hosts"
-      dest: "{{ base_dir }}/roles/cluster-backup/files/hosts/hosts"
+      dest: "{{ base_dir }}/.cluster/backup/hosts"
     register: p

   - name: Backing up ansible hosts-2
-    shell: "cd {{ base_dir }}/roles/cluster-backup/files/hosts && \
+    shell: "cd {{ base_dir }}/.cluster/backup && \
             cp -fp hosts hosts-$(date +'%Y%m%d%H%M')"
     when: 'p is changed'

-  - name: Backing up etcd snapshot-1
-    copy:
-      src: "{{ base_dir }}/roles/cluster-backup/files/snapshot.db"
-      dest: "{{ base_dir }}/roles/cluster-backup/files/snapshot/snapshot.db"
-    register: q
-
-  - name: Backing up etcd snapshot-2
-    shell: "cd {{ base_dir }}/roles/cluster-backup/files/ && \
-            mv -f snapshot.db snapshot/snapshot-$(date +'%Y%m%d%H%M').db"
-    when: 'q is changed'
+  - name: Backing up etcd snapshot with datetime
+    shell: "cd {{ base_dir }}/.cluster/backup && \
+            cp -fp snapshot.db snapshot-$(date +'%Y%m%d%H%M').db"
@@ -1,77 +1,6 @@
 # cluster-restore playbook
 # read the guide: 'op/cluster_restore.md'

-# to restore CA sth on 'deploy' node
-- hosts: deploy
-  tasks:
-  - name: Restoring dirs of CA sth
-    file: name=/etc/kubernetes/ssl/ state=directory
-
-  - name: Restoring CA sth
-    copy:
-      src: "{{ base_dir }}/roles/cluster-backup/files/ca/{{ item }}"
-      dest: "{{ ca_dir }}/{{ item }}"
-    with_items:
-    - ca.pem
-    - ca-key.pem
-    - ca.csr
-    - ca-csr.json
-    - ca-config.json
-
-- hosts: deploy
-  roles:
-  - deploy
-
-# pre-tasks on all nodes
-- hosts:
-  - kube-master
-  - kube-node
-  - etcd
-  roles:
-  - prepare
-
-# [optional] only needed by multi-master cluster
-- hosts: lb
-  roles:
-  - lb
-
-# to install etcd cluster
-- hosts: etcd
-  roles:
-  - etcd
-
-# to install docker
-- hosts:
-  - kube-master
-  - kube-node
-  roles:
-  - docker
-
-# to set up 'kube-master' nodes
-- hosts:
-  - kube-master
-  roles:
-  - kube-master
-  - kube-node
-  #
-  tasks:
-  - name: Making master nodes SchedulingDisabled
-    shell: "{{ bin_dir }}/kubectl cordon {{ inventory_hostname }} "
-    when: DEPLOY_MODE != "allinone"
-    ignore_errors: true
-
-  - name: Setting master role name
-    shell: "{{ bin_dir }}/kubectl label node {{ inventory_hostname }} kubernetes.io/role=master --overwrite"
-    ignore_errors: true
-
-# to set up 'kube-node' nodes
-- hosts:
-  - kube-node
-  roles:
-  - kube-node
-
 # to restore data of etcd cluster
 - hosts: etcd
   roles:
   - cluster-restore
@@ -3,127 +3,44 @@
 Even though a K8S cluster can be deployed in a highly available, multi-master, multi-node topology, it is still worth understanding the cluster's backup and disaster-recovery story. In an HA k8s cluster the etcd cluster holds the entire cluster state, so backup and restore come down to the following (a minimal command sketch follows the list):

 - backing up data from the running etcd cluster to files on disk
-- restoring data from an etcd backup file into the running etcd cluster, then rebuilding the whole cluster from it
+- restoring data from an etcd backup file, bringing the cluster back to the state it was in at backup time
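Both bullets ultimately rest on two `etcdctl` v3 sub-commands. As a minimal sketch (assuming an etcdctl v3 binary matching the cluster version is on the PATH), these are the primitives the rest of this guide builds on:

``` bash
# take a point-in-time snapshot of the keyspace into a local file
$ ETCDCTL_API=3 etcdctl snapshot save snapshot.db

# sanity-check a snapshot file (hash, revision, key count, size)
$ ETCDCTL_API=3 etcdctl snapshot status snapshot.db -w table

# rebuild one member's data directory from the snapshot
# (the --name/--initial-cluster flags are covered in the restore steps below)
$ ETCDCTL_API=3 etcdctl snapshot restore snapshot.db
```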
-## Prerequisites
+## Backup and restore instructions

-A k8s cluster can run into problems that it cannot recover from on its own, caused by hardware or software failures or by operator error; in that case the cluster has to be rebuilt from a backup. For a cluster created with the kubeasz project, recovery requires:
-
-- an etcd backup file (etcd V3 data) taken while the cluster was healthy
-- the CA certificate files used when the cluster was created
-- the ansible hosts file used when the cluster was created
-
-## Manual backup and restore procedure
-
-First build a test cluster with kubeasz, deploy a few test deployments, and once everything checks out, take a backup:
-
-- 1. Run the data backup on one etcd node and copy the resulting `snapshot.db` to every node of the etcd cluster (see the sketch right after this block)
-
-``` bash
-$ mkdir -p /backup/k8s/ && cd /backup/k8s
-$ ETCDCTL_API=3 etcdctl snapshot save snapshot.db
-```
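As an illustrative follow-up (the host IPs below are placeholders, not part of the original guide), you can sanity-check the snapshot and push it to the remaining etcd nodes before going further:

``` bash
# verify the snapshot before shipping it around
$ ETCDCTL_API=3 etcdctl snapshot status /backup/k8s/snapshot.db -w table

# copy it to the other etcd members (replace the IPs with your own)
$ for ip in 192.168.1.2 192.168.1.3; do
      ssh root@$ip "mkdir -p /backup/k8s"
      scp /backup/k8s/snapshot.db root@$ip:/backup/k8s/
  done
```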
-- 2. On the deploy node, back up the CA certificate files
-
-``` bash
-$ mkdir -p /backup/k8s/ && cp /etc/kubernetes/ssl/ca* /backup/k8s/
-```
-
-- 3. On the deploy node, clean up the cluster to simulate a complete cluster crash
-
-``` bash
-$ ansible-playbook /etc/ansible/99.clean.yml
-```
-
-- 4. On the deploy node, rebuild the cluster step by step
-
-``` bash
-# restore the original cluster's CA certificate files
-$ mkdir -p /etc/kubernetes/ssl/ && cp /backup/k8s/* /etc/kubernetes/ssl/
-
-# then run the recovery steps, installing up to and including the kube-node stage
-$ cd /etc/ansible
-$ ansible-playbook 01.prepare.yml
-$ ansible-playbook 02.etcd.yml
-$ ansible-playbook 03.docker.yml
-$ ansible-playbook 04.kube-master.yml
-$ ansible-playbook 05.kube-node.yml
-
-# once the steps above check out, stop the etcd cluster service and wipe the new etcd cluster's data directory
-$ ansible etcd -m service -a 'name=etcd state=stopped'
-$ ansible etcd -m file -a 'name=/var/lib/etcd/member/ state=absent'
-```
-
-- 5. Log in to each etcd node in turn and restore the backed-up data by hand; do the following on every etcd node (a filled-in example follows the block)
-
-``` bash
-# consult this etcd node's /etc/systemd/system/etcd.service unit file, substitute the {{ }} variables below, then run
-$ cd /backup/k8s/
-$ ETCDCTL_API=3 etcdctl snapshot restore snapshot.db \
-      --name {{ NODE_NAME }} \
-      --initial-cluster {{ ETCD_NODES }} \
-      --initial-cluster-token etcd-cluster-0 \
-      --initial-advertise-peer-urls https://{{ inventory_hostname }}:2380
-
-# this produces a {{ NODE_NAME }}.etcd directory; copy its member subdirectory into the etcd data directory
-$ cp -r {{ NODE_NAME }}.etcd/member /var/lib/etcd/
-
-$ systemctl restart etcd
-```
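For illustration only, with hypothetical values substituted in (a three-member cluster named etcd1/etcd2/etcd3 on 192.168.1.1-3), the command run on the first node would look roughly like this:

``` bash
$ cd /backup/k8s/
$ ETCDCTL_API=3 etcdctl snapshot restore snapshot.db \
      --name etcd1 \
      --initial-cluster etcd1=https://192.168.1.1:2380,etcd2=https://192.168.1.2:2380,etcd3=https://192.168.1.3:2380 \
      --initial-cluster-token etcd-cluster-0 \
      --initial-advertise-peer-urls https://192.168.1.1:2380

$ cp -r etcd1.etcd/member /var/lib/etcd/
$ systemctl restart etcd
```

Only `--name` and `--initial-advertise-peer-urls` change on the other members.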
-- 6. On the deploy node, rebuild the cluster network
-
-``` bash
-$ ansible-playbook /etc/ansible/tools/change_k8s_network.yml
-```
-
-Once this finishes, verify that the whole cluster is healthy again and that all of the earlier test deployments have come back.
-
-- Reference: https://github.com/coreos/etcd/blob/master/Documentation/op-guide/recovery.md
-
-## Scripted backup and restore guide
-
-- I. Cluster backup
+- 1. First build a test cluster, deploy a few test deployments, and once everything checks out, take a backup:

 ``` bash
 $ ansible-playbook /etc/ansible/23.backup.yml
 ```

-When it finishes, you can inspect the backups under `/etc/ansible/roles/cluster-backup/files`, for example:
+When it finishes, you can inspect the backups in the backup directory, for example:

 ``` bash
-roles/cluster-backup/files/
-├── ca                          # cluster CA backups
-│   ├── ca-config.json
-│   ├── ca.csr
-│   ├── ca-csr.json
-│   ├── ca-key.pem
-│   └── ca.pem
-├── hosts                       # ansible hosts backups
-│   ├── hosts                   # most recent backup
-│   └── hosts-201807231642
-├── readme.md
-└── snapshot                    # etcd data backups
-    ├── snapshot-201807231642.db
-    └── snapshot.db             # most recent backup
+/etc/ansible/.cluster/backup/
+├── hosts
+├── hosts-201907030954
+├── snapshot-201907030954.db
+├── snapshot-201907031048.db
+└── snapshot.db
 ```

-- II. Simulating a cluster failure
+- 2. Simulate an accidental deletion (omitted)

 ``` bash
 $ ansible-playbook /etc/ansible/99.clean.yml
 ```
+- 3. Restore the cluster and verify

 **Note** To simulate a complete cluster crash, the whole cluster is wiped here; in practice, as long as a backup exists, it is also advisable to clean the cluster completely before attempting a restore.

-- III. Cluster restore
-
-The etcd backup version to restore can be configured in `roles/cluster-restore/defaults/main.yml`; by default the most recent backup is used
+The etcd backup version to restore (picked from the backup directory shown above) can be configured in `roles/cluster-restore/defaults/main.yml`; by default the most recent backup is used. After the restore runs, allow some time for pods, services, and other resources to be recreated.
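For instance, to roll back to one of the dated snapshots from the listing above rather than the latest one, the variable can be set like this (one way of doing it; the file name is taken from the example listing):

``` bash
# pick a specific snapshot instead of the default snapshot.db
$ sed -i 's/^db_to_restore:.*/db_to_restore: "snapshot-201907030954.db"/' \
      /etc/ansible/roles/cluster-restore/defaults/main.yml
```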
 ``` bash
 $ ansible-playbook /etc/ansible/24.restore.yml
 $ ansible-playbook /etc/ansible/tools/change_k8s_network.yml
 ```

 If core cluster components (master/etcd/node) run into a problem that cannot be recovered from in place, you can try the sequence [clean]() --> [setup]() --> [restore]():

 ``` bash
 $ ansible-playbook /etc/ansible/99.clean.yml
 $ ansible-playbook /etc/ansible/90.setup.yml
 $ ansible-playbook /etc/ansible/24.restore.yml
 ```

 When this completes, verify that the whole cluster is back to the way it was!
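A quick way to check, assuming kubectl on the control node is already pointed at the restored cluster, is to compare what comes back with what was deployed before the backup:

``` bash
# all nodes should report Ready again
$ kubectl get node

# the test deployments and their pods should be recreated
$ kubectl get deployment,pod --all-namespaces -o wide
```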
 ## Reference

 - https://github.com/coreos/etcd/blob/master/Documentation/op-guide/recovery.md
@@ -1,3 +0,0 @@
-# Cluster backup directory
-
-This directory is reserved for k8s cluster backups.
@@ -1,14 +0,0 @@
-- name: prepare the backup directory
-  file: name=/backup/k8s state=directory
-
-- name: run the etcd data backup
-  shell: "cd /backup/k8s && \
-          ETCDCTL_API=3 {{ bin_dir }}/etcdctl snapshot save snapshot.db"
-
-- name: fetch the etcd data backup
-  fetch:
-    src: /backup/k8s/snapshot.db
-    dest: /etc/ansible/roles/cluster-backup/files/
-    flat: yes
-  run_once: true
@@ -1,4 +1,5 @@
 # the etcd data backup to restore; the most recent backup is used by default
+# the backup directory on the ansible control node is /etc/ansible/.cluster/backup
 db_to_restore: "snapshot.db"

 # IPs and ports for etcd cluster-internal communication, generated automatically from the etcd group members
@@ -5,18 +5,18 @@
   file: name=/var/lib/etcd/member state=absent

 - name: create the backup staging directory
-  file: name=/backup/k8s state=directory
+  file: name=/etcd_backup state=directory

 - name: stage the selected etcd backup
   copy:
-    src: "{{ base_dir }}/roles/cluster-backup/files/snapshot/{{ db_to_restore }}"
-    dest: "/backup/k8s/snapshot.db"
+    src: "{{ base_dir }}/.cluster/backup/{{ db_to_restore }}"
+    dest: "/etcd_backup/snapshot.db"

-- name: clean up data produced by the previous restore
-  file: name=/backup/k8s/{{ NODE_NAME }}.etcd state=absent
+- name: clean up data from the last backup restore
+  file: name=/etcd_backup/{{ NODE_NAME }}.etcd state=absent

 - name: restore the etcd data
-  shell: "cd /backup/k8s && \
+  shell: "cd /etcd_backup && \
          ETCDCTL_API=3 {{ bin_dir }}/etcdctl snapshot restore snapshot.db \
          --name {{ NODE_NAME }} \
          --initial-cluster {{ ETCD_NODES }} \
@@ -24,7 +24,7 @@
          --initial-advertise-peer-urls https://{{ inventory_hostname }}:2380"

 - name: copy the restored data into the etcd data directory
-  shell: "cp -rf /backup/k8s/{{ NODE_NAME }}.etcd/member /var/lib/etcd/"
+  shell: "cp -rf /etcd_backup/{{ NODE_NAME }}.etcd/member /var/lib/etcd/"

 - name: restart the etcd service
   service: name=etcd state=restarted
@@ -2,6 +2,7 @@
   file: name={{ item }} state=directory
   with_items:
   - "{{ base_dir }}/.cluster/ssl"
+  - "{{ base_dir }}/.cluster/backup"

 - name: set permissions on the local bin directory
   file: path={{ base_dir }}/bin state=directory mode=0755 recurse=yes
@@ -307,17 +307,17 @@ function list() {
   [ -f "$BASEPATH/.cluster/current_cluster" ] || { echo "[ERROR] invalid context, run 'easzctl checkout <cluster_name>' first"; return 1; }
   CLUSTER=$(cat $BASEPATH/.cluster/current_cluster)
   echo -e "\nlist of managed contexts (current: \033[33m$CLUSTER\033[0m)"
-  i=1; for Cluster in $(ls $BASEPATH/.cluster/ |grep -Ev "ssl|current_cluster|kubeconfig");
+  i=1; for c in $(ls $BASEPATH/.cluster/ |grep -Ev "backup|ssl|current_cluster|kubeconfig");
   do
-    echo -e "==> context $i:\t$Cluster"
+    echo -e "==> context $i:\t$c"
     let "i++"
   done
   echo -e "\nlist of installed clusters (current: \033[33m$CLUSTER\033[0m)"
-  i=1; for Cluster in $(ls $BASEPATH/.cluster/ |grep -Ev "ssl|current_cluster|kubeconfig");
+  i=1; for c in $(ls $BASEPATH/.cluster/ |grep -Ev "backup|ssl|current_cluster|kubeconfig");
   do
-    KUBECONF=$BASEPATH/.cluster/$Cluster/config
+    KUBECONF=$BASEPATH/.cluster/$c/config
     if [ -f "$KUBECONF" ]; then
-      echo -e "==> cluster $i:\t$Cluster"
+      echo -e "==> cluster $i:\t$c"
       $BASEPATH/bin/kubectl --kubeconfig=$KUBECONF get node
     fi
     let "i++"