diff --git a/23.backup.yml b/23.backup.yml
index 3abad10..ed15cf2 100644
--- a/23.backup.yml
+++ b/23.backup.yml
@@ -3,49 +3,34 @@
 - hosts:
   - etcd
-  roles:
-  - cluster-backup
+  tasks:
+  - name: Backing up etcd data
+    shell: "mkdir -p /etcd_backup && cd /etcd_backup && \
+            ETCDCTL_API=3 {{ bin_dir }}/etcdctl snapshot save snapshot.db"
+    args:
+      warn: false
+
+  - name: Fetching the etcd backup
+    fetch:
+      src: /etcd_backup/snapshot.db
+      dest: "{{ base_dir }}/.cluster/backup/"
+      flat: yes
+    run_once: true
 
 - hosts:
-  - deploy
+  - localhost
   tasks:
-  - name: Creating backup dirs
-    file: name={{ item }} state=directory
-    with_items:
-    - "{{ base_dir }}/roles/cluster-backup/files/ca"
-    - "{{ base_dir }}/roles/cluster-backup/files/hosts"
-    - "{{ base_dir }}/roles/cluster-backup/files/snapshot"
-
-  - name: Backing up CA sth
-    copy:
-      src: "{{ ca_dir }}/{{ item }}"
-      dest: "{{ base_dir }}/roles/cluster-backup/files/ca/{{ item }}"
-    with_items:
-    - ca.pem
-    - ca-key.pem
-    - ca.csr
-    - ca-csr.json
-    - ca-config.json
-
   - name: Backing up ansible hosts-1
     copy:
       src: "{{ base_dir }}/hosts"
-      dest: "{{ base_dir }}/roles/cluster-backup/files/hosts/hosts"
+      dest: "{{ base_dir }}/.cluster/backup/hosts"
     register: p
 
   - name: Backing up ansible hosts-2
-    shell: "cd {{ base_dir }}/roles/cluster-backup/files/hosts && \
+    shell: "cd {{ base_dir }}/.cluster/backup && \
            cp -fp hosts hosts-$(date +'%Y%m%d%H%M')"
     when: 'p is changed'
 
-  - name: Backing up etcd snapshot-1
-    copy:
-      src: "{{ base_dir }}/roles/cluster-backup/files/snapshot.db"
-      dest: "{{ base_dir }}/roles/cluster-backup/files/snapshot/snapshot.db"
-    register: q
-
-  - name: Backing up etcd snapshot-2
-    shell: "cd {{ base_dir }}/roles/cluster-backup/files/ && \
-           mv -f snapshot.db snapshot/snapshot-$(date +'%Y%m%d%H%M').db"
-    when: 'q is changed'
-
+  - name: Backing up etcd snapshot with datetime
+    shell: "cd {{ base_dir }}/.cluster/backup && \
+           cp -fp snapshot.db snapshot-$(date +'%Y%m%d%H%M').db"
diff --git a/24.restore.yml b/24.restore.yml
index 5cac9e2..8d83ad9 100644
--- a/24.restore.yml
+++ b/24.restore.yml
@@ -1,77 +1,6 @@
 # cluster-restore playbook
 # read the guide: 'op/cluster_restore.md'
 
-# to restore CA sth on 'deploy' node
-- hosts: deploy
-  tasks:
-  - name: Restoring dirs of CA sth
-    file: name=/etc/kubernetes/ssl/ state=directory
-
-  - name: Restoring CA sth
-    copy:
-      src: "{{ base_dir }}/roles/cluster-backup/files/ca/{{ item }}"
-      dest: "{{ ca_dir }}/{{ item }}"
-    with_items:
-    - ca.pem
-    - ca-key.pem
-    - ca.csr
-    - ca-csr.json
-    - ca-config.json
-
-- hosts: deploy
-  roles:
-  - deploy
-
-# pre-tasks on all nodes
-- hosts:
-  - kube-master
-  - kube-node
-  - etcd
-  roles:
-  - prepare
-
-# [optional] only needed by multi-master cluster
-- hosts: lb
-  roles:
-  - lb
-
-# to install etcd cluster
-- hosts: etcd
-  roles:
-  - etcd
-
-# to install docker
-- hosts:
-  - kube-master
-  - kube-node
-  roles:
-  - docker
-
-# to set up 'kube-master' nodes
-- hosts:
-  - kube-master
-  roles:
-  - kube-master
-  - kube-node
-  #
-  tasks:
-  - name: Making master nodes SchedulingDisabled
-    shell: "{{ bin_dir }}/kubectl cordon {{ inventory_hostname }} "
-    when: DEPLOY_MODE != "allinone"
-    ignore_errors: true
-
-  - name: Setting master role name
-    shell: "{{ bin_dir }}/kubectl label node {{ inventory_hostname }} kubernetes.io/role=master --overwrite"
-    ignore_errors: true
-
-# to set up 'kube-node' nodes
-- hosts:
-  - kube-node
-  roles:
-  - kube-node
-
-# to restore data of etcd cluster
 - hosts: etcd
   roles:
   - cluster-restore
-
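For reference, the reworked backup play above reduces to a single etcdctl call on one etcd node plus a fetch back to the control node. A minimal manual sketch of the same steps (assuming the kubeasz defaults of `bin_dir=/opt/kube/bin` and `base_dir=/etc/ansible`):

``` bash
# on one etcd node: take an etcd v3 snapshot
mkdir -p /etcd_backup && cd /etcd_backup
ETCDCTL_API=3 /opt/kube/bin/etcdctl snapshot save snapshot.db

# the play then fetches the file back to the control node as
#   /etc/ansible/.cluster/backup/snapshot.db
# and keeps a dated copy alongside it, e.g. snapshot-201907030954.db
```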
diff --git a/docs/op/cluster_restore.md b/docs/op/cluster_restore.md
index 967e580..0b69a9f 100644
--- a/docs/op/cluster_restore.md
+++ b/docs/op/cluster_restore.md
@@ -3,127 +3,44 @@
 Although a K8S cluster can be deployed in a highly available, multi-master/multi-node configuration, it is still worth understanding the cluster's backup and disaster-recovery capabilities. In a highly available k8s cluster, the etcd cluster holds the state of the entire cluster, so backup and restore here focus on:
 
 - backing up data from the running etcd cluster to files on disk
-- restoring data from an etcd backup file into a running etcd cluster, then rebuilding the whole cluster from it
+- restoring data from an etcd backup file, returning the cluster to its state at backup time
 
-## Prerequisites
+## Backup and restore instructions
 
-A k8s cluster may run into problems it cannot self-heal from, whether due to hardware/software failures or operator error; at that point the cluster may need to be rebuilt from a backup. For a cluster created with the kubeasz project, a restore requires:
-
-- an etcd backup file taken while the cluster was healthy (etcd V3 data)
-- the CA certificate files used when the cluster was created
-- the ansible hosts file used when the cluster was created
-
-## Manual backup and restore
-
-First build a test cluster with kubeasz, deploy a few test deployments, and verify everything is working, then take a backup:
-
-- 1. Run the data backup on one etcd node and copy the resulting `snapshot.db` to all etcd cluster nodes
-
-``` bash
-$ mkdir -p /backup/k8s/ && cd /backup/k8s
-$ ETCDCTL_API=3 etcdctl snapshot save snapshot.db
-```
-
-- 2. On the deploy node, back up the CA certificates and related files
-
-``` bash
-$ mkdir -p /backup/k8s/ && cp /etc/kubernetes/ssl/ca* /backup/k8s/
-```
-
-- 3. On the deploy node, clean up the cluster to simulate a total cluster crash
-
-``` bash
-$ ansible-playbook /etc/ansible/99.clean.yml
-```
-
-- 4. On the deploy node, rebuild the cluster step by step
-
-``` bash
-# restore the original cluster's CA certificates
-$ mkdir -p /etc/kubernetes/ssl/ && cp /backup/k8s/* /etc/kubernetes/ssl/
-
-# then run the cluster setup steps, up to and including kube-node
-$ cd /etc/ansible
-$ ansible-playbook 01.prepare.yml
-$ ansible-playbook 02.etcd.yml
-$ ansible-playbook 03.docker.yml
-$ ansible-playbook 04.kube-master.yml
-$ ansible-playbook 05.kube-node.yml
-
-# once the above is verified, stop the etcd services and clear the new etcd cluster's data directory
-$ ansible etcd -m service -a 'name=etcd state=stopped'
-$ ansible etcd -m file -a 'name=/var/lib/etcd/member/ state=absent'
-```
-
-- 5. Manually log in to each etcd node and restore the backup data; every etcd node needs the following
-
-``` bash
-# refer to this node's /etc/systemd/system/etcd.service and substitute the {{ }} variables below before running
-$ cd /backup/k8s/
-$ ETCDCTL_API=3 etcdctl snapshot restore snapshot.db \
-    --name {{ NODE_NAME }} \
-    --initial-cluster {{ ETCD_NODES }} \
-    --initial-cluster-token etcd-cluster-0 \
-    --initial-advertise-peer-urls https://{{ inventory_hostname }}:2380
-
-# this creates a {{ NODE_NAME }}.etcd directory; copy its member subdirectory into the etcd data directory
-$ cp -r {{ NODE_NAME }}.etcd/member /var/lib/etcd/
-
-$ systemctl restart etcd
-```
-
-- 6. On the deploy node, rebuild the cluster network
-
-``` bash
-$ ansible-playbook /etc/ansible/tools/change_k8s_network.yml
-```
-
-Afterwards, verify that the whole cluster is healthy again and that the earlier test deployments have all been recovered.
-
-- Reference: https://github.com/coreos/etcd/blob/master/Documentation/op-guide/recovery.md
-
-## Scripted backup and restore guide
-
-- 1. Cluster backup
+- 1. First build a test cluster, deploy a few test deployments, and verify everything is working, then take a backup:
 
 ``` bash
 $ ansible-playbook /etc/ansible/23.backup.yml
 ```
 
-When it finishes, check the backups under `/etc/ansible/roles/cluster-backup/files`, for example:
+When it finishes, check the contents of the backup directory, for example:
 
-``` bash
-roles/cluster-backup/files/
-├── ca                # cluster CA backups
-│   ├── ca-config.json
-│   ├── ca.csr
-│   ├── ca-csr.json
-│   ├── ca-key.pem
-│   └── ca.pem
-├── hosts             # ansible hosts backups
-│   ├── hosts         # the most recent backup
-│   └── hosts-201807231642
-├── readme.md
-└── snapshot          # etcd data backups
-    ├── snapshot-201807231642.db
-    └── snapshot.db   # the most recent backup
+```
+/etc/ansible/.cluster/backup/
+├── hosts
+├── hosts-201907030954
+├── snapshot-201907030954.db
+├── snapshot-201907031048.db
+└── snapshot.db
 ```
 
-- 2. Simulating a cluster failure
+- 2. Simulate an accidental deletion (omitted)
 
-``` bash
-$ ansible-playbook /etc/ansible/99.clean.yml
-```
+- 3. Restore the cluster and verify
 
-**Note** To simulate a total cluster crash, the whole cluster is cleaned here; in practice, as long as a backup exists, it is also advisable to clean the cluster completely before attempting a restore
-
-- 3. Cluster restore
-
-The etcd backup version to restore can be configured in `roles/cluster-restore/defaults/main.yml`; by default the most recent backup is used
+The etcd backup version to restore (picked from the backup directory listed above) can be configured in `roles/cluster-restore/defaults/main.yml`; by default the most recent backup is used. After the restore runs, allow some time for pods, services and other resources to be recreated.
 
 ``` bash
 $ ansible-playbook /etc/ansible/24.restore.yml
-$ ansible-playbook /etc/ansible/tools/change_k8s_network.yml
+```
+
+If the cluster's main components (master/etcd/node) hit unrecoverable problems, try the following sequence: [clean]() --> [create]() --> [restore]()
+
+``` bash
+$ ansible-playbook /etc/ansible/99.clean.yml
+$ ansible-playbook /etc/ansible/90.setup.yml
+$ ansible-playbook /etc/ansible/24.restore.yml
 ```
-When this completes, you can verify that the whole cluster is restored to its original state!
+
+## References
+
+- https://github.com/coreos/etcd/blob/master/Documentation/op-guide/recovery.md
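One detail worth noting from the updated docs: `db_to_restore` defaults to `snapshot.db`, the most recent backup. To roll back to a specific point in time, pick a timestamped file from the backup directory; since role defaults have the lowest variable precedence in Ansible, it can also be overridden on the command line instead of editing the file (a sketch using the example filename from the listing above):

``` bash
# restore the 2019-07-03 09:54 snapshot instead of the latest one
ansible-playbook /etc/ansible/24.restore.yml -e db_to_restore=snapshot-201907030954.db
```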
diff --git a/roles/cluster-backup/files/readme.md b/roles/cluster-backup/files/readme.md
deleted file mode 100644
index a742c3d..0000000
--- a/roles/cluster-backup/files/readme.md
+++ /dev/null
@@ -1,3 +0,0 @@
-# Cluster backup directory
-
-This directory is reserved for k8s cluster backups.
diff --git a/roles/cluster-backup/tasks/main.yml b/roles/cluster-backup/tasks/main.yml
deleted file mode 100644
index cbecb2a..0000000
--- a/roles/cluster-backup/tasks/main.yml
+++ /dev/null
@@ -1,14 +0,0 @@
-- name: Preparing the backup directory
-  file: name=/backup/k8s state=directory
-
-- name: Backing up etcd data
-  shell: "cd /backup/k8s && \
-         ETCDCTL_API=3 {{ bin_dir }}/etcdctl snapshot save snapshot.db"
-
-- name: Fetching the etcd backup
-  fetch:
-    src: /backup/k8s/snapshot.db
-    dest: /etc/ansible/roles/cluster-backup/files/
-    flat: yes
-  run_once: true
-
diff --git a/roles/cluster-restore/defaults/main.yml b/roles/cluster-restore/defaults/main.yml
index 75a9922..0398c0c 100644
--- a/roles/cluster-restore/defaults/main.yml
+++ b/roles/cluster-restore/defaults/main.yml
@@ -1,4 +1,5 @@
 # specify the etcd backup to restore; the most recent backup is used by default
+# see the backup directory on the ansible control node: /etc/ansible/.cluster/backup
 db_to_restore: "snapshot.db"
 
 # IPs and ports for etcd inter-cluster communication, generated automatically from the etcd group members
diff --git a/roles/cluster-restore/tasks/main.yml b/roles/cluster-restore/tasks/main.yml
index 498f11d..1d0616c 100644
--- a/roles/cluster-restore/tasks/main.yml
+++ b/roles/cluster-restore/tasks/main.yml
@@ -5,18 +5,18 @@
   file: name=/var/lib/etcd/member state=absent
 
 - name: Creating the backup staging directory
-  file: name=/backup/k8s state=directory
+  file: name=/etcd_backup state=directory
 
 - name: Staging the specified etcd backup
   copy:
-    src: "{{ base_dir }}/roles/cluster-backup/files/snapshot/{{ db_to_restore }}"
-    dest: "/backup/k8s/snapshot.db"
+    src: "{{ base_dir }}/.cluster/backup/{{ db_to_restore }}"
+    dest: "/etcd_backup/snapshot.db"
 
-- name: Cleaning up old backed-up data
-  file: name=/backup/k8s/{{ NODE_NAME }}.etcd state=absent
+- name: Cleaning up data from the previous restore
+  file: name=/etcd_backup/{{ NODE_NAME }}.etcd state=absent
 
 - name: Restoring etcd data
-  shell: "cd /backup/k8s && \
+  shell: "cd /etcd_backup && \
          ETCDCTL_API=3 {{ bin_dir }}/etcdctl snapshot restore snapshot.db \
          --name {{ NODE_NAME }} \
         --initial-cluster {{ ETCD_NODES }} \
@@ -24,7 +24,7 @@
          --initial-advertise-peer-urls https://{{ inventory_hostname }}:2380"
 
 - name: Copying restored data into the etcd data directory
-  shell: "cp -rf /backup/k8s/{{ NODE_NAME }}.etcd/member /var/lib/etcd/"
+  shell: "cp -rf /etcd_backup/{{ NODE_NAME }}.etcd/member /var/lib/etcd/"
 
 - name: Restarting the etcd service
   service: name=etcd state=restarted
diff --git a/roles/deploy/tasks/main.yml b/roles/deploy/tasks/main.yml
index edeb00a..e8e68d3 100644
--- a/roles/deploy/tasks/main.yml
+++ b/roles/deploy/tasks/main.yml
@@ -2,6 +2,7 @@
   file: name={{ item }} state=directory
   with_items:
   - "{{ base_dir }}/.cluster/ssl"
+  - "{{ base_dir }}/.cluster/backup"
 
- name: Setting permissions on the local bin directory
  file: path={{ base_dir }}/bin state=directory mode=0755 recurse=yes
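Taken together, the cluster-restore tasks above script the per-node procedure the old manual guide described. On each etcd node this amounts to roughly the following (a sketch: `$NODE_NAME`, `$ETCD_NODES` and `$NODE_IP` stand in for the inventory variables, and `/opt/kube/bin` is the assumed `bin_dir`):

``` bash
# beforehand the play removes /var/lib/etcd/member (etcd must be stopped at this point)
cd /etcd_backup
ETCDCTL_API=3 /opt/kube/bin/etcdctl snapshot restore snapshot.db \
    --name "$NODE_NAME" \
    --initial-cluster "$ETCD_NODES" \
    --initial-cluster-token etcd-cluster-0 \
    --initial-advertise-peer-urls "https://$NODE_IP:2380"

# the restore produces $NODE_NAME.etcd/; move its member dir into place and restart
cp -rf "/etcd_backup/$NODE_NAME.etcd/member" /var/lib/etcd/
systemctl restart etcd
```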
diff --git a/tools/easzctl b/tools/easzctl
index 152ec55..be9bc2c 100755
--- a/tools/easzctl
+++ b/tools/easzctl
@@ -307,17 +307,17 @@ function list() {
   [ -f "$BASEPATH/.cluster/current_cluster" ] || { echo "[ERROR] invalid context, run 'easzctl checkout ' first"; return 1; }
   CLUSTER=$(cat $BASEPATH/.cluster/current_cluster)
   echo -e "\nlist of managed contexts (current: \033[33m$CLUSTER\033[0m)"
-  i=1; for Cluster in $(ls $BASEPATH/.cluster/ |grep -Ev "ssl|current_cluster|kubeconfig");
+  i=1; for c in $(ls $BASEPATH/.cluster/ |grep -Ev "backup|ssl|current_cluster|kubeconfig");
   do
-     echo -e "==> context $i:\t$Cluster"
+     echo -e "==> context $i:\t$c"
      let "i++"
   done
   echo -e "\nlist of installed clusters (current: \033[33m$CLUSTER\033[0m)"
-  i=1; for Cluster in $(ls $BASEPATH/.cluster/ |grep -Ev "ssl|current_cluster|kubeconfig");
+  i=1; for c in $(ls $BASEPATH/.cluster/ |grep -Ev "backup|ssl|current_cluster|kubeconfig");
   do
-     KUBECONF=$BASEPATH/.cluster/$Cluster/config
+     KUBECONF=$BASEPATH/.cluster/$c/config
      if [ -f "$KUBECONF" ]; then
-        echo -e "==> cluster $i:\t$Cluster"
+        echo -e "==> cluster $i:\t$c"
         $BASEPATH/bin/kubectl --kubeconfig=$KUBECONF get node
      fi
      let "i++"
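End to end, the changed workflow runs entirely from the control node (the `easzctl` change above simply excludes the new `backup` directory from cluster listings). A quick smoke test might look like this (hypothetical session; assumes the default `/etc/ansible` layout and a `kubectl` on the PATH):

``` bash
ansible-playbook /etc/ansible/23.backup.yml    # snapshot etcd and the ansible hosts file
ls /etc/ansible/.cluster/backup/               # expect hosts, snapshot.db and timestamped copies

ansible-playbook /etc/ansible/24.restore.yml   # roll back to the latest snapshot
kubectl get node                               # then allow time for pods/svc to be recreated
kubectl get pod --all-namespaces
```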