fix etcd dir bug

Signed-off-by: yangyuliufeng <qlw705706@gmail.com>
pull/1049/head
yangyuliufeng 2021-06-23 14:26:25 +08:00 committed by jmgao
parent a11f714d94
commit 370554cad7
5 changed files with 22 additions and 17 deletions

View File

@ -10,19 +10,18 @@
- 1.首先搭建一个测试集群，部署几个测试deployment，验证集群各项正常后，进行一次备份：
``` bash
$ ansible-playbook /etc/ansible/23.backup.yml
$ ansible-playbook -i clusters/k8s-01/hosts -e @clusters/k8s-01/config.yml playbooks/94.backup.yml
```
执行完毕可以在备份目录下检查备份情况,示例如下:
执行完毕可以在部署主机的备份目录下检查备份情况,示例如下:
```
/etc/ansible/.cluster/backup/
├── hosts
├── hosts-201907030954
├── snapshot-201907030954.db
├── snapshot-201907031048.db
/etc/kubeasz/clusters/k8s-01/backup/
├── snapshot_202106201205.db
├── snapshot_202106211406.db
└── snapshot.db
```
其中snapshot.db始终为最近一次备份文件
- 2.模拟误删除操作(略)
@ -31,16 +30,21 @@ $ ansible-playbook /etc/ansible/23.backup.yml
可以在 `roles/cluster-restore/defaults/main.yml` 文件中配置需要恢复的 etcd 备份版本（从上述备份目录中选取），默认使用最近一次备份；执行恢复后，需要一定时间等待 pod/svc 等资源恢复重建。
``` bash
$ ansible-playbook /etc/ansible/24.restore.yml
$ ansible-playbook -i clusters/k8s-01/hosts -e @clusters/k8s-01/config.yml playbooks/95.restore.yml
```
如果集群主要组件（master/etcd/node 等）出现不可恢复问题，可以尝试使用如下步骤： [清理]() --> [创建]() --> [恢复]()
``` bash
$ ansible-playbook /etc/ansible/99.clean.yml
$ ansible-playbook /etc/ansible/90.setup.yml
$ ansible-playbook /etc/ansible/24.restore.yml
$ ansible-playbook -i clusters/k8s-01/hosts -e @clusters/k8s-01/config.yml playbooks/99.clean.yml
$ ezctl setup k8s-01 01
$ ezctl setup k8s-01 02
$ ezctl setup k8s-01 03
$ ezctl setup k8s-01 04
$ ezctl setup k8s-01 05
...
$ ansible-playbook -i clusters/k8s-01/hosts -e @clusters/k8s-01/config.yml playbooks/95.restore.yml
```
## 参考
- https://github.com/coreos/etcd/blob/master/Documentation/op-guide/recovery.md
- https://etcd.io/docs/v3.4/op-guide/recovery/

View File

@ -2,7 +2,7 @@
service: name=etcd state=stopped
- name: 清除etcd 数据目录
file: name=/var/lib/etcd/member state=absent
file: name={{ ETCD_DATA_DIR }}/member state=absent
- name: 生成备份目录
file: name=/etcd_backup state=directory
@ -24,7 +24,7 @@
--initial-advertise-peer-urls https://{{ inventory_hostname }}:2380"
- name: 恢复数据至etcd 数据目录
shell: "cp -rf /etcd_backup/etcd-{{ inventory_hostname }}.etcd/member /var/lib/etcd/"
shell: "cp -rf /etcd_backup/etcd-{{ inventory_hostname }}.etcd/member {{ ETCD_DATA_DIR }}/"
- name: 重启etcd 服务
service: name=etcd state=restarted

View File

@ -12,6 +12,7 @@
- name: remove files and dirs
file: name={{ item }} state=absent
with_items:
- "/var/lib/etcd"
# Jinja2 expressions that begin a YAML value must be quoted; an unquoted
# leading "{" is parsed as a flow mapping and the playbook fails to load.
- "{{ ETCD_DATA_DIR }}"
- "{{ ETCD_WAL_DIR }}"
- "/backup/k8s"
- "/etc/systemd/system/etcd.service"

View File

@ -1,5 +1,5 @@
- name: prepare some dirs
file: name=/var/lib/etcd state=directory mode=0700
file: name={{ ETCD_DATA_DIR }} state=directory mode=0700
- name: 下载etcd二进制文件
copy: src={{ base_dir }}/bin/{{ item }} dest={{ bin_dir }}/{{ item }} mode=0755

View File

@ -7,7 +7,7 @@ Documentation=https://github.com/coreos
[Service]
Type=notify
WorkingDirectory=/var/lib/etcd/
WorkingDirectory={{ ETCD_DATA_DIR }}
ExecStart={{ bin_dir }}/etcd \
--name=etcd-{{ inventory_hostname }} \
--cert-file={{ ca_dir }}/etcd.pem \