更新删除节点脚本和文档

pull/475/head
gjmzj 2019-02-19 16:38:10 +08:00
parent 9df8906e98
commit a80351e05e
3 changed files with 209 additions and 192 deletions

View File

@ -147,7 +147,9 @@
- "/etc/systemd/system/etcd.service"
# to clean 'lb' nodes
- hosts: lb
- hosts:
- lb
- ex-lb
tasks:
- name: stop keepalived service
shell: systemctl disable keepalived && systemctl stop keepalived

View File

@ -6,22 +6,29 @@
## 删除流程解释
- 0.获取待删除节点参数`NODE_TO_DEL`
- 1.待删除节点可能是kube-node节点，因此先执行`kubectl drain`；如果不是，忽略执行报错
- 2.参照`99.clean.yml`脚本方式删除节点可能的服务和配置,忽略执行报错
- 3.待删除节点可能是kube-node节点，执行`kubectl delete node`；如果不是，忽略执行报错
- 4.修改ansible hosts移除删除节点
## 删除操作
- 1.替换待删除节点变量，假设为192.168.1.1
``` bash
$ sed -i 's/NODE_TO_DEL/192.168.1.1/g' /etc/ansible/tools/clean_one_node.yml
```
- 假设待删除节点为 192.168.1.1
- 2.执行删除
```
``` bash
# 带参数执行如下
$ ansible-playbook /etc/ansible/tools/clean_one_node.yml -e NODE_TO_DEL=192.168.1.1
# 或者不带参数执行,然后根据提示输入/确认
$ ansible-playbook /etc/ansible/tools/clean_one_node.yml
```
## 验证
- 验证删除节点上是否相关服务均已停止
- 验证 ansible hosts 文件中已删除节点
## Debug
如果出现清理失败，类似报错：`... Device or resource busy: '/var/run/docker/netns/xxxxxxxxxx'`，需要手动umount该目录后重新清理

View File

@ -1,27 +1,33 @@
# 警告：此脚本将清理单个node节点，使用前请详细参阅 docs/op/del_one_node.md
# 如果该节点为kube-node节点请先执行kubectl drain
# 请三思后运行此脚本！特别地，如果有pod使用了本地存储类型，请自行判断重要性
# 参考 docs/op/del_one_node.md说明
# 使用:
# 1. 假设待删节点为 192.168.1.1,执行 sed -i 's/NODE_TO_DEL/192.168.1.1/g' tools/clean_one_node.yml
# 2. 执行 ansible-playbook /etc/ansible/tools/clean_one_node.yml
# 1. 执行 ansible-playbook /etc/ansible/tools/clean_one_node.yml
# 2. 按照提示输入待删除节点
# 执行kubectl drain(节点可能是kube-node节点)
- hosts: deploy
vars_prompt:
- name: "NODE_TO_DEL"
prompt: "which node is about to be deleted?(e.g 192.168.1.1)"
private: no
confirm: yes
tasks:
- name: 执行kubectl drain(节点可能是kube-node节点)
shell: "{{ bin_dir }}/kubectl drain NODE_TO_DEL --ignore-daemonsets --delete-local-data"
shell: "{{ bin_dir }}/kubectl drain {{ NODE_TO_DEL }} --ignore-daemonsets --delete-local-data"
ignore_errors: true
- block:
# 清理 kube-node 相关服务
- hosts: NODE_TO_DEL
tasks:
- name: stop kube-node service
shell: "systemctl stop kubelet kube-proxy"
- name: stop and disable kube-node service
service: name={{ item }} state=stopped enabled=no
with_items:
- kubelet
- kube-proxy
ignore_errors: true
- name: umount kubelet 挂载的目录
shell: "mount | grep '/var/lib/kubelet'| awk '{print $3}'|xargs umount"
args:
warn: false
ignore_errors: true
- name: 清理目录和文件
@ -34,8 +40,6 @@
- "/opt/kube/kube-system/"
# 清理 kube-master 相关
- hosts: NODE_TO_DEL
tasks:
- name: stop and disable kube-master service
service: name={{ item }} state=stopped enabled=no
with_items:
@ -53,8 +57,6 @@
- "/etc/systemd/system/kube-scheduler.service"
# 清理集群docker服务、网络相关
- hosts: NODE_TO_DEL
tasks:
- name: 清理kube-router相关
shell: "{{ bin_dir }}/docker run --privileged --net=host cloudnativelabs/kube-router --cleanup-config"
ignore_errors: true
@ -95,6 +97,7 @@
- "/etc/systemd/system/calico-node.service"
- "/etc/systemd/system/docker.service"
- "/etc/systemd/system/docker.service.requires/"
- "/etc/systemd/system/docker.service.d/"
- "/opt/kube/kube-system/"
- "/etc/bash_completion.d/docker"
ignore_errors: true
@ -126,8 +129,6 @@
ignore_errors: true
# 清理etcd 集群相关
- hosts: NODE_TO_DEL
tasks:
- name: stop and disable etcd service
service:
name: etcd
@ -144,8 +145,6 @@
- "/etc/systemd/system/etcd.service"
# 清理负载均衡相关
- hosts: NODE_TO_DEL
tasks:
- name: stop keepalived service
shell: systemctl disable keepalived && systemctl stop keepalived
ignore_errors: true
@ -161,15 +160,17 @@
- "/etc/keepalived"
# 清理其他
- hosts: NODE_TO_DEL
tasks:
- name: stop and disable chrony
service: name={{ item }} state=stopped enabled=no
with_items:
- chrony
- chronyd
# Stop and disable the `chrony` service when the host reports Ubuntu or Debian.
# Failures are ignored, so a host without the service does not abort the cleanup.
- name: stop and disable chrony in Ubuntu
  service:
    name: chrony
    state: stopped
    enabled: false
  ignore_errors: true
  tags: rm_ntp
  when: ansible_distribution in ["Ubuntu", "Debian"]
# Same cleanup for the `chronyd` service when the host reports CentOS or RedHat.
- name: stop and disable chronyd in CentOS/RedHat
  service:
    name: chronyd
    state: stopped
    enabled: false
  ignore_errors: true
  tags: rm_ntp
  when: ansible_distribution in ["CentOS", "RedHat"]
- name: 清理证书目录和文件
file: name={{ item }} state=absent
@ -188,11 +189,18 @@
- 'kubeasz'
- 'helm'
- 'kubectl completion'
delegate_to: "{{ NODE_TO_DEL }}"
run_once: true
# 执行kubectl delete(节点可能是kube-node节点)
- hosts: deploy
tasks:
- name: 执行kubectl delete(节点可能是kube-node节点)
shell: "{{ bin_dir }}/kubectl delete node NODE_TO_DEL"
shell: "{{ bin_dir }}/kubectl delete node {{ NODE_TO_DEL }}"
ignore_errors: true
# 删除 ansible hosts 中节点配置
# Remove the deleted node's entry from the Ansible inventory file on the
# control machine (connection: local).
# The address is regex-escaped and the pattern is anchored to the start of the
# line and terminated by whitespace or end-of-line. Deleting 192.168.1.1 then
# no longer removes entries for 192.168.1.10 / 192.168.1.100, and "." is no
# longer treated as a wildcard.
# NOTE(review): assumes inventory entries begin with the node address
# (e.g. "192.168.1.1 NODE_ID=..."); confirm against the hosts template.
- name: rm {{ NODE_TO_DEL }} in ansible hosts
  lineinfile:
    dest: "{{ base_dir }}/hosts"
    state: absent
    regexp: '^\s*{{ NODE_TO_DEL | regex_escape }}(\s|$)'
  connection: local