From 07ce9c6422e935c8caf02784f5de903a88e4ea55 Mon Sep 17 00:00:00 2001 From: gjmzj Date: Wed, 20 Jun 2018 11:48:29 +0800 Subject: [PATCH] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E5=88=A0=E9=99=A4=E5=8D=95?= =?UTF-8?q?=E4=B8=AA=E8=8A=82=E7=82=B9=E8=84=9A=E6=9C=AC=EF=BC=8C=E4=BF=AE?= =?UTF-8?q?=E6=94=B9kubeStateMetrics=E9=BB=98=E8=AE=A4=E9=95=9C=E5=83=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/op/del_one_node.md | 23 ++++ docs/op/op-index.md | 1 + manifests/prometheus/prom-settings.yaml | 3 + tools/clean_one_node.yml | 171 ++++++++++++++++++++++++ 4 files changed, 198 insertions(+) create mode 100644 docs/op/del_one_node.md create mode 100644 tools/clean_one_node.yml diff --git a/docs/op/del_one_node.md b/docs/op/del_one_node.md new file mode 100644 index 0000000..e0632bc --- /dev/null +++ b/docs/op/del_one_node.md @@ -0,0 +1,23 @@ +# 如何删除单个节点 + +本文档所指删除的节点是指使用kubeasz项目安装的节点角色(可能是kube-master, kube-node, etcd, lb节点) + +- 警告:此操作将清理单个node节点,包含k8s集群可能使用的数据,特别的:如果有pod使用了本地存储类型,请自行判断重要性 + +## 删除流程解释 + +- 1.待删除节点可能是kube-node节点,因此先执行`kubectl drain`,如果不是忽略执行报错 +- 2.参照`99.clean.yml`脚本方式删除节点可能的服务和配置,忽略执行报错 +- 3.待删除节点可能是kube-node节点,执行`kubectl delete node`, 如果不是忽略执行报错 + +## 删除操作 + +- 1.替换待删除节点变量,假设为192.168.1.1 +``` bash +$ sed -i 's/NODE_TO_DEL/192.168.1.1/g' /etc/ansible/tools/clean_one_node.yml +``` + +- 2.执行删除 +``` +$ ansible-playbook /etc/ansible/tools/clean_one_node.yml +``` diff --git a/docs/op/op-index.md b/docs/op/op-index.md index 3708d50..23bb653 100644 --- a/docs/op/op-index.md +++ b/docs/op/op-index.md @@ -5,3 +5,4 @@ - [升级K8S 版本](upgrade.md) - [修改集群VIP地址](ChangeVIP.md) - [修改AIO部署的系统IP](change_ip_allinone.md) +- [删除集群某一节点](del_one_node.md) diff --git a/manifests/prometheus/prom-settings.yaml b/manifests/prometheus/prom-settings.yaml index af1777c..9cb7afe 100644 --- a/manifests/prometheus/prom-settings.yaml +++ b/manifests/prometheus/prom-settings.yaml @@ -15,3 +15,6 @@ server: pushgateway: enabled: false 
+kubeStateMetrics:
+  image:
+    repository: mirrorgooglecontainers/kube-state-metrics
diff --git a/tools/clean_one_node.yml b/tools/clean_one_node.yml
new file mode 100644
index 0000000..3d12789
--- /dev/null
+++ b/tools/clean_one_node.yml
@@ -0,0 +1,171 @@
+---
+# WARNING: this playbook wipes ONE node. If that node is a kube-node, the first play below drains it.
+# Think twice before running it; in particular, if any pod uses local storage, judge for yourself how important that data is.
+# See docs/op/del_one_node.md for details.
+# Usage:
+#   1. Assuming the node to delete is 192.168.1.1, run: sed -i 's/NODE_TO_DEL/192.168.1.1/g' /etc/ansible/tools/clean_one_node.yml
+#   2. Run: ansible-playbook /etc/ansible/tools/clean_one_node.yml
+
+# Run kubectl drain from the deploy host (the node may be a kube-node; the error is ignored if it is not)
+- hosts: deploy
+  tasks:
+  - name: 执行kubectl drain(节点可能是kube-node节点)
+    shell: "{{ bin_dir }}/kubectl drain NODE_TO_DEL --ignore-daemonsets --delete-local-data"
+    ignore_errors: true
+
+# Clean up kube-node services, kubelet mounts and files
+- hosts: NODE_TO_DEL
+  tasks:
+  - name: stop kube-node service
+    shell: "systemctl stop kubelet kube-proxy"
+    ignore_errors: true
+
+  - name: umount kubelet 挂载的目录
+    shell: "mount | grep '/var/lib/kubelet'| awk '{print $3}'|xargs umount"
+    ignore_errors: true
+
+  - name: 清理目录和文件
+    file: name={{ item }} state=absent
+    with_items:
+    - "/var/lib/kubelet/"
+    - "/var/lib/kube-proxy/"
+    - "/etc/systemd/system/kubelet.service"
+    - "/etc/systemd/system/kube-proxy.service"
+    - "/opt/kube/kube-system/"
+
+# Clean up kube-master services and files
+- hosts: NODE_TO_DEL
+  tasks:
+  - name: stop kube-master service
+    shell: "systemctl stop kube-apiserver kube-controller-manager kube-scheduler"
+    ignore_errors: true
+
+  - name: 清理目录和文件
+    file: name={{ item }} state=absent
+    with_items:
+    - "/var/run/kubernetes"
+    - "/etc/systemd/system/kube-apiserver.service"
+    - "/etc/systemd/system/kube-controller-manager.service"
+    - "/etc/systemd/system/kube-scheduler.service"
+
+# Clean up the docker service and cluster networking
+- hosts: NODE_TO_DEL
+  tasks:
+  - name: 清理kube-router相关
+    shell: "{{ bin_dir }}/docker run --privileged --net=host cloudnativelabs/kube-router --cleanup-config"
+    ignore_errors: true
+    when: "CLUSTER_NETWORK == 'kube-router'"
+
+  - name: stop docker service
+    shell: "systemctl stop docker"
+    ignore_errors: true
+
+  # calico-kube-controller uses the host network (equivalent to docker --net=host),
+  # so /var/run/docker/netns/default has to be unmounted
+  - name: 卸载docker 相关fs1
+    mount: path=/var/run/docker/netns/default state=unmounted
+
+  - name: 卸载docker 相关fs2
+    mount: path=/var/lib/docker/overlay state=unmounted
+
+  - name: 清理目录和文件
+    file: name={{ item }} state=absent
+    with_items:
+    - "/etc/cni/"
+    - "/run/flannel/"
+    - "/etc/calico/"
+    - "/var/run/calico/"
+    - "/var/log/calico/"
+    - "/var/lib/cni/"
+    - "/var/lib/docker/"
+    - "/var/lib/kube-router/"
+    - "/var/run/docker/"
+    - "/etc/systemd/system/calico-node.service"
+    - "/etc/systemd/system/docker.service"
+    - "/etc/systemd/system/docker.service.requires/"
+
+  - name: 清理 iptables
+    shell: "iptables -F && iptables -X \
+        && iptables -F -t nat && iptables -X -t nat \
+        && iptables -F -t raw && iptables -X -t raw \
+        && iptables -F -t mangle && iptables -X -t mangle"
+    ignore_errors: true  # best effort like the sibling cleanup tasks; a failure here must not skip the remaining cleanup on this host
+
+  - name: 清理网络
+    shell: "ip link del docker0; \
+        ip link del tunl0; \
+        ip link del flannel.1; \
+        ip link del cni0; \
+        ip link del mynet0; \
+        ip link del kube-bridge; \
+        ip link del dummy0; \
+        systemctl restart networking; \
+        systemctl restart network"
+    ignore_errors: true
+
+  - name: 清理calico残留路由
+    shell: "for rt in `ip route|grep bird|sed 's/blackhole//'|awk '{print $1}'`;do ip route del $rt;done;"
+    when: "CLUSTER_NETWORK == 'calico'"
+    ignore_errors: true
+
+# Clean up the etcd service and its data
+- hosts: NODE_TO_DEL
+  tasks:
+  - name: stop etcd service
+    shell: systemctl stop etcd
+    ignore_errors: true
+
+  - name: 清理目录和文件
+    file: name={{ item }} state=absent
+    with_items:
+    - "/var/lib/etcd"
+    - "/etc/etcd/"
+    - "/etc/systemd/system/etcd.service"
+
+# Clean up the load balancer (keepalived / haproxy)
+- hosts: NODE_TO_DEL
+  tasks:
+  - name: stop keepalived service
+    shell: systemctl disable keepalived && systemctl stop keepalived
+    ignore_errors: true
+
+  - name: stop haproxy service
+    shell: systemctl disable haproxy && systemctl stop haproxy
+    ignore_errors: true
+
+  - name: 清理LB 配置文件目录
+    file: name={{ item }} state=absent
+    with_items:
+    - "/etc/haproxy"
+    - "/etc/keepalived"
+
+# Clean up everything else: certificates, kubectl config, docker config and ~/.bashrc entries
+- hosts: NODE_TO_DEL
+  tasks:
+  - name: 清理证书目录和文件
+    file: name={{ item }} state=absent
+    with_items:
+    - "/etc/kubernetes/"
+    - "{{ ca_dir }}"
+    - "/root/.kube/"
+    - "/etc/docker/"
+
+  - name: 清理自动生成的PATH
+    lineinfile:
+      dest: ~/.bashrc
+      state: absent
+      regexp: 'kubeasz'
+
+  - name: 清理 kubectl 命令自动补全
+    lineinfile:
+      dest: ~/.bashrc
+      state: absent
+      regexp: 'kubectl completion'
+
+# Run kubectl delete node from the deploy host (the node may be a kube-node; the error is ignored if it is not)
+- hosts: deploy
+  tasks:
+  - name: 执行kubectl delete(节点可能是kube-node节点)
+    shell: "{{ bin_dir }}/kubectl delete node NODE_TO_DEL"
+    ignore_errors: true
+