etcd: throttle restart for availability (#11677)
* etcd: throttle restart for availability During upgrade, etcd members are restarted all at once. This can impact the availability of the etcd cluster and subsequently of the Kubernetes cluster. Limit the concurrent restarts so that the etcd cluster can keep quorum. * Simplify etcd handlers pull/11689/head
parent
31a206033f
commit
0f0e24be0f
|
@ -2,26 +2,25 @@
|
||||||
- name: Backup etcd
|
- name: Backup etcd
|
||||||
import_tasks: backup.yml
|
import_tasks: backup.yml
|
||||||
|
|
||||||
- name: Etcd | reload systemd
|
- name: Restart etcd
|
||||||
systemd_service:
|
systemd_service:
|
||||||
daemon_reload: true
|
|
||||||
listen:
|
|
||||||
- Restart etcd
|
|
||||||
- Restart etcd-events
|
|
||||||
|
|
||||||
- name: Reload etcd
|
|
||||||
service:
|
|
||||||
name: etcd
|
name: etcd
|
||||||
state: restarted
|
state: restarted
|
||||||
|
daemon_reload: true
|
||||||
when: ('etcd' in group_names)
|
when: ('etcd' in group_names)
|
||||||
listen: Restart etcd
|
throttle: "{{ groups['etcd'] | length // 2 }}"
|
||||||
|
# Etcd cluster MUST have an odd number of members
|
||||||
|
# Truncated integer division by 2 will always return (majority - 1) which
|
||||||
|
# means the cluster will keep quorum and stay available
|
||||||
|
|
||||||
- name: Reload etcd-events
|
- name: Restart etcd-events
|
||||||
service:
|
systemd_service:
|
||||||
name: etcd-events
|
name: etcd-events
|
||||||
state: restarted
|
state: restarted
|
||||||
|
daemon_reload: true
|
||||||
|
# TODO: this seems odd. etcd-events should be a different group possibly ?
|
||||||
when: ('etcd' in group_names)
|
when: ('etcd' in group_names)
|
||||||
listen: Restart etcd-events
|
throttle: "{{ groups['etcd'] | length // 2 }}"
|
||||||
|
|
||||||
- name: Wait for etcd up
|
- name: Wait for etcd up
|
||||||
uri:
|
uri:
|
||||||
|
|
Loading…
Reference in New Issue