From b0ad8ec02326e07551fe3c03ecc903a8dd601b02 Mon Sep 17 00:00:00 2001 From: holmesb <5072156+holmesb@users.noreply.github.com> Date: Fri, 8 Jan 2021 15:20:53 +0000 Subject: [PATCH] =?UTF-8?q?Fixed=20issue=20#7112.=C2=A0=20Created=20new=20?= =?UTF-8?q?API=20Server=20vars=20that=20replace=20defunct=20Controller=20M?= =?UTF-8?q?anager=20one=20(#7114)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Brendan Holmes <5072156+holmesb@users.noreply.github.com> --- docs/kubernetes-reliability.md | 25 +++++++++++-------- docs/large-deployments.md | 3 ++- .../kubernetes/master/defaults/main/main.yml | 3 ++- .../templates/kubeadm-config.v1beta2.yaml.j2 | 7 +++++- 4 files changed, 25 insertions(+), 13 deletions(-) diff --git a/docs/kubernetes-reliability.md b/docs/kubernetes-reliability.md index 06357bf4e..03b6b1b57 100644 --- a/docs/kubernetes-reliability.md +++ b/docs/kubernetes-reliability.md @@ -43,8 +43,10 @@ attempts to set a status of node. At the same time Kubernetes controller manager will try to check `nodeStatusUpdateRetry` times every `--node-monitor-period` of time. After -`--node-monitor-grace-period` it will consider node unhealthy. It will remove -its pods based on `--pod-eviction-timeout` +`--node-monitor-grace-period` it will consider node unhealthy. Pods will then be rescheduled based on the +[Taint Based Eviction](https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration/#taint-based-evictions) +timers that you set on them individually, or the API Server's global timers: `--default-not-ready-toleration-seconds` & +`--default-unreachable-toleration-seconds`. Kube proxy has a watcher over API. Once pods are evicted, Kube proxy will notice and will update iptables of the node. It will remove endpoints from @@ -57,12 +59,14 @@ services so pods from failed node won't be accessible anymore. If `-–node-status-update-frequency` is set to **4s** (10s is default). 
`--node-monitor-period` to **2s** (5s is default). `--node-monitor-grace-period` to **20s** (40s is default). -`--pod-eviction-timeout` is set to **30s** (5m is default) +`--default-not-ready-toleration-seconds` and `--default-unreachable-toleration-seconds` are set to **30** +(300 seconds is default). Note these two values should be integers representing the number of seconds ("s" or "m" for +seconds/minutes are not specified). In such scenario, pods will be evicted in **50s** because the node will be -considered as down after **20s**, and `--pod-eviction-timeout` occurs after -**30s** more. However, this scenario creates an overhead on etcd as every node -will try to update its status every 2 seconds. +considered as down after **20s**, and `--default-not-ready-toleration-seconds` or +`--default-unreachable-toleration-seconds` occur after **30s** more. However, this scenario creates an overhead on +etcd as every node will try to update its status every 2 seconds. If the environment has 1000 nodes, there will be 15000 node updates per minute which may require large etcd containers or even dedicated nodes for etcd. @@ -75,7 +79,8 @@ minute which may require large etcd containers or even dedicated nodes for etcd. ## Medium Update and Average Reaction Let's set `-–node-status-update-frequency` to **20s** -`--node-monitor-grace-period` to **2m** and `--pod-eviction-timeout` to **1m**. +`--node-monitor-grace-period` to **2m** and `--default-not-ready-toleration-seconds` and +`--default-unreachable-toleration-seconds` to **60**. In that case, Kubelet will try to update status every 20s. So, it will be 6 * 5 = 30 attempts before Kubernetes controller manager will consider unhealthy status of node. After 1m it will evict all pods. The total time will be 3m @@ -90,9 +95,9 @@ etcd updates per minute. ## Low Update and Slow reaction Let's set `-–node-status-update-frequency` to **1m**. 
-`--node-monitor-grace-period` will set to **5m** and `--pod-eviction-timeout` -to **1m**. In this scenario, every kubelet will try to update the status every -minute. There will be 5 * 5 = 25 attempts before unhealthy status. After 5m, +`--node-monitor-grace-period` will be set to **5m** and `--default-not-ready-toleration-seconds` and +`--default-unreachable-toleration-seconds` to **60**. In this scenario, every kubelet will try to update the status +every minute. There will be 5 * 5 = 25 attempts before unhealthy status. After 5m, Kubernetes controller manager will set unhealthy status. This means that pods will be evicted after 1m after being marked unhealthy. (6m in total). diff --git a/docs/large-deployments.md b/docs/large-deployments.md index 1aa59e101..8b8ebef4e 100644 --- a/docs/large-deployments.md +++ b/docs/large-deployments.md @@ -30,7 +30,8 @@ For a large scaled deployments, consider the following configuration changes: * Tune ``kubelet_status_update_frequency`` to increase reliability of kubelet. ``kube_controller_node_monitor_grace_period``, ``kube_controller_node_monitor_period``, - ``kube_controller_pod_eviction_timeout`` for better Kubernetes reliability. + ``kube_apiserver_pod_eviction_not_ready_timeout_seconds`` & + ``kube_apiserver_pod_eviction_unreachable_timeout_seconds`` for better Kubernetes reliability. Check out [Kubernetes Reliability](kubernetes-reliability.md) * Tune network prefix sizes. 
Those are ``kube_network_node_prefix``, diff --git a/roles/kubernetes/master/defaults/main/main.yml b/roles/kubernetes/master/defaults/main/main.yml index ed31da30c..24c1ddff0 100644 --- a/roles/kubernetes/master/defaults/main/main.yml +++ b/roles/kubernetes/master/defaults/main/main.yml @@ -86,9 +86,10 @@ audit_webhook_batch_max_wait: 1s kube_controller_node_monitor_grace_period: 40s kube_controller_node_monitor_period: 5s -kube_controller_pod_eviction_timeout: 5m0s kube_controller_terminated_pod_gc_threshold: 12500 kube_apiserver_request_timeout: "1m0s" +kube_apiserver_pod_eviction_not_ready_timeout_seconds: "300" +kube_apiserver_pod_eviction_unreachable_timeout_seconds: "300" # 1.10+ admission plugins kube_apiserver_enable_admission_plugins: [] diff --git a/roles/kubernetes/master/templates/kubeadm-config.v1beta2.yaml.j2 b/roles/kubernetes/master/templates/kubeadm-config.v1beta2.yaml.j2 index 31e398bb9..47570710c 100644 --- a/roles/kubernetes/master/templates/kubeadm-config.v1beta2.yaml.j2 +++ b/roles/kubernetes/master/templates/kubeadm-config.v1beta2.yaml.j2 @@ -100,6 +100,12 @@ certificatesDir: {{ kube_cert_dir }} imageRepository: {{ kube_image_repo }} apiServer: extraArgs: +{% if kube_apiserver_pod_eviction_not_ready_timeout_seconds is defined %} + default-not-ready-toleration-seconds: "{{ kube_apiserver_pod_eviction_not_ready_timeout_seconds }}" +{% endif %} +{% if kube_apiserver_pod_eviction_unreachable_timeout_seconds is defined %} + default-unreachable-toleration-seconds: "{{ kube_apiserver_pod_eviction_unreachable_timeout_seconds }}" +{% endif %} {% if kube_api_anonymous_auth is defined %} anonymous-auth: "{{ kube_api_anonymous_auth }}" {% endif %} @@ -256,7 +262,6 @@ controllerManager: extraArgs: node-monitor-grace-period: {{ kube_controller_node_monitor_grace_period }} node-monitor-period: {{ kube_controller_node_monitor_period }} - pod-eviction-timeout: {{ kube_controller_pod_eviction_timeout }} node-cidr-mask-size: "{{ kube_network_node_prefix }}" 
profiling: "{{ kube_profiling }}" terminated-pod-gc-threshold: "{{ kube_controller_terminated_pod_gc_threshold }}"