From c07d60bc906f3db97d79fffa6bc49c4b3a24970c Mon Sep 17 00:00:00 2001 From: Sergii Golovatiuk Date: Tue, 7 Feb 2017 15:01:02 +0100 Subject: [PATCH] Kubernetes Reliability Improvements - Exclude kubelet CPU/RAM (kube-reserved) from cgroup. It decreases a chance of overcommitment - Add a possibility to modify Kubelet node-status-update-frequency - Add a possibility to configure node-monitor-grace-period, node-monitor-period, pod-eviction-timeout for Kubernetes controller manager - Add Kubernetes Reliability Documentation with recommendations for various scenarios. Signed-off-by: Sergii Golovatiuk --- docs/kubernetes-reliability.md | 103 ++++++++++++++++++ docs/large-deployments.md | 17 ++- roles/kubernetes/master/defaults/main.yml | 3 + .../kube-controller-manager.manifest.j2 | 3 + roles/kubernetes/node/defaults/main.yml | 1 + roles/kubernetes/node/templates/kubelet.j2 | 5 +- 6 files changed, 126 insertions(+), 6 deletions(-) create mode 100644 docs/kubernetes-reliability.md diff --git a/docs/kubernetes-reliability.md b/docs/kubernetes-reliability.md new file mode 100644 index 000000000..82ec65f2e --- /dev/null +++ b/docs/kubernetes-reliability.md @@ -0,0 +1,103 @@ +# Overview + +Distributed systems such as Kubernetes are designed to be resilient to +failures. More details about Kubernetes High-Availability (HA) may be found at +[Building High-Availability Clusters](https://kubernetes.io/docs/admin/high-availability/) + +To keep the picture simple, most parts of HA are skipped here and only the +Kubelet<->Controller Manager communication is described. + +By default the normal behavior looks like: + +1. Kubelet updates its status to apiserver periodically, as specified by + `--node-status-update-frequency`. The default value is **10s**. + +2. Kubernetes controller manager checks the statuses of Kubelets every + `--node-monitor-period`. The default value is **5s**. + +3. 
If the status is updated within `--node-monitor-grace-period`, the + Kubernetes controller manager considers the Kubelet healthy. The + default value is **40s**. + +> Kubernetes controller manager and Kubelets work asynchronously. It means that +> the delay may include any network latency, API Server latency, etcd latency, +> latency caused by load on one's master nodes and so on. So if +> `--node-status-update-frequency` is set to 5s, in reality the update may appear in +> etcd in 6-7 seconds or even longer when etcd cannot commit data to quorum +> nodes. + +# Failure + +Kubelet will try to make `nodeStatusUpdateRetry` post attempts. Currently +`nodeStatusUpdateRetry` is a constant set to 5 in +[kubelet.go](https://github.com/kubernetes/kubernetes/blob/release-1.5/pkg/kubelet/kubelet.go#L102). + +Kubelet will try to update the status in +[tryUpdateNodeStatus](https://github.com/kubernetes/kubernetes/blob/release-1.5/pkg/kubelet/kubelet_node_status.go#L345) +function. Kubelet uses `http.Client()` Golang method, but has no specified +timeout. Thus there may be some glitches when API Server is overloaded while +TCP connection is established. + +So, there will be `nodeStatusUpdateRetry` * `--node-status-update-frequency` +attempts to set a status of node. + +At the same time Kubernetes controller manager will try to check +`nodeStatusUpdateRetry` times every `--node-monitor-period` of time. After +`--node-monitor-grace-period` it will consider the node unhealthy. It will remove +its pods based on `--pod-eviction-timeout`. + +Kube proxy has a watcher over API. Once pods are evicted, Kube proxy will +notice and will update iptables of the node. It will remove endpoints from +services so pods from failed node won't be accessible anymore. + +# Recommendations for different cases + +## Fast Update and Fast Reaction + +If `--node-status-update-frequency` is set to **4s** (10s is default). +`--node-monitor-period` to **2s** (5s is default). 
+`--node-monitor-grace-period` to **20s** (40s is default). +`--pod-eviction-timeout` is set to **30s** (5m is default) + +In such a scenario, pods will be evicted in **50s** because the node will be +considered as down after **20s**, and `--pod-eviction-timeout` occurs after +**30s** more. However, this scenario creates an overhead on etcd as every node +will try to update its status every 4 seconds. + +If the environment has 1000 nodes, there will be 15000 node updates per +minute which may require large etcd containers or even dedicated nodes for etcd. + +> If we calculate the number of tries, the division will give 5, but in reality +> it will be from 3 to 5 with `nodeStatusUpdateRetry` attempts of each try. The +> total number of attempts will vary from 15 to 25 due to latency of all +> components. + +## Medium Update and Average Reaction + +Let's set `--node-status-update-frequency` to **20s**, +`--node-monitor-grace-period` to **2m** and `--pod-eviction-timeout` to **1m**. +In that case, Kubelet will try to update status every 20s. So, it will be 6 * 5 += 30 attempts before Kubernetes controller manager will consider unhealthy +status of node. After 1m it will evict all pods. The total time will be 3m +before eviction process. + +Such a scenario is good for medium environments as 1000 nodes will require 3000 +etcd updates per minute. + +> In reality, there will be from 4 to 6 node update tries. The total number +> of attempts will vary from 20 to 30. + +## Low Update and Slow reaction + +Let's set `--node-status-update-frequency` to **1m**. +`--node-monitor-grace-period` will be set to **5m** and `--pod-eviction-timeout` +to **1m**. In this scenario, every kubelet will try to update the status every +minute. There will be 5 * 5 = 25 attempts before unhealthy status. After 5m, +Kubernetes controller manager will set unhealthy status. This means that pods +will be evicted 1m after being marked unhealthy (6m in total). 
+ +> In reality, there will be from 3 to 5 tries. The total number of attempts will +> vary from 15 to 25. + +There can be different combinations such as Fast Update with Slow reaction to +satisfy specific cases. diff --git a/docs/large-deployments.md b/docs/large-deployments.md index 721064942..18e3b7877 100644 --- a/docs/large-deployments.md +++ b/docs/large-deployments.md @@ -3,7 +3,8 @@ Large deployments of K8s For a large scaled deployments, consider the following configuration changes: -* Tune [ansible settings](http://docs.ansible.com/ansible/intro_configuration.html) +* Tune [ansible settings] + (http://docs.ansible.com/ansible/intro_configuration.html) for `forks` and `timeout` vars to fit large numbers of nodes being deployed. * Override containers' `foo_image_repo` vars to point to intranet registry. @@ -23,9 +24,15 @@ For a large scaled deployments, consider the following configuration changes: * Tune CPU/memory limits and requests. Those are located in roles' defaults and named like ``foo_memory_limit``, ``foo_memory_requests`` and ``foo_cpu_limit``, ``foo_cpu_requests``. Note that 'Mi' memory units for K8s - will be submitted as 'M', if applied for ``docker run``, and cpu K8s units will - end up with the 'm' skipped for docker as well. This is required as docker does not - understand k8s units well. + will be submitted as 'M', if applied for ``docker run``, and cpu K8s units + will end up with the 'm' skipped for docker as well. This is required as + docker does not understand k8s units well. + +* Tune ``kubelet_status_update_frequency`` to increase reliability of kubelet, and + ``kube_controller_node_monitor_grace_period``, + ``kube_controller_node_monitor_period``, + ``kube_controller_pod_eviction_timeout`` for better Kubernetes reliability. + Check out [Kubernetes Reliability](kubernetes-reliability.md). * Add calico-rr nodes if you are deploying with Calico or Canal. Nodes recover from host/network interruption much quicker with calico-rr. 
Note that @@ -33,7 +40,7 @@ For a large scaled deployments, consider the following configuration changes: etcd role is okay). * Check out the - [Inventory](https://github.com/kubernetes-incubator/kargo/blob/master/docs/getting-started.md#building-your-own-inventory) + [Inventory](getting-started.md#building-your-own-inventory) section of the Getting started guide for tips on creating a large scale Ansible inventory. diff --git a/roles/kubernetes/master/defaults/main.yml b/roles/kubernetes/master/defaults/main.yml index 874925adf..510968b13 100644 --- a/roles/kubernetes/master/defaults/main.yml +++ b/roles/kubernetes/master/defaults/main.yml @@ -18,6 +18,9 @@ kube_controller_memory_limit: 512M kube_controller_cpu_limit: 250m kube_controller_memory_requests: 170M kube_controller_cpu_requests: 100m +kube_controller_node_monitor_grace_period: 40s +kube_controller_node_monitor_period: 5s +kube_controller_pod_eviction_timeout: 5m0s kube_scheduler_memory_limit: 512M kube_scheduler_cpu_limit: 250m kube_scheduler_memory_requests: 170M diff --git a/roles/kubernetes/master/templates/manifests/kube-controller-manager.manifest.j2 b/roles/kubernetes/master/templates/manifests/kube-controller-manager.manifest.j2 index 49dd05ba8..7bcd51cc4 100644 --- a/roles/kubernetes/master/templates/manifests/kube-controller-manager.manifest.j2 +++ b/roles/kubernetes/master/templates/manifests/kube-controller-manager.manifest.j2 @@ -28,6 +28,9 @@ spec: - --cluster-signing-cert-file={{ kube_cert_dir }}/ca.pem - --cluster-signing-key-file={{ kube_cert_dir }}/ca-key.pem - --enable-hostpath-provisioner={{ kube_hostpath_dynamic_provisioner }} + - --node-monitor-grace-period={{ kube_controller_node_monitor_grace_period }} + - --node-monitor-period={{ kube_controller_node_monitor_period }} + - --pod-eviction-timeout={{ kube_controller_pod_eviction_timeout }} - --v={{ kube_log_level }} {% if cloud_provider is defined and cloud_provider in ["openstack", "azure"] %} - --cloud-provider={{cloud_provider}} 
diff --git a/roles/kubernetes/node/defaults/main.yml b/roles/kubernetes/node/defaults/main.yml index d60b76208..46cc10c47 100644 --- a/roles/kubernetes/node/defaults/main.yml +++ b/roles/kubernetes/node/defaults/main.yml @@ -12,6 +12,7 @@ kube_proxy_masquerade_all: true # Limits for kube components and nginx load balancer app kubelet_memory_limit: 512M kubelet_cpu_limit: 100m +kubelet_status_update_frequency: 10s kube_proxy_memory_limit: 2000M kube_proxy_cpu_limit: 500m kube_proxy_memory_requests: 256M diff --git a/roles/kubernetes/node/templates/kubelet.j2 b/roles/kubernetes/node/templates/kubelet.j2 index 65f98d432..8ec348a05 100644 --- a/roles/kubernetes/node/templates/kubelet.j2 +++ b/roles/kubernetes/node/templates/kubelet.j2 @@ -9,7 +9,10 @@ KUBELET_ADDRESS="--address={{ ip | default("0.0.0.0") }}" KUBELET_HOSTNAME="--hostname-override={{ ansible_hostname }}" {# Base kubelet args #} -{% set kubelet_args_base %}--pod-manifest-path={{ kube_manifest_dir }} --pod-infra-container-image={{ pod_infra_image_repo }}:{{ pod_infra_image_tag }}{% endset %} +{% set kubelet_args_base %}--pod-manifest-path={{ kube_manifest_dir }} \ +--pod-infra-container-image={{ pod_infra_image_repo }}:{{ pod_infra_image_tag }} \ +--kube-reserved cpu={{ kubelet_cpu_limit }},memory={{ kubelet_memory_limit|regex_replace('Mi', 'M') }} \ +--node-status-update-frequency={{ kubelet_status_update_frequency }}{% endset %} {# DNS settings for kubelet #} {% if dns_mode == 'kubedns' %}