optimize cgroups settings for node reserved (#9209)

* optimize cgroups settings for node reserved

* fix

* set cgroup slice for multi container engine

* set cgroup slice for crio

* add reserved cgroups variables to sample files

* Compatible with cgroup path for different container managers

* add cgroups doc

* fix markdown
pull/9632/head
Shelming.Song 2022-12-31 00:05:30 +08:00 committed by GitHub
parent 744c81d451
commit 1c4db6132d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 147 additions and 4 deletions

72
docs/cgroups.md 100644
View File

@ -0,0 +1,72 @@
# cgroups
To prevent containers from competing with each other for resources and from starving the host in Kubernetes, the kubelet relies on cgroups to limit the resource usage of containers.
## Enforcing Node Allocatable
You can use `kubelet_enforce_node_allocatable` to set node allocatable enforcement.
```yaml
# A comma-separated list of the node allocatable enforcement levels that the kubelet applies.
kubelet_enforce_node_allocatable: "pods"
# kubelet_enforce_node_allocatable: "pods,kube-reserved"
# kubelet_enforce_node_allocatable: "pods,kube-reserved,system-reserved"
```
Note that to enforce kube-reserved or system-reserved, `kube_reserved_cgroups` or `system_reserved_cgroups` needs to be specified respectively.
Here is an example:
```yaml
kubelet_enforce_node_allocatable: "pods,kube-reserved,system-reserved"
# Reserve this space for kube resources
# Set to true to reserve resources for kube daemons
kube_reserved: true
kube_reserved_cgroups_for_service_slice: kube.slice
kube_reserved_cgroups: "/{{ kube_reserved_cgroups_for_service_slice }}"
kube_memory_reserved: 256Mi
kube_cpu_reserved: 100m
# kube_ephemeral_storage_reserved: 2Gi
# kube_pid_reserved: "1000"
# Reservation for master hosts
kube_master_memory_reserved: 512Mi
kube_master_cpu_reserved: 200m
# kube_master_ephemeral_storage_reserved: 2Gi
# kube_master_pid_reserved: "1000"
# Set to true to reserve resources for system daemons
system_reserved: true
system_reserved_cgroups_for_service_slice: system.slice
system_reserved_cgroups: "/{{ system_reserved_cgroups_for_service_slice }}"
system_memory_reserved: 512Mi
system_cpu_reserved: 500m
# system_ephemeral_storage_reserved: 2Gi
# system_pid_reserved: "1000"
# Reservation for master hosts
system_master_memory_reserved: 256Mi
system_master_cpu_reserved: 250m
# system_master_ephemeral_storage_reserved: 2Gi
# system_master_pid_reserved: "1000"
```
After the setup, the cgroups hierarchy is as follows:
```bash
/ (Cgroups Root)
├── kubepods.slice
│ ├── ...
│ ├── kubepods-besteffort.slice
│ ├── kubepods-burstable.slice
│ └── ...
├── kube.slice
│ ├── ...
│ ├── {{container_manager}}.service
│ ├── kubelet.service
│ └── ...
├── system.slice
│ └── ...
└── ...
```
You can learn more in the [official Kubernetes documentation](https://kubernetes.io/docs/tasks/administer-cluster/reserve-compute-resources/).

View File

@ -261,9 +261,36 @@ podsecuritypolicy_enabled: false
# Acceptable options are 'pods', 'system-reserved', 'kube-reserved' and ''. Default is "". # Acceptable options are 'pods', 'system-reserved', 'kube-reserved' and ''. Default is "".
# kubelet_enforce_node_allocatable: pods # kubelet_enforce_node_allocatable: pods
## Set runtime and kubelet cgroups when using systemd as cgroup driver (default)
# kubelet_runtime_cgroups: "{{ kube_reserved_cgroups }}/{{ container_manager }}.service"
# kubelet_kubelet_cgroups: "{{ kube_reserved_cgroups }}/kubelet.service"
## Set runtime and kubelet cgroups when using cgroupfs as cgroup driver
# kubelet_runtime_cgroups_cgroupfs: "/system.slice/{{ container_manager }}.service"
# kubelet_kubelet_cgroups_cgroupfs: "/system.slice/kubelet.service"
# Optionally reserve this space for kube daemons.
# kube_reserved: true
## Uncomment to override default values
## The following two items need to be set when kube_reserved is true
# kube_reserved_cgroups_for_service_slice: kube.slice
# kube_reserved_cgroups: "/{{ kube_reserved_cgroups_for_service_slice }}"
# kube_memory_reserved: 256Mi
# kube_cpu_reserved: 100m
# kube_ephemeral_storage_reserved: 2Gi
# kube_pid_reserved: "1000"
# Reservation for master hosts
# kube_master_memory_reserved: 512Mi
# kube_master_cpu_reserved: 200m
# kube_master_ephemeral_storage_reserved: 2Gi
# kube_master_pid_reserved: "1000"
## Optionally reserve resources for OS system daemons. ## Optionally reserve resources for OS system daemons.
# system_reserved: true # system_reserved: true
## Uncomment to override default values ## Uncomment to override default values
## The following two items need to be set when system_reserved is true
# system_reserved_cgroups_for_service_slice: system.slice
# system_reserved_cgroups: "/{{ system_reserved_cgroups_for_service_slice }}"
# system_memory_reserved: 512Mi # system_memory_reserved: 512Mi
# system_cpu_reserved: 500m # system_cpu_reserved: 500m
# system_ephemeral_storage_reserved: 2Gi # system_ephemeral_storage_reserved: 2Gi

View File

@ -36,6 +36,10 @@ LimitMEMLOCK={{ containerd_limit_mem_lock }}
# Only systemd 226 and above support this version. # Only systemd 226 and above support this version.
TasksMax=infinity TasksMax=infinity
OOMScoreAdjust=-999 OOMScoreAdjust=-999
# Set the cgroup slice of the service so that kube reserved takes effect
{% if kube_reserved is defined and kube_reserved|bool %}
Slice={{ kube_reserved_cgroups_for_service_slice }}
{% endif %}
[Install] [Install]
WantedBy=multi-user.target WantedBy=multi-user.target

View File

@ -35,6 +35,10 @@ LimitCORE=infinity
TasksMax=infinity TasksMax=infinity
Delegate=yes Delegate=yes
KillMode=process KillMode=process
# Set the cgroup slice of the service so that kube reserved takes effect
{% if kube_reserved is defined and kube_reserved|bool %}
Slice={{ kube_reserved_cgroups_for_service_slice }}
{% endif %}
[Install] [Install]
WantedBy=multi-user.target WantedBy=multi-user.target

View File

@ -113,8 +113,12 @@ conmon = "{{ crio_conmon }}"
{% if crio_cgroup_manager == "cgroupfs" %} {% if crio_cgroup_manager == "cgroupfs" %}
conmon_cgroup = "pod" conmon_cgroup = "pod"
{% else %} {% else %}
{% if kube_reserved is defined and kube_reserved|bool %}
conmon_cgroup = "{{ kube_reserved_cgroups_for_service_slice }}"
{% else %}
conmon_cgroup = "system.slice" conmon_cgroup = "system.slice"
{% endif %} {% endif %}
{% endif %}
# Environment variable list for the conmon process, used for passing necessary # Environment variable list for the conmon process, used for passing necessary
# environment variables to conmon or the runtime. # environment variables to conmon or the runtime.

View File

@ -42,6 +42,10 @@ TimeoutStartSec=1min
Restart=on-failure Restart=on-failure
StartLimitBurst=3 StartLimitBurst=3
StartLimitInterval=60s StartLimitInterval=60s
# Set the cgroup slice of the service so that kube reserved takes effect
{% if kube_reserved is defined and kube_reserved|bool %}
Slice={{ kube_reserved_cgroups_for_service_slice }}
{% endif %}
[Install] [Install]
WantedBy=multi-user.target WantedBy=multi-user.target

View File

@ -12,11 +12,11 @@ kube_resolv_conf: "/etc/resolv.conf"
kubelet_enforce_node_allocatable: "\"\"" kubelet_enforce_node_allocatable: "\"\""
# Set runtime and kubelet cgroups when using systemd as cgroup driver (default) # Set runtime and kubelet cgroups when using systemd as cgroup driver (default)
kubelet_runtime_cgroups: "/systemd/system.slice" kubelet_runtime_cgroups: "{{ kube_reserved_cgroups }}/{{ container_manager }}.service"
kubelet_kubelet_cgroups: "/systemd/system.slice" kubelet_kubelet_cgroups: "{{ kube_reserved_cgroups }}/kubelet.service"
# Set runtime and kubelet cgroups when using cgroupfs as cgroup driver # Set runtime and kubelet cgroups when using cgroupfs as cgroup driver
kubelet_runtime_cgroups_cgroupfs: "/system.slice/containerd.service" kubelet_runtime_cgroups_cgroupfs: "/system.slice/{{ container_manager }}.service"
kubelet_kubelet_cgroups_cgroupfs: "/system.slice/kubelet.service" kubelet_kubelet_cgroups_cgroupfs: "/system.slice/kubelet.service"
### fail with swap on (default true) ### fail with swap on (default true)
@ -32,6 +32,10 @@ kubelet_secure_addresses: >-
{%- endfor -%} {%- endfor -%}
# Reserve this space for kube resources # Reserve this space for kube resources
# Set to true to reserve resources for kube daemons
kube_reserved: false
kube_reserved_cgroups_for_service_slice: kube.slice
kube_reserved_cgroups: "/{{ kube_reserved_cgroups_for_service_slice }}"
kube_memory_reserved: 256Mi kube_memory_reserved: 256Mi
kube_cpu_reserved: 100m kube_cpu_reserved: 100m
# kube_ephemeral_storage_reserved: 2Gi # kube_ephemeral_storage_reserved: 2Gi
@ -44,6 +48,8 @@ kube_master_cpu_reserved: 200m
# Set to true to reserve resources for system daemons # Set to true to reserve resources for system daemons
system_reserved: false system_reserved: false
system_reserved_cgroups_for_service_slice: system.slice
system_reserved_cgroups: "/{{ system_reserved_cgroups_for_service_slice }}"
system_memory_reserved: 512Mi system_memory_reserved: 512Mi
system_cpu_reserved: 500m system_cpu_reserved: 500m
# system_ephemeral_storage_reserved: 2Gi # system_ephemeral_storage_reserved: 2Gi

View File

@ -60,6 +60,8 @@ clusterDNS:
- {{ dns_address }} - {{ dns_address }}
{% endfor %} {% endfor %}
{# Node reserved CPU/memory #} {# Node reserved CPU/memory #}
{% if kube_reserved|bool %}
kubeReservedCgroup: {{ kube_reserved_cgroups }}
kubeReserved: kubeReserved:
{% if is_kube_master|bool %} {% if is_kube_master|bool %}
cpu: {{ kube_master_cpu_reserved }} cpu: {{ kube_master_cpu_reserved }}
@ -80,7 +82,9 @@ kubeReserved:
pid: "{{ kube_pid_reserved }}" pid: "{{ kube_pid_reserved }}"
{% endif %} {% endif %}
{% endif %} {% endif %}
{% if system_reserved is defined and system_reserved %} {% endif %}
{% if system_reserved|bool %}
systemReservedCgroup: {{ system_reserved_cgroups }}
systemReserved: systemReserved:
{% if is_kube_master|bool %} {% if is_kube_master|bool %}
cpu: {{ system_master_cpu_reserved }} cpu: {{ system_master_cpu_reserved }}

View File

@ -10,6 +10,24 @@ Wants={{ container_manager }}.service
[Service] [Service]
EnvironmentFile=-{{ kube_config_dir }}/kubelet.env EnvironmentFile=-{{ kube_config_dir }}/kubelet.env
{% if system_reserved|bool %}
ExecStartPre=/bin/mkdir -p /sys/fs/cgroup/cpu/{{ system_reserved_cgroups_for_service_slice }}
ExecStartPre=/bin/mkdir -p /sys/fs/cgroup/cpuacct/{{ system_reserved_cgroups_for_service_slice }}
ExecStartPre=/bin/mkdir -p /sys/fs/cgroup/cpuset/{{ system_reserved_cgroups_for_service_slice }}
ExecStartPre=/bin/mkdir -p /sys/fs/cgroup/hugetlb/{{ system_reserved_cgroups_for_service_slice }}
ExecStartPre=/bin/mkdir -p /sys/fs/cgroup/memory/{{ system_reserved_cgroups_for_service_slice }}
ExecStartPre=/bin/mkdir -p /sys/fs/cgroup/pids/{{ system_reserved_cgroups_for_service_slice }}
ExecStartPre=/bin/mkdir -p /sys/fs/cgroup/systemd/{{ system_reserved_cgroups_for_service_slice }}
{% endif %}
{% if kube_reserved|bool %}
ExecStartPre=/bin/mkdir -p /sys/fs/cgroup/cpu/{{ kube_reserved_cgroups_for_service_slice }}
ExecStartPre=/bin/mkdir -p /sys/fs/cgroup/cpuacct/{{ kube_reserved_cgroups_for_service_slice }}
ExecStartPre=/bin/mkdir -p /sys/fs/cgroup/cpuset/{{ kube_reserved_cgroups_for_service_slice }}
ExecStartPre=/bin/mkdir -p /sys/fs/cgroup/hugetlb/{{ kube_reserved_cgroups_for_service_slice }}
ExecStartPre=/bin/mkdir -p /sys/fs/cgroup/memory/{{ kube_reserved_cgroups_for_service_slice }}
ExecStartPre=/bin/mkdir -p /sys/fs/cgroup/pids/{{ kube_reserved_cgroups_for_service_slice }}
ExecStartPre=/bin/mkdir -p /sys/fs/cgroup/systemd/{{ kube_reserved_cgroups_for_service_slice }}
{% endif %}
ExecStart={{ bin_dir }}/kubelet \ ExecStart={{ bin_dir }}/kubelet \
$KUBE_LOGTOSTDERR \ $KUBE_LOGTOSTDERR \
$KUBE_LOG_LEVEL \ $KUBE_LOG_LEVEL \