Refactor cgroup hierarchy handling and resource reservation

* We don't need to organize the cgroup hierarchy differently if we don't
  use resource reservation, so remove that variability and always place
  the kubelet in the same location (default:
  /runtime.slice/kubelet.service)

* Same for the container "runtimes" (which in fact means the container
  **engines**, i.e. containerd and cri-o — not runc or kata)

* Accordingly, there is no need for a lot of customization on the cgroup
  hierarchy, so reduce it to `kube_slice` and `system_slice`. All the
  rest is derived from that and not user-modifiable.

* Correct the semantics of kube_reserved and system_reserved:
  - kube-reserved and system-reserved do not, on their own, guarantee
    that resources will be available for the respective cgroups; they
    are only used to calculate NodeAllocatable.
    See https://kubernetes.io/docs/tasks/administer-cluster/reserve-compute-resources/#node-allocatable
pull/10714/head
Max Gautier 2023-11-24 16:11:40 +01:00
parent 70920af365
commit cac814c06d
No known key found for this signature in database
12 changed files with 25 additions and 43 deletions

View File

@ -36,10 +36,8 @@ LimitMEMLOCK={{ containerd_limit_mem_lock }}
# Only systemd 226 and above support this version.
TasksMax=infinity
OOMScoreAdjust=-999
# Set the cgroup slice of the service so that kube reserved takes effect
{% if kube_reserved is defined and kube_reserved|bool %}
Slice={{ kube_reserved_cgroups_for_service_slice }}
{% endif %}
# Set the cgroup slice of the service to optionally enforce resource limitations
Slice={{ kube_slice }}
[Install]
WantedBy=multi-user.target

View File

@ -35,10 +35,8 @@ LimitCORE=infinity
TasksMax=infinity
Delegate=yes
KillMode=process
# Set the cgroup slice of the service so that kube reserved takes effect
{% if kube_reserved is defined and kube_reserved|bool %}
Slice={{ kube_reserved_cgroups_for_service_slice }}
{% endif %}
# Set the cgroup slice of the service to optionally enforce resource limitations
Slice={{ kube_slice }}
[Install]
WantedBy=multi-user.target

View File

@ -90,7 +90,7 @@
remote_src: true
notify: Restart crio
- name: Cri-o | configure crio to use kube reserved cgroups
- name: Cri-o | configure crio to run in the kube slice
ansible.builtin.copy:
dest: /etc/systemd/system/crio.service.d/00-slice.conf
owner: root
@ -98,11 +98,8 @@
mode: '0644'
content: |
[Service]
Slice={{ kube_reserved_cgroups_for_service_slice }}
Slice={{ kube_slice }}
notify: Restart crio
when:
- kube_reserved is defined and kube_reserved is true
- kube_reserved_cgroups_for_service_slice is defined
- name: Cri-o | update the bin dir for crio.service file
replace:

View File

@ -114,11 +114,7 @@ conmon = "{{ crio_conmon }}"
{% if crio_cgroup_manager == "cgroupfs" %}
conmon_cgroup = "pod"
{% else %}
{% if kube_reserved is defined and kube_reserved|bool %}
conmon_cgroup = "{{ kube_reserved_cgroups_for_service_slice }}"
{% else %}
conmon_cgroup = "system.slice"
{% endif %}
conmon_cgroup = "{{ kube_slice }}"
{% endif %}
# Environment variable list for the conmon process, used for passing necessary

View File

@ -32,10 +32,8 @@ TimeoutStartSec=1min
Restart=on-failure
StartLimitBurst=3
StartLimitInterval=60s
# Set the cgroup slice of the service so that kube reserved takes effect
{% if kube_reserved is defined and kube_reserved|bool %}
Slice={{ kube_reserved_cgroups_for_service_slice }}
{% endif %}
# Set the cgroup slice of the service to optionally enforce resource limitations
Slice={{ kube_slice }}
[Install]
WantedBy=multi-user.target

View File

@ -11,15 +11,6 @@ kube_resolv_conf: "/etc/resolv.conf"
# Set to empty to avoid cgroup creation
kubelet_enforce_node_allocatable: "\"\""
# Set runtime and kubelet cgroups when using systemd as cgroup driver (default)
kube_service_cgroups: "{% if kube_reserved %}{{ kube_reserved_cgroups_for_service_slice }}{% else %}system.slice{% endif %}"
kubelet_runtime_cgroups: "/{{ kube_service_cgroups }}/{{ container_manager }}.service"
kubelet_kubelet_cgroups: "/{{ kube_service_cgroups }}/kubelet.service"
# Set runtime and kubelet cgroups when using cgroupfs as cgroup driver
kubelet_runtime_cgroups_cgroupfs: "/system.slice/{{ container_manager }}.service"
kubelet_kubelet_cgroups_cgroupfs: "/system.slice/kubelet.service"
# Set systemd service hardening features
kubelet_systemd_hardening: false
@ -42,6 +33,10 @@ kube_cpu_reserved: "100m"
kube_ephemeral_storage_reserved: "500Mi"
kube_pid_reserved: "1000"
# Set slice for host system daemons (sshd, NetworkManager, ...)
# You probably don't want to change this
system_slice: system.slice
# Set to true to reserve resources for system daemons
system_reserved: false
system_reserved_cgroups_for_service_slice: system.slice

View File

@ -39,12 +39,6 @@
kubelet_cgroup_driver: "{{ kubelet_cgroup_driver_detected }}"
when: kubelet_cgroup_driver is undefined
- name: Set kubelet_cgroups options when cgroupfs is used
set_fact:
kubelet_runtime_cgroups: "{{ kubelet_runtime_cgroups_cgroupfs }}"
kubelet_kubelet_cgroups: "{{ kubelet_kubelet_cgroups_cgroupfs }}"
when: kubelet_cgroup_driver == 'cgroupfs'
- name: Set kubelet_config_extra_args options when cgroupfs is used
set_fact:
kubelet_config_extra_args: "{{ kubelet_config_extra_args | combine(kubelet_config_extra_args_cgroupfs) }}"

View File

@ -33,7 +33,7 @@ address: {{ kubelet_bind_address }}
readOnlyPort: {{ kube_read_only_port }}
healthzPort: {{ kubelet_healthz_port }}
healthzBindAddress: {{ kubelet_healthz_bind_address }}
kubeletCgroups: {{ kubelet_kubelet_cgroups }}
kubeletCgroups: {{ kube_slice_cgroup ~ 'kubelet.service' }}
clusterDomain: {{ dns_domain }}
{% if kubelet_protect_kernel_defaults | bool %}
protectKernelDefaults: true
@ -63,7 +63,7 @@ clusterDNS:
{# Node reserved CPU/memory #}
{% for scope in "kube", "system" %}
{% if lookup('ansible.builtin.vars', scope + "_reserved") | bool %}
{{ scope }}ReservedCgroup: {{ lookup('ansible.builtin.vars', scope + '_reserved_cgroups') }}
{{ scope }}ReservedCgroup: {{ lookup('ansible.builtin.vars', scope + '_slice_cgroup') }}
{% endif %}
{{ scope }}Reserved:
{% for resource in "cpu", "memory", "ephemeral-storage", "pid" %}

View File

@ -11,7 +11,7 @@ KUBELET_HOSTNAME="--hostname-override={{ kube_override_hostname }}"
--config={{ kube_config_dir }}/kubelet-config.yaml \
--kubeconfig={{ kube_config_dir }}/kubelet.conf \
{# end kubeadm specific settings #}
--runtime-cgroups={{ kubelet_runtime_cgroups }} \
--runtime-cgroups={{ kube_slice_cgroup ~ container_manager ~ '.service' }} \
{% endset %}
KUBELET_ARGS="{{ kubelet_args_base }} {{ kubelet_custom_flags | join(' ') }}"

View File

@ -14,9 +14,7 @@ Wants={{ kubelet_dependency }}
{% endfor %}
[Service]
{% if kube_reserved|bool %}
Slice={{ kube_reserved_cgroups_for_service_slice }}
{% endif %}
Slice={{ kube_slice }}
EnvironmentFile=-{{ kube_config_dir }}/kubelet.env
ExecStart={{ bin_dir }}/kubelet \
$KUBE_LOGTOSTDERR \

View File

@ -0,0 +1,3 @@
---
# Convert a systemd slice unit name (e.g. "runtime.slice") into its cgroup
# filesystem path (e.g. "/runtime.slice/").
# systemd nests hyphenated slices: "a-b.slice" is placed at
# "/a.slice/a-b.slice/", i.e. every path component keeps the full hyphenated
# prefix. A plain split('-')/join would drop that prefix and produce
# "/a.slice/b.slice/", which is not where systemd puts the unit, so build the
# path component by component instead.
kube_slice_cgroup: "/{% set parts = (kube_slice | regex_replace('\\.slice$', '')).split('-') %}{% for i in range(1, parts | length + 1) %}{{ parts[:i] | join('-') }}.slice{% if not loop.last %}/{% endif %}{% endfor %}/"
system_slice_cgroup: "/{% set parts = (system_slice | regex_replace('\\.slice$', '')).split('-') %}{% for i in range(1, parts | length + 1) %}{{ parts[:i] | join('-') }}.slice{% if not loop.last %}/{% endif %}{% endfor %}/"

View File

@ -23,6 +23,11 @@ kube_version: v1.30.4
## The minimum version working
kube_version_min_required: v1.28.0
# TODO: put this default to more specific place -> needed by roles container-engine+kubernetes/node
# Set the systemd slice for kubernetes-related daemons: kubelet and container engine
# You probably don't want to change this
kube_slice: runtime.slice
## Kube Proxy mode One of ['iptables', 'ipvs']
kube_proxy_mode: ipvs