Refactor cgroup hierarchy handling and resource reservation
* We don't need to organize the cgroup hierarchy differently when resource reservation is not used, so remove that variation and always place the kubelet in the same slice (default: /runtime.slice/kubelet.service).
* Do the same for the container "runtimes" (which in fact means the container **engines**, i.e. containerd and cri-o, not runc or kata).
* Accordingly, most of the cgroup-hierarchy customization is no longer needed; reduce it to `kube_slice` and `system_slice`. Everything else is derived from those two variables and is not user-modifiable.
* Correct the semantics of kube_reserved and system_reserved:
  - kube-reserved and system-reserved do not on their own guarantee that resources will be available to the respective cgroups; they are only used to compute NodeAllocatable. See https://kubernetes.io/docs/tasks/administer-cluster/reserve-compute-resources/#node-allocatable
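For orientation, a rough sketch of what the kubelet ends up with under the new defaults (`kube_slice: runtime.slice`, `system_slice: system.slice`); this illustrates the intent of the change and is not output copied from a cluster, and containerd as the container engine is an assumption:

```yaml
# kubelet-config.yaml fragment, approximately as rendered with the defaults above
kubeletCgroups: /runtime.slice/kubelet.service
kubeReservedCgroup: /runtime.slice/      # only emitted when kube_reserved is true
systemReservedCgroup: /system.slice/     # only emitted when system_reserved is true
# matching kubelet CLI flag in kubelet.env (container_manager assumed to be containerd):
#   --runtime-cgroups=/runtime.slice/containerd.service
```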
parent 70920af365
commit cac814c06d
@@ -36,10 +36,8 @@ LimitMEMLOCK={{ containerd_limit_mem_lock }}
 # Only systemd 226 and above support this version.
 TasksMax=infinity
 OOMScoreAdjust=-999
-# Set the cgroup slice of the service so that kube reserved takes effect
-{% if kube_reserved is defined and kube_reserved|bool %}
-Slice={{ kube_reserved_cgroups_for_service_slice }}
-{% endif %}
+# Set the cgroup slice of the service to optionally enforce resource limitations
+Slice={{ kube_slice }}

 [Install]
 WantedBy=multi-user.target

@@ -35,10 +35,8 @@ LimitCORE=infinity
 TasksMax=infinity
 Delegate=yes
 KillMode=process
-# Set the cgroup slice of the service so that kube reserved takes effect
-{% if kube_reserved is defined and kube_reserved|bool %}
-Slice={{ kube_reserved_cgroups_for_service_slice }}
-{% endif %}
+# Set the cgroup slice of the service to optionally enforce resource limitations
+Slice={{ kube_slice }}

 [Install]
 WantedBy=multi-user.target

@@ -90,7 +90,7 @@
     remote_src: true
   notify: Restart crio

-- name: Cri-o | configure crio to use kube reserved cgroups
+- name: Cri-o | configure crio to run in the kube slice
   ansible.builtin.copy:
     dest: /etc/systemd/system/crio.service.d/00-slice.conf
     owner: root
@@ -98,11 +98,8 @@
     mode: '0644'
     content: |
      [Service]
-      Slice={{ kube_reserved_cgroups_for_service_slice }}
+      Slice={{ kube_slice }}
   notify: Restart crio
-  when:
-    - kube_reserved is defined and kube_reserved is true
-    - kube_reserved_cgroups_for_service_slice is defined

 - name: Cri-o | update the bin dir for crio.service file
   replace:

@@ -114,11 +114,7 @@ conmon = "{{ crio_conmon }}"
 {% if crio_cgroup_manager == "cgroupfs" %}
 conmon_cgroup = "pod"
 {% else %}
-{% if kube_reserved is defined and kube_reserved|bool %}
-conmon_cgroup = "{{ kube_reserved_cgroups_for_service_slice }}"
-{% else %}
-conmon_cgroup = "system.slice"
-{% endif %}
+conmon_cgroup = "{{ kube_slice }}"
 {% endif %}

 # Environment variable list for the conmon process, used for passing necessary

@@ -32,10 +32,8 @@ TimeoutStartSec=1min
 Restart=on-failure
 StartLimitBurst=3
 StartLimitInterval=60s
-# Set the cgroup slice of the service so that kube reserved takes effect
-{% if kube_reserved is defined and kube_reserved|bool %}
-Slice={{ kube_reserved_cgroups_for_service_slice }}
-{% endif %}
+# Set the cgroup slice of the service to optionally enforce resource limitations
+Slice={{ kube_slice }}

 [Install]
 WantedBy=multi-user.target

@@ -11,15 +11,6 @@ kube_resolv_conf: "/etc/resolv.conf"
 # Set to empty to avoid cgroup creation
 kubelet_enforce_node_allocatable: "\"\""

-# Set runtime and kubelet cgroups when using systemd as cgroup driver (default)
-kube_service_cgroups: "{% if kube_reserved %}{{ kube_reserved_cgroups_for_service_slice }}{% else %}system.slice{% endif %}"
-kubelet_runtime_cgroups: "/{{ kube_service_cgroups }}/{{ container_manager }}.service"
-kubelet_kubelet_cgroups: "/{{ kube_service_cgroups }}/kubelet.service"
-
-# Set runtime and kubelet cgroups when using cgroupfs as cgroup driver
-kubelet_runtime_cgroups_cgroupfs: "/system.slice/{{ container_manager }}.service"
-kubelet_kubelet_cgroups_cgroupfs: "/system.slice/kubelet.service"
-
 # Set systemd service hardening features
 kubelet_systemd_hardening: false

@@ -42,6 +33,10 @@ kube_cpu_reserved: "100m"
 kube_ephemeral_storage_reserved: "500Mi"
 kube_pid_reserved: "1000"

+# Set slice for host system daemons (sshd, NetworkManager, ...)
+# You probably don't want to change this
+system_slice: system.slice
+
 # Set to true to reserve resources for system daemons
 system_reserved: false
 system_reserved_cgroups_for_service_slice: system.slice

@@ -39,12 +39,6 @@
     kubelet_cgroup_driver: "{{ kubelet_cgroup_driver_detected }}"
   when: kubelet_cgroup_driver is undefined

-- name: Set kubelet_cgroups options when cgroupfs is used
-  set_fact:
-    kubelet_runtime_cgroups: "{{ kubelet_runtime_cgroups_cgroupfs }}"
-    kubelet_kubelet_cgroups: "{{ kubelet_kubelet_cgroups_cgroupfs }}"
-  when: kubelet_cgroup_driver == 'cgroupfs'
-
 - name: Set kubelet_config_extra_args options when cgroupfs is used
   set_fact:
     kubelet_config_extra_args: "{{ kubelet_config_extra_args | combine(kubelet_config_extra_args_cgroupfs) }}"

@@ -33,7 +33,7 @@ address: {{ kubelet_bind_address }}
 readOnlyPort: {{ kube_read_only_port }}
 healthzPort: {{ kubelet_healthz_port }}
 healthzBindAddress: {{ kubelet_healthz_bind_address }}
-kubeletCgroups: {{ kubelet_kubelet_cgroups }}
+kubeletCgroups: {{ kube_slice_cgroup ~ 'kubelet.service' }}
 clusterDomain: {{ dns_domain }}
 {% if kubelet_protect_kernel_defaults | bool %}
 protectKernelDefaults: true

@@ -63,7 +63,7 @@ clusterDNS:
 {# Node reserved CPU/memory #}
 {% for scope in "kube", "system" %}
 {% if lookup('ansible.builtin.vars', scope + "_reserved") | bool %}
-{{ scope }}ReservedCgroup: {{ lookup('ansible.builtin.vars', scope + '_reserved_cgroups') }}
+{{ scope }}ReservedCgroup: {{ lookup('ansible.builtin.vars', scope + '_slice_cgroup') }}
 {% endif %}
 {{ scope }}Reserved:
 {% for resource in "cpu", "memory", "ephemeral-storage", "pid" %}

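As a rough illustration, the `kube` iteration of this loop would render to something like the fragment below when `kube_reserved` is true, using the reservation defaults visible earlier in this diff (the memory reservation is left out because its default is not part of the diff); the `system` scope renders analogously when `system_reserved` is true:

```yaml
kubeReservedCgroup: /runtime.slice/
kubeReserved:
  cpu: 100m
  ephemeral-storage: 500Mi
  pid: "1000"
```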
@@ -11,7 +11,7 @@ KUBELET_HOSTNAME="--hostname-override={{ kube_override_hostname }}"
 --config={{ kube_config_dir }}/kubelet-config.yaml \
 --kubeconfig={{ kube_config_dir }}/kubelet.conf \
 {# end kubeadm specific settings #}
---runtime-cgroups={{ kubelet_runtime_cgroups }} \
+--runtime-cgroups={{ kube_slice_cgroup ~ container_manager ~ '.service' }} \
 {% endset %}

 KUBELET_ARGS="{{ kubelet_args_base }} {{ kubelet_custom_flags | join(' ') }}"

@@ -14,9 +14,7 @@ Wants={{ kubelet_dependency }}
 {% endfor %}

 [Service]
-{% if kube_reserved|bool %}
-Slice={{ kube_reserved_cgroups_for_service_slice }}
-{% endif %}
+Slice={{ kube_slice }}
 EnvironmentFile=-{{ kube_config_dir }}/kubelet.env
 ExecStart={{ bin_dir }}/kubelet \
 $KUBE_LOGTOSTDERR \

@@ -0,0 +1,3 @@
+---
+kube_slice_cgroup: "/{{ kube_slice.split('-') | join('.slice/') }}/"
+system_slice_cgroup: "/{{ system_slice.split('-') | join('.slice/') }}/"

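For reference, with the defaults these expressions evaluate roughly as follows; the dash-nested slice name in the last line is purely hypothetical and only illustrates the `split('-') | join('.slice/')` logic:

```yaml
# kube_slice: runtime.slice    ->  kube_slice_cgroup:   "/runtime.slice/"
# system_slice: system.slice   ->  system_slice_cgroup: "/system.slice/"
# a hypothetical "runtime-kube.slice" would expand to "/runtime.slice/kube.slice/"
```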
@@ -23,6 +23,11 @@ kube_version: v1.30.4
 ## The minimum version working
 kube_version_min_required: v1.28.0

+# TODO: put this default to more specific place -> needed by roles container-engine+kubernetes/node
+# Set the systemd slice for kubernetes-related daemons: kubelet and container engine
+# You probably don't want to change this
+kube_slice: runtime.slice
+
 ## Kube Proxy mode One of ['iptables', 'ipvs']
 kube_proxy_mode: ipvs
