From a88bad7947022612a25847b1737c589c35ceb11b Mon Sep 17 00:00:00 2001 From: Louis Tu <92532497+tu1h@users.noreply.github.com> Date: Tue, 23 Jan 2024 14:42:33 +0800 Subject: [PATCH] Add scheduler plugins support (#10747) Signed-off-by: tu1h --- docs/kubernetes-apps/scheduler_plugins.md | 49 +++++ roles/kubernetes-apps/meta/main.yml | 8 + .../scheduler_plugins/defaults/main.yml | 29 +++ .../scheduler_plugins/tasks/main.yml | 68 ++++++ ...appgroup.diktyo.x-k8s.io_appgroups.yaml.j2 | 197 ++++++++++++++++++ .../templates/cm-scheduler-plugins.yaml.j2 | 28 +++ .../deploy-scheduler-plugins.yaml.j2 | 74 +++++++ .../templates/namespace.yaml.j2 | 7 + ....diktyo.x-k8s.io_networktopologies.yaml.j2 | 148 +++++++++++++ .../templates/rbac-scheduler-plugins.yaml.j2 | 140 +++++++++++++ .../templates/sa-scheduler-plugins.yaml.j2 | 11 + .../scheduling.x-k8s.io_elasticquotas.yaml.j2 | 82 ++++++++ .../scheduling.x-k8s.io_podgroups.yaml.j2 | 97 +++++++++ ...node.k8s.io_noderesourcetopologies.yaml.j2 | 153 ++++++++++++++ .../defaults/main/download.yml | 12 ++ .../kubespray-defaults/defaults/main/main.yml | 3 + tests/files/packet_almalinux8-calico.yml | 3 + 17 files changed, 1109 insertions(+) create mode 100644 docs/kubernetes-apps/scheduler_plugins.md create mode 100644 roles/kubernetes-apps/scheduler_plugins/defaults/main.yml create mode 100644 roles/kubernetes-apps/scheduler_plugins/tasks/main.yml create mode 100644 roles/kubernetes-apps/scheduler_plugins/templates/appgroup.diktyo.x-k8s.io_appgroups.yaml.j2 create mode 100644 roles/kubernetes-apps/scheduler_plugins/templates/cm-scheduler-plugins.yaml.j2 create mode 100644 roles/kubernetes-apps/scheduler_plugins/templates/deploy-scheduler-plugins.yaml.j2 create mode 100644 roles/kubernetes-apps/scheduler_plugins/templates/namespace.yaml.j2 create mode 100644 roles/kubernetes-apps/scheduler_plugins/templates/networktopology.diktyo.x-k8s.io_networktopologies.yaml.j2 create mode 100644 
roles/kubernetes-apps/scheduler_plugins/templates/rbac-scheduler-plugins.yaml.j2 create mode 100644 roles/kubernetes-apps/scheduler_plugins/templates/sa-scheduler-plugins.yaml.j2 create mode 100644 roles/kubernetes-apps/scheduler_plugins/templates/scheduling.x-k8s.io_elasticquotas.yaml.j2 create mode 100644 roles/kubernetes-apps/scheduler_plugins/templates/scheduling.x-k8s.io_podgroups.yaml.j2 create mode 100644 roles/kubernetes-apps/scheduler_plugins/templates/topology.node.k8s.io_noderesourcetopologies.yaml.j2 diff --git a/docs/kubernetes-apps/scheduler_plugins.md b/docs/kubernetes-apps/scheduler_plugins.md new file mode 100644 index 000000000..85e37910a --- /dev/null +++ b/docs/kubernetes-apps/scheduler_plugins.md @@ -0,0 +1,49 @@ +# Scheduler plugins for Kubernetes + +[scheduler-plugins](https://github.com/kubernetes-sigs/scheduler-plugins) is a set of out-of-tree scheduler plugins based on the [scheduler framework](https://kubernetes.io/docs/concepts/scheduling-eviction/scheduling-framework/). 
+ +The kube-scheduler binary includes a list of plugins: + +- [CapacityScheduling](https://github.com/kubernetes-sigs/scheduler-plugins/tree/master/pkg/capacityscheduling) [Beta] +- [CoScheduling](https://github.com/kubernetes-sigs/scheduler-plugins/tree/master/pkg/coscheduling) [Beta] +- [NodeResources](https://github.com/kubernetes-sigs/scheduler-plugins/tree/master/pkg/noderesources) [Beta] +- [NodeResourceTopology](https://github.com/kubernetes-sigs/scheduler-plugins/blob/master/pkg/noderesourcetopology/README.md) [Beta] +- [PreemptionToleration](https://github.com/kubernetes-sigs/scheduler-plugins/blob/master/pkg/preemptiontoleration/README.md) [Alpha] +- [Trimaran](https://github.com/kubernetes-sigs/scheduler-plugins/blob/master/pkg/trimaran/README.md) [Alpha] +- [NetworkAware](https://github.com/kubernetes-sigs/scheduler-plugins/blob/master/pkg/networkaware/README.md) [Sample] +- [CrossNodePreemption](https://github.com/kubernetes-sigs/scheduler-plugins/blob/master/pkg/crossnodepreemption/README.md) [Sample] +- [PodState](https://github.com/kubernetes-sigs/scheduler-plugins/blob/master/pkg/podstate/README.md) [Sample] +- [QualityOfService](https://github.com/kubernetes-sigs/scheduler-plugins/blob/master/pkg/qos/README.md) [Sample] + +Currently, we use the [helm chart](https://github.com/kubernetes-sigs/scheduler-plugins/blob/master/manifests/install/charts/as-a-second-scheduler/README.md#installing-the-chart) to install the scheduler plugins, so that a second scheduler is created and run. **Note that running multiple schedulers will inevitably lead to resource conflicts when the cluster is short of resources**. + +## Compatibility Matrix + +There are requirements for the version of Kubernetes; please see the [Compatibility Matrix +](https://github.com/kubernetes-sigs/scheduler-plugins/tree/master?tab=readme-ov-file#compatibility-matrix). It deserves careful attention. 
+ +| Scheduler Plugins | Compiled With K8s Version | +| ----------------- | ------------------------- | +| v0.27.8 | v1.27.8 | +| v0.26.8 | v1.26.7 | + +## Turning it on + + The `scheduler_plugins_enabled` option is used to enable the installation of scheduler plugins. + + You can enable or disable some plugins by setting the `scheduler_plugins_enabled_plugins` or `scheduler_plugins_disabled_plugins` option. They must be in the list we mentioned above. + + In addition, to use a custom plugin configuration, set a value for the `scheduler_plugins_plugin_config` option. + + For example, to customize the permit waiting timeout of the Coscheduling plugin to 10 seconds: + + ```yaml + scheduler_plugins_plugin_config: + - name: Coscheduling + args: + permitWaitingTimeSeconds: 10 # default is 60 + ``` + +## Leverage plugin + + Once the plugin is installed, we can apply CRs to the cluster. For example, if using `CoScheduling`, we can apply the CR and test the deployment in the [example](https://github.com/kubernetes-sigs/scheduler-plugins/blob/master/doc/install.md#test-coscheduling). 
diff --git a/roles/kubernetes-apps/meta/main.yml b/roles/kubernetes-apps/meta/main.yml index 9c19fdebd..1b9cd6be8 100644 --- a/roles/kubernetes-apps/meta/main.yml +++ b/roles/kubernetes-apps/meta/main.yml @@ -124,3 +124,11 @@ dependencies: - inventory_hostname == groups['kube_control_plane'][0] tags: - argocd + + - role: kubernetes-apps/scheduler_plugins + when: + - scheduler_plugins_enabled + - kube_major_version is version('v1.28', '<') + - inventory_hostname == groups['kube_control_plane'][0] + tags: + - scheduler_plugins diff --git a/roles/kubernetes-apps/scheduler_plugins/defaults/main.yml b/roles/kubernetes-apps/scheduler_plugins/defaults/main.yml new file mode 100644 index 000000000..43f50f544 --- /dev/null +++ b/roles/kubernetes-apps/scheduler_plugins/defaults/main.yml @@ -0,0 +1,29 @@ +--- +scheduler_plugins_enabled: false + +scheduler_plugins_namespace: scheduler-plugins + +scheduler_plugins_controller_replicas: 1 + +scheduler_plugins_scheduler_replicas: 1 + +# The default is determined by the number of control plane nodes. +scheduler_plugins_scheduler_leader_elect: "{{ ((groups['kube_control_plane'] | length) > 1) }}" + +# Plugins to enable. See https://github.com/kubernetes-sigs/scheduler-plugins/blob/master/manifests/install/charts/as-a-second-scheduler/README.md#configuration for more info. +scheduler_plugins_enabled_plugins: + - Coscheduling + - CapacityScheduling + - NodeResourceTopologyMatch + - NodeResourcesAllocatable + +# Plugins to disable. See https://github.com/kubernetes-sigs/scheduler-plugins/blob/master/manifests/install/charts/as-a-second-scheduler/README.md#configuration for more info. +scheduler_plugins_disabled_plugins: + - PrioritySort + +# Customize the enabled plugins' config. +# Refer to the "pluginConfig" section of the per-plugin scheduler-config.yaml, e.g. https://github.com/kubernetes-sigs/scheduler-plugins/blob/master/manifests/coscheduling/scheduler-config.yaml. 
+scheduler_plugins_plugin_config: + - name: Coscheduling + args: + permitWaitingTimeSeconds: 10 # default is 60 diff --git a/roles/kubernetes-apps/scheduler_plugins/tasks/main.yml b/roles/kubernetes-apps/scheduler_plugins/tasks/main.yml new file mode 100644 index 000000000..d17b19128 --- /dev/null +++ b/roles/kubernetes-apps/scheduler_plugins/tasks/main.yml @@ -0,0 +1,68 @@ +--- +- name: Scheduler Plugins | Ensure dir exists + file: + path: "{{ kube_config_dir }}/scheduler-plugins" + state: directory + owner: root + group: root + mode: "0755" + when: inventory_hostname == groups['kube_control_plane'][0] + tags: + - scheduler_plugins + +- name: Scheduler Plugins | Create manifests + template: + src: "{{ item.file }}.j2" + dest: "{{ kube_config_dir }}/scheduler-plugins/{{ item.file }}" + mode: "0644" + with_items: + - { name: appgroup, file: appgroup.diktyo.x-k8s.io_appgroups.yaml, type: crd } + - { name: networktopology, file: networktopology.diktyo.x-k8s.io_networktopologies.yaml, type: crd } + - { name: elasticquotas, file: scheduling.x-k8s.io_elasticquotas.yaml, type: crd } + - { name: podgroups, file: scheduling.x-k8s.io_podgroups.yaml, type: crd } + - { name: noderesourcetopologies, file: topology.node.k8s.io_noderesourcetopologies.yaml, type: crd } + - { name: namespace, file: namespace.yaml, type: namespace } + - { name: sa, file: sa-scheduler-plugins.yaml, type: serviceaccount } + - { name: rbac, file: rbac-scheduler-plugins.yaml, type: rbac } + - { name: cm, file: cm-scheduler-plugins.yaml, type: configmap } + - { name: deploy, file: deploy-scheduler-plugins.yaml, type: deployment } + register: scheduler_plugins_manifests + when: inventory_hostname == groups['kube_control_plane'][0] + tags: + - scheduler_plugins + +- name: Scheduler Plugins | Apply manifests + kube: + name: "{{ item.item.name }}" + kubectl: "{{ bin_dir }}/kubectl" + resource: "{{ item.item.type }}" + filename: "{{ kube_config_dir }}/scheduler-plugins/{{ item.item.file }}" + state: "latest" + 
with_items: "{{ scheduler_plugins_manifests.results }}" + when: inventory_hostname == groups['kube_control_plane'][0] + tags: + - scheduler_plugins + +- name: Scheduler Plugins | Wait for controller pods to be ready + command: "{{ kubectl }} -n {{ scheduler_plugins_namespace }} get pods -l app=scheduler-plugins-controller -o jsonpath='{.items[?(@.status.containerStatuses[0].ready==false)].metadata.name}'" # noqa ignore-errors + register: controller_pods_not_ready + until: controller_pods_not_ready.stdout.find("scheduler-plugins-controller")==-1 + retries: 30 + delay: 10 + ignore_errors: true + changed_when: false + when: inventory_hostname == groups['kube_control_plane'][0] + tags: + - scheduler_plugins + +- name: Scheduler Plugins | Wait for scheduler pods to be ready + command: "{{ kubectl }} -n {{ scheduler_plugins_namespace }} get pods -l component=scheduler -o jsonpath='{.items[?(@.status.containerStatuses[0].ready==false)].metadata.name}'" # noqa ignore-errors + register: scheduler_pods_not_ready + until: scheduler_pods_not_ready.stdout.find("scheduler-plugins-scheduler")==-1 + retries: 30 + delay: 10 + ignore_errors: true + changed_when: false + when: inventory_hostname == groups['kube_control_plane'][0] + tags: + - scheduler_plugins diff --git a/roles/kubernetes-apps/scheduler_plugins/templates/appgroup.diktyo.x-k8s.io_appgroups.yaml.j2 b/roles/kubernetes-apps/scheduler_plugins/templates/appgroup.diktyo.x-k8s.io_appgroups.yaml.j2 new file mode 100644 index 000000000..757a3b12d --- /dev/null +++ b/roles/kubernetes-apps/scheduler_plugins/templates/appgroup.diktyo.x-k8s.io_appgroups.yaml.j2 @@ -0,0 +1,197 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + api-approved.kubernetes.io: https://github.com/kubernetes-sigs/scheduler-plugins/pull/432 # edited manually + controller-gen.kubebuilder.io/version: v0.11.1 + creationTimestamp: null + name: appgroups.appgroup.diktyo.x-k8s.io +spec: + group: 
appgroup.diktyo.x-k8s.io + names: + kind: AppGroup + listKind: AppGroupList + plural: appgroups + shortNames: + - ag + singular: appgroup + scope: Namespaced + versions: + - name: v1alpha1 + schema: + openAPIV3Schema: + description: AppGroup is a collection of Pods belonging to the same application. + properties: + apiVersion: + description: 'APIVersion defines the versioned schema of this representation + of an object. Servers should convert recognized schemas to the latest + internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' + type: string + kind: + description: 'Kind is a string value representing the REST resource this + object represents. Servers may infer this from the endpoint the client + submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' + type: string + metadata: + type: object + spec: + description: AppGroupSpec defines the number of Pods and which Pods belong + to the group. + properties: + numMembers: + description: NumMembers defines the number of Pods belonging to the + App Group + format: int32 + minimum: 1 + type: integer + topologySortingAlgorithm: + description: The preferred Topology Sorting Algorithm + type: string + workloads: + description: Workloads defines the workloads belonging to the group + items: + description: AppGroupWorkload represents the Workloads belonging + to the App Group. + properties: + dependencies: + description: Dependencies of the Workload. + items: + description: DependenciesInfo contains information about one + dependency. 
+ properties: + maxNetworkCost: + description: Max Network Cost between workloads + format: int64 + maximum: 10000 + minimum: 0 + type: integer + minBandwidth: + anyOf: + - type: integer + - type: string + description: MinBandwidth between workloads + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + workload: + description: Workload reference Info. + properties: + apiVersion: + description: ApiVersion defines the versioned schema + of an object. + type: string + kind: + description: 'Kind of the workload, info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds"' + type: string + name: + description: 'Name represents the workload, info: + http://kubernetes.io/docs/user-guide/identifiers#names' + type: string + namespace: + description: Namespace of the workload + type: string + selector: + description: Selector defines how to find Pods related + to the Workload (key = workload). (e.g., workload=w1) + type: string + required: + - kind + - name + - selector + type: object + required: + - workload + type: object + type: array + workload: + description: Workload reference Info. + properties: + apiVersion: + description: ApiVersion defines the versioned schema of + an object. + type: string + kind: + description: 'Kind of the workload, info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds"' + type: string + name: + description: 'Name represents the workload, info: http://kubernetes.io/docs/user-guide/identifiers#names' + type: string + namespace: + description: Namespace of the workload + type: string + selector: + description: Selector defines how to find Pods related to + the Workload (key = workload). 
(e.g., workload=w1) + type: string + required: + - kind + - name + - selector + type: object + required: + - workload + type: object + type: array + required: + - numMembers + - topologySortingAlgorithm + - workloads + type: object + status: + description: AppGroupStatus defines the observed use. + properties: + runningWorkloads: + description: The number of actively running workloads (e.g., number + of pods). + format: int32 + minimum: 0 + type: integer + scheduleStartTime: + description: ScheduleStartTime of the group + format: date-time + type: string + topologyCalculationTime: + description: TopologyCalculationTime of the group + format: date-time + type: string + topologyOrder: + description: Topology order for TopSort plugin (QueueSort) + items: + description: AppGroupTopologyInfo represents the calculated order + for a given Workload. + properties: + index: + description: Topology index. + format: int32 + type: integer + workload: + description: Workload reference Info. + properties: + apiVersion: + description: ApiVersion defines the versioned schema of + an object. + type: string + kind: + description: 'Kind of the workload, info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds"' + type: string + name: + description: 'Name represents the workload, info: http://kubernetes.io/docs/user-guide/identifiers#names' + type: string + namespace: + description: Namespace of the workload + type: string + selector: + description: Selector defines how to find Pods related to + the Workload (key = workload). 
(e.g., workload=w1) + type: string + required: + - kind + - name + - selector + type: object + type: object + type: array + type: object + type: object + served: true + storage: true \ No newline at end of file diff --git a/roles/kubernetes-apps/scheduler_plugins/templates/cm-scheduler-plugins.yaml.j2 b/roles/kubernetes-apps/scheduler_plugins/templates/cm-scheduler-plugins.yaml.j2 new file mode 100644 index 000000000..7e022e889 --- /dev/null +++ b/roles/kubernetes-apps/scheduler_plugins/templates/cm-scheduler-plugins.yaml.j2 @@ -0,0 +1,28 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: scheduler-config + namespace: {{ scheduler_plugins_namespace }} +data: + scheduler-config.yaml: | + apiVersion: kubescheduler.config.k8s.io/v1 + kind: KubeSchedulerConfiguration + leaderElection: + leaderElect: {{ scheduler_plugins_scheduler_leader_elect | bool | lower }} + profiles: + # Compose all plugins in one profile + - schedulerName: scheduler-plugins-scheduler + plugins: + multiPoint: + enabled: +{% for enabled_plugin in scheduler_plugins_enabled_plugins %} + - name: {{ enabled_plugin }} +{% endfor %} + disabled: +{% for disabled_plugin in scheduler_plugins_disabled_plugins %} + - name: {{ disabled_plugin }} +{% endfor %} +{% if scheduler_plugins_plugin_config is defined and scheduler_plugins_plugin_config | length != 0 %} + pluginConfig: +{{ scheduler_plugins_plugin_config | to_nice_yaml(indent=2, width=256) | indent(6, true) }} +{% endif %} \ No newline at end of file diff --git a/roles/kubernetes-apps/scheduler_plugins/templates/deploy-scheduler-plugins.yaml.j2 b/roles/kubernetes-apps/scheduler_plugins/templates/deploy-scheduler-plugins.yaml.j2 new file mode 100644 index 000000000..114698a94 --- /dev/null +++ b/roles/kubernetes-apps/scheduler_plugins/templates/deploy-scheduler-plugins.yaml.j2 @@ -0,0 +1,74 @@ +kind: Deployment +apiVersion: apps/v1 +metadata: + name: scheduler-plugins-controller + namespace: {{ scheduler_plugins_namespace }} + labels: + app: 
scheduler-plugins-controller +spec: + replicas: {{ scheduler_plugins_controller_replicas }} + selector: + matchLabels: + app: scheduler-plugins-controller + template: + metadata: + labels: + app: scheduler-plugins-controller + spec: + serviceAccountName: scheduler-plugins-controller + containers: + - name: scheduler-plugins-controller + image: {{ scheduler_plugins_controller_image_repo }}:{{ scheduler_plugins_controller_image_tag }} + imagePullPolicy: {{ k8s_image_pull_policy }} +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + component: scheduler + name: scheduler-plugins-scheduler + namespace: {{ scheduler_plugins_namespace }} +spec: + selector: + matchLabels: + component: scheduler + replicas: {{ scheduler_plugins_scheduler_replicas }} + template: + metadata: + labels: + component: scheduler + spec: + serviceAccountName: scheduler-plugins-scheduler + containers: + - command: + - /bin/kube-scheduler + - --config=/etc/kubernetes/scheduler-config.yaml + image: {{ scheduler_plugins_scheduler_image_repo }}:{{ scheduler_plugins_scheduler_image_tag }} + imagePullPolicy: {{ k8s_image_pull_policy }} + livenessProbe: + httpGet: + path: /healthz + port: 10259 + scheme: HTTPS + initialDelaySeconds: 15 + name: scheduler-plugins-scheduler + readinessProbe: + httpGet: + path: /healthz + port: 10259 + scheme: HTTPS + resources: + requests: + cpu: '0.1' + securityContext: + privileged: false + volumeMounts: + - name: scheduler-config + mountPath: /etc/kubernetes + readOnly: true + hostNetwork: false + hostPID: false + volumes: + - name: scheduler-config + configMap: + name: scheduler-config \ No newline at end of file diff --git a/roles/kubernetes-apps/scheduler_plugins/templates/namespace.yaml.j2 b/roles/kubernetes-apps/scheduler_plugins/templates/namespace.yaml.j2 new file mode 100644 index 000000000..d54ae66fd --- /dev/null +++ b/roles/kubernetes-apps/scheduler_plugins/templates/namespace.yaml.j2 @@ -0,0 +1,7 @@ +--- +apiVersion: v1 +kind: Namespace 
+metadata: + name: {{ scheduler_plugins_namespace }} + labels: + name: {{ scheduler_plugins_namespace }} \ No newline at end of file diff --git a/roles/kubernetes-apps/scheduler_plugins/templates/networktopology.diktyo.x-k8s.io_networktopologies.yaml.j2 b/roles/kubernetes-apps/scheduler_plugins/templates/networktopology.diktyo.x-k8s.io_networktopologies.yaml.j2 new file mode 100644 index 000000000..e33157c0f --- /dev/null +++ b/roles/kubernetes-apps/scheduler_plugins/templates/networktopology.diktyo.x-k8s.io_networktopologies.yaml.j2 @@ -0,0 +1,148 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + api-approved.kubernetes.io: https://github.com/kubernetes-sigs/scheduler-plugins/pull/432 # edited manually + controller-gen.kubebuilder.io/version: v0.11.1 + creationTimestamp: null + name: networktopologies.networktopology.diktyo.x-k8s.io +spec: + group: networktopology.diktyo.x-k8s.io + names: + kind: NetworkTopology + listKind: NetworkTopologyList + plural: networktopologies + shortNames: + - nt + singular: networktopology + scope: Namespaced + versions: + - name: v1alpha1 + schema: + openAPIV3Schema: + description: NetworkTopology defines network costs in the cluster between + regions and zones + properties: + apiVersion: + description: 'APIVersion defines the versioned schema of this representation + of an object. Servers should convert recognized schemas to the latest + internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' + type: string + kind: + description: 'Kind is a string value representing the REST resource this + object represents. Servers may infer this from the endpoint the client + submits requests to. Cannot be updated. In CamelCase. 
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' + type: string + metadata: + type: object + spec: + description: NetworkTopologySpec defines the zones and regions of the + cluster. + properties: + configmapName: + description: ConfigmapName to be used for cost calculation + type: string + weights: + description: The manual defined weights of the cluster + items: + description: WeightInfo contains information about all network costs + for a given algorithm. + properties: + name: + description: Algorithm Name for network cost calculation (e.g., + userDefined) + type: string + topologyList: + description: TopologyList owns Costs between origins + items: + description: TopologyInfo contains information about network + costs for a particular Topology Key. + properties: + originList: + description: OriginList for a particular origin. + items: + description: OriginInfo contains information about network + costs for a particular Origin. + properties: + costList: + description: Costs for the particular origin. + items: + description: CostInfo contains information about + networkCosts. + properties: + bandwidthAllocated: + anyOf: + - type: integer + - type: string + description: Bandwidth allocated between origin + and destination. + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + bandwidthCapacity: + anyOf: + - type: integer + - type: string + description: Bandwidth capacity between origin + and destination. + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + destination: + description: Name of the destination (e.g., + Region Name, Zone Name). 
+ type: string + networkCost: + description: Network Cost between origin and + destination (e.g., Dijkstra shortest path, + etc) + format: int64 + minimum: 0 + type: integer + required: + - destination + - networkCost + type: object + type: array + origin: + description: Name of the origin (e.g., Region Name, + Zone Name). + type: string + required: + - origin + type: object + type: array + topologyKey: + description: Topology key (e.g., "topology.kubernetes.io/region", + "topology.kubernetes.io/zone"). + type: string + required: + - originList + - topologyKey + type: object + type: array + required: + - name + - topologyList + type: object + type: array + required: + - configmapName + - weights + type: object + status: + description: NetworkTopologyStatus defines the observed use. + properties: + nodeCount: + description: The total number of nodes in the cluster + format: int64 + minimum: 0 + type: integer + weightCalculationTime: + description: The calculation time for the weights in the network topology + CRD + format: date-time + type: string + type: object + type: object + served: true + storage: true \ No newline at end of file diff --git a/roles/kubernetes-apps/scheduler_plugins/templates/rbac-scheduler-plugins.yaml.j2 b/roles/kubernetes-apps/scheduler_plugins/templates/rbac-scheduler-plugins.yaml.j2 new file mode 100644 index 000000000..aa6f211d7 --- /dev/null +++ b/roles/kubernetes-apps/scheduler_plugins/templates/rbac-scheduler-plugins.yaml.j2 @@ -0,0 +1,140 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: scheduler-plugins-scheduler +rules: +- apiGroups: [""] + resources: ["namespaces"] + verbs: ["get", "list", "watch"] +- apiGroups: ["", "events.k8s.io"] + resources: ["events"] + verbs: ["create", "patch", "update"] +- apiGroups: ["coordination.k8s.io"] + resources: ["leases"] + verbs: ["create"] +- apiGroups: ["coordination.k8s.io"] + resourceNames: ["kube-scheduler"] + resources: ["leases"] + verbs: ["get", "update"] 
+- apiGroups: [""] + resources: ["endpoints"] + verbs: ["create"] +- apiGroups: [""] + resourceNames: ["kube-scheduler"] + resources: ["endpoints"] + verbs: ["get", "update"] +- apiGroups: [""] + resources: ["nodes"] + verbs: ["get", "list", "watch", "patch"] +- apiGroups: [""] + resources: ["pods"] + verbs: ["delete", "get", "list", "watch", "update"] +- apiGroups: [""] + resources: ["bindings", "pods/binding"] + verbs: ["create"] +- apiGroups: [""] + resources: ["pods/status"] + verbs: ["patch", "update"] +- apiGroups: [""] + resources: ["replicationcontrollers", "services"] + verbs: ["get", "list", "watch"] +- apiGroups: ["apps", "extensions"] + resources: ["replicasets"] + verbs: ["get", "list", "watch"] +- apiGroups: ["apps"] + resources: ["statefulsets"] + verbs: ["get", "list", "watch"] +- apiGroups: ["policy"] + resources: ["poddisruptionbudgets"] + verbs: ["get", "list", "watch"] +- apiGroups: [""] + resources: ["persistentvolumeclaims", "persistentvolumes"] + verbs: ["get", "list", "watch", "patch", "update"] +- apiGroups: ["authentication.k8s.io"] + resources: ["tokenreviews"] + verbs: ["create"] +- apiGroups: ["authorization.k8s.io"] + resources: ["subjectaccessreviews"] + verbs: ["create"] +- apiGroups: ["storage.k8s.io"] + resources: ["csinodes", "storageclasses" , "csidrivers" , "csistoragecapacities"] + verbs: ["get", "list", "watch"] +- apiGroups: ["topology.node.k8s.io"] + resources: ["noderesourcetopologies"] + verbs: ["get", "list", "watch"] +# resources need to be updated with the scheduler plugins used +- apiGroups: ["scheduling.x-k8s.io"] + resources: ["podgroups", "elasticquotas", "podgroups/status", "elasticquotas/status"] + verbs: ["get", "list", "watch", "create", "delete", "update", "patch"] +# for network-aware plugins add the following lines (scheduler-plugins v0.27.8) +#- apiGroups: [ "appgroup.diktyo.x-k8s.io" ] +# resources: [ "appgroups" ] +# verbs: [ "get", "list", "watch", "create", "delete", "update", "patch" ] +#- apiGroups: [ 
"networktopology.diktyo.x-k8s.io" ] +# resources: [ "networktopologies" ] +# verbs: [ "get", "list", "watch", "create", "delete", "update", "patch" ] +--- +kind: ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: scheduler-plugins-scheduler +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: scheduler-plugins-scheduler +subjects: +- kind: ServiceAccount + name: scheduler-plugins-scheduler + namespace: {{ scheduler_plugins_namespace }} +--- +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: scheduler-plugins-controller +rules: +- apiGroups: [""] + resources: ["pods"] + verbs: ["get", "list", "watch"] +- apiGroups: [""] + resources: ["events"] + verbs: ["create", "patch", "update"] +- apiGroups: [""] + resources: ["nodes"] + verbs: ["get", "list", "watch", "patch"] +- apiGroups: ["topology.node.k8s.io"] + resources: ["noderesourcetopologies"] + verbs: ["get", "list", "watch"] +# resources need to be updated with the scheduler plugins used +- apiGroups: ["scheduling.x-k8s.io"] + resources: ["podgroups", "elasticquotas", "podgroups/status", "elasticquotas/status"] + verbs: ["get", "list", "watch", "create", "delete", "update", "patch"] +--- +kind: ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: scheduler-plugins-controller +subjects: +- kind: ServiceAccount + name: scheduler-plugins-controller + namespace: {{ scheduler_plugins_namespace }} +roleRef: + kind: ClusterRole + name: scheduler-plugins-controller + apiGroup: rbac.authorization.k8s.io +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: sched-plugins::extension-apiserver-authentication-reader + namespace: kube-system +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: extension-apiserver-authentication-reader +subjects: +- kind: ServiceAccount + name: scheduler-plugins-scheduler + namespace: {{ scheduler_plugins_namespace }} +- kind: ServiceAccount + 
name: scheduler-plugins-controller + namespace: {{ scheduler_plugins_namespace }} \ No newline at end of file diff --git a/roles/kubernetes-apps/scheduler_plugins/templates/sa-scheduler-plugins.yaml.j2 b/roles/kubernetes-apps/scheduler_plugins/templates/sa-scheduler-plugins.yaml.j2 new file mode 100644 index 000000000..6c25e1809 --- /dev/null +++ b/roles/kubernetes-apps/scheduler_plugins/templates/sa-scheduler-plugins.yaml.j2 @@ -0,0 +1,11 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: scheduler-plugins-scheduler + namespace: {{ scheduler_plugins_namespace }} +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: scheduler-plugins-controller + namespace: {{ scheduler_plugins_namespace }} \ No newline at end of file diff --git a/roles/kubernetes-apps/scheduler_plugins/templates/scheduling.x-k8s.io_elasticquotas.yaml.j2 b/roles/kubernetes-apps/scheduler_plugins/templates/scheduling.x-k8s.io_elasticquotas.yaml.j2 new file mode 100644 index 000000000..d63f57209 --- /dev/null +++ b/roles/kubernetes-apps/scheduler_plugins/templates/scheduling.x-k8s.io_elasticquotas.yaml.j2 @@ -0,0 +1,82 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + api-approved.kubernetes.io: https://github.com/kubernetes-sigs/scheduler-plugins/pull/52 + controller-gen.kubebuilder.io/version: v0.11.1 + creationTimestamp: null + name: elasticquotas.scheduling.x-k8s.io +spec: + group: scheduling.x-k8s.io + names: + kind: ElasticQuota + listKind: ElasticQuotaList + plural: elasticquotas + shortNames: + - eq + - eqs + singular: elasticquota + scope: Namespaced + versions: + - name: v1alpha1 + schema: + openAPIV3Schema: + description: ElasticQuota sets elastic quota restrictions per namespace + properties: + apiVersion: + description: 'APIVersion defines the versioned schema of this representation + of an object. Servers should convert recognized schemas to the latest + internal value, and may reject unrecognized values. 
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' + type: string + kind: + description: 'Kind is a string value representing the REST resource this + object represents. Servers may infer this from the endpoint the client + submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' + type: string + metadata: + type: object + spec: + description: ElasticQuotaSpec defines the Min and Max for Quota. + properties: + max: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: Max is the set of desired max limits for each named resource. + The usage of max is based on the resource configurations of successfully + scheduled pods. + type: object + min: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: Min is the set of desired guaranteed limits for each + named resource. + type: object + type: object + status: + description: ElasticQuotaStatus defines the observed use. + properties: + used: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: Used is the current observed total usage of the resource + in the namespace. 
+                type: object
+            type: object
+        type: object
+    served: true
+    storage: true
+    subresources:
+      status: {}
\ No newline at end of file
diff --git a/roles/kubernetes-apps/scheduler_plugins/templates/scheduling.x-k8s.io_podgroups.yaml.j2 b/roles/kubernetes-apps/scheduler_plugins/templates/scheduling.x-k8s.io_podgroups.yaml.j2
new file mode 100644
index 000000000..3767cf962
--- /dev/null
+++ b/roles/kubernetes-apps/scheduler_plugins/templates/scheduling.x-k8s.io_podgroups.yaml.j2
@@ -0,0 +1,97 @@
+---
+apiVersion: apiextensions.k8s.io/v1
+kind: CustomResourceDefinition
+metadata:
+  annotations:
+    api-approved.kubernetes.io: https://github.com/kubernetes-sigs/scheduler-plugins/pull/50
+    controller-gen.kubebuilder.io/version: v0.11.1
+  creationTimestamp: null
+  name: podgroups.scheduling.x-k8s.io
+spec:
+  group: scheduling.x-k8s.io
+  names:
+    kind: PodGroup
+    listKind: PodGroupList
+    plural: podgroups
+    shortNames:
+    - pg
+    - pgs
+    singular: podgroup
+  scope: Namespaced
+  versions:
+  - name: v1alpha1
+    schema:
+      openAPIV3Schema:
+        description: PodGroup is a collection of Pod; used for batch workload.
+        properties:
+          apiVersion:
+            description: 'APIVersion defines the versioned schema of this representation
+              of an object. Servers should convert recognized schemas to the latest
+              internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources'
+            type: string
+          kind:
+            description: 'Kind is a string value representing the REST resource this
+              object represents. Servers may infer this from the endpoint the client
+              submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds'
+            type: string
+          metadata:
+            type: object
+          spec:
+            description: Specification of the desired behavior of the pod group.
+            properties:
+              minMember:
+                description: MinMember defines the minimal number of members/tasks
+                  to run the pod group; if there's not enough resources to start all
+                  tasks, the scheduler will not start anyone.
+                format: int32
+                type: integer
+              minResources:
+                additionalProperties:
+                  anyOf:
+                  - type: integer
+                  - type: string
+                  pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
+                  x-kubernetes-int-or-string: true
+                description: MinResources defines the minimal resource of members/tasks
+                  to run the pod group; if there's not enough resources to start all
+                  tasks, the scheduler will not start anyone.
+                type: object
+              scheduleTimeoutSeconds:
+                description: ScheduleTimeoutSeconds defines the maximal time of members/tasks
+                  to wait before run the pod group;
+                format: int32
+                type: integer
+            type: object
+          status:
+            description: Status represents the current information about a pod group.
+              This data may not be up to date.
+            properties:
+              failed:
+                description: The number of pods which reached phase Failed.
+                format: int32
+                type: integer
+              occupiedBy:
+                description: OccupiedBy marks the workload (e.g., deployment, statefulset)
+                  UID that occupy the podgroup. It is empty if not initialized.
+                type: string
+              phase:
+                description: Current phase of PodGroup.
+                type: string
+              running:
+                description: The number of actively running pods.
+                format: int32
+                type: integer
+              scheduleStartTime:
+                description: ScheduleStartTime of the group
+                format: date-time
+                type: string
+              succeeded:
+                description: The number of pods which reached phase Succeeded.
+                format: int32
+                type: integer
+            type: object
+        type: object
+    served: true
+    storage: true
+    subresources:
+      status: {}
\ No newline at end of file
diff --git a/roles/kubernetes-apps/scheduler_plugins/templates/topology.node.k8s.io_noderesourcetopologies.yaml.j2 b/roles/kubernetes-apps/scheduler_plugins/templates/topology.node.k8s.io_noderesourcetopologies.yaml.j2
new file mode 100644
index 000000000..d83ef0b9b
--- /dev/null
+++ b/roles/kubernetes-apps/scheduler_plugins/templates/topology.node.k8s.io_noderesourcetopologies.yaml.j2
@@ -0,0 +1,153 @@
+---
+apiVersion: apiextensions.k8s.io/v1
+kind: CustomResourceDefinition
+metadata:
+  annotations:
+    api-approved.kubernetes.io: https://github.com/kubernetes/enhancements/pull/1870
+    controller-gen.kubebuilder.io/version: v0.11.1
+  creationTimestamp: null
+  name: noderesourcetopologies.topology.node.k8s.io
+spec:
+  group: topology.node.k8s.io
+  names:
+    kind: NodeResourceTopology
+    listKind: NodeResourceTopologyList
+    plural: noderesourcetopologies
+    shortNames:
+    - node-res-topo
+    singular: noderesourcetopology
+  scope: Cluster
+  versions:
+  - name: v1alpha2
+    schema:
+      openAPIV3Schema:
+        description: NodeResourceTopology describes node resources and their topology.
+        properties:
+          apiVersion:
+            description: 'APIVersion defines the versioned schema of this representation
+              of an object. Servers should convert recognized schemas to the latest
+              internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources'
+            type: string
+          attributes:
+            description: AttributeList contains an array of AttributeInfo objects.
+            items:
+              description: AttributeInfo contains one attribute of a Zone.
+              properties:
+                name:
+                  type: string
+                value:
+                  type: string
+              required:
+              - name
+              - value
+              type: object
+            type: array
+          kind:
+            description: 'Kind is a string value representing the REST resource this
+              object represents. Servers may infer this from the endpoint the client
+              submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds'
+            type: string
+          metadata:
+            type: object
+          topologyPolicies:
+            description: 'DEPRECATED (to be removed in v1beta1): use top level attributes
+              if needed'
+            items:
+              type: string
+            type: array
+          zones:
+            description: ZoneList contains an array of Zone objects.
+            items:
+              description: Zone represents a resource topology zone, e.g. socket,
+                node, die or core.
+              properties:
+                attributes:
+                  description: AttributeList contains an array of AttributeInfo objects.
+                  items:
+                    description: AttributeInfo contains one attribute of a Zone.
+                    properties:
+                      name:
+                        type: string
+                      value:
+                        type: string
+                    required:
+                    - name
+                    - value
+                    type: object
+                  type: array
+                costs:
+                  description: CostList contains an array of CostInfo objects.
+                  items:
+                    description: CostInfo describes the cost (or distance) between
+                      two Zones.
+                    properties:
+                      name:
+                        type: string
+                      value:
+                        format: int64
+                        type: integer
+                    required:
+                    - name
+                    - value
+                    type: object
+                  type: array
+                name:
+                  type: string
+                parent:
+                  type: string
+                resources:
+                  description: ResourceInfoList contains an array of ResourceInfo
+                    objects.
+                  items:
+                    description: ResourceInfo contains information about one resource
+                      type.
+                    properties:
+                      allocatable:
+                        anyOf:
+                        - type: integer
+                        - type: string
+                        description: Allocatable quantity of the resource, corresponding
+                          to allocatable in node status, i.e. total amount of this
+                          resource available to be used by pods.
+                        pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
+                        x-kubernetes-int-or-string: true
+                      available:
+                        anyOf:
+                        - type: integer
+                        - type: string
+                        description: Available is the amount of this resource currently
+                          available for new (to be scheduled) pods, i.e. Allocatable
+                          minus the resources reserved by currently running pods.
+                        pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
+                        x-kubernetes-int-or-string: true
+                      capacity:
+                        anyOf:
+                        - type: integer
+                        - type: string
+                        description: Capacity of the resource, corresponding to capacity
+                          in node status, i.e. total amount of this resource that
+                          the node has.
+                        pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
+                        x-kubernetes-int-or-string: true
+                      name:
+                        description: Name of the resource.
+                        type: string
+                    required:
+                    - allocatable
+                    - available
+                    - capacity
+                    - name
+                    type: object
+                  type: array
+                type:
+                  type: string
+              required:
+              - name
+              - type
+              type: object
+            type: array
+        required:
+        - zones
+        type: object
+    served: true
+    storage: true
\ No newline at end of file
diff --git a/roles/kubespray-defaults/defaults/main/download.yml b/roles/kubespray-defaults/defaults/main/download.yml
index 676989ea3..cc8d676f7 100644
--- a/roles/kubespray-defaults/defaults/main/download.yml
+++ b/roles/kubespray-defaults/defaults/main/download.yml
@@ -156,6 +156,13 @@ crio_supported_versions:
   v1.27: v1.27.1
 crio_version: "{{ crio_supported_versions[kube_major_version] }}"
 
+# Scheduler plugins doesn't build for K8s 1.28 yet
+scheduler_plugins_supported_versions:
+  v1.28: 0
+  v1.27: v0.27.8
+  v1.26: v0.26.7
+scheduler_plugins_version: "{{ scheduler_plugins_supported_versions[kube_major_version] }}"
+
 yq_version: "v4.35.2"
 
 # Download URLs
@@ -294,6 +301,11 @@ dnsautoscaler_version: v1.8.8
 dnsautoscaler_image_repo: "{{ kube_image_repo }}/cpa/cluster-proportional-autoscaler"
 dnsautoscaler_image_tag: "{{ dnsautoscaler_version }}"
 
+scheduler_plugins_controller_image_repo: "{{ kube_image_repo }}/scheduler-plugins/controller"
+scheduler_plugins_controller_image_tag: "{{ scheduler_plugins_version }}"
+scheduler_plugins_scheduler_image_repo: "{{ kube_image_repo }}/scheduler-plugins/kube-scheduler"
+scheduler_plugins_scheduler_image_tag: "{{ scheduler_plugins_version }}"
+
 registry_version: "2.8.1"
 registry_image_repo: "{{ docker_image_repo }}/library/registry"
 registry_image_tag: "{{ registry_version }}"
diff --git a/roles/kubespray-defaults/defaults/main/main.yml b/roles/kubespray-defaults/defaults/main/main.yml
index f0290061d..f7eb5b373 100644
--- a/roles/kubespray-defaults/defaults/main/main.yml
+++ b/roles/kubespray-defaults/defaults/main/main.yml
@@ -681,3 +681,6 @@ sysctl_file_path: "/etc/sysctl.d/99-sysctl.conf"
 
 system_upgrade: false
 system_upgrade_reboot: on-upgrade  # never, always
+
+# Enables or disables the scheduler plugins.
+scheduler_plugins_enabled: false
diff --git a/tests/files/packet_almalinux8-calico.yml b/tests/files/packet_almalinux8-calico.yml
index 63cf8bf64..648a72d13 100644
--- a/tests/files/packet_almalinux8-calico.yml
+++ b/tests/files/packet_almalinux8-calico.yml
@@ -17,3 +17,6 @@ ntp_timezone: Etc/UTC
 ntp_manage_config: true
 ntp_tinker_panic: true
 ntp_force_sync_immediately: true
+
+# Scheduler plugins
+scheduler_plugins_enabled: true