From f3a9135241c861efbcd4a897bf348d3b9a634c14 Mon Sep 17 00:00:00 2001 From: Guillaume Abrioux Date: Mon, 14 Jun 2021 18:01:41 +0200 Subject: [PATCH] common: disable/enable pg_autoscaler The PG autoscaler can disrupt the PG checks so the idea here is to disable it and re-enable it back after the restart is done. Signed-off-by: Guillaume Abrioux (cherry picked from commit 13036115e2862fc8ca9c04e1379fd793e0e7036a) --- infrastructure-playbooks/cephadm-adopt.yml | 64 ++++++++ infrastructure-playbooks/rolling_update.yml | 58 +++++++ ...inerized-to-containerized-ceph-daemons.yml | 61 +++++++- roles/ceph-facts/tasks/facts.yml | 4 + roles/ceph-handler/tasks/handler_osds.yml | 148 +++++++++++++----- 5 files changed, 297 insertions(+), 38 deletions(-) diff --git a/infrastructure-playbooks/cephadm-adopt.yml b/infrastructure-playbooks/cephadm-adopt.yml index 89ccb692b..b0eb7b227 100644 --- a/infrastructure-playbooks/cephadm-adopt.yml +++ b/infrastructure-playbooks/cephadm-adopt.yml @@ -414,6 +414,48 @@ - import_role: name: ceph-defaults + - name: get pool list + command: "{{ ceph_cmd }} --cluster {{ cluster }} osd dump -f json" + register: pool_list + run_once: true + delegate_to: "{{ groups[mon_group_name][0] }}" + changed_when: false + + - name: get balancer module status + command: "{{ ceph_cmd }} --cluster {{ cluster }} balancer status -f json" + register: balancer_status + run_once: true + delegate_to: "{{ groups[mon_group_name][0] }}" + changed_when: false + + - name: set_fact pools_pgautoscaler_mode + set_fact: + pools_pgautoscaler_mode: "{{ pools_pgautoscaler_mode | default([]) | union([{'name': item.pool_name, 'mode': item.pg_autoscale_mode}]) }}" + run_once: true + with_items: "{{ (pool_list.stdout | from_json)['pools'] }}" + + - name: disable balancer + command: "{{ ceph_cmd }} --cluster {{ cluster }} balancer off" + run_once: true + delegate_to: "{{ groups[mon_group_name][0] }}" + changed_when: false + when: (balancer_status.stdout | from_json)['active'] | bool + + - name: disable pg autoscale on pools + ceph_pool: + name: "{{ item.name }}" + cluster: "{{ cluster }}" + pg_autoscale_mode: false + with_items: "{{ pools_pgautoscaler_mode }}" + delegate_to: "{{ groups[mon_group_name][0] }}" + run_once: true + when: + - pools_pgautoscaler_mode is defined + - item.mode == 'on' + environment: + CEPH_CONTAINER_IMAGE: "{{ ceph_docker_registry + '/' + ceph_docker_image + ':' + ceph_docker_image_tag if containerized_deployment | bool else None }}" + CEPH_CONTAINER_BINARY: "{{ container_binary }}" + - name: set osd flags command: "{{ cephadm_cmd }} shell --fsid {{ fsid }} -- ceph --cluster {{ cluster }} osd set {{ item }}" changed_when: false @@ -515,6 +557,21 @@ - import_role: name: ceph-defaults + - name: re-enable pg autoscale on pools + ceph_pool: + name: "{{ item.name }}" + cluster: "{{ cluster }}" + pg_autoscale_mode: true + with_items: "{{ pools_pgautoscaler_mode }}" + delegate_to: "{{ groups[mon_group_name][0] }}" + run_once: true + when: + - pools_pgautoscaler_mode is defined + - item.mode == 'on' + environment: + CEPH_CONTAINER_IMAGE: "{{ ceph_docker_registry + '/' + ceph_docker_image + ':' + ceph_docker_image_tag if containerized_deployment | bool else None }}" + CEPH_CONTAINER_BINARY: "{{ container_binary }}" + - name: unset osd flags command: "{{ cephadm_cmd }} shell --fsid {{ fsid }} -- ceph --cluster {{ cluster }} osd unset {{ item }}" changed_when: false @@ -524,6 +581,13 @@ environment: CEPHADM_IMAGE: '{{ ceph_docker_registry }}/{{ ceph_docker_image }}:{{ ceph_docker_image_tag }}' + - name: re-enable balancer + command: "{{ ceph_cmd }} --cluster {{ cluster }} balancer on" + run_once: true + delegate_to: "{{ groups[mon_group_name][0] }}" + changed_when: false + when: (balancer_status.stdout | from_json)['active'] | bool + - name: redeploy mds daemons hosts: "{{ mds_group_name|default('mdss') }}" become: true diff --git a/infrastructure-playbooks/rolling_update.yml b/infrastructure-playbooks/rolling_update.yml index 920e7cc8c..f0c0b774a 100644 --- a/infrastructure-playbooks/rolling_update.yml +++ b/infrastructure-playbooks/rolling_update.yml @@ -385,6 +385,44 @@ name: ceph-facts tasks_from: container_binary.yml + - name: get pool list + command: "{{ ceph_cmd }} --cluster {{ cluster }} osd dump -f json" + register: pool_list + run_once: true + changed_when: false + + - name: get balancer module status + command: "{{ ceph_cmd }} --cluster {{ cluster }} balancer status -f json" + register: balancer_status + run_once: true + changed_when: false + + - name: set_fact pools_pgautoscaler_mode + set_fact: + pools_pgautoscaler_mode: "{{ pools_pgautoscaler_mode | default([]) | union([{'name': item.pool_name, 'mode': item.pg_autoscale_mode}]) }}" + run_once: true + with_items: "{{ (pool_list.stdout | from_json)['pools'] }}" + + - name: disable balancer + command: "{{ ceph_cmd }} --cluster {{ cluster }} balancer off" + run_once: true + changed_when: false + when: (balancer_status.stdout | from_json)['active'] | bool + + - name: disable pg autoscale on pools + ceph_pool: + name: "{{ item.name }}" + cluster: "{{ cluster }}" + pg_autoscale_mode: false + with_items: "{{ pools_pgautoscaler_mode }}" + run_once: true + when: + - pools_pgautoscaler_mode is defined + - item.mode == 'on' + environment: + CEPH_CONTAINER_IMAGE: "{{ ceph_docker_registry + '/' + ceph_docker_image + ':' + ceph_docker_image_tag if containerized_deployment | bool else None }}" + CEPH_CONTAINER_BINARY: "{{ container_binary }}" + - name: set osd flags ceph_osd_flag: name: "{{ item }}" @@ -493,6 +531,20 @@ name: ceph-facts tasks_from: container_binary.yml + - name: re-enable pg autoscale on pools + ceph_pool: + name: "{{ item.name }}" + cluster: "{{ cluster }}" + pg_autoscale_mode: true + run_once: true + with_items: "{{ pools_pgautoscaler_mode }}" + when: + - pools_pgautoscaler_mode is defined + - item.mode == 'on' + environment: + CEPH_CONTAINER_IMAGE: "{{ ceph_docker_registry + '/' + ceph_docker_image + ':' + ceph_docker_image_tag if containerized_deployment | bool else None }}" + CEPH_CONTAINER_BINARY: "{{ container_binary }}" + - name: unset osd flags ceph_osd_flag: name: "{{ item }}" @@ -505,6 +557,12 @@ - noout - nodeep-scrub + - name: re-enable balancer + command: "{{ ceph_cmd }} --cluster {{ cluster }} balancer on" + run_once: true + changed_when: false + when: (balancer_status.stdout | from_json)['active'] | bool + - name: upgrade ceph mdss cluster, deactivate all rank > 0 hosts: "{{ mon_group_name | default('mons') }}[0]" become: true diff --git a/infrastructure-playbooks/switch-from-non-containerized-to-containerized-ceph-daemons.yml b/infrastructure-playbooks/switch-from-non-containerized-to-containerized-ceph-daemons.yml index bbd7fe808..d59214736 100644 --- a/infrastructure-playbooks/switch-from-non-containerized-to-containerized-ceph-daemons.yml +++ b/infrastructure-playbooks/switch-from-non-containerized-to-containerized-ceph-daemons.yml @@ -215,6 +215,44 @@ name: ceph-facts tasks_from: container_binary.yml + - name: get pool list + command: "{{ ceph_cmd }} --cluster {{ cluster }} osd dump -f json" + register: pool_list + run_once: true + changed_when: false + + - name: get balancer module status + command: "{{ ceph_cmd }} --cluster {{ cluster }} balancer status -f json" + register: balancer_status + run_once: true + changed_when: false + + - name: set_fact pools_pgautoscaler_mode + set_fact: + pools_pgautoscaler_mode: "{{ pools_pgautoscaler_mode | default([]) | union([{'name': item.pool_name, 'mode': item.pg_autoscale_mode}]) }}" + run_once: true + with_items: "{{ (pool_list.stdout | from_json)['pools'] }}" + + - name: disable balancer + command: "{{ ceph_cmd }} --cluster {{ cluster }} balancer off" + run_once: true + changed_when: false + when: (balancer_status.stdout | from_json)['active'] | bool + + - name: disable pg autoscale on pools + ceph_pool: + name: "{{ item.name }}" + cluster: "{{ cluster }}" + pg_autoscale_mode: false + with_items: "{{ pools_pgautoscaler_mode }}" + run_once: true + when: + - pools_pgautoscaler_mode is defined + - item.mode == 'on' + environment: + CEPH_CONTAINER_IMAGE: "{{ ceph_docker_registry + '/' + ceph_docker_image + ':' + ceph_docker_image_tag if containerized_deployment | bool else None }}" + CEPH_CONTAINER_BINARY: "{{ container_binary }}" + - name: set osd flags ceph_osd_flag: name: "{{ item }}" @@ -366,7 +404,21 @@ name: ceph-facts tasks_from: container_binary.yml - - name: set osd flags + - name: re-enable pg autoscale on pools + ceph_pool: + name: "{{ item.name }}" + cluster: "{{ cluster }}" + pg_autoscale_mode: true + with_items: "{{ pools_pgautoscaler_mode }}" + run_once: true + when: + - pools_pgautoscaler_mode is defined + - item.mode == 'on' + environment: + CEPH_CONTAINER_IMAGE: "{{ ceph_docker_registry + '/' + ceph_docker_image + ':' + ceph_docker_image_tag if containerized_deployment | bool else None }}" + CEPH_CONTAINER_BINARY: "{{ container_binary }}" + + - name: unset osd flags ceph_osd_flag: name: "{{ item }}" cluster: "{{ cluster }}" @@ -378,6 +430,13 @@ - noout - nodeep-scrub + - name: re-enable balancer + command: "{{ ceph_cmd }} --cluster {{ cluster }} balancer on" + run_once: true + changed_when: false + when: (balancer_status.stdout | from_json)['active'] | bool + + - name: switching from non-containerized to containerized ceph mds hosts: "{{ mds_group_name|default('mdss') }}" diff --git a/roles/ceph-facts/tasks/facts.yml b/roles/ceph-facts/tasks/facts.yml index 071e029a5..7cd78e80d 100644 --- a/roles/ceph-facts/tasks/facts.yml +++ b/roles/ceph-facts/tasks/facts.yml @@ -11,6 +11,10 @@ - name: import_tasks container_binary.yml import_tasks: container_binary.yml +- name: set_fact ceph_cmd + set_fact: + ceph_cmd: "{{ container_binary + ' run --rm --net=host -v /etc/ceph:/etc/ceph:z -v /var/lib/ceph:/var/lib/ceph:z -v /var/run/ceph:/var/run/ceph:z --entrypoint=ceph ' + ceph_docker_registry + '/' + ceph_docker_image + ':' + ceph_docker_image_tag if containerized_deployment | bool else 'ceph' }}" + # In case ansible_python_interpreter is set by the user, # ansible will not discover python and discovered_interpreter_python # will not be set diff --git a/roles/ceph-handler/tasks/handler_osds.yml b/roles/ceph-handler/tasks/handler_osds.yml index 4a959997f..48c27c4ea 100644 --- a/roles/ceph-handler/tasks/handler_osds.yml +++ b/roles/ceph-handler/tasks/handler_osds.yml @@ -1,43 +1,117 @@ --- -- name: set _osd_handler_called before restart +- name: set_fact trigger_restart set_fact: - _osd_handler_called: True - -- name: unset noup flag - ceph_osd_flag: - name: noup - cluster: "{{ cluster }}" - state: absent - environment: - CEPH_CONTAINER_IMAGE: "{{ ceph_docker_registry + '/' + ceph_docker_image + ':' + ceph_docker_image_tag if containerized_deployment | bool else None }}" - CEPH_CONTAINER_BINARY: "{{ container_binary }}" - delegate_to: "{{ groups[mon_group_name][0] }}" + trigger_restart: true + loop: "{{ groups[osd_group_name] }}" + when: hostvars[item]['handler_osd_status'] | default(False) | bool run_once: true -# This does not just restart OSDs but everything else too. Unfortunately -# at this time the ansible role does not have an OSD id list to use -# for restarting them specifically. -# This does not need to run during a rolling update as the playbook will -# restart all OSDs using the tasks "start ceph osd" or -# "restart containerized ceph osd" -- name: copy osd restart script - template: - src: restart_osd_daemon.sh.j2 - dest: /tmp/restart_osd_daemon.sh - owner: root - group: root - mode: 0750 +- name: osd handler + when: trigger_restart | default(False) | bool + block: + - name: set _osd_handler_called before restart + set_fact: + _osd_handler_called: True -- name: restart ceph osds daemon(s) - command: /usr/bin/env bash /tmp/restart_osd_daemon.sh - when: - - hostvars[item]['handler_osd_status'] | default(False) | bool - - handler_health_osd_check | bool - - hostvars[item]['_osd_handler_called'] | default(False) | bool - with_items: "{{ groups[osd_group_name] | intersect(ansible_play_batch) }}" - delegate_to: "{{ item }}" - run_once: True + - name: unset noup flag + ceph_osd_flag: + name: noup + cluster: "{{ cluster }}" + state: absent + environment: + CEPH_CONTAINER_IMAGE: "{{ ceph_docker_registry + '/' + ceph_docker_image + ':' + ceph_docker_image_tag if containerized_deployment | bool else None }}" + CEPH_CONTAINER_BINARY: "{{ container_binary }}" + delegate_to: "{{ groups[mon_group_name][0] }}" + run_once: true -- name: set _osd_handler_called after restart - set_fact: - _osd_handler_called: False + # This does not just restart OSDs but everything else too. Unfortunately + # at this time the ansible role does not have an OSD id list to use + # for restarting them specifically. + # This does not need to run during a rolling update as the playbook will + # restart all OSDs using the tasks "start ceph osd" or + # "restart containerized ceph osd" + - name: copy osd restart script + template: + src: restart_osd_daemon.sh.j2 + dest: /tmp/restart_osd_daemon.sh + owner: root + group: root + mode: 0750 + + - name: get pool list + command: "{{ ceph_cmd }} --cluster {{ cluster }} osd dump -f json" + register: pool_list + delegate_to: "{{ groups.get(mon_group_name, [])[0] }}" + run_once: true + changed_when: false + + - name: get balancer module status + command: "{{ ceph_cmd }} --cluster {{ cluster }} balancer status -f json" + register: balancer_status + run_once: true + delegate_to: "{{ groups[mon_group_name][0] }}" + changed_when: false + + - name: set_fact pools_pgautoscaler_mode + set_fact: + pools_pgautoscaler_mode: "{{ pools_pgautoscaler_mode | default([]) | union([{'name': item.pool_name, 'mode': item.pg_autoscale_mode}]) }}" + run_once: true + with_items: "{{ (pool_list.stdout | from_json)['pools'] }}" + + - name: disable balancer + command: "{{ ceph_cmd }} --cluster {{ cluster }} balancer off" + run_once: true + delegate_to: "{{ groups[mon_group_name][0] }}" + changed_when: false + when: (balancer_status.stdout | from_json)['active'] | bool + + - name: disable pg autoscale on pools + ceph_pool: + name: "{{ item.name }}" + cluster: "{{ cluster }}" + pg_autoscale_mode: false + with_items: "{{ pools_pgautoscaler_mode }}" + delegate_to: "{{ groups.get(mon_group_name, [])[0] }}" + run_once: true + when: + - pools_pgautoscaler_mode is defined + - item.mode == 'on' + environment: + CEPH_CONTAINER_IMAGE: "{{ ceph_docker_registry + '/' + ceph_docker_image + ':' + ceph_docker_image_tag if containerized_deployment | bool else None }}" + CEPH_CONTAINER_BINARY: "{{ container_binary }}" + + - name: restart ceph osds daemon(s) + command: /usr/bin/env bash /tmp/restart_osd_daemon.sh + when: + - hostvars[item]['handler_osd_status'] | default(False) | bool + - handler_health_osd_check | bool + - hostvars[item]['_osd_handler_called'] | default(False) | bool + with_items: "{{ groups[osd_group_name] | intersect(ansible_play_batch) }}" + delegate_to: "{{ item }}" + run_once: True + + - name: set _osd_handler_called after restart + set_fact: + _osd_handler_called: False + + - name: re-enable pg autoscale on pools + ceph_pool: + name: "{{ item.name }}" + cluster: "{{ cluster }}" + pg_autoscale_mode: true + with_items: "{{ pools_pgautoscaler_mode }}" + run_once: true + delegate_to: "{{ groups.get(mon_group_name, [])[0] }}" + when: + - pools_pgautoscaler_mode is defined + - item.mode == 'on' + environment: + CEPH_CONTAINER_IMAGE: "{{ ceph_docker_registry + '/' + ceph_docker_image + ':' + ceph_docker_image_tag if containerized_deployment | bool else None }}" + CEPH_CONTAINER_BINARY: "{{ container_binary }}" + + - name: re-enable balancer + command: "{{ ceph_cmd }} --cluster {{ cluster }} balancer on" + run_once: true + delegate_to: "{{ groups[mon_group_name][0] }}" + changed_when: false + when: (balancer_status.stdout | from_json)['active'] | bool