From f7882bbc022a17c04dda7991c7ee9a2a455e24c4 Mon Sep 17 00:00:00 2001
From: Guillaume Abrioux
Date: Mon, 14 Jun 2021 18:01:41 +0200
Subject: [PATCH] common: disable/enable pg_autoscaler

The PG autoscaler can disrupt the PG checks so the idea here is to
disable it and re-enable it back after the restart is done.

Signed-off-by: Guillaume Abrioux
(cherry picked from commit 13036115e2862fc8ca9c04e1379fd793e0e7036a)
---
Reviewer notes (below the "---" marker, so not part of the commit message):

* The line structure of this patch was restored from a whitespace-collapsed
  copy. Hunk and diffstat counts are consistent with the bodies below, but
  the exact indentation of context lines is reconstructed and should be
  checked against the target tree before applying.
* handler_osds.yml: "set _osd_handler_called before restart" is kept as the
  first task of the new block. The restart task is still conditioned on
  hostvars[item]['_osd_handler_called'], and nothing else in this patch
  sets that fact to a true value. This adds 6 lines, so the hunk header and
  diffstat differ from the cherry-picked commit.
* Loops over pools_pgautoscaler_mode use "| default([])". The fact is only
  created while iterating over existing pools, and the
  "pools_pgautoscaler_mode is defined" condition is evaluated per item, so
  it cannot guard the loop expression itself.
* The post-image blob ids on the "index" lines are those of the original
  commit and were not recomputed.

 infrastructure-playbooks/rolling_update.yml   |  46 +++++++
 ...inerized-to-containerized-ceph-daemons.yml |  49 +++++++-
 roles/ceph-facts/tasks/facts.yml              |   4 +
 roles/ceph-handler/tasks/handler_osds.yml     | 125 ++++++++++++------
 4 files changed, 187 insertions(+), 37 deletions(-)

diff --git a/infrastructure-playbooks/rolling_update.yml b/infrastructure-playbooks/rolling_update.yml
index 50de76829..ce5075723 100644
--- a/infrastructure-playbooks/rolling_update.yml
+++ b/infrastructure-playbooks/rolling_update.yml
@@ -383,6 +383,38 @@
         name: ceph-facts
         tasks_from: container_binary.yml
 
+    - name: get pool list
+      command: "{{ ceph_cmd }} --cluster {{ cluster }} osd dump -f json"
+      register: pool_list
+      run_once: true
+      changed_when: false
+
+    - name: get balancer module status
+      command: "{{ ceph_cmd }} --cluster {{ cluster }} balancer status -f json"
+      register: balancer_status
+      run_once: true
+      changed_when: false
+
+    - name: set_fact pools_pgautoscaler_mode
+      set_fact:
+        pools_pgautoscaler_mode: "{{ pools_pgautoscaler_mode | default([]) | union([{'name': item.pool_name, 'mode': item.pg_autoscale_mode}]) }}"
+      run_once: true
+      with_items: "{{ (pool_list.stdout | from_json)['pools'] }}"
+
+    - name: disable balancer
+      command: "{{ ceph_cmd }} --cluster {{ cluster }} balancer off"
+      run_once: true
+      changed_when: false
+      when: (balancer_status.stdout | from_json)['active'] | bool
+
+    - name: disable pg autoscale on pools
+      command: "{{ ceph_cmd }} --cluster {{ cluster }} osd pool set {{ item.name }} pg_autoscale_mode off"
+      with_items: "{{ pools_pgautoscaler_mode | default([]) }}"
+      run_once: true
+      when:
+        - pools_pgautoscaler_mode is defined
+        - item.mode == 'on'
+
     - name: set osd flags
       ceph_osd_flag:
         name: "{{ item }}"
@@ -490,6 +522,14 @@
         name: ceph-facts
         tasks_from: container_binary.yml
 
+    - name: re-enable pg autoscale on pools
+      command: "{{ ceph_cmd }} --cluster {{ cluster }} osd pool set {{ item.name }} pg_autoscale_mode on"
+      with_items: "{{ pools_pgautoscaler_mode | default([]) }}"
+      run_once: true
+      when:
+        - pools_pgautoscaler_mode is defined
+        - item.mode == 'on'
+
     - name: unset osd flags
       ceph_osd_flag:
         name: "{{ item }}"
@@ -502,6 +542,12 @@
         - noout
         - nodeep-scrub
 
+    - name: re-enable balancer
+      command: "{{ ceph_cmd }} --cluster {{ cluster }} balancer on"
+      run_once: true
+      changed_when: false
+      when: (balancer_status.stdout | from_json)['active'] | bool
+
     - name: set_fact container_exec_cmd_osd
       set_fact:
         container_exec_cmd_update_osd: "{{ container_binary }} exec ceph-mon-{{ ansible_facts['hostname'] }}"
diff --git a/infrastructure-playbooks/switch-from-non-containerized-to-containerized-ceph-daemons.yml b/infrastructure-playbooks/switch-from-non-containerized-to-containerized-ceph-daemons.yml
index 0760fa931..3882526ec 100644
--- a/infrastructure-playbooks/switch-from-non-containerized-to-containerized-ceph-daemons.yml
+++ b/infrastructure-playbooks/switch-from-non-containerized-to-containerized-ceph-daemons.yml
@@ -214,6 +214,38 @@
     - import_role:
         name: ceph-facts
 
+    - name: get pool list
+      command: "{{ ceph_cmd }} --cluster {{ cluster }} osd dump -f json"
+      register: pool_list
+      run_once: true
+      changed_when: false
+
+    - name: get balancer module status
+      command: "{{ ceph_cmd }} --cluster {{ cluster }} balancer status -f json"
+      register: balancer_status
+      run_once: true
+      changed_when: false
+
+    - name: set_fact pools_pgautoscaler_mode
+      set_fact:
+        pools_pgautoscaler_mode: "{{ pools_pgautoscaler_mode | default([]) | union([{'name': item.pool_name, 'mode': item.pg_autoscale_mode}]) }}"
+      run_once: true
+      with_items: "{{ (pool_list.stdout | from_json)['pools'] }}"
+
+    - name: disable balancer
+      command: "{{ ceph_cmd }} --cluster {{ cluster }} balancer off"
+      run_once: true
+      changed_when: false
+      when: (balancer_status.stdout | from_json)['active'] | bool
+
+    - name: disable pg autoscale on pools
+      command: "{{ ceph_cmd }} --cluster {{ cluster }} osd pool set {{ item.name }} pg_autoscale_mode off"
+      with_items: "{{ pools_pgautoscaler_mode | default([]) }}"
+      run_once: true
+      when:
+        - pools_pgautoscaler_mode is defined
+        - item.mode == 'on'
+
     - name: set osd flags
       ceph_osd_flag:
         name: "{{ item }}"
@@ -372,7 +404,15 @@
     - import_role:
         name: ceph-facts
 
-    - name: set osd flags
+    - name: re-enable pg autoscale on pools
+      command: "{{ ceph_cmd }} --cluster {{ cluster }} osd pool set {{ item.name }} pg_autoscale_mode on"
+      with_items: "{{ pools_pgautoscaler_mode | default([]) }}"
+      run_once: true
+      when:
+        - pools_pgautoscaler_mode is defined
+        - item.mode == 'on'
+
+    - name: unset osd flags
       ceph_osd_flag:
         name: "{{ item }}"
         cluster: "{{ cluster }}"
@@ -384,6 +424,13 @@
         - noout
         - nodeep-scrub
 
+    - name: re-enable balancer
+      command: "{{ ceph_cmd }} --cluster {{ cluster }} balancer on"
+      run_once: true
+      changed_when: false
+      when: (balancer_status.stdout | from_json)['active'] | bool
+
+
 - name: switching from non-containerized to containerized ceph mds
 
   hosts: "{{ mds_group_name|default('mdss') }}"
diff --git a/roles/ceph-facts/tasks/facts.yml b/roles/ceph-facts/tasks/facts.yml
index d968c0c99..e50fde239 100644
--- a/roles/ceph-facts/tasks/facts.yml
+++ b/roles/ceph-facts/tasks/facts.yml
@@ -15,6 +15,10 @@
   set_fact:
     is_podman: "{{ podman_binary.stat.exists }}"
 
+- name: set_fact ceph_cmd
+  set_fact:
+    ceph_cmd: "{{ container_binary + ' run --rm --net=host -v /etc/ceph:/etc/ceph:z -v /var/lib/ceph:/var/lib/ceph:z -v /var/run/ceph:/var/run/ceph:z --entrypoint=ceph ' + ceph_docker_registry + '/' + ceph_docker_image + ':' + ceph_docker_image_tag if containerized_deployment | bool else 'ceph' }}"
+
 # In case ansible_python_interpreter is set by the user,
 # ansible will not discover python and discovered_interpreter_python
 # will not be set
diff --git a/roles/ceph-handler/tasks/handler_osds.yml b/roles/ceph-handler/tasks/handler_osds.yml
index 4a959997f..8d0717e3d 100644
--- a/roles/ceph-handler/tasks/handler_osds.yml
+++ b/roles/ceph-handler/tasks/handler_osds.yml
@@ -1,43 +1,96 @@
 ---
-- name: set _osd_handler_called before restart
+- name: set_fact trigger_restart
   set_fact:
-    _osd_handler_called: True
-
-- name: unset noup flag
-  ceph_osd_flag:
-    name: noup
-    cluster: "{{ cluster }}"
-    state: absent
-  environment:
-    CEPH_CONTAINER_IMAGE: "{{ ceph_docker_registry + '/' + ceph_docker_image + ':' + ceph_docker_image_tag if containerized_deployment | bool else None }}"
-    CEPH_CONTAINER_BINARY: "{{ container_binary }}"
-  delegate_to: "{{ groups[mon_group_name][0] }}"
+    trigger_restart: true
+  loop: "{{ groups[osd_group_name] }}"
+  when: hostvars[item]['handler_osd_status'] | default(False) | bool
   run_once: true
 
+- name: osd handler
+  when: trigger_restart | default(False) | bool
+  block:
+    # The restart task below is skipped for hosts whose _osd_handler_called
+    # fact is not true, so the fact has to be raised before restarting.
+    - name: set _osd_handler_called before restart
+      set_fact:
+        _osd_handler_called: True
+
+    - name: get pool list
+      command: "{{ ceph_cmd }} --cluster {{ cluster }} osd dump -f json"
+      register: pool_list
+      delegate_to: "{{ groups.get(mon_group_name, [])[0] }}"
+      run_once: true
+      changed_when: false
+
+    - name: get balancer module status
+      command: "{{ ceph_cmd }} --cluster {{ cluster }} balancer status -f json"
+      register: balancer_status
+      run_once: true
+      delegate_to: "{{ groups[mon_group_name][0] }}"
+      changed_when: false
+
+    - name: set_fact pools_pgautoscaler_mode
+      set_fact:
+        pools_pgautoscaler_mode: "{{ pools_pgautoscaler_mode | default([]) | union([{'name': item.pool_name, 'mode': item.pg_autoscale_mode}]) }}"
+      run_once: true
+      with_items: "{{ (pool_list.stdout | from_json)['pools'] }}"
+
+    - name: disable balancer
+      command: "{{ ceph_cmd }} --cluster {{ cluster }} balancer off"
+      run_once: true
+      delegate_to: "{{ groups[mon_group_name][0] }}"
+      changed_when: false
+      when: (balancer_status.stdout | from_json)['active'] | bool
+
+    - name: disable pg autoscale on pools
+      command: "{{ ceph_cmd }} --cluster {{ cluster }} osd pool set {{ item.name }} pg_autoscale_mode off"
+      with_items: "{{ pools_pgautoscaler_mode | default([]) }}"
+      delegate_to: "{{ groups.get(mon_group_name, [])[0] }}"
+      run_once: true
+      when:
+        - pools_pgautoscaler_mode is defined
+        - item.mode == 'on'
+
 # This does not just restart OSDs but everything else too. Unfortunately
-# at this time the ansible role does not have an OSD id list to use
-# for restarting them specifically.
-# This does not need to run during a rolling update as the playbook will
-# restart all OSDs using the tasks "start ceph osd" or
-# "restart containerized ceph osd"
-- name: copy osd restart script
-  template:
-    src: restart_osd_daemon.sh.j2
-    dest: /tmp/restart_osd_daemon.sh
-    owner: root
-    group: root
-    mode: 0750
+    # at this time the ansible role does not have an OSD id list to use
+    # for restarting them specifically.
+    # This does not need to run during a rolling update as the playbook will
+    # restart all OSDs using the tasks "start ceph osd" or
+    # "restart containerized ceph osd"
+    - name: copy osd restart script
+      template:
+        src: restart_osd_daemon.sh.j2
+        dest: /tmp/restart_osd_daemon.sh
+        owner: root
+        group: root
+        mode: 0750
 
-- name: restart ceph osds daemon(s)
-  command: /usr/bin/env bash /tmp/restart_osd_daemon.sh
-  when:
-    - hostvars[item]['handler_osd_status'] | default(False) | bool
-    - handler_health_osd_check | bool
-    - hostvars[item]['_osd_handler_called'] | default(False) | bool
-  with_items: "{{ groups[osd_group_name] | intersect(ansible_play_batch) }}"
-  delegate_to: "{{ item }}"
-  run_once: True
+    - name: restart ceph osds daemon(s)
+      command: /usr/bin/env bash /tmp/restart_osd_daemon.sh
+      when:
+        - hostvars[item]['handler_osd_status'] | default(False) | bool
+        - handler_health_osd_check | bool
+        - hostvars[item]['_osd_handler_called'] | default(False) | bool
+      with_items: "{{ groups[osd_group_name] | intersect(ansible_play_batch) }}"
+      delegate_to: "{{ item }}"
+      run_once: True
 
-- name: set _osd_handler_called after restart
-  set_fact:
-    _osd_handler_called: False
+    - name: set _osd_handler_called after restart
+      set_fact:
+        _osd_handler_called: False
+
+    - name: re-enable pg autoscale on pools
+      command: "{{ ceph_cmd }} --cluster {{ cluster }} osd pool set {{ item.name }} pg_autoscale_mode on"
+      with_items: "{{ pools_pgautoscaler_mode | default([]) }}"
+      delegate_to: "{{ groups.get(mon_group_name, [])[0] }}"
+      run_once: true
+      when:
+        - pools_pgautoscaler_mode is defined
+        - item.mode == 'on'
+
+    - name: re-enable balancer
+      command: "{{ ceph_cmd }} --cluster {{ cluster }} balancer on"
+      run_once: true
+      delegate_to: "{{ groups[mon_group_name][0] }}"
+      changed_when: false
+      when: (balancer_status.stdout | from_json)['active'] | bool