ceph-ansible/infrastructure-playbooks/rolling_update.yml

---
# This playbook does a rolling update for all the Ceph services
#
# The value of 'serial:' adjusts the number of servers to be updated simultaneously.
# We recommend a value of 1, which means hosts of a group (e.g: monitor) will be
# upgraded one by one. It is really crucial for the update process to happen
# in a serialized fashion. DO NOT CHANGE THIS VALUE.
#
#
# If you run a Ceph community version, set the ceph_stable_release variable to the new release.
#
# If you run Red Hat Ceph Storage and are doing a **major** update (e.g: from 2 to 3), you have two options:
#   - if you use a CDN, you have to change the ceph_rhcs_version to a newer one
#   - if you use an ISO, you have to change the ceph_rhcs_iso_path to the directory containing the new Ceph version
#

# Safety gate: runs on the control node only and aborts the whole playbook
# unless the operator explicitly confirms the upgrade, either interactively
# at the prompt or with `-e ireallymeanit=yes` on the command line.
- name: confirm whether user really meant to upgrade the cluster
  hosts: localhost
  tags: always
  become: false
  gather_facts: false
  # Written as a mapping: the list-of-dictionaries form of `vars` is
  # deprecated by Ansible.
  vars:
    mgr_group_name: mgrs

  vars_prompt:
    - name: ireallymeanit
      prompt: Are you sure you want to upgrade the cluster?
      default: 'no'
      private: false

  tasks:
    # Anything other than the exact string 'yes' stops the run.
    - name: exit playbook, if user did not mean to upgrade cluster
      fail:
        msg: >
          "Exiting rolling_update.yml playbook, cluster was NOT upgraded.
           To upgrade the cluster, either say 'yes' on the prompt or
           use `-e ireallymeanit=yes` on the command line when
           invoking the playbook"
      when: ireallymeanit != 'yes'


# Preparation play for every Ceph host group: gathers facts, runs the
# validation/infra roles, prepares the container engine when needed, checks
# the target release for containerized deployments and finally flags the run
# as a rolling update. Any failure on any host aborts the whole playbook
# (any_errors_fatal).
- name: gather facts and check the init system
  hosts:
    - "{{ mon_group_name|default('mons') }}"
    - "{{ osd_group_name|default('osds') }}"
    - "{{ mds_group_name|default('mdss') }}"
    - "{{ rgw_group_name|default('rgws') }}"
    - "{{ mgr_group_name|default('mgrs') }}"
    - "{{ rbdmirror_group_name|default('rbdmirrors') }}"
    - "{{ nfs_group_name|default('nfss') }}"
    - "{{ client_group_name|default('clients') }}"
    - "{{ iscsi_gw_group_name|default('iscsigws') }}"
    - "{{ grafana_server_group_name|default('grafana-server') }}"
  tags: always
  any_errors_fatal: True
  become: True
  gather_facts: False
  vars:
    delegate_facts_host: True
  tasks:
    - debug:
        msg: "gather facts on all Ceph hosts for following reference"

    - import_role:
        name: ceph-defaults

    # Direct fact gathering: used when delegation is disabled, and always for
    # client hosts (the delegated task below excludes the client group).
    - name: gather facts
      setup:
        gather_subset:
          - 'all'
          - '!facter'
          - '!ohai'
      when: not delegate_facts_host | bool or inventory_hostname in groups.get(client_group_name, [])

    # Runs once and collects facts for every inventory host outside the
    # client group, storing them on the delegated host (delegate_facts).
    # The client group is looked up through client_group_name, like the task
    # above, so a renamed client group is excluded consistently.
    - name: gather and delegate facts
      setup:
        gather_subset:
          - 'all'
          - '!facter'
          - '!ohai'
      delegate_to: "{{ item }}"
      delegate_facts: True
      with_items: "{{ groups['all'] | difference(groups.get(client_group_name, [])) }}"
      run_once: true
      when: delegate_facts_host | bool

    - import_role:
        name: ceph-facts

    - import_role:
        name: ceph-infra

    - import_role:
        name: ceph-validate

    # For hosts that are only members of the client group, the container
    # roles run on the first client alone; every other host runs them when
    # the deployment is containerized or the dashboard is enabled.
    - import_role:
        name: ceph-container-engine
      when:
        - (group_names != [client_group_name]) or (inventory_hostname == groups.get(client_group_name, [''])|first)
        - (containerized_deployment | bool) or (dashboard_enabled | bool)

    - import_role:
        name: ceph-container-common
        tasks_from: registry
      when:
        - (group_names != [client_group_name]) or (inventory_hostname == groups.get(client_group_name, [''])|first)
        - (containerized_deployment | bool) or (dashboard_enabled | bool)
        - ceph_docker_registry_auth | bool

    # Containerized deployments only: query the version once on the first
    # monitor and refuse to continue unless the output contains the word
    # 'nautilus'. ceph_cmd is not defined in this file (presumably set by the
    # ceph-facts role - confirm).
    - name: check ceph release in container image
      when:
        - groups.get(mon_group_name, []) | length > 0
        - containerized_deployment | bool
      delegate_to: "{{ groups[mon_group_name][0] }}"
      run_once: true
      block:
        - name: get the ceph release being deployed
          command: "{{ ceph_cmd }} --cluster {{ cluster }} --version"
          register: ceph_version
          changed_when: false

        - name: check ceph release being deployed
          fail:
            msg: "This version of ceph-ansible is intended for upgrading to Ceph Nautilus only."
          when: "'nautilus' not in ceph_version.stdout.split()"

    - name: set_fact rolling_update
      set_fact:
        rolling_update: true

# Upgrades the monitors one host at a time (serial: 1).
# For each monitor: sanity-check the cluster (first monitor only), make sure
# the bootstrap keys exist, stop and mask the mon service, mask the mgr
# service, run the ceph roles, then wait for the monitor to rejoin the quorum.
# If anything in the main block fails, the rescue section unmasks the
# services again and stops the playbook.
- name: upgrade ceph mon cluster
  tags: mons
  vars:
    health_mon_check_retries: 5
    health_mon_check_delay: 15
    upgrade_ceph_packages: True
  hosts: "{{ mon_group_name|default('mons') }}"
  serial: 1
  become: True
  gather_facts: false
  tasks:
    - name: upgrade ceph mon cluster
      block:
        - name: upgrade ceph mon cluster
          block:
            - name: remove ceph aliases
              file:
                path: /etc/profile.d/ceph-aliases.sh
                state: absent
              when: containerized_deployment | bool

            - name: set mon_host_count
              set_fact:
                mon_host_count: "{{ groups[mon_group_name] | length }}"

            - name: fail when less than three monitors
              fail:
                msg: "Upgrade of cluster with less than three monitors is not supported."
              when: mon_host_count | int < 3

            # Pick a monitor other than the one being upgraded; the health
            # and quorum checks below are delegated to it.
            - name: select a running monitor
              set_fact:
                mon_host: "{{ groups[mon_group_name] | difference([inventory_hostname]) | last }}"

            - import_role:
                name: ceph-defaults
            - import_role:
                name: ceph-facts

            # Pre-flight checks, executed only while processing the first
            # monitor of the group: abort on HEALTH_ERR or when not every
            # monitor is in quorum.
            - block:
                - name: get ceph cluster status
                  command: "{{ container_exec_cmd }} ceph --cluster {{ cluster }} health -f json"
                  register: check_cluster_health
                  delegate_to: "{{ mon_host }}"

                - block:
                    - name: display ceph health detail
                      command: "{{ container_exec_cmd }} ceph --cluster {{ cluster }} health detail"
                      delegate_to: "{{ mon_host }}"

                    - name: fail if cluster isn't in an acceptable state
                      fail:
                        msg: "cluster is not in an acceptable state!"
                  when: (check_cluster_health.stdout | from_json).status == 'HEALTH_ERR'

                - name: get the ceph quorum status
                  command: "{{ container_exec_cmd }} ceph --cluster {{ cluster }} quorum_status -f json"
                  register: check_quorum_status
                  delegate_to: "{{ mon_host }}"

                - name: fail if the cluster quorum isn't in an acceptable state
                  fail:
                    msg: "cluster quorum is not in an acceptable state!"
                  when: (check_quorum_status.stdout | from_json).quorum | length != groups[mon_group_name] | length
              when: inventory_hostname == groups[mon_group_name] | first

        # The two tasks below run from the first monitor only and loop over
        # every monitor via delegate_to.
        # containerized_deployment is cast with `| bool` inside the inline
        # conditionals so that a string value (e.g. passed with -e) is not
        # treated as truthy; this matches the other plays of this file.
        - name: ensure /var/lib/ceph/bootstrap-rbd-mirror is present
          file:
            path: /var/lib/ceph/bootstrap-rbd-mirror
            owner: "{{ ceph_uid if containerized_deployment | bool else 'ceph' }}"
            group: "{{ ceph_uid if containerized_deployment | bool else 'ceph' }}"
            mode: '755'
            state: directory
          delegate_to: "{{ item }}"
          with_items: "{{ groups[mon_group_name] }}"
          when:
            - cephx | bool
            - inventory_hostname == groups[mon_group_name][0]

        - name: create potentially missing keys (rbd and rbd-mirror)
          ceph_key:
            name: "client.{{ item.0 }}"
            dest: "/var/lib/ceph/{{ item.0 }}/"
            caps:
              mon: "allow profile {{ item.0 }}"
            cluster: "{{ cluster }}"
          delegate_to: "{{ item.1 }}"
          with_nested:
            - ['bootstrap-rbd', 'bootstrap-rbd-mirror']
            - "{{ groups[mon_group_name] }}" # so the key goes on all the nodes
          environment:
            CEPH_CONTAINER_IMAGE: "{{ ceph_docker_registry + '/' + ceph_docker_image + ':' + ceph_docker_image_tag if containerized_deployment | bool else None }}"
            CEPH_CONTAINER_BINARY: "{{ container_binary }}"
          when:
            - cephx | bool
            - inventory_hostname == groups[mon_group_name][0]

        # The mon unit may be named after the short hostname or the fqdn, so
        # both are stopped; errors are ignored for the one that does not exist.
        # NOTE: we mask the service so the RPM can't restart it
        # after the package gets upgraded
        - name: stop ceph mon - shortname
          systemd:
            name: ceph-mon@{{ ansible_facts['hostname'] }}
            state: stopped
            enabled: no
            masked: yes
          ignore_errors: True

        # NOTE: we mask the service so the RPM can't restart it
        # after the package gets upgraded
        - name: stop ceph mon - fqdn
          systemd:
            name: ceph-mon@{{ ansible_facts['fqdn'] }}
            state: stopped
            enabled: no
            masked: yes
          ignore_errors: True

        # only mask the service for mgr because it must be upgraded
        # after ALL monitors, even when collocated
        - name: mask the mgr service
          systemd:
            name: ceph-mgr@{{ ansible_facts['hostname'] }}
            masked: yes
          when: inventory_hostname in groups[mgr_group_name] | default([])
                or groups[mgr_group_name] | default([]) | length == 0

        - import_role:
            name: ceph-handler
        - import_role:
            name: ceph-common
          when: not containerized_deployment | bool
        - import_role:
            name: ceph-container-common
          when: containerized_deployment | bool
        - import_role:
            name: ceph-config
        - import_role:
            name: ceph-mon

        - name: start ceph mgr
          systemd:
            name: ceph-mgr@{{ ansible_facts['hostname'] }}
            state: started
            enabled: yes
          ignore_errors: True # if no mgr collocated with mons

        # Computes _current_monitor_address for the first monitor; the quorum
        # checks below read it from that host's hostvars.
        - name: import_role ceph-facts
          import_role:
            name: ceph-facts
            tasks_from: set_monitor_address.yml
          delegate_to: "{{ groups[mon_group_name][0] }}"
          delegate_facts: true

        # Retry until the command succeeds and this host (short hostname or
        # fqdn) is listed in quorum_names.
        - name: non container | waiting for the monitor to join the quorum...
          command: ceph --cluster "{{ cluster }}" -m "{{ hostvars[groups[mon_group_name][0]]['_current_monitor_address'] }}" quorum_status --format json
          register: ceph_health_raw
          until:
            - ceph_health_raw.rc == 0
            - (hostvars[inventory_hostname]['ansible_facts']['hostname'] in (ceph_health_raw.stdout | default('{}') |  from_json)["quorum_names"] or
              hostvars[inventory_hostname]['ansible_facts']['fqdn'] in (ceph_health_raw.stdout | default('{}') | from_json)["quorum_names"])
          retries: "{{ health_mon_check_retries }}"
          delay: "{{ health_mon_check_delay }}"
          when: not containerized_deployment | bool

        - name: container | waiting for the containerized monitor to join the quorum...
          command: >
            {{ container_binary }} exec ceph-mon-{{ ansible_facts['hostname'] }} ceph --cluster "{{ cluster }}" -m "{{ hostvars[groups[mon_group_name][0]]['_current_monitor_address'] }}" quorum_status --format json
          register: ceph_health_raw
          until:
            - ceph_health_raw.rc == 0
            - (hostvars[inventory_hostname]['ansible_facts']['hostname'] in (ceph_health_raw.stdout | default('{}') | from_json)["quorum_names"] or
              hostvars[inventory_hostname]['ansible_facts']['fqdn'] in (ceph_health_raw.stdout | default('{}') | from_json)["quorum_names"])
          retries: "{{ health_mon_check_retries }}"
          delay: "{{ health_mon_check_delay }}"
          when: containerized_deployment | bool

      # Failure path: undo the masking done above so the services can be
      # started again, then fail explicitly.
      rescue:
        - name: unmask the mon service
          systemd:
            name: ceph-mon@{{ ansible_facts['hostname'] }}
            enabled: yes
            masked: no

        - name: unmask the mgr service
          systemd:
            name: ceph-mgr@{{ ansible_facts['hostname'] }}
            masked: no
          when: inventory_hostname in groups[mgr_group_name] | default([])
                or groups[mgr_group_name] | default([]) | length == 0

        - name: stop the playbook execution
          fail:
            msg: "There was an error during monitor upgrade. Please, check the previous task results."

# The monitor play sets mon_host to "some monitor other than the current
# one"; once all monitors are upgraded, point it back at the first monitor
# of the group.
- name: reset mon_host
  hosts: "{{ mon_group_name | default('mons') }}"
  become: true
  gather_facts: false
  tags: always
  tasks:
    - import_role:
        name: ceph-defaults

    - name: reset mon_host fact
      set_fact:
        mon_host: "{{ groups[mon_group_name][0] }}"


# Handles inventories without an explicit mgr group: the managers live on
# the monitor hosts, so they are upgraded here, one monitor host at a time.
# The whole block is skipped when the mgr group has at least one member.
- name: upgrade ceph mgr nodes when implicitly collocated on monitors
  hosts: "{{ mon_group_name | default('mons') }}"
  serial: 1
  become: true
  gather_facts: false
  tags: mgrs
  vars:
    health_mon_check_retries: 5
    health_mon_check_delay: 15
    upgrade_ceph_packages: true
  tasks:
    - name: upgrade mgrs when no mgr group explicitly defined in inventory
      when: groups.get(mgr_group_name, []) | length == 0
      block:
        # Stop and mask the manager before the roles run.
        - name: stop ceph mgr
          systemd:
            name: "ceph-mgr@{{ ansible_facts['hostname'] }}"
            state: stopped
            masked: true

        - import_role:
            name: ceph-defaults

        - import_role:
            name: ceph-facts

        - import_role:
            name: ceph-handler

        # Package-based and container-based deployments use different
        # "common" roles; exactly one of the two runs.
        - import_role:
            name: ceph-common
          when: not containerized_deployment | bool

        - import_role:
            name: ceph-container-common
          when: containerized_deployment | bool

        - import_role:
            name: ceph-config

        - import_role:
            name: ceph-mgr

# Upgrades the dedicated manager hosts (explicit mgr group), one at a time.
- name: upgrade ceph mgr nodes
  hosts: "{{ mgr_group_name | default('mgrs') }}"
  serial: 1
  become: true
  gather_facts: false
  tags: mgrs
  vars:
    upgrade_ceph_packages: true
    ceph_release: "{{ ceph_stable_release }}"
  tasks:
    # failed_when: false on the next task covers two scenarios:
    # no mgr existed before the upgrade, or the cluster
    # being upgraded predates Luminous
    - name: stop ceph mgr
      systemd:
        name: "ceph-mgr@{{ ansible_facts['hostname'] }}"
        state: stopped
        enabled: false
        masked: true
      failed_when: false

    - import_role:
        name: ceph-defaults

    - import_role:
        name: ceph-facts

    - import_role:
        name: ceph-handler

    # Exactly one of the two "common" roles runs, depending on the
    # deployment type.
    - import_role:
        name: ceph-common
      when: not containerized_deployment | bool

    - import_role:
        name: ceph-container-common
      when: containerized_deployment | bool

    - import_role:
        name: ceph-config

    - import_role:
        name: ceph-mgr


# Prepares the cluster for the OSD upgrade. The main block runs once,
# delegated to the first monitor: it records each pool's pg_autoscale_mode
# and the balancer status, turns the balancer off if it was active, turns
# the autoscaler off on pools where it was 'on', and sets the noout and
# nodeep-scrub flags.
# pools_pgautoscaler_mode and balancer_status_update are read again by the
# later "complete osd upgrade" play to restore the previous state.
- name: set osd flags
  hosts: "{{ osd_group_name | default('osds') }}"
  tags: osds
  become: True
  gather_facts: false
  tasks:
    - import_role:
        name: ceph-defaults
    - import_role:
        name: ceph-facts
        tasks_from: container_binary.yml

    - name: set osd flags, disable autoscaler and balancer
      run_once: true
      delegate_to: "{{ groups[mon_group_name][0] }}"
      block:
        # Read-only queries: never reported as changed and still executed
        # in check mode (check_mode: false).
        - name: get pool list
          command: "{{ ceph_cmd }} --cluster {{ cluster }} osd pool ls detail -f json"
          register: pool_list
          changed_when: false
          check_mode: false

        - name: get balancer module status
          command: "{{ ceph_cmd }} --cluster {{ cluster }} balancer status -f json"
          register: balancer_status_update
          run_once: true
          changed_when: false
          check_mode: false

        # Accumulates a list of {'name': <pool>, 'mode': <pg_autoscale_mode>}
        # entries, one per pool returned above.
        - name: set_fact pools_pgautoscaler_mode
          set_fact:
            pools_pgautoscaler_mode: "{{ pools_pgautoscaler_mode | default([]) | union([{'name': item.pool_name, 'mode': item.pg_autoscale_mode}]) }}"
          with_items: "{{ pool_list.stdout | default('{}') | from_json }}"

        - name: disable balancer
          command: "{{ ceph_cmd }} --cluster {{ cluster }} balancer off"
          changed_when: false
          when: (balancer_status_update.stdout | from_json)['active'] | bool

        # Only pools whose recorded mode is 'on' are touched.
        - name: disable pg autoscale on pools
          command: "{{ ceph_cmd }} --cluster {{ cluster }} osd pool set {{ item.name }} pg_autoscale_mode off"
          with_items: "{{ pools_pgautoscaler_mode }}"
          when:
            - pools_pgautoscaler_mode is defined
            - item.mode == 'on'

        # CEPH_CONTAINER_IMAGE is only given a value for containerized
        # deployments; otherwise the expression evaluates to None.
        - name: set osd flags
          ceph_osd_flag:
            name: "{{ item }}"
            cluster: "{{ cluster }}"
          environment:
            CEPH_CONTAINER_IMAGE: "{{ ceph_docker_registry + '/' + ceph_docker_image + ':' + ceph_docker_image_tag if containerized_deployment | bool else None }}"
            CEPH_CONTAINER_BINARY: "{{ container_binary }}"
          with_items:
            - noout
            - nodeep-scrub

# Upgrades the OSD hosts one at a time (serial: 1): discover the OSD ids on
# the host, stop and mask their services, run the ceph roles, migrate
# ceph-disk OSDs with ceph-volume where applicable, then wait until the
# placement groups are clean before moving on to the next host.
# The wait allows up to health_osd_check_retries attempts,
# health_osd_check_delay seconds apart.
- name: upgrade ceph osds cluster
  vars:
    health_osd_check_retries: 600
    health_osd_check_delay: 2
    upgrade_ceph_packages: True
  hosts: "{{ osd_group_name|default('osds') }}"
  tags: osds
  serial: 1
  become: True
  gather_facts: false
  tasks:
    # Lists /var/lib/ceph/osd (if it exists) and strips everything up to the
    # last '-' of each entry, leaving the OSD id.
    # NOTE(review): despite its name this task has no `when` and runs for
    # every deployment type.
    - name: get osd numbers - non container
      shell: if [ -d /var/lib/ceph/osd ] ; then ls /var/lib/ceph/osd | sed 's/.*-//' ; fi  # noqa 306
      register: osd_ids
      changed_when: false

    - name: set num_osds
      set_fact:
        num_osds: "{{ osd_ids.stdout_lines|default([])|length }}"

    # Prefix used to run ceph commands inside the first monitor's container;
    # left undefined for non-containerized deployments, where the commands
    # below fall back to an empty prefix via default('').
    - name: set_fact container_exec_cmd_osd
      set_fact:
        container_exec_cmd_update_osd: "{{ container_binary }} exec ceph-mon-{{ hostvars[groups[mon_group_name][0]]['ansible_facts']['hostname'] }}"
      when: containerized_deployment | bool

    # Stop, disable and mask every OSD unit found on this host.
    - name: stop ceph osd
      systemd:
        name: ceph-osd@{{ item }}
        state: stopped
        enabled: no
        masked: yes
      with_items: "{{ osd_ids.stdout_lines }}"

    - import_role:
        name: ceph-defaults
    - import_role:
        name: ceph-facts
    - import_role:
        name: ceph-handler
    - import_role:
        name: ceph-common
      when: not containerized_deployment | bool
    - import_role:
        name: ceph-container-common
      when: containerized_deployment | bool
    - import_role:
        name: ceph-config
    - import_role:
        name: ceph-osd

    # The two ceph-volume tasks only run for non-containerized deployments
    # whose ceph_release is nautilus or octopus.
    - name: scan ceph-disk osds with ceph-volume if deploying nautilus
      command: "ceph-volume --cluster={{ cluster }} simple scan --force"
      environment:
        CEPH_VOLUME_DEBUG: "{{ ceph_volume_debug }}"
      when:
        - ceph_release in ["nautilus", "octopus"]
        - not containerized_deployment | bool

    - name: activate scanned ceph-disk osds and migrate to ceph-volume if deploying nautilus
      command: "ceph-volume --cluster={{ cluster }} simple activate --all"
      environment:
        CEPH_VOLUME_DEBUG: "{{ ceph_volume_debug }}"
      when:
        - ceph_release in ["nautilus", "octopus"]
        - not containerized_deployment | bool

    # Snapshot of the PG statistics, taken on the first monitor; num_pgs from
    # this result is the target for the wait loop below.
    # NOTE(review): the task name says "non container" but the command also
    # uses the container prefix when it is defined.
    - name: get num_pgs - non container
      command: "{{ container_exec_cmd_update_osd|default('') }} ceph --cluster {{ cluster }} pg stat --format json"
      changed_when: false
      register: ceph_pgs
      delegate_to: "{{ groups[mon_group_name][0] }}"

    # Retries until at least one PG state is reported and the number of PGs
    # whose state starts with 'active+clean' equals num_pgs recorded above.
    # Skipped entirely when the cluster has no PGs.
    - name: waiting for clean pgs...
      command: "{{ container_exec_cmd_update_osd|default('') }} ceph --cluster {{ cluster }} pg stat --format json"
      register: ceph_health_post
      until: >
        (((ceph_health_post.stdout | from_json).pg_summary.num_pg_by_state | length) > 0)
        and
        (((ceph_health_post.stdout | from_json).pg_summary.num_pg_by_state | selectattr('name', 'search', '^active\\+clean') | map(attribute='num') | list | sum) == (ceph_pgs.stdout | from_json).pg_summary.num_pgs)
      delegate_to: "{{ groups[mon_group_name][0] }}"
      retries: "{{ health_osd_check_retries }}"
      delay: "{{ health_osd_check_delay }}"
      when: (ceph_pgs.stdout | from_json).pg_summary.num_pgs != 0


# Counterpart of the "set osd flags" play. The main block runs once,
# delegated to the first monitor: it re-enables the pg autoscaler on pools
# where it was 'on', removes the noout and nodeep-scrub flags, re-enables
# the balancer if it was active, and finally issues
# `osd require-osd-release nautilus` when every OSD reports the same
# version and that version is a "ceph version 14" build.
# Relies on pools_pgautoscaler_mode and balancer_status_update recorded by
# the "set osd flags" play.
- name: complete osd upgrade
  hosts: "{{ osd_group_name | default('osds') }}"
  tags: osds
  become: True
  gather_facts: false
  tasks:
    - import_role:
        name: ceph-defaults
    - import_role:
        name: ceph-facts
        tasks_from: container_binary.yml

    - name: unset osd flags, re-enable pg autoscaler and balancer
      run_once: true
      delegate_to: "{{ groups[mon_group_name][0] }}"
      block:
        # Only pools whose recorded mode was 'on' are switched back.
        - name: re-enable pg autoscale on pools
          command: "{{ ceph_cmd }} --cluster {{ cluster }} osd pool set {{ item.name }} pg_autoscale_mode on"
          with_items: "{{ pools_pgautoscaler_mode }}"
          when:
            - pools_pgautoscaler_mode is defined
            - item.mode == 'on'

        # state: absent removes the flags.
        - name: unset osd flags
          ceph_osd_flag:
            name: "{{ item }}"
            cluster: "{{ cluster }}"
            state: absent
          environment:
            CEPH_CONTAINER_IMAGE: "{{ ceph_docker_registry + '/' + ceph_docker_image + ':' + ceph_docker_image_tag if containerized_deployment | bool else None }}"
            CEPH_CONTAINER_BINARY: "{{ container_binary }}"
          with_items:
            - noout
            - nodeep-scrub

        # Uses the balancer status captured before the upgrade, so the
        # balancer is only turned back on if it was active then.
        - name: re-enable balancer
          command: "{{ ceph_cmd }} --cluster {{ cluster }} balancer on"
          changed_when: false
          when: (balancer_status_update.stdout | from_json)['active'] | bool

        # Prefix to run ceph inside the first monitor's container; undefined
        # (and defaulted to '') for non-containerized deployments.
        - name: set_fact container_exec_cmd_osd
          set_fact:
            container_exec_cmd_update_osd: "{{ container_binary }} exec ceph-mon-{{ hostvars[groups[mon_group_name][0]]['ansible_facts']['hostname'] }}"
          when: containerized_deployment | bool

        - name: get osd versions
          command: "{{ container_exec_cmd_update_osd|default('') }} ceph --cluster {{ cluster }} versions"
          register: ceph_versions
          changed_when: false

        - name: set_fact ceph_versions_osd
          set_fact:
            ceph_versions_osd: "{{ (ceph_versions.stdout|from_json).osd }}"

        # length == 1 means there is a single osds versions entry
        # thus all the osds are running the same version
        - name: complete osds upgrade
          command: "{{ container_exec_cmd_update_osd|default('') }} ceph --cluster {{ cluster }} osd require-osd-release nautilus"
          when:
            - (ceph_versions.get('stdout', '{}')|from_json).get('osd', {}) | length == 1
            - ceph_versions_osd | string is search("ceph version 14")


# Runs on the first monitor only (hosts pattern "<mon group>[0]") and does
# nothing unless the MDS group has at least one host.
# With more than one MDS host: reduce the filesystem to a single active
# rank, identify the host of the remaining active MDS, put every other MDS
# host into a dynamic 'standby_mdss' group, stop and mask those daemons and
# wait until no standby is left.
# In all cases the play ends by creating the dynamic 'active_mdss' group,
# which the following plays target.
- name: upgrade ceph mdss cluster, deactivate all rank > 0
  hosts: "{{ mon_group_name | default('mons') }}[0]"
  tags: mdss
  become: true
  gather_facts: false
  tasks:
    - name: deactivate all mds rank > 0
      when: groups.get(mds_group_name, []) | length > 0
      block:
        - import_role:
            name: ceph-defaults
        - import_role:
            name: ceph-facts

        - name: deactivate all mds rank > 0 if any
          when: groups.get(mds_group_name, []) | length > 1
          block:
            - name: set max_mds 1 on ceph fs
              command: "{{ container_exec_cmd | default('') }} ceph --cluster {{ cluster }} fs set {{ cephfs }} max_mds 1"
              changed_when: false

            # Polls up to 720 times, 5 seconds apart, until mdsmap.in holds
            # exactly one entry and that entry is rank 0.
            - name: wait until only rank 0 is up
              command: "{{ container_exec_cmd | default('') }} ceph --cluster {{ cluster }} fs get {{ cephfs }} -f json"
              changed_when: false
              register: wait_rank_zero
              retries: 720
              delay: 5
              until: (wait_rank_zero.stdout | from_json).mdsmap.in | length == 1 and (wait_rank_zero.stdout | from_json).mdsmap.in[0]  == 0

            - name: get name of remaining active mds
              command: "{{ container_exec_cmd | default('') }} ceph --cluster {{ cluster }} fs dump -f json"
              changed_when: false
              register: _mds_active_name

            # Loops over the 'info' map of the first filesystem; each
            # iteration overwrites the fact, so the last entry wins.
            # Presumably only one entry remains after max_mds 1 - confirm.
            - name: set_fact mds_active_name
              set_fact:
                mds_active_name: "{{ (_mds_active_name.stdout | from_json)['filesystems'][0]['mdsmap']['info'][item.key]['name'] }}"
              with_dict: "{{ (_mds_active_name.stdout | default('{}') | from_json).filesystems[0]['mdsmap']['info'] | default({}) }}"

            # Maps the active MDS name to an inventory host by comparing it
            # with each MDS host's hostname fact; stored as a one-item list.
            - name: set_fact mds_active_host
              set_fact:
                mds_active_host: "{{ [hostvars[item]['inventory_hostname']] }}"
              with_items: "{{ groups[mds_group_name] }}"
              when: hostvars[item]['ansible_facts']['hostname'] == mds_active_name

            # Every MDS host except the active one joins 'standby_mdss';
            # connection host/port are carried over when they are set.
            - name: create standby_mdss group
              add_host:
                name: "{{ item }}"
                groups: standby_mdss
                ansible_host: "{{ hostvars[item]['ansible_host'] | default(omit) }}"
                ansible_port: "{{ hostvars[item]['ansible_port'] | default(omit) }}"
              with_items: "{{ groups[mds_group_name] | difference(mds_active_host) }}"

            - name: stop standby ceph mds
              systemd:
                name: "ceph-mds@{{ hostvars[item]['ansible_facts']['hostname'] }}"
                state: stopped
                enabled: no
              delegate_to: "{{ item }}"
              with_items: "{{ groups['standby_mdss'] }}"
              when: groups['standby_mdss'] | default([]) | length > 0

            # dedicated task for masking systemd unit
            # somehow, having a single task doesn't work in containerized context
            - name: mask systemd units for standby ceph mds
              systemd:
                name: "ceph-mds@{{ hostvars[item]['ansible_facts']['hostname'] }}"
                masked: yes
              delegate_to: "{{ item }}"
              with_items: "{{ groups['standby_mdss'] }}"
              when: groups['standby_mdss'] | default([]) | length > 0

            # Polls up to 300 times, 5 seconds apart, until the 'standbys'
            # list of the fs dump is empty.
            - name: wait until all standbys mds are stopped
              command: "{{ container_exec_cmd | default('') }} ceph --cluster {{ cluster }} fs dump -f json"
              changed_when: false
              register: wait_standbys_down
              retries: 300
              delay: 5
              until: (wait_standbys_down.stdout | from_json).standbys | length == 0

        # Uses the detected active host when mds_active_host was set above;
        # otherwise falls back to the first host of the MDS group.
        - name: create active_mdss group
          add_host:
            name: "{{ mds_active_host[0] if mds_active_host is defined else groups.get(mds_group_name)[0] }}"
            groups: active_mdss
            ansible_host: "{{ hostvars[mds_active_host[0] if mds_active_host is defined else groups.get(mds_group_name)[0]]['ansible_host'] | default(omit) }}"
            ansible_port: "{{ hostvars[mds_active_host[0] if mds_active_host is defined else groups.get(mds_group_name)[0]]['ansible_port'] | default(omit) }}"


# Upgrades the host(s) of the dynamic 'active_mdss' group built by the
# previous play.
- name: upgrade active mds
  hosts: active_mdss
  become: true
  gather_facts: false
  tags: mdss
  vars:
    upgrade_ceph_packages: true
  tasks:
    - import_role:
        name: ceph-defaults

    - import_role:
        name: ceph-facts

    # Package-based deployments only: keep the unit disabled and masked
    # while the roles below run.
    - name: prevent restart from the packaging
      systemd:
        name: "ceph-mds@{{ ansible_facts['hostname'] }}"
        enabled: false
        masked: true
      when: not containerized_deployment | bool

    - import_role:
        name: ceph-handler

    - import_role:
        name: ceph-common
      when: not containerized_deployment | bool

    - import_role:
        name: ceph-container-common
      when: containerized_deployment | bool

    - import_role:
        name: ceph-config

    - import_role:
        name: ceph-mds

    # Package-based deployments: unmask, enable and restart the unit.
    - name: restart ceph mds
      systemd:
        name: "ceph-mds@{{ ansible_facts['hostname'] }}"
        state: restarted
        enabled: true
        masked: false
      when: not containerized_deployment | bool

    # Containerized deployments: the task only stops the container
    # (presumably the service manager brings it back up - confirm).
    - name: restart active mds
      command: "{{ container_binary }} stop ceph-mds-{{ ansible_facts['hostname'] }}"
      changed_when: false
      when: containerized_deployment | bool

# Upgrades the hosts of the dynamic 'standby_mdss' group, then restores the
# configured max_mds value on the filesystem.
- name: upgrade standbys ceph mdss cluster
  hosts: standby_mdss
  become: true
  gather_facts: false
  tags: mdss
  vars:
    upgrade_ceph_packages: true

  tasks:
    - import_role:
        name: ceph-defaults

    - import_role:
        name: ceph-facts

    # Package-based deployments only: keep the unit disabled and masked
    # while the roles below run.
    - name: prevent restarts from the packaging
      systemd:
        name: "ceph-mds@{{ ansible_facts['hostname'] }}"
        enabled: false
        masked: true
      when: not containerized_deployment | bool

    - import_role:
        name: ceph-handler

    - import_role:
        name: ceph-common
      when: not containerized_deployment | bool

    - import_role:
        name: ceph-container-common
      when: containerized_deployment | bool

    - import_role:
        name: ceph-config

    - import_role:
        name: ceph-mds

    # Executed for the last standby host only, delegated to the first monitor.
    - name: set max_mds
      command: "{{ container_exec_cmd | default('') }} ceph --cluster {{ cluster }} fs set {{ cephfs }} max_mds {{ mds_max_mds }}"
      delegate_to: "{{ groups[mon_group_name][0] }}"
      changed_when: false
      when: inventory_hostname == groups['standby_mdss'] | last


# Upgrades the RADOS gateway hosts one at a time.
- name: upgrade ceph rgws cluster
  hosts: "{{ rgw_group_name | default('rgws') }}"
  serial: 1
  become: true
  gather_facts: false
  tags: rgws
  vars:
    upgrade_ceph_packages: true
  tasks:
    - import_role:
        name: ceph-defaults

    - import_role:
        name: ceph-facts

    # Unit name without an instance suffix; errors are ignored because the
    # unit may not exist.
    - name: stop ceph rgw when upgrading from stable-3.2
      systemd:
        name: "ceph-radosgw@rgw.{{ ansible_facts['hostname'] }}"
        state: stopped
        enabled: false
        masked: true
      ignore_errors: true

    # One unit per entry of rgw_instances, named after its instance_name.
    - name: stop ceph rgw
      systemd:
        name: "ceph-radosgw@rgw.{{ ansible_facts['hostname'] }}.{{ item.instance_name }}"
        state: stopped
        enabled: false
        masked: true
      with_items: "{{ rgw_instances }}"

    - import_role:
        name: ceph-handler

    - import_role:
        name: ceph-common
      when: not containerized_deployment | bool

    - import_role:
        name: ceph-container-common
      when: containerized_deployment | bool

    - import_role:
        name: ceph-config

    - import_role:
        name: ceph-rgw


# Upgrades the rbd-mirror hosts one at a time.
- name: upgrade ceph rbd mirror node
  hosts: "{{ rbdmirror_group_name | default('rbdmirrors') }}"
  serial: 1
  become: true
  gather_facts: false
  tags: rbdmirrors
  vars:
    upgrade_ceph_packages: true
  tasks:
    # Stop, disable and mask the daemon before the roles run.
    - name: stop ceph rbd mirror
      systemd:
        name: "ceph-rbd-mirror@rbd-mirror.{{ ansible_facts['hostname'] }}"
        state: stopped
        enabled: false
        masked: true

    - import_role:
        name: ceph-defaults

    - import_role:
        name: ceph-facts

    - import_role:
        name: ceph-handler

    - import_role:
        name: ceph-common
      when: not containerized_deployment | bool

    - import_role:
        name: ceph-container-common
      when: containerized_deployment | bool

    - import_role:
        name: ceph-config

    - import_role:
        name: ceph-rbd-mirror


# Upgrades the NFS gateway hosts one at a time.
- name: upgrade ceph nfs node
  hosts: "{{ nfs_group_name | default('nfss') }}"
  serial: 1
  become: true
  gather_facts: false
  tags: nfss
  vars:
    upgrade_ceph_packages: true
  tasks:
    # failed_when: false keeps this task from failing when the
    # version of ceph being upgraded from did not ship
    # nfs-ganesha
    - name: stop ceph nfs
      systemd:
        name: nfs-ganesha
        state: stopped
        enabled: false
        masked: true
      failed_when: false
      when: not containerized_deployment | bool

    # Containerized variant; the unit suffix is ceph_nfs_service_suffix when
    # defined, otherwise the host's short hostname.
    - name: systemd stop nfs container
      systemd:
        name: "ceph-nfs@{{ ceph_nfs_service_suffix | default(ansible_facts['hostname']) }}"
        state: stopped
        enabled: false
        masked: true
      failed_when: false
      when:
        - ceph_nfs_enable_service | bool
        - containerized_deployment | bool

    - import_role:
        name: ceph-defaults

    - import_role:
        name: ceph-facts

    - import_role:
        name: ceph-handler

    - import_role:
        name: ceph-common
      when: not containerized_deployment | bool

    - import_role:
        name: ceph-container-common
      when: containerized_deployment | bool

    - import_role:
        name: ceph-config

    - import_role:
        name: ceph-nfs


# Rolling upgrade of the iSCSI gateway hosts, processed one host at a time.
# The three gateway units are stopped, disabled and masked before the roles
# are imported.
- name: upgrade ceph iscsi gateway node
  hosts: "{{ iscsi_gw_group_name|default('iscsigws') }}"
  become: true
  gather_facts: false
  serial: 1
  tags:
    - iscsigws
  vars:
    upgrade_ceph_packages: true
  tasks:
    # Errors are ignored (failed_when: false) so that upgrading from a
    # version of ceph that does not have iscsi gws does not fail here.
    - name: stop ceph iscsi services
      failed_when: false
      systemd:
        name: "{{ item }}"
        masked: true
        enabled: false
        state: stopped
      with_items:
        - rbd-target-api
        - rbd-target-gw
        - tcmu-runner

    - import_role:
        name: ceph-defaults

    - import_role:
        name: ceph-facts

    - import_role:
        name: ceph-handler

    # only when the deployment is not containerized
    - when: not containerized_deployment | bool
      import_role:
        name: ceph-common

    # only when the deployment is containerized
    - when: containerized_deployment | bool
      import_role:
        name: ceph-container-common

    - import_role:
        name: ceph-config

    - import_role:
        name: ceph-iscsi-gw


# Upgrade of the client hosts. Unlike the daemon plays, clients are processed
# in batches: client_update_batch hosts at a time (20 when it is not defined).
- name: upgrade ceph client node
  vars:
    upgrade_ceph_packages: True
  hosts: "{{ client_group_name|default('clients') }}"
  tags: clients
  serial: "{{ client_update_batch | default(20) }}"
  become: True
  gather_facts: false
  tasks:
    - import_role:
        name: ceph-defaults
    - import_role:
        name: ceph-facts
    - import_role:
        name: ceph-handler
    - import_role:
        name: ceph-common
      when: not containerized_deployment | bool
    # A host that belongs to the client group only runs ceph-container-common
    # if it is the first host of that group; a host that is also a member of
    # other groups always runs it. The group name is taken from
    # client_group_name (falling back to 'clients') so that the condition
    # matches the group this play actually targets instead of a hard-coded
    # 'clients' group.
    - import_role:
        name: ceph-container-common
      when:
        - (group_names != [client_group_name | default('clients')]) or (inventory_hostname == groups.get(client_group_name | default('clients'), [''])|first)
        - containerized_deployment | bool
    - import_role:
        name: ceph-config
    - import_role:
        name: ceph-client

# Upgrade of the ceph-crash service on the mon, osd, mds, rgw, rbd-mirror and
# mgr hosts: the unit is stopped, then disabled and masked, and finally the
# ceph-crash role is imported in full.
- name: upgrade ceph-crash daemons
  hosts:
    - "{{ mon_group_name | default('mons') }}"
    - "{{ osd_group_name | default('osds') }}"
    - "{{ mds_group_name | default('mdss') }}"
    - "{{ rgw_group_name | default('rgws') }}"
    - "{{ rbdmirror_group_name | default('rbdmirrors') }}"
    - "{{ mgr_group_name | default('mgrs') }}"
  become: true
  gather_facts: false
  tags:
    - post_upgrade
    - crash
  vars:
    # unit targeted by the two systemd tasks below: a per-host instance
    # (ceph-crash@<hostname fact>) for containerized deployments, the plain
    # ceph-crash.service otherwise
    rolling_update_crash_unit: "{{ 'ceph-crash@' + ansible_facts['hostname'] if containerized_deployment | bool else 'ceph-crash.service' }}"
  tasks:
    - import_role:
        name: ceph-defaults

    - import_role:
        name: ceph-facts
        tasks_from: container_binary.yml

    - import_role:
        name: ceph-handler

    # only the systemd.yml task file of the role is imported at this point
    - import_role:
        name: ceph-crash
        tasks_from: systemd.yml

    - name: stop the ceph-crash service
      systemd:
        name: "{{ rolling_update_crash_unit }}"
        daemon_reload: true
        state: stopped

    # it needs to be done in a separate task otherwise the stop just before doesn't work.
    - name: mask and disable the ceph-crash service
      systemd:
        name: "{{ rolling_update_crash_unit }}"
        masked: true
        enabled: false

    - import_role:
        name: ceph-crash

# Post-upgrade steps. Each ceph command below is issued once, delegated to
# the first host of the monitor group, in either a container or a
# non-container variant; ceph-config is then imported with msgr2_migration set.
- name: complete upgrade
  hosts:
    - "{{ mon_group_name | default('mons') }}"
    - "{{ mgr_group_name | default('mgrs') }}"
    - "{{ osd_group_name | default('osds') }}"
    - "{{ mds_group_name | default('mdss') }}"
    - "{{ rgw_group_name | default('rgws') }}"
    - "{{ nfs_group_name | default('nfss') }}"
    - "{{ rbdmirror_group_name | default('rbdmirrors') }}"
    - "{{ client_group_name | default('clients') }}"
    - "{{ iscsi_gw_group_name | default('iscsigws') }}"
  become: true
  gather_facts: false
  tags:
    - post_upgrade
  tasks:
    - import_role:
        name: ceph-defaults

    - import_role:
        name: ceph-facts

    # container variant: the command is executed inside the ceph-mon
    # container named after the first monitor's 'hostname' fact
    - name: container | disallow pre-nautilus OSDs and enable all new nautilus-only functionality
      when:
        - containerized_deployment | bool
        - groups.get(mon_group_name, []) | length > 0
      run_once: true
      delegate_to: "{{ groups[mon_group_name][0] }}"
      command: >-
        {{ container_binary }} exec
        ceph-mon-{{ hostvars[groups[mon_group_name][0]]['ansible_facts']['hostname'] }}
        ceph --cluster {{ cluster }} osd require-osd-release nautilus

    - name: non container | disallow pre-nautilus OSDs and enable all new nautilus-only functionality
      when:
        - not containerized_deployment | bool
        - groups.get(mon_group_name, []) | length > 0
      run_once: true
      delegate_to: "{{ groups[mon_group_name][0] }}"
      command: >-
        ceph --cluster {{ cluster }} osd require-osd-release nautilus

    - name: container | enable msgr2 protocol
      when:
        - containerized_deployment | bool
        - groups.get(mon_group_name, []) | length > 0
      run_once: true
      delegate_to: "{{ groups[mon_group_name][0] }}"
      command: >-
        {{ container_binary }} exec
        ceph-mon-{{ hostvars[groups[mon_group_name][0]]['ansible_facts']['hostname'] }}
        ceph --cluster {{ cluster }} mon enable-msgr2

    - name: non container | enable msgr2 protocol
      when:
        - not containerized_deployment | bool
        - groups.get(mon_group_name, []) | length > 0
      run_once: true
      delegate_to: "{{ groups[mon_group_name][0] }}"
      command: >-
        ceph --cluster {{ cluster }} mon enable-msgr2

    - import_role:
        name: ceph-handler

    # msgr2_migration is passed to the role for this import only
    - name: import_role ceph-config
      vars:
        msgr2_migration: true
      import_role:
        name: ceph-config

# Upgrade of node-exporter on the listed host groups. Apart from importing
# ceph-defaults, everything is skipped unless dashboard_enabled is true.
- name: upgrade node-exporter
  hosts:
    - "{{ mon_group_name|default('mons') }}"
    - "{{ osd_group_name|default('osds') }}"
    - "{{ mds_group_name|default('mdss') }}"
    - "{{ rgw_group_name|default('rgws') }}"
    - "{{ mgr_group_name|default('mgrs') }}"
    - "{{ rbdmirror_group_name|default('rbdmirrors') }}"
    - "{{ nfs_group_name|default('nfss') }}"
    - "{{ iscsi_gw_group_name|default('iscsigws') }}"
    - "{{ grafana_server_group_name|default('grafana-server') }}"
  become: true
  gather_facts: false
  tags:
    - monitoring
  tasks:
    - import_role:
        name: ceph-defaults

    - name: with dashboard configuration
      when: dashboard_enabled | bool
      block:
        # a failure to stop the service is ignored
        - name: stop node-exporter
          failed_when: false
          service:
            name: node_exporter
            state: stopped

        - import_role:
            name: ceph-facts

        - import_role:
            name: ceph-container-engine

        # the registry task file is imported only for non-containerized
        # deployments that have registry authentication enabled
        - when:
            - not containerized_deployment | bool
            - ceph_docker_registry_auth | bool
          import_role:
            name: ceph-container-common
            tasks_from: registry

        - import_role:
            name: ceph-node-exporter

# Upgrade of the monitoring host(s): alertmanager, prometheus and
# grafana-server are stopped, then the prometheus and grafana roles are
# imported. Apart from importing ceph-defaults, everything is skipped unless
# dashboard_enabled is true.
- name: upgrade monitoring node
  # Fall back to the stock group name when grafana_server_group_name is not
  # defined, as every other play of this playbook does for its group variable
  # (the node-exporter play uses the same fallback for this very variable).
  # Without it, an undefined variable makes the 'hosts' field invalid.
  hosts: "{{ grafana_server_group_name|default('grafana-server') }}"
  tags: monitoring
  gather_facts: false
  become: true
  tasks:
    - import_role:
        name: ceph-defaults

    - name: with dashboard configuration
      when: dashboard_enabled | bool
      block:
        # a failure to stop any of these services is ignored
        - name: stop monitoring services
          service:
            name: '{{ item }}'
            state: stopped
          failed_when: false
          with_items:
            - alertmanager
            - prometheus
            - grafana-server

        # ceph-facts is imported twice: once in full, then for its
        # 'grafana' task file only
        - import_role:
            name: ceph-facts
        - import_role:
            name: ceph-facts
            tasks_from: grafana
        - import_role:
            name: ceph-prometheus
        - import_role:
            name: ceph-grafana

# Upgrade of the ceph dashboard. Apart from importing ceph-defaults,
# everything is skipped unless dashboard_enabled is true.
- name: upgrade ceph dashboard
  # Target the hosts of the mgr group when the inventory has one, otherwise
  # those of the mon group. The group name variables fall back to the stock
  # names ('mgrs' / 'mons') like in every other play of this playbook, so the
  # lookup in 'groups' does not depend on mgr_group_name / mon_group_name
  # being defined when this field is templated.
  hosts: "{{ groups[mgr_group_name|default('mgrs')] | default(groups[mon_group_name|default('mons')]) | default(omit) }}"
  tags: monitoring
  gather_facts: false
  become: true
  tasks:
    - import_role:
        name: ceph-defaults

    - name: with dashboard configuration
      when: dashboard_enabled | bool
      block:
        # ceph-facts is imported twice: once in full, then for its
        # 'grafana' task file only
        - import_role:
            name: ceph-facts
        - import_role:
            name: ceph-facts
            tasks_from: grafana
        - import_role:
            name: ceph-dashboard

# Runs on the first host of the monitor group only. The crushmap is saved to
# /etc/ceph/<cluster>-crushmap, the buckets are switched to straw2, and the
# saved map is injected back if the switch fails. The backup file is removed
# at the end when no task failed.
- name: switch any existing crush buckets to straw2
  hosts: "{{ mon_group_name | default('mons') }}[0]"
  become: true
  gather_facts: false
  any_errors_fatal: true
  tags:
    - post_upgrade
  tasks:
    - import_role:
        name: ceph-defaults

    - import_role:
        name: ceph-facts
        tasks_from: container_binary.yml

    # containerized deployments call ceph through a throw-away container that
    # has /etc/ceph mounted; otherwise the plain 'ceph' binary is used
    - name: set_fact ceph_cmd
      set_fact:
        ceph_cmd: >-
          {{ container_binary + ' run --rm --net=host -v /etc/ceph:/etc/ceph:z --entrypoint=ceph '
          + ceph_docker_registry + '/' + ceph_docker_image + ':' + ceph_docker_image_tag
          if containerized_deployment | bool else 'ceph' }}

    - name: backup the crushmap
      changed_when: false
      command: >-
        {{ ceph_cmd }} --cluster {{ cluster }}
        osd getcrushmap -o /etc/ceph/{{ cluster }}-crushmap

    - block:
        - name: switch crush buckets to straw2
          changed_when: false
          command: >-
            {{ ceph_cmd }} --cluster {{ cluster }}
            osd crush set-all-straw-buckets-to-straw2
      rescue:
        # put the saved crushmap back, then fail explicitly
        - name: restore the crushmap
          changed_when: false
          command: >-
            {{ ceph_cmd }} --cluster {{ cluster }}
            osd setcrushmap -i /etc/ceph/{{ cluster }}-crushmap

        - name: inform that the switch to straw2 buckets failed
          fail:
            msg: >
              "An attempt to switch to straw2 bucket was made but failed.
               Check the cluster status."

    - name: remove crushmap backup
      file:
        state: absent
        path: "/etc/ceph/{{ cluster }}-crushmap"

# Final report: the cluster status and the daemon versions are each queried
# once, on the first host of the monitor group. This play is tagged 'always'.
- name: show ceph status
  hosts: "{{ mon_group_name|default('mons') }}"
  become: true
  gather_facts: false
  tags:
    - always
  tasks:
    - import_role:
        name: ceph-defaults

    # prefix used to run ceph inside the first monitor's container; it is
    # left undefined for non-containerized deployments
    - name: set_fact container_exec_cmd_status
      when: containerized_deployment | bool
      set_fact:
        container_exec_cmd_status: >-
          {{ container_binary }} exec
          ceph-mon-{{ hostvars[groups[mon_group_name][0]]['ansible_facts']['hostname'] }}

    - name: show ceph status
      run_once: true
      delegate_to: "{{ groups[mon_group_name][0] }}"
      changed_when: false
      command: "{{ container_exec_cmd_status|default('') }} ceph --cluster {{ cluster }} -s"

    - name: show all daemons version
      run_once: true
      delegate_to: "{{ groups[mon_group_name][0] }}"
      changed_when: false
      command: "{{ container_exec_cmd_status|default('') }} ceph --cluster {{ cluster }} versions"