ceph-ansible/infrastructure-playbooks/rolling_update.yml

---
# This playbook does a rolling update for all the Ceph services
#
# The value of 'serial:' sets the number of servers that are updated simultaneously.
# We recommend a value of 1, which means the hosts of a group (e.g. monitors) will be
# upgraded one by one. It is crucial for the update process to happen
# in a serialized fashion. DO NOT CHANGE THIS VALUE.
#
#
# If you run a Ceph community version, you have to set the variable ceph_stable_release to the new release
#
# If you run Red Hat Ceph Storage and are doing a **major** update (e.g. from 2 to 3), you have two options:
#   - if you use a CDN, you have to change the ceph_rhcs_version to a newer one
#   - if you use an ISO, you have to change the ceph_rhcs_iso_path to the directory containing the new Ceph version
#

# Safety gate executed on the controller only: ask for an explicit
# confirmation and make sure the inventory declares at least one mgr host
# before any Ceph node is touched.
- name: confirm whether user really meant to upgrade the cluster
  hosts: localhost
  become: false
  # 'vars' is written as a plain mapping; the list-of-dictionaries form is
  # deprecated by Ansible.
  vars:
    mgr_group_name: mgrs

  vars_prompt:
    - name: ireallymeanit
      prompt: Are you sure you want to upgrade the cluster?
      default: 'no'
      private: no

  tasks:
    # Anything other than the exact answer 'yes' aborts the playbook.
    - name: exit playbook, if user did not mean to upgrade cluster
      fail:
        msg: >
          "Exiting rolling_update.yml playbook, cluster was NOT upgraded.
           To upgrade the cluster, either say 'yes' on the prompt or
           use `-e ireallymeanit=yes` on the command line when
           invoking the playbook"
      when: ireallymeanit != 'yes'

    # groups.get() is used so that a missing mgr group is treated like an
    # empty one instead of raising an undefined-variable error.
    - name: fail if no mgr host is present in the inventory
      fail:
        msg: "Please add a mgr host to your inventory."
      when:
        - groups.get(mgr_group_name, []) | length == 0


# Facts are collected up front (implicit gathering is disabled) so that the
# following plays can read hostvars of hosts outside their own batch.
- name: gather facts and check the init system
  hosts:
    - "{{ mon_group_name|default('mons') }}"
    - "{{ osd_group_name|default('osds') }}"
    - "{{ mds_group_name|default('mdss') }}"
    - "{{ rgw_group_name|default('rgws') }}"
    - "{{ mgr_group_name|default('mgrs') }}"
    - "{{ rbd_mirror_group_name|default('rbdmirrors') }}"
    - "{{ nfs_group_name|default('nfss') }}"
    - "{{ client_group_name|default('clients') }}"
  gather_facts: false
  become: true
  vars:
    # true: one host gathers facts for every inventory host via delegation
    # false: every host of the play gathers its own facts
    delegate_facts_host: true

  tasks:
    - debug:
        msg: "gather facts on all Ceph hosts for following reference"

    - name: gather facts
      setup:
      when: not delegate_facts_host | bool

    # Runs once and loops over the whole inventory; delegate_facts stores
    # the result under each delegated host rather than the current one.
    - name: gather and delegate facts
      setup:
      with_items: "{{ groups['all'] }}"
      delegate_to: "{{ item }}"
      delegate_facts: true
      run_once: true
      when: delegate_facts_host | bool

    # Marks every host of this play as taking part in a rolling update.
    - set_fact: rolling_update=true

# Rolling upgrade of the monitors, one host per batch (serial: 1): stop the
# daemon, apply the roles with upgrade_ceph_packages set, start the daemon
# again and wait until the upgraded monitor is back in the quorum before the
# next monitor is handled.
- name: upgrade ceph mon cluster

  vars:
    # retries/delay of the "join the quorum" polling tasks below
    health_mon_check_retries: 5
    health_mon_check_delay: 15
    upgrade_ceph_packages: true

  hosts:
    - "{{ mon_group_name|default('mons') }}"

  serial: 1
  become: true

  pre_tasks:
    - name: set mon_host_count
      set_fact:
        mon_host_count: "{{ groups[mon_group_name] | length }}"

    - name: fail when less than three monitors
      fail:
        msg: "Upgrade of cluster with less than three monitors is not supported."
      when:
        - mon_host_count | int < 3

    # Both the short-hostname and the FQDN unit names are stopped; errors are
    # ignored, presumably because only one of the two units exists on a host.
    - name: stop ceph mon - shortname
      systemd:
        name: "ceph-mon@{{ ansible_hostname }}"
        state: stopped
        enabled: true
      ignore_errors: true
      when:
        - not containerized_deployment

    - name: stop ceph mon - fqdn
      systemd:
        name: "ceph-mon@{{ ansible_fqdn }}"
        state: stopped
        enabled: true
      ignore_errors: true
      when:
        - not containerized_deployment

  roles:
    - ceph-defaults
    - ceph-facts
    - ceph-handler
    - role: ceph-common
      when: not containerized_deployment
    - role: ceph-docker-common
      when: containerized_deployment
    - ceph-config
    - ceph-mon

  post_tasks:
    - name: start ceph mon
      systemd:
        name: "ceph-mon@{{ monitor_name }}"
        state: started
        enabled: true
      when:
        - not containerized_deployment

    - name: restart containerized ceph mon
      systemd:
        name: "ceph-mon@{{ monitor_name }}"
        state: restarted
        enabled: true
        daemon_reload: true
      when:
        - containerized_deployment

    - name: set mon_host_count
      set_fact:
        mon_host_count: "{{ groups[mon_group_name] | length }}"

    # Pick a monitor other than the one being upgraded; the cluster commands
    # below are delegated to it.
    - name: select a running monitor
      set_fact:
        mon_host: "{{ groups[mon_group_name] | difference([inventory_hostname]) | last }}"

    # The status command runs on mon_host, but the quorum membership that
    # matters is the one of the monitor that was just restarted, i.e. the
    # current play host (inventory_hostname), not the delegate.
    # Empty output or a missing "quorum_names" key counts as "not in quorum
    # yet" so the task retries instead of failing on a template error.
    - name: non container | waiting for the monitor to join the quorum...
      command: ceph --cluster "{{ cluster }}" -s --format json
      register: ceph_health_raw
      until: >
        hostvars[inventory_hostname]['ansible_hostname'] in (ceph_health_raw.stdout | default('{}', true) | from_json).get('quorum_names', []) or
        hostvars[inventory_hostname]['ansible_fqdn'] in (ceph_health_raw.stdout | default('{}', true) | from_json).get('quorum_names', [])
      retries: "{{ health_mon_check_retries }}"
      delay: "{{ health_mon_check_delay }}"
      delegate_to: "{{ mon_host }}"
      when:
        - not containerized_deployment

    # Same check as above, executed inside the mon container of mon_host.
    - name: container | waiting for the containerized monitor to join the quorum...
      command: docker exec ceph-mon-{{ hostvars[mon_host]['ansible_hostname'] }} ceph --cluster "{{ cluster }}" -s --format json
      register: ceph_health_raw
      until: >
        hostvars[inventory_hostname]['ansible_hostname'] in (ceph_health_raw.stdout | default('{}', true) | from_json).get('quorum_names', []) or
        hostvars[inventory_hostname]['ansible_fqdn'] in (ceph_health_raw.stdout | default('{}', true) | from_json).get('quorum_names', [])
      retries: "{{ health_mon_check_retries }}"
      delay: "{{ health_mon_check_delay }}"
      delegate_to: "{{ mon_host }}"
      when:
        - containerized_deployment

    # NOTE(review): the loop iterates over every monitor, yet each iteration
    # is delegated to mon_host and item.1 is never used - confirm whether the
    # keys are really meant to be written on all monitor nodes.
    - name: create potentially missing keys (rbd and rbd-mirror)
      ceph_key:
        name: "client.{{ item.0 }}"
        state: present
        dest: "/var/lib/ceph/{{ item.0 }}/"
        caps:
          mon: "allow profile {{ item.0 }}"
        cluster: "{{ cluster }}"
        containerized: "{{ 'docker exec ceph-mon-' + hostvars[mon_host]['ansible_hostname'] if containerized_deployment else None }}"
      when:
        - cephx
      delegate_to: "{{ mon_host }}"
      ignore_errors: true # this might fail for upgrade from J to L on rbd-mirror and also on partially updated clusters
      with_nested:
        - ['bootstrap-rbd', 'bootstrap-rbd-mirror']
        - "{{ groups[mon_group_name] }}" # so the key goes on all the nodes

    # noout and norebalance are set here; they are removed again by the
    # 'unset osd flags' play.
    - name: set osd flags
      command: ceph --cluster {{ cluster }} osd set {{ item }}
      with_items:
        - noout
        - norebalance
      delegate_to: "{{ mon_host }}"
      when: not containerized_deployment

    - name: set containerized osd flags
      command: |
        docker exec ceph-mon-{{ hostvars[mon_host]['ansible_hostname'] }} ceph --cluster {{ cluster }} osd set {{ item }}
      with_items:
        - noout
        - norebalance
      delegate_to: "{{ mon_host }}"
      when: containerized_deployment


# Upgrade the mgr daemons one host at a time. Before the roles run, the
# cluster fsid is read from the first monitor and a mgr keyring is ensured
# and fetched for every host of the mgr group.
- name: upgrade ceph mgr node
  hosts: "{{ mgr_group_name|default('mgrs') }}"
  become: true
  serial: 1

  vars:
    upgrade_ceph_packages: true
    ceph_release: "{{ ceph_stable_release }}"

  pre_tasks:
    # The fsid is queried on the first monitor, either directly or through
    # its mon container.
    - name: non container - get current fsid
      command: "ceph --cluster {{ cluster }} fsid"
      delegate_to: "{{ groups[mon_group_name][0] }}"
      register: cluster_uuid_non_container
      when: not containerized_deployment

    - name: container - get current fsid
      command: "docker exec ceph-mon-{{ hostvars[groups[mon_group_name][0]]['ansible_hostname'] }} ceph --cluster {{ cluster }} fsid"
      delegate_to: "{{ groups[mon_group_name][0] }}"
      register: cluster_uuid_container
      when: containerized_deployment

    - name: set_fact ceph_cluster_fsid
      set_fact:
        ceph_cluster_fsid: "{{ cluster_uuid_container.stdout if containerized_deployment else cluster_uuid_non_container.stdout }}"

    # One keyring per mgr host, created from the first monitor.
    - name: create ceph mgr keyring(s) when mon is not containerized
      ceph_key:
        name: "mgr.{{ hostvars[item]['ansible_hostname'] }}"
        state: present
        caps:
          mon: allow profile mgr
          osd: allow *
          mds: allow *
        cluster: "{{ cluster }}"
      with_items: "{{ groups.get(mgr_group_name, []) }}"
      delegate_to: "{{ groups[mon_group_name][0] }}"
      when:
        - not containerized_deployment
        - cephx
        - groups.get(mgr_group_name, []) | length > 0

    - name: create ceph mgr keyring(s) when mon is containerized
      ceph_key:
        name: "mgr.{{ hostvars[item]['ansible_hostname'] }}"
        state: present
        caps:
          mon: allow profile mgr
          osd: allow *
          mds: allow *
        cluster: "{{ cluster }}"
        containerized: "docker exec ceph-mon-{{ hostvars[groups[mon_group_name][0]]['ansible_hostname'] }}"
      with_items: "{{ groups.get(mgr_group_name, []) }}"
      delegate_to: "{{ groups[mon_group_name][0] }}"
      when:
        - containerized_deployment
        - cephx
        - groups.get(mgr_group_name, []) | length > 0

    # Keyrings are pulled from the first monitor into the fetch directory,
    # under a sub-directory named after the cluster fsid; a missing keyring
    # is not an error.
    - name: fetch ceph mgr key(s)
      fetch:
        src: "{{ ceph_conf_key_directory }}/{{ cluster }}.mgr.{{ hostvars[item]['ansible_hostname'] }}.keyring"
        dest: "{{ fetch_directory }}/{{ ceph_cluster_fsid }}/{{ ceph_conf_key_directory }}/"
        flat: true
        fail_on_missing: false
      with_items: "{{ groups.get(mgr_group_name, []) }}"
      delegate_to: "{{ groups[mon_group_name][0] }}"

    # The following task has a failed_when: false
    # to handle the scenario where no mgr existed before the upgrade
    # or if we run a Ceph cluster before Luminous
    - name: stop ceph mgr
      systemd:
        name: "ceph-mgr@{{ ansible_hostname }}"
        state: stopped
        enabled: true
      failed_when: false
      when: not containerized_deployment

  roles:
    - ceph-defaults
    - ceph-facts
    - ceph-handler
    - role: ceph-common
      when: not containerized_deployment
    - role: ceph-docker-common
      when: containerized_deployment
    - ceph-config
    - ceph-mgr

  post_tasks:
    - name: start ceph mgr
      systemd:
        name: "ceph-mgr@{{ ansible_hostname }}"
        state: started
        enabled: true
      when: not containerized_deployment

    - name: restart containerized ceph mgr
      systemd:
        name: "ceph-mgr@{{ ansible_hostname }}"
        state: restarted
        enabled: true
        daemon_reload: true
      when: containerized_deployment


# Upgrade the OSD hosts one at a time. After the OSDs of a host have been
# restarted, the play polls the cluster status until every placement group
# is reported active+clean before the next host is handled.
- name: upgrade ceph osds cluster
  hosts: "{{ osd_group_name|default('osds') }}"
  become: true
  serial: 1

  vars:
    # retries/delay of the "waiting for clean pgs" polling task
    health_osd_check_retries: 40
    health_osd_check_delay: 30
    upgrade_ceph_packages: true

  pre_tasks:
    # OSD ids are derived from the directory names under /var/lib/ceph/osd
    # (everything up to the last '-' is stripped).
    - name: get osd numbers - non container
      shell: "if [ -d /var/lib/ceph/osd ] ; then ls /var/lib/ceph/osd | sed 's/.*-//' ; fi"
      changed_when: false
      register: osd_ids
      when: not containerized_deployment

    # For containers, the names of the loaded and active ceph-osd@ units
    # are collected instead.
    - name: get osd unit names - container
      shell: systemctl list-units | grep -E "loaded * active" | grep -oE "ceph-osd@([0-9]{1,}|[a-z]+).service"
      changed_when: false
      register: osd_names
      when: containerized_deployment

    - name: stop ceph osd
      systemd:
        name: "ceph-osd@{{ item }}"
        state: stopped
        enabled: true
      with_items: "{{ osd_ids.stdout_lines }}"
      when: not containerized_deployment

  roles:
    - ceph-defaults
    - ceph-facts
    - ceph-handler
    - role: ceph-common
      when: not containerized_deployment
    - role: ceph-docker-common
      when: containerized_deployment
    - ceph-config
    - ceph-osd

  post_tasks:
    # The ids are listed again after the roles have run.
    - name: get osd numbers
      shell: "if [ -d /var/lib/ceph/osd ] ; then ls /var/lib/ceph/osd | sed 's/.*-//' ; fi"
      changed_when: false
      register: osd_ids
      when: not containerized_deployment

    - name: start ceph osd
      systemd:
        name: "ceph-osd@{{ item }}"
        state: started
        enabled: true
      with_items: "{{ osd_ids.stdout_lines }}"
      when: not containerized_deployment

    - name: restart containerized ceph osd
      systemd:
        name: "{{ item }}"
        state: restarted
        enabled: true
        daemon_reload: true
      with_items: "{{ osd_names.stdout_lines }}"
      when: containerized_deployment

    # Command prefix for running ceph inside the mon container of the first
    # monitor; left undefined (and defaulted to '') when not containerized.
    - name: set_fact docker_exec_cmd_osd
      set_fact:
        docker_exec_cmd_update_osd: "docker exec ceph-mon-{{ hostvars[groups[mon_group_name][0]]['ansible_hostname'] }}"
      when: containerized_deployment

    - name: get osd versions
      command: "{{ docker_exec_cmd_update_osd|default('') }} ceph --cluster {{ cluster }} versions"
      delegate_to: "{{ groups[mon_group_name][0] }}"
      register: ceph_versions

    - name: set_fact ceph_versions_osd
      set_fact:
        ceph_versions_osd: "{{ (ceph_versions.stdout|from_json).osd }}"
      delegate_to: "{{ groups[mon_group_name][0] }}"

    # length == 1 means there is a single osds versions entry
    # thus all the osds are running the same version
    - name: osd set sortbitwise
      command: "{{ docker_exec_cmd_update_osd|default('') }} ceph --cluster {{ cluster }} osd set sortbitwise"
      delegate_to: "{{ groups[mon_group_name][0] }}"
      when:
        - (ceph_versions.get('stdout', '{}')|from_json).get('osd', {}) | length == 1
        - ceph_versions_osd | string is search("ceph version 10")

    # Snapshot of the cluster status; its pgmap.num_pgs is the target count
    # for the wait below.
    - name: get num_pgs - non container
      command: "{{ docker_exec_cmd_update_osd|default('') }} ceph --cluster {{ cluster }} -s --format json"
      delegate_to: "{{ groups[mon_group_name][0] }}"
      register: ceph_pgs

    # Done once pgs_by_state is non-empty and the counts of all states that
    # start with "active+clean" add up to num_pgs. Skipped when the cluster
    # has no placement groups.
    - name: waiting for clean pgs...
      command: "{{ docker_exec_cmd_update_osd|default('') }} ceph --cluster {{ cluster }} -s --format json"
      delegate_to: "{{ groups[mon_group_name][0] }}"
      register: ceph_health_post
      until: >
        (((ceph_health_post.stdout | from_json).pgmap.pgs_by_state | length) > 0)
        and
        (((ceph_health_post.stdout | from_json).pgmap.pgs_by_state | selectattr('state_name', 'search', '^active\\+clean') | map(attribute='count') | list | sum) == (ceph_pgs.stdout | from_json).pgmap.num_pgs)
      retries: "{{ health_osd_check_retries }}"
      delay: "{{ health_osd_check_delay }}"
      when: (ceph_pgs.stdout | from_json).pgmap.num_pgs != 0


# Remove the noout/norebalance flags and, when every OSD reports the same
# "ceph version 12" entry, finalize the upgrade with require-osd-release.
# All cluster commands are delegated to the first monitor.
- name: unset osd flags
  hosts: "{{ mon_group_name|default('mons') }}"
  become: true

  roles:
    - ceph-defaults
    - ceph-facts

  tasks:
    # Command prefix for running ceph inside the mon container of the first
    # monitor; left undefined (and defaulted to '') when not containerized.
    - name: set_fact docker_exec_cmd_osd
      set_fact:
        docker_exec_cmd_update_osd: "docker exec ceph-mon-{{ hostvars[groups[mon_group_name][0]]['ansible_hostname'] }}"
      when: containerized_deployment

    - name: unset osd flags
      command: "{{ docker_exec_cmd_update_osd|default('') }} ceph osd unset {{ item }} --cluster {{ cluster }}"
      delegate_to: "{{ groups[mon_group_name][0] }}"
      with_items:
        - noout
        - norebalance

    - name: get osd versions
      command: "{{ docker_exec_cmd_update_osd|default('') }} ceph --cluster {{ cluster }} versions"
      delegate_to: "{{ groups[mon_group_name][0] }}"
      register: ceph_versions

    - name: set_fact ceph_versions_osd
      set_fact:
        ceph_versions_osd: "{{ (ceph_versions.stdout|from_json).osd }}"
      delegate_to: "{{ groups[mon_group_name][0] }}"

    # length == 1 means there is a single osds versions entry
    # thus all the osds are running the same version
    - name: complete osds upgrade
      command: "{{ docker_exec_cmd_update_osd|default('') }} ceph --cluster {{ cluster }} osd require-osd-release luminous"
      delegate_to: "{{ groups[mon_group_name][0] }}"
      when:
        - (ceph_versions.get('stdout', '{}')|from_json).get('osd', {}) | length == 1
        - ceph_versions_osd | string is search("ceph version 12")

# Upgrade the metadata servers one host at a time: stop the daemon, apply
# the roles, then start it again (or restart the unit when containerized).
- name: upgrade ceph mdss cluster
  hosts: "{{ mds_group_name|default('mdss') }}"
  become: true
  serial: 1

  vars:
    upgrade_ceph_packages: true

  pre_tasks:
    - name: stop ceph mds
      systemd:
        name: "ceph-mds@{{ ansible_hostname }}"
        state: stopped
        enabled: true
      when: not containerized_deployment

  roles:
    - ceph-defaults
    - ceph-facts
    - ceph-handler
    - role: ceph-common
      when: not containerized_deployment
    - role: ceph-docker-common
      when: containerized_deployment
    - ceph-config
    - ceph-mds

  post_tasks:
    - name: start ceph mds
      systemd:
        name: "ceph-mds@{{ ansible_hostname }}"
        state: started
        enabled: true
      when: not containerized_deployment

    # Containerized hosts get a unit restart preceded by a daemon-reload.
    - name: restart ceph mds
      systemd:
        name: "ceph-mds@{{ ansible_hostname }}"
        state: restarted
        enabled: true
        daemon_reload: true
      when: containerized_deployment


# Upgrade the rados gateways one host at a time: stop the daemon, apply the
# roles, then start it again (or restart the unit when containerized).
- name: upgrade ceph rgws cluster
  hosts: "{{ rgw_group_name|default('rgws') }}"
  become: true
  serial: 1

  vars:
    upgrade_ceph_packages: true

  pre_tasks:
    - name: stop ceph rgw
      systemd:
        name: "ceph-radosgw@rgw.{{ ansible_hostname }}"
        state: stopped
        enabled: true
      when: not containerized_deployment

  roles:
    - ceph-defaults
    - ceph-facts
    - ceph-handler
    - role: ceph-common
      when: not containerized_deployment
    - role: ceph-docker-common
      when: containerized_deployment
    - ceph-config
    - ceph-rgw

  post_tasks:
    - name: start ceph rgw
      systemd:
        name: "ceph-radosgw@rgw.{{ ansible_hostname }}"
        state: started
        enabled: true
      when: not containerized_deployment

    # Containerized hosts get a unit restart preceded by a daemon-reload.
    - name: restart containerized ceph rgw
      systemd:
        name: "ceph-radosgw@rgw.{{ ansible_hostname }}"
        state: restarted
        enabled: true
        daemon_reload: true
      when: containerized_deployment


# Upgrade the rbd-mirror hosts one at a time. Two unit naming schemes are
# stopped beforehand; only the "rbd-mirror.<hostname>" instance is started
# again afterwards.
- name: upgrade ceph rbd mirror node
  hosts: "{{ rbd_mirror_group_name|default('rbdmirrors') }}"
  become: true
  serial: 1

  vars:
    upgrade_ceph_packages: true

  pre_tasks:
    # NOTE(leseb): these tasks have a 'failed_when: false'
    # in case we run before luminous or after
    # The unit named after ceph_rbd_mirror_local_user is also disabled.
    - name: stop ceph rbd mirror before luminous
      systemd:
        name: "ceph-rbd-mirror@{{ ceph_rbd_mirror_local_user }}"
        state: stopped
        enabled: false
      failed_when: false

    - name: stop ceph rbd mirror for and after luminous
      systemd:
        name: "ceph-rbd-mirror@rbd-mirror.{{ ansible_hostname }}"
        state: stopped
        enabled: true
      failed_when: false

  roles:
    - ceph-defaults
    - ceph-facts
    - ceph-handler
    - role: ceph-common
      when: not containerized_deployment
    - role: ceph-docker-common
      when: containerized_deployment
    - ceph-config
    - ceph-rbd-mirror

  post_tasks:
    - name: start ceph rbd mirror
      systemd:
        name: "ceph-rbd-mirror@rbd-mirror.{{ ansible_hostname }}"
        state: started
        enabled: true
      when: not containerized_deployment

    # Containerized hosts get a unit restart preceded by a daemon-reload.
    - name: restart containerized ceph rbd mirror
      systemd:
        name: "ceph-rbd-mirror@rbd-mirror.{{ ansible_hostname }}"
        state: restarted
        enabled: true
        daemon_reload: true
      when: containerized_deployment


# Upgrade the NFS gateways one host at a time. The service is brought back
# afterwards only when ceph_nfs_enable_service is set.
- name: upgrade ceph nfs node
  hosts: "{{ nfs_group_name|default('nfss') }}"
  become: true
  serial: 1

  vars:
    upgrade_ceph_packages: true

  pre_tasks:
    # failed_when: false is here so that if we upgrade
    # from a version of ceph that does not have nfs-ganesha
    # then this task will not fail
    - name: stop ceph nfs
      systemd:
        name: nfs-ganesha
        state: stopped
        enabled: true
      failed_when: false
      when: not containerized_deployment

  roles:
    - ceph-defaults
    - ceph-facts
    - ceph-handler
    - role: ceph-common
      when: not containerized_deployment
    - role: ceph-docker-common
      when: containerized_deployment
    - ceph-config
    - ceph-nfs

  post_tasks:
    - name: start nfs gateway
      systemd:
        name: nfs-ganesha
        state: started
        enabled: true
      when:
        - not containerized_deployment
        - ceph_nfs_enable_service

    # The container unit instance is ceph_nfs_service_suffix when defined,
    # otherwise the short hostname.
    - name: systemd restart nfs container
      systemd:
        name: "ceph-nfs@{{ ceph_nfs_service_suffix | default(ansible_hostname) }}"
        state: restarted
        enabled: true
        daemon_reload: true
      when:
        - ceph_nfs_enable_service
        - containerized_deployment


# Upgrade the iSCSI gateways one host at a time: stop rbd-target-gw, apply
# the roles and start the service again on non-containerized hosts.
- name: upgrade ceph iscsi gateway node
  hosts:
    - "{{ iscsi_gw_group_name|default('iscsigws') }}"
    - iscsi-gws # for backward compatibility only!
  become: true
  serial: 1

  vars:
    upgrade_ceph_packages: true

  pre_tasks:
    # failed_when: false is here so that if we upgrade
    # from a version of ceph that does not have iscsi gws
    # then this task will not fail
    - name: stop rbd-target-gw
      systemd:
        name: rbd-target-gw
        state: stopped
        enabled: true
      failed_when: false
      when: not containerized_deployment

  roles:
    - ceph-defaults
    - ceph-facts
    - ceph-handler
    - role: ceph-common
      when: not containerized_deployment
    - role: ceph-docker-common
      when: containerized_deployment
    - ceph-config
    - ceph-iscsi-gw

  post_tasks:
    - name: start rbd-target-gw
      systemd:
        name: rbd-target-gw
        state: started
        enabled: true
      when: not containerized_deployment


# Upgrade the client hosts by applying the roles only; no service is
# stopped or started here.
- name: upgrade ceph client node
  hosts: "{{ client_group_name|default('clients') }}"
  become: true
  # Batch size: client_update_batch when defined, otherwise ansible_forks.
  serial: "{{ client_update_batch | default(ansible_forks) }}"

  vars:
    upgrade_ceph_packages: true

  roles:
    - ceph-defaults
    - ceph-facts
    - ceph-handler
    - role: ceph-common
      when: not containerized_deployment
    - role: ceph-docker-common
      when: containerized_deployment
    - ceph-config
    - ceph-client


# Final step: run "ceph -s" on the first monitor (through its mon container
# when the deployment is containerized).
- name: show ceph status
  hosts: "{{ mon_group_name|default('mons') }}"
  become: true

  roles:
    - ceph-defaults

  tasks:
    # Left undefined (and defaulted to '' below) when not containerized.
    - name: set_fact docker_exec_cmd_status
      set_fact:
        docker_exec_cmd_status: "docker exec ceph-mon-{{ hostvars[groups[mon_group_name][0]]['ansible_hostname'] }}"
      when: containerized_deployment

    - name: show ceph status
      command: "{{ docker_exec_cmd_status|default('') }} ceph --cluster {{ cluster }} -s"
      delegate_to: "{{ groups[mon_group_name][0] }}"