ceph-ansible/infrastructure-playbooks/rolling_update.yml

842 lines
25 KiB
YAML
Raw Normal View History

---
# This playbook does a rolling update for all the Ceph services
#
# The value of 'serial:' adjusts the number of servers to be updated simultaneously.
# We recommend a value of 1, which means hosts of a group (e.g: monitor) will be
# upgraded one by one. It is really crucial for the update process to happen
# in a serialized fashion. DO NOT CHANGE THIS VALUE.
#
#
# If you run a Ceph community version, you have to change the variable: ceph_stable_release to the new release
#
# If you run Red Hat Ceph Storage and are doing a **major** update (e.g: from 2 to 3), you have two options:
# - if you use a CDN, you have to change the ceph_rhcs_version to a newer one
# - if you use an ISO, you have to change the ceph_rhcs_iso_path to the directory containing the new Ceph version
#
- name: confirm whether user really meant to upgrade the cluster
hosts: localhost
become: false
vars:
- mgr_group_name: mgrs
vars_prompt:
- name: ireallymeanit
prompt: Are you sure you want to upgrade the cluster?
default: 'no'
private: no
tasks:
- name: exit playbook, if user did not mean to upgrade cluster
fail:
msg: >
"Exiting rolling_update.yml playbook, cluster was NOT upgraded.
To upgrade the cluster, either say 'yes' on the prompt or
use `-e ireallymeanit=yes` on the command line when
invoking the playbook"
when: ireallymeanit != 'yes'
- name: gather facts and check the init system
hosts:
- "{{ mon_group_name|default('mons') }}"
- "{{ osd_group_name|default('osds') }}"
- "{{ mds_group_name|default('mdss') }}"
- "{{ rgw_group_name|default('rgws') }}"
- "{{ mgr_group_name|default('mgrs') }}"
- "{{ rbd_mirror_group_name|default('rbdmirrors') }}"
- "{{ nfs_group_name|default('nfss') }}"
- "{{ client_group_name|default('clients') }}"
become: True
gather_facts: False
vars:
delegate_facts_host: True
tasks:
- debug: msg="gather facts on all Ceph hosts for following reference"
- name: gather facts
setup:
when:
- not delegate_facts_host | bool
- name: gather and delegate facts
setup:
delegate_to: "{{ item }}"
delegate_facts: True
with_items: "{{ groups['all'] }}"
run_once: true
when:
- delegate_facts_host | bool
- set_fact: rolling_update=true
- name: upgrade ceph mon cluster
vars:
health_mon_check_retries: 5
health_mon_check_delay: 15
upgrade_ceph_packages: True
hosts:
- "{{ mon_group_name|default('mons') }}"
serial: 1
become: True
tasks:
- name: set mon_host_count
set_fact: mon_host_count={{ groups[mon_group_name] | length }}
- name: fail when less than three monitors
fail:
msg: "Upgrade of cluster with less than three monitors is not supported."
when:
- mon_host_count | int < 3
- name: stop ceph mon - shortname
systemd:
name: ceph-mon@{{ ansible_hostname }}
state: stopped
enabled: yes
ignore_errors: True
when:
- not containerized_deployment
- name: stop ceph mon - fqdn
systemd:
name: ceph-mon@{{ ansible_fqdn }}
state: stopped
enabled: yes
ignore_errors: True
when:
- not containerized_deployment
- name: stop ceph mgr
systemd:
name: ceph-mgr@{{ ansible_hostname }}
state: stopped
enabled: yes
ignore_errors: True # if no mgr collocated with mons
when:
- not containerized_deployment
- import_role:
name: ceph-defaults
- import_role:
name: ceph-facts
- import_role:
name: ceph-handler
- import_role:
name: ceph-common
when: not containerized_deployment
- import_role:
name: ceph-container-common
when: containerized_deployment
- import_role:
name: ceph-config
- import_role:
name: ceph-mon
- import_role:
name: ceph-mgr
when: groups.get(mgr_group_name, []) | length == 0
- name: start ceph mon
systemd:
name: ceph-mon@{{ monitor_name }}
state: started
enabled: yes
when:
- not containerized_deployment
- name: start ceph mgr
systemd:
name: ceph-mgr@{{ ansible_hostname }}
state: started
enabled: yes
ignore_errors: True # if no mgr collocated with mons
when:
- not containerized_deployment
- name: restart containerized ceph mon
systemd:
name: ceph-mon@{{ monitor_name }}
state: restarted
enabled: yes
daemon_reload: yes
when:
- containerized_deployment
- name: restart containerized ceph mgr
systemd:
name: ceph-mgr@{{ monitor_name }}
state: restarted
enabled: yes
daemon_reload: yes
ignore_errors: True # if no mgr collocated with mons
when:
- containerized_deployment
- name: set mon_host_count
set_fact: mon_host_count={{ groups[mon_group_name] | length }}
- name: select a running monitor
set_fact:
mon_host: "{{ groups[mon_group_name] | difference([inventory_hostname]) | last }}"
- name: non container | waiting for the monitor to join the quorum...
command: ceph --cluster "{{ cluster }}" -s --format json
register: ceph_health_raw
until: >
hostvars[mon_host]['ansible_hostname'] in (ceph_health_raw.stdout | default('{}') | from_json)["quorum_names"] or
hostvars[mon_host]['ansible_fqdn'] in (ceph_health_raw.stdout | default('{}') | from_json)["quorum_names"]
retries: "{{ health_mon_check_retries }}"
delay: "{{ health_mon_check_delay }}"
delegate_to: "{{ mon_host }}"
when:
- not containerized_deployment
- name: container | waiting for the containerized monitor to join the quorum...
command: >
{{ container_binary }} exec ceph-mon-{{ hostvars[mon_host]['ansible_hostname'] }} ceph --cluster "{{ cluster }}" -s --format json
register: ceph_health_raw
until: >
hostvars[mon_host]['ansible_hostname'] in (ceph_health_raw.stdout | default('{}') | from_json)["quorum_names"] or
hostvars[mon_host]['ansible_fqdn'] in (ceph_health_raw.stdout | default('{}') | from_json)["quorum_names"]
retries: "{{ health_mon_check_retries }}"
delay: "{{ health_mon_check_delay }}"
delegate_to: "{{ mon_host }}"
when:
- containerized_deployment
- name: create potentially missing keys (rbd and rbd-mirror)
ceph_key:
name: "client.{{ item.0 }}"
state: present
dest: "/var/lib/ceph/{{ item.0 }}/"
caps:
mon: "allow profile {{ item.0 }}"
cluster: "{{ cluster }}"
containerized: "{{ 'docker exec ceph-mon-' + hostvars[item.1]['ansible_hostname'] if containerized_deployment else None }}"
when:
- cephx
delegate_to: "{{ item.1 }}"
ignore_errors: True # this might fail for upgrade from J to L on rbd-mirror and also on partially updated clusters
with_nested:
- ['bootstrap-rbd', 'bootstrap-rbd-mirror']
- "{{ groups[mon_group_name] | difference([mon_host]) }}" # so the key goes on all the nodes
- name: set osd flags
command: ceph --cluster {{ cluster }} osd set {{ item }}
with_items:
- noout
- norebalance
delegate_to: "{{ mon_host }}"
when: not containerized_deployment
- name: set containerized osd flags
command: >
{{ container_binary }} exec ceph-mon-{{ hostvars[mon_host]['ansible_hostname'] }} ceph --cluster {{ cluster }} osd set {{ item }}
with_items:
- noout
- norebalance
delegate_to: "{{ mon_host }}"
when: containerized_deployment
- name: upgrade ceph mgr node
vars:
upgrade_ceph_packages: True
ceph_release: "{{ ceph_stable_release }}"
hosts:
- "{{ mgr_group_name|default('mgrs') }}"
serial: 1
become: True
tasks:
- import_role:
name: ceph-defaults
- import_role:
name: ceph-facts
- name: non container - get current fsid
command: "ceph --cluster {{ cluster }} fsid"
register: cluster_uuid_non_container
delegate_to: "{{ groups[mon_group_name][0] }}"
when:
- not containerized_deployment
- name: container - get current fsid
command: >
{{ container_binary }} exec ceph-mon-{{ hostvars[groups[mon_group_name][0]]['ansible_hostname'] }} ceph --cluster {{ cluster }} fsid
register: cluster_uuid_container
delegate_to: "{{ groups[mon_group_name][0] }}"
when:
- containerized_deployment
- name: set_fact ceph_cluster_fsid
set_fact:
ceph_cluster_fsid: "{{ cluster_uuid_container.stdout if containerized_deployment else cluster_uuid_non_container.stdout }}"
- name: create ceph mgr keyring(s) when mon is not containerized
ceph_key:
name: "mgr.{{ hostvars[item]['ansible_hostname'] }}"
state: present
caps:
mon: allow profile mgr
osd: allow *
mds: allow *
cluster: "{{ cluster }}"
when:
- not containerized_deployment
- cephx
- groups.get(mgr_group_name, []) | length > 0
delegate_to: "{{ groups[mon_group_name][0] }}"
with_items: "{{ groups.get(mgr_group_name, []) }}"
- name: create ceph mgr keyring(s) when mon is containerized
ceph_key:
name: "mgr.{{ hostvars[item]['ansible_hostname'] }}"
state: present
caps:
mon: allow profile mgr
osd: allow *
mds: allow *
cluster: "{{ cluster }}"
environment:
CEPH_CONTAINER_IMAGE: "{{ ceph_docker_registry + '/' + ceph_docker_image + ':' + ceph_docker_image_tag if containerized_deployment else None }}"
CEPH_UID: "{{ ceph_uid if containerized_deployment else 'ceph' }}"
CEPH_CONTAINER_BINARY: "{{ container_binary }}"
when:
- containerized_deployment
- cephx
- groups.get(mgr_group_name, []) | length > 0
delegate_to: "{{ groups[mon_group_name][0] }}"
with_items: "{{ groups.get(mgr_group_name, []) }}"
- name: fetch ceph mgr key(s)
fetch:
src: "{{ ceph_conf_key_directory }}/{{ cluster }}.mgr.{{ hostvars[item]['ansible_hostname'] }}.keyring"
dest: "{{ fetch_directory }}/{{ ceph_cluster_fsid }}/{{ ceph_conf_key_directory }}/"
flat: yes
fail_on_missing: no
delegate_to: "{{ groups[mon_group_name][0] }}"
with_items:
- "{{ groups.get(mgr_group_name, []) }}"
# The following task has a failed_when: false
# to handle the scenario where no mgr existed before the upgrade
# or if we run a Ceph cluster before Luminous
- name: stop ceph mgr
systemd:
name: ceph-mgr@{{ ansible_hostname }}
state: stopped
enabled: yes
failed_when: false
when:
- not containerized_deployment
- import_role:
name: ceph-defaults
- import_role:
name: ceph-facts
- import_role:
name: ceph-handler
- import_role:
name: ceph-common
when: not containerized_deployment
- import_role:
name: ceph-container-common
when: containerized_deployment
- import_role:
name: ceph-config
- import_role:
name: ceph-mgr
- name: start ceph mgr
systemd:
name: ceph-mgr@{{ ansible_hostname }}
state: started
enabled: yes
when:
- not containerized_deployment
- name: restart containerized ceph mgr
systemd:
name: ceph-mgr@{{ ansible_hostname }}
state: restarted
enabled: yes
daemon_reload: yes
when:
- containerized_deployment
- name: upgrade ceph osds cluster
vars:
health_osd_check_retries: 40
health_osd_check_delay: 30
upgrade_ceph_packages: True
hosts:
- "{{ osd_group_name|default('osds') }}"
serial: 1
become: True
tasks:
- name: get osd numbers - non container
shell: "if [ -d /var/lib/ceph/osd ] ; then ls /var/lib/ceph/osd | sed 's/.*-//' ; fi"
register: osd_ids
changed_when: false
when: not containerized_deployment
- name: get osd unit names - container
shell: systemctl list-units | grep -E "loaded * active" | grep -oE "ceph-osd@([0-9]{1,}|[a-z]+).service"
register: osd_names
changed_when: false
when: containerized_deployment
- name: stop ceph osd
systemd:
name: ceph-osd@{{ item }}
state: stopped
enabled: yes
with_items: "{{ osd_ids.stdout_lines }}"
when:
- not containerized_deployment
- import_role:
name: ceph-defaults
- import_role:
name: ceph-facts
- import_role:
name: ceph-handler
- import_role:
name: ceph-common
when: not containerized_deployment
- import_role:
name: ceph-container-common
when: containerized_deployment
- import_role:
name: ceph-config
- import_role:
name: ceph-osd
- name: get osd numbers
shell: "if [ -d /var/lib/ceph/osd ] ; then ls /var/lib/ceph/osd | sed 's/.*-//' ; fi"
register: osd_ids
changed_when: false
when: not containerized_deployment
- name: start ceph osd
systemd:
name: ceph-osd@{{ item }}
state: started
enabled: yes
with_items: "{{ osd_ids.stdout_lines }}"
when:
- not containerized_deployment
- name: restart containerized ceph osd
systemd:
name: "{{ item }}"
state: restarted
enabled: yes
daemon_reload: yes
with_items: "{{ osd_names.stdout_lines }}"
when:
- containerized_deployment
- name: set_fact docker_exec_cmd_osd
set_fact:
docker_exec_cmd_update_osd: "{{ container_binary }} exec ceph-mon-{{ hostvars[groups[mon_group_name][0]]['ansible_hostname'] }}"
when:
- containerized_deployment
- name: get osd versions
command: "{{ docker_exec_cmd_update_osd|default('') }} ceph --cluster {{ cluster }} versions"
register: ceph_versions
delegate_to: "{{ groups[mon_group_name][0] }}"
- name: set_fact ceph_versions_osd
set_fact:
ceph_versions_osd: "{{ (ceph_versions.stdout|from_json).osd }}"
delegate_to: "{{ groups[mon_group_name][0] }}"
# length == 1 means there is a single osds versions entry
# thus all the osds are running the same version
- name: osd set sortbitwise
command: "{{ docker_exec_cmd_update_osd|default('') }} ceph --cluster {{ cluster }} osd set sortbitwise"
delegate_to: "{{ groups[mon_group_name][0] }}"
when:
- (ceph_versions.get('stdout', '{}')|from_json).get('osd', {}) | length == 1
- ceph_versions_osd | string is search("ceph version 10")
- name: get num_pgs - non container
command: "{{ docker_exec_cmd_update_osd|default('') }} ceph --cluster {{ cluster }} -s --format json"
register: ceph_pgs
delegate_to: "{{ groups[mon_group_name][0] }}"
- name: waiting for clean pgs...
command: "{{ docker_exec_cmd_update_osd|default('') }} ceph --cluster {{ cluster }} -s --format json"
register: ceph_health_post
until: >
(((ceph_health_post.stdout | from_json).pgmap.pgs_by_state | length) > 0)
and
(((ceph_health_post.stdout | from_json).pgmap.pgs_by_state | selectattr('state_name', 'search', '^active\\+clean') | map(attribute='count') | list | sum) == (ceph_pgs.stdout | from_json).pgmap.num_pgs)
delegate_to: "{{ groups[mon_group_name][0] }}"
retries: "{{ health_osd_check_retries }}"
delay: "{{ health_osd_check_delay }}"
when:
- (ceph_pgs.stdout | from_json).pgmap.num_pgs != 0
- name: unset osd flags
hosts:
- "{{ mon_group_name|default('mons') }}"
become: True
tasks:
- import_role:
name: ceph-defaults
- import_role:
name: ceph-facts
- name: set_fact docker_exec_cmd_osd
set_fact:
docker_exec_cmd_update_osd: "{{ container_binary }} exec ceph-mon-{{ hostvars[groups[mon_group_name][0]]['ansible_hostname'] }}"
when:
- containerized_deployment
- name: unset osd flags
command: "{{ docker_exec_cmd_update_osd|default('') }} ceph osd unset {{ item }} --cluster {{ cluster }}"
with_items:
- noout
- norebalance
delegate_to: "{{ groups[mon_group_name][0] }}"
- name: get osd versions
command: "{{ docker_exec_cmd_update_osd|default('') }} ceph --cluster {{ cluster }} versions"
register: ceph_versions
delegate_to: "{{ groups[mon_group_name][0] }}"
- name: set_fact ceph_versions_osd
set_fact:
ceph_versions_osd: "{{ (ceph_versions.stdout|from_json).osd }}"
delegate_to: "{{ groups[mon_group_name][0] }}"
# length == 1 means there is a single osds versions entry
# thus all the osds are running the same version
- name: complete osds upgrade
command: "{{ docker_exec_cmd_update_osd|default('') }} ceph --cluster {{ cluster }} osd require-osd-release luminous"
delegate_to: "{{ groups[mon_group_name][0] }}"
when:
- (ceph_versions.get('stdout', '{}')|from_json).get('osd', {}) | length == 1
- ceph_versions_osd | string is search("ceph version 12")
- name: upgrade ceph mdss cluster
vars:
upgrade_ceph_packages: True
hosts:
- "{{ mds_group_name|default('mdss') }}"
serial: 1
become: True
tasks:
- name: stop ceph mds
systemd:
name: ceph-mds@{{ ansible_hostname }}
state: stopped
enabled: yes
when:
- not containerized_deployment
- import_role:
name: ceph-defaults
- import_role:
name: ceph-facts
- import_role:
name: ceph-handler
- import_role:
name: ceph-common
when: not containerized_deployment
- import_role:
name: ceph-container-common
when: containerized_deployment
- import_role:
name: ceph-config
- import_role:
name: ceph-mds
- name: start ceph mds
systemd:
name: ceph-mds@{{ ansible_hostname }}
state: started
enabled: yes
when:
- not containerized_deployment
- name: restart ceph mds
systemd:
name: ceph-mds@{{ ansible_hostname }}
state: restarted
enabled: yes
daemon_reload: yes
when:
- containerized_deployment
- name: upgrade ceph rgws cluster
vars:
upgrade_ceph_packages: True
hosts:
- "{{ rgw_group_name|default('rgws') }}"
serial: 1
become: True
tasks:
- name: stop ceph rgw
systemd:
name: ceph-radosgw@rgw.{{ ansible_hostname }}
state: stopped
enabled: yes
when:
- not containerized_deployment
- import_role:
name: ceph-defaults
- import_role:
name: ceph-facts
- import_role:
name: ceph-handler
- import_role:
name: ceph-common
when: not containerized_deployment
- import_role:
name: ceph-container-common
when: containerized_deployment
- import_role:
name: ceph-config
- import_role:
name: ceph-rgw
- name: start ceph rgw
systemd:
name: ceph-radosgw@rgw.{{ ansible_hostname }}
state: started
enabled: yes
when:
- not containerized_deployment
- name: restart containerized ceph rgw
systemd:
name: ceph-radosgw@rgw.{{ ansible_hostname }}.{{ item.instance_name }}
state: restarted
enabled: yes
daemon_reload: yes
with_items: "{{ rgw_instances }}"
when:
- containerized_deployment
- name: upgrade ceph rbd mirror node
vars:
upgrade_ceph_packages: True
hosts:
- "{{ rbd_mirror_group_name|default('rbdmirrors') }}"
serial: 1
become: True
tasks:
# NOTE(leseb): these tasks have a 'failed_when: false'
# in case we run before luminous or after
- name: stop ceph rbd mirror before luminous
systemd:
name: "ceph-rbd-mirror@{{ ceph_rbd_mirror_local_user }}"
state: stopped
enabled: no
failed_when: false
- name: stop ceph rbd mirror for and after luminous
systemd:
name: "ceph-rbd-mirror@rbd-mirror.{{ ansible_hostname }}"
state: stopped
enabled: yes
failed_when: false
- import_role:
name: ceph-defaults
- import_role:
name: ceph-facts
- import_role:
name: ceph-handler
- import_role:
name: ceph-common
when: not containerized_deployment
- import_role:
name: ceph-container-common
when: containerized_deployment
- import_role:
name: ceph-config
- import_role:
name: ceph-rbd-mirror
- name: start ceph rbd mirror
systemd:
name: "ceph-rbd-mirror@rbd-mirror.{{ ansible_hostname }}"
state: started
enabled: yes
when:
- not containerized_deployment
- name: restart containerized ceph rbd mirror
systemd:
name: ceph-rbd-mirror@rbd-mirror.{{ ansible_hostname }}
state: restarted
enabled: yes
daemon_reload: yes
when:
- containerized_deployment
- name: upgrade ceph nfs node
vars:
upgrade_ceph_packages: True
hosts:
- "{{ nfs_group_name|default('nfss') }}"
serial: 1
become: True
tasks:
# failed_when: false is here so that if we upgrade
# from a version of ceph that does not have nfs-ganesha
# then this task will not fail
- name: stop ceph nfs
systemd:
name: nfs-ganesha
state: stopped
enabled: yes
failed_when: false
when:
- not containerized_deployment
- import_role:
name: ceph-defaults
- import_role:
name: ceph-facts
- import_role:
name: ceph-handler
- import_role:
name: ceph-common
when: not containerized_deployment
- import_role:
name: ceph-container-common
when: containerized_deployment
- import_role:
name: ceph-config
- import_role:
name: ceph-nfs
- name: start nfs gateway
systemd:
name: nfs-ganesha
state: started
enabled: yes
when:
- not containerized_deployment
- ceph_nfs_enable_service
- name: systemd restart nfs container
systemd:
name: ceph-nfs@{{ ceph_nfs_service_suffix | default(ansible_hostname) }}
state: restarted
enabled: yes
daemon_reload: yes
when:
- ceph_nfs_enable_service
- containerized_deployment
- name: upgrade ceph iscsi gateway node
vars:
upgrade_ceph_packages: True
hosts:
- "{{ iscsi_gw_group_name|default('iscsigws') }}"
- iscsi-gws # for backward compatibility only!
serial: 1
become: True
tasks:
# failed_when: false is here so that if we upgrade
# from a version of ceph that does not have iscsi gws
# then this task will not fail
- name: stop rbd-target-gw
systemd:
name: rbd-target-gw
state: stopped
enabled: yes
failed_when: false
when:
- not containerized_deployment
- import_role:
name: ceph-defaults
- import_role:
name: ceph-facts
- import_role:
name: ceph-handler
- import_role:
name: ceph-common
when: not containerized_deployment
- import_role:
name: ceph-container-common
when: containerized_deployment
- import_role:
name: ceph-config
- import_role:
name: ceph-iscsi-gw
- name: start rbd-target-gw
systemd:
name: rbd-target-gw
state: started
enabled: yes
when:
- not containerized_deployment
- name: upgrade ceph client node
vars:
upgrade_ceph_packages: True
hosts:
- "{{ client_group_name|default('clients') }}"
serial: "{{ client_update_batch | default(20) }}"
become: True
tasks:
- import_role:
name: ceph-defaults
- import_role:
name: ceph-facts
- import_role:
name: ceph-handler
- import_role:
name: ceph-common
when: not containerized_deployment
- import_role:
name: ceph-container-common
when: containerized_deployment
- import_role:
name: ceph-config
- import_role:
name: ceph-client
- name: show ceph status
hosts:
- "{{ mon_group_name|default('mons') }}"
become: True
tasks:
- import_role:
name: ceph-defaults
- name: set_fact docker_exec_cmd_status
set_fact:
docker_exec_cmd_status: "{{ container_binary }} exec ceph-mon-{{ hostvars[groups[mon_group_name][0]]['ansible_hostname'] }}"
when:
- containerized_deployment
- name: show ceph status
command: "{{ docker_exec_cmd_status|default('') }} ceph --cluster {{ cluster }} -s"
delegate_to: "{{ groups[mon_group_name][0] }}"