ceph-ansible/infrastructure-playbooks/rolling_update.yml

844 lines
25 KiB
YAML
Raw Blame History

This file contains invisible Unicode characters!

This file contains invisible Unicode characters that may be processed differently from what appears below. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to reveal hidden characters.

---
# This playbook does a rolling update for all the Ceph services
#
# The value of 'serial:' adjusts the number of servers to be updated simultaneously.
# We recommend a value of 1, which means hosts of a group (e.g: monitor) will be
# upgraded one by one. It is really crucial for the update process to happen
# in a serialized fashion. DO NOT CHANGE THIS VALUE.
#
#
# If you run a Ceph community version, you have to change the variable: ceph_stable_release to the new release
#
# If you run Red Hat Ceph Storage and are doing a **major** update (e.g: from 2 to 3), you have two options:
# - if you use a CDN, you have to change the ceph_rhcs_version to a newer one
# - if you use an ISO, you have to change the ceph_rhcs_iso_path to the directory containing the new Ceph version
#
- name: confirm whether user really meant to upgrade the cluster
hosts: localhost
become: false
vars:
- mgr_group_name: mgrs
vars_prompt:
- name: ireallymeanit
prompt: Are you sure you want to upgrade the cluster?
default: 'no'
private: no
tasks:
- name: exit playbook, if user did not mean to upgrade cluster
fail:
msg: >
"Exiting rolling_update.yml playbook, cluster was NOT upgraded.
To upgrade the cluster, either say 'yes' on the prompt or
use `-e ireallymeanit=yes` on the command line when
invoking the playbook"
when: ireallymeanit != 'yes'
- name: gather facts and check the init system
hosts:
- "{{ mon_group_name|default('mons') }}"
- "{{ osd_group_name|default('osds') }}"
- "{{ mds_group_name|default('mdss') }}"
- "{{ rgw_group_name|default('rgws') }}"
- "{{ mgr_group_name|default('mgrs') }}"
- "{{ rbd_mirror_group_name|default('rbdmirrors') }}"
- "{{ nfs_group_name|default('nfss') }}"
- "{{ client_group_name|default('clients') }}"
become: True
gather_facts: False
vars:
delegate_facts_host: True
tasks:
- debug: msg="gather facts on all Ceph hosts for following reference"
- name: gather facts
setup:
when:
- not delegate_facts_host | bool
- name: gather and delegate facts
setup:
delegate_to: "{{ item }}"
delegate_facts: True
with_items: "{{ groups['all'] }}"
run_once: true
when:
- delegate_facts_host | bool
- set_fact: rolling_update=true
- name: upgrade ceph mon cluster
vars:
health_mon_check_retries: 5
health_mon_check_delay: 15
upgrade_ceph_packages: True
hosts:
- "{{ mon_group_name|default('mons') }}"
serial: 1
become: True
tasks:
- name: set mon_host_count
set_fact:
mon_host_count: "{{ groups[mon_group_name] | length }}"
- name: fail when less than three monitors
fail:
msg: "Upgrade of cluster with less than three monitors is not supported."
when:
- mon_host_count | int < 3
- name: select a running monitor
set_fact:
mon_host: "{{ groups[mon_group_name] | difference([inventory_hostname]) | last }}"
- name: stop ceph mon - shortname
systemd:
name: ceph-mon@{{ ansible_hostname }}
state: stopped
enabled: yes
ignore_errors: True
when:
- not containerized_deployment
- name: stop ceph mon - fqdn
systemd:
name: ceph-mon@{{ ansible_fqdn }}
state: stopped
enabled: yes
ignore_errors: True
when:
- not containerized_deployment
- name: stop ceph mgr
systemd:
name: ceph-mgr@{{ ansible_hostname }}
state: stopped
enabled: yes
ignore_errors: True # if no mgr collocated with mons
when:
- not containerized_deployment
- import_role:
name: ceph-defaults
- import_role:
name: ceph-facts
- import_role:
name: ceph-handler
- import_role:
name: ceph-common
when: not containerized_deployment
- import_role:
name: ceph-container-common
when: containerized_deployment
- import_role:
name: ceph-config
- import_role:
name: ceph-mon
- import_role:
name: ceph-mgr
when: groups.get(mgr_group_name, []) | length == 0
- name: set osd flags
command: ceph --cluster {{ cluster }} osd set {{ item }}
with_items:
- noout
- norebalance
delegate_to: "{{ mon_host }}"
when:
- inventory_hostname == groups[mon_group_name][0]
- not containerized_deployment
- name: set containerized osd flags
command: >
{{ container_binary }} exec ceph-mon-{{ hostvars[mon_host]['ansible_hostname'] }} ceph --cluster {{ cluster }} osd set {{ item }}
with_items:
- noout
- norebalance
delegate_to: "{{ mon_host }}"
when:
- inventory_hostname == groups[mon_group_name][0]
- containerized_deployment
- name: start ceph mon
systemd:
name: ceph-mon@{{ monitor_name }}
state: started
enabled: yes
when:
- not containerized_deployment
- name: start ceph mgr
systemd:
name: ceph-mgr@{{ ansible_hostname }}
state: started
enabled: yes
ignore_errors: True # if no mgr collocated with mons
when:
- not containerized_deployment
- name: restart containerized ceph mon
systemd:
name: ceph-mon@{{ monitor_name }}
state: restarted
enabled: yes
daemon_reload: yes
when:
- containerized_deployment
- name: restart containerized ceph mgr
systemd:
name: ceph-mgr@{{ monitor_name }}
state: restarted
enabled: yes
daemon_reload: yes
ignore_errors: True # if no mgr collocated with mons
when:
- containerized_deployment
- name: non container | waiting for the monitor to join the quorum...
command: ceph --cluster "{{ cluster }}" -s --format json
register: ceph_health_raw
until: >
hostvars[inventory_hostname]['ansible_hostname'] in (ceph_health_raw.stdout | default('{}') | from_json)["quorum_names"] or
hostvars[inventory_hostname]['ansible_fqdn'] in (ceph_health_raw.stdout | default('{}') | from_json)["quorum_names"]
retries: "{{ health_mon_check_retries }}"
delay: "{{ health_mon_check_delay }}"
delegate_to: "{{ mon_host }}"
when:
- not containerized_deployment
- name: container | waiting for the containerized monitor to join the quorum...
command: >
{{ container_binary }} exec ceph-mon-{{ hostvars[inventory_hostname]['ansible_hostname'] }} ceph --cluster "{{ cluster }}" -s --format json
register: ceph_health_raw
until: >
hostvars[inventory_hostname]['ansible_hostname'] in (ceph_health_raw.stdout | default('{}') | from_json)["quorum_names"] or
hostvars[inventory_hostname]['ansible_fqdn'] in (ceph_health_raw.stdout | default('{}') | from_json)["quorum_names"]
retries: "{{ health_mon_check_retries }}"
delay: "{{ health_mon_check_delay }}"
delegate_to: "{{ mon_host }}"
when:
- containerized_deployment
- name: create potentially missing keys (rbd and rbd-mirror)
ceph_key:
name: "client.{{ item.0 }}"
state: present
dest: "/var/lib/ceph/{{ item.0 }}/"
caps:
mon: "allow profile {{ item.0 }}"
cluster: "{{ cluster }}"
when:
- cephx
delegate_to: "{{ item.1 }}"
ignore_errors: True # this might fail for upgrade from J to L on rbd-mirror and also on partially updated clusters
with_nested:
- ['bootstrap-rbd', 'bootstrap-rbd-mirror']
- "{{ groups[mon_group_name] | difference([mon_host]) }}" # so the key goes on all the nodes
- name: upgrade ceph mgr node
vars:
upgrade_ceph_packages: True
ceph_release: "{{ ceph_stable_release }}"
hosts:
- "{{ mgr_group_name|default('mgrs') }}"
serial: 1
become: True
tasks:
- import_role:
name: ceph-defaults
- import_role:
name: ceph-facts
- name: non container - get current fsid
command: "ceph --cluster {{ cluster }} fsid"
register: cluster_uuid_non_container
delegate_to: "{{ groups[mon_group_name][0] }}"
when:
- not containerized_deployment
- name: container - get current fsid
command: >
{{ container_binary }} exec ceph-mon-{{ hostvars[groups[mon_group_name][0]]['ansible_hostname'] }} ceph --cluster {{ cluster }} fsid
register: cluster_uuid_container
delegate_to: "{{ groups[mon_group_name][0] }}"
when:
- containerized_deployment
- name: set_fact ceph_cluster_fsid
set_fact:
ceph_cluster_fsid: "{{ cluster_uuid_container.stdout if containerized_deployment else cluster_uuid_non_container.stdout }}"
- name: create ceph mgr keyring(s) when mon is not containerized
ceph_key:
name: "mgr.{{ hostvars[item]['ansible_hostname'] }}"
state: present
caps:
mon: allow profile mgr
osd: allow *
mds: allow *
cluster: "{{ cluster }}"
when:
- not containerized_deployment
- cephx
- groups.get(mgr_group_name, []) | length > 0
delegate_to: "{{ groups[mon_group_name][0] }}"
with_items: "{{ groups.get(mgr_group_name, []) }}"
- name: create ceph mgr keyring(s) when mon is containerized
ceph_key:
name: "mgr.{{ hostvars[item]['ansible_hostname'] }}"
state: present
caps:
mon: allow profile mgr
osd: allow *
mds: allow *
cluster: "{{ cluster }}"
environment:
CEPH_CONTAINER_IMAGE: "{{ ceph_docker_registry + '/' + ceph_docker_image + ':' + ceph_docker_image_tag if containerized_deployment else None }}"
CEPH_UID: "{{ ceph_uid if containerized_deployment else 'ceph' }}"
CEPH_CONTAINER_BINARY: "{{ container_binary }}"
when:
- containerized_deployment
- cephx
- groups.get(mgr_group_name, []) | length > 0
delegate_to: "{{ groups[mon_group_name][0] }}"
with_items: "{{ groups.get(mgr_group_name, []) }}"
- name: fetch ceph mgr key(s)
fetch:
src: "{{ ceph_conf_key_directory }}/{{ cluster }}.mgr.{{ hostvars[item]['ansible_hostname'] }}.keyring"
dest: "{{ fetch_directory }}/{{ ceph_cluster_fsid }}/{{ ceph_conf_key_directory }}/"
flat: yes
fail_on_missing: no
delegate_to: "{{ groups[mon_group_name][0] }}"
with_items:
- "{{ groups.get(mgr_group_name, []) }}"
# The following task has a failed_when: false
# to handle the scenario where no mgr existed before the upgrade
# or if we run a Ceph cluster before Luminous
- name: stop ceph mgr
systemd:
name: ceph-mgr@{{ ansible_hostname }}
state: stopped
enabled: yes
failed_when: false
when:
- not containerized_deployment
- import_role:
name: ceph-defaults
- import_role:
name: ceph-facts
- import_role:
name: ceph-handler
- import_role:
name: ceph-common
when: not containerized_deployment
- import_role:
name: ceph-container-common
when: containerized_deployment
- import_role:
name: ceph-config
- import_role:
name: ceph-mgr
- name: start ceph mgr
systemd:
name: ceph-mgr@{{ ansible_hostname }}
state: started
enabled: yes
when:
- not containerized_deployment
- name: restart containerized ceph mgr
systemd:
name: ceph-mgr@{{ ansible_hostname }}
state: restarted
enabled: yes
daemon_reload: yes
when:
- containerized_deployment
- name: upgrade ceph osds cluster
vars:
health_osd_check_retries: 40
health_osd_check_delay: 30
upgrade_ceph_packages: True
hosts:
- "{{ osd_group_name|default('osds') }}"
serial: 1
become: True
tasks:
- name: get osd numbers - non container
shell: "if [ -d /var/lib/ceph/osd ] ; then ls /var/lib/ceph/osd | sed 's/.*-//' ; fi"
register: osd_ids
changed_when: false
when: not containerized_deployment
- name: get osd unit names - container
shell: systemctl list-units | grep -E "loaded * active" | grep -oE "ceph-osd@([0-9]{1,}|[a-z]+).service"
register: osd_names
changed_when: false
when: containerized_deployment
- name: stop ceph osd
systemd:
name: ceph-osd@{{ item }}
state: stopped
enabled: yes
with_items: "{{ osd_ids.stdout_lines }}"
when:
- not containerized_deployment
- import_role:
name: ceph-defaults
- import_role:
name: ceph-facts
- import_role:
name: ceph-handler
- import_role:
name: ceph-common
when: not containerized_deployment
- import_role:
name: ceph-container-common
when: containerized_deployment
- import_role:
name: ceph-config
- import_role:
name: ceph-osd
- name: get osd numbers
shell: "if [ -d /var/lib/ceph/osd ] ; then ls /var/lib/ceph/osd | sed 's/.*-//' ; fi"
register: osd_ids
changed_when: false
when: not containerized_deployment
- name: start ceph osd
systemd:
name: ceph-osd@{{ item }}
state: started
enabled: yes
with_items: "{{ osd_ids.stdout_lines }}"
when:
- not containerized_deployment
- name: restart containerized ceph osd
systemd:
name: "{{ item }}"
state: restarted
enabled: yes
daemon_reload: yes
with_items: "{{ osd_names.stdout_lines }}"
when:
- containerized_deployment
- name: set_fact docker_exec_cmd_osd
set_fact:
docker_exec_cmd_update_osd: "{{ container_binary }} exec ceph-mon-{{ hostvars[groups[mon_group_name][0]]['ansible_hostname'] }}"
when:
- containerized_deployment
- name: get osd versions
command: "{{ docker_exec_cmd_update_osd|default('') }} ceph --cluster {{ cluster }} versions"
register: ceph_versions
delegate_to: "{{ groups[mon_group_name][0] }}"
- name: set_fact ceph_versions_osd
set_fact:
ceph_versions_osd: "{{ (ceph_versions.stdout|from_json).osd }}"
delegate_to: "{{ groups[mon_group_name][0] }}"
# length == 1 means there is a single osds versions entry
# thus all the osds are running the same version
- name: osd set sortbitwise
command: "{{ docker_exec_cmd_update_osd|default('') }} ceph --cluster {{ cluster }} osd set sortbitwise"
delegate_to: "{{ groups[mon_group_name][0] }}"
when:
- (ceph_versions.get('stdout', '{}')|from_json).get('osd', {}) | length == 1
- ceph_versions_osd | string is search("ceph version 10")
- name: get num_pgs - non container
command: "{{ docker_exec_cmd_update_osd|default('') }} ceph --cluster {{ cluster }} -s --format json"
register: ceph_pgs
delegate_to: "{{ groups[mon_group_name][0] }}"
- name: waiting for clean pgs...
command: "{{ docker_exec_cmd_update_osd|default('') }} ceph --cluster {{ cluster }} -s --format json"
register: ceph_health_post
until: >
(((ceph_health_post.stdout | from_json).pgmap.pgs_by_state | length) > 0)
and
(((ceph_health_post.stdout | from_json).pgmap.pgs_by_state | selectattr('state_name', 'search', '^active\\+clean') | map(attribute='count') | list | sum) == (ceph_pgs.stdout | from_json).pgmap.num_pgs)
delegate_to: "{{ groups[mon_group_name][0] }}"
retries: "{{ health_osd_check_retries }}"
delay: "{{ health_osd_check_delay }}"
when:
- (ceph_pgs.stdout | from_json).pgmap.num_pgs != 0
- name: unset osd flags
hosts:
- "{{ mon_group_name|default('mons') }}"
become: True
tasks:
- import_role:
name: ceph-defaults
- import_role:
name: ceph-facts
- name: set_fact docker_exec_cmd_osd
set_fact:
docker_exec_cmd_update_osd: "{{ container_binary }} exec ceph-mon-{{ hostvars[groups[mon_group_name][0]]['ansible_hostname'] }}"
when:
- containerized_deployment
- name: unset osd flags
command: "{{ docker_exec_cmd_update_osd|default('') }} ceph osd unset {{ item }} --cluster {{ cluster }}"
with_items:
- noout
- norebalance
delegate_to: "{{ groups[mon_group_name][0] }}"
- name: get osd versions
command: "{{ docker_exec_cmd_update_osd|default('') }} ceph --cluster {{ cluster }} versions"
register: ceph_versions
delegate_to: "{{ groups[mon_group_name][0] }}"
- name: set_fact ceph_versions_osd
set_fact:
ceph_versions_osd: "{{ (ceph_versions.stdout|from_json).osd }}"
delegate_to: "{{ groups[mon_group_name][0] }}"
# length == 1 means there is a single osds versions entry
# thus all the osds are running the same version
- name: complete osds upgrade
command: "{{ docker_exec_cmd_update_osd|default('') }} ceph --cluster {{ cluster }} osd require-osd-release luminous"
delegate_to: "{{ groups[mon_group_name][0] }}"
when:
- (ceph_versions.get('stdout', '{}')|from_json).get('osd', {}) | length == 1
- ceph_versions_osd | string is search("ceph version 12")
- name: upgrade ceph mdss cluster
vars:
upgrade_ceph_packages: True
hosts:
- "{{ mds_group_name|default('mdss') }}"
serial: 1
become: True
tasks:
- name: stop ceph mds
systemd:
name: ceph-mds@{{ ansible_hostname }}
state: stopped
enabled: yes
when:
- not containerized_deployment
- import_role:
name: ceph-defaults
- import_role:
name: ceph-facts
- import_role:
name: ceph-handler
- import_role:
name: ceph-common
when: not containerized_deployment
- import_role:
name: ceph-container-common
when: containerized_deployment
- import_role:
name: ceph-config
- import_role:
name: ceph-mds
- name: start ceph mds
systemd:
name: ceph-mds@{{ ansible_hostname }}
state: started
enabled: yes
when:
- not containerized_deployment
- name: restart ceph mds
systemd:
name: ceph-mds@{{ ansible_hostname }}
state: restarted
enabled: yes
daemon_reload: yes
when:
- containerized_deployment
- name: upgrade ceph rgws cluster
vars:
upgrade_ceph_packages: True
hosts:
- "{{ rgw_group_name|default('rgws') }}"
serial: 1
become: True
tasks:
- name: stop ceph rgw
systemd:
name: ceph-radosgw@rgw.{{ ansible_hostname }}
state: stopped
enabled: yes
when:
- not containerized_deployment
- import_role:
name: ceph-defaults
- import_role:
name: ceph-facts
- import_role:
name: ceph-handler
- import_role:
name: ceph-common
when: not containerized_deployment
- import_role:
name: ceph-container-common
when: containerized_deployment
- import_role:
name: ceph-config
- import_role:
name: ceph-rgw
- name: start ceph rgw
systemd:
name: ceph-radosgw@rgw.{{ ansible_hostname }}
state: started
enabled: yes
when:
- not containerized_deployment
- name: restart containerized ceph rgw
systemd:
name: ceph-radosgw@rgw.{{ ansible_hostname }}.{{ item.instance_name }}
state: restarted
enabled: yes
daemon_reload: yes
with_items: "{{ rgw_instances }}"
when:
- containerized_deployment
- name: upgrade ceph rbd mirror node
vars:
upgrade_ceph_packages: True
hosts:
- "{{ rbd_mirror_group_name|default('rbdmirrors') }}"
serial: 1
become: True
tasks:
# NOTE(leseb): these tasks have a 'failed_when: false'
# in case we run before luminous or after
- name: stop ceph rbd mirror before luminous
systemd:
name: "ceph-rbd-mirror@{{ ceph_rbd_mirror_local_user }}"
state: stopped
enabled: no
failed_when: false
- name: stop ceph rbd mirror for and after luminous
systemd:
name: "ceph-rbd-mirror@rbd-mirror.{{ ansible_hostname }}"
state: stopped
enabled: yes
failed_when: false
- import_role:
name: ceph-defaults
- import_role:
name: ceph-facts
- import_role:
name: ceph-handler
- import_role:
name: ceph-common
when: not containerized_deployment
- import_role:
name: ceph-container-common
when: containerized_deployment
- import_role:
name: ceph-config
- import_role:
name: ceph-rbd-mirror
- name: start ceph rbd mirror
systemd:
name: "ceph-rbd-mirror@rbd-mirror.{{ ansible_hostname }}"
state: started
enabled: yes
when:
- not containerized_deployment
- name: restart containerized ceph rbd mirror
systemd:
name: ceph-rbd-mirror@rbd-mirror.{{ ansible_hostname }}
state: restarted
enabled: yes
daemon_reload: yes
when:
- containerized_deployment
- name: upgrade ceph nfs node
vars:
upgrade_ceph_packages: True
hosts:
- "{{ nfs_group_name|default('nfss') }}"
serial: 1
become: True
tasks:
# failed_when: false is here so that if we upgrade
# from a version of ceph that does not have nfs-ganesha
# then this task will not fail
- name: stop ceph nfs
systemd:
name: nfs-ganesha
state: stopped
enabled: yes
failed_when: false
when:
- not containerized_deployment
- import_role:
name: ceph-defaults
- import_role:
name: ceph-facts
- import_role:
name: ceph-handler
- import_role:
name: ceph-common
when: not containerized_deployment
- import_role:
name: ceph-container-common
when: containerized_deployment
- import_role:
name: ceph-config
- import_role:
name: ceph-nfs
- name: start nfs gateway
systemd:
name: nfs-ganesha
state: started
enabled: yes
when:
- not containerized_deployment
- ceph_nfs_enable_service
- name: systemd restart nfs container
systemd:
name: ceph-nfs@{{ ceph_nfs_service_suffix | default(ansible_hostname) }}
state: restarted
enabled: yes
daemon_reload: yes
when:
- ceph_nfs_enable_service
- containerized_deployment
- name: upgrade ceph iscsi gateway node
vars:
upgrade_ceph_packages: True
hosts:
- "{{ iscsi_gw_group_name|default('iscsigws') }}"
- iscsi-gws # for backward compatibility only!
serial: 1
become: True
tasks:
# failed_when: false is here so that if we upgrade
# from a version of ceph that does not have iscsi gws
# then this task will not fail
- name: stop rbd-target-gw
systemd:
name: rbd-target-gw
state: stopped
enabled: yes
failed_when: false
when:
- not containerized_deployment
- import_role:
name: ceph-defaults
- import_role:
name: ceph-facts
- import_role:
name: ceph-handler
- import_role:
name: ceph-common
when: not containerized_deployment
- import_role:
name: ceph-container-common
when: containerized_deployment
- import_role:
name: ceph-config
- import_role:
name: ceph-iscsi-gw
- name: start rbd-target-gw
systemd:
name: rbd-target-gw
state: started
enabled: yes
when:
- not containerized_deployment
- name: upgrade ceph client node
vars:
upgrade_ceph_packages: True
hosts:
- "{{ client_group_name|default('clients') }}"
serial: "{{ client_update_batch | default(20) }}"
become: True
tasks:
- import_role:
name: ceph-defaults
- import_role:
name: ceph-facts
- import_role:
name: ceph-handler
- import_role:
name: ceph-common
when: not containerized_deployment
- import_role:
name: ceph-container-common
when: containerized_deployment
- import_role:
name: ceph-config
- import_role:
name: ceph-client
- name: show ceph status
hosts:
- "{{ mon_group_name|default('mons') }}"
become: True
tasks:
- import_role:
name: ceph-defaults
- name: set_fact docker_exec_cmd_status
set_fact:
docker_exec_cmd_status: "{{ container_binary }} exec ceph-mon-{{ hostvars[groups[mon_group_name][0]]['ansible_hostname'] }}"
when:
- containerized_deployment
- name: show ceph status
command: "{{ docker_exec_cmd_status|default('') }} ceph --cluster {{ cluster }} -s"
delegate_to: "{{ groups[mon_group_name][0] }}"