---
# This playbook is used to recover Ceph OSDs after an SSD journal failure.
# You will also realize that it is really simple to bring your
# OSDs back to life after replacing the faulty SSD with a new one.
#
# You must define the `dev_ssds` variable for the host whose SSD was
# replaced after the failure.
#
# For example, in host_vars/hostname1.yml:
#
#  dev_ssds:
#    - device_name: sdd
#      partitions:
#        - index: 1
#          size: 10G
#          osd_id: 0
#        - index: 2
#          size: 10G
#          osd_id: 1
#    - device_name: sdf
#      partitions:
#        - index: 1
#          size: 10G
#          osd_id: 2
#
# @param device_name: The device name of the new SSD
# @param partitions: The custom partition layout of the new SSD
# @param index: The index of this partition
# @param size: The size of this partition
# @param osd_id: The OSD whose journal this partition is for
#
# ansible-playbook recover-osds-after-ssd-journal-failure.yml
#     Prompts for the host to recover. Defaults to null, in which case no
#     host is selected for recovery. Enter the hostname of the host whose
#     OSDs should be recovered after the SSD journal failure.
#
# ansible-playbook -e target_host=hostname \
#     recover-osds-after-ssd-journal-failure.yml
#     Overrides the prompt with the -e option. Can be used in
#     automation scripts to avoid the interactive prompt.
- hosts: localhost
  gather_facts: no
  vars_prompt:
    - name: target_host
      prompt: please enter the target hostname on which to recover the osds after the ssd journal failure
      private: no
  tasks:
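    # Add the selected host to a temporary group so the next play can target it.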
    - add_host:
        name: "{{ target_host }}"
        groups: dynamically_created_hosts

- hosts: dynamically_created_hosts
  vars:
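    # GPT partition type GUID that ceph-disk assigns to Ceph journal partitions.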
    journal_typecode: 45b0969e-9b03-4f30-b4c6-b4b80ceff106
    dev_ssds: []
  tasks:
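    # The cluster name is the basename of the *.conf file under /etc/ceph/
    # that contains the fsid, e.g. /etc/ceph/ceph.conf -> "ceph".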
    - name: get the name of the existing ceph cluster
      shell: |
        basename $(grep -R fsid /etc/ceph/ | egrep -o '^[^.]*')
      changed_when: false
      register: cluster_name
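    # Each OSD data directory stores the GUID of its journal partition in a
    # journal_uuid file; its presence confirms the OSD really lives on this host.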
    - name: check if osd(s) journal_uuid file exists
      stat:
        path: "/var/lib/ceph/osd/{{ cluster_name.stdout }}-{{ item.1.osd_id }}/journal_uuid"
      register: osds_dir_stat
      with_subelements:
        - "{{ dev_ssds }}"
        - partitions
      when: dev_ssds is defined
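    # Abort the play if any of the requested OSD ids is not present on this host.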
    - name: exit playbook, osd(s) not on this host
      fail:
        msg: exit playbook, osd(s) not on this host
      with_items: "{{ osds_dir_stat.results }}"
      when:
        - osds_dir_stat is defined
        - not item.stat.exists
    - name: install sgdisk(gdisk)
      package:
        name: gdisk
        state: present
      when: dev_ssds is defined
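    # Read back the journal partition GUID each OSD expects, so the new
    # partition can be created with exactly the same GUID.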
    - name: get osd(s) journal uuid
      shell: cat "/var/lib/ceph/osd/{{ cluster_name.stdout }}-{{ item.1.osd_id }}/journal_uuid"
      register: osds_uuid
      with_subelements:
        - "{{ dev_ssds }}"
        - partitions
      when: dev_ssds is defined
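    # sgdisk creates each journal partition at the requested index and size,
    # names it "ceph journal", tags it with the journal typecode, and reuses
    # the saved GUID so the OSD's journal symlink (by-partuuid) resolves again.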
    - name: make partitions on new ssd
      shell: >
        sgdisk --new={{ item.item[1].index }}:0:+{{ item.item[1].size }}
        "--change-name={{ item.item[1].index }}:ceph journal"
        --typecode={{ item.item[1].index }}:{{ journal_typecode }}
        --partition-guid={{ item.item[1].index }}:{{ item.stdout }}
        --mbrtogpt -- /dev/{{ item.item[0].device_name }}
      with_items:
        - "{{ osds_uuid.results }}"
      when: dev_ssds is defined
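    # The OSD must be stopped before its journal can be recreated.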
    - name: stop osd(s) service
      service:
        name: "ceph-osd@{{ item.item[1].osd_id }}"
        state: stopped
        enabled: no
      with_items:
        - "{{ osds_uuid.results }}"
      when: dev_ssds is defined
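    # ceph-osd --mkjournal initializes a fresh journal for the OSD on the
    # newly created partition.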
    - name: reinitialize osd(s) journal in new ssd
      shell: >
        ceph-osd -i {{ item.item[1].osd_id }} --mkjournal --cluster {{ cluster_name.stdout }}
      with_items:
        - "{{ osds_uuid.results }}"
      when: dev_ssds is defined and cluster_name is defined
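    # Bring the OSD back up and re-enable it now that its journal is valid again.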
    - name: start osd(s) service
      service:
        name: "ceph-osd@{{ item.item[1].osd_id }}"
        state: started
        enabled: yes
      with_items:
        - "{{ osds_uuid.results }}"
      when: dev_ssds is defined