ceph-ansible/infrastructure-playbooks/untested-by-ci/recover-osds-after-ssd-jour...

116 lines
3.4 KiB
YAML
Raw Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

---
# This playbook is used to recover Ceph OSDs after an SSD journal failure.
# You will also realise that it's really simple to bring your
# OSDs back to life after replacing your faulty SSD with a new one.
#
# You should define the `dev_ssds` variable for each host whose SSDs were
# replaced after the failure.
#
# For example in host_vars/hostname1.yml
#
# dev_ssds:
#   - device_name: /dev/sdd
#     partitions:
#       - index: 1
#         size: 10G
#         osd_id: 0
#       - index: 2
#         size: 10G
#         osd_id: 1
#   - device_name: /dev/sdf
#     partitions:
#       - index: 1
#         size: 10G
#         osd_id: 2
#
# @param device_name: The full device path of new ssd
# @param partitions: The custom partition layout of new ssd
# @param index: The index of this partition
# @param size: The size of this partition
# @param osd_id: The id of the OSD whose journal this partition is for.
#
# ansible-playbook recover-osds-after-ssd-journal-failure.yml
#     Prompts for the host to recover. The answer defaults to null, in
#     which case no host is selected for recovery. Enter the hostname on
#     which to recover the OSDs after the SSD journal failure.
#
# ansible-playbook -e target_host=hostname \
# recover-osds-after-ssd-journal-failure.yml
# Overrides the prompt using -e option. Can be used in
# automation scripts to avoid interactive prompt.
# Play 1: runs on the control node only. It obtains `target_host` (from the
# interactive prompt, or from `-e target_host=...` which suppresses the
# prompt) and registers that host in the in-memory group
# `dynamically_created_hosts`.
- name: select the host whose osd journals must be recovered
  hosts: localhost
  gather_facts: false
  vars_prompt:
    - name: target_host
      prompt: please enter the target hostname which to recover osds after ssd journal failure
      private: false
  tasks:
    - name: add the target host to the dynamically_created_hosts group
      add_host:
        name: "{{ target_host }}"
        groups: dynamically_created_hosts
      # An empty or null answer means "select no host": skip the registration
      # instead of handing an empty hostname to add_host.
      when: target_host | default('', true) | string | trim | length > 0
# Play 2: runs on every host in `dynamically_created_hosts`. For each
# partition listed under `dev_ssds` it checks that the OSD's journal_uuid
# file exists on the host, recreates the journal partition on the new SSD
# with the UUID read from that file, then stops the OSD, re-runs
# `ceph-osd --mkjournal` and starts the OSD again.
#
# Variables read but not defined in this play:
#   dev_ssds - list of {device_name, partitions: [{index, size, osd_id}]}
#   cluster  - used in the OSD data path and passed to `ceph-osd --cluster`
#              (NOTE(review): presumably supplied by inventory/group vars or
#              `-e`; confirm before running)
- name: recover osd journals on the selected host
  hosts: dynamically_created_hosts
  vars:
    # Passed to `sgdisk --typecode` for every journal partition created below.
    journal_typecode: "45b0969e-9b03-4f30-b4c6-b4b80ceff106"
    # `dev_ssds` is deliberately NOT given a default here: play-level vars take
    # precedence over host_vars, so a `dev_ssds: []` entry would mask the
    # per-host definition. The first task supplies the empty default instead.
  tasks:
    - name: fail when dev_ssds is not defined
      fail:
        msg: please define dev_ssds variable
      when: dev_ssds | default([]) | length == 0

    # One stat per (ssd, partition) pair: item.0 is the ssd entry, item.1 the
    # partition entry.
    - name: get osd(s) if directory stat
      stat:
        path: "/var/lib/ceph/osd/{{ cluster }}-{{ item.1.osd_id }}/journal_uuid"
      register: osds_dir_stat
      with_subelements:
        - "{{ dev_ssds }}"
        - partitions

    # Abort as soon as one of the listed OSDs has no journal_uuid file here.
    - name: exit playbook osd(s) is not on this host
      fail:
        msg: exit playbook osds is not on this host
      with_items: "{{ osds_dir_stat.results }}"
      when: not item.stat.exists

    - name: install sgdisk(gdisk)
      package:
        name: gdisk
        state: present
      register: result
      until: result is succeeded

    # Read-only: the file content (stdout) is reused as the GUID of the new
    # partition, so this task never reports a change.
    - name: get osd(s) journal uuid
      command: cat "/var/lib/ceph/osd/{{ cluster }}-{{ item.1.osd_id }}/journal_uuid"
      register: osds_uuid
      changed_when: false
      with_subelements:
        - "{{ dev_ssds }}"
        - partitions

    # Each result keeps its loop item under `item.item`:
    # item.item[0] is the ssd entry, item.item[1] the partition entry and
    # item.stdout the journal uuid read by the previous task.
    - name: make partitions on new ssd
      command: >-
        sgdisk
        --new={{ item.item[1].index }}:0:+{{ item.item[1].size }}
        "--change-name={{ item.item[1].index }}:ceph journal"
        --typecode={{ item.item[1].index }}:{{ journal_typecode }}
        --partition-guid={{ item.item[1].index }}:{{ item.stdout }}
        --mbrtogpt -- {{ item.item[0].device_name }}
      with_items: "{{ osds_uuid.results }}"

    - name: stop osd(s) service
      service:
        name: "ceph-osd@{{ item.item[1].osd_id }}"
        state: stopped
      with_items: "{{ osds_uuid.results }}"

    - name: reinitialize osd(s) journal in new ssd
      command: ceph-osd -i {{ item.item[1].osd_id }} --mkjournal --cluster {{ cluster }}
      with_items: "{{ osds_uuid.results }}"

    - name: start osd(s) service
      service:
        name: "ceph-osd@{{ item.item[1].osd_id }}"
        state: started
      with_items: "{{ osds_uuid.results }}"