---
# This playbook is used to recover Ceph OSDs after an ssd journal failure.
# You will also realise that it is really simple to bring your
# OSDs back to life after replacing your faulty SSD with a new one.
#
# You must define the `dev_ssds` variable for each host whose ssds were
# replaced after the failure.
#
# For example in host_vars/hostname1.yml
#
# dev_ssds:
#   - device_name: /dev/sdd
#     partitions:
#       - index: 1
#         size: 10G
#         osd_id: 0
#       - index: 2
#         size: 10G
#         osd_id: 1
#   - device_name: /dev/sdf
#     partitions:
#       - index: 1
#         size: 10G
#         osd_id: 2
#
# @param device_name: The full device path of the new ssd
# @param partitions: The custom partition layout of the new ssd
# @param index: The index of this partition
# @param size: The size of this partition
# @param osd_id: The ID of the osd whose journal this partition is for
#
# ansible-playbook recover-osds-after-ssd-journal-failure.yml
#   Prompts for the host to recover. Defaults to null, in which case no
#   host is selected for recovery. Enter the hostname of the host whose
#   osds need to be recovered after the ssd journal failure.
#
# ansible-playbook -e target_host=hostname \
#   recover-osds-after-ssd-journal-failure.yml
#   Overrides the prompt with the -e option. Can be used in automation
#   scripts to avoid the interactive prompt.
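#
# For illustration, given the sample dev_ssds above, the partition task
# below runs a command equivalent to the following for osd.0 (where
# <journal uuid> is the value read back from that osd's journal_uuid file):
#
#   sgdisk --new=1:0:+10G "--change-name=1:ceph journal" \
#     --typecode=1:45b0969e-9b03-4f30-b4c6-b4b80ceff106 \
#     --partition-guid=1:<journal uuid> \
#     --mbrtogpt -- /dev/sdd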
- hosts: localhost
  gather_facts: false
  vars_prompt:
    - name: target_host  # noqa: name[casing]
      prompt: please enter the target hostname on which to recover osds after the ssd journal failure
      private: false
  tasks:
    - name: Add the target host to a dynamic group
      ansible.builtin.add_host:
        name: "{{ target_host }}"
        groups: dynamically_created_hosts
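
# The play above only records the entered hostname in an in-memory group;
# the play below then performs the actual recovery on that host.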
- hosts: dynamically_created_hosts
  vars:
    # GPT partition type GUID used by ceph for journal partitions
    journal_typecode: 45b0969e-9b03-4f30-b4c6-b4b80ceff106
    dev_ssds: []
  tasks:
    - name: Fail when dev_ssds is not defined
      ansible.builtin.fail:
        msg: please define the dev_ssds variable
      when: dev_ssds | length <= 0
    - name: Stat osd(s) journal_uuid file
      ansible.builtin.stat:
        path: "/var/lib/ceph/osd/{{ cluster }}-{{ item.1.osd_id }}/journal_uuid"
      register: osds_dir_stat
      with_subelements:
        - "{{ dev_ssds }}"
        - partitions
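    # The journal_uuid file records (in the usual ceph-disk layout) the GPT
    # partition GUID that the osd's journal symlink points at; if the file
    # is missing, the osd does not live on this host.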
    - name: Exit playbook when osd(s) are not on this host
      ansible.builtin.fail:
        msg: exiting playbook, the given osds are not on this host
      with_items: "{{ osds_dir_stat.results }}"
      when:
        - osds_dir_stat is defined
        - not item.stat.exists
    - name: Install sgdisk (gdisk)
      ansible.builtin.package:
        name: gdisk
        state: present
      register: result
      until: result is succeeded
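    # sgdisk is shipped in the gdisk package; the retry above presumably
    # guards against transient package manager failures.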
    - name: Get osd(s) journal uuid
      ansible.builtin.command: cat "/var/lib/ceph/osd/{{ cluster }}-{{ item.1.osd_id }}/journal_uuid"
      register: osds_uuid
      with_subelements:
        - "{{ dev_ssds }}"
        - partitions
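    # Each result registered above carries the matching dev_ssds entry as
    # item.item (device at index 0, partition at index 1) and the old
    # journal uuid as item.stdout; the tasks below rely on this layout.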
    - name: Make partitions on new ssd
      ansible.builtin.shell: >
        sgdisk --new={{ item.item[1].index }}:0:+{{ item.item[1].size }}
        "--change-name={{ item.item[1].index }}:ceph journal"
        --typecode={{ item.item[1].index }}:{{ journal_typecode }}
        --partition-guid={{ item.item[1].index }}:{{ item.stdout }}
        --mbrtogpt -- {{ item.item[0].device_name }}
      with_items: "{{ osds_uuid.results }}"
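    # Reusing the old journal uuid as the new partition's GUID means the
    # osd's journal symlink (/dev/disk/by-partuuid/<uuid>) resolves to the
    # new partition again; the remaining tasks recreate the journal and
    # restart the daemons.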
    - name: Stop osd(s) service
      ansible.builtin.service:
        name: "ceph-osd@{{ item.item[1].osd_id }}"
        state: stopped
      with_items: "{{ osds_uuid.results }}"
    - name: Reinitialize osd(s) journal in new ssd
      ansible.builtin.command: ceph-osd -i {{ item.item[1].osd_id }} --mkjournal --cluster {{ cluster }}
      with_items: "{{ osds_uuid.results }}"
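    # --mkjournal initializes a fresh journal on the new partition, so the
    # osd(s) can now be started again.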
    - name: Start osd(s) service
      ansible.builtin.service:
        name: "ceph-osd@{{ item.item[1].osd_id }}"
        state: started
      with_items: "{{ osds_uuid.results }}"