2017-03-21 11:08:25 +08:00
|
|
|
|
---
|
|
|
|
|
# This playbook is used to recover Ceph OSDs after an SSD journal failure.
|
|
|
|
|
# You will also realise that it’s really simple to bring your
|
|
|
|
|
# OSDs back to life after replacing your faulty SSD with a new one.
|
|
|
|
|
#
|
2017-03-22 17:48:34 +08:00
|
|
|
|
# Define the `dev_ssds` variable for each host whose SSDs were replaced after a
|
|
|
|
|
# failure.
|
2017-03-21 11:08:25 +08:00
|
|
|
|
#
|
|
|
|
|
# For example in host_vars/hostname1.yml
|
|
|
|
|
#
|
|
|
|
|
# dev_ssds:
|
2017-03-22 17:48:34 +08:00
|
|
|
|
# - device_name: /dev/sdd
|
2017-03-21 11:08:25 +08:00
|
|
|
|
# partitions:
|
|
|
|
|
# - index: 1
|
|
|
|
|
# size: 10G
|
|
|
|
|
# osd_id: 0
|
|
|
|
|
# - index: 2
|
|
|
|
|
# size: 10G
|
|
|
|
|
# osd_id: 1
|
2017-03-22 17:48:34 +08:00
|
|
|
|
# - device_name: /dev/sdf
|
2017-03-21 11:08:25 +08:00
|
|
|
|
# partitions:
|
|
|
|
|
# - index: 1
|
|
|
|
|
# size: 10G
|
|
|
|
|
# osd_id: 2
|
|
|
|
|
#
|
2017-03-22 17:48:34 +08:00
|
|
|
|
# @param device_name: The full device path of new ssd
|
2017-03-21 11:08:25 +08:00
|
|
|
|
# @param partitions: The custom partition layout of new ssd
|
|
|
|
|
# @param index: The index of this partition
|
|
|
|
|
# @param size: The size of this partition
|
2017-03-23 09:22:06 +08:00
|
|
|
|
# @param osd_id: The id of the OSD whose journal is stored on this partition.
|
2017-03-21 11:08:25 +08:00
|
|
|
|
#
|
|
|
|
|
# ansible-playbook recover-osds-after-ssd-journal-failure.yml
|
2024-02-14 18:14:02 +08:00
|
|
|
|
# Prompts for the host to recover. The answer defaults to null,
|
2017-03-21 11:08:25 +08:00
|
|
|
|
# which selects no host for recovery. Enter the hostname of the host
|
|
|
|
|
# whose OSDs must be recovered after the SSD journal failure.
|
|
|
|
|
#
|
|
|
|
|
# ansible-playbook -e target_host=hostname \
|
|
|
|
|
# recover-osds-after-ssd-journal-failure.yml
|
|
|
|
|
# Overrides the prompt using -e option. Can be used in
|
|
|
|
|
# automation scripts to avoid interactive prompt.
|
|
|
|
|
|
|
|
|
|
# First play: runs on the control node only. It asks for (or receives through
# `-e target_host=...`) the name of the host to recover and registers that
# host in an in-memory inventory group that the next play targets.
- name: Select the host whose osd journals must be recovered
  hosts: localhost
  gather_facts: false
  vars_prompt:
    # The answer is echoed while typing (private: false); passing
    # `-e target_host=<hostname>` on the command line skips the prompt.
    - name: target_host  # noqa: name[casing]
      prompt: please enter the target hostname which to recover osds after ssd journal failure
      private: false
  tasks:
    # add_host only changes the in-memory inventory of this run.
    - name: Add the target host to the dynamically_created_hosts group
      ansible.builtin.add_host:
        name: "{{ target_host }}"
        groups: dynamically_created_hosts
|
|
|
|
|
|
|
|
|
|
# Second play: runs on the host registered by the previous play. For every
# partition listed under `dev_ssds` it recreates the journal partition on the
# new ssd (reusing the journal uuid recorded in the osd data directory), then
# stops the osd, re-initialises its journal and starts it again.
#
# NOTE(review): `cluster` is referenced below but not defined in this play;
# it is presumably provided by inventory/group variables - confirm.
- name: Recover osd(s) after ssd journal failure
  hosts: dynamically_created_hosts
  vars:
    # Partition type code handed to `sgdisk --typecode` for each journal
    # partition created below.
    journal_typecode: 45b0969e-9b03-4f30-b4c6-b4b80ceff106
    # Empty by default so the first task can fail with a clear message when
    # the host defines no ssd layout.
    dev_ssds: []

  tasks:
    - name: Fail if dev_ssds is not defined
      ansible.builtin.fail:
        msg: please define dev_ssds variable
      when: dev_ssds | length <= 0

    # with_subelements yields one (ssd, partition) pair per partition:
    # item.0 is the ssd entry, item.1 the partition entry.
    - name: Get osd(s) if directory stat
      ansible.builtin.stat:
        path: "/var/lib/ceph/osd/{{ cluster }}-{{ item.1.osd_id }}/journal_uuid"
      register: osds_dir_stat
      with_subelements:
        - "{{ dev_ssds }}"
        - partitions

    # The loop source must be a templated expression: a bare
    # `osds_dir_stat.results` is treated as a literal string, so the stat
    # results would never be inspected.
    - name: Exit playbook osd(s) is not on this host
      ansible.builtin.fail:
        msg: exit playbook osds is not on this host
      with_items: "{{ osds_dir_stat.results }}"
      when:
        - osds_dir_stat is defined
        - not item.stat.exists

    # Retried until the package installation reports success.
    - name: Install sgdisk(gdisk)
      ansible.builtin.package:
        name: gdisk
        state: present
      register: result
      until: result is succeeded

    # Read-only lookup, hence never reported as a change. Each registered
    # result keeps the loop pair under `item` and the uuid under `stdout`.
    - name: Get osd(s) journal uuid
      ansible.builtin.command: cat "/var/lib/ceph/osd/{{ cluster }}-{{ item.1.osd_id }}/journal_uuid"
      register: osds_uuid
      changed_when: false
      with_subelements:
        - "{{ dev_ssds }}"
        - partitions

    # item.item[0] is the ssd entry, item.item[1] the partition entry and
    # item.stdout the journal uuid read by the previous task; the uuid is
    # reused as the partition GUID of the new journal partition.
    - name: Make partitions on new ssd
      ansible.builtin.shell: >
        sgdisk --new={{ item.item[1].index }}:0:+{{ item.item[1].size }} "--change-name={{ item.item[1].index }}:ceph journal"
        --typecode={{ item.item[1].index }}:{{ journal_typecode }}
        --partition-guid={{ item.item[1].index }}:{{ item.stdout }}
        --mbrtogpt -- {{ item.item[0].device_name }}
      with_items: "{{ osds_uuid.results }}"

    - name: Stop osd(s) service
      ansible.builtin.service:
        name: "ceph-osd@{{ item.item[1].osd_id }}"
        state: stopped
      with_items: "{{ osds_uuid.results }}"

    - name: Reinitialize osd(s) journal in new ssd
      ansible.builtin.command: ceph-osd -i {{ item.item[1].osd_id }} --mkjournal --cluster {{ cluster }}
      with_items: "{{ osds_uuid.results }}"

    - name: Start osd(s) service
      ansible.builtin.service:
        name: "ceph-osd@{{ item.item[1].osd_id }}"
        state: started
      with_items: "{{ osds_uuid.results }}"
|