ceph-crash: introduce new role ceph-crash

This commit introduces a new role `ceph-crash` in order to deploy
everything needed for the ceph-crash daemon.

Signed-off-by: Guillaume Abrioux <gabrioux@redhat.com>
pull/5571/head
Guillaume Abrioux 2020-07-03 10:21:49 +02:00
parent d490968fc8
commit 9d2f2108e1
19 changed files with 395 additions and 1 deletion

View File

@ -173,6 +173,16 @@
tasks_from: systemd.yml tasks_from: systemd.yml
when: inventory_hostname in groups.get(rgw_group_name, []) when: inventory_hostname in groups.get(rgw_group_name, [])
# Render/refresh the ceph-crash systemd unit on every node that runs a ceph
# daemon (client, nfs, iscsi-gw and grafana nodes do not run ceph-crash).
- import_role:
    name: ceph-crash
    tasks_from: systemd.yml
  when: inventory_hostname in groups.get(mon_group_name, []) or
        inventory_hostname in groups.get(osd_group_name, []) or
        inventory_hostname in groups.get(mds_group_name, []) or
        inventory_hostname in groups.get(rgw_group_name, []) or
        inventory_hostname in groups.get(mgr_group_name, []) or
        inventory_hostname in groups.get(rbdmirror_group_name, [])
- name: dashboard configuration - name: dashboard configuration
when: dashboard_enabled | bool when: dashboard_enabled | bool
block: block:

View File

@ -610,6 +610,29 @@
- /var/lib/ceph/bootstrap-mgr - /var/lib/ceph/bootstrap-mgr
- /var/lib/ceph/tmp - /var/lib/ceph/tmp
# Purge play for non-containerized deployments: stop/disable the packaged
# ceph-crash.service unit and remove its data directory on all daemon nodes.
- name: purge ceph-crash daemons
  hosts:
    - "{{ mon_group_name | default('mons') }}"
    - "{{ osd_group_name | default('osds') }}"
    - "{{ mds_group_name | default('mdss') }}"
    - "{{ rgw_group_name | default('rgws') }}"
    - "{{ rbdmirror_group_name | default('rbdmirrors') }}"
    - "{{ mgr_group_name | default('mgrs') }}"
  gather_facts: false
  become: true
  tasks:
    # best-effort stop: the unit may not exist on hosts that never ran ceph-crash
    - name: stop ceph-crash service
      service:
        name: ceph-crash.service
        state: stopped
        enabled: false
      failed_when: false

    - name: remove /var/lib/ceph/crash
      file:
        path: /var/lib/ceph/crash
        state: absent
- name: final cleanup - check any running ceph, purge ceph packages, purge config and remove data - name: final cleanup - check any running ceph, purge ceph packages, purge config and remove data

View File

@ -468,6 +468,35 @@
failed_when: false failed_when: false
when: dashboard_enabled | bool when: dashboard_enabled | bool
# Purge play for containerized deployments: stop the per-host container
# instance, remove the systemd template unit and the crash data directory.
- name: purge ceph-crash containers
  hosts:
    - "{{ mon_group_name | default('mons') }}"
    - "{{ osd_group_name | default('osds') }}"
    - "{{ mds_group_name | default('mdss') }}"
    - "{{ rgw_group_name | default('rgws') }}"
    - "{{ rbdmirror_group_name | default('rbdmirrors') }}"
    - "{{ mgr_group_name | default('mgrs') }}"
  gather_facts: false
  become: true
  tasks:
    - name: stop ceph-crash container
      service:
        name: "ceph-crash@{{ ansible_hostname }}"
        state: stopped
        enabled: false
      failed_when: false

    # systemd.yml installs the *template* unit ceph-crash@.service — remove
    # that path (the original removed the non-existent ceph-crash.service).
    - name: remove service file
      file:
        path: "/etc/systemd/system/ceph-crash@.service"
        state: absent
      failed_when: false

    # make systemd forget the removed unit
    - name: reload systemd
      systemd:
        daemon_reload: true
      failed_when: false

    - name: remove /var/lib/ceph/crash
      file:
        path: /var/lib/ceph/crash
        state: absent
- name: check container hosts - name: check container hosts
hosts: hosts:

View File

@ -912,6 +912,27 @@
- import_role: - import_role:
name: ceph-client name: ceph-client
# Rolling-upgrade play: (re)apply the ceph-crash role on every daemon node.
# ceph-defaults/ceph-facts provide the variables and container_binary fact the
# ceph-crash role consumes; ceph-handler provides the restart handlers.
- name: upgrade ceph-crash daemons
  hosts:
    - "{{ mon_group_name | default('mons') }}"
    - "{{ osd_group_name | default('osds') }}"
    - "{{ mds_group_name | default('mdss') }}"
    - "{{ rgw_group_name | default('rgws') }}"
    - "{{ rbdmirror_group_name | default('rbdmirrors') }}"
    - "{{ mgr_group_name | default('mgrs') }}"
  gather_facts: false
  become: true
  tasks:
    - import_role:
        name: ceph-defaults
    - import_role:
        name: ceph-facts
        tasks_from: container_binary.yml
    - import_role:
        name: ceph-handler
    - import_role:
        name: ceph-crash
- name: complete upgrade - name: complete upgrade
hosts: hosts:
- "{{ mon_group_name | default('mons') }}" - "{{ mon_group_name | default('mons') }}"

View File

@ -546,3 +546,37 @@
- import_role: - import_role:
name: ceph-nfs name: ceph-nfs
# Migration play: stop the packaged ceph-crash.service, then redeploy the role
# with containerized_deployment forced on so the container unit takes over.
- name: switching from non-containerized to containerized ceph-crash
  hosts:
    - "{{ mon_group_name | default('mons') }}"
    - "{{ osd_group_name | default('osds') }}"
    - "{{ mds_group_name | default('mdss') }}"
    - "{{ rgw_group_name | default('rgws') }}"
    - "{{ rbdmirror_group_name | default('rbdmirrors') }}"
    - "{{ mgr_group_name | default('mgrs') }}"
  vars:
    containerized_deployment: true
  serial: 1
  become: true
  tasks:
    # best-effort: the packaged unit may be absent on hosts that were added
    # after the bare-metal deployment — don't abort the migration for that.
    - name: stop non-containerized ceph-crash
      service:
        name: ceph-crash
        state: stopped
        enabled: false
      failed_when: false

    - import_role:
        name: ceph-defaults
    - import_role:
        name: ceph-facts
        tasks_from: container_binary.yml
    - import_role:
        name: ceph-handler
    - import_role:
        name: ceph-crash

View File

@ -63,6 +63,14 @@
- ceph_nfs_container_stat.get('rc') == 0 - ceph_nfs_container_stat.get('rc') == 0
- ceph_nfs_container_stat.get('stdout_lines', [])|length != 0 - ceph_nfs_container_stat.get('stdout_lines', [])|length != 0
# Inspect the running ceph-crash container (if the earlier `ps` check found
# one) so its current image can be compared with the freshly pulled image.
- name: inspect ceph crash container
  command: "{{ container_binary }} inspect {{ ceph_crash_container_stat.stdout }}"
  changed_when: false
  register: ceph_crash_inspect
  when:
    - ceph_crash_container_stat.get('rc') == 0
    - ceph_crash_container_stat.get('stdout_lines', []) | length != 0
# NOTE(leseb): using failed_when to handle the case when the image is not present yet # NOTE(leseb): using failed_when to handle the case when the image is not present yet
- name: "inspecting ceph mon container image before pulling" - name: "inspecting ceph mon container image before pulling"
command: "{{ container_binary }} inspect {{ (ceph_mon_inspect.stdout | from_json)[0].Image }}" command: "{{ container_binary }} inspect {{ (ceph_mon_inspect.stdout | from_json)[0].Image }}"
@ -127,6 +135,13 @@
- nfs_group_name in group_names - nfs_group_name in group_names
- ceph_nfs_inspect.get('rc') == 0 - ceph_nfs_inspect.get('rc') == 0
# Resolve the image currently backing the ceph-crash container; failed_when is
# disabled because the image may not be present locally yet.
- name: "inspecting ceph crash container image before pulling"
  command: "{{ container_binary }} inspect {{ (ceph_crash_inspect.stdout | from_json)[0].Image }}"
  changed_when: false
  failed_when: false
  register: ceph_crash_container_inspect_before_pull
  when: ceph_crash_inspect.get('rc') == 0
- name: set_fact ceph_mon_image_repodigest_before_pulling - name: set_fact ceph_mon_image_repodigest_before_pulling
set_fact: set_fact:
ceph_mon_image_repodigest_before_pulling: "{{ (ceph_mon_container_inspect_before_pull.stdout | from_json)[0].Id }}" ceph_mon_image_repodigest_before_pulling: "{{ (ceph_mon_container_inspect_before_pull.stdout | from_json)[0].Id }}"
@ -162,6 +177,11 @@
- mgr_group_name in group_names - mgr_group_name in group_names
- ceph_mgr_container_inspect_before_pull.get('rc') == 0 - ceph_mgr_container_inspect_before_pull.get('rc') == 0
# Remember the pre-pull image Id of the ceph-crash container for the
# post-pull comparison below.
- name: set_fact ceph_crash_image_repodigest_before_pulling
  set_fact:
    ceph_crash_image_repodigest_before_pulling: "{{ (ceph_crash_container_inspect_before_pull.stdout | from_json)[0].Id }}"
  when: ceph_crash_container_inspect_before_pull.get('rc') == 0
- name: set_fact ceph_rbd_mirror_image_repodigest_before_pulling - name: set_fact ceph_rbd_mirror_image_repodigest_before_pulling
set_fact: set_fact:
ceph_rbd_mirror_image_repodigest_before_pulling: "{{ (ceph_rbd_mirror_container_inspect_before_pull.stdout | from_json)[0].Id }}" ceph_rbd_mirror_image_repodigest_before_pulling: "{{ (ceph_rbd_mirror_container_inspect_before_pull.stdout | from_json)[0].Id }}"
@ -266,6 +286,15 @@
- ceph_nfs_container_inspect_before_pull.get('rc') == 0 - ceph_nfs_container_inspect_before_pull.get('rc') == 0
- ceph_nfs_image_repodigest_before_pulling != image_repodigest_after_pulling - ceph_nfs_image_repodigest_before_pulling != image_repodigest_after_pulling
# set_fact never reports "changed" on its own; changed_when: true forces a
# changed result so the "restart ceph crash" handler fires when (and only
# when, via the `when:` digest comparison) the image was actually updated.
- name: set_fact ceph_crash_image_updated
  set_fact:
    ceph_crash_image_updated: "{{ ceph_crash_image_repodigest_before_pulling != image_repodigest_after_pulling }}"
  changed_when: true
  notify: restart ceph crash
  when:
    - ceph_crash_container_inspect_before_pull.get('rc') == 0
    - ceph_crash_image_repodigest_before_pulling != image_repodigest_after_pulling
- name: export local ceph dev image - name: export local ceph dev image
command: > command: >
{{ container_binary }} save -o "/tmp/{{ ceph_docker_username }}-{{ ceph_docker_imagename }}-{{ ceph_docker_image_tag }}.tar" {{ container_binary }} save -o "/tmp/{{ ceph_docker_username }}-{{ ceph_docker_imagename }}-{{ ceph_docker_image_tag }}.tar"

View File

@ -0,0 +1,15 @@
---
# Ansible Galaxy metadata for the ceph-crash role.
galaxy_info:
  company: Red Hat
  author: Guillaume Abrioux
  description: Deploy ceph-crash
  license: Apache
  # quoted: an unquoted 2.7 would be parsed as a YAML float (2.70 == 2.7)
  min_ansible_version: "2.7"
  platforms:
    - name: EL
      versions:
        - 7
        - 8
  galaxy_tags:
    - system
dependencies: []

View File

@ -0,0 +1,71 @@
---
# Main task list for the ceph-crash role:
#   1. create the client.crash keyring on the first monitor and copy it to
#      every node that runs a ceph daemon (cephx deployments only);
#   2. for containerized deployments, prepare the crash directory and install
#      the systemd template unit;
#   3. start/enable the appropriate ceph-crash unit.
- name: create and copy client.crash keyring
  when: cephx | bool
  block:
    - name: create client.crash keyring
      ceph_key:
        state: present
        name: "client.crash"
        caps: "{{ {'mon': 'allow profile crash', 'mgr': 'allow profile crash'} }}"
        cluster: "{{ cluster }}"
        dest: "{{ ceph_conf_key_directory }}"
        import_key: true
        mode: "{{ ceph_keyring_permissions }}"
        owner: "{{ ceph_uid if containerized_deployment else 'ceph' }}"
        group: "{{ ceph_uid if containerized_deployment else 'ceph' }}"
      environment:
        CEPH_CONTAINER_IMAGE: "{{ ceph_docker_registry + '/' + ceph_docker_image + ':' + ceph_docker_image_tag if containerized_deployment else None }}"
        CEPH_CONTAINER_BINARY: "{{ container_binary }}"
      delegate_to: "{{ groups.get(mon_group_name, [])[0] }}"
      run_once: true

    - name: get keys from monitors
      command: "{{ hostvars[groups[mon_group_name][0]]['container_exec_cmd'] | default('') }} ceph --cluster {{ cluster }} auth get client.crash"
      register: _crash_keys
      delegate_to: "{{ groups.get(mon_group_name)[0] }}"
      run_once: true

    # every daemon node: all hosts minus grafana/client/nfs/iscsi-gw groups
    - name: get a list of node where the keyring should be copied
      set_fact:
        list_target_node: "{{ list_target_node | default([]) | union(((groups.get('all') | difference(groups.get(grafana_server_group_name, []) + groups.get(client_group_name, []) + groups.get(nfs_group_name, []) + groups.get(iscsi_gw_group_name, []))) + groups.get(item, [])) | unique) }}"
      run_once: true
      with_items:
        - "{{ mon_group_name if groups.get(mon_group_name, []) | length > 0 else [] }}"
        - "{{ osd_group_name if groups.get(osd_group_name, []) | length > 0 else [] }}"
        - "{{ mds_group_name if groups.get(mds_group_name, []) | length > 0 else [] }}"
        - "{{ rgw_group_name if groups.get(rgw_group_name, []) | length > 0 else [] }}"
        - "{{ rbdmirror_group_name if groups.get(rbdmirror_group_name, []) | length > 0 else [] }}"
        - "{{ mgr_group_name if groups.get(mgr_group_name, []) | length > 0 else [] }}"

    - name: copy ceph key(s) if needed
      copy:
        dest: "{{ ceph_conf_key_directory }}/{{ cluster }}.client.crash.keyring"
        content: "{{ _crash_keys.stdout + '\n' }}"
        owner: "{{ ceph_uid if containerized_deployment else 'ceph' }}"
        group: "{{ ceph_uid if containerized_deployment else 'ceph' }}"
        mode: "{{ ceph_keyring_permissions }}"
      with_items: "{{ list_target_node }}"
      delegate_to: "{{ item }}"
      run_once: true

# Container-only preparation: the ceph package creates these pieces itself on
# bare-metal deployments.
- name: start ceph-crash daemon
  when: containerized_deployment | bool
  block:
    - name: create /var/lib/ceph/crash/posted
      file:
        path: /var/lib/ceph/crash/posted
        state: directory
        mode: '0755'
        owner: "{{ ceph_uid }}"
        group: "{{ ceph_uid }}"

    - name: include_tasks systemd.yml
      include_tasks: systemd.yml

# Runs for both flavours: template instance for containers, packaged unit
# otherwise.
- name: start the ceph-crash service
  systemd:
    name: "{{ 'ceph-crash@' + ansible_hostname if containerized_deployment | bool else 'ceph-crash.service' }}"
    state: started
    enabled: true
    masked: false
    daemon_reload: true

View File

@ -0,0 +1,9 @@
---
# Install the systemd *template* unit used to run ceph-crash in a container.
# Re-rendering the unit notifies the "restart ceph crash" handler so a
# changed template restarts the running instance.
- name: generate systemd unit file for ceph-crash container
  template:
    src: "{{ role_path }}/templates/ceph-crash.service.j2"
    dest: /etc/systemd/system/ceph-crash@.service
    owner: "root"
    group: "root"
    mode: "0644"
  notify: restart ceph crash

View File

@ -0,0 +1,41 @@
# systemd template unit (rendered as ceph-crash@.service); %i is the instance
# name — the host's short hostname — and %t/%n expand to the runtime dir and
# full unit name.
[Unit]
Description=Ceph crash dump collector
{% if container_binary == 'docker' %}
After=docker.service
Requires=docker.service
{% else %}
After=network.target
{% endif %}
# NOTE(review): StartLimitInterval/StartLimitBurst below are legacy spellings;
# on newer systemd these are [Unit] options (StartLimitIntervalSec) — confirm
# the minimum systemd version targeted.
[Service]
{% if container_binary == 'podman' %}
# podman: clean up stale pid/cid files and any leftover container first
ExecStartPre=-/usr/bin/rm -f /%t/%n-pid /%t/%n-cid
ExecStartPre=-/usr/bin/{{ container_binary }} rm -f ceph-crash-%i
{% endif %}
# run ceph-crash in the ceph image; host network and /var/lib/ceph mounted so
# the daemon can post crash reports from the shared crash directory
ExecStart=/usr/bin/{{ container_binary }} run --rm --name ceph-crash-%i \
{% if container_binary == 'podman' %}
-d --conmon-pidfile /%t/%n-pid --cidfile /%t/%n-cid \
{% endif %}
--net=host \
-v /var/lib/ceph:/var/lib/ceph:z \
-v /etc/localtime:/etc/localtime:ro \
--entrypoint=/usr/bin/ceph-crash {{ ceph_docker_registry }}/{{ ceph_docker_image }}:{{ ceph_docker_image_tag }}
{% if container_binary == 'podman' %}
ExecStop=-/usr/bin/sh -c "/usr/bin/{{ container_binary }} rm -f `cat /%t/%n-cid`"
{% else %}
ExecStop=-/usr/bin/{{ container_binary }} stop ceph-crash-%i
{% endif %}
StartLimitInterval=10min
StartLimitBurst=30
{% if container_binary == 'podman' %}
# podman detaches (-d above), so systemd tracks it via the conmon pidfile
Type=forking
PIDFile=/%t/%n-pid
{% endif %}
# NOTE(review): KillMode=none is deprecated in recent systemd; the ExecStop
# lines already stop the container — confirm whether it is still required.
KillMode=none
Restart=always
RestartSec=10s
TimeoutStartSec=120
TimeoutStopSec=10
[Install]
WantedBy=multi-user.target

View File

@ -66,6 +66,10 @@
when: iscsi_gw_group_name in group_names when: iscsi_gw_group_name in group_names
listen: "restart ceph rbd-target-api-gw" listen: "restart ceph rbd-target-api-gw"
# Entry point for notify "restart ceph crash": delegates the actual restart
# logic to handler_crash.yml.
- name: ceph crash handler
  include_tasks: handler_crash.yml
  listen: "restart ceph crash"
- name: remove tempdir for scripts - name: remove tempdir for scripts
file: file:
path: "{{ tmpdirpath.path }}" path: "{{ tmpdirpath.path }}"

View File

@ -78,3 +78,10 @@
failed_when: false failed_when: false
check_mode: no check_mode: no
when: inventory_hostname in groups.get(iscsi_gw_group_name, []) when: inventory_hostname in groups.get(iscsi_gw_group_name, [])
# Detect a running ceph-crash container for this host.
# NOTE(review): unlike the sibling checks above, this task has no
# `when: inventory_hostname in groups.get(...)` guard, so it runs on every
# host — confirm that is intended (ceph-crash is deployed on most groups).
- name: check for a ceph-crash container
  command: "{{ container_binary }} ps -q --filter='name=ceph-crash-{{ ansible_hostname }}'"
  register: ceph_crash_container_stat
  changed_when: false
  failed_when: false
  check_mode: false

View File

@ -216,3 +216,10 @@
failed_when: false failed_when: false
check_mode: no check_mode: no
when: inventory_hostname in groups.get(iscsi_gw_group_name, []) when: inventory_hostname in groups.get(iscsi_gw_group_name, [])
# Detect a running (non-containerized) ceph-crash process; rc is consumed by
# the handler_crash_status fact.
- name: check for a ceph-crash process
  command: pgrep ceph-crash
  register: crash_process
  changed_when: false
  failed_when: false
  check_mode: false

View File

@ -0,0 +1,18 @@
---
# Handler body for "restart ceph crash": the _crash_handler_called guard
# follows the ceph-handler pattern so the restart runs at most once per flush.
- name: set _crash_handler_called before restart
  set_fact:
    _crash_handler_called: true

# Bug fix: non-containerized deployments run the packaged "ceph-crash.service"
# unit — the "ceph-crash@<hostname>" template instance only exists for
# containerized deployments (same conditional as the role's start task).
- name: restart the ceph-crash service
  systemd:
    name: "{{ 'ceph-crash@' + ansible_hostname if containerized_deployment | bool else 'ceph-crash.service' }}"
    state: restarted
    enabled: true
    masked: false
    daemon_reload: true
  ignore_errors: true
  when: hostvars[inventory_hostname]['_crash_handler_called'] | default(False) | bool

- name: set _crash_handler_called after restart
  set_fact:
    _crash_handler_called: false

View File

@ -37,3 +37,14 @@
set_fact: set_fact:
handler_mgr_status: "{{ (mgr_socket_stat.get('rc') == 0) if not containerized_deployment | bool else (ceph_mgr_container_stat.get('rc') == 0 and ceph_mgr_container_stat.get('stdout_lines', []) | length != 0) }}" handler_mgr_status: "{{ (mgr_socket_stat.get('rc') == 0) if not containerized_deployment | bool else (ceph_mgr_container_stat.get('rc') == 0 and ceph_mgr_container_stat.get('stdout_lines', []) | length != 0) }}"
when: inventory_hostname in groups.get(mgr_group_name, []) when: inventory_hostname in groups.get(mgr_group_name, [])
# Record whether ceph-crash is currently up: pgrep rc on bare metal, container
# ps output for containerized deployments. Only evaluated on groups that run
# ceph-crash.
- name: set_fact handler_crash_status
  set_fact:
    handler_crash_status: "{{ crash_process.get('rc') == 0 if not containerized_deployment | bool else (ceph_crash_container_stat.get('rc') == 0 and ceph_crash_container_stat.get('stdout_lines', []) | length != 0) }}"
  when:
    - inventory_hostname in groups.get(mon_group_name, [])
      or inventory_hostname in groups.get(mgr_group_name, [])
      or inventory_hostname in groups.get(osd_group_name, [])
      or inventory_hostname in groups.get(mds_group_name, [])
      or inventory_hostname in groups.get(rgw_group_name, [])
      or inventory_hostname in groups.get(rbdmirror_group_name, [])

View File

@ -424,6 +424,30 @@
- dashboard_enabled | bool - dashboard_enabled | bool
- groups.get(grafana_server_group_name, []) | length > 0 - groups.get(grafana_server_group_name, []) | length > 0
# Deploy ceph-crash on all daemon nodes. Use the configurable group-name
# variables (with their documented defaults) for consistency with every other
# play, instead of hard-coded group literals.
- hosts:
    - "{{ mon_group_name | default('mons') }}"
    - "{{ osd_group_name | default('osds') }}"
    - "{{ mds_group_name | default('mdss') }}"
    - "{{ rgw_group_name | default('rgws') }}"
    - "{{ rbdmirror_group_name | default('rbdmirrors') }}"
    - "{{ mgr_group_name | default('mgrs') }}"
  gather_facts: false
  become: true
  any_errors_fatal: true
  tasks:
    - import_role:
        name: ceph-defaults
    - import_role:
        name: ceph-facts
        tasks_from: container_binary.yml
    - import_role:
        name: ceph-handler
    - import_role:
        name: ceph-crash
- hosts: mons - hosts: mons
gather_facts: false gather_facts: false
become: True become: True

View File

@ -446,6 +446,29 @@
- dashboard_enabled | bool - dashboard_enabled | bool
- groups.get(grafana_server_group_name, []) | length > 0 - groups.get(grafana_server_group_name, []) | length > 0
# Deploy ceph-crash on all daemon nodes (containerized site playbook). Use the
# configurable group-name variables for consistency with the other plays
# instead of hard-coded group literals.
- hosts:
    - "{{ mon_group_name | default('mons') }}"
    - "{{ osd_group_name | default('osds') }}"
    - "{{ mds_group_name | default('mdss') }}"
    - "{{ rgw_group_name | default('rgws') }}"
    - "{{ rbdmirror_group_name | default('rbdmirrors') }}"
    - "{{ mgr_group_name | default('mgrs') }}"
  gather_facts: false
  become: true
  any_errors_fatal: true
  tasks:
    - import_role:
        name: ceph-defaults
    - import_role:
        name: ceph-facts
        tasks_from: container_binary.yml
    - import_role:
        name: ceph-handler
    - import_role:
        name: ceph-crash
- hosts: mons - hosts: mons
gather_facts: false gather_facts: false
become: True become: True

View File

@ -127,6 +127,9 @@ def node(host, request):
request.function, group_names) request.function, group_names)
pytest.skip(reason) pytest.skip(reason)
if request.node.get_closest_marker('ceph_crash') and group_names in [['nfss'], ['iscsigws'], ['clients'], ['grafana-server']]:
pytest.skip('Not a valid test for nfs, client or iscsigw nodes')
if request.node.get_closest_marker("no_docker") and docker: if request.node.get_closest_marker("no_docker") and docker:
pytest.skip( pytest.skip(
"Not a valid test for containerized deployments or atomic hosts") "Not a valid test for containerized deployments or atomic hosts")

View File

@ -29,3 +29,18 @@ class TestCephConf(object):
if pattern.search(mon_host_line) is None: if pattern.search(mon_host_line) is None:
result = False result = False
assert result assert result
class TestCephCrash(object):
    """Smoke tests: the ceph-crash systemd unit must be enabled and running."""

    @pytest.mark.no_docker
    @pytest.mark.ceph_crash
    def test_ceph_crash_service_enabled_and_running(self, node, host):
        # bare-metal deployments run the packaged ceph-crash.service unit
        svc = host.service("ceph-crash")
        assert svc.is_enabled
        assert svc.is_running

    @pytest.mark.docker
    @pytest.mark.ceph_crash
    def test_ceph_crash_service_enabled_and_running_container(self, node, host):
        # containerized deployments run one ceph-crash@<hostname> instance
        unit = "ceph-crash@{}".format(node["vars"]["inventory_hostname"])
        svc = host.service(unit)
        assert svc.is_enabled
        assert svc.is_running