mirror of https://github.com/ceph/ceph-ansible.git
ceph-handler: Fix OSD restart script
There are two big issues with the current OSD restart script. 1/ We try to test whether the ceph osd daemon socket exists, but we use a wildcard for the socket name: /var/run/ceph/*.asok. This fails because we usually have multiple ceph osd sockets (or sockets of other collocated ceph daemons) present in the /var/run/ceph directory. Currently the test fails with: bash: line xxx: [: too many arguments -- but this doesn't stop the script execution. Instead we can specify the full ceph osd socket name because we already know the OSD id. 2/ The container filter pattern is wrong and could match multiple containers, causing the script to fail. We use the filter with two different patterns. One is with the device name (sda, sdb, ..) and the other one is with the OSD id (ceph-osd-0, ceph-osd-15, ..). In both cases we could match more than needed. $ docker container ls CONTAINER ID IMAGE NAMES 958121a7cc7d ceph-daemon:latest ceph-osd-strg0-sda 589a982d43b5 ceph-daemon:latest ceph-osd-strg0-sdb 46c7240d71f3 ceph-daemon:latest ceph-osd-strg0-sdaa 877985ec3aca ceph-daemon:latest ceph-osd-strg0-sdab $ docker container ls -q -f "name=sda" 958121a7cc7d 46c7240d71f3 877985ec3aca $ docker container ls CONTAINER ID IMAGE NAMES 2db399b3ee85 ceph-daemon:latest ceph-osd-5 099dc13f08f1 ceph-daemon:latest ceph-osd-13 5d0c2fe8f121 ceph-daemon:latest ceph-osd-17 d6c7b89db1d1 ceph-daemon:latest ceph-osd-1 $ docker container ls -q -f "name=ceph-osd-1" 099dc13f08f1 5d0c2fe8f121 d6c7b89db1d1 Adding an extra '$' character at the end of the pattern solves the problem. Finally, this commit removes the get_container_osd_id function because it's not used in the script at all. Signed-off-by: Dimitri Savineau <dsavinea@redhat.com> pull/4141/head
parent
dc187ea6fa
commit
45d46541cb
|
@ -29,7 +29,7 @@ check_pgs() {
|
|||
wait_for_socket_in_container() {
|
||||
osd_mount_point=$({{ container_binary }} exec "$1" df --output=target | grep '/var/lib/ceph/osd/')
|
||||
whoami=$({{ container_binary }} exec "$1" cat $osd_mount_point/whoami)
|
||||
if ! {{ container_binary }} exec "$1" timeout 10 bash -c "while [ ! -e /var/run/ceph/*.asok ]; do sleep 1 ; done"; then
|
||||
if ! {{ container_binary }} exec "$1" timeout 10 bash -c "while [ ! -e /var/run/ceph/ceph-osd.${whoami}.asok ]; do sleep 1 ; done"; then
|
||||
echo "Timed out while trying to look for a Ceph OSD socket."
|
||||
echo "Abort mission!"
|
||||
exit 1
|
||||
|
@ -45,7 +45,7 @@ get_container_id_from_dev_name() {
|
|||
local count
|
||||
count=10
|
||||
while [ $count -ne 0 ]; do
|
||||
id=$({{ container_binary }} ps -q -f "name=$1")
|
||||
id=$({{ container_binary }} ps -q -f "name=${1}$")
|
||||
test "$id" != "" && break
|
||||
sleep $DELAY
|
||||
let count=count-1
|
||||
|
@ -53,11 +53,6 @@ get_container_id_from_dev_name() {
|
|||
echo "$id"
|
||||
}
|
||||
|
||||
get_container_osd_id() {
|
||||
wait_for_socket_in_container $1
|
||||
{{ container_binary }} exec "$1" ls /var/run/ceph | cut -d'.' -f2
|
||||
}
|
||||
|
||||
# For containerized deployments, the unit file looks like: ceph-osd@sda.service
|
||||
# For non-containerized deployments, the unit file looks like: ceph-osd@NNN.service where NNN is OSD ID
|
||||
for unit in $(systemctl list-units | grep -E "loaded * active" | grep -oE "ceph-osd@([0-9]+|[a-z]+).service"); do
|
||||
|
|
Loading…
Reference in New Issue