Fix: tests: handling of subprocesses on bail out
authorJonathan Rajotte <jonathan.rajotte-julien@efficios.com>
Wed, 17 Feb 2021 14:19:50 +0000 (09:19 -0500)
committerJérémie Galarneau <jeremie.galarneau@efficios.com>
Tue, 13 Apr 2021 20:27:40 +0000 (16:27 -0400)
Observed issue
==============

Background processes of the test_notification_ust tests are present on CI
worker after the jobs is finished.

Cause
=====

The usage of BAIL_OUT during the notap version of stop lttng-sessiond
would end up preventing the normal cleanup of background processes since
we exit directly when issuing the bail out.

Solution
========

Introduce LTTNG_BAIL_OUT.

LTTNG_BAIL_OUT ensure that we call the cleanup path that is normally
used when testing is interrupted.

Add the flag `is_cleanup` to all relevant `*stop_opts` functions.
And introduce the `*_cleanup` functions for relayd, sessiond, consumerd.

While at it, a small rework on how we kill the subprocesses of the
non-iteractive shell was done. This is useful because when using the
test runner the group id of the running process performing the kill is
not valid and result in simply not propagating the SIGTERM signal.

We now use "set -m" to enable monitor mode ensuring that all background
jobs is its own process group id, facilitating the usage of kill with
negative value to ensure that each background jobs subprocesses receives
the SIGTERM signal.

Known drawbacks
=========

We introduce a new lttng specific bail out directive instead of using
the BAIL_OUT from tap.sh

We could override the BAIL_OUT function based on [1].

[1] https://mharrison.org/post/bashfunctionoverride/

References
==========

https://linux.die.net/man/1/bash#:~:text=Monitor%20mode

Signed-off-by: Jonathan Rajotte <jonathan.rajotte-julien@efficios.com>
Signed-off-by: Jérémie Galarneau <jeremie.galarneau@efficios.com>
Change-Id: I42ce8659df3e1d9078cc1a46a11a33a2df9a145e

tests/utils/utils.sh

index 4ef4c72645b39294fada49b4eaf60c71efc91da4..b17ba26ebe724326703a11cecac561f7d64eb989 100644 (file)
@@ -40,25 +40,60 @@ if [ -z ${LTTNG_TEST_TEARDOWN_TIMEOUT+x} ]; then
        LTTNG_TEST_TEARDOWN_TIMEOUT=60
 fi
 
-function full_cleanup ()
+# Enable job monitor mode.
+# Here we are mostly interested in the following from the monitor mode:
+#    All processes run in a separate process group.
+# This allows us to ensure that all subprocesses from all background tasks are
+# cleaned up correctly using signal to process group id.
+set -m
+
+kill_background_jobs ()
+{
+       local pids
+       pids=$(jobs -p)
+
+       if [ -z "$pids" ]; then
+               # Empty
+               return 0
+       fi
+
+       while read -r pid;
+       do
+               # Use negative number to send the signal to the process group.
+               # This ensure that any subprocesses receive the signal.
+               # /dev/null is used since there is an acceptable race between
+               # the moments the pids are listed and the moment we send a
+               # signal.
+               kill -SIGTERM -- "-$pid" 2>/dev/null
+       done <<< "$pids"
+}
+
+function cleanup ()
 {
        # Try to kill daemons gracefully
-       stop_lttng_relayd_notap SIGTERM $LTTNG_TEST_TEARDOWN_TIMEOUT
-       stop_lttng_sessiond_notap SIGTERM $LTTNG_TEST_TEARDOWN_TIMEOUT
+       stop_lttng_relayd_cleanup SIGTERM $LTTNG_TEST_TEARDOWN_TIMEOUT
+       stop_lttng_sessiond_cleanup SIGTERM $LTTNG_TEST_TEARDOWN_TIMEOUT
 
        # If daemons are still present, forcibly kill them
-       stop_lttng_relayd_notap SIGKILL $LTTNG_TEST_TEARDOWN_TIMEOUT
-       stop_lttng_sessiond_notap SIGKILL $LTTNG_TEST_TEARDOWN_TIMEOUT
-       stop_lttng_consumerd_notap SIGKILL $LTTNG_TEST_TEARDOWN_TIMEOUT
-
-       # Disable trap for SIGTERM since the following kill to the
-       # pidgroup will be SIGTERM. Otherwise it loops.
-       # The '-' before the pid number ($$) indicates 'kill' to signal the
-       # whole process group.
-       trap - SIGTERM && kill -- -$$
+       stop_lttng_relayd_cleanup SIGKILL $LTTNG_TEST_TEARDOWN_TIMEOUT
+       stop_lttng_sessiond_cleanup SIGKILL $LTTNG_TEST_TEARDOWN_TIMEOUT
+       stop_lttng_consumerd_cleanup SIGKILL $LTTNG_TEST_TEARDOWN_TIMEOUT
+
+       kill_background_jobs
+}
+
+function full_cleanup ()
+{
+       cleanup
        exit 1
 }
 
+function LTTNG_BAIL_OUT ()
+{
+       cleanup
+       BAIL_OUT "$@"
+}
+
 function null_pipes ()
 {
        exec 0>/dev/null
@@ -187,7 +222,7 @@ function validate_lttng_modules_present ()
                return 0
        fi
 
-       BAIL_OUT "LTTng modules not detected."
+       LTTNG_BAIL_OUT "LTTng modules not detected."
 }
 
 function enable_kernel_lttng_event
@@ -491,27 +526,32 @@ function start_lttng_relayd_notap()
 function stop_lttng_relayd_opt()
 {
        local withtap=$1
-       local signal=$2
+       local is_cleanup=$2
+       local signal=$3
+       local timeout_s=$4
+       local dtimeleft_s=
+       local retval=0
+       local pids
 
        if [ -z "$signal" ]; then
                signal="SIGTERM"
        fi
 
-       local timeout_s=$3
-       local dtimeleft_s=
 
        # Multiply time by 2 to simplify integer arithmetic
        if [ -n "$timeout_s" ]; then
                dtimeleft_s=$((timeout_s * 2))
        fi
 
-       local retval=0
-       local pids=
 
        pids=$(lttng_pgrep "$RELAYD_MATCH")
        if [ -z "$pids" ]; then
-               if [ "$withtap" -eq "1" ]; then
-                       pass "No relay daemon to kill"
+               if [ "$is_cleanup" -eq 1 ]; then
+                       :
+               elif [ "$withtap" -eq "1" ]; then
+                       fail "No relay daemon to kill"
+               else
+                       LTTNG_BAIL_OUT "No relay daemon to kill"
                fi
                return 0
        fi
@@ -550,12 +590,17 @@ function stop_lttng_relayd_opt()
 
 function stop_lttng_relayd()
 {
-       stop_lttng_relayd_opt 1 "$@"
+       stop_lttng_relayd_opt 1 "$@"
 }
 
 function stop_lttng_relayd_notap()
 {
-       stop_lttng_relayd_opt 0 "$@"
+       stop_lttng_relayd_opt 0 0 "$@"
+}
+
+function stop_lttng_relayd_cleanup()
+{
+       stop_lttng_relayd_opt 0 1 "$@"
 }
 
 #First arg: show tap output
@@ -602,7 +647,7 @@ function start_lttng_sessiond_opt()
 
        if ! validate_kernel_version; then
            fail "Start session daemon"
-           BAIL_OUT "*** Kernel too old for session daemon tests ***"
+           LTTNG_BAIL_OUT "*** Kernel too old for session daemon tests ***"
        fi
 
        : "${LTTNG_SESSION_CONFIG_XSD_PATH="${DIR}/../src/common/config/"}"
@@ -638,15 +683,18 @@ function start_lttng_sessiond_notap()
 function stop_lttng_sessiond_opt()
 {
        local withtap=$1
-       local signal=$2
+       local is_cleanup=$2
+       local signal=$3
+       local timeout_s=$4
+       local dtimeleft_s=
+       local retval=0
+       local runas_pids
+       local pids
 
        if [ -z "$signal" ]; then
                signal=SIGTERM
        fi
 
-       local timeout_s=$3
-       local dtimeleft_s=
-
        # Multiply time by 2 to simplify integer arithmetic
        if [ -n "$timeout_s" ]; then
                dtimeleft_s=$((timeout_s * 2))
@@ -657,12 +705,7 @@ function stop_lttng_sessiond_opt()
                return 0
        fi
 
-       local retval=0
-
-       local runas_pids=
        runas_pids=$(lttng_pgrep "$RUNAS_MATCH")
-
-       local pids=
        pids=$(lttng_pgrep "$SESSIOND_MATCH")
 
        if [ -n "$runas_pids" ]; then
@@ -670,10 +713,12 @@ function stop_lttng_sessiond_opt()
        fi
 
        if [ -z "$pids" ]; then
-               if [ "$withtap" -eq "1" ]; then
+               if [ "$is_cleanup" -eq 1 ]; then 
+                       :
+               elif [ "$withtap" -eq "1" ]; then
                        fail "No session daemon to kill"
                else
-                       BAIL_OUT "No session daemon to kill"
+                       LTTNG_BAIL_OUT "No session daemon to kill"
                fi
                return 0
        fi
@@ -737,44 +782,50 @@ function stop_lttng_sessiond_opt()
 
 function stop_lttng_sessiond()
 {
-       stop_lttng_sessiond_opt 1 "$@"
+       stop_lttng_sessiond_opt 1 "$@"
 }
 
 function stop_lttng_sessiond_notap()
 {
-       stop_lttng_sessiond_opt 0 "$@"
+       stop_lttng_sessiond_opt 0 0 "$@"
+}
+
+function stop_lttng_sessiond_cleanup()
+{
+       stop_lttng_sessiond_opt 0 1 "$@"
 }
 
 function sigstop_lttng_sessiond_opt()
 {
        local withtap=$1
        local signal=SIGSTOP
+       local pids
 
        if [ -n "$TEST_NO_SESSIOND" ] && [ "$TEST_NO_SESSIOND" == "1" ]; then
                # Env variable requested no session daemon
                return
        fi
 
-       PID_SESSIOND="$(lttng_pgrep "${SESSIOND_MATCH}") $(lttng_pgrep "$RUNAS_MATCH")"
+       pids="$(lttng_pgrep "${SESSIOND_MATCH}") $(lttng_pgrep "$RUNAS_MATCH")"
 
        if [ "$withtap" -eq "1" ]; then
-               diag "Sending SIGSTOP to lt-$SESSIOND_BIN and $SESSIOND_BIN pids: $(echo "$PID_SESSIOND" | tr '\n' ' ')"
+               diag "Sending SIGSTOP to lt-$SESSIOND_BIN and $SESSIOND_BIN pids: $(echo "$pids" | tr '\n' ' ')"
        fi
 
        # shellcheck disable=SC2086
-       if ! kill -s $signal $PID_SESSIOND 1> $OUTPUT_DEST 2> $ERROR_OUTPUT_DEST; then
+       if ! kill -s $signal $pids 1> $OUTPUT_DEST 2> $ERROR_OUTPUT_DEST; then
                if [ "$withtap" -eq "1" ]; then
                        fail "Sending SIGSTOP to session daemon"
                fi
        else
                out=1
                while [ $out -ne 0 ]; do
-                       pid="$(lttng_pgrep "$SESSIOND_MATCH")"
+                       pids="$(lttng_pgrep "$SESSIOND_MATCH")"
 
                        # Wait until state becomes stopped for session
                        # daemon(s).
                        out=0
-                       for sessiond_pid in $pid; do
+                       for sessiond_pid in $pids; do
                                state="$(ps -p "$sessiond_pid" -o state= )"
                                if [[ -n "$state" && "$state" != "T" ]]; then
                                        out=1
@@ -801,35 +852,39 @@ function sigstop_lttng_sessiond_notap()
 function stop_lttng_consumerd_opt()
 {
        local withtap=$1
-       local signal=$2
+       local is_cleanup=$2
+       local signal=$3
+       local timeout_s=$4
+       local dtimeleft_s=
+       local retval=0
+       local pids
 
        if [ -z "$signal" ]; then
                signal=SIGTERM
        fi
 
-       local timeout_s=$3
-       local dtimeleft_s=
-
        # Multiply time by 2 to simplify integer arithmetic
        if [ -n "$timeout_s" ]; then
                dtimeleft_s=$((timeout_s * 2))
        fi
 
-       local retval=0
-
-       PID_CONSUMERD="$(lttng_pgrep "$CONSUMERD_MATCH")"
+       pids="$(lttng_pgrep "$CONSUMERD_MATCH")"
 
-       if [ -z "$PID_CONSUMERD" ]; then
-               if [ "$withtap" -eq "1" ]; then
-                       pass "No consumer daemon to kill"
+       if [ -z "$pids" ]; then
+               if [ "$is_cleanup" -eq 1 ]; then
+                       :
+               elif [ "$withtap" -eq "1" ]; then
+                       fail "No consumerd daemon to kill"
+               else
+                       LTTNG_BAIL_OUT "No consumerd daemon to kill"
                fi
                return 0
        fi
 
-       diag "Killing (signal $signal) $CONSUMERD_BIN pids: $(echo "$PID_CONSUMERD" | tr '\n' ' ')"
+       diag "Killing (signal $signal) $CONSUMERD_BIN pids: $(echo "$pids" | tr '\n' ' ')"
 
        # shellcheck disable=SC2086
-       if ! kill -s $signal $PID_CONSUMERD 1> $OUTPUT_DEST 2> $ERROR_OUTPUT_DEST; then
+       if ! kill -s $signal $pids 1> $OUTPUT_DEST 2> $ERROR_OUTPUT_DEST; then
                retval=1
                if [ "$withtap" -eq "1" ]; then
                        fail "Kill consumer daemon"
@@ -837,12 +892,12 @@ function stop_lttng_consumerd_opt()
        else
                out=1
                while [ $out -ne 0 ]; do
-                       pid="$(lttng_pgrep "$CONSUMERD_MATCH")"
+                       pids="$(lttng_pgrep "$CONSUMERD_MATCH")"
 
                        # If consumerds are still present check their status.
                        # A zombie status qualifies the consumerd as *killed*
                        out=0
-                       for consumer_pid in $pid; do
+                       for consumer_pid in $pids; do
                                state="$(ps -p "$consumer_pid" -o state= )"
                                if [[ -n "$state" && "$state" != "Z" ]]; then
                                        out=1
@@ -871,25 +926,31 @@ function stop_lttng_consumerd_opt()
 
 function stop_lttng_consumerd()
 {
-       stop_lttng_consumerd_opt 1 "$@"
+       stop_lttng_consumerd_opt 1 "$@"
 }
 
 function stop_lttng_consumerd_notap()
 {
-       stop_lttng_consumerd_opt 0 "$@"
+       stop_lttng_consumerd_opt 0 0 "$@"
+}
+
+function stop_lttng_consumerd_cleanup()
+{
+       stop_lttng_consumerd_opt 0 1 "$@"
 }
 
 function sigstop_lttng_consumerd_opt()
 {
        local withtap=$1
        local signal=SIGSTOP
+       local pids
 
-       PID_CONSUMERD="$(lttng_pgrep "$CONSUMERD_MATCH")"
+       pids="$(lttng_pgrep "$CONSUMERD_MATCH")"
 
-       diag "Sending SIGSTOP to $CONSUMERD_BIN pids: $(echo "$PID_CONSUMERD" | tr '\n' ' ')"
+       diag "Sending SIGSTOP to $CONSUMERD_BIN pids: $(echo "$pids" | tr '\n' ' ')"
 
        # shellcheck disable=SC2086
-       kill -s $signal $PID_CONSUMERD 1> $OUTPUT_DEST 2> $ERROR_OUTPUT_DEST
+       kill -s $signal $pids 1> $OUTPUT_DEST 2> $ERROR_OUTPUT_DEST
        retval=$?
 
        if [ $retval -eq 1 ]; then
@@ -900,12 +961,12 @@ function sigstop_lttng_consumerd_opt()
        else
                out=1
                while [ $out -ne 0 ]; do
-                       pid="$(lttng_pgrep "$CONSUMERD_MATCH")"
+                       pids="$(lttng_pgrep "$CONSUMERD_MATCH")"
 
                        # Wait until state becomes stopped for all
                        # consumers.
                        out=0
-                       for consumer_pid in $pid; do
+                       for consumer_pid in $pids; do
                                state="$(ps -p "$consumer_pid" -o state= )"
                                if [[ -n "$state" && "$state" != "T" ]]; then
                                        out=1
This page took 0.031543 seconds and 4 git commands to generate.