Fix: action executor: deadlock on session_list_lock
author    Jonathan Rajotte <jonathan.rajotte-julien@efficios.com>
          Thu, 22 Apr 2021 02:01:02 +0000 (22:01 -0400)
committer Jérémie Galarneau <jeremie.galarneau@efficios.com>
          Fri, 23 Apr 2021 00:18:37 +0000 (20:18 -0400)
Observed issue
==============

A deadlock is observed during the start-stop test suite for triggers.

Cause
=====

A start session action is executed by the action executor: the
`cmd_start_trace` function is called and effectively holds the
`session_list_lock`. During `cmd_start_trace`, a call to
`notification_thread_command_add_channel` is performed to inform the
notification thread of the new channel's presence.

At the same time, a tracer event notification is received by the
notification thread. The actions are queued up, the sampling of the
session id takes place, and a call to `session_lock_list` is performed,
which blocks on the lock operation.

The notification thread waits on the `session_list_lock` while its
holder, the action executor, waits on the completion of a command to be
run by the notification thread: deadlock.

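The lock/wait cycle between the two threads can be reduced to the
following standalone sketch (illustrative only; the mutexes, condition
variable, and thread bodies are stand-ins for the sessiond structures,
not lttng-tools code). With the interleaving described above, both
threads block forever:

/*
 * Hypothetical reduction of the deadlock: thread A (the action executor)
 * holds `session_list_lock` and waits for a command completion that only
 * thread B (the notification thread) can signal, while thread B first
 * blocks trying to acquire `session_list_lock`.
 */
#include <pthread.h>
#include <stdbool.h>
#include <unistd.h>

static pthread_mutex_t session_list_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t cmd_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cmd_done_cond = PTHREAD_COND_INITIALIZER;
static bool cmd_done = false;

/* Stands in for cmd_start_trace() called from the action executor. */
static void *action_executor(void *data)
{
        pthread_mutex_lock(&session_list_lock);

        /* notification_thread_command_add_channel(): wait for completion. */
        pthread_mutex_lock(&cmd_lock);
        while (!cmd_done) {
                /* Never signalled: the notification thread is stuck below. */
                pthread_cond_wait(&cmd_done_cond, &cmd_lock);
        }
        pthread_mutex_unlock(&cmd_lock);

        pthread_mutex_unlock(&session_list_lock);
        return NULL;
}

/* Stands in for the notification thread handling a tracer notification. */
static void *notification_thread(void *data)
{
        /* Let the action executor take the session list lock first. */
        sleep(1);

        /* add_action_to_subitem_array() -> session_lock_list(): blocks. */
        pthread_mutex_lock(&session_list_lock);

        /* Would service the pending command here, but never gets this far. */
        pthread_mutex_lock(&cmd_lock);
        cmd_done = true;
        pthread_cond_signal(&cmd_done_cond);
        pthread_mutex_unlock(&cmd_lock);

        pthread_mutex_unlock(&session_list_lock);
        return NULL;
}

int main(void)
{
        pthread_t a, b;

        pthread_create(&a, NULL, action_executor, NULL);
        pthread_create(&b, NULL, notification_thread, NULL);
        /* Neither join ever returns: both threads are blocked. */
        pthread_join(a, NULL);
        pthread_join(b, NULL);
        return 0;
}
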
The backtrace:

 Thread 6 (Thread 0x7f831c8a6700 (LWP 3046458)):
 #0  syscall () at ../sysdeps/unix/sysv/linux/x86_64/syscall.S:38
 #1  0x000000000053b852 in futex (uaddr=0x7f831c8a45e0, op=0, val=0, timeout=0x0, uaddr2=0x0, val3=0) at /home/joraj/lttng/master/install/include/urcu/futex.h:72
 #2  0x000000000053b4f9 in futex_noasync (uaddr=0x7f831c8a45e0, op=0, val=0, timeout=0x0, uaddr2=0x0, val3=0) at /home/joraj/lttng/master/install/include/urcu/futex.h:81
 #3  0x000000000053af10 in lttng_waiter_wait (waiter=0x7f831c8a45d8) at waiter.c:55
 #4  0x000000000046b0f2 in run_command_wait (handle=0xe60520, cmd=0x7f831c8a4588) at notification-thread-commands.c:49
 #5  0x000000000046b270 in notification_thread_command_add_channel (handle=0xe60520, session_name=0x7f8300006c30 "my_triggered_session", uid=1000, gid=1000, channel_name=0x7f82dc00be04 "channel0", key=1, domain=LTTNG_DOMAIN_UST, capacity=2097152) at notification-thread-commands.c:184
 #6  0x00000000004c7f65 in create_channel_per_uid (app=0x7f82d8000bf0, usess=0x7f8300000bb0, ua_sess=0x7f82dc002600, ua_chan=0x7f82dc00bde0) at ust-app.c:3360
 #7  0x00000000004c6f98 in ust_app_channel_send (app=0x7f82d8000bf0, usess=0x7f8300000bb0, ua_sess=0x7f82dc002600, ua_chan=0x7f82dc00bde0) at ust-app.c:3514
 #8  0x00000000004c6bde in ust_app_channel_create (usess=0x7f8300000bb0, ua_sess=0x7f82dc002600, uchan=0x7f8300005a90, app=0x7f82d8000bf0, _ua_chan=0x7f831c8a48b0) at ust-app.c:4771
 #9  0x00000000004c6968 in find_or_create_ust_app_channel (usess=0x7f8300000bb0, ua_sess=0x7f82dc002600, app=0x7f82d8000bf0, uchan=0x7f8300005a90, ua_chan=0x7f831c8a48b0) at ust-app.c:5610
 #10 0x00000000004c4f09 in ust_app_synchronize_all_channels (usess=0x7f8300000bb0, ua_sess=0x7f82dc002600, app=0x7f82d8000bf0) at ust-app.c:5820
 #11 0x00000000004b958c in ust_app_synchronize (usess=0x7f8300000bb0, app=0x7f82d8000bf0) at ust-app.c:5886
 #12 0x00000000004b8500 in ust_app_global_update (usess=0x7f8300000bb0, app=0x7f82d8000bf0) at ust-app.c:5960
 #13 0x00000000004b7ec2 in ust_app_start_trace_all (usess=0x7f8300000bb0) at ust-app.c:5520
 #14 0x0000000000444e86 in cmd_start_trace (session=0x7f8300006c30) at cmd.c:2707
 #15 0x00000000004a5af9 in action_executor_start_session_handler (executor=0x7f8314004410, work_item=0x7f8314005100, item=0x7f83140050b0) at action-executor.c:342
 #16 0x00000000004a537f in action_executor_generic_handler (executor=0x7f8314004410, work_item=0x7f8314005100, item=0x7f83140050b0) at action-executor.c:696
 #17 0x00000000004a4dbc in action_work_item_execute (executor=0x7f8314004410, work_item=0x7f8314005100) at action-executor.c:715
 #18 0x00000000004a37e6 in action_executor_thread (_data=0x7f8314004410) at action-executor.c:797
 #19 0x0000000000486193 in launch_thread (data=0x7f83140044b0) at thread.c:66
 #20 0x00007f8320b60609 in start_thread (arg=<optimized out>) at pthread_create.c:477
 #21 0x00007f8320a87293 in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95

 Thread 5 (Thread 0x7f831d0a7700 (LWP 3046457)):
 #0  __lll_lock_wait (futex=futex@entry=0x5e1c10 <ltt_session_list>, private=0) at lowlevellock.c:52
 #1  0x00007f8320b630a3 in __GI___pthread_mutex_lock (mutex=0x5e1c10 <ltt_session_list>) at ../nptl/pthread_mutex_lock.c:80
 #2  0x00000000004378c3 in session_lock_list () at session.c:156
 #3  0x00000000004a871c in add_action_to_subitem_array (action=0x7f830001a730, subitems=0x7f83140051d0) at action-executor.c:1081
 #4  0x00000000004a8578 in add_action_to_subitem_array (action=0x7f830001a620, subitems=0x7f83140051d0) at action-executor.c:1025
 #5  0x00000000004a4922 in populate_subitem_array_from_trigger (trigger=0x7f830001a950, subitems=0x7f83140051d0) at action-executor.c:1116
 #6  0x00000000004a416e in action_executor_enqueue_trigger (executor=0x7f8314004410, trigger=0x7f830001a950, evaluation=0x7f8314005190, object_creds=0x0, client_list=0x7f8314004980) at action-executor.c:924
 #7  0x0000000000479481 in dispatch_one_event_notifier_notification (state=0x7f831d0a63e8, notification=0x7f8314005160) at notification-thread-events.c:4613
 #8  0x0000000000472324 in handle_one_event_notifier_notification (state=0x7f831d0a63e8, pipe=65, domain=LTTNG_DOMAIN_UST) at notification-thread-events.c:4702
 #9  0x0000000000472271 in handle_notification_thread_event_notification (state=0x7f831d0a63e8, pipe=65, domain=LTTNG_DOMAIN_UST) at notification-thread-events.c:4717
 #10 0x00000000004695a3 in handle_event_notification_pipe (event_source_fd=65, domain=LTTNG_DOMAIN_UST, revents=1, state=0x7f831d0a63e8) at notification-thread.c:591
 #11 0x000000000046849b in thread_notification (data=0xe60520) at notification-thread.c:727
 #12 0x0000000000486193 in launch_thread (data=0xe60610) at thread.c:66
 #13 0x00007f8320b60609 in start_thread (arg=<optimized out>) at pthread_create.c:477
 #14 0x00007f8320a87293 in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95

Solution
========

Instead of using `session_find_by_name()`, which requires holding the
`session_list_lock`, introduce `sample_session_id_by_name()`, which uses
a urcu-backed data structure. This allows sampling the session id
without holding the session list lock. We accept the small window during
which a session object is still accessible but no longer valid, since
the actual execution context is validated at execution time. The
execution side already handles the possibility that the session has been
removed by then or is not the same session; it acquires the
`session_list_lock` for that validation.

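As a rough illustration of the resulting contract, here is a minimal,
self-contained sketch (assuming liburcu; the single RCU-published
pointer stands in for the `ltt_sessions_ht_by_name` hash table added by
this patch, and the helper names below are hypothetical, not the
sessiond API):

#include <inttypes.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <urcu.h>

struct session {
        uint64_t id;
        char name[64];
};

/* RCU-published "registry" and the heavyweight lock it lets readers avoid. */
static struct session *current_session;
static pthread_mutex_t session_list_lock = PTHREAD_MUTEX_INITIALIZER;

/* Enqueue time: sample the id under the RCU read-side lock only. */
static bool sample_id_by_name(const char *name, uint64_t *id)
{
        bool found = false;
        struct session *s;

        rcu_read_lock();
        s = rcu_dereference(current_session);
        if (s && strcmp(s->name, name) == 0) {
                *id = s->id;
                found = true;
        }
        rcu_read_unlock();
        return found;
}

/* Execution time: take the list lock and re-validate before acting. */
static void execute_start_action(const char *name, uint64_t sampled_id)
{
        pthread_mutex_lock(&session_list_lock);
        if (!current_session || strcmp(current_session->name, name) != 0 ||
                        current_session->id != sampled_id) {
                printf("session `%s` gone or recreated since enqueue, skipping\n",
                                name);
        } else {
                printf("starting session `%s` (id %" PRIu64 ")\n",
                                name, sampled_id);
        }
        pthread_mutex_unlock(&session_list_lock);
}

int main(void)
{
        static struct session s = { .id = 42, .name = "my_triggered_session" };
        uint64_t id;

        rcu_register_thread();
        rcu_assign_pointer(current_session, &s);

        if (sample_id_by_name("my_triggered_session", &id)) {
                execute_start_action("my_triggered_session", id);
        }

        rcu_unregister_thread();
        return 0;
}

The helper added below performs the equivalent read-side-only lookup
against `ltt_sessions_ht_by_name`, and the action executor re-validates
the session under the `session_list_lock` at execution time.
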
Known drawbacks
===============

None

Signed-off-by: Jonathan Rajotte <jonathan.rajotte-julien@efficios.com>
Signed-off-by: Jérémie Galarneau <jeremie.galarneau@efficios.com>
Change-Id: I5ad2c57acc0d03d2814dda59f8ecf2d831fd961e

src/bin/lttng-sessiond/action-executor.c
src/bin/lttng-sessiond/session.c
src/bin/lttng-sessiond/session.h

index 79c8677143ea3a829cb0a0581c3f2e513de0db33..a259933521517c358bd2ab2aa9be32b53c1f8756 100644 (file)
@@ -1076,17 +1076,26 @@ static int add_action_to_subitem_array(struct lttng_action *action,
         * simplicity and consistency.
         */
        if (session_name != NULL) {
-               struct ltt_session *session = NULL;
+               uint64_t session_id;
 
-               session_lock_list();
-               session = session_find_by_name(session_name);
-               if (session) {
+               /*
+                * Instantaneous sampling of the session id if present.
+                *
+                * This method is preferred over `session_find_by_name` then
+                * fetching the session's id since `session_find_by_name`
+                * requires the session list lock to be taken.
+                *
+                * Taking the session list lock can lead to a deadlock
+                * between the action executor and the notification thread
+                * (caller of add_action_to_subitem_array). It is okay if the
+                * session state changes between the enqueuing time and the
+                * execution time. The execution context is validated at
+                * execution time.
+                */
+               if (sample_session_id_by_name(session_name, &session_id)) {
                        LTTNG_OPTIONAL_SET(&subitem.context.session_id,
-                                       session->id);
-                       session_put(session);
+                                       session_id);
                }
-
-               session_unlock_list();
        }
 
        /* Get a reference to the action. */
index bb3b63b3a31dc407d7dde1ba3c9680e74e221925..2d95aa2687cf2be475faf953a93172c54c7a81ed 100644 (file)
@@ -45,7 +45,7 @@ struct ltt_session_clear_notifier_element {
  * NOTES:
  *
  * No ltt_session.lock is taken here because those data structure are widely
- * spread across the lttng-tools code base so before caling functions below
+ * spread across the lttng-tools code base so before calling functions below
  * that can read/write a session, the caller MUST acquire the session lock
  * using session_lock() and session_unlock().
  */
@@ -67,6 +67,8 @@ static const char *forbidden_name_chars = "/";
 
 /* Global hash table to keep the sessions, indexed by id. */
 static struct lttng_ht *ltt_sessions_ht_by_id = NULL;
+/* Global hash table to keep the sessions, indexed by name. */
+static struct lttng_ht *ltt_sessions_ht_by_name = NULL;
 
 /*
  * Validate the session name for forbidden characters.
@@ -293,7 +295,7 @@ end:
 }
 
 /*
- * Allocate the ltt_sessions_ht_by_id HT.
+ * Allocate the ltt_sessions_ht_by_id and ltt_sessions_ht_by_name HT.
  *
  * The session list lock must be held.
  */
@@ -308,6 +310,15 @@ static int ltt_sessions_ht_alloc(void)
                ERR("Failed to allocate ltt_sessions_ht_by_id");
                goto end;
        }
+
+       DBG("Allocating ltt_sessions_ht_by_name");
+       ltt_sessions_ht_by_name = lttng_ht_new(0, LTTNG_HT_TYPE_STRING);
+       if (!ltt_sessions_ht_by_name) {
+               ret = -1;
+               ERR("Failed to allocate ltt_sessions_ht_by_name");
+               goto end;
+       }
+
 end:
        return ret;
 }
@@ -319,17 +330,23 @@ end:
  */
 static void ltt_sessions_ht_destroy(void)
 {
-       if (!ltt_sessions_ht_by_id) {
-               return;
+       if (ltt_sessions_ht_by_id) {
+               ht_cleanup_push(ltt_sessions_ht_by_id);
+               ltt_sessions_ht_by_id = NULL;
+       }
+
+       if (ltt_sessions_ht_by_name) {
+               ht_cleanup_push(ltt_sessions_ht_by_name);
+               ltt_sessions_ht_by_name = NULL;
        }
-       ht_cleanup_push(ltt_sessions_ht_by_id);
-       ltt_sessions_ht_by_id = NULL;
+
+       return;
 }
 
 /*
- * Add a ltt_session to the ltt_sessions_ht_by_id.
- * If unallocated, the ltt_sessions_ht_by_id HT is allocated.
- * The session list lock must be held.
+ * Add a ltt_session to the ltt_sessions_ht_by_id and ltt_sessions_ht_by_name.
+ * If unallocated, the ltt_sessions_ht_by_id and ltt_sessions_ht_by_name HTs
+ * are allocated. The session list lock must be held.
  */
 static void add_session_ht(struct ltt_session *ls)
 {
@@ -344,35 +361,45 @@ static void add_session_ht(struct ltt_session *ls)
                        goto end;
                }
        }
+
+       /* Should always be present with ltt_sessions_ht_by_id. */
+       assert(ltt_sessions_ht_by_name);
+
        lttng_ht_node_init_u64(&ls->node, ls->id);
        lttng_ht_add_unique_u64(ltt_sessions_ht_by_id, &ls->node);
 
+       lttng_ht_node_init_str(&ls->node_by_name, ls->name);
+       lttng_ht_add_unique_str(ltt_sessions_ht_by_name, &ls->node_by_name);
+
 end:
        return;
 }
 
 /*
- * Test if ltt_sessions_ht_by_id is empty.
+ * Test if ltt_sessions_ht_by_id/name are empty.
  * Return 1 if empty, 0 if not empty.
  * The session list lock must be held.
  */
 static int ltt_sessions_ht_empty(void)
 {
-       int ret;
+       unsigned long count;
 
        if (!ltt_sessions_ht_by_id) {
-               ret = 1;
+               count = 0;
                goto end;
        }
 
-       ret = lttng_ht_get_count(ltt_sessions_ht_by_id) ? 0 : 1;
+       assert(ltt_sessions_ht_by_name);
+
+       count = lttng_ht_get_count(ltt_sessions_ht_by_id);
+       assert(count == lttng_ht_get_count(ltt_sessions_ht_by_name));
 end:
-       return ret;
+       return count ? 0 : 1;
 }
 
 /*
- * Remove a ltt_session from the ltt_sessions_ht_by_id.
- * If empty, the ltt_sessions_ht_by_id HT is freed.
+ * Remove a ltt_session from the ltt_sessions_ht_by_id/name.
+ * If empty, the ltt_sessions_ht_by_id/name HTs are freed.
  * The session list lock must be held.
  */
 static void del_session_ht(struct ltt_session *ls)
@@ -382,13 +409,18 @@ static void del_session_ht(struct ltt_session *ls)
 
        assert(ls);
        assert(ltt_sessions_ht_by_id);
+       assert(ltt_sessions_ht_by_name);
 
        iter.iter.node = &ls->node.node;
        ret = lttng_ht_del(ltt_sessions_ht_by_id, &iter);
        assert(!ret);
 
+       iter.iter.node = &ls->node_by_name.node;
+       ret = lttng_ht_del(ltt_sessions_ht_by_name, &iter);
+       assert(!ret);
+
        if (ltt_sessions_ht_empty()) {
-               DBG("Empty ltt_sessions_ht_by_id, destroying it");
-               DBG("Empty ltt_sessions_ht_by_id, destroying it");
+               DBG("Empty ltt_sessions_ht_by_id/name, destroying hash tables");
                ltt_sessions_ht_destroy();
        }
 }
@@ -1353,3 +1385,43 @@ int session_reset_rotation_state(struct ltt_session *session,
        }
        return ret;
 }
+
+/*
+ * Sample the id of a session looked up via its name.
+ * Here the term "sampling" hints to the caller that this returns the id at a
+ * given point in time with no guarantee that the session for which the id was
+ * sampled still exists at that point.
+ *
+ * Return false when the session is not found.
+ * Return true when the session is found and `id` is set.
+ */
+bool sample_session_id_by_name(const char *name, uint64_t *id)
+{
+       bool found = false;
+       struct lttng_ht_node_str *node;
+       struct lttng_ht_iter iter;
+       struct ltt_session *ls;
+
+       rcu_read_lock();
+
+       if (!ltt_sessions_ht_by_name) {
+               found = false;
+               goto end;
+       }
+
+       lttng_ht_lookup(ltt_sessions_ht_by_name, name, &iter);
+       node = lttng_ht_iter_get_node_str(&iter);
+       if (node == NULL) {
+               found = false;
+               goto end;
+       }
+
+       ls = caa_container_of(node, struct ltt_session, node_by_name);
+       *id = ls->id;
+       found = true;
+
+       DBG3("Session id `%" PRIu64 "` sampled for session `%s`", *id, name);
+end:
+       rcu_read_unlock();
+       return found;
+}
index 5ff20ad412eea13c1cdde2e0aa4c6e9202d37399..1a31d3f0bdfd5fe9240a8de968428a7ed6293924 100644 (file)
@@ -145,6 +145,10 @@ struct ltt_session {
         * Node in ltt_sessions_ht_by_id.
         */
        struct lttng_ht_node_u64 node;
+       /*
+        * Node in ltt_sessions_ht_by_name.
+        */
+       struct lttng_ht_node_str node_by_name;
        /*
         * Timer to check periodically if a relay and/or consumer has completed
         * the last rotation.
@@ -278,4 +282,15 @@ enum lttng_error_code session_open_packets(struct ltt_session *session);
 
 bool session_output_supports_trace_chunks(const struct ltt_session *session);
 
+/*
+ * Sample the id of a session looked up via its name.
+ * Here the term "sampling" hints to the caller that this returns the id at a
+ * given point in time with no guarantee that the session for which the id was
+ * sampled still exists at that point.
+ *
+ * Return false when the session is not found.
+ * Return true when the session is found and `id` is set.
+ */
+bool sample_session_id_by_name(const char *name, uint64_t *id);
+
 #endif /* _LTT_SESSION_H */