From: Jérémie Galarneau Date: Fri, 8 Apr 2022 23:34:04 +0000 (-0400) Subject: Fix: sessiond: assert on empty payload when handling client out event X-Git-Url: https://git.lttng.org./?a=commitdiff_plain;h=9016dbfc649a21e911ed0d5cc19bb68db35af531;p=lttng-tools.git Fix: sessiond: assert on empty payload when handling client out event Observed issue ============== When servicing a large number of tracer notifications and sending notifications to clients, the session daemon occasionally hits an assertion: #4 0x00007fb224d7d116 in __assert_fail () from /usr/lib/libc.so.6 #5 0x000056038b2fe4d7 in client_flush_outgoing_queue (client=0x7fb21400c3b0) at notification-thread-events.cpp:3586 #6 0x000056038b2ff819 in handle_notification_thread_client_out (state=0x7fb221974090, socket=77) at notification-thread-events.cpp:4104 #7 0x000056038b2f3d77 in thread_notification (data=0x56038cc7fe90) at notification-thread.cpp:763 #8 0x000056038b30ca7d in launch_thread (data=0x56038cc7e220) at thread.cpp:66 #9 0x00007fb224dcf5c2 in start_thread () from /usr/lib/libc.so.6 #10 0x00007fb224e54584 in clone () from /usr/lib/libc.so.6 Cause ===== A client "out" event can be received when no payload is left to send under some circumstances. Many threads can flush a client's outgoing queue and, if they had to queue their message (socket was full), will use the "communication update" command to signal the (e)poll thread to monitor for space being made available in the socket. Commands are sent over an internal pipe serviced by the same thread as the client sockets. When space is made available in the socket, there is a race between the (e)poll thread and the other threads that may wish to use the client's socket to flush its outgoing queue. A non-(e)poll thread may attempt (and succeed) in flushing the queue before the (e)poll thread gets a chance to service the client's "out" event. In this situation, the (e)poll thread processing the client out event will see an empty payload: there is nothing to do. Solution ======== The (e)poll thread can simply ignore the "client out" event when an empty payload is seen. There is also no need to update the transmission status as the other thread has already enqueued a "communication update" command to do so. Known drawbacks =============== None. Signed-off-by: Jérémie Galarneau Change-Id: I8a181bea1e37e8e14cc67b624b76d139b488eded --- diff --git a/src/bin/lttng-sessiond/action-executor.cpp b/src/bin/lttng-sessiond/action-executor.cpp index cc481e5b7..0b910d474 100644 --- a/src/bin/lttng-sessiond/action-executor.cpp +++ b/src/bin/lttng-sessiond/action-executor.cpp @@ -233,6 +233,15 @@ static int client_handle_transmission_status( case CLIENT_TRANSMISSION_STATUS_COMPLETE: DBG("Successfully sent full notification to client, client_id = %" PRIu64, client->id); + /* + * There is no need to wake the (e)poll thread. If it was waiting for + * "out" events on the client's socket, it will see that no payload + * in queued and will unsubscribe from that event. + * + * In the other cases, we have to wake the the (e)poll thread to either + * handle the error on the client or to get it to monitor the client "out" + * events. + */ update_communication = false; break; case CLIENT_TRANSMISSION_STATUS_QUEUED: diff --git a/src/bin/lttng-sessiond/notification-thread-events.cpp b/src/bin/lttng-sessiond/notification-thread-events.cpp index 909ccbff2..95932b972 100644 --- a/src/bin/lttng-sessiond/notification-thread-events.cpp +++ b/src/bin/lttng-sessiond/notification-thread-events.cpp @@ -47,8 +47,8 @@ #include "lttng-sessiond.hpp" #include "kernel.hpp" -#define CLIENT_POLL_MASK_IN (LPOLLIN | LPOLLERR | LPOLLHUP | LPOLLRDHUP) -#define CLIENT_POLL_MASK_IN_OUT (CLIENT_POLL_MASK_IN | LPOLLOUT) +#define CLIENT_POLL_EVENTS_IN (LPOLLIN | LPOLLERR | LPOLLHUP | LPOLLRDHUP) +#define CLIENT_POLL_EVENTS_IN_OUT (CLIENT_POLL_EVENTS_IN | LPOLLOUT) /* The tracers currently limit the capture size to PIPE_BUF (4kb on linux). */ #define MAX_CAPTURE_SIZE (PIPE_BUF) @@ -3394,9 +3394,9 @@ int handle_notification_thread_client_connect( goto error; } + client->communication.current_poll_events = CLIENT_POLL_EVENTS_IN; ret = lttng_poll_add(&state->events, client->socket, - LPOLLIN | LPOLLERR | - LPOLLHUP | LPOLLRDHUP); + client->communication.current_poll_events); if (ret < 0) { ERR("Failed to add notification channel client socket to poll set"); ret = 0; @@ -3530,6 +3530,18 @@ int handle_notification_thread_trigger_unregister_all( return error_occurred ? -1 : 0; } +static +bool client_has_outbound_data_left( + const struct notification_client *client) +{ + const struct lttng_payload_view pv = lttng_payload_view_from_payload( + &client->communication.outbound.payload, 0, -1); + const bool has_data = pv.buffer.size != 0; + const bool has_fds = lttng_payload_view_get_fd_handle_count(&pv); + + return has_data || has_fds; +} + static int client_handle_transmission_status( struct notification_client *client, @@ -3540,24 +3552,51 @@ int client_handle_transmission_status( switch (transmission_status) { case CLIENT_TRANSMISSION_STATUS_COMPLETE: - ret = lttng_poll_mod(&state->events, client->socket, - CLIENT_POLL_MASK_IN); - if (ret) { - goto end; - } - - break; case CLIENT_TRANSMISSION_STATUS_QUEUED: + { + int current_poll_events; + int new_poll_events; /* * We want to be notified whenever there is buffer space - * available to send the rest of the payload. + * available to send the rest of the payload if we are + * waiting to send data to the client. + * + * The state of the outbound queue being sampled here is + * fine since: + * - it is okay to wake-up "for nothing" in case we see + * that data is left, but another thread succeeds in + * flushing it before us when handling the client "out" + * event. We will simply stop monitoring that event the next + * time it wakes us up and we see no data left to be sent, + * - if another thread fails to flush the entire client + * outgoing queue, it will issue a "communication update" + * command and cause the client's (e)poll mask to be + * re-evaluated. + * + * The situation we seek to avoid would be to disable the + * monitoring of "out" client events indefinitely when there is + * data to be sent, which can't happen because of the + * aforementioned "communication update" mechanism. */ - ret = lttng_poll_mod(&state->events, client->socket, - CLIENT_POLL_MASK_IN_OUT); - if (ret) { - goto end; + pthread_mutex_lock(&client->lock); + current_poll_events = client->communication.current_poll_events; + new_poll_events = client_has_outbound_data_left(client) ? + CLIENT_POLL_EVENTS_IN_OUT : + CLIENT_POLL_EVENTS_IN; + client->communication.current_poll_events = new_poll_events; + pthread_mutex_unlock(&client->lock); + + /* Update the monitored event set only if it changed. */ + if (current_poll_events != new_poll_events) { + ret = lttng_poll_mod(&state->events, client->socket, + new_poll_events); + if (ret) { + goto end; + } } + break; + } case CLIENT_TRANSMISSION_STATUS_FAIL: ret = notification_thread_client_disconnect(client, state); if (ret) { @@ -3697,18 +3736,6 @@ error: return CLIENT_TRANSMISSION_STATUS_ERROR; } -static -bool client_has_outbound_data_left( - const struct notification_client *client) -{ - const struct lttng_payload_view pv = lttng_payload_view_from_payload( - &client->communication.outbound.payload, 0, -1); - const bool has_data = pv.buffer.size != 0; - const bool has_fds = lttng_payload_view_get_fd_handle_count(&pv); - - return has_data || has_fds; -} - /* Client lock must _not_ be held by the caller. */ static int client_send_command_reply(struct notification_client *client, @@ -4117,7 +4144,47 @@ int handle_notification_thread_client_out( } pthread_mutex_lock(&client->lock); - transmission_status = client_flush_outgoing_queue(client); + if (!client_has_outbound_data_left(client)) { + /* + * A client "out" event can be received when no payload is left + * to send under some circumstances. + * + * Many threads can flush a client's outgoing queue and, if they + * had to queue their message (socket was full), will use the + * "communication update" command to signal the (e)poll thread + * to monitor for space being made available in the socket. + * + * Commands are sent over an internal pipe serviced by the same + * thread as the client sockets. + * + * When space is made available in the socket, there is a race + * between the (e)poll thread and the other threads that may + * wish to use the client's socket to flush its outgoing queue. + * + * A non-(e)poll thread may attempt (and succeed) in flushing + * the queue before the (e)poll thread gets a chance to service + * the client's "out" event. + * + * In this situation, the (e)poll thread processing the client + * out event will see an empty payload: there is nothing to do + * except unsubscribing (e)poll "out" events. + * + * Note that this thread is the (e)poll thread so it can modify + * the (e)poll mask directly without using a communication + * update command. Other threads that flush the outgoing queue + * will use the "communication update" command to wake up this + * thread and force it to monitor "out" events. + * + * When other threads succeed in emptying the outgoing queue, + * they don't need to update the (e)poll mask: if the "out" + * event is monitored, it will fire once and the (e)poll + * thread will reach this condition, causing the event to + * stop being monitored. + */ + transmission_status = CLIENT_TRANSMISSION_STATUS_COMPLETE; + } else { + transmission_status = client_flush_outgoing_queue(client); + } pthread_mutex_unlock(&client->lock); ret = client_handle_transmission_status( diff --git a/src/bin/lttng-sessiond/notification-thread-internal.hpp b/src/bin/lttng-sessiond/notification-thread-internal.hpp index 497484ba0..0c992073e 100644 --- a/src/bin/lttng-sessiond/notification-thread-internal.hpp +++ b/src/bin/lttng-sessiond/notification-thread-internal.hpp @@ -165,6 +165,7 @@ struct notification_client { * clean-up. */ bool active; + int current_poll_events; struct { /* * During the reception of a message, the reception