Fix: session destruction blocks indefinitely if rotation is ongoing
author Jérémie Galarneau <jeremie.galarneau@efficios.com>
Tue, 30 Oct 2018 12:47:52 +0000 (13:47 +0100)
committer Jérémie Galarneau <jeremie.galarneau@efficios.com>
Fri, 16 Nov 2018 22:23:34 +0000 (17:23 -0500)
Issue
---

The destruction of an active session can hang indefinitely if it
occurs while a rotation is ongoing. This was observed when automatic
session rotations were scheduled on a time basis.

The destruction of the session causes it to be stopped. The 'stop'
command causes the session's timers to be stopped. These timers
include the rotation pending check timer.

Meanwhile, 'data pending' queries are performed against the session
until one of them returns that no data is pending.

The 'data pending' check returns that data is pending if a session
rotation is ongoing at the moment of the check.

Hence, stopping the 'rotation pending check' timer while a rotation
is ongoing causes the session to remain in the 'rotation ongoing'
state forever: every subsequent 'data pending' query reports pending
data, which prevents the session destruction from completing.

Solution
---

The session's rotation schedule timer is correctly stopped when
a 'stop' is performed; we don't want new rotations to be issued
from this point. However, it is incorrect to stop the
'rotation pending check' timer at this stage if a rotation is
ongoing.

This commit leaves the 'rotation pending check' timer running,
allowing the rotation thread to update the session's rotation
state on completion of the rotation. The operations that were
performed as part of the stop command, namely renaming the
'current' chunk, are then performed from the context of the
rotation thread.

Signed-off-by: Jérémie Galarneau <jeremie.galarneau@efficios.com>
src/bin/lttng-sessiond/cmd.c
src/bin/lttng-sessiond/rotate.c
src/bin/lttng-sessiond/rotate.h
src/bin/lttng-sessiond/rotation-thread.c

index b1bcc10a5874f6be49e1f3a8c06f03f257ff47cc..545269525ccb73c909d73cba1fcda1ff306496b1 100644 (file)
@@ -2716,48 +2716,6 @@ error:
        return ret;
 }
 
-static
-int rename_active_chunk(struct ltt_session *session)
-{
-       int ret;
-
-       session->current_archive_id++;
-
-       /*
-        * The currently active tracing path is now the folder we
-        * want to rename.
-        */
-       ret = lttng_strncpy(session->rotation_chunk.current_rotate_path,
-                       session->rotation_chunk.active_tracing_path,
-                       sizeof(session->rotation_chunk.current_rotate_path));
-       if (ret) {
-               ERR("Failed to copy active tracing path");
-               goto end;
-       }
-
-       ret = rename_completed_chunk(session, time(NULL));
-       if (ret < 0) {
-               ERR("Failed to rename current rotation's path");
-               goto end;
-       }
-
-       /*
-        * We just renamed, the folder, we didn't do an actual rotation, so
-        * the active tracing path is now the renamed folder and we have to
-        * restore the rotate count.
-        */
-       ret = lttng_strncpy(session->rotation_chunk.active_tracing_path,
-                       session->rotation_chunk.current_rotate_path,
-                       sizeof(session->rotation_chunk.active_tracing_path));
-       if (ret) {
-               ERR("Failed to rename active session chunk tracing path");
-               goto end;
-       }
-end:
-       session->current_archive_id--;
-       return ret;
-}
-
 /*
  * Command LTTNG_STOP_TRACE processed by the client thread.
  */
@@ -2782,13 +2740,6 @@ int cmd_stop_trace(struct ltt_session *session)
                goto error;
        }
 
-       if (session->rotation_pending_check_timer_enabled) {
-               if (timer_session_rotation_pending_check_stop(session)) {
-                       ERR("Failed to stop the \"rotation pending check\" timer of session %s",
-                                       session->name);
-               }
-       }
-
        if (session->rotation_schedule_timer_enabled) {
                if (timer_session_rotation_schedule_timer_stop(
                                session)) {
@@ -2797,6 +2748,12 @@ int cmd_stop_trace(struct ltt_session *session)
                }
        }
 
+       /*
+        * A rotation is still ongoing. The check timer will continue to wait
+        * for the rotation to complete. When the rotation finally completes,
+        * a check will be performed to rename the "active" chunk to the
+        * expected "timestamp_begin-timestamp_end" format.
+        */
        if (session->current_archive_id > 0 &&
                        session->rotation_state != LTTNG_ROTATION_STATE_ONGOING) {
                ret = rename_active_chunk(session);
index 17d3c51fdbf0823adb32d4bd3aab8606058df2f7..7abfaed644b1e372a95ed623de3ade1eb7ba9fa3 100644 (file)
@@ -320,6 +320,47 @@ end:
        return ret;
 }
 
+int rename_active_chunk(struct ltt_session *session)
+{
+       int ret;
+
+       session->current_archive_id++;
+
+       /*
+        * The currently active tracing path is now the folder we
+        * want to rename.
+        */
+       ret = lttng_strncpy(session->rotation_chunk.current_rotate_path,
+                       session->rotation_chunk.active_tracing_path,
+                       sizeof(session->rotation_chunk.current_rotate_path));
+       if (ret) {
+               ERR("Failed to copy active tracing path");
+               goto end;
+       }
+
+       ret = rename_completed_chunk(session, time(NULL));
+       if (ret < 0) {
+               ERR("Failed to rename current rotation's path");
+               goto end;
+       }
+
+       /*
+        * We just renamed, the folder, we didn't do an actual rotation, so
+        * the active tracing path is now the renamed folder and we have to
+        * restore the rotate count.
+        */
+       ret = lttng_strncpy(session->rotation_chunk.active_tracing_path,
+                       session->rotation_chunk.current_rotate_path,
+                       sizeof(session->rotation_chunk.active_tracing_path));
+       if (ret) {
+               ERR("Failed to rename active session chunk tracing path");
+               goto end;
+       }
+end:
+       session->current_archive_id--;
+       return ret;
+}
+
 int subscribe_session_consumed_size_rotation(struct ltt_session *session, uint64_t size,
                struct notification_thread_handle *notification_thread_handle)
 {
index 9fa58a70977004bc3299786a4591dd1c7adf124a..2812130806cfdbbe2c55a5935a0d90d56915cfbe 100644 (file)
@@ -22,6 +22,7 @@
 #include "rotation-thread.h"
 #include <stdint.h>
 
+int rename_active_chunk(struct ltt_session *session);
 int rename_completed_chunk(struct ltt_session *session, time_t ts);
 
 /*
index c0d1cf146d3b69bbe5fcef59c8cf6d076df8752d..59b7123575eca788add644885a3282acd2719ed3 100644 (file)
@@ -598,6 +598,44 @@ int check_session_rotation_pending(struct ltt_session *session,
                                session->name);
        }
 
+       if (!session->active) {
+               /*
+                * A stop command was issued during the rotation, it is
+                * up to the rotation completion check to perform the
+                * renaming of the last chunk that was produced.
+                */
+               ret = notification_thread_command_session_rotation_ongoing(
+                               notification_thread_handle,
+                               session->name,
+                               session->uid,
+                               session->gid,
+                               session->current_archive_id);
+               if (ret != LTTNG_OK) {
+                       ERR("[rotation-thread] Failed to notify notification thread of completed rotation for session %s",
+                                       session->name);
+               }
+
+               ret = rename_active_chunk(session);
+               if (ret < 0) {
+                       ERR("[rotation-thread] Failed to rename active rotation chunk");
+                       goto end;
+               }
+
+               /* Ownership of location is transferred. */
+               location = session_get_trace_archive_location(session);
+               ret = notification_thread_command_session_rotation_completed(
+                               notification_thread_handle,
+                               session->name,
+                               session->uid,
+                               session->gid,
+                               session->current_archive_id,
+                               location);
+               if (ret != LTTNG_OK) {
+                       ERR("[rotation-thread] Failed to notify notification thread of completed rotation for session %s",
+                                       session->name);
+               }
+       }
+
        ret = 0;
 end:
        if (session->rotation_state == LTTNG_ROTATION_STATE_ONGOING) {
This page took 0.031508 seconds and 4 git commands to generate.