Fix: rotation error may leave session in "ONGOING" state
authorJérémie Galarneau <jeremie.galarneau@efficios.com>
Wed, 14 Nov 2018 21:52:12 +0000 (16:52 -0500)
committerJérémie Galarneau <jeremie.galarneau@efficios.com>
Fri, 16 Nov 2018 22:23:34 +0000 (17:23 -0500)
The errors that can occur during the launch of a rotation may
leave a session's rotation state in the "ONGOING" state.

This means that clients polling for the rotation's state (or using the
notification channel) will never see the rotation enter the ERROR
or COMPLETED states, resulting in a hang.

This change introduces session_reset_rotation_state() which
implements the logic needed to reset a session's rotation state
and populate the result of the last rotation.

Signed-off-by: Jérémie Galarneau <jeremie.galarneau@efficios.com>
src/bin/lttng-sessiond/cmd.c
src/bin/lttng-sessiond/rotation-thread.c
src/bin/lttng-sessiond/session.c
src/bin/lttng-sessiond/session.h

index 545269525ccb73c909d73cba1fcda1ff306496b1..4d4d0088cf8299ed7859c5b1558d4db0aac644f9 100644 (file)
@@ -4531,6 +4531,11 @@ int cmd_rotate_session(struct ltt_session *session,
        struct tm *timeinfo;
        char datetime[21];
        time_t now;
+       /*
+        * Used to roll-back timestamps in case of failure to launch the
+        * rotation.
+        */
+       time_t original_last_chunk_start_ts, original_current_chunk_start_ts;
 
        assert(session);
 
@@ -4611,29 +4616,17 @@ int cmd_rotate_session(struct ltt_session *session,
         * archive id.
         */
        session->current_archive_id++;
-       /*
-        * A rotation has a local step even if the destination is a relay
-        * daemon; the buffers must be consumed by the consumer daemon.
-        */
-       session->rotation_pending_local = true;
-       session->rotation_pending_relay =
-               session_get_consumer_destination_type(session) == CONSUMER_DST_NET;
-       session->rotation_state = LTTNG_ROTATION_STATE_ONGOING;
-       ret = notification_thread_command_session_rotation_ongoing(
-                       notification_thread_handle,
-                       session->name, session->uid, session->gid,
-                       session->current_archive_id - 1);
-       if (ret != LTTNG_OK) {
-               ERR("Failed to notify notification thread that a session rotation is ongoing for session %s",
-                               session->name);
-       }
 
-       /* Create the path name for the next chunk. */
        now = time(NULL);
        if (now == (time_t) -1) {
                ret = -LTTNG_ERR_ROTATION_NOT_AVAILABLE;
                goto end;
        }
+
+       /* Sample chunk bounds for roll-back in case of error. */
+       original_last_chunk_start_ts = session->last_chunk_start_ts;
+       original_current_chunk_start_ts = session->current_chunk_start_ts;
+
        session->last_chunk_start_ts = session->current_chunk_start_ts;
        session->current_chunk_start_ts = now;
 
@@ -4650,6 +4643,16 @@ int cmd_rotate_session(struct ltt_session *session,
                ret = -LTTNG_ERR_UNK;
                goto end;
        }
+
+       /*
+        * A rotation has a local step even if the destination is a relay
+        * daemon; the buffers must be consumed by the consumer daemon.
+        */
+       session->rotation_pending_local = true;
+       session->rotation_pending_relay =
+               session_get_consumer_destination_type(session) == CONSUMER_DST_NET;
+       session->rotation_state = LTTNG_ROTATION_STATE_ONGOING;
+
        if (session->kernel_session) {
                /*
                 * The active path for the next rotation/destroy.
@@ -4663,7 +4666,7 @@ int cmd_rotate_session(struct ltt_session *session,
                if (ret < 0 || ret == sizeof(session->rotation_chunk.active_tracing_path)) {
                        ERR("Failed to format active kernel tracing path in rotate session command");
                        ret = -LTTNG_ERR_UNK;
-                       goto end;
+                       goto error;
                }
                /*
                 * The sub-directory for the consumer
@@ -4676,7 +4679,7 @@ int cmd_rotate_session(struct ltt_session *session,
                if (ret < 0 || ret == sizeof(session->kernel_session->consumer->chunk_path)) {
                        ERR("Failed to format the kernel consumer's sub-directory in rotate session command");
                        ret = -LTTNG_ERR_UNK;
-                       goto end;
+                       goto error;
                }
                /*
                 * Create the new chunk folder, before the rotation begins so we don't
@@ -4689,12 +4692,12 @@ int cmd_rotate_session(struct ltt_session *session,
                        ERR("Failed to create kernel session tracing path at %s",
                                        session->kernel_session->consumer->chunk_path);
                        ret = -LTTNG_ERR_CREATE_DIR_FAIL;
-                       goto end;
+                       goto error;
                }
                ret = kernel_rotate_session(session);
                if (ret != LTTNG_OK) {
                        ret = -ret;
-                       goto end;
+                       goto error;
                }
        }
        if (session->ust_session) {
@@ -4705,7 +4708,7 @@ int cmd_rotate_session(struct ltt_session *session,
                if (ret < 0) {
                        ERR("Failed to format active UST tracing path in rotate session command");
                        ret = -LTTNG_ERR_UNK;
-                       goto end;
+                       goto error;
                }
                ret = snprintf(session->ust_session->consumer->chunk_path,
                                PATH_MAX, "/%s-%" PRIu64, datetime,
@@ -4713,7 +4716,7 @@ int cmd_rotate_session(struct ltt_session *session,
                if (ret < 0) {
                        ERR("Failed to format the UST consumer's sub-directory in rotate session command");
                        ret = -LTTNG_ERR_UNK;
-                       goto end;
+                       goto error;
                }
                /*
                 * Create the new chunk folder, before the rotation begins so we don't
@@ -4724,18 +4727,18 @@ int cmd_rotate_session(struct ltt_session *session,
                                session->ust_session->gid);
                if (ret) {
                        ret = -LTTNG_ERR_CREATE_DIR_FAIL;
-                       goto end;
+                       goto error;
                }
                ret = ust_app_rotate_session(session);
                if (ret != LTTNG_OK) {
-                       goto end;
+                       goto error;
                }
        }
 
        ret = timer_session_rotation_pending_check_start(session,
                        DEFAULT_ROTATE_PENDING_TIMER);
        if (ret) {
-               goto end;
+               goto error;
        }
 
        if (!session->active) {
@@ -4746,12 +4749,30 @@ int cmd_rotate_session(struct ltt_session *session,
                rotate_return->rotation_id = session->current_archive_id;
        }
 
+       ret = notification_thread_command_session_rotation_ongoing(
+                       notification_thread_handle,
+                       session->name, session->uid, session->gid,
+                       session->current_archive_id - 1);
+       if (ret != LTTNG_OK) {
+               ERR("Failed to notify notification thread that a session rotation is ongoing for session %s",
+                               session->name);
+       }
+
        DBG("Cmd rotate session %s, archive_id %" PRIu64 " sent",
                        session->name, session->current_archive_id - 1);
        ret = LTTNG_OK;
 
 end:
        return ret;
+error:
+       session->last_chunk_start_ts = original_last_chunk_start_ts;
+       session->current_archive_id = original_current_chunk_start_ts;
+       if (session_reset_rotation_state(session,
+                       LTTNG_ROTATION_STATE_NO_ROTATION)) {
+               ERR("Failed to reset rotation state of session \"%s\"",
+                               session->name);
+       }
+       goto end;
 }
 
 /*
index 3256a1e9b6581899380cb47f16cb35b10a90b32c..68a5d402f9ae3a8f80d97f142851ef95288b8046 100644 (file)
@@ -431,7 +431,12 @@ end:
                session->rotation_pending_local = false;
        }
        if (ret) {
-               session->rotation_state = LTTNG_ROTATION_STATE_ERROR;
+               ret = session_reset_rotation_state(session,
+                               LTTNG_ROTATION_STATE_ERROR);
+               if (ret) {
+                       ERR("Failed to reset rotation state of session \"%s\"",
+                                       session->name);
+               }
        }
        return 0;
 }
@@ -491,7 +496,12 @@ int check_session_rotation_pending_relay(struct ltt_session *session)
                ERR("[rotation-thread] Encountered an error when checking if rotation of trace archive %" PRIu64 " of session \"%s\" is pending on the relay",
                                session->current_archive_id - 1,
                                session->name);
-               session->rotation_state = LTTNG_ROTATION_STATE_ERROR;
+               ret = session_reset_rotation_state(session,
+                               LTTNG_ROTATION_STATE_ERROR);
+               if (ret) {
+                       ERR("Failed to reset rotation state of session \"%s\"",
+                                       session->name);
+               }
                rotation_completed = false;
        }
 
@@ -555,7 +565,12 @@ int check_session_rotation_pending(struct ltt_session *session,
        /* Rename the completed trace archive's location. */
        now = time(NULL);
        if (now == (time_t) -1) {
-               session->rotation_state = LTTNG_ROTATION_STATE_ERROR;
+               ret = session_reset_rotation_state(session,
+                               LTTNG_ROTATION_STATE_ERROR);
+               if (ret) {
+                       ERR("Failed to reset rotation state of session \"%s\"",
+                                       session->name);
+               }
                ret = LTTNG_ERR_UNK;
                goto end;
        }
index 56c38714fc6c6ccad7ef56b3a39820f34062d2ca..abf61e898ceb8e6ef037a53e6729020397e5194b 100644 (file)
@@ -571,3 +571,36 @@ int session_access_ok(struct ltt_session *session, uid_t uid, gid_t gid)
                return 1;
        }
 }
+
+/*
+ * Set a session's rotation state and reset all associated state.
+ *
+ * This function resets the rotation state (check timers, pending
+ * flags, etc.) and sets the result of the last rotation. The result
+ * can be queries by a liblttng-ctl client.
+ *
+ * Be careful of the result passed to this function. For instance,
+ * on failure to launch a rotation, a client will expect the rotation
+ * state to be set to "NO_ROTATION". If an error occured while the
+ * rotation was "ONGOING", result should be set to "ERROR", which will
+ * allow a client to report it.
+ *
+ * Must be called with the session and session_list locks held.
+ */
+int session_reset_rotation_state(struct ltt_session *session,
+               enum lttng_rotation_state result)
+{
+       int ret = 0;
+
+       ASSERT_LOCKED(ltt_session_list.lock);
+       ASSERT_LOCKED(session->lock);
+
+       session->rotation_pending_local = false;
+       session->rotation_pending_relay = false;
+       session->rotated_after_last_stop = false;
+       session->rotation_state = result;
+       if (session->rotation_pending_check_timer_enabled) {
+               ret = timer_session_rotation_pending_check_stop(session);
+       }
+       return ret;
+}
index 99b8c47112bd3f6d1032d7bdcf8e4439b07801ba..1f0aeb94e90198ecbbcd4592088690dab245eea7 100644 (file)
@@ -229,4 +229,7 @@ struct ltt_session_list *session_get_list(void);
 
 int session_access_ok(struct ltt_session *session, uid_t uid, gid_t gid);
 
+int session_reset_rotation_state(struct ltt_session *session,
+               enum lttng_rotation_state result);
+
 #endif /* _LTT_SESSION_H */
This page took 0.031427 seconds and 4 git commands to generate.