Add time validation to health check
authorMathieu Desnoyers <mathieu.desnoyers@efficios.com>
Tue, 24 Jul 2012 15:52:36 +0000 (11:52 -0400)
committerDavid Goulet <dgoulet@efficios.com>
Tue, 24 Jul 2012 16:01:39 +0000 (12:01 -0400)
The health check code does not have a notion of "time flow": therefore,
two consecutive calls to lttng_health_check() might end up returning a
bad state (0) just because there was too little time between the
invocations.

Add some time information to the "last" snapshot, so we can do a time
delta between the current and last snapshot to figure out if we need to
report the thread as stalled or not.

At this point, a thread is considered stalled with a wait time of over
20 seconds.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: David Goulet <dgoulet@efficios.com>
src/bin/lttng-sessiond/health.c
src/bin/lttng-sessiond/health.h
src/common/defaults.h

index b9a3ba56aa49f25b4605bcdab70c256db208394f..6c4de9430793fa1d8508c06dab17e88af0b6bcae 100644 (file)
 #include <inttypes.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <time.h>
 
+#include <common/defaults.h>
 #include <common/error.h>
 
 #include "health.h"
 
+static const struct timespec time_delta = {
+       .tv_sec = DEFAULT_HEALTH_CHECK_DELTA_S,
+       .tv_nsec = DEFAULT_HEALTH_CHECK_DELTA_NS,
+};
+
+/*
+ * Set time difference in res from time_a and time_b.
+ */
+static void time_diff(const struct timespec *time_a,
+               const struct timespec *time_b, struct timespec *res)
+{
+       if (time_a->tv_nsec - time_b->tv_nsec < 0) {
+               res->tv_sec = time_a->tv_sec - time_b->tv_sec - 1;
+               res->tv_nsec = 1000000000L + time_a->tv_sec - time_b->tv_sec;
+       } else {
+               res->tv_sec = time_a->tv_sec - time_b->tv_sec;
+               res->tv_nsec = time_a->tv_sec - time_b->tv_sec;
+       }
+}
+
+/*
+ * Return true if time_a - time_b > diff, else false.
+ */
+static int time_diff_gt(const struct timespec *time_a,
+               const struct timespec *time_b, const struct timespec *diff)
+{
+       struct timespec res;
+
+       time_diff(time_a, time_b, &res);
+       time_diff(&res, diff, &res);
+
+       if (res.tv_sec > 0) {
+               return 1;
+       } else if (res.tv_sec == 0 && res.tv_nsec > 0) {
+               return 1;
+       }
+
+       return 0;
+}
+
 /*
  * Check health of a specific health state counter.
  *
  */
 int health_check_state(struct health_state *state)
 {
+       int retval = 1, ret;
        unsigned long current, last;
-       int ret = 1;
+       struct timespec current_time;
 
        assert(state);
 
        last = state->last;
        current = uatomic_read(&state->current);
 
-       /*
-        * Here are the conditions for a bad health. Either flag HEALTH_ERROR is
-        * set, or the progress counter is the same as the last one and we are NOT
-        * waiting for a poll() call.
-        */
-       if ((uatomic_read(&state->flags) & HEALTH_ERROR) ||
-                       (current == last && !HEALTH_IS_IN_POLL(current))) {
+       ret = clock_gettime(CLOCK_MONOTONIC, &current_time);
+       if (ret) {
+               PERROR("Error reading time\n");
                /* error */
-               ret = 0;
+               retval = 0;
+               goto end;
        }
 
-       DBG("Health state current %" PRIu64 ", last %" PRIu64 ", ret %d",
-                       current, last, ret);
+       /*
+        * Thread is in bad health if flag HEALTH_ERROR is set. It is also in bad
+        * health if, after the delta delay has passed, its the progress counter
+        * has not moved and it has NOT been waiting for a poll() call.
+        */
+       if (uatomic_read(&state->flags) & HEALTH_ERROR) {
+               retval = 0;
+               goto end;
+       }
 
        /*
-        * Update last counter. This value is and MUST be access only in this
-        * function.
+        * Initial condition need to update the last counter and sample time, but
+        * should not check health in this initial case, because we don't know how
+        * much time has passed.
         */
-       state->last = current;
+       if (state->last_time.tv_sec == 0 && state->last_time.tv_nsec == 0) {
+               /* update last counter and last sample time */
+               state->last = current;
+               memcpy(&state->last_time, &current_time, sizeof(current_time));
+       } else {
+               if (time_diff_gt(&current_time, &state->last_time, &time_delta)) {
+                       if (current == last && !HEALTH_IS_IN_POLL(current)) {
+                               /* error */
+                               retval = 0;
+                       }
+                       /* update last counter and last sample time */
+                       state->last = current;
+                       memcpy(&state->last_time, &current_time, sizeof(current_time));
+               }
+       }
+
+end:
+       DBG("Health state current %" PRIu64 ", last %" PRIu64 ", ret %d",
+                       current, last, ret);
 
-       return ret;
+       return retval;
 }
index b268860156aa4e1c2e24db3fd2acb9600e5ed640..b348be5f0144e85b4309624c747026c0090d5e3c 100644 (file)
@@ -19,6 +19,7 @@
 #define _HEALTH_H
 
 #include <stdint.h>
+#include <time.h>
 #include <urcu/uatomic.h>
 
 /*
 #define HEALTH_IS_IN_POLL(x)   ((x) & HEALTH_POLL_VALUE)
 
 enum health_flags {
-       HEALTH_EXIT = (1U << 0),
+       HEALTH_EXIT =  (1U << 0),
        HEALTH_ERROR = (1U << 1),
 };
 
 struct health_state {
        /*
-        * last counter is only read and updated by the health_check
+        * last counter and last_time are only read and updated by the health_check
         * thread (single updater).
         */
        unsigned long last;
+       struct timespec last_time;
+
        /*
         * current and flags are updated by multiple threads concurrently.
         */
@@ -104,7 +107,9 @@ static inline void health_error(struct health_state *state)
 static inline void health_init(struct health_state *state)
 {
        assert(state);
-       uatomic_set(&state->last, 0);
+       state->last = 0;
+       state->last_time.tv_sec = 0;
+       state->last_time.tv_nsec = 0;
        uatomic_set(&state->current, 0);
        uatomic_set(&state->flags, 0);
 }
index 3fd74753f1b3c48495da65082e4e2532b81af66c..5ab63d050712e5c5f8641dd4b90f37a74dac1c82 100644 (file)
 #define DEFAULT_NETWORK_CONTROL_PORT        5342
 #define DEFAULT_NETWORK_DATA_PORT           5343
 
+/*
+ * If a thread stalls for this amount of time, it will be considered bogus (bad
+ * health).
+ */
+#define DEFAULT_HEALTH_CHECK_DELTA_S        20
+#define DEFAULT_HEALTH_CHECK_DELTA_NS       0
+
 #endif /* _DEFAULTS_H */
This page took 0.028598 seconds and 4 git commands to generate.