From: Mathieu Desnoyers Date: Tue, 24 Jul 2012 15:52:36 +0000 (-0400) Subject: Add time validation to health check X-Git-Tag: v2.1.0-rc1~56 X-Git-Url: https://git.lttng.org./?a=commitdiff_plain;h=8809eec0bb55b03862cb1eb128eb39d50104c258;p=lttng-tools.git Add time validation to health check The health check code does not have a notion of "time flow": therefore, two consecutive calls to lttng_health_check() might end up returning a bad state (0) just because there was too little time between the invocations. Add some time information to the "last" snapshot, so we can do a time delta between the current and last snapshot to figure out if we need to report the thread as stalled or not. At this point, a thread is considered stalled with a wait time of over 20 seconds. Signed-off-by: Mathieu Desnoyers Signed-off-by: David Goulet --- diff --git a/src/bin/lttng-sessiond/health.c b/src/bin/lttng-sessiond/health.c index b9a3ba56a..6c4de9430 100644 --- a/src/bin/lttng-sessiond/health.c +++ b/src/bin/lttng-sessiond/health.c @@ -20,11 +20,53 @@ #include #include #include +#include +#include #include #include "health.h" +static const struct timespec time_delta = { + .tv_sec = DEFAULT_HEALTH_CHECK_DELTA_S, + .tv_nsec = DEFAULT_HEALTH_CHECK_DELTA_NS, +}; + +/* + * Set time difference in res from time_a and time_b. + */ +static void time_diff(const struct timespec *time_a, + const struct timespec *time_b, struct timespec *res) +{ + if (time_a->tv_nsec - time_b->tv_nsec < 0) { + res->tv_sec = time_a->tv_sec - time_b->tv_sec - 1; + res->tv_nsec = 1000000000L + time_a->tv_sec - time_b->tv_sec; + } else { + res->tv_sec = time_a->tv_sec - time_b->tv_sec; + res->tv_nsec = time_a->tv_sec - time_b->tv_sec; + } +} + +/* + * Return true if time_a - time_b > diff, else false. + */ +static int time_diff_gt(const struct timespec *time_a, + const struct timespec *time_b, const struct timespec *diff) +{ + struct timespec res; + + time_diff(time_a, time_b, &res); + time_diff(&res, diff, &res); + + if (res.tv_sec > 0) { + return 1; + } else if (res.tv_sec == 0 && res.tv_nsec > 0) { + return 1; + } + + return 0; +} + /* * Check health of a specific health state counter. * @@ -32,33 +74,57 @@ */ int health_check_state(struct health_state *state) { + int retval = 1, ret; unsigned long current, last; - int ret = 1; + struct timespec current_time; assert(state); last = state->last; current = uatomic_read(&state->current); - /* - * Here are the conditions for a bad health. Either flag HEALTH_ERROR is - * set, or the progress counter is the same as the last one and we are NOT - * waiting for a poll() call. - */ - if ((uatomic_read(&state->flags) & HEALTH_ERROR) || - (current == last && !HEALTH_IS_IN_POLL(current))) { + ret = clock_gettime(CLOCK_MONOTONIC, ¤t_time); + if (ret) { + PERROR("Error reading time\n"); /* error */ - ret = 0; + retval = 0; + goto end; } - DBG("Health state current %" PRIu64 ", last %" PRIu64 ", ret %d", - current, last, ret); + /* + * Thread is in bad health if flag HEALTH_ERROR is set. It is also in bad + * health if, after the delta delay has passed, its the progress counter + * has not moved and it has NOT been waiting for a poll() call. + */ + if (uatomic_read(&state->flags) & HEALTH_ERROR) { + retval = 0; + goto end; + } /* - * Update last counter. This value is and MUST be access only in this - * function. + * Initial condition need to update the last counter and sample time, but + * should not check health in this initial case, because we don't know how + * much time has passed. */ - state->last = current; + if (state->last_time.tv_sec == 0 && state->last_time.tv_nsec == 0) { + /* update last counter and last sample time */ + state->last = current; + memcpy(&state->last_time, ¤t_time, sizeof(current_time)); + } else { + if (time_diff_gt(¤t_time, &state->last_time, &time_delta)) { + if (current == last && !HEALTH_IS_IN_POLL(current)) { + /* error */ + retval = 0; + } + /* update last counter and last sample time */ + state->last = current; + memcpy(&state->last_time, ¤t_time, sizeof(current_time)); + } + } + +end: + DBG("Health state current %" PRIu64 ", last %" PRIu64 ", ret %d", + current, last, ret); - return ret; + return retval; } diff --git a/src/bin/lttng-sessiond/health.h b/src/bin/lttng-sessiond/health.h index b26886015..b348be5f0 100644 --- a/src/bin/lttng-sessiond/health.h +++ b/src/bin/lttng-sessiond/health.h @@ -19,6 +19,7 @@ #define _HEALTH_H #include +#include #include /* @@ -31,16 +32,18 @@ #define HEALTH_IS_IN_POLL(x) ((x) & HEALTH_POLL_VALUE) enum health_flags { - HEALTH_EXIT = (1U << 0), + HEALTH_EXIT = (1U << 0), HEALTH_ERROR = (1U << 1), }; struct health_state { /* - * last counter is only read and updated by the health_check + * last counter and last_time are only read and updated by the health_check * thread (single updater). */ unsigned long last; + struct timespec last_time; + /* * current and flags are updated by multiple threads concurrently. */ @@ -104,7 +107,9 @@ static inline void health_error(struct health_state *state) static inline void health_init(struct health_state *state) { assert(state); - uatomic_set(&state->last, 0); + state->last = 0; + state->last_time.tv_sec = 0; + state->last_time.tv_nsec = 0; uatomic_set(&state->current, 0); uatomic_set(&state->flags, 0); } diff --git a/src/common/defaults.h b/src/common/defaults.h index 3fd74753f..5ab63d050 100644 --- a/src/common/defaults.h +++ b/src/common/defaults.h @@ -127,4 +127,11 @@ #define DEFAULT_NETWORK_CONTROL_PORT 5342 #define DEFAULT_NETWORK_DATA_PORT 5343 +/* + * If a thread stalls for this amount of time, it will be considered bogus (bad + * health). + */ +#define DEFAULT_HEALTH_CHECK_DELTA_S 20 +#define DEFAULT_HEALTH_CHECK_DELTA_NS 0 + #endif /* _DEFAULTS_H */