From: David Goulet Date: Thu, 24 Jan 2013 19:20:23 +0000 (-0500) Subject: Fix: health subsystem issues with shared code X-Git-Tag: v2.2.0-rc1~84 X-Git-Url: https://git.lttng.org./?a=commitdiff_plain;h=927ca06aed61ff6dd3f64ae71854f2d7f9acebe5;p=lttng-tools.git Fix: health subsystem issues with shared code TLS memory is not used for health state of each thread. This commit probably fixes bug428 as well. The health_init/exit are renamed to health_register/unregister. Fixes #411 Signed-off-by: David Goulet --- diff --git a/src/bin/lttng-sessiond/health.c b/src/bin/lttng-sessiond/health.c index d382c04b5..b36ddc4a3 100644 --- a/src/bin/lttng-sessiond/health.c +++ b/src/bin/lttng-sessiond/health.c @@ -32,6 +32,45 @@ static const struct timespec time_delta = { .tv_nsec = DEFAULT_HEALTH_CHECK_DELTA_NS, }; +/* Define TLS health state. */ +DEFINE_URCU_TLS(struct health_state, health_state); + +/* + * It ensures that TLS memory used for the node and its container structure + * don't get reclaimed after the TLS owner thread exits until we have finished + * using it. + */ +static pthread_mutex_t health_mutex = PTHREAD_MUTEX_INITIALIZER; + +static struct health_tls_state_list health_state_list = { + .head = CDS_LIST_HEAD_INIT(health_state_list.head), +}; + +/* + * This keeps track of the error state for unregistered thread. A thread + * reporting a health error, normally unregisters and quits. This makes the TLS + * health state not available to the health_check_state() call so on unregister + * we update this global error array so we can keep track of which thread was + * on error if the TLS health state has been removed. + */ +static enum health_flags global_error_state[HEALTH_NUM_TYPE]; + +/* + * Lock health state global list mutex. + */ +static void state_lock(void) +{ + pthread_mutex_lock(&health_mutex); +} + +/* + * Unlock health state global list mutex. + */ +static void state_unlock(void) +{ + pthread_mutex_unlock(&health_mutex); +} + /* * Set time difference in res from time_a and time_b. */ @@ -68,17 +107,51 @@ static int time_diff_gt(const struct timespec *time_a, } /* - * Check health of a specific health state counter. + * Health mutex MUST be held across use of the returned struct health_state to + * provide existence guarantee. + * + * Return the health_state object or NULL if not found. + */ +static struct health_state *find_health_state(enum health_type type) +{ + struct health_state *state; + + /* Find the right health state in the global TLS list. */ + cds_list_for_each_entry(state, &health_state_list.head, node) { + if (state->type == type) { + return state; + } + } + + return NULL; +} + +/* + * Check health of a specific health type. Note that if a thread has not yet + * initialize its health subsystem or has quit, it's considered in a good + * state. * * Return 0 if health is bad or else 1. */ -int health_check_state(struct health_state *state) +int health_check_state(enum health_type type) { int retval = 1, ret; unsigned long current, last; struct timespec current_time; + struct health_state *state; + + assert(type < HEALTH_NUM_TYPE); - assert(state); + state_lock(); + + state = find_health_state(type); + if (!state) { + /* Check the global state since the state is not visiable anymore. */ + if (global_error_state[type] & HEALTH_ERROR) { + retval = 0; + } + goto not_found; + } last = state->last; current = uatomic_read(&state->current); @@ -125,6 +198,56 @@ int health_check_state(struct health_state *state) end: DBG("Health state current %lu, last %lu, ret %d", current, last, ret); +not_found: + state_unlock(); return retval; } + +/* + * Init health state. + */ +void health_register(enum health_type type) +{ + struct health_state *state; + + assert(type < HEALTH_NUM_TYPE); + + /* Init TLS state. */ + uatomic_set(&URCU_TLS(health_state).last, 0); + uatomic_set(&URCU_TLS(health_state).last_time.tv_sec, 0); + uatomic_set(&URCU_TLS(health_state).last_time.tv_nsec, 0); + uatomic_set(&URCU_TLS(health_state).current, 0); + uatomic_set(&URCU_TLS(health_state).flags, 0); + uatomic_set(&URCU_TLS(health_state).type, type); + + /* Add it to the global TLS state list. */ + state_lock(); + state = find_health_state(type); + /* + * Duplicates are not accepted, since lookups don't handle them at the + * moment. + */ + assert(!state); + + cds_list_add(&URCU_TLS(health_state).node, &health_state_list.head); + state_unlock(); +} + +/* + * Remove node from global list. + */ +void health_unregister(void) +{ + state_lock(); + /* + * On error, set the global_error_state since we are about to remove + * the node from the global list. + */ + if (uatomic_read(&URCU_TLS(health_state).flags) & HEALTH_ERROR) { + uatomic_set(&global_error_state[URCU_TLS(health_state).type], + HEALTH_ERROR); + } + cds_list_del(&URCU_TLS(health_state).node); + state_unlock(); +} diff --git a/src/bin/lttng-sessiond/health.h b/src/bin/lttng-sessiond/health.h index 19880fa37..91a90706a 100644 --- a/src/bin/lttng-sessiond/health.h +++ b/src/bin/lttng-sessiond/health.h @@ -20,7 +20,10 @@ #include #include +#include +#include #include +#include /* * These are the value added to the current state depending of the position in @@ -32,8 +35,21 @@ #define HEALTH_IS_IN_POLL(x) ((x) & HEALTH_POLL_VALUE) enum health_flags { - HEALTH_EXIT = (1U << 0), - HEALTH_ERROR = (1U << 1), + HEALTH_ERROR = (1U << 0), +}; + +enum health_type { + HEALTH_TYPE_CMD = 0, + HEALTH_TYPE_APP_MANAGE = 1, + HEALTH_TYPE_APP_REG = 2, + HEALTH_TYPE_KERNEL = 3, + HEALTH_TYPE_CONSUMER = 4, + + HEALTH_NUM_TYPE, +}; + +struct health_tls_state_list { + struct cds_list_head head; }; struct health_state { @@ -49,8 +65,14 @@ struct health_state { */ unsigned long current; /* progress counter, updated atomically */ enum health_flags flags; /* other flags, updated atomically */ + enum health_type type; /* Indicates the nature of the thread. */ + /* Node of the global TLS state list. */ + struct cds_list_head node; }; +/* Declare TLS health state. */ +extern DECLARE_URCU_TLS(struct health_state, health_state); + /* Health state counters for the client command thread */ extern struct health_state health_thread_cmd; @@ -69,8 +91,7 @@ extern struct health_state health_thread_kernel; */ static inline void health_poll_update(struct health_state *state) { - assert(state); - uatomic_add(&state->current, HEALTH_POLL_VALUE); + uatomic_add(&URCU_TLS(health_state).current, HEALTH_POLL_VALUE); } /* @@ -79,17 +100,7 @@ static inline void health_poll_update(struct health_state *state) */ static inline void health_code_update(struct health_state *state) { - assert(state); - uatomic_add(&state->current, HEALTH_CODE_VALUE); -} - -/* - * Set health "exit" flag. - */ -static inline void health_exit(struct health_state *state) -{ - assert(state); - uatomic_or(&state->flags, HEALTH_EXIT); + uatomic_add(&URCU_TLS(health_state).current, HEALTH_CODE_VALUE); } /* @@ -97,23 +108,11 @@ static inline void health_exit(struct health_state *state) */ static inline void health_error(struct health_state *state) { - assert(state); - uatomic_or(&state->flags, HEALTH_ERROR); -} - -/* - * Init health state. - */ -static inline void health_init(struct health_state *state) -{ - assert(state); - state->last = 0; - state->last_time.tv_sec = 0; - state->last_time.tv_nsec = 0; - uatomic_set(&state->current, 0); - uatomic_set(&state->flags, 0); + uatomic_or(&URCU_TLS(health_state).flags, HEALTH_ERROR); } -int health_check_state(struct health_state *state); +int health_check_state(enum health_type type); +void health_register(enum health_type type); +void health_unregister(void); #endif /* _HEALTH_H */ diff --git a/src/bin/lttng-sessiond/main.c b/src/bin/lttng-sessiond/main.c index a63593d18..d999928fe 100644 --- a/src/bin/lttng-sessiond/main.c +++ b/src/bin/lttng-sessiond/main.c @@ -705,6 +705,8 @@ static void *thread_manage_kernel(void *data) DBG("[thread] Thread manage kernel started"); + health_register(HEALTH_TYPE_KERNEL); + /* * This first step of the while is to clean this structure which could free * non NULL pointers so zero it before the loop. @@ -828,7 +830,7 @@ error_testpoint: WARN("Kernel thread died unexpectedly. " "Kernel tracing can continue but CPU hotplug is disabled."); } - health_exit(&health_thread_kernel); + health_unregister(); DBG("Kernel thread dying"); return NULL; } @@ -869,6 +871,8 @@ static void *thread_manage_consumer(void *data) DBG("[thread] Manage consumer started"); + health_register(HEALTH_TYPE_CONSUMER); + /* * Since the consumer thread can be spawned at any moment in time, we init * the health to a poll status (1, which is a valid health over time). @@ -1102,7 +1106,7 @@ error_poll: health_error(&consumer_data->health); ERR("Health error occurred in %s", __func__); } - health_exit(&consumer_data->health); + health_unregister(); DBG("consumer thread cleanup completed"); return NULL; @@ -1123,6 +1127,8 @@ static void *thread_manage_apps(void *data) rcu_register_thread(); rcu_thread_online(); + health_register(HEALTH_TYPE_APP_MANAGE); + if (testpoint(thread_manage_apps)) { goto error_testpoint; } @@ -1294,7 +1300,7 @@ error_testpoint: health_error(&health_thread_app_manage); ERR("Health error occurred in %s", __func__); } - health_exit(&health_thread_app_manage); + health_unregister(); DBG("Application communication apps thread cleanup complete"); rcu_thread_offline(); rcu_unregister_thread(); @@ -1391,6 +1397,8 @@ static void *thread_registration_apps(void *data) DBG("[thread] Manage application registration started"); + health_register(HEALTH_TYPE_APP_REG); + if (testpoint(thread_registration_apps)) { goto error_testpoint; } @@ -1573,7 +1581,7 @@ error_listen: error_create_poll: error_testpoint: DBG("UST Registration thread cleanup complete"); - health_exit(&health_thread_app_reg); + health_unregister(); return NULL; } @@ -1948,9 +1956,7 @@ static int check_consumer_health(void) { int ret; - ret = health_check_state(&kconsumer_data.health) && - health_check_state(&ustconsumer32_data.health) && - health_check_state(&ustconsumer64_data.health); + ret = health_check_state(HEALTH_TYPE_CONSUMER); DBG3("Health consumer check %d", ret); @@ -3068,26 +3074,26 @@ restart: switch (msg.component) { case LTTNG_HEALTH_CMD: - reply.ret_code = health_check_state(&health_thread_cmd); + reply.ret_code = health_check_state(HEALTH_TYPE_CMD); break; case LTTNG_HEALTH_APP_MANAGE: - reply.ret_code = health_check_state(&health_thread_app_manage); + reply.ret_code = health_check_state(HEALTH_TYPE_APP_MANAGE); break; case LTTNG_HEALTH_APP_REG: - reply.ret_code = health_check_state(&health_thread_app_reg); + reply.ret_code = health_check_state(HEALTH_TYPE_APP_REG); break; case LTTNG_HEALTH_KERNEL: - reply.ret_code = health_check_state(&health_thread_kernel); + reply.ret_code = health_check_state(HEALTH_TYPE_KERNEL); break; case LTTNG_HEALTH_CONSUMER: reply.ret_code = check_consumer_health(); break; case LTTNG_HEALTH_ALL: reply.ret_code = - health_check_state(&health_thread_app_manage) && - health_check_state(&health_thread_app_reg) && - health_check_state(&health_thread_cmd) && - health_check_state(&health_thread_kernel) && + health_check_state(HEALTH_TYPE_APP_MANAGE) && + health_check_state(HEALTH_TYPE_APP_REG) && + health_check_state(HEALTH_TYPE_CMD) && + health_check_state(HEALTH_TYPE_KERNEL) && check_consumer_health(); break; default: @@ -3162,6 +3168,8 @@ static void *thread_manage_clients(void *data) rcu_register_thread(); + health_register(HEALTH_TYPE_CMD); + if (testpoint(thread_manage_clients)) { goto error_testpoint; } @@ -3386,7 +3394,7 @@ error_testpoint: ERR("Health error occurred in %s", __func__); } - health_exit(&health_thread_cmd); + health_unregister(); DBG("Client thread dying"); @@ -4167,26 +4175,6 @@ int main(int argc, char **argv) cmd_init(); - /* Init all health thread counters. */ - health_init(&health_thread_cmd); - health_init(&health_thread_kernel); - health_init(&health_thread_app_manage); - health_init(&health_thread_app_reg); - - /* - * Init health counters of the consumer thread. We do a quick hack here to - * the state of the consumer health is fine even if the thread is not - * started. Once the thread starts, the health state is updated with a poll - * value to set a health code path. This is simply to ease our life and has - * no cost what so ever. - */ - health_init(&kconsumer_data.health); - health_poll_update(&kconsumer_data.health); - health_init(&ustconsumer32_data.health); - health_poll_update(&ustconsumer32_data.health); - health_init(&ustconsumer64_data.health); - health_poll_update(&ustconsumer64_data.health); - /* Check for the application socket timeout env variable. */ env_app_timeout = getenv(DEFAULT_APP_SOCKET_TIMEOUT_ENV); if (env_app_timeout) {