.tv_nsec = DEFAULT_HEALTH_CHECK_DELTA_NS,
};
+/* Define TLS health state. */
+DEFINE_URCU_TLS(struct health_state, health_state);
+
+/*
+ * It ensures that TLS memory used for the node and its container structure
+ * don't get reclaimed after the TLS owner thread exits until we have finished
+ * using it.
+ */
+static pthread_mutex_t health_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+static struct health_tls_state_list health_state_list = {
+ .head = CDS_LIST_HEAD_INIT(health_state_list.head),
+};
+
+/*
+ * This keeps track of the error state for unregistered thread. A thread
+ * reporting a health error, normally unregisters and quits. This makes the TLS
+ * health state not available to the health_check_state() call so on unregister
+ * we update this global error array so we can keep track of which thread was
+ * on error if the TLS health state has been removed.
+ */
+static enum health_flags global_error_state[HEALTH_NUM_TYPE];
+
+/*
+ * Lock health state global list mutex.
+ */
+static void state_lock(void)
+{
+ pthread_mutex_lock(&health_mutex);
+}
+
+/*
+ * Unlock health state global list mutex.
+ */
+static void state_unlock(void)
+{
+ pthread_mutex_unlock(&health_mutex);
+}
+
/*
* Set time difference in res from time_a and time_b.
*/
}
/*
- * Check health of a specific health state counter.
+ * Health mutex MUST be held across use of the returned struct health_state to
+ * provide existence guarantee.
+ *
+ * Return the health_state object or NULL if not found.
+ */
+static struct health_state *find_health_state(enum health_type type)
+{
+ struct health_state *state;
+
+ /* Find the right health state in the global TLS list. */
+ cds_list_for_each_entry(state, &health_state_list.head, node) {
+ if (state->type == type) {
+ return state;
+ }
+ }
+
+ return NULL;
+}
+
+/*
+ * Check health of a specific health type. Note that if a thread has not yet
+ * initialize its health subsystem or has quit, it's considered in a good
+ * state.
*
* Return 0 if health is bad or else 1.
*/
-int health_check_state(struct health_state *state)
+int health_check_state(enum health_type type)
{
int retval = 1, ret;
unsigned long current, last;
struct timespec current_time;
+ struct health_state *state;
+
+ assert(type < HEALTH_NUM_TYPE);
- assert(state);
+ state_lock();
+
+ state = find_health_state(type);
+ if (!state) {
+ /* Check the global state since the state is not visiable anymore. */
+ if (global_error_state[type] & HEALTH_ERROR) {
+ retval = 0;
+ }
+ goto not_found;
+ }
last = state->last;
current = uatomic_read(&state->current);
end:
DBG("Health state current %lu, last %lu, ret %d",
current, last, ret);
+not_found:
+ state_unlock();
return retval;
}
+
+/*
+ * Init health state.
+ */
+void health_register(enum health_type type)
+{
+ struct health_state *state;
+
+ assert(type < HEALTH_NUM_TYPE);
+
+ /* Init TLS state. */
+ uatomic_set(&URCU_TLS(health_state).last, 0);
+ uatomic_set(&URCU_TLS(health_state).last_time.tv_sec, 0);
+ uatomic_set(&URCU_TLS(health_state).last_time.tv_nsec, 0);
+ uatomic_set(&URCU_TLS(health_state).current, 0);
+ uatomic_set(&URCU_TLS(health_state).flags, 0);
+ uatomic_set(&URCU_TLS(health_state).type, type);
+
+ /* Add it to the global TLS state list. */
+ state_lock();
+ state = find_health_state(type);
+ /*
+ * Duplicates are not accepted, since lookups don't handle them at the
+ * moment.
+ */
+ assert(!state);
+
+ cds_list_add(&URCU_TLS(health_state).node, &health_state_list.head);
+ state_unlock();
+}
+
+/*
+ * Remove node from global list.
+ */
+void health_unregister(void)
+{
+ state_lock();
+ /*
+ * On error, set the global_error_state since we are about to remove
+ * the node from the global list.
+ */
+ if (uatomic_read(&URCU_TLS(health_state).flags) & HEALTH_ERROR) {
+ uatomic_set(&global_error_state[URCU_TLS(health_state).type],
+ HEALTH_ERROR);
+ }
+ cds_list_del(&URCU_TLS(health_state).node);
+ state_unlock();
+}
#include <assert.h>
#include <time.h>
+#include <pthread.h>
+#include <urcu/tls-compat.h>
#include <urcu/uatomic.h>
+#include <urcu/list.h>
/*
* These are the value added to the current state depending of the position in
#define HEALTH_IS_IN_POLL(x) ((x) & HEALTH_POLL_VALUE)
enum health_flags {
- HEALTH_EXIT = (1U << 0),
- HEALTH_ERROR = (1U << 1),
+ HEALTH_ERROR = (1U << 0),
+};
+
+enum health_type {
+ HEALTH_TYPE_CMD = 0,
+ HEALTH_TYPE_APP_MANAGE = 1,
+ HEALTH_TYPE_APP_REG = 2,
+ HEALTH_TYPE_KERNEL = 3,
+ HEALTH_TYPE_CONSUMER = 4,
+
+ HEALTH_NUM_TYPE,
+};
+
+struct health_tls_state_list {
+ struct cds_list_head head;
};
struct health_state {
*/
unsigned long current; /* progress counter, updated atomically */
enum health_flags flags; /* other flags, updated atomically */
+ enum health_type type; /* Indicates the nature of the thread. */
+ /* Node of the global TLS state list. */
+ struct cds_list_head node;
};
+/* Declare TLS health state. */
+extern DECLARE_URCU_TLS(struct health_state, health_state);
+
/* Health state counters for the client command thread */
extern struct health_state health_thread_cmd;
*/
static inline void health_poll_update(struct health_state *state)
{
- assert(state);
- uatomic_add(&state->current, HEALTH_POLL_VALUE);
+ uatomic_add(&URCU_TLS(health_state).current, HEALTH_POLL_VALUE);
}
/*
*/
static inline void health_code_update(struct health_state *state)
{
- assert(state);
- uatomic_add(&state->current, HEALTH_CODE_VALUE);
-}
-
-/*
- * Set health "exit" flag.
- */
-static inline void health_exit(struct health_state *state)
-{
- assert(state);
- uatomic_or(&state->flags, HEALTH_EXIT);
+ uatomic_add(&URCU_TLS(health_state).current, HEALTH_CODE_VALUE);
}
/*
*/
static inline void health_error(struct health_state *state)
{
- assert(state);
- uatomic_or(&state->flags, HEALTH_ERROR);
-}
-
-/*
- * Init health state.
- */
-static inline void health_init(struct health_state *state)
-{
- assert(state);
- state->last = 0;
- state->last_time.tv_sec = 0;
- state->last_time.tv_nsec = 0;
- uatomic_set(&state->current, 0);
- uatomic_set(&state->flags, 0);
+ uatomic_or(&URCU_TLS(health_state).flags, HEALTH_ERROR);
}
-int health_check_state(struct health_state *state);
+int health_check_state(enum health_type type);
+void health_register(enum health_type type);
+void health_unregister(void);
#endif /* _HEALTH_H */
DBG("[thread] Thread manage kernel started");
+ health_register(HEALTH_TYPE_KERNEL);
+
/*
* This first step of the while is to clean this structure which could free
* non NULL pointers so zero it before the loop.
WARN("Kernel thread died unexpectedly. "
"Kernel tracing can continue but CPU hotplug is disabled.");
}
- health_exit(&health_thread_kernel);
+ health_unregister();
DBG("Kernel thread dying");
return NULL;
}
DBG("[thread] Manage consumer started");
+ health_register(HEALTH_TYPE_CONSUMER);
+
/*
* Since the consumer thread can be spawned at any moment in time, we init
* the health to a poll status (1, which is a valid health over time).
health_error(&consumer_data->health);
ERR("Health error occurred in %s", __func__);
}
- health_exit(&consumer_data->health);
+ health_unregister();
DBG("consumer thread cleanup completed");
return NULL;
rcu_register_thread();
rcu_thread_online();
+ health_register(HEALTH_TYPE_APP_MANAGE);
+
if (testpoint(thread_manage_apps)) {
goto error_testpoint;
}
health_error(&health_thread_app_manage);
ERR("Health error occurred in %s", __func__);
}
- health_exit(&health_thread_app_manage);
+ health_unregister();
DBG("Application communication apps thread cleanup complete");
rcu_thread_offline();
rcu_unregister_thread();
DBG("[thread] Manage application registration started");
+ health_register(HEALTH_TYPE_APP_REG);
+
if (testpoint(thread_registration_apps)) {
goto error_testpoint;
}
error_create_poll:
error_testpoint:
DBG("UST Registration thread cleanup complete");
- health_exit(&health_thread_app_reg);
+ health_unregister();
return NULL;
}
{
int ret;
- ret = health_check_state(&kconsumer_data.health) &&
- health_check_state(&ustconsumer32_data.health) &&
- health_check_state(&ustconsumer64_data.health);
+ ret = health_check_state(HEALTH_TYPE_CONSUMER);
DBG3("Health consumer check %d", ret);
switch (msg.component) {
case LTTNG_HEALTH_CMD:
- reply.ret_code = health_check_state(&health_thread_cmd);
+ reply.ret_code = health_check_state(HEALTH_TYPE_CMD);
break;
case LTTNG_HEALTH_APP_MANAGE:
- reply.ret_code = health_check_state(&health_thread_app_manage);
+ reply.ret_code = health_check_state(HEALTH_TYPE_APP_MANAGE);
break;
case LTTNG_HEALTH_APP_REG:
- reply.ret_code = health_check_state(&health_thread_app_reg);
+ reply.ret_code = health_check_state(HEALTH_TYPE_APP_REG);
break;
case LTTNG_HEALTH_KERNEL:
- reply.ret_code = health_check_state(&health_thread_kernel);
+ reply.ret_code = health_check_state(HEALTH_TYPE_KERNEL);
break;
case LTTNG_HEALTH_CONSUMER:
reply.ret_code = check_consumer_health();
break;
case LTTNG_HEALTH_ALL:
reply.ret_code =
- health_check_state(&health_thread_app_manage) &&
- health_check_state(&health_thread_app_reg) &&
- health_check_state(&health_thread_cmd) &&
- health_check_state(&health_thread_kernel) &&
+ health_check_state(HEALTH_TYPE_APP_MANAGE) &&
+ health_check_state(HEALTH_TYPE_APP_REG) &&
+ health_check_state(HEALTH_TYPE_CMD) &&
+ health_check_state(HEALTH_TYPE_KERNEL) &&
check_consumer_health();
break;
default:
rcu_register_thread();
+ health_register(HEALTH_TYPE_CMD);
+
if (testpoint(thread_manage_clients)) {
goto error_testpoint;
}
ERR("Health error occurred in %s", __func__);
}
- health_exit(&health_thread_cmd);
+ health_unregister();
DBG("Client thread dying");
cmd_init();
- /* Init all health thread counters. */
- health_init(&health_thread_cmd);
- health_init(&health_thread_kernel);
- health_init(&health_thread_app_manage);
- health_init(&health_thread_app_reg);
-
- /*
- * Init health counters of the consumer thread. We do a quick hack here to
- * the state of the consumer health is fine even if the thread is not
- * started. Once the thread starts, the health state is updated with a poll
- * value to set a health code path. This is simply to ease our life and has
- * no cost what so ever.
- */
- health_init(&kconsumer_data.health);
- health_poll_update(&kconsumer_data.health);
- health_init(&ustconsumer32_data.health);
- health_poll_update(&ustconsumer32_data.health);
- health_init(&ustconsumer64_data.health);
- health_poll_update(&ustconsumer64_data.health);
-
/* Check for the application socket timeout env variable. */
env_app_timeout = getenv(DEFAULT_APP_SOCKET_TIMEOUT_ENV);
if (env_app_timeout) {