From: David Goulet Date: Wed, 18 Jul 2012 16:00:03 +0000 (-0400) Subject: Session daemon health check support X-Git-Tag: v2.1.0-rc1~74 X-Git-Url: https://git.lttng.org./?a=commitdiff_plain;h=44a5e5eb99f1d8b528f83fda5585677a3882f5f5;p=lttng-tools.git Session daemon health check support This is the first commit for the health check feature of the session daemon. Add a lttng_health_check(...) call to the public API and return 0 if everything is fine or 1 if some health problem for a component was detected. Using this API call, you can either choose to test a specific component such as the client command thread, the consumer thread(s), kernel thread, application registration thread or all of them at the same time. This feature is NOT implemented with the lttng command line UI and it is intended to be like so until a stable version is accepted by the community. NOTE: The API could change so be aware of possible changes up to the 2.1-stable release. Signed-off-by: David Goulet --- diff --git a/include/lttng/lttng.h b/include/lttng/lttng.h index 0c32d941a..6823579a9 100644 --- a/include/lttng/lttng.h +++ b/include/lttng/lttng.h @@ -131,6 +131,15 @@ enum lttng_calibrate_type { LTTNG_CALIBRATE_FUNCTION = 0, }; +/* Health component for the health check function. */ +enum lttng_health_component { + LTTNG_HEALTH_CMD, + LTTNG_HEALTH_APP_REG, + LTTNG_HEALTH_KERNEL, + LTTNG_HEALTH_CONSUMER, + LTTNG_HEALTH_ALL, +}; + /* Destination type of lttng URI */ enum lttng_dst_type { LTTNG_DST_IPV4 = 1, @@ -619,4 +628,16 @@ extern int lttng_enable_consumer(struct lttng_handle *handle); */ extern int lttng_disable_consumer(struct lttng_handle *handle); +/* + * Check session daemon health for a specific component. + * + * Return 0 if health is OK or 1 if BAD. A returned value of -1 indicate that + * the control library was not able to connect to the session daemon health + * socket. + * + * Any other positive value is an lttcomm error which can be translate with + * lttng_strerror(). 
+ */ +extern int lttng_health_check(enum lttng_health_component c); + #endif /* _LTTNG_H */ diff --git a/src/bin/lttng-sessiond/Makefile.am b/src/bin/lttng-sessiond/Makefile.am index 0cbf0c11a..9b6c4b057 100644 --- a/src/bin/lttng-sessiond/Makefile.am +++ b/src/bin/lttng-sessiond/Makefile.am @@ -19,7 +19,8 @@ lttng_sessiond_SOURCES = utils.c utils.h \ fd-limit.c fd-limit.h \ consumer.c consumer.h \ kernel-consumer.c kernel-consumer.h \ - consumer.h filter.c filter.h + consumer.h filter.c filter.h \ + health.c health.h if HAVE_LIBLTTNG_UST_CTL lttng_sessiond_SOURCES += trace-ust.c ust-app.c ust-consumer.c ust-consumer.h diff --git a/src/bin/lttng-sessiond/consumer.h b/src/bin/lttng-sessiond/consumer.h index 2eb9d7433..fff32de0f 100644 --- a/src/bin/lttng-sessiond/consumer.h +++ b/src/bin/lttng-sessiond/consumer.h @@ -23,6 +23,8 @@ #include #include +#include "health.h" + enum consumer_dst_type { CONSUMER_DST_LOCAL, CONSUMER_DST_NET, @@ -44,6 +46,9 @@ struct consumer_data { /* consumer error and command Unix socket path */ char err_unix_sock_path[PATH_MAX]; char cmd_unix_sock_path[PATH_MAX]; + + /* Health check of the thread */ + struct health_state health; }; /* diff --git a/src/bin/lttng-sessiond/health.c b/src/bin/lttng-sessiond/health.c new file mode 100644 index 000000000..58f804eb9 --- /dev/null +++ b/src/bin/lttng-sessiond/health.c @@ -0,0 +1,64 @@ +/* + * Copyright (C) 2012 - David Goulet + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License, version 2 only, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#define _GNU_SOURCE +#include +#include +#include +#include + +#include + +#include "health.h" + +/* + * Check health of a specific health state counter. + * + * Return 0 if health is bad or else 1. + */ +int health_check_state(struct health_state *state) +{ + int ret; + uint64_t current; + uint64_t last; + + assert(state); + + current = uatomic_read(&state->current); + last = uatomic_read(&state->last); + + /* + * Here are the conditions for a bad health. Current state set to 0 or the + * current state is the same as the last one and we are NOT waiting for a + * poll() call. + */ + if (current == 0 || (current == last && HEALTH_IS_IN_CODE(current))) { + ret = 0; + goto error; + } + + /* All good */ + ret = 1; + +error: + DBG("Health state current %" PRIu64 ", last %" PRIu64 ", ret %d", + current, last, ret); + + /* Exchange current state counter into last one */ + uatomic_xchg(&state->last, state->current); + return ret; +} diff --git a/src/bin/lttng-sessiond/health.h b/src/bin/lttng-sessiond/health.h new file mode 100644 index 000000000..9a1ef391a --- /dev/null +++ b/src/bin/lttng-sessiond/health.h @@ -0,0 +1,94 @@ +/* + * Copyright (C) 2012 - David Goulet + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License, version 2 only, as + * published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#ifndef _HEALTH_H +#define _HEALTH_H + +#include +#include + +/* + * These are the value added to the current state depending of the position in + * the thread where is either waiting on a poll() or running in the code. + */ +#define HEALTH_POLL_VALUE 1 +#define HEALTH_CODE_VALUE 2 + +#define HEALTH_IS_IN_POLL(x) (x % HEALTH_CODE_VALUE) +#define HEALTH_IS_IN_CODE(x) (x % HEALTH_POLL_VALUE) + +struct health_state { + uint64_t last; + uint64_t current; +}; + +/* Health state counters for the client command thread */ +extern struct health_state health_thread_cmd; + +/* Health state counters for the application registration thread */ +extern struct health_state health_thread_app_reg; + +/* Health state counters for the kernel thread */ +extern struct health_state health_thread_kernel; + +/* + * Update current counter by 1 to indicate that the thread is in a blocking + * state cause by a poll(). + */ +static inline void health_poll_update(struct health_state *state) +{ + assert(state); + + uatomic_add(&state->current, HEALTH_POLL_VALUE); +} + +/* + * Update current counter by 2 which indicates that we are currently running in + * a thread and NOT blocked at a poll(). + */ +static inline void health_code_update(struct health_state *state) +{ + assert(state); + + uatomic_add(&state->current, HEALTH_CODE_VALUE); +} + +/* + * Reset health state. A value of zero indicate a bad health state. + */ +static inline void health_reset(struct health_state *state) +{ + assert(state); + + uatomic_set(&state->current, 0); + uatomic_set(&state->last, 0); +} + +/* + * Init health state. + */ +static inline void health_init(struct health_state *state) +{ + assert(state); + + uatomic_set(&state->last, 0); + uatomic_set(&state->current, HEALTH_CODE_VALUE); +} + +int health_check_state(struct health_state *state); + +#endif /* _HEALTH_H */ diff --git a/src/bin/lttng-sessiond/main.c b/src/bin/lttng-sessiond/main.c index 2abf9d0e7..bf5adc501 100644 --- a/src/bin/lttng-sessiond/main.c +++ b/src/bin/lttng-sessiond/main.c @@ -59,6 +59,7 @@ #include "utils.h" #include "fd-limit.h" #include "filter.h" +#include "health.h" #define CONSUMERD_FILE "lttng-consumerd" @@ -109,6 +110,8 @@ static char apps_unix_sock_path[PATH_MAX]; static char client_unix_sock_path[PATH_MAX]; /* global wait shm path for UST */ static char wait_shm_path[PATH_MAX]; +/* Global health check unix path */ +static char health_unix_sock_path[PATH_MAX]; /* Sockets and FDs */ static int client_sock = -1; @@ -134,6 +137,7 @@ static pthread_t reg_apps_thread; static pthread_t client_thread; static pthread_t kernel_thread; static pthread_t dispatch_thread; +static pthread_t health_thread; /* * UST registration command queue. This queue is tied with a futex and uses a N @@ -208,6 +212,11 @@ static enum consumerd_state kernel_consumerd_state; */ static unsigned int relayd_net_seq_idx; +/* Used for the health monitoring of the session daemon. 
See health.h */ +struct health_state health_thread_cmd; +struct health_state health_thread_app_reg; +struct health_state health_thread_kernel; + static void setup_consumerd_path(void) { @@ -712,6 +721,8 @@ static void *thread_manage_kernel(void *data) DBG("Thread manage kernel started"); + health_code_update(&health_thread_kernel); + ret = create_thread_poll_set(&events, 2); if (ret < 0) { goto error_poll_create; @@ -723,6 +734,8 @@ static void *thread_manage_kernel(void *data) } while (1) { + health_code_update(&health_thread_kernel); + if (update_poll_flag == 1) { /* * Reset number of fd in the poll set. Always 2 since there is the thread @@ -746,7 +759,9 @@ static void *thread_manage_kernel(void *data) /* Poll infinite value of time */ restart: + health_poll_update(&health_thread_kernel); ret = lttng_poll_wait(&events, -1); + health_poll_update(&health_thread_kernel); if (ret < 0) { /* * Restart interrupted system call. @@ -767,6 +782,8 @@ static void *thread_manage_kernel(void *data) revents = LTTNG_POLL_GETEV(&events, i); pollfd = LTTNG_POLL_GETFD(&events, i); + health_code_update(&health_thread_kernel); + /* Thread quit pipe has been closed. Killing thread. */ ret = check_thread_quit_pipe(pollfd, revents); if (ret) { @@ -801,6 +818,7 @@ static void *thread_manage_kernel(void *data) error: lttng_poll_clean(&events); error_poll_create: + health_reset(&health_thread_kernel); DBG("Kernel thread dying"); return NULL; } @@ -818,6 +836,8 @@ static void *thread_manage_consumer(void *data) DBG("[thread] Manage consumer started"); + health_code_update(&consumer_data->health); + ret = lttcomm_listen_unix_sock(consumer_data->err_sock); if (ret < 0) { goto error_listen; @@ -839,9 +859,13 @@ static void *thread_manage_consumer(void *data) nb_fd = LTTNG_POLL_GETNB(&events); + health_code_update(&consumer_data->health); + /* Inifinite blocking call, waiting for transmission */ restart: + health_poll_update(&consumer_data->health); ret = lttng_poll_wait(&events, -1); + health_poll_update(&consumer_data->health); if (ret < 0) { /* * Restart interrupted system call. @@ -857,6 +881,8 @@ restart: revents = LTTNG_POLL_GETEV(&events, i); pollfd = LTTNG_POLL_GETFD(&events, i); + health_code_update(&consumer_data->health); + /* Thread quit pipe has been closed. Killing thread. */ ret = check_thread_quit_pipe(pollfd, revents); if (ret) { @@ -877,6 +903,8 @@ restart: goto error; } + health_code_update(&consumer_data->health); + DBG2("Receiving code from consumer err_sock"); /* Getting status code from kconsumerd */ @@ -886,6 +914,8 @@ restart: goto error; } + health_code_update(&consumer_data->health); + if (code == CONSUMERD_COMMAND_SOCK_READY) { consumer_data->cmd_sock = lttcomm_connect_unix_sock(consumer_data->cmd_unix_sock_path); @@ -914,12 +944,16 @@ restart: goto error; } + health_code_update(&consumer_data->health); + /* Update number of fd */ nb_fd = LTTNG_POLL_GETNB(&events); /* Inifinite blocking call, waiting for transmission */ restart_poll: + health_poll_update(&consumer_data->health); ret = lttng_poll_wait(&events, -1); + health_poll_update(&consumer_data->health); if (ret < 0) { /* * Restart interrupted system call. @@ -935,6 +969,8 @@ restart_poll: revents = LTTNG_POLL_GETEV(&events, i); pollfd = LTTNG_POLL_GETFD(&events, i); + health_code_update(&consumer_data->health); + /* Thread quit pipe has been closed. Killing thread. 
*/ ret = check_thread_quit_pipe(pollfd, revents); if (ret) { @@ -950,6 +986,8 @@ restart_poll: } } + health_code_update(&consumer_data->health); + /* Wait for any kconsumerd error */ ret = lttcomm_recv_unix_sock(sock, &code, sizeof(enum lttcomm_return_code)); @@ -998,6 +1036,7 @@ error: lttng_poll_clean(&events); error_poll: error_listen: + health_reset(&consumer_data->health); DBG("consumer thread cleanup completed"); return NULL; @@ -1018,6 +1057,8 @@ static void *thread_manage_apps(void *data) rcu_register_thread(); rcu_thread_online(); + health_code_update(&health_thread_app_reg); + ret = create_thread_poll_set(&events, 2); if (ret < 0) { goto error_poll_create; @@ -1028,6 +1069,8 @@ static void *thread_manage_apps(void *data) goto error; } + health_code_update(&health_thread_app_reg); + while (1) { /* Zeroed the events structure */ lttng_poll_reset(&events); @@ -1038,7 +1081,9 @@ static void *thread_manage_apps(void *data) /* Inifinite blocking call, waiting for transmission */ restart: + health_poll_update(&health_thread_app_reg); ret = lttng_poll_wait(&events, -1); + health_poll_update(&health_thread_app_reg); if (ret < 0) { /* * Restart interrupted system call. @@ -1054,6 +1099,8 @@ static void *thread_manage_apps(void *data) revents = LTTNG_POLL_GETEV(&events, i); pollfd = LTTNG_POLL_GETFD(&events, i); + health_code_update(&health_thread_app_reg); + /* Thread quit pipe has been closed. Killing thread. */ ret = check_thread_quit_pipe(pollfd, revents); if (ret) { @@ -1073,6 +1120,8 @@ static void *thread_manage_apps(void *data) goto error; } + health_code_update(&health_thread_app_reg); + /* Register applicaton to the session daemon */ ret = ust_app_register(&ust_cmd.reg_msg, ust_cmd.sock); @@ -1082,6 +1131,8 @@ static void *thread_manage_apps(void *data) break; } + health_code_update(&health_thread_app_reg); + /* * Validate UST version compatibility. */ @@ -1094,6 +1145,8 @@ static void *thread_manage_apps(void *data) update_ust_app(ust_cmd.sock); } + health_code_update(&health_thread_app_reg); + ret = ust_app_register_done(ust_cmd.sock); if (ret < 0) { /* @@ -1117,6 +1170,8 @@ static void *thread_manage_apps(void *data) ust_cmd.sock); } + health_code_update(&health_thread_app_reg); + break; } } else { @@ -1136,12 +1191,15 @@ static void *thread_manage_apps(void *data) break; } } + + health_code_update(&health_thread_app_reg); } } error: lttng_poll_clean(&events); error_poll_create: + health_reset(&health_thread_app_reg); DBG("Application communication apps thread cleanup complete"); rcu_thread_offline(); rcu_unregister_thread(); @@ -1681,6 +1739,23 @@ error: return ret; } +/* + * Compute health status of each consumer. + */ +static int check_consumer_health(void) +{ + int ret; + + ret = + health_check_state(&kconsumer_data.health) & + health_check_state(&ustconsumer32_data.health) & + health_check_state(&ustconsumer64_data.health); + + DBG3("Health consumer check %d", ret); + + return ret; +} + /* * Check version of the lttng-modules. */ @@ -4558,6 +4633,180 @@ init_setup_error: return ret; } +/* + * Thread managing health check socket. 
+ */ +static void *thread_manage_health(void *data) +{ + int sock = -1, new_sock, ret, i, pollfd; + uint32_t revents, nb_fd; + struct lttng_poll_event events; + struct lttcomm_health_msg msg; + struct lttcomm_health_data reply; + + DBG("[thread] Manage health check started"); + + rcu_register_thread(); + + /* Create unix socket */ + sock = lttcomm_create_unix_sock(health_unix_sock_path); + if (sock < 0) { + ERR("Unable to create health check Unix socket"); + ret = -1; + goto error; + } + + ret = lttcomm_listen_unix_sock(sock); + if (ret < 0) { + goto error; + } + + /* + * Pass 2 as size here for the thread quit pipe and client_sock. Nothing + * more will be added to this poll set. + */ + ret = create_thread_poll_set(&events, 2); + if (ret < 0) { + goto error; + } + + /* Add the application registration socket */ + ret = lttng_poll_add(&events, sock, LPOLLIN | LPOLLPRI); + if (ret < 0) { + goto error; + } + + while (1) { + DBG("Health check ready"); + + nb_fd = LTTNG_POLL_GETNB(&events); + + /* Inifinite blocking call, waiting for transmission */ +restart: + ret = lttng_poll_wait(&events, -1); + if (ret < 0) { + /* + * Restart interrupted system call. + */ + if (errno == EINTR) { + goto restart; + } + goto error; + } + + for (i = 0; i < nb_fd; i++) { + /* Fetch once the poll data */ + revents = LTTNG_POLL_GETEV(&events, i); + pollfd = LTTNG_POLL_GETFD(&events, i); + + /* Thread quit pipe has been closed. Killing thread. */ + ret = check_thread_quit_pipe(pollfd, revents); + if (ret) { + goto error; + } + + /* Event on the registration socket */ + if (pollfd == sock) { + if (revents & (LPOLLERR | LPOLLHUP | LPOLLRDHUP)) { + ERR("Health socket poll error"); + goto error; + } + } + } + + new_sock = lttcomm_accept_unix_sock(sock); + if (new_sock < 0) { + goto error; + } + + DBG("Receiving data from client for health..."); + ret = lttcomm_recv_unix_sock(new_sock, (void *)&msg, sizeof(msg)); + if (ret <= 0) { + DBG("Nothing recv() from client... continuing"); + ret = close(new_sock); + if (ret) { + PERROR("close"); + } + new_sock = -1; + continue; + } + + rcu_thread_online(); + + switch (msg.component) { + case LTTNG_HEALTH_CMD: + reply.ret_code = health_check_state(&health_thread_cmd); + break; + case LTTNG_HEALTH_APP_REG: + reply.ret_code = health_check_state(&health_thread_app_reg); + break; + case LTTNG_HEALTH_KERNEL: + reply.ret_code = health_check_state(&health_thread_kernel); + break; + case LTTNG_HEALTH_CONSUMER: + reply.ret_code = check_consumer_health(); + break; + case LTTNG_HEALTH_ALL: + ret = check_consumer_health(); + + reply.ret_code = + health_check_state(&health_thread_app_reg) & + health_check_state(&health_thread_cmd) & + health_check_state(&health_thread_kernel) & + ret; + break; + default: + reply.ret_code = LTTCOMM_UND; + break; + } + + /* + * Flip ret value since 0 is a success and 1 indicates a bad health for + * the client where in the sessiond it is the opposite. Again, this is + * just to make things easier for us poor developer which enjoy a lot + * lazyness. 
+ */ + if (reply.ret_code == 0 || reply.ret_code == 1) { + reply.ret_code = !reply.ret_code; + } + + DBG2("Health check return value %d", reply.ret_code); + + ret = send_unix_sock(new_sock, (void *) &reply, sizeof(reply)); + if (ret < 0) { + ERR("Failed to send health data back to client"); + } + + /* End of transmission */ + ret = close(new_sock); + if (ret) { + PERROR("close"); + } + new_sock = -1; + } + +error: + DBG("Health check thread dying"); + unlink(health_unix_sock_path); + if (sock >= 0) { + ret = close(sock); + if (ret) { + PERROR("close"); + } + } + if (new_sock >= 0) { + ret = close(new_sock); + if (ret) { + PERROR("close"); + } + } + + lttng_poll_clean(&events); + + rcu_unregister_thread(); + return NULL; +} + /* * This thread manage all clients request using the unix client socket for * communication. @@ -4574,6 +4823,8 @@ static void *thread_manage_clients(void *data) rcu_register_thread(); + health_code_update(&health_thread_cmd); + ret = lttcomm_listen_unix_sock(client_sock); if (ret < 0) { goto error; @@ -4601,6 +4852,8 @@ static void *thread_manage_clients(void *data) kill(ppid, SIGUSR1); } + health_code_update(&health_thread_cmd); + while (1) { DBG("Accepting client command ..."); @@ -4608,7 +4861,9 @@ static void *thread_manage_clients(void *data) /* Inifinite blocking call, waiting for transmission */ restart: + health_poll_update(&health_thread_cmd); ret = lttng_poll_wait(&events, -1); + health_poll_update(&health_thread_cmd); if (ret < 0) { /* * Restart interrupted system call. @@ -4624,6 +4879,8 @@ static void *thread_manage_clients(void *data) revents = LTTNG_POLL_GETEV(&events, i); pollfd = LTTNG_POLL_GETFD(&events, i); + health_code_update(&health_thread_cmd); + /* Thread quit pipe has been closed. Killing thread. */ ret = check_thread_quit_pipe(pollfd, revents); if (ret) { @@ -4641,6 +4898,8 @@ static void *thread_manage_clients(void *data) DBG("Wait for client response"); + health_code_update(&health_thread_cmd); + sock = lttcomm_accept_unix_sock(client_sock); if (sock < 0) { goto error; @@ -4669,6 +4928,8 @@ static void *thread_manage_clients(void *data) cmd_ctx->llm = NULL; cmd_ctx->session = NULL; + health_code_update(&health_thread_cmd); + /* * Data is received from the lttng client. The struct * lttcomm_session_msg (lsm) contains the command and data request of @@ -4688,6 +4949,8 @@ static void *thread_manage_clients(void *data) continue; } + health_code_update(&health_thread_cmd); + // TODO: Validate cmd_ctx including sanity check for // security purpose. 
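To make the counter scheme concrete: each instrumented thread bumps its counter by HEALTH_CODE_VALUE while it makes progress in code and by HEALTH_POLL_VALUE immediately before and after blocking in lttng_poll_wait(), and health_check_state() reports a thread as bad when the counter is zero or has not moved since the previous probe while the thread is not parked in poll(). The standalone sketch below is not part of the patch; it replays that life cycle with simplified helpers, and it writes the "am I in poll()?" test as an explicit "% 2" on the counter rather than going through the HEALTH_IS_IN_* macros, purely for readability.

/* Standalone illustration of the health counters (not part of the patch). */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct state { uint64_t last, current; };

/* Same roles as health_code_update()/health_poll_update() in health.h. */
static void code_update(struct state *s) { s->current += 2; }
static void poll_update(struct state *s) { s->current += 1; }

/* Same contract as health_check_state(): 1 = healthy, 0 = bad. */
static int check(struct state *s)
{
	uint64_t cur = s->current;
	int ok = (cur != 0) && !(cur == s->last && (cur % 2) == 0);

	s->last = cur;	/* remember this probe for the next one */
	return ok;
}

int main(void)
{
	struct state s = { .last = 0, .current = 2 };	/* as health_init() leaves it */

	code_update(&s);		/* thread is making progress */
	assert(check(&s) == 1);

	poll_update(&s);		/* thread enters poll() and blocks there */
	assert(check(&s) == 1);
	assert(check(&s) == 1);		/* counter odd and unchanged: still fine */

	poll_update(&s);		/* poll() returned but the thread then hangs */
	assert(check(&s) == 1);		/* first probe still sees movement */
	assert(check(&s) == 0);		/* second probe: no progress, not in poll(): bad */

	puts("health counter walk-through OK");
	return 0;
}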
@@ -4719,6 +4982,8 @@ static void *thread_manage_clients(void *data) continue; } + health_code_update(&health_thread_cmd); + DBG("Sending response (size: %d, retcode: %s)", cmd_ctx->lttng_msg_size, lttng_strerror(-cmd_ctx->llm->ret_code)); @@ -4735,9 +5000,13 @@ static void *thread_manage_clients(void *data) sock = -1; clean_command_ctx(&cmd_ctx); + + health_code_update(&health_thread_cmd); } error: + health_reset(&health_thread_cmd); + DBG("Client thread dying"); unlink(client_unix_sock_path); if (client_sock >= 0) { @@ -5286,6 +5555,11 @@ int main(int argc, char **argv) DEFAULT_GLOBAL_APPS_WAIT_SHM_PATH); } + if (strlen(health_unix_sock_path) == 0) { + snprintf(health_unix_sock_path, sizeof(health_unix_sock_path), + DEFAULT_GLOBAL_HEALTH_UNIX_SOCK); + } + /* Setup kernel consumerd path */ snprintf(kconsumer_data.err_unix_sock_path, PATH_MAX, DEFAULT_KCONSUMERD_ERR_SOCK_PATH, rundir); @@ -5336,6 +5610,12 @@ int main(int argc, char **argv) snprintf(wait_shm_path, PATH_MAX, DEFAULT_HOME_APPS_WAIT_SHM_PATH, geteuid()); } + + /* Set health check Unix path */ + if (strlen(health_unix_sock_path) == 0) { + snprintf(health_unix_sock_path, sizeof(health_unix_sock_path), + DEFAULT_HOME_HEALTH_UNIX_SOCK, home_path); + } } /* Set consumer initial state */ @@ -5468,6 +5748,31 @@ int main(int argc, char **argv) */ uatomic_set(&relayd_net_seq_idx, 1); + /* Init all health thread counters. */ + health_init(&health_thread_cmd); + health_init(&health_thread_kernel); + health_init(&health_thread_app_reg); + + /* + * Init health counters of the consumer thread. We do a quick hack here to + * the state of the consumer health is fine even if the thread is not + * started. This is simply to ease our life and has no cost what so ever. + */ + health_init(&kconsumer_data.health); + health_poll_update(&kconsumer_data.health); + health_init(&ustconsumer32_data.health); + health_poll_update(&ustconsumer32_data.health); + health_init(&ustconsumer64_data.health); + health_poll_update(&ustconsumer64_data.health); + + /* Create thread to manage the client socket */ + ret = pthread_create(&health_thread, NULL, + thread_manage_health, (void *) NULL); + if (ret != 0) { + PERROR("pthread_create health"); + goto exit_health; + } + /* Create thread to manage the client socket */ ret = pthread_create(&client_thread, NULL, thread_manage_clients, (void *) NULL); @@ -5549,6 +5854,7 @@ exit_dispatch: } exit_client: +exit_health: exit: /* * cleanup() is called when no other thread is running. 
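A short note on the consumer counters initialized above: health_init() leaves the counter at HEALTH_CODE_VALUE and the single health_poll_update() then makes it odd, so a consumer thread that was never started looks permanently parked in poll() and keeps reporting healthy, which is the "no cost" behaviour the comment is after. A minimal sketch of that arithmetic (not part of the patch, again with an explicit "% 2" for the poll test):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t current = 2 + 1;	/* health_init() + one health_poll_update() */
	uint64_t last = 0;
	int i;

	for (i = 0; i < 3; i++) {
		/* Same test as in the earlier sketch: zero or stuck-in-code is bad. */
		int healthy = (current != 0) &&
				!(current == last && (current % 2) == 0);

		printf("probe %d: healthy=%d\n", i, healthy);	/* always 1 */
		last = current;		/* the counter never moves again */
	}
	return 0;
}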
diff --git a/src/common/defaults.h b/src/common/defaults.h index 5c5998dde..3fd74753f 100644 --- a/src/common/defaults.h +++ b/src/common/defaults.h @@ -72,6 +72,8 @@ #define DEFAULT_GLOBAL_APPS_UNIX_SOCK DEFAULT_LTTNG_RUNDIR "/apps-lttng-sessiond" #define DEFAULT_HOME_APPS_UNIX_SOCK DEFAULT_LTTNG_HOME_RUNDIR "/apps-lttng-sessiond" #define DEFAULT_HOME_CLIENT_UNIX_SOCK DEFAULT_LTTNG_HOME_RUNDIR "/client-lttng-sessiond" +#define DEFAULT_GLOBAL_HEALTH_UNIX_SOCK DEFAULT_LTTNG_RUNDIR "/health.sock" +#define DEFAULT_HOME_HEALTH_UNIX_SOCK DEFAULT_LTTNG_HOME_RUNDIR "/health.sock" /* * Value taken from the hard limit allowed by the kernel when using setrlimit diff --git a/src/common/sessiond-comm/sessiond-comm.h b/src/common/sessiond-comm/sessiond-comm.h index f48ba2218..eb13734a6 100644 --- a/src/common/sessiond-comm/sessiond-comm.h +++ b/src/common/sessiond-comm/sessiond-comm.h @@ -84,9 +84,8 @@ enum lttcomm_sessiond_command { RELAYD_UPDATE_SYNC_INFO, RELAYD_VERSION, RELAYD_SEND_METADATA, - - /* Other tracer commands */ LTTNG_SET_FILTER, + LTTNG_HEALTH_CHECK, }; /* @@ -321,6 +320,15 @@ struct lttcomm_lttng_msg { char payload[]; }; +struct lttcomm_health_msg { + uint32_t component; + uint32_t cmd; +}; + +struct lttcomm_health_data { + uint32_t ret_code; +}; + /* * lttcomm_consumer_msg is the message sent from sessiond to consumerd * to either add a channel, add a stream, update a stream, or stop diff --git a/src/lib/lttng-ctl/lttng-ctl.c b/src/lib/lttng-ctl/lttng-ctl.c index 780b44dd7..b6487b7c2 100644 --- a/src/lib/lttng-ctl/lttng-ctl.c +++ b/src/lib/lttng-ctl/lttng-ctl.c @@ -20,6 +20,7 @@ */ #define _GNU_SOURCE +#include #include #include #include @@ -55,6 +56,7 @@ do { \ /* Socket to session daemon for communication */ static int sessiond_socket; static char sessiond_sock_path[PATH_MAX]; +static char health_sock_path[PATH_MAX]; /* Variables */ static char *tracing_group; @@ -1278,6 +1280,98 @@ int lttng_disable_consumer(struct lttng_handle *handle) return ask_sessiond(&lsm, NULL); } +/* + * Set health socket path by putting it in the global health_sock_path + * variable. + * + * Returns 0 on success or assert(0) on ENOMEM. + */ +static int set_health_socket_path(void) +{ + int ret; + int in_tgroup = 0; /* In tracing group */ + uid_t uid; + const char *home; + + uid = getuid(); + + if (uid != 0) { + /* Are we in the tracing group ? */ + in_tgroup = check_tracing_group(tracing_group); + } + + if ((uid == 0) || in_tgroup) { + copy_string(health_sock_path, DEFAULT_GLOBAL_HEALTH_UNIX_SOCK, + sizeof(health_sock_path)); + } + + if (uid != 0) { + /* + * With GNU C < 2.1, snprintf returns -1 if the target buffer is too small; + * With GNU C >= 2.1, snprintf returns the required size (excluding closing null) + */ + home = getenv("HOME"); + if (home == NULL) { + /* Fallback in /tmp .. */ + home = "/tmp"; + } + + ret = snprintf(health_sock_path, sizeof(health_sock_path), + DEFAULT_HOME_HEALTH_UNIX_SOCK, home); + if ((ret < 0) || (ret >= sizeof(health_sock_path))) { + /* ENOMEM at this point... just kill the control lib. */ + assert(0); + } + } + + return 0; +} + +/* + * Check session daemon health for a specific health component. + * + * Return 0 if health is OK or else 1 if BAD. A return value of -1 indicate + * that the control library was not able to connect to the session daemon + * health socket. + * + * Any other positive value is an lttcomm error which can be translate with + * lttng_strerror(). 
+ */ +int lttng_health_check(enum lttng_health_component c) +{ + int sock, ret; + struct lttcomm_health_msg msg; + struct lttcomm_health_data reply; + + /* Connect to the sesssion daemon */ + sock = lttcomm_connect_unix_sock(health_sock_path); + if (sock < 0) { + ret = -1; + goto error; + } + + msg.cmd = LTTNG_HEALTH_CHECK; + msg.component = c; + + ret = lttcomm_send_unix_sock(sock, (void *)&msg, sizeof(msg)); + if (ret < 0) { + goto close_error; + } + + ret = lttcomm_recv_unix_sock(sock, (void *)&reply, sizeof(reply)); + if (ret < 0) { + goto close_error; + } + + ret = reply.ret_code; + +close_error: + close(sock); + +error: + return ret; +} + /* * lib constructor */ @@ -1285,4 +1379,6 @@ static void __attribute__((constructor)) init() { /* Set default session group */ lttng_set_tracing_group(DEFAULT_TRACING_GROUP); + /* Set socket for health check */ + (void) set_health_socket_path(); }
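Since the feature is deliberately not wired into the lttng command line yet, here is a sketch of how an external monitor could consume the new API. This program is illustrative only and not part of the patch; it links against liblttng-ctl and interprets the return codes as documented above: 0 for OK, 1 for bad, -1 when the health socket cannot be reached, and any other positive value as an lttcomm code.

#include <stdio.h>
#include <lttng/lttng.h>

int main(void)
{
	/* One probe per component; LTTNG_HEALTH_ALL would fold them into one call. */
	static const enum lttng_health_component parts[] = {
		LTTNG_HEALTH_CMD,
		LTTNG_HEALTH_APP_REG,
		LTTNG_HEALTH_KERNEL,
		LTTNG_HEALTH_CONSUMER,
	};
	static const char *names[] = { "cmd", "app-reg", "kernel", "consumer" };
	int i, ret, bad = 0;

	for (i = 0; i < 4; i++) {
		ret = lttng_health_check(parts[i]);
		switch (ret) {
		case 0:
			printf("%-10s OK\n", names[i]);
			break;
		case 1:
			printf("%-10s BAD\n", names[i]);
			bad = 1;
			break;
		case -1:
			fprintf(stderr, "cannot reach the sessiond health socket\n");
			return 2;
		default:
			/* Negate the code before lttng_strerror(), as the daemon does. */
			fprintf(stderr, "%-10s error: %s\n", names[i],
					lttng_strerror(-ret));
			bad = 1;
			break;
		}
	}
	return bad;
}

Built with something like "cc probe.c -llttng-ctl", the exit status can feed directly into a watchdog or init script.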