Fix: file-descriptor: missing include guards
[lttng-tools.git] / src / common / health / health.cpp
CommitLineData
44a5e5eb 1/*
ab5be9fa
MJ
2 * Copyright (C) 2012 David Goulet <dgoulet@efficios.com>
3 * Copyright (C) 2013 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
44a5e5eb 4 *
ab5be9fa 5 * SPDX-License-Identifier: GPL-2.0-only
44a5e5eb 6 *
44a5e5eb
DG
7 */
8
6c1c0768 9#define _LGPL_SOURCE
c9e313bc
SM
10#include <common/defaults.hpp>
11#include <common/error.hpp>
12#include <common/macros.hpp>
13#include <common/sessiond-comm/inet.hpp>
44a5e5eb 14
c9e313bc 15#include <lttng/health-internal.hpp>
44a5e5eb 16
28ab034a
JG
17#include <algorithm>
18#include <inttypes.h>
19#include <stdio.h>
20#include <stdlib.h>
21#include <time.h>
22
8782cc74
MD
23/*
24 * An application-specific error state for unregistered thread keeps
25 * track of thread errors. A thread reporting a health error, normally
26 * unregisters and quits. This makes the TLS health state not available
27 * to the health_check_state() call so on unregister we update this
28 * global error array so we can keep track of which thread was on error
29 * if the TLS health state has been removed.
30 */
31struct health_app {
32 /* List of health state, for each application thread */
33 struct cds_list_head list;
34 /*
35 * This lock ensures that TLS memory used for the node and its
36 * container structure don't get reclaimed after the TLS owner
37 * thread exits until we have finished using it.
38 */
39 pthread_mutex_t lock;
40 int nr_types;
41 struct timespec time_delta;
42 /* Health flags containing thread type error state */
43 enum health_flags *flags;
8809eec0
MD
44};
45
927ca06a
DG
46/* Define TLS health state. */
47DEFINE_URCU_TLS(struct health_state, health_state);
48
55d09795
MD
49/*
50 * Initialize health check subsytem.
51 */
28ab034a 52static void health_init(struct health_app *ha)
55d09795
MD
53{
54 /*
55 * Get the maximum value between the default delta value and the TCP
56 * timeout with a safety net of the default health check delta.
57 */
8784a4d0 58 ha->time_delta.tv_sec = std::max<unsigned long>(
28ab034a
JG
59 lttcomm_inet_tcp_timeout + DEFAULT_HEALTH_CHECK_DELTA_S, ha->time_delta.tv_sec);
60 DBG("Health check time delta in seconds set to %lu", ha->time_delta.tv_sec);
55d09795
MD
61}
62
8782cc74
MD
63struct health_app *health_app_create(int nr_types)
64{
65 struct health_app *ha;
927ca06a 66
64803277 67 ha = zmalloc<health_app>();
8782cc74 68 if (!ha) {
cd9adb8b 69 return nullptr;
8782cc74 70 }
64803277 71 ha->flags = calloc<health_flags>(nr_types);
8782cc74
MD
72 if (!ha->flags) {
73 goto error_flags;
74 }
75 CDS_INIT_LIST_HEAD(&ha->list);
cd9adb8b 76 pthread_mutex_init(&ha->lock, nullptr);
8782cc74
MD
77 ha->nr_types = nr_types;
78 ha->time_delta.tv_sec = DEFAULT_HEALTH_CHECK_DELTA_S;
79 ha->time_delta.tv_nsec = DEFAULT_HEALTH_CHECK_DELTA_NS;
55d09795 80 health_init(ha);
8782cc74
MD
81 return ha;
82
83error_flags:
84 free(ha);
cd9adb8b 85 return nullptr;
8782cc74 86}
927ca06a 87
8782cc74
MD
88void health_app_destroy(struct health_app *ha)
89{
90 free(ha->flags);
91 free(ha);
92}
927ca06a
DG
93
94/*
95 * Lock health state global list mutex.
96 */
8782cc74 97static void state_lock(struct health_app *ha)
927ca06a 98{
8782cc74 99 pthread_mutex_lock(&ha->lock);
927ca06a
DG
100}
101
102/*
103 * Unlock health state global list mutex.
104 */
8782cc74 105static void state_unlock(struct health_app *ha)
927ca06a 106{
8782cc74 107 pthread_mutex_unlock(&ha->lock);
927ca06a
DG
108}
109
/*
 * Store time_a - time_b in res.
 *
 * When the nanosecond difference is negative, one second is borrowed so
 * that the nanosecond field of the result is never negative. `res` may
 * alias `time_a` or `time_b`.
 */
static void
time_diff(const struct timespec *time_a, const struct timespec *time_b, struct timespec *res)
{
	/*
	 * Compute into locals before writing to res: time_diff_gt() passes
	 * the same object as both time_a and res.
	 */
	time_t sec = time_a->tv_sec - time_b->tv_sec;
	long nsec = time_a->tv_nsec - time_b->tv_nsec;

	if (nsec < 0) {
		/* Borrow one second for the nanosecond field. */
		sec -= 1;
		nsec += 1000000000L;
	}

	res->tv_sec = sec;
	res->tv_nsec = nsec;
}
124
125/*
126 * Return true if time_a - time_b > diff, else false.
127 */
128static int time_diff_gt(const struct timespec *time_a,
28ab034a
JG
129 const struct timespec *time_b,
130 const struct timespec *diff)
8809eec0
MD
131{
132 struct timespec res;
133
134 time_diff(time_a, time_b, &res);
135 time_diff(&res, diff, &res);
136
137 if (res.tv_sec > 0) {
138 return 1;
139 } else if (res.tv_sec == 0 && res.tv_nsec > 0) {
140 return 1;
141 }
142
143 return 0;
144}
145
44a5e5eb 146/*
c89add41 147 * Validate health state. Checks for the error flag or health conditions.
44a5e5eb
DG
148 *
149 * Return 0 if health is bad or else 1.
150 */
8782cc74 151static int validate_state(struct health_app *ha, struct health_state *state)
44a5e5eb 152{
8809eec0 153 int retval = 1, ret;
139ac872 154 unsigned long current, last;
8809eec0 155 struct timespec current_time;
927ca06a 156
a0377dfe 157 LTTNG_ASSERT(state);
44a5e5eb 158
139ac872 159 last = state->last;
44a5e5eb 160 current = uatomic_read(&state->current);
44a5e5eb 161
389fbf04 162 ret = lttng_clock_gettime(CLOCK_MONOTONIC, &current_time);
931a97e5 163 if (ret < 0) {
8809eec0 164 PERROR("Error reading time\n");
139ac872 165 /* error */
8809eec0
MD
166 retval = 0;
167 goto end;
44a5e5eb
DG
168 }
169
8809eec0
MD
170 /*
171 * Thread is in bad health if flag HEALTH_ERROR is set. It is also in bad
172 * health if, after the delta delay has passed, its the progress counter
173 * has not moved and it has NOT been waiting for a poll() call.
174 */
175 if (uatomic_read(&state->flags) & HEALTH_ERROR) {
176 retval = 0;
177 goto end;
178 }
44a5e5eb 179
139ac872 180 /*
8809eec0
MD
181 * Initial condition need to update the last counter and sample time, but
182 * should not check health in this initial case, because we don't know how
183 * much time has passed.
139ac872 184 */
8809eec0
MD
185 if (state->last_time.tv_sec == 0 && state->last_time.tv_nsec == 0) {
186 /* update last counter and last sample time */
187 state->last = current;
188 memcpy(&state->last_time, &current_time, sizeof(current_time));
189 } else {
28ab034a 190 if (time_diff_gt(&current_time, &state->last_time, &ha->time_delta)) {
8809eec0
MD
191 if (current == last && !HEALTH_IS_IN_POLL(current)) {
192 /* error */
193 retval = 0;
194 }
195 /* update last counter and last sample time */
196 state->last = current;
197 memcpy(&state->last_time, &current_time, sizeof(current_time));
c89add41
DG
198
199 /* On error, stop right now and notify caller. */
200 if (retval == 0) {
201 goto end;
202 }
8809eec0
MD
203 }
204 }
205
206end:
28ab034a 207 DBG("Health state current %lu, last %lu, ret %d", current, last, ret);
c89add41
DG
208 return retval;
209}
210
211/*
212 * Check health of a specific health type. Note that if a thread has not yet
213 * initialize its health subsystem or has quit, it's considered in a good
214 * state.
215 *
216 * Return 0 if health is bad or else 1.
217 */
8782cc74 218int health_check_state(struct health_app *ha, int type)
c89add41
DG
219{
220 int retval = 1;
221 struct health_state *state;
222
a0377dfe 223 LTTNG_ASSERT(type < ha->nr_types);
c89add41 224
8782cc74 225 state_lock(ha);
c89add41 226
28ab034a 227 cds_list_for_each_entry (state, &ha->list, node) {
c89add41
DG
228 int ret;
229
230 if (state->type != type) {
231 continue;
232 }
233
8782cc74 234 ret = validate_state(ha, state);
c89add41
DG
235 if (!ret) {
236 retval = 0;
237 goto end;
238 }
239 }
240
241 /* Check the global state since some state might not be visible anymore. */
8782cc74 242 if (ha->flags[type] & HEALTH_ERROR) {
c89add41
DG
243 retval = 0;
244 }
245
246end:
8782cc74 247 state_unlock(ha);
139ac872 248
28ab034a 249 DBG("Health check for type %d is %s", (int) type, (retval == 0) ? "BAD" : "GOOD");
8809eec0 250 return retval;
44a5e5eb 251}
927ca06a
DG
252
253/*
254 * Init health state.
255 */
8782cc74 256void health_register(struct health_app *ha, int type)
927ca06a 257{
a0377dfe 258 LTTNG_ASSERT(type < ha->nr_types);
927ca06a
DG
259
260 /* Init TLS state. */
261 uatomic_set(&URCU_TLS(health_state).last, 0);
262 uatomic_set(&URCU_TLS(health_state).last_time.tv_sec, 0);
263 uatomic_set(&URCU_TLS(health_state).last_time.tv_nsec, 0);
264 uatomic_set(&URCU_TLS(health_state).current, 0);
8784a4d0 265 uatomic_set(&URCU_TLS(health_state).flags, (health_flags) 0);
927ca06a
DG
266 uatomic_set(&URCU_TLS(health_state).type, type);
267
268 /* Add it to the global TLS state list. */
8782cc74
MD
269 state_lock(ha);
270 cds_list_add(&URCU_TLS(health_state).node, &ha->list);
271 state_unlock(ha);
927ca06a
DG
272}
273
274/*
275 * Remove node from global list.
276 */
8782cc74 277void health_unregister(struct health_app *ha)
927ca06a 278{
8782cc74 279 state_lock(ha);
927ca06a
DG
280 /*
281 * On error, set the global_error_state since we are about to remove
282 * the node from the global list.
283 */
284 if (uatomic_read(&URCU_TLS(health_state).flags) & HEALTH_ERROR) {
28ab034a 285 uatomic_set(&ha->flags[URCU_TLS(health_state).type], HEALTH_ERROR);
927ca06a
DG
286 }
287 cds_list_del(&URCU_TLS(health_state).node);
8782cc74 288 state_unlock(ha);
927ca06a 289}
This page took 0.084626 seconds and 4 git commands to generate.