Fix: file-descriptor: missing include guards
[lttng-tools.git] / src / common / health / health.cpp
1 /*
2 * Copyright (C) 2012 David Goulet <dgoulet@efficios.com>
3 * Copyright (C) 2013 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
4 *
5 * SPDX-License-Identifier: GPL-2.0-only
6 *
7 */
8
9 #define _LGPL_SOURCE
10 #include <common/defaults.hpp>
11 #include <common/error.hpp>
12 #include <common/macros.hpp>
13 #include <common/sessiond-comm/inet.hpp>
14
15 #include <lttng/health-internal.hpp>
16
17 #include <algorithm>
18 #include <inttypes.h>
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include <time.h>
22
23 /*
24 * An application-specific error state for unregistered thread keeps
25 * track of thread errors. A thread reporting a health error, normally
26 * unregisters and quits. This makes the TLS health state not available
27 * to the health_check_state() call so on unregister we update this
28 * global error array so we can keep track of which thread was on error
29 * if the TLS health state has been removed.
30 */
31 struct health_app {
32 /* List of health state, for each application thread */
33 struct cds_list_head list;
34 /*
35 * This lock ensures that TLS memory used for the node and its
36 * container structure don't get reclaimed after the TLS owner
37 * thread exits until we have finished using it.
38 */
39 pthread_mutex_t lock;
40 int nr_types;
41 struct timespec time_delta;
42 /* Health flags containing thread type error state */
43 enum health_flags *flags;
44 };
45
46 /* Define TLS health state. */
47 DEFINE_URCU_TLS(struct health_state, health_state);
48
49 /*
50 * Initialize health check subsytem.
51 */
52 static void health_init(struct health_app *ha)
53 {
54 /*
55 * Get the maximum value between the default delta value and the TCP
56 * timeout with a safety net of the default health check delta.
57 */
58 ha->time_delta.tv_sec = std::max<unsigned long>(
59 lttcomm_inet_tcp_timeout + DEFAULT_HEALTH_CHECK_DELTA_S, ha->time_delta.tv_sec);
60 DBG("Health check time delta in seconds set to %lu", ha->time_delta.tv_sec);
61 }
62
63 struct health_app *health_app_create(int nr_types)
64 {
65 struct health_app *ha;
66
67 ha = zmalloc<health_app>();
68 if (!ha) {
69 return nullptr;
70 }
71 ha->flags = calloc<health_flags>(nr_types);
72 if (!ha->flags) {
73 goto error_flags;
74 }
75 CDS_INIT_LIST_HEAD(&ha->list);
76 pthread_mutex_init(&ha->lock, nullptr);
77 ha->nr_types = nr_types;
78 ha->time_delta.tv_sec = DEFAULT_HEALTH_CHECK_DELTA_S;
79 ha->time_delta.tv_nsec = DEFAULT_HEALTH_CHECK_DELTA_NS;
80 health_init(ha);
81 return ha;
82
83 error_flags:
84 free(ha);
85 return nullptr;
86 }
87
88 void health_app_destroy(struct health_app *ha)
89 {
90 free(ha->flags);
91 free(ha);
92 }
93
94 /*
95 * Lock health state global list mutex.
96 */
97 static void state_lock(struct health_app *ha)
98 {
99 pthread_mutex_lock(&ha->lock);
100 }
101
102 /*
103 * Unlock health state global list mutex.
104 */
105 static void state_unlock(struct health_app *ha)
106 {
107 pthread_mutex_unlock(&ha->lock);
108 }
109
/*
 * Store time_a - time_b in res.
 *
 * Both inputs are expected to be normalized (tv_nsec in [0, 1000000000)).
 * The result keeps tv_nsec in that same range, borrowing one second from
 * tv_sec when the nanosecond difference is negative.
 *
 * res may point to the same object as time_a or time_b: the inputs are fully
 * read before anything is written.
 */
static void
time_diff(const struct timespec *time_a, const struct timespec *time_b, struct timespec *res)
{
	time_t sec = time_a->tv_sec - time_b->tv_sec;
	long nsec = time_a->tv_nsec - time_b->tv_nsec;

	if (nsec < 0) {
		/* Borrow one second so the nanosecond part is non-negative. */
		sec -= 1;
		nsec += 1000000000L;
	}

	res->tv_sec = sec;
	res->tv_nsec = nsec;
}
124
125 /*
126 * Return true if time_a - time_b > diff, else false.
127 */
128 static int time_diff_gt(const struct timespec *time_a,
129 const struct timespec *time_b,
130 const struct timespec *diff)
131 {
132 struct timespec res;
133
134 time_diff(time_a, time_b, &res);
135 time_diff(&res, diff, &res);
136
137 if (res.tv_sec > 0) {
138 return 1;
139 } else if (res.tv_sec == 0 && res.tv_nsec > 0) {
140 return 1;
141 }
142
143 return 0;
144 }
145
146 /*
147 * Validate health state. Checks for the error flag or health conditions.
148 *
149 * Return 0 if health is bad or else 1.
150 */
151 static int validate_state(struct health_app *ha, struct health_state *state)
152 {
153 int retval = 1, ret;
154 unsigned long current, last;
155 struct timespec current_time;
156
157 LTTNG_ASSERT(state);
158
159 last = state->last;
160 current = uatomic_read(&state->current);
161
162 ret = lttng_clock_gettime(CLOCK_MONOTONIC, &current_time);
163 if (ret < 0) {
164 PERROR("Error reading time\n");
165 /* error */
166 retval = 0;
167 goto end;
168 }
169
170 /*
171 * Thread is in bad health if flag HEALTH_ERROR is set. It is also in bad
172 * health if, after the delta delay has passed, its the progress counter
173 * has not moved and it has NOT been waiting for a poll() call.
174 */
175 if (uatomic_read(&state->flags) & HEALTH_ERROR) {
176 retval = 0;
177 goto end;
178 }
179
180 /*
181 * Initial condition need to update the last counter and sample time, but
182 * should not check health in this initial case, because we don't know how
183 * much time has passed.
184 */
185 if (state->last_time.tv_sec == 0 && state->last_time.tv_nsec == 0) {
186 /* update last counter and last sample time */
187 state->last = current;
188 memcpy(&state->last_time, &current_time, sizeof(current_time));
189 } else {
190 if (time_diff_gt(&current_time, &state->last_time, &ha->time_delta)) {
191 if (current == last && !HEALTH_IS_IN_POLL(current)) {
192 /* error */
193 retval = 0;
194 }
195 /* update last counter and last sample time */
196 state->last = current;
197 memcpy(&state->last_time, &current_time, sizeof(current_time));
198
199 /* On error, stop right now and notify caller. */
200 if (retval == 0) {
201 goto end;
202 }
203 }
204 }
205
206 end:
207 DBG("Health state current %lu, last %lu, ret %d", current, last, ret);
208 return retval;
209 }
210
211 /*
212 * Check health of a specific health type. Note that if a thread has not yet
213 * initialize its health subsystem or has quit, it's considered in a good
214 * state.
215 *
216 * Return 0 if health is bad or else 1.
217 */
218 int health_check_state(struct health_app *ha, int type)
219 {
220 int retval = 1;
221 struct health_state *state;
222
223 LTTNG_ASSERT(type < ha->nr_types);
224
225 state_lock(ha);
226
227 cds_list_for_each_entry (state, &ha->list, node) {
228 int ret;
229
230 if (state->type != type) {
231 continue;
232 }
233
234 ret = validate_state(ha, state);
235 if (!ret) {
236 retval = 0;
237 goto end;
238 }
239 }
240
241 /* Check the global state since some state might not be visible anymore. */
242 if (ha->flags[type] & HEALTH_ERROR) {
243 retval = 0;
244 }
245
246 end:
247 state_unlock(ha);
248
249 DBG("Health check for type %d is %s", (int) type, (retval == 0) ? "BAD" : "GOOD");
250 return retval;
251 }
252
253 /*
254 * Init health state.
255 */
256 void health_register(struct health_app *ha, int type)
257 {
258 LTTNG_ASSERT(type < ha->nr_types);
259
260 /* Init TLS state. */
261 uatomic_set(&URCU_TLS(health_state).last, 0);
262 uatomic_set(&URCU_TLS(health_state).last_time.tv_sec, 0);
263 uatomic_set(&URCU_TLS(health_state).last_time.tv_nsec, 0);
264 uatomic_set(&URCU_TLS(health_state).current, 0);
265 uatomic_set(&URCU_TLS(health_state).flags, (health_flags) 0);
266 uatomic_set(&URCU_TLS(health_state).type, type);
267
268 /* Add it to the global TLS state list. */
269 state_lock(ha);
270 cds_list_add(&URCU_TLS(health_state).node, &ha->list);
271 state_unlock(ha);
272 }
273
274 /*
275 * Remove node from global list.
276 */
277 void health_unregister(struct health_app *ha)
278 {
279 state_lock(ha);
280 /*
281 * On error, set the global_error_state since we are about to remove
282 * the node from the global list.
283 */
284 if (uatomic_read(&URCU_TLS(health_state).flags) & HEALTH_ERROR) {
285 uatomic_set(&ha->flags[URCU_TLS(health_state).type], HEALTH_ERROR);
286 }
287 cds_list_del(&URCU_TLS(health_state).node);
288 state_unlock(ha);
289 }
This page took 0.035685 seconds and 4 git commands to generate.