2 * Copyright (C) 2012 - David Goulet <dgoulet@efficios.com>
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of the GNU General Public License, version 2 only, as
6 * published by the Free Software Foundation.
8 * This program is distributed in the hope that it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * You should have received a copy of the GNU General Public License along with
14 * this program; if not, write to the Free Software Foundation, Inc., 51
15 * Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
25 #include <common/defaults.h>
26 #include <common/error.h>
27 #include <common/macros.h>
28 #include <common/sessiond-comm/inet.h>
32 static struct timespec time_delta
= {
33 .tv_sec
= DEFAULT_HEALTH_CHECK_DELTA_S
,
34 .tv_nsec
= DEFAULT_HEALTH_CHECK_DELTA_NS
,
37 /* Define TLS health state. */
38 DEFINE_URCU_TLS(struct health_state
, health_state
);
41 * It ensures that TLS memory used for the node and its container structure
42 * don't get reclaimed after the TLS owner thread exits until we have finished
45 static pthread_mutex_t health_mutex
= PTHREAD_MUTEX_INITIALIZER
;
47 static struct health_tls_state_list health_state_list
= {
48 .head
= CDS_LIST_HEAD_INIT(health_state_list
.head
),
52 * This keeps track of the error state for unregistered thread. A thread
53 * reporting a health error, normally unregisters and quits. This makes the TLS
54 * health state not available to the health_check_state() call so on unregister
55 * we update this global error array so we can keep track of which thread was
56 * on error if the TLS health state has been removed.
58 static enum health_flags global_error_state
[HEALTH_NUM_TYPE
];
61 * Lock health state global list mutex.
63 static void state_lock(void)
65 pthread_mutex_lock(&health_mutex
);
69 * Unlock health state global list mutex.
71 static void state_unlock(void)
73 pthread_mutex_unlock(&health_mutex
);
/*
 * Set time difference in res from time_a and time_b (res = time_a - time_b).
 *
 * When the nanosecond subtraction is negative, one second is borrowed so the
 * resulting tv_nsec is brought back into range for normalized inputs.
 *
 * The result is computed in locals before being stored, so res may point to
 * the same object as time_a or time_b (time_diff_gt() relies on this).
 */
static void time_diff(const struct timespec *time_a,
		const struct timespec *time_b, struct timespec *res)
{
	time_t sec = time_a->tv_sec - time_b->tv_sec;
	long nsec = time_a->tv_nsec - time_b->tv_nsec;

	if (nsec < 0) {
		/*
		 * Borrow one second. The borrow must be applied to the
		 * nanosecond difference, not to the second difference.
		 */
		sec -= 1;
		nsec += 1000000000L;
	}

	res->tv_sec = sec;
	res->tv_nsec = nsec;
}
/*
 * Return true if time_a - time_b > diff, else false.
 *
 * The elapsed time (time_a - time_b) is computed first, then diff is
 * subtracted from it in place; the result is strictly positive exactly when
 * the elapsed time exceeds diff.
 */
static int time_diff_gt(const struct timespec *time_a,
		const struct timespec *time_b, const struct timespec *diff)
{
	struct timespec remaining;

	/* remaining = time_a - time_b */
	time_diff(time_a, time_b, &remaining);
	/* remaining = remaining - diff */
	time_diff(&remaining, diff, &remaining);

	return remaining.tv_sec > 0 ||
		(remaining.tv_sec == 0 && remaining.tv_nsec > 0);
}
112 * Validate health state. Checks for the error flag or health conditions.
114 * Return 0 if health is bad or else 1.
116 static int validate_state(struct health_state
*state
)
119 unsigned long current
, last
;
120 struct timespec current_time
;
125 current
= uatomic_read(&state
->current
);
127 ret
= clock_gettime(CLOCK_MONOTONIC
, ¤t_time
);
129 PERROR("Error reading time\n");
136 * Thread is in bad health if flag HEALTH_ERROR is set. It is also in bad
137 * health if, after the delta delay has passed, its the progress counter
138 * has not moved and it has NOT been waiting for a poll() call.
140 if (uatomic_read(&state
->flags
) & HEALTH_ERROR
) {
146 * Initial condition need to update the last counter and sample time, but
147 * should not check health in this initial case, because we don't know how
148 * much time has passed.
150 if (state
->last_time
.tv_sec
== 0 && state
->last_time
.tv_nsec
== 0) {
151 /* update last counter and last sample time */
152 state
->last
= current
;
153 memcpy(&state
->last_time
, ¤t_time
, sizeof(current_time
));
155 if (time_diff_gt(¤t_time
, &state
->last_time
, &time_delta
)) {
156 if (current
== last
&& !HEALTH_IS_IN_POLL(current
)) {
160 /* update last counter and last sample time */
161 state
->last
= current
;
162 memcpy(&state
->last_time
, ¤t_time
, sizeof(current_time
));
164 /* On error, stop right now and notify caller. */
172 DBG("Health state current %lu, last %lu, ret %d",
178 * Check health of a specific health type. Note that if a thread has not yet
179 * initialize its health subsystem or has quit, it's considered in a good
182 * Return 0 if health is bad or else 1.
184 int health_check_state(enum health_type type
)
187 struct health_state
*state
;
189 assert(type
< HEALTH_NUM_TYPE
);
193 cds_list_for_each_entry(state
, &health_state_list
.head
, node
) {
196 if (state
->type
!= type
) {
200 ret
= validate_state(state
);
207 /* Check the global state since some state might not be visible anymore. */
208 if (global_error_state
[type
] & HEALTH_ERROR
) {
215 DBG("Health check for type %d is %s", (int) type
,
216 (retval
== 0) ? "BAD" : "GOOD");
223 void health_register(enum health_type type
)
225 assert(type
< HEALTH_NUM_TYPE
);
227 /* Init TLS state. */
228 uatomic_set(&URCU_TLS(health_state
).last
, 0);
229 uatomic_set(&URCU_TLS(health_state
).last_time
.tv_sec
, 0);
230 uatomic_set(&URCU_TLS(health_state
).last_time
.tv_nsec
, 0);
231 uatomic_set(&URCU_TLS(health_state
).current
, 0);
232 uatomic_set(&URCU_TLS(health_state
).flags
, 0);
233 uatomic_set(&URCU_TLS(health_state
).type
, type
);
235 /* Add it to the global TLS state list. */
237 cds_list_add(&URCU_TLS(health_state
).node
, &health_state_list
.head
);
242 * Remove node from global list.
244 void health_unregister(void)
248 * On error, set the global_error_state since we are about to remove
249 * the node from the global list.
251 if (uatomic_read(&URCU_TLS(health_state
).flags
) & HEALTH_ERROR
) {
252 uatomic_set(&global_error_state
[URCU_TLS(health_state
).type
],
255 cds_list_del(&URCU_TLS(health_state
).node
);
260 * Initiliazie health check subsytem. This should be called before any health
263 void health_init(void)
266 * Get the maximum value between the default delta value and the TCP
267 * timeout with a safety net of the default health check delta.
269 time_delta
.tv_sec
= max_t(unsigned long,
270 lttcomm_inet_tcp_timeout
+ DEFAULT_HEALTH_CHECK_DELTA_S
,
272 DBG("Health check time delta in seconds set to %lu", time_delta
.tv_sec
);