2 * Copyright (C) 2017 Julien Desfossez <jdesfossez@efficios.com>
3 * Copyright (C) 2018 Jérémie Galarneau <jeremie.galarneau@efficios.com>
5 * SPDX-License-Identifier: GPL-2.0-only
11 #include "health-sessiond.hpp"
12 #include "lttng-sessiond.hpp"
13 #include "notification-thread-commands.hpp"
15 #include "rotation-thread.hpp"
16 #include "session.hpp"
21 #include <common/align.hpp>
22 #include <common/config/session-config.hpp>
23 #include <common/defaults.hpp>
24 #include <common/error.hpp>
25 #include <common/futex.hpp>
26 #include <common/hashtable/utils.hpp>
27 #include <common/kernel-ctl/kernel-ctl.hpp>
28 #include <common/time.hpp>
29 #include <common/utils.hpp>
31 #include <lttng/condition/condition-internal.hpp>
32 #include <lttng/location-internal.hpp>
33 #include <lttng/notification/channel-internal.hpp>
34 #include <lttng/notification/notification-internal.hpp>
35 #include <lttng/rotate-internal.hpp>
36 #include <lttng/trigger/trigger.h>
43 #include <urcu/list.h>
/*
 * Notification channel used by the rotation thread. It starts out as nullptr,
 * is created in init_thread_state() and is destroyed (when non-null) in
 * fini_thread_state().
 */
45 struct lttng_notification_channel
*rotate_notification_channel
= nullptr;
/*
 * State of the rotation thread: the poll set that thread_rotation() waits on.
 * NOTE(review): the end of this definition is not visible in this listing.
 */
47 struct rotation_thread
{
48 struct lttng_poll_event events
;
/*
 * Job queue filled by rotation_thread_enqueue_job() and drained by
 * handle_job_queue().
 *
 * event_pipe is the wake-up pipe: one byte is written to it per enqueued job.
 * list holds the queued struct rotation_thread_job entries, linked through
 * their `head` member.
 *
 * NOTE(review): other functions in this file use a `lock` member
 * (e.g. pthread_mutex_init(&queue->lock, nullptr)); its declaration is not
 * visible in this listing.
 */
52 * The timer thread enqueues jobs and wakes up the rotation thread.
53 * When the rotation thread wakes up, it empties the queue.
55 struct rotation_thread_timer_queue
{
56 struct lttng_pipe
*event_pipe
;
57 struct cds_list_head list
;
/*
 * Data handed to the rotation thread when it is launched: the timer job
 * queue it services, the handle used to send commands to the notification
 * thread, and the pipe used to ask the rotation thread to quit.
 */
61 struct rotation_thread_handle
{
62 struct rotation_thread_timer_queue
*rotation_timer_queue
;
63 /* Access to the notification thread cmd_queue */
64 struct notification_thread_handle
*notification_thread_handle
;
65 /* Thread-specific quit pipe. */
66 struct lttng_pipe
*quit_pipe
;
/*
 * One queued unit of work: a job type and the session it targets.
 *
 * rotation_thread_enqueue_job() calls session_get() on the session when it
 * creates a job; handle_job_queue() calls session_put() once the job has
 * been handled.
 */
70 struct rotation_thread_job
{
71 enum rotation_thread_job_type type
;
72 struct ltt_session
*session
;
73 /* List member in struct rotation_thread_timer_queue. */
74 struct cds_list_head head
;
/*
 * Return a constant string naming the given job type. Called by
 * rotation_thread_enqueue_job().
 *
 * NOTE(review): only the two `case` labels are visible in this listing; how
 * any other value is handled cannot be determined from here.
 */
78 static const char *get_job_type_str(enum rotation_thread_job_type job_type
)
81 case ROTATION_THREAD_JOB_TYPE_CHECK_PENDING_ROTATION
:
82 return "CHECK_PENDING_ROTATION";
83 case ROTATION_THREAD_JOB_TYPE_SCHEDULED_ROTATION
:
84 return "SCHEDULED_ROTATION";
/*
 * Allocate a timer job queue with zmalloc, open its wake-up pipe with
 * FD_CLOEXEC | O_NONBLOCK, and initialize its job list and its mutex.
 * An allocation failure is reported with PERROR().
 *
 * NOTE(review): no check of the lttng_pipe_open() result is visible in this
 * listing; confirm that a failed pipe creation is handled before the pipe's
 * file descriptors are used.
 */
90 struct rotation_thread_timer_queue
*rotation_thread_timer_queue_create()
92 struct rotation_thread_timer_queue
*queue
= nullptr;
94 queue
= zmalloc
<rotation_thread_timer_queue
>();
96 PERROR("Failed to allocate timer rotate queue");
100 queue
->event_pipe
= lttng_pipe_open(FD_CLOEXEC
| O_NONBLOCK
);
101 CDS_INIT_LIST_HEAD(&queue
->list
);
102 pthread_mutex_init(&queue
->lock
, nullptr);
/*
 * Tear down a timer job queue: destroy its wake-up pipe, assert (with the
 * queue lock held) that no job is left in the list, then destroy the mutex.
 *
 * NOTE(review): the release of the queue's own memory, and any handling of a
 * null `queue`, are not visible in this listing.
 */
107 void rotation_thread_timer_queue_destroy(struct rotation_thread_timer_queue
*queue
)
113 lttng_pipe_destroy(queue
->event_pipe
);
115 pthread_mutex_lock(&queue
->lock
);
116 LTTNG_ASSERT(cds_list_empty(&queue
->list
));
117 pthread_mutex_unlock(&queue
->lock
);
118 pthread_mutex_destroy(&queue
->lock
);
/*
 * Destroys the handle's quit pipe. rotation_thread_handle_create() also calls
 * this function on a handle whose quit pipe could not be opened.
 *
 * NOTE(review): the release of `handle` itself is not visible in this listing.
 */
123 * Destroy the thread data previously created by the init function.
125 void rotation_thread_handle_destroy(struct rotation_thread_handle
*handle
)
127 lttng_pipe_destroy(handle
->quit_pipe
);
/*
 * Allocate a rotation thread handle with zmalloc, store the two pointers
 * received as arguments in it (the pointed-to objects are not copied), and
 * open its quit pipe with FD_CLOEXEC.
 *
 * A null quit pipe is tested for, and rotation_thread_handle_destroy() is
 * called on the handle, presumably on that failure path (the control flow
 * between the two is not visible in this listing).
 */
131 struct rotation_thread_handle
*
132 rotation_thread_handle_create(struct rotation_thread_timer_queue
*rotation_timer_queue
,
133 struct notification_thread_handle
*notification_thread_handle
)
135 struct rotation_thread_handle
*handle
;
137 handle
= zmalloc
<rotation_thread_handle
>();
142 handle
->rotation_timer_queue
= rotation_timer_queue
;
143 handle
->notification_thread_handle
= notification_thread_handle
;
144 handle
->quit_pipe
= lttng_pipe_open(FD_CLOEXEC
);
145 if (!handle
->quit_pipe
) {
152 rotation_thread_handle_destroy(handle
);
/*
 * Walks queue->list and compares each job's session pointer and type with
 * the `session` and `job_type` arguments; both must match for a job to be
 * considered the same. rotation_thread_enqueue_job() uses the result to
 * avoid queuing a duplicate.
 */
157 * Called with the rotation_thread_timer_queue lock held.
158 * Return true if the same timer job already exists in the queue, false if not.
160 static bool timer_job_exists(const struct rotation_thread_timer_queue
*queue
,
161 enum rotation_thread_job_type job_type
,
162 struct ltt_session
*session
)
165 struct rotation_thread_job
*job
;
167 cds_list_for_each_entry (job
, &queue
->list
, head
) {
168 if (job
->session
== session
&& job
->type
== job_type
) {
/*
 * Queue a job of type `job_type` for `session` and wake up the rotation
 * thread.
 *
 * With the queue lock held:
 *   - timer_job_exists() is consulted first so that an already pending,
 *     identical job is not queued a second time;
 *   - otherwise a job is allocated with zmalloc, session_get() is called on
 *     the session, and the job is appended to the tail of queue->list;
 *   - a single byte ('!') is written to the write end of the queue's
 *     wake-up pipe.
 *
 * EAGAIN / EWOULDBLOCK on that write is logged with DBG() as a full wake-up
 * pipe; another write failure message is logged with PERROR().
 * The queue lock is released at the end of the function.
 */
177 void rotation_thread_enqueue_job(struct rotation_thread_timer_queue
*queue
,
178 enum rotation_thread_job_type job_type
,
179 struct ltt_session
*session
)
182 const char dummy
= '!';
183 struct rotation_thread_job
*job
= nullptr;
184 const char *job_type_str
= get_job_type_str(job_type
);
186 pthread_mutex_lock(&queue
->lock
);
187 if (timer_job_exists(queue
, job_type
, session
)) {
189 * This timer job is already pending, we don't need to add
195 job
= zmalloc
<rotation_thread_job
>();
197 PERROR("Failed to allocate rotation thread job of type \"%s\" for session \"%s\"",
202 /* No reason for this to fail as the caller must hold a reference. */
203 (void) session_get(session
);
205 job
->session
= session
;
206 job
->type
= job_type
;
207 cds_list_add_tail(&job
->head
, &queue
->list
);
209 ret
= lttng_write(lttng_pipe_get_writefd(queue
->event_pipe
), &dummy
, sizeof(dummy
));
212 * We do not want to block in the timer handler, the job has
213 * been enqueued in the list, the wakeup pipe is probably full,
214 * the job will be processed when the rotation_thread catches
218 DIAGNOSTIC_IGNORE_LOGICAL_OP
219 if (errno
== EAGAIN
|| errno
== EWOULDBLOCK
) {
222 * Not an error, but would be surprising and indicate
223 * that the rotation thread can't keep up with the
226 DBG("Wake-up pipe of rotation thread job queue is full");
229 PERROR("Failed to wake-up the rotation thread after pushing a job of type \"%s\" for session \"%s\"",
236 pthread_mutex_unlock(&queue
->lock
);
/*
 * Create the rotation thread's poll set (LTTNG_CLOEXEC) and register, for
 * LPOLLIN, the read end of the handle's quit pipe and the read end of the
 * timer queue's wake-up pipe. Each registration failure is reported with
 * ERR().
 *
 * lttng_poll_clean() is called on the poll set near the end of the function,
 * presumably on the error path (the surrounding control flow is not visible
 * in this listing).
 *
 * NOTE(review): the note below announces a poll set of "size 3" while
 * lttng_poll_create() is called with 5; one of the two should be updated.
 */
239 static int init_poll_set(struct lttng_poll_event
*poll_set
, struct rotation_thread_handle
*handle
)
244 * Create pollset with size 3:
245 * - rotation thread quit pipe,
246 * - rotation thread timer queue pipe,
247 * - notification channel sock,
249 ret
= lttng_poll_create(poll_set
, 5, LTTNG_CLOEXEC
);
254 ret
= lttng_poll_add(poll_set
, lttng_pipe_get_readfd(handle
->quit_pipe
), LPOLLIN
);
256 ERR("Failed to add quit pipe read fd to poll set");
260 ret
= lttng_poll_add(
261 poll_set
, lttng_pipe_get_readfd(handle
->rotation_timer_queue
->event_pipe
), LPOLLIN
);
263 ERR("Failed to add rotate_pending fd to poll set");
269 lttng_poll_clean(poll_set
);
/*
 * Release what init_thread_state() set up: clean the state's poll set and,
 * if the global rotate_notification_channel is non-null, destroy it.
 */
273 static void fini_thread_state(struct rotation_thread
*state
)
275 lttng_poll_clean(&state
->events
);
276 if (rotate_notification_channel
) {
277 lttng_notification_channel_destroy(rotate_notification_channel
);
/*
 * Prepare the rotation thread's state:
 *   - zero `*state` and initialize its poll set;
 *   - fill the poll set through init_poll_set();
 *   - create the global rotate_notification_channel on
 *     lttng_session_daemon_notification_endpoint;
 *   - add the channel's socket to the poll set for LPOLLIN.
 * Each failure is reported with ERR().
 */
281 static int init_thread_state(struct rotation_thread_handle
*handle
, struct rotation_thread
*state
)
285 memset(state
, 0, sizeof(*state
));
286 lttng_poll_init(&state
->events
);
288 ret
= init_poll_set(&state
->events
, handle
);
290 ERR("Failed to initialize rotation thread poll set");
294 rotate_notification_channel
=
295 lttng_notification_channel_create(lttng_session_daemon_notification_endpoint
);
296 if (!rotate_notification_channel
) {
297 ERR("Could not create notification channel");
301 ret
= lttng_poll_add(&state
->events
, rotate_notification_channel
->socket
, LPOLLIN
);
303 ERR("Failed to add notification fd to pollset");
/*
 * Ask the session's consumer daemons whether session->chunk_being_archived
 * (asserted non-null) still exists on their side.
 *
 * The sockets of the user space session's consumer are visited first, then
 * those of the kernel session's consumer; each consumer_trace_chunk_exists()
 * query is made with the socket's lock held. Any status other than
 * CONSUMER_TRACE_CHUNK_EXISTS_STATUS_UNKNOWN_CHUNK sets chunk_exists_on_peer.
 *
 * *_rotation_completed receives !chunk_exists_on_peer.
 *
 * Near the end, the session's rotation state is reset to
 * LTTNG_ROTATION_STATE_ERROR, presumably only when a consumer query failed
 * (the condition guarding that call is not visible in this listing).
 *
 * NOTE(review): the user space loop computes relayd_id before taking the
 * socket lock while the kernel loop computes it after taking it; the two
 * loops should probably be made consistent.
 */
311 static void check_session_rotation_pending_on_consumers(struct ltt_session
*session
,
312 bool *_rotation_completed
)
315 struct consumer_socket
*socket
;
316 struct cds_lfht_iter iter
;
317 enum consumer_trace_chunk_exists_status exists_status
;
319 bool chunk_exists_on_peer
= false;
320 enum lttng_trace_chunk_status chunk_status
;
322 LTTNG_ASSERT(session
->chunk_being_archived
);
325 * Check for a local pending rotation on all consumers (32-bit
326 * user space, 64-bit user space, and kernel).
329 if (!session
->ust_session
) {
332 cds_lfht_for_each_entry (
333 session
->ust_session
->consumer
->socks
->ht
, &iter
, socket
, node
.node
) {
334 relayd_id
= session
->ust_session
->consumer
->type
== CONSUMER_DST_LOCAL
?
336 session
->ust_session
->consumer
->net_seq_index
;
338 pthread_mutex_lock(socket
->lock
);
339 ret
= consumer_trace_chunk_exists(socket
,
342 session
->chunk_being_archived
,
345 pthread_mutex_unlock(socket
->lock
);
346 ERR("Error occurred while checking rotation status on consumer daemon");
350 if (exists_status
!= CONSUMER_TRACE_CHUNK_EXISTS_STATUS_UNKNOWN_CHUNK
) {
351 pthread_mutex_unlock(socket
->lock
);
352 chunk_exists_on_peer
= true;
355 pthread_mutex_unlock(socket
->lock
);
359 if (!session
->kernel_session
) {
362 cds_lfht_for_each_entry (
363 session
->kernel_session
->consumer
->socks
->ht
, &iter
, socket
, node
.node
) {
364 pthread_mutex_lock(socket
->lock
);
365 relayd_id
= session
->kernel_session
->consumer
->type
== CONSUMER_DST_LOCAL
?
367 session
->kernel_session
->consumer
->net_seq_index
;
369 ret
= consumer_trace_chunk_exists(socket
,
372 session
->chunk_being_archived
,
375 pthread_mutex_unlock(socket
->lock
);
376 ERR("Error occurred while checking rotation status on consumer daemon");
380 if (exists_status
!= CONSUMER_TRACE_CHUNK_EXISTS_STATUS_UNKNOWN_CHUNK
) {
381 pthread_mutex_unlock(socket
->lock
);
382 chunk_exists_on_peer
= true;
385 pthread_mutex_unlock(socket
->lock
);
391 if (!chunk_exists_on_peer
) {
392 uint64_t chunk_being_archived_id
;
394 chunk_status
= lttng_trace_chunk_get_id(session
->chunk_being_archived
,
395 &chunk_being_archived_id
);
396 LTTNG_ASSERT(chunk_status
== LTTNG_TRACE_CHUNK_STATUS_OK
);
397 DBG("Rotation of trace archive %" PRIu64
398 " of session \"%s\" is complete on all consumers",
399 chunk_being_archived_id
,
402 *_rotation_completed
= !chunk_exists_on_peer
;
404 ret
= session_reset_rotation_state(session
, LTTNG_ROTATION_STATE_ERROR
);
406 ERR("Failed to reset rotation state of session \"%s\"", session
->name
);
/*
 * Steps visible in this listing, in order:
 *   - a session without a chunk_being_archived is tested for at the top
 *     (the action taken in that case is not visible);
 *   - the session's one-shot rotation-pending-check timer is stopped;
 *   - the consumers are queried through
 *     check_session_rotation_pending_on_consumers();
 *   - if the rotation is not complete, or the session is in
 *     LTTNG_ROTATION_STATE_ERROR, control jumps to check_ongoing_rotation;
 *   - otherwise session->last_archived_chunk_name is freed and replaced by a
 *     strdup() of the archived chunk's name, the rotation state is reset to
 *     LTTNG_ROTATION_STATE_COMPLETED and, unless session->quiet_rotation is
 *     set, the notification thread is told the rotation completed;
 *   - at check_ongoing_rotation, a session still in
 *     LTTNG_ROTATION_STATE_ONGOING gets its pending-check timer re-armed with
 *     DEFAULT_ROTATE_PENDING_TIMER.
 *
 * NOTE(review): as far as this listing shows, a strdup() failure is only
 * logged with PERROR(); confirm that readers of last_archived_chunk_name
 * tolerate a null value.
 */
412 * Check if the last rotation was completed, called with session lock held.
413 * Should only return non-zero in the event of a fatal error. Doing so will
414 * shutdown the thread.
417 check_session_rotation_pending(struct ltt_session
*session
,
418 struct notification_thread_handle
*notification_thread_handle
)
421 struct lttng_trace_archive_location
*location
;
422 enum lttng_trace_chunk_status chunk_status
;
423 bool rotation_completed
= false;
424 const char *archived_chunk_name
;
425 uint64_t chunk_being_archived_id
;
427 if (!session
->chunk_being_archived
) {
433 lttng_trace_chunk_get_id(session
->chunk_being_archived
, &chunk_being_archived_id
);
434 LTTNG_ASSERT(chunk_status
== LTTNG_TRACE_CHUNK_STATUS_OK
);
436 DBG("Checking for pending rotation on session \"%s\", trace archive %" PRIu64
,
438 chunk_being_archived_id
);
441 * The rotation-pending check timer of a session is launched in
442 * one-shot mode. If the rotation is incomplete, the rotation
443 * thread will re-enable the pending-check timer.
445 * The timer thread can't stop the timer itself since it is involved
446 * in the check for the timer's quiescence.
448 ret
= timer_session_rotation_pending_check_stop(session
);
450 goto check_ongoing_rotation
;
453 check_session_rotation_pending_on_consumers(session
, &rotation_completed
);
454 if (!rotation_completed
|| session
->rotation_state
== LTTNG_ROTATION_STATE_ERROR
) {
455 goto check_ongoing_rotation
;
459 * Now we can clear the "ONGOING" state in the session. New
460 * rotations can start now.
462 chunk_status
= lttng_trace_chunk_get_name(
463 session
->chunk_being_archived
, &archived_chunk_name
, nullptr);
464 LTTNG_ASSERT(chunk_status
== LTTNG_TRACE_CHUNK_STATUS_OK
);
465 free(session
->last_archived_chunk_name
);
466 session
->last_archived_chunk_name
= strdup(archived_chunk_name
);
467 if (!session
->last_archived_chunk_name
) {
468 PERROR("Failed to duplicate archived chunk name");
470 session_reset_rotation_state(session
, LTTNG_ROTATION_STATE_COMPLETED
);
472 if (!session
->quiet_rotation
) {
473 location
= session_get_trace_archive_location(session
);
474 ret
= notification_thread_command_session_rotation_completed(
475 notification_thread_handle
,
477 session
->last_archived_chunk_id
.value
,
479 lttng_trace_archive_location_put(location
);
480 if (ret
!= LTTNG_OK
) {
481 ERR("Failed to notify notification thread of completed rotation for session %s",
487 check_ongoing_rotation
:
488 if (session
->rotation_state
== LTTNG_ROTATION_STATE_ONGOING
) {
489 chunk_status
= lttng_trace_chunk_get_id(session
->chunk_being_archived
,
490 &chunk_being_archived_id
);
491 LTTNG_ASSERT(chunk_status
== LTTNG_TRACE_CHUNK_STATUS_OK
);
493 DBG("Rotation of trace archive %" PRIu64
" is still pending for session %s",
494 chunk_being_archived_id
,
496 ret
= timer_session_rotation_pending_check_start(session
,
497 DEFAULT_ROTATE_PENDING_TIMER
);
499 ERR("Failed to re-enable rotation pending timer");
/*
 * Start a scheduled, time-based rotation of `session` by calling
 * cmd_rotate_session() with LTTNG_TRACE_CHUNK_COMMAND_TYPE_MOVE_TO_COMPLETED.
 * Both outcomes are logged with DBG(); a rotation that could not be started
 * is deliberately not treated as a fatal error.
 */
509 /* Call with the session and session_list locks held. */
510 static int launch_session_rotation(struct ltt_session
*session
)
513 struct lttng_rotate_session_return rotation_return
;
515 DBG("Launching scheduled time-based rotation on session \"%s\"", session
->name
);
517 ret
= cmd_rotate_session(
518 session
, &rotation_return
, false, LTTNG_TRACE_CHUNK_COMMAND_TYPE_MOVE_TO_COMPLETED
);
519 if (ret
== LTTNG_OK
) {
520 DBG("Scheduled time-based rotation successfully launched on session \"%s\"",
523 /* Don't consider errors as fatal. */
524 DBG("Scheduled time-based rotation aborted for session %s: %s",
526 lttng_strerror(ret
));
/*
 * Dispatch a job according to its type:
 *   - ROTATION_THREAD_JOB_TYPE_SCHEDULED_ROTATION runs
 *     launch_session_rotation();
 *   - ROTATION_THREAD_JOB_TYPE_CHECK_PENDING_ROTATION runs
 *     check_session_rotation_pending().
 * handle_job_queue() calls this function with the session locked.
 */
531 static int run_job(struct rotation_thread_job
*job
,
532 struct ltt_session
*session
,
533 struct notification_thread_handle
*notification_thread_handle
)
538 case ROTATION_THREAD_JOB_TYPE_SCHEDULED_ROTATION
:
539 ret
= launch_session_rotation(session
);
541 case ROTATION_THREAD_JOB_TYPE_CHECK_PENDING_ROTATION
:
542 ret
= check_session_rotation_pending(session
, notification_thread_handle
);
/*
 * Take work off the timer job queue and run it.
 *
 * The queue lock is held only while checking for an empty list and removing
 * the first job. The job is then run through run_job() with the session
 * locked; afterwards the session reference held by the job is dropped with
 * session_put() and the session list is unlocked.
 *
 * NOTE(review): whether this repeats until the list is empty, and where the
 * job itself is released, are not visible in this listing.
 * NOTE(review): the "Session ... not found" DBG() reads session->name; the
 * condition guarding that message is not visible here, so confirm `session`
 * cannot be null when it is evaluated.
 */
550 static int handle_job_queue(struct rotation_thread_handle
*handle
,
551 struct rotation_thread
*state
__attribute__((unused
)),
552 struct rotation_thread_timer_queue
*queue
)
557 struct ltt_session
*session
;
558 struct rotation_thread_job
*job
;
560 /* Take the queue lock only to pop an element from the list. */
561 pthread_mutex_lock(&queue
->lock
);
562 if (cds_list_empty(&queue
->list
)) {
563 pthread_mutex_unlock(&queue
->lock
);
566 job
= cds_list_first_entry(&queue
->list
, typeof(*job
), head
);
567 cds_list_del(&job
->head
);
568 pthread_mutex_unlock(&queue
->lock
);
571 session
= job
->session
;
573 DBG("Session \"%s\" not found", session
->name
!= NULL
? session
->name
: "");
575 * This is a non-fatal error, and we cannot report it to
576 * the user (timer), so just print the error and
577 * continue the processing.
579 * While the timer thread will purge pending signals for
580 * a session on the session's destruction, it is
581 * possible for a job targeting that session to have
582 * already been queued before it was destroyed.
585 session_put(session
);
586 session_unlock_list();
590 session_lock(session
);
591 ret
= run_job(job
, session
, handle
->notification_thread_handle
);
592 session_unlock(session
);
593 /* Release reference held by the job. */
594 session_put(session
);
595 session_unlock_list();
/*
 * React to a notification received on the rotation thread's notification
 * channel. Steps visible in this listing:
 *   - a condition whose type is not
 *     LTTNG_CONDITION_TYPE_SESSION_CONSUMED_SIZE is reported with ERR();
 *   - the session name is read from the condition and the consumed size
 *     from the evaluation;
 *   - the session is looked up by name; not finding it is not fatal since a
 *     session can be destroyed before its notification is handled;
 *   - with the session locked, the notification is ignored when its trigger
 *     differs from session->rotate_trigger;
 *   - the current consumed-size subscription is removed,
 *     cmd_rotate_session() is called, and a new subscription is made at
 *     consumed + session->rotate_size;
 *   - the session is unlocked and released, and the session list unlocked.
 *
 * The three -LTTNG_ERR_ROTATION_* results listed in the switch are only
 * logged with DBG(); other rotation errors are logged with ERR().
 */
608 static int handle_condition(const struct lttng_notification
*notification
,
609 struct notification_thread_handle
*notification_thread_handle
)
612 const char *condition_session_name
= nullptr;
613 enum lttng_condition_type condition_type
;
614 enum lttng_condition_status condition_status
;
615 enum lttng_evaluation_status evaluation_status
;
617 struct ltt_session
*session
;
618 const struct lttng_condition
*condition
=
619 lttng_notification_get_const_condition(notification
);
620 const struct lttng_evaluation
*evaluation
=
621 lttng_notification_get_const_evaluation(notification
);
623 condition_type
= lttng_condition_get_type(condition
);
625 if (condition_type
!= LTTNG_CONDITION_TYPE_SESSION_CONSUMED_SIZE
) {
627 ERR("Condition type and session usage type are not the same");
631 /* Fetch info to test */
632 condition_status
= lttng_condition_session_consumed_size_get_session_name(
633 condition
, &condition_session_name
);
634 if (condition_status
!= LTTNG_CONDITION_STATUS_OK
) {
635 ERR("Session name could not be fetched");
640 lttng_evaluation_session_consumed_size_get_consumed_size(evaluation
, &consumed
);
641 if (evaluation_status
!= LTTNG_EVALUATION_STATUS_OK
) {
642 ERR("Failed to get evaluation");
648 session
= session_find_by_name(condition_session_name
);
650 DBG("Failed to find session while handling notification: notification type = %s, session name = `%s`",
651 lttng_condition_type_str(condition_type
),
652 condition_session_name
);
654 * Not a fatal error: a session can be destroyed before we get
655 * the chance to handle the notification.
658 session_unlock_list();
661 session_lock(session
);
663 if (!lttng_trigger_is_equal(session
->rotate_trigger
,
664 lttng_notification_get_const_trigger(notification
))) {
665 /* Notification does not originate from our rotation trigger. */
670 ret
= unsubscribe_session_consumed_size_rotation(session
, notification_thread_handle
);
675 ret
= cmd_rotate_session(
676 session
, nullptr, false, LTTNG_TRACE_CHUNK_COMMAND_TYPE_MOVE_TO_COMPLETED
);
680 case -LTTNG_ERR_ROTATION_PENDING
:
681 DBG("Rotate already pending, subscribe to the next threshold value");
683 case -LTTNG_ERR_ROTATION_MULTIPLE_AFTER_STOP
:
684 DBG("Rotation already happened since last stop, subscribe to the next threshold value");
686 case -LTTNG_ERR_ROTATION_AFTER_STOP_CLEAR
:
687 DBG("Rotation already happened since last stop and clear, subscribe to the next threshold value");
690 ERR("Failed to rotate on size notification with error: %s", lttng_strerror(ret
));
695 ret
= subscribe_session_consumed_size_rotation(
696 session
, consumed
+ session
->rotate_size
, notification_thread_handle
);
698 ERR("Failed to subscribe to session consumed size condition");
704 session_unlock(session
);
705 session_put(session
);
706 session_unlock_list();
/*
 * Service activity on the rotation thread's notification channel socket.
 *
 * The global rotate_notification_channel is first asked whether a
 * notification is pending; if so, the next notification is fetched and,
 * when the channel reports LTTNG_NOTIFICATION_CHANNEL_STATUS_OK, passed to
 * handle_condition(). Dropped notifications are not treated as an error; a
 * closed channel or any other status is reported with ERR().
 * lttng_notification_destroy() is called on the notification at the end.
 *
 * `fd` and `state` are unused.
 *
 * The second argument of
 * lttng_notification_channel_has_pending_notification() is the address of
 * the local `notification_pending` flag. It had been corrupted into a stray
 * NOT SIGN character followed by "ification_pending" (an "&not" sequence
 * decoded as an HTML entity) and is restored here.
 */
711 static int handle_notification_channel(int fd
__attribute__((unused
)),
712 struct rotation_thread_handle
*handle
,
713 struct rotation_thread
*state
__attribute__((unused
)))
716 bool notification_pending
;
717 struct lttng_notification
*notification
= nullptr;
718 enum lttng_notification_channel_status status
;
720 status
= lttng_notification_channel_has_pending_notification(rotate_notification_channel
,
721 &notification_pending
);
722 if (status
!= LTTNG_NOTIFICATION_CHANNEL_STATUS_OK
) {
723 ERR("Error occurred while checking for pending notification");
728 if (!notification_pending
) {
733 /* Receive the next notification. */
734 status
= lttng_notification_channel_get_next_notification(rotate_notification_channel
,
738 case LTTNG_NOTIFICATION_CHANNEL_STATUS_OK
:
740 case LTTNG_NOTIFICATION_CHANNEL_STATUS_NOTIFICATIONS_DROPPED
:
741 /* Not an error, we will wait for the next one */
745 case LTTNG_NOTIFICATION_CHANNEL_STATUS_CLOSED
:
746 ERR("Notification channel was closed");
750 /* Unhandled conditions / errors. */
751 ERR("Unknown notification channel status");
756 ret
= handle_condition(notification
, handle
->notification_thread_handle
);
759 lttng_notification_destroy(notification
);
/*
 * Entry point of the rotation thread; `data` is the
 * struct rotation_thread_handle given to lttng_thread_create().
 *
 * The thread registers with RCU and with the health subsystem
 * (HEALTH_SESSIOND_TYPE_ROTATION), initializes its state through
 * init_thread_state(), then waits in lttng_poll_wait() with no timeout (-1).
 * A wait interrupted by a signal (EINTR) is restarted.
 *
 * For each fd reported by the poll set:
 *   - LPOLLERR is reported with ERR();
 *   - activity on the notification channel socket is passed to
 *     handle_notification_channel();
 *   - any other fd is the job queue pipe or the quit pipe. The job queue is
 *     serviced in both cases so that it is flushed before the thread quits;
 *     one byte is then read from the wake-up pipe when the fd is the queue
 *     pipe, otherwise quit pipe activity is logged.
 *
 * On the way out, the state is released with fini_thread_state() and the
 * thread unregisters from the health subsystem and from RCU.
 */
763 static void *thread_rotation(void *data
)
766 struct rotation_thread_handle
*handle
= (rotation_thread_handle
*) data
;
767 struct rotation_thread thread
;
770 DBG("Started rotation thread");
771 rcu_register_thread();
773 health_register(the_health_sessiond
, HEALTH_SESSIOND_TYPE_ROTATION
);
774 health_code_update();
777 ERR("Invalid thread context provided");
781 queue_pipe_fd
= lttng_pipe_get_readfd(handle
->rotation_timer_queue
->event_pipe
);
783 ret
= init_thread_state(handle
, &thread
);
792 DBG("Entering poll wait");
793 ret
= lttng_poll_wait(&thread
.events
, -1);
794 DBG("Poll wait returned (%i)", ret
);
798 * Restart interrupted system call.
800 if (errno
== EINTR
) {
803 ERR("Error encountered during lttng_poll_wait (%i)", ret
);
808 for (i
= 0; i
< fd_count
; i
++) {
809 int fd
= LTTNG_POLL_GETFD(&thread
.events
, i
);
810 uint32_t revents
= LTTNG_POLL_GETEV(&thread
.events
, i
);
812 DBG("Handling fd (%i) activity (%u)", fd
, revents
);
814 if (revents
& LPOLLERR
) {
815 ERR("Polling returned an error on fd %i", fd
);
819 if (fd
== rotate_notification_channel
->socket
) {
820 ret
= handle_notification_channel(fd
, handle
, &thread
);
822 ERR("Error occurred while handling activity on notification channel socket");
826 /* Job queue or quit pipe activity. */
829 * The job queue is serviced if there is
830 * activity on the quit pipe to ensure it is
831 * flushed and all references held in the queue
834 ret
= handle_job_queue(
835 handle
, &thread
, handle
->rotation_timer_queue
);
837 ERR("Failed to handle rotation timer pipe event");
841 if (fd
== queue_pipe_fd
) {
844 ret
= lttng_read(fd
, &buf
, 1);
846 ERR("Failed to read from wakeup pipe (fd = %i)",
851 DBG("Quit pipe activity");
860 fini_thread_state(&thread
);
862 health_unregister(the_health_sessiond
);
863 rcu_thread_offline();
864 rcu_unregister_thread();
/*
 * Shutdown function passed to lttng_thread_create() by
 * launch_rotation_thread(). `thread_data` is the
 * struct rotation_thread_handle; notify_thread_pipe() is called on the write
 * end of its quit pipe and the function returns true when that call
 * returns 1.
 */
868 static bool shutdown_rotation_thread(void *thread_data
)
870 struct rotation_thread_handle
*handle
= (rotation_thread_handle
*) thread_data
;
871 const int write_fd
= lttng_pipe_get_writefd(handle
->quit_pipe
);
873 return notify_thread_pipe(write_fd
) == 1;
/*
 * Create the thread named "Rotation" through lttng_thread_create(), passing
 * thread_rotation() and shutdown_rotation_thread() along with `handle` as
 * the thread's data, then call lttng_thread_put() on the returned thread.
 *
 * NOTE(review): the handling of a failed lttng_thread_create() and the
 * returned values are not visible in this listing.
 */
876 bool launch_rotation_thread(struct rotation_thread_handle
*handle
)
878 struct lttng_thread
*thread
;
880 thread
= lttng_thread_create(
881 "Rotation", thread_rotation
, shutdown_rotation_thread
, nullptr, handle
);
885 lttng_thread_put(thread
);