2 * Copyright (C) 2017 Julien Desfossez <jdesfossez@efficios.com>
3 * Copyright (C) 2018 Jérémie Galarneau <jeremie.galarneau@efficios.com>
5 * SPDX-License-Identifier: GPL-2.0-only
10 #include <lttng/trigger/trigger.h>
11 #include <common/error.h>
12 #include <common/config/session-config.h>
13 #include <common/defaults.h>
14 #include <common/utils.h>
15 #include <common/futex.h>
16 #include <common/align.h>
17 #include <common/time.h>
18 #include <common/hashtable/utils.h>
24 #include <common/kernel-ctl/kernel-ctl.h>
25 #include <lttng/notification/channel-internal.h>
26 #include <lttng/rotate-internal.h>
27 #include <lttng/location-internal.h>
28 #include <lttng/condition/condition-internal.h>
29 #include <lttng/notification/notification-internal.h>
31 #include "rotation-thread.h"
32 #include "lttng-sessiond.h"
33 #include "health-sessiond.h"
38 #include "notification-thread-commands.h"
43 #include <urcu/list.h>
45 struct lttng_notification_channel
*rotate_notification_channel
= NULL
;
47 struct rotation_thread
{
48 struct lttng_poll_event events
;
51 struct rotation_thread_job
{
52 enum rotation_thread_job_type type
;
53 struct ltt_session
*session
;
54 /* List member in struct rotation_thread_timer_queue. */
55 struct cds_list_head head
;
59 * The timer thread enqueues jobs and wakes up the rotation thread.
60 * When the rotation thread wakes up, it empties the queue.
62 struct rotation_thread_timer_queue
{
63 struct lttng_pipe
*event_pipe
;
64 struct cds_list_head list
;
68 struct rotation_thread_handle
{
69 struct rotation_thread_timer_queue
*rotation_timer_queue
;
70 /* Access to the notification thread cmd_queue */
71 struct notification_thread_handle
*notification_thread_handle
;
72 /* Thread-specific quit pipe. */
73 struct lttng_pipe
*quit_pipe
;
77 const char *get_job_type_str(enum rotation_thread_job_type job_type
)
80 case ROTATION_THREAD_JOB_TYPE_CHECK_PENDING_ROTATION
:
81 return "CHECK_PENDING_ROTATION";
82 case ROTATION_THREAD_JOB_TYPE_SCHEDULED_ROTATION
:
83 return "SCHEDULED_ROTATION";
89 struct rotation_thread_timer_queue
*rotation_thread_timer_queue_create(void)
91 struct rotation_thread_timer_queue
*queue
= NULL
;
93 queue
= zmalloc(sizeof(*queue
));
95 PERROR("Failed to allocate timer rotate queue");
99 queue
->event_pipe
= lttng_pipe_open(FD_CLOEXEC
| O_NONBLOCK
);
100 CDS_INIT_LIST_HEAD(&queue
->list
);
101 pthread_mutex_init(&queue
->lock
, NULL
);
106 void rotation_thread_timer_queue_destroy(
107 struct rotation_thread_timer_queue
*queue
)
113 lttng_pipe_destroy(queue
->event_pipe
);
115 pthread_mutex_lock(&queue
->lock
);
116 assert(cds_list_empty(&queue
->list
));
117 pthread_mutex_unlock(&queue
->lock
);
118 pthread_mutex_destroy(&queue
->lock
);
123 * Destroy the thread data previously created by the init function.
125 void rotation_thread_handle_destroy(
126 struct rotation_thread_handle
*handle
)
128 lttng_pipe_destroy(handle
->quit_pipe
);
132 struct rotation_thread_handle
*rotation_thread_handle_create(
133 struct rotation_thread_timer_queue
*rotation_timer_queue
,
134 struct notification_thread_handle
*notification_thread_handle
)
136 struct rotation_thread_handle
*handle
;
138 handle
= zmalloc(sizeof(*handle
));
143 handle
->rotation_timer_queue
= rotation_timer_queue
;
144 handle
->notification_thread_handle
= notification_thread_handle
;
145 handle
->quit_pipe
= lttng_pipe_open(FD_CLOEXEC
);
146 if (!handle
->quit_pipe
) {
153 rotation_thread_handle_destroy(handle
);
158 * Called with the rotation_thread_timer_queue lock held.
159 * Return true if the same timer job already exists in the queue, false if not.
162 bool timer_job_exists(const struct rotation_thread_timer_queue
*queue
,
163 enum rotation_thread_job_type job_type
,
164 struct ltt_session
*session
)
167 struct rotation_thread_job
*job
;
169 cds_list_for_each_entry(job
, &queue
->list
, head
) {
170 if (job
->session
== session
&& job
->type
== job_type
) {
179 void rotation_thread_enqueue_job(struct rotation_thread_timer_queue
*queue
,
180 enum rotation_thread_job_type job_type
,
181 struct ltt_session
*session
)
184 const char dummy
= '!';
185 struct rotation_thread_job
*job
= NULL
;
186 const char *job_type_str
= get_job_type_str(job_type
);
188 pthread_mutex_lock(&queue
->lock
);
189 if (timer_job_exists(queue
, job_type
, session
)) {
191 * This timer job is already pending, we don't need to add
197 job
= zmalloc(sizeof(struct rotation_thread_job
));
199 PERROR("Failed to allocate rotation thread job of type \"%s\" for session \"%s\"",
200 job_type_str
, session
->name
);
203 /* No reason for this to fail as the caller must hold a reference. */
204 (void) session_get(session
);
206 job
->session
= session
;
207 job
->type
= job_type
;
208 cds_list_add_tail(&job
->head
, &queue
->list
);
210 ret
= lttng_write(lttng_pipe_get_writefd(queue
->event_pipe
), &dummy
,
214 * We do not want to block in the timer handler, the job has
215 * been enqueued in the list, the wakeup pipe is probably full,
216 * the job will be processed when the rotation_thread catches
219 if (errno
== EAGAIN
|| errno
== EWOULDBLOCK
) {
221 * Not an error, but would be surprising and indicate
222 * that the rotation thread can't keep up with the
225 DBG("Wake-up pipe of rotation thread job queue is full");
228 PERROR("Failed to wake-up the rotation thread after pushing a job of type \"%s\" for session \"%s\"",
229 job_type_str
, session
->name
);
234 pthread_mutex_unlock(&queue
->lock
);
238 int init_poll_set(struct lttng_poll_event
*poll_set
,
239 struct rotation_thread_handle
*handle
)
244 * Create pollset with size 3:
245 * - rotation thread quit pipe,
246 * - rotation thread timer queue pipe,
247 * - notification channel sock,
249 ret
= lttng_poll_create(poll_set
, 5, LTTNG_CLOEXEC
);
254 ret
= lttng_poll_add(poll_set
,
255 lttng_pipe_get_readfd(handle
->quit_pipe
),
258 ERR("Failed to add quit pipe read fd to poll set");
262 ret
= lttng_poll_add(poll_set
,
263 lttng_pipe_get_readfd(handle
->rotation_timer_queue
->event_pipe
),
266 ERR("Failed to add rotate_pending fd to poll set");
272 lttng_poll_clean(poll_set
);
277 void fini_thread_state(struct rotation_thread
*state
)
279 lttng_poll_clean(&state
->events
);
280 if (rotate_notification_channel
) {
281 lttng_notification_channel_destroy(rotate_notification_channel
);
286 int init_thread_state(struct rotation_thread_handle
*handle
,
287 struct rotation_thread
*state
)
291 memset(state
, 0, sizeof(*state
));
292 lttng_poll_init(&state
->events
);
294 ret
= init_poll_set(&state
->events
, handle
);
296 ERR("Failed to initialize rotation thread poll set");
300 rotate_notification_channel
= lttng_notification_channel_create(
301 lttng_session_daemon_notification_endpoint
);
302 if (!rotate_notification_channel
) {
303 ERR("Could not create notification channel");
307 ret
= lttng_poll_add(&state
->events
, rotate_notification_channel
->socket
,
310 ERR("Failed to add notification fd to pollset");
319 void check_session_rotation_pending_on_consumers(struct ltt_session
*session
,
320 bool *_rotation_completed
)
323 struct consumer_socket
*socket
;
324 struct cds_lfht_iter iter
;
325 enum consumer_trace_chunk_exists_status exists_status
;
327 bool chunk_exists_on_peer
= false;
328 enum lttng_trace_chunk_status chunk_status
;
330 assert(session
->chunk_being_archived
);
333 * Check for a local pending rotation on all consumers (32-bit
334 * user space, 64-bit user space, and kernel).
337 if (!session
->ust_session
) {
340 cds_lfht_for_each_entry(session
->ust_session
->consumer
->socks
->ht
,
341 &iter
, socket
, node
.node
) {
342 relayd_id
= session
->ust_session
->consumer
->type
== CONSUMER_DST_LOCAL
?
344 session
->ust_session
->consumer
->net_seq_index
;
346 pthread_mutex_lock(socket
->lock
);
347 ret
= consumer_trace_chunk_exists(socket
,
349 session
->id
, session
->chunk_being_archived
,
352 pthread_mutex_unlock(socket
->lock
);
353 ERR("Error occurred while checking rotation status on consumer daemon");
357 if (exists_status
!= CONSUMER_TRACE_CHUNK_EXISTS_STATUS_UNKNOWN_CHUNK
) {
358 pthread_mutex_unlock(socket
->lock
);
359 chunk_exists_on_peer
= true;
362 pthread_mutex_unlock(socket
->lock
);
366 if (!session
->kernel_session
) {
369 cds_lfht_for_each_entry(session
->kernel_session
->consumer
->socks
->ht
,
370 &iter
, socket
, node
.node
) {
371 pthread_mutex_lock(socket
->lock
);
372 relayd_id
= session
->kernel_session
->consumer
->type
== CONSUMER_DST_LOCAL
?
374 session
->kernel_session
->consumer
->net_seq_index
;
376 ret
= consumer_trace_chunk_exists(socket
,
378 session
->id
, session
->chunk_being_archived
,
381 pthread_mutex_unlock(socket
->lock
);
382 ERR("Error occurred while checking rotation status on consumer daemon");
386 if (exists_status
!= CONSUMER_TRACE_CHUNK_EXISTS_STATUS_UNKNOWN_CHUNK
) {
387 pthread_mutex_unlock(socket
->lock
);
388 chunk_exists_on_peer
= true;
391 pthread_mutex_unlock(socket
->lock
);
397 if (!chunk_exists_on_peer
) {
398 uint64_t chunk_being_archived_id
;
400 chunk_status
= lttng_trace_chunk_get_id(
401 session
->chunk_being_archived
,
402 &chunk_being_archived_id
);
403 assert(chunk_status
== LTTNG_TRACE_CHUNK_STATUS_OK
);
404 DBG("Rotation of trace archive %" PRIu64
" of session \"%s\" is complete on all consumers",
405 chunk_being_archived_id
,
408 *_rotation_completed
= !chunk_exists_on_peer
;
410 ret
= session_reset_rotation_state(session
,
411 LTTNG_ROTATION_STATE_ERROR
);
413 ERR("Failed to reset rotation state of session \"%s\"",
420 * Check if the last rotation was completed, called with session lock held.
421 * Should only return non-zero in the event of a fatal error. Doing so will
422 * shutdown the thread.
425 int check_session_rotation_pending(struct ltt_session
*session
,
426 struct notification_thread_handle
*notification_thread_handle
)
429 struct lttng_trace_archive_location
*location
;
430 enum lttng_trace_chunk_status chunk_status
;
431 bool rotation_completed
= false;
432 const char *archived_chunk_name
;
433 uint64_t chunk_being_archived_id
;
435 if (!session
->chunk_being_archived
) {
440 chunk_status
= lttng_trace_chunk_get_id(session
->chunk_being_archived
,
441 &chunk_being_archived_id
);
442 assert(chunk_status
== LTTNG_TRACE_CHUNK_STATUS_OK
);
444 DBG("Checking for pending rotation on session \"%s\", trace archive %" PRIu64
,
445 session
->name
, chunk_being_archived_id
);
448 * The rotation-pending check timer of a session is launched in
449 * one-shot mode. If the rotation is incomplete, the rotation
450 * thread will re-enable the pending-check timer.
452 * The timer thread can't stop the timer itself since it is involved
453 * in the check for the timer's quiescence.
455 ret
= timer_session_rotation_pending_check_stop(session
);
457 goto check_ongoing_rotation
;
460 check_session_rotation_pending_on_consumers(session
,
461 &rotation_completed
);
462 if (!rotation_completed
||
463 session
->rotation_state
== LTTNG_ROTATION_STATE_ERROR
) {
464 goto check_ongoing_rotation
;
468 * Now we can clear the "ONGOING" state in the session. New
469 * rotations can start now.
471 chunk_status
= lttng_trace_chunk_get_name(session
->chunk_being_archived
,
472 &archived_chunk_name
, NULL
);
473 assert(chunk_status
== LTTNG_TRACE_CHUNK_STATUS_OK
);
474 free(session
->last_archived_chunk_name
);
475 session
->last_archived_chunk_name
= strdup(archived_chunk_name
);
476 if (!session
->last_archived_chunk_name
) {
477 PERROR("Failed to duplicate archived chunk name");
479 session_reset_rotation_state(session
, LTTNG_ROTATION_STATE_COMPLETED
);
481 if (!session
->quiet_rotation
) {
482 location
= session_get_trace_archive_location(session
);
483 ret
= notification_thread_command_session_rotation_completed(
484 notification_thread_handle
,
488 session
->last_archived_chunk_id
.value
,
490 lttng_trace_archive_location_put(location
);
491 if (ret
!= LTTNG_OK
) {
492 ERR("Failed to notify notification thread of completed rotation for session %s",
498 check_ongoing_rotation
:
499 if (session
->rotation_state
== LTTNG_ROTATION_STATE_ONGOING
) {
500 chunk_status
= lttng_trace_chunk_get_id(
501 session
->chunk_being_archived
,
502 &chunk_being_archived_id
);
503 assert(chunk_status
== LTTNG_TRACE_CHUNK_STATUS_OK
);
505 DBG("Rotation of trace archive %" PRIu64
" is still pending for session %s",
506 chunk_being_archived_id
, session
->name
);
507 ret
= timer_session_rotation_pending_check_start(session
,
508 DEFAULT_ROTATE_PENDING_TIMER
);
510 ERR("Failed to re-enable rotation pending timer");
520 /* Call with the session and session_list locks held. */
522 int launch_session_rotation(struct ltt_session
*session
)
525 struct lttng_rotate_session_return rotation_return
;
527 DBG("Launching scheduled time-based rotation on session \"%s\"",
530 ret
= cmd_rotate_session(session
, &rotation_return
, false,
531 LTTNG_TRACE_CHUNK_COMMAND_TYPE_MOVE_TO_COMPLETED
);
532 if (ret
== LTTNG_OK
) {
533 DBG("Scheduled time-based rotation successfully launched on session \"%s\"",
536 /* Don't consider errors as fatal. */
537 DBG("Scheduled time-based rotation aborted for session %s: %s",
538 session
->name
, lttng_strerror(ret
));
544 int run_job(struct rotation_thread_job
*job
, struct ltt_session
*session
,
545 struct notification_thread_handle
*notification_thread_handle
)
550 case ROTATION_THREAD_JOB_TYPE_SCHEDULED_ROTATION
:
551 ret
= launch_session_rotation(session
);
553 case ROTATION_THREAD_JOB_TYPE_CHECK_PENDING_ROTATION
:
554 ret
= check_session_rotation_pending(session
,
555 notification_thread_handle
);
564 int handle_job_queue(struct rotation_thread_handle
*handle
,
565 struct rotation_thread
*state
,
566 struct rotation_thread_timer_queue
*queue
)
571 struct ltt_session
*session
;
572 struct rotation_thread_job
*job
;
574 /* Take the queue lock only to pop an element from the list. */
575 pthread_mutex_lock(&queue
->lock
);
576 if (cds_list_empty(&queue
->list
)) {
577 pthread_mutex_unlock(&queue
->lock
);
580 job
= cds_list_first_entry(&queue
->list
,
582 cds_list_del(&job
->head
);
583 pthread_mutex_unlock(&queue
->lock
);
586 session
= job
->session
;
589 * This is a non-fatal error, and we cannot report it to
590 * the user (timer), so just print the error and
591 * continue the processing.
593 * While the timer thread will purge pending signals for
594 * a session on the session's destruction, it is
595 * possible for a job targeting that session to have
596 * already been queued before it was destroyed.
599 session_put(session
);
600 session_unlock_list();
604 session_lock(session
);
605 ret
= run_job(job
, session
, handle
->notification_thread_handle
);
606 session_unlock(session
);
607 /* Release reference held by the job. */
608 session_put(session
);
609 session_unlock_list();
623 int handle_condition(const struct lttng_notification
*notification
,
624 struct notification_thread_handle
*notification_thread_handle
)
627 const char *condition_session_name
= NULL
;
628 enum lttng_condition_type condition_type
;
629 enum lttng_condition_status condition_status
;
630 enum lttng_evaluation_status evaluation_status
;
632 struct ltt_session
*session
;
633 const struct lttng_condition
*condition
=
634 lttng_notification_get_const_condition(notification
);
635 const struct lttng_evaluation
*evaluation
=
636 lttng_notification_get_const_evaluation(notification
);
638 condition_type
= lttng_condition_get_type(condition
);
640 if (condition_type
!= LTTNG_CONDITION_TYPE_SESSION_CONSUMED_SIZE
) {
642 ERR("Condition type and session usage type are not the same");
646 /* Fetch info to test */
647 condition_status
= lttng_condition_session_consumed_size_get_session_name(
648 condition
, &condition_session_name
);
649 if (condition_status
!= LTTNG_CONDITION_STATUS_OK
) {
650 ERR("Session name could not be fetched");
654 evaluation_status
= lttng_evaluation_session_consumed_size_get_consumed_size(evaluation
,
656 if (evaluation_status
!= LTTNG_EVALUATION_STATUS_OK
) {
657 ERR("Failed to get evaluation");
663 session
= session_find_by_name(condition_session_name
);
665 DBG("Failed to find session while handling notification: notification type = %s, session name = `%s`",
666 lttng_condition_type_str(condition_type
),
667 condition_session_name
);
669 * Not a fatal error: a session can be destroyed before we get
670 * the chance to handle the notification.
673 session_unlock_list();
676 session_lock(session
);
678 if (!lttng_trigger_is_equal(session
->rotate_trigger
,
679 lttng_notification_get_const_trigger(notification
))) {
680 /* Notification does not originate from our rotation trigger. */
685 ret
= unsubscribe_session_consumed_size_rotation(session
,
686 notification_thread_handle
);
691 ret
= cmd_rotate_session(
692 session
, NULL
, false, LTTNG_TRACE_CHUNK_COMMAND_TYPE_MOVE_TO_COMPLETED
);
696 case -LTTNG_ERR_ROTATION_PENDING
:
697 DBG("Rotate already pending, subscribe to the next threshold value");
699 case -LTTNG_ERR_ROTATION_MULTIPLE_AFTER_STOP
:
700 DBG("Rotation already happened since last stop, subscribe to the next threshold value");
702 case -LTTNG_ERR_ROTATION_AFTER_STOP_CLEAR
:
703 DBG("Rotation already happened since last stop and clear, subscribe to the next threshold value");
706 ERR("Failed to rotate on size notification with error: %s", lttng_strerror(ret
));
711 ret
= subscribe_session_consumed_size_rotation(
712 session
, consumed
+ session
->rotate_size
, notification_thread_handle
);
714 ERR("Failed to subscribe to session consumed size condition");
720 session_unlock(session
);
721 session_put(session
);
722 session_unlock_list();
728 int handle_notification_channel(int fd
,
729 struct rotation_thread_handle
*handle
,
730 struct rotation_thread
*state
)
733 bool notification_pending
;
734 struct lttng_notification
*notification
= NULL
;
735 enum lttng_notification_channel_status status
;
737 status
= lttng_notification_channel_has_pending_notification(
738 rotate_notification_channel
, ¬ification_pending
);
739 if (status
!= LTTNG_NOTIFICATION_CHANNEL_STATUS_OK
) {
740 ERR("Error occurred while checking for pending notification");
745 if (!notification_pending
) {
750 /* Receive the next notification. */
751 status
= lttng_notification_channel_get_next_notification(
752 rotate_notification_channel
,
756 case LTTNG_NOTIFICATION_CHANNEL_STATUS_OK
:
758 case LTTNG_NOTIFICATION_CHANNEL_STATUS_NOTIFICATIONS_DROPPED
:
759 /* Not an error, we will wait for the next one */
762 case LTTNG_NOTIFICATION_CHANNEL_STATUS_CLOSED
:
763 ERR("Notification channel was closed");
767 /* Unhandled conditions / errors. */
768 ERR("Unknown notification channel status");
773 ret
= handle_condition(notification
,
774 handle
->notification_thread_handle
);
777 lttng_notification_destroy(notification
);
782 void *thread_rotation(void *data
)
785 struct rotation_thread_handle
*handle
= data
;
786 struct rotation_thread thread
;
789 DBG("Started rotation thread");
790 rcu_register_thread();
792 health_register(the_health_sessiond
, HEALTH_SESSIOND_TYPE_ROTATION
);
793 health_code_update();
796 ERR("Invalid thread context provided");
800 queue_pipe_fd
= lttng_pipe_get_readfd(
801 handle
->rotation_timer_queue
->event_pipe
);
804 ret
= init_thread_state(handle
, &thread
);
813 DBG("Entering poll wait");
814 ret
= lttng_poll_wait(&thread
.events
, -1);
815 DBG("Poll wait returned (%i)", ret
);
819 * Restart interrupted system call.
821 if (errno
== EINTR
) {
824 ERR("Error encountered during lttng_poll_wait (%i)", ret
);
829 for (i
= 0; i
< fd_count
; i
++) {
830 int fd
= LTTNG_POLL_GETFD(&thread
.events
, i
);
831 uint32_t revents
= LTTNG_POLL_GETEV(&thread
.events
, i
);
833 DBG("Handling fd (%i) activity (%u)",
836 if (revents
& LPOLLERR
) {
837 ERR("Polling returned an error on fd %i", fd
);
841 if (fd
== rotate_notification_channel
->socket
) {
842 ret
= handle_notification_channel(fd
, handle
,
845 ERR("Error occurred while handling activity on notification channel socket");
849 /* Job queue or quit pipe activity. */
852 * The job queue is serviced if there is
853 * activity on the quit pipe to ensure it is
854 * flushed and all references held in the queue
857 ret
= handle_job_queue(handle
, &thread
,
858 handle
->rotation_timer_queue
);
860 ERR("Failed to handle rotation timer pipe event");
864 if (fd
== queue_pipe_fd
) {
867 ret
= lttng_read(fd
, &buf
, 1);
869 ERR("Failed to read from wakeup pipe (fd = %i)", fd
);
873 DBG("Quit pipe activity");
882 fini_thread_state(&thread
);
884 health_unregister(the_health_sessiond
);
885 rcu_thread_offline();
886 rcu_unregister_thread();
891 bool shutdown_rotation_thread(void *thread_data
)
893 struct rotation_thread_handle
*handle
= thread_data
;
894 const int write_fd
= lttng_pipe_get_writefd(handle
->quit_pipe
);
896 return notify_thread_pipe(write_fd
) == 1;
899 bool launch_rotation_thread(struct rotation_thread_handle
*handle
)
901 struct lttng_thread
*thread
;
903 thread
= lttng_thread_create("Rotation",
905 shutdown_rotation_thread
,
911 lttng_thread_put(thread
);