Fix: consumer: snapshot: assertion on subsequent snapshot
authorJonathan Rajotte <jonathan.rajotte-julien@efficios.com>
Wed, 1 Jun 2022 19:20:55 +0000 (15:20 -0400)
committerJérémie Galarneau <jeremie.galarneau@efficios.com>
Tue, 4 Oct 2022 19:37:27 +0000 (15:37 -0400)
Observed issue
==============

While a snapshot is being taken, the containing folder can disappear
unexpectedly. This can lead to the following errors, which are expected
and mostly handled fine:

PERROR - 14:47:32.002564464 [2922498/2922507]: Failed to open file relative to trace chunk file_path = "channel0_0", flags = 577, mode = 432: No such file or directory (in _lttng_trace_chunk_open_fs_handle_locked() at trace-chunk.cpp:1411)
Error: Failed to open stream file "channel0_0"
Error: Snapshot channel failed

The problem happens on the subsequent snapshot for the session:

 #0  __GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:50
 #1  0x00007fbbdadb3859 in __GI_abort () at abort.c:79
 #2  0x00007fbbdadb3729 in __assert_fail_base (fmt=0x7fbbdaf49588 "%s%s%s:%u: %s%sAssertion `%s' failed.\n%n", assertion=0x55c4212cfbb5 "!stream->trace_chunk", file=0x55c4212cf820 "kernel-co
 #3  0x00007fbbdadc5006 in __GI___assert_fail (assertion=0x55c4212cfbb5 "!stream->trace_chunk", file=0x55c4212cf820 "kernel-consumer/kernel-consumer.cpp", line=188, function=0x55c4212cfb00 "
 #4  0x000055c421268cc6 in lttng_kconsumer_snapshot_channel (channel=0x7fbbc4000b60, key=1, path=0x7fbbd37f8fd4 "", relayd_id=18446744073709551615, nb_packets_per_stream=0) at kernel-consume
 #5  0x000055c42126b39d in lttng_kconsumer_recv_cmd (ctx=0x55c421b80a90, sock=31, consumer_sockpoll=0x7fbbd37fd280) at kernel-consumer/kernel-consumer.cpp:986
 #6  0x000055c4212546d1 in lttng_consumer_recv_cmd (ctx=0x55c421b80a90, sock=31, consumer_sockpoll=0x7fbbd37fd280) at consumer/consumer.cpp:2090
 #7  0x000055c421259963 in consumer_thread_sessiond_poll (data=0x55c421b80a90) at consumer/consumer.cpp:3281
 #8  0x00007fbbdaf8b609 in start_thread (arg=<optimized out>) at pthread_create.c:477
 #9  0x00007fbbdaeb0163 in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95

How to reproduce:

 1. Setting a breakpoint on snapshot_channel() inside
    src/common/ust-consumer/ust-consumer.cpp
 2. When the breakpoint hits, remove the the complete lttng directory
    containing the session data.
 3. Continue the lttng_consumerd process from gdb.
 4. In that case you see a negative return value -1 from
    consumer_stream_create_output_files() inside snapshot_channel().
 5. Take another snapshot and lttng_consumerd crashes because
    of the `assert(!stream->trace_chunk)` in snapshot_channel().

    This last action does not require any breakpoint intervention.

Cause
=====

During the snapshot, the stream is assigned the channel current chunk.
It is expected that the stream does not have a chunk at this point.

The error handling is faulty here, the stream chunk must be
invalidated/reset on error to allow its reuse later on.

The problem exists for both consumer domains (user/kernel).

Solution
========

For the ust consumer, we can directly use the `error_close_stream`
label.

For the kernel consumer, the code path is slightly different since it
does not uses `consumer_stream_close`. Note that `consumer_stream_close`
cannot be used as is for the kernel consumer. The current implementation
partially resembles `consumer_stream_close` at the end of the iteration.
It is extracted to its own function for easier reuse from the new
`error_finalize_stream` label.

Known drawbacks
=========

None.

Fixes: #1352
Signed-off-by: Marcel Hamer <marcel.hamer@windriver.com>
Signed-off-by: Jonathan Rajotte <jonathan.rajotte-julien@efficios.com>
Signed-off-by: Jérémie Galarneau <jeremie.galarneau@efficios.com>
Change-Id: I9fc81917b19aa436ed8e8679672648f2d5baf41a

src/common/kernel-consumer/kernel-consumer.cpp
src/common/ust-consumer/ust-consumer.cpp

index 9bc66a47880d82247644ba8e1b6c4ba7c707253d..d22aea1831978225da90eaf5b5d9507bd248ca4a 100644 (file)
@@ -135,6 +135,30 @@ error:
        return ret;
 }
 
+static void finalize_snapshot_stream(
+               struct lttng_consumer_stream *stream, uint64_t relayd_id)
+{
+       ASSERT_LOCKED(stream->lock);
+
+       if (relayd_id == (uint64_t) -1ULL) {
+               if (stream->out_fd >= 0) {
+                       const int ret = close(stream->out_fd);
+
+                       if (ret < 0) {
+                               PERROR("Failed to close stream snapshot output file descriptor");
+                       }
+
+                       stream->out_fd = -1;
+               }
+       } else {
+               close_relayd_stream(stream);
+               stream->net_seq_idx = (uint64_t) -1ULL;
+       }
+
+       lttng_trace_chunk_put(stream->trace_chunk);
+       stream->trace_chunk = NULL;
+}
+
 /*
  * Take a snapshot of all the stream of a channel
  * RCU read-side lock must be held across this function to ensure existence of
@@ -198,13 +222,13 @@ static int lttng_kconsumer_snapshot_channel(
                        ret = consumer_send_relayd_stream(stream, path);
                        if (ret < 0) {
                                ERR("sending stream to relayd");
-                               goto end_unlock;
+                               goto error_finalize_stream;
                        }
                } else {
                        ret = consumer_stream_create_output_files(stream,
                                        false);
                        if (ret < 0) {
-                               goto end_unlock;
+                               goto error_finalize_stream;
                        }
                        DBG("Kernel consumer snapshot stream (%" PRIu64 ")",
                                        stream->key);
@@ -222,7 +246,7 @@ static int lttng_kconsumer_snapshot_channel(
                        ret = kernctl_buffer_flush(stream->wait_fd);
                        if (ret < 0) {
                                ERR("Failed to flush kernel stream");
-                               goto end_unlock;
+                               goto error_finalize_stream;
                        }
                        goto end_unlock;
                }
@@ -230,19 +254,19 @@ static int lttng_kconsumer_snapshot_channel(
                ret = lttng_kconsumer_take_snapshot(stream);
                if (ret < 0) {
                        ERR("Taking kernel snapshot");
-                       goto end_unlock;
+                       goto error_finalize_stream;
                }
 
                ret = lttng_kconsumer_get_produced_snapshot(stream, &produced_pos);
                if (ret < 0) {
                        ERR("Produced kernel snapshot position");
-                       goto end_unlock;
+                       goto error_finalize_stream;
                }
 
                ret = lttng_kconsumer_get_consumed_snapshot(stream, &consumed_pos);
                if (ret < 0) {
                        ERR("Consumerd kernel snapshot position");
-                       goto end_unlock;
+                       goto error_finalize_stream;
                }
 
                consumed_pos = consumer_get_consume_start_pos(consumed_pos,
@@ -262,7 +286,7 @@ static int lttng_kconsumer_snapshot_channel(
                        if (ret < 0) {
                                if (ret != -EAGAIN) {
                                        PERROR("kernctl_get_subbuf snapshot");
-                                       goto end_unlock;
+                                       goto error_finalize_stream;
                                }
                                DBG("Kernel consumer get subbuf failed. Skipping it.");
                                consumed_pos += stream->max_sb_size;
@@ -312,26 +336,12 @@ static int lttng_kconsumer_snapshot_channel(
                        ret = kernctl_put_subbuf(stream->wait_fd);
                        if (ret < 0) {
                                ERR("Snapshot kernctl_put_subbuf");
-                               goto end_unlock;
+                               goto error_finalize_stream;
                        }
                        consumed_pos += stream->max_sb_size;
                }
 
-               if (relayd_id == (uint64_t) -1ULL) {
-                       if (stream->out_fd >= 0) {
-                               ret = close(stream->out_fd);
-                               if (ret < 0) {
-                                       PERROR("Kernel consumer snapshot close out_fd");
-                                       goto end_unlock;
-                               }
-                               stream->out_fd = -1;
-                       }
-               } else {
-                       close_relayd_stream(stream);
-                       stream->net_seq_idx = (uint64_t) -1ULL;
-               }
-               lttng_trace_chunk_put(stream->trace_chunk);
-               stream->trace_chunk = NULL;
+               finalize_snapshot_stream(stream, relayd_id);
                pthread_mutex_unlock(&stream->lock);
        }
 
@@ -344,6 +354,8 @@ error_put_subbuf:
        if (ret < 0) {
                ERR("Snapshot kernctl_put_subbuf error path");
        }
+error_finalize_stream:
+       finalize_snapshot_stream(stream, relayd_id);
 end_unlock:
        pthread_mutex_unlock(&stream->lock);
 end:
index 30d1f102142781289be72a965a9b8bfe719486ff..b08cafdff9da6c217fb2702a7ad0e55379514de1 100644 (file)
@@ -1089,13 +1089,13 @@ static int snapshot_channel(struct lttng_consumer_channel *channel,
                if (use_relayd) {
                        ret = consumer_send_relayd_stream(stream, path);
                        if (ret < 0) {
-                               goto error_unlock;
+                               goto error_close_stream;
                        }
                } else {
                        ret = consumer_stream_create_output_files(stream,
                                        false);
                        if (ret < 0) {
-                               goto error_unlock;
+                               goto error_close_stream;
                        }
                        DBG("UST consumer snapshot stream (%" PRIu64 ")",
                                        stream->key);
@@ -1117,19 +1117,19 @@ static int snapshot_channel(struct lttng_consumer_channel *channel,
                ret = lttng_ustconsumer_take_snapshot(stream);
                if (ret < 0) {
                        ERR("Taking UST snapshot");
-                       goto error_unlock;
+                       goto error_close_stream;
                }
 
                ret = lttng_ustconsumer_get_produced_snapshot(stream, &produced_pos);
                if (ret < 0) {
                        ERR("Produced UST snapshot position");
-                       goto error_unlock;
+                       goto error_close_stream;
                }
 
                ret = lttng_ustconsumer_get_consumed_snapshot(stream, &consumed_pos);
                if (ret < 0) {
                        ERR("Consumerd UST snapshot position");
-                       goto error_unlock;
+                       goto error_close_stream;
                }
 
                /*
This page took 0.035935 seconds and 4 git commands to generate.