Fix: relayd: live connection fails to open file during clear
authorJérémie Galarneau <jeremie.galarneau@efficios.com>
Tue, 28 Jul 2020 15:29:18 +0000 (11:29 -0400)
committerJérémie Galarneau <jeremie.galarneau@efficios.com>
Wed, 29 Jul 2020 17:02:17 +0000 (13:02 -0400)
Issue observed
==============

A `session clear` test occasionaly fails on the CI (very rarely, and
more often on PowerPC executors for some reason) with babeltrace
reporting that the live connection was closed by the remote side:

  PASS: tools/clear/test_ust 276 - Waiting for live viewers on url: net://localhost
  07-21 16:17:07.058 23855 23855 E PLUGIN/SRC.CTF.LTTNG-LIVE/VIEWER lttng_live_recv@viewer-connection.c:198 [lttng-live] Remote side has closed connection
  07-21 16:17:07.058 23855 23855 E PLUGIN/SRC.CTF.LTTNG-LIVE/VIEWER lttng_live_session_get_new_streams@viewer-connection.c:1701 [lttng-live] Error receiving get new streams reply
  07-21 16:17:07.058 23855 23855 E PLUGIN/SRC.CTF.LTTNG-LIVE lttng_live_msg_iter_next@lttng-live.c:1665 [lttng-live] Error preparing the next batch of messages: live-iter-status=LTTNG_LIVE_ITERATOR_STATUS_ERROR
  07-21 16:17:07.058 23855 23855 W LIB/MSG-ITER bt_message_iterator_next@iterator.c:864 Component input port message iterator's "next" method failed: iter-addr=0x1014d6a8, iter-upstream-comp-name="lttng-live", iter-upstream-comp-log-level=WARNING, iter-upstream-comp-class-type=SOURCE, iter-upstream-comp-class-name="lttng-live", iter-upstream-comp-class-partial-descr="Connect to an LTTng relay daemon", iter-upstream-port-type=OUTPUT, iter-upstream-port-name="out", status=ERROR
  07-21 16:17:07.059 23855 23855 E PLUGIN/FLT.UTILS.MUXER muxer_upstream_msg_iter_next@muxer.c:454 [muxer] Upstream iterator's next method returned an error: status=ERROR
  07-21 16:17:07.059 23855 23855 E PLUGIN/FLT.UTILS.MUXER validate_muxer_upstream_msg_iters@muxer.c:991 [muxer] Cannot validate muxer's upstream message iterator wrapper: muxer-msg-iter-addr=0x1014d668, muxer-upstream-msg-iter-wrap-addr=0x1014e210
  07-21 16:17:07.059 23855 23855 E PLUGIN/FLT.UTILS.MUXER muxer_msg_iter_next@muxer.c:1415 [muxer] Cannot get next message: comp-addr=0x1014cca8, muxer-comp-addr=0x1015afd8, muxer-msg-iter-addr=0x1014d668, msg-iter-addr=0x1014d598, status=ERROR
  07-21 16:17:07.059 23855 23855 W LIB/MSG-ITER bt_message_iterator_next@iterator.c:864 Component input port message iterator's "next" method failed: iter-addr=0x1014d598, iter-upstream-comp-name="muxer", iter-upstream-comp-log-level=WARNING, iter-upstream-comp-class-type=FILTER, iter-upstream-comp-class-name="muxer", iter-upstream-comp-class-partial-descr="Sort messages from multiple inpu", iter-upstream-port-type=OUTPUT, iter-upstream-port-name="out", status=ERROR
  07-21 16:17:07.059 23855 23855 W LIB/GRAPH consume_graph_sink@graph.c:473 Component's "consume" method failed: status=ERROR, comp-addr=0x1014d128, comp-name="pretty", comp-log-level=WARNING, comp-class-type=SINK, comp-class-name="pretty", comp-class-partial-descr="Pretty-print messages (`text` fo", comp-class-is-frozen=1, comp-class-so-handle-addr=0x10159dd8, comp-class-so-handle-path="/home/jenkins/workspace/lttng-tools_stable-2.12_portbuild/arch/powerpc/babeltrace_version/stable-2.0/build/std/conf/agents/liburcu_version/stable-0.12/test_type/base/deps/build/lib/babeltrace2/plugins/babeltrace-plugin-text.so", comp-input-port-count=1, comp-output-port-count=0
  07-21 16:17:07.059 23855 23855 E CLI cmd_run@babeltrace2.c:2548 Graph failed to complete successfully

  ERROR:    [Babeltrace CLI] (babeltrace2.c:2548)
    Graph failed to complete successfully
  CAUSED BY [libbabeltrace2] (graph.c:473)
    Component's "consume" method failed: status=ERROR, comp-addr=0x1014d128,
    comp-name="pretty", comp-log-level=WARNING, comp-class-type=SINK,
    comp-class-name="pretty", comp-class-partial-descr="Pretty-print messages
    (`text` fo", comp-class-is-frozen=1, comp-class-so-handle-addr=0x10159dd8,
    comp-class-so-handle-path="/home/jenkins/workspace/lttng-tools_stable-2.12_portbuild/arch/powerpc/babeltrace_version/stable-2.0/build/std/conf/agents/liburcu_version/stable-0.12/test_type/base/deps/build/lib/babeltrace2/plugins/babeltrace-plugin-text.so",
    comp-input-port-count=1, comp-output-port-count=0
  CAUSED BY [libbabeltrace2] (iterator.c:864)
    Component input port message iterator's "next" method failed:
    iter-addr=0x1014d598, iter-upstream-comp-name="muxer",
    iter-upstream-comp-log-level=WARNING, iter-upstream-comp-class-type=FILTER,
    iter-upstream-comp-class-name="muxer",
    iter-upstream-comp-class-partial-descr="Sort messages from multiple inpu",
    iter-upstream-port-type=OUTPUT, iter-upstream-port-name="out", status=ERROR
  CAUSED BY [muxer: 'filter.utils.muxer'] (muxer.c:991)
    Cannot validate muxer's upstream message iterator wrapper:
    muxer-msg-iter-addr=0x1014d668, muxer-upstream-msg-iter-wrap-addr=0x1014e210
  CAUSED BY [muxer: 'filter.utils.muxer'] (muxer.c:454)
    Upstream iterator's next method returned an error: status=ERROR
  CAUSED BY [libbabeltrace2] (iterator.c:864)
    Component input port message iterator's "next" method failed:
    iter-addr=0x1014d6a8, iter-upstream-comp-name="lttng-live",
    iter-upstream-comp-log-level=WARNING, iter-upstream-comp-class-type=SOURCE,
    iter-upstream-comp-class-name="lttng-live",
    iter-upstream-comp-class-partial-descr="Connect to an LTTng relay daemon",
    iter-upstream-port-type=OUTPUT, iter-upstream-port-name="out", status=ERROR
  CAUSED BY [lttng-live: 'source.ctf.lttng-live'] (lttng-live.c:1665)
    Error preparing the next batch of messages:
    live-iter-status=LTTNG_LIVE_ITERATOR_STATUS_ERROR
  CAUSED BY [lttng-live: 'source.ctf.lttng-live'] (viewer-connection.c:1701)
    Error receiving get new streams reply
  CAUSED BY [lttng-live: 'source.ctf.lttng-live'] (viewer-connection.c:198)
    Remote side has closed connection
  ok 277 - Clear session J7WXjh7fmMleTCfE
  PASS: tools/clear/test_ust 277 - Clear session J7WXjh7fmMleTCfE
  ok 278 - Clear session J7WXjh7fmMleTCfE
  PASS: tools/clear/test_ust 278 - Clear session J7WXjh7fmMleTCfE
  ok 279 - Stop lttng tracing for session J7WXjh7fmMleTCfE
  PASS: tools/clear/test_ust 279 - Stop lttng tracing for session J7WXjh7fmMleTCfE
  ok 280 - Destroy session J7WXjh7fmMleTCfE
  PASS: tools/clear/test_ust 280 - Destroy session J7WXjh7fmMleTCfE
  # Wait for viewer to exit
  not ok 281 - Babeltrace succeeds

Cause
=====

Looking at the relay daemon logs, it appears that the live client
requests an enumeration of the available streams while a rotation is
ongoing (clear).

Ultimately, this results in the relay daemon attempting to open a
non-existing file:
  PERROR - 16:33:59.242388809 [734380/734387]: Failed to open fs handle to ust/uid/1000/64-bit/chan_0, open() returned: No such file or directory (in fd_tracker_open_fs_handle() at fd-tracker.c:550)

The logs indicate that this file existed at some point. However, it
was unlinked and its newest instance was created in a trace chunk
named "20200727T163359-0400-1".

This chunk name is a temporary name used until the original trace
chunk can be unlinked (cleared) and the newest can be moved in its
place.

The file is being opened as part of the creation of a viewer stream
when make_viewer_stream() fails to find it. This implies that, somehow,
an outdated trace chunk is being used to open the viewer stream's file.

The reason why is that make_viewer_stream is called with the
viewer session's current trace chunk. During a rotation, the use of the
viewer session's current trace chunk is touchy due to the way the
switch-over to a new chunk is handled.

How viewer session/stream trace chunks are rotated
--------------------------------------------------

The viewer polls the relay daemon for new data/metadata to consume using
the `GET_NEXT_INDEX` and `GET_METADATA` commands. Both commands will
indicate to the viewer that it should try again later if a rotation is
ongoing on the "side" of the relay session.

When a rotation is not ongoing, the relay compares the `id` of the
target viewer stream's trace chunk with the relay session's current
trace chunk. If those `id`s don't match, the viewer session's current
trace chunk is then updated to a copy of the relay session's trace
chunk. The viewer stream's files are then closed and re-opened in the
context of the viewer session's now-updated trace chunk.

While the live protocol allows `GET_NEXT_INDEX` and `GET_METADATA` to
communicate to the viewer that it should "retry" later, there is no such
provisions made for the paths that lead to the creation of the viewer
streams. This means viewer streams must be created even if a rotation is
ongoing.

Solution
========

If a rotation is ongoing at the moment of the creation of the viewer
streams, we wish to use a copy of the trace chunk being used by the
relay stream. This way, we can be sure that the streams' files exist.

It is okay for viewer streams to hold references to different copies of
the trace chunks since no user-visible actions are performed when the
reference to those chunks is released. This is different from relay
streams where the detection of the completion of a rotation is done when
the last relay stream releases its reference to a specific trace chunk
instance.

Known drawbacks
===============

None beyond a slight increase (temporary until the next rotation) in the
number of FDs used when a client connects during a session rotation.

Note
====

Since make_viewer_streams now acts on relay and viewer counterparts of
the session and stream objects, the various variables are prefixed with
`relay` and `viewer` to make the code easier to understand.

The locking period of the relay stream is extended to most of the
iteration in make_viewer_stream() rather than only in
viewer_stream_create(). As a result, callers of viewer_stream_create()
must hold the relay stream's lock.

Signed-off-by: Jérémie Galarneau <jeremie.galarneau@efficios.com>
Change-Id: I3b9c52a57835134712eadba0f68f6a21e922356b

src/bin/lttng-relayd/live.c
src/bin/lttng-relayd/viewer-session.h
src/bin/lttng-relayd/viewer-stream.c

index d5f04582419dffec1466c60c4de449cf34948a03..7d6dc1bc20a03a5af6cafe1e5ef170be1b7f5cb8 100644 (file)
@@ -268,8 +268,8 @@ end_unlock:
  *
  * Return 0 on success or else a negative value.
  */
-static int make_viewer_streams(struct relay_session *session,
-               struct lttng_trace_chunk *viewer_trace_chunk,
+static int make_viewer_streams(struct relay_session *relay_session,
+               struct relay_viewer_session *viewer_session,
                enum lttng_viewer_seek seek_t,
                uint32_t *nb_total,
                uint32_t *nb_unsent,
@@ -279,18 +279,19 @@ static int make_viewer_streams(struct relay_session *session,
        int ret;
        struct lttng_ht_iter iter;
        struct ctf_trace *ctf_trace;
+       struct relay_stream *relay_stream = NULL;
 
-       assert(session);
-       ASSERT_LOCKED(session->lock);
+       assert(relay_session);
+       ASSERT_LOCKED(relay_session->lock);
 
-       if (!viewer_trace_chunk) {
+       if (!viewer_session->current_trace_chunk) {
                ERR("Internal error: viewer session associated with session \"%s\" has a NULL trace chunk",
-                               session->session_name);
+                               relay_session->session_name);
                ret = -1;
                goto error;
        }
 
-       if (session->connection_closed) {
+       if (relay_session->connection_closed) {
                *closed = true;
        }
 
@@ -299,10 +300,9 @@ static int make_viewer_streams(struct relay_session *session,
         * used for a the given session id only.
         */
        rcu_read_lock();
-       cds_lfht_for_each_entry(session->ctf_traces_ht->ht, &iter.iter, ctf_trace,
-                       node.node) {
+       cds_lfht_for_each_entry (relay_session->ctf_traces_ht->ht, &iter.iter,
+                       ctf_trace, node.node) {
                bool trace_has_metadata_stream = false;
-               struct relay_stream *stream;
 
                health_code_update();
 
@@ -314,15 +314,23 @@ static int make_viewer_streams(struct relay_session *session,
                 * Iterate over all the streams of the trace to see if we have a
                 * metadata stream.
                 */
-               cds_list_for_each_entry_rcu(
-                               stream, &ctf_trace->stream_list, stream_node)
+               cds_list_for_each_entry_rcu(relay_stream,
+                               &ctf_trace->stream_list, stream_node)
                {
-                       if (stream->is_metadata) {
+                       bool is_metadata_stream;
+
+                       pthread_mutex_lock(&relay_stream->lock);
+                       is_metadata_stream = relay_stream->is_metadata;
+                       pthread_mutex_unlock(&relay_stream->lock);
+
+                       if (is_metadata_stream) {
                                trace_has_metadata_stream = true;
                                break;
                        }
                }
 
+               relay_stream = NULL;
+
                /*
                 * If there is no metadata stream in this trace at the moment
                 * and we never sent one to the viewer, skip the trace. We
@@ -334,35 +342,72 @@ static int make_viewer_streams(struct relay_session *session,
                        continue;
                }
 
-               cds_list_for_each_entry_rcu(stream, &ctf_trace->stream_list, stream_node) {
-                       struct relay_viewer_stream *vstream;
+               cds_list_for_each_entry_rcu(relay_stream,
+                               &ctf_trace->stream_list, stream_node)
+               {
+                       struct relay_viewer_stream *viewer_stream;
 
-                       if (!stream_get(stream)) {
+                       if (!stream_get(relay_stream)) {
                                continue;
                        }
+
+                       pthread_mutex_lock(&relay_stream->lock);
                        /*
                         * stream published is protected by the session lock.
                         */
-                       if (!stream->published) {
+                       if (!relay_stream->published) {
                                goto next;
                        }
-                       vstream = viewer_stream_get_by_id(stream->stream_handle);
-                       if (!vstream) {
+                       viewer_stream = viewer_stream_get_by_id(
+                                       relay_stream->stream_handle);
+                       if (!viewer_stream) {
+                               struct lttng_trace_chunk *viewer_stream_trace_chunk;
+
                                /*
                                 * Save that we sent the metadata stream to the
                                 * viewer. So that we know what trace the viewer
                                 * is aware of.
                                 */
-                               if (stream->is_metadata) {
-                                       ctf_trace->metadata_stream_sent_to_viewer =
-                                                       true;
+                               if (relay_stream->is_metadata) {
+                                       ctf_trace->metadata_stream_sent_to_viewer = true;
                                }
-                               vstream = viewer_stream_create(stream,
-                                               viewer_trace_chunk, seek_t);
-                               if (!vstream) {
+
+                               /*
+                                * If a rotation is ongoing, use a copy of the
+                                * relay stream's chunk to ensure the stream
+                                * files exist.
+                                *
+                                * Otherwise, the viewer session's current trace
+                                * chunk can be used safely.
+                                */
+                               if ((relay_stream->ongoing_rotation.is_set ||
+                                                   relay_session->ongoing_rotation) &&
+                                               relay_stream->trace_chunk) {
+                                       viewer_stream_trace_chunk = lttng_trace_chunk_copy(
+                                                       relay_stream->trace_chunk);
+                                       if (!viewer_stream_trace_chunk) {
+                                               ret = -1;
+                                               ctf_trace_put(ctf_trace);
+                                               goto error_unlock;
+                                       }
+                               } else {
+                                       const bool reference_acquired = lttng_trace_chunk_get(
+                                                       viewer_session->current_trace_chunk);
+
+                                       assert(reference_acquired);
+                                       viewer_stream_trace_chunk =
+                                                       viewer_session->current_trace_chunk;
+                               }
+
+                               viewer_stream = viewer_stream_create(
+                                               relay_stream,
+                                               viewer_stream_trace_chunk,
+                                               seek_t);
+                               lttng_trace_chunk_put(viewer_stream_trace_chunk);
+                               viewer_stream_trace_chunk = NULL;
+                               if (!viewer_stream) {
                                        ret = -1;
                                        ctf_trace_put(ctf_trace);
-                                       stream_put(stream);
                                        goto error_unlock;
                                }
 
@@ -374,36 +419,40 @@ static int make_viewer_streams(struct relay_session *session,
                                 * Ensure a self-reference is preserved even
                                 * after we have put our local reference.
                                 */
-                               if (!viewer_stream_get(vstream)) {
+                               if (!viewer_stream_get(viewer_stream)) {
                                        ERR("Unable to get self-reference on viewer stream, logic error.");
                                        abort();
                                }
                        } else {
-                               if (!vstream->sent_flag && nb_unsent) {
+                               if (!viewer_stream->sent_flag && nb_unsent) {
                                        /* Update number of unsent stream counter. */
                                        (*nb_unsent)++;
                                }
                        }
                        /* Update number of total stream counter. */
                        if (nb_total) {
-                               if (stream->is_metadata) {
-                                       if (!stream->closed ||
-                                                       stream->metadata_received > vstream->metadata_sent) {
+                               if (relay_stream->is_metadata) {
+                                       if (!relay_stream->closed ||
+                                                       relay_stream->metadata_received >
+                                                                       viewer_stream->metadata_sent) {
                                                (*nb_total)++;
                                        }
                                } else {
-                                       if (!stream->closed ||
-                                               !(((int64_t) (stream->prev_data_seq - stream->last_net_seq_num)) >= 0)) {
-
+                                       if (!relay_stream->closed ||
+                                                       !(((int64_t)(relay_stream->prev_data_seq -
+                                                                         relay_stream->last_net_seq_num)) >=
+                                                                       0)) {
                                                (*nb_total)++;
                                        }
                                }
                        }
                        /* Put local reference. */
-                       viewer_stream_put(vstream);
+                       viewer_stream_put(viewer_stream);
                next:
-                       stream_put(stream);
+                       pthread_mutex_unlock(&relay_stream->lock);
+                       stream_put(relay_stream);
                }
+               relay_stream = NULL;
                ctf_trace_put(ctf_trace);
        }
 
@@ -412,6 +461,11 @@ static int make_viewer_streams(struct relay_session *session,
 error_unlock:
        rcu_read_unlock();
 error:
+       if (relay_stream) {
+               pthread_mutex_unlock(&relay_stream->lock);
+               stream_put(relay_stream);
+       }
+
        return ret;
 }
 
@@ -1088,7 +1142,7 @@ int viewer_get_new_streams(struct relay_connection *conn)
 
        pthread_mutex_lock(&session->lock);
        ret = make_viewer_streams(session,
-                       conn->viewer_session->current_trace_chunk,
+                       conn->viewer_session,
                        LTTNG_VIEWER_SEEK_LAST, &nb_total, &nb_unsent,
                        &nb_created, &closed);
        if (ret < 0) {
@@ -1237,7 +1291,7 @@ int viewer_attach_session(struct relay_connection *conn)
        }
 
        ret = make_viewer_streams(session,
-                       conn->viewer_session->current_trace_chunk, seek_type,
+                       conn->viewer_session, seek_type,
                        &nb_streams, NULL, NULL, &closed);
        if (ret < 0) {
                goto end_put_session;
index ebde2dd1dbfc77cfb08c541f898ed89f76253138..2895db0ef43230ed4d40b5dad4a24da434abf22f 100644 (file)
@@ -30,7 +30,6 @@ struct relay_viewer_session {
         */
        struct cds_list_head session_list;      /* RCU list. */
        pthread_mutex_t session_list_lock;      /* Protects list updates. */
-       /* Once set, the current trace chunk of a viewer must not change. */
        struct lttng_trace_chunk *current_trace_chunk;
 };
 
index 05499215a8049e5e98e239b7e8f50d030230328e..f07e062f8d303a6b1aeeb0520a263009a05eae3d 100644 (file)
@@ -34,14 +34,15 @@ static void viewer_stream_destroy_rcu(struct rcu_head *head)
        viewer_stream_destroy(vstream);
 }
 
+/* Relay stream's lock must be held by the caller. */
 struct relay_viewer_stream *viewer_stream_create(struct relay_stream *stream,
-               struct lttng_trace_chunk *viewer_trace_chunk,
+               struct lttng_trace_chunk *trace_chunk,
                enum lttng_viewer_seek seek_t)
 {
        struct relay_viewer_stream *vstream = NULL;
-       const bool acquired_reference = lttng_trace_chunk_get(
-                       viewer_trace_chunk);
+       const bool acquired_reference = lttng_trace_chunk_get(trace_chunk);
 
+       ASSERT_LOCKED(stream->lock);
        if (!acquired_reference) {
                goto error;
        }
@@ -52,8 +53,8 @@ struct relay_viewer_stream *viewer_stream_create(struct relay_stream *stream,
                goto error;
        }
 
-       vstream->stream_file.trace_chunk = viewer_trace_chunk;
-       viewer_trace_chunk = NULL;
+       vstream->stream_file.trace_chunk = trace_chunk;
+       trace_chunk = NULL;
        vstream->path_name = lttng_strndup(stream->path_name, LTTNG_VIEWER_PATH_MAX);
        if (vstream->path_name == NULL) {
                PERROR("relay viewer path_name alloc");
@@ -72,8 +73,6 @@ struct relay_viewer_stream *viewer_stream_create(struct relay_stream *stream,
        }
        vstream->stream = stream;
 
-       pthread_mutex_lock(&stream->lock);
-
        if (stream->is_metadata && stream->trace->viewer_metadata_stream) {
                ERR("Cannot attach viewer metadata stream to trace (busy).");
                goto error_unlock;
@@ -183,7 +182,6 @@ struct relay_viewer_stream *viewer_stream_create(struct relay_stream *stream,
                rcu_assign_pointer(stream->trace->viewer_metadata_stream,
                                vstream);
        }
-       pthread_mutex_unlock(&stream->lock);
 
        /* Globally visible after the add unique. */
        lttng_ht_node_init_u64(&vstream->stream_n, stream->stream_handle);
@@ -198,8 +196,8 @@ error:
        if (vstream) {
                viewer_stream_destroy(vstream);
        }
-       if (viewer_trace_chunk && acquired_reference) {
-               lttng_trace_chunk_put(viewer_trace_chunk);
+       if (trace_chunk && acquired_reference) {
+               lttng_trace_chunk_put(trace_chunk);
        }
        return NULL;
 }
This page took 0.033317 seconds and 4 git commands to generate.