Fix: trace chunk reported unknown before close command execution
authorJérémie Galarneau <jeremie.galarneau@efficios.com>
Mon, 30 Sep 2019 19:57:33 +0000 (15:57 -0400)
committerJérémie Galarneau <jeremie.galarneau@efficios.com>
Tue, 1 Oct 2019 18:25:53 +0000 (14:25 -0400)
The check for the existence of a trace chunk is done by using the
regular 'find' operations on a trace chunk registry in both the relay
and consumer daemons.

The 'find' operations attempt to acquire a reference to the trace
chunk being looked-up. On failure to acquire a reference, a trace
chunk is reported as being "unknown". The rotation completion check
uses this mechanism to determine if a trace chunk is still in use.

A close command defers a given operation until a trace chunk is no
longer in use (when its last reference is dropped). Hence, a thread
can attempt to 'find' a trace chunk, fail to acquire a reference, and
fail to see the effects of the close command.

In other words, the thread that has dropped the last reference to
the chunk could still be running the close command of a trace chunk
that is reported to be "unknown" by the consumer and relay daemons.

To fix this, dedicated "chunk_exists" operations are introduced. These
operations do not attempt to acquire a trace chunk. They only look it
up in the registry's internal hash table. As the removal of the trace
chunk from the hash table is performed _after_ the execution of its
close command, it provides a reliable way to ensure that a chunk that
had a close command could complete it before being reported as
"unknown"/no longer in use.

In terms of provided guarantees, this changes the moment at which a
trace chunk is considered to no longer exist from the moment its last
reference was dropped to the moment after the execution of its close
command has completed.

Signed-off-by: Jérémie Galarneau <jeremie.galarneau@efficios.com>
src/bin/lttng-relayd/main.c
src/bin/lttng-relayd/sessiond-trace-chunks.c
src/bin/lttng-relayd/sessiond-trace-chunks.h
src/common/consumer/consumer.c
src/common/trace-chunk-registry.h
src/common/trace-chunk.c

index 0cd163fe85eae4028ce025c0cd1a5d35b5361132..5b32341cd9ff4cb2647caa3557536326188dcf47 100644 (file)
@@ -2674,8 +2674,8 @@ static int relay_trace_chunk_exists(const struct lttcomm_relayd_hdr *recv_hdr,
        struct lttcomm_relayd_trace_chunk_exists *msg;
        struct lttcomm_relayd_trace_chunk_exists_reply reply = {};
        struct lttng_buffer_view header_view;
-       struct lttng_trace_chunk *chunk = NULL;
        uint64_t chunk_id;
+       bool chunk_exists;
 
        if (!session || !conn->version_check_done) {
                ERR("Trying to close a trace chunk before version check");
@@ -2700,25 +2700,29 @@ static int relay_trace_chunk_exists(const struct lttcomm_relayd_hdr *recv_hdr,
        msg = (typeof(msg)) header_view.data;
        chunk_id = be64toh(msg->chunk_id);
 
-       chunk = sessiond_trace_chunk_registry_get_chunk(
+       ret = sessiond_trace_chunk_registry_chunk_exists(
                        sessiond_trace_chunk_registry,
                        conn->session->sessiond_uuid,
                        conn->session->id,
-                       chunk_id);
-
-       reply = (typeof(reply)) {
-               .generic.ret_code = htobe32((uint32_t) LTTNG_OK),
-               .trace_chunk_exists = !!chunk,
+                       chunk_id, &chunk_exists);
+       /*
+        * If ret is not 0, send the reply and report the error to the caller.
+        * It is a protocol (or internal) error and the session/connection
+        * should be torn down.
+        */
+       reply = (typeof(reply)){
+               .generic.ret_code = htobe32((uint32_t)
+                       (ret == 0 ? LTTNG_OK : LTTNG_ERR_INVALID_PROTOCOL)),
+               .trace_chunk_exists = ret == 0 ? chunk_exists : 0,
        };
-       send_ret = conn->sock->ops->sendmsg(conn->sock,
-                       &reply, sizeof(reply), 0);
+       send_ret = conn->sock->ops->sendmsg(
+                       conn->sock, &reply, sizeof(reply), 0);
        if (send_ret < (ssize_t) sizeof(reply)) {
                ERR("Failed to send \"create trace chunk\" command reply (ret = %zd)",
                                send_ret);
                ret = -1;
        }
 end_no_reply:
-       lttng_trace_chunk_put(chunk);
        return ret;
 }
 
index e7a7d112d93e586ee3ee256bee91a0efb4c09d07..495227226e8a4bcb5f48c62a607ae29e1ce91336 100644 (file)
@@ -476,3 +476,39 @@ sessiond_trace_chunk_registry_get_chunk(
 end:
        return chunk;
 }
+
+int sessiond_trace_chunk_registry_chunk_exists(
+               struct sessiond_trace_chunk_registry *sessiond_registry,
+               const lttng_uuid sessiond_uuid,
+               uint64_t session_id, uint64_t chunk_id, bool *chunk_exists)
+{
+       int ret;
+       struct trace_chunk_registry_ht_element *element;
+       struct trace_chunk_registry_ht_key key;
+
+       lttng_uuid_copy(key.sessiond_uuid, sessiond_uuid);
+       element = trace_chunk_registry_ht_element_find(sessiond_registry, &key);
+       if (!element) {
+               char uuid_str[UUID_STR_LEN];
+
+               lttng_uuid_to_str(sessiond_uuid, uuid_str);
+               /*
+                * While this certainly means that the chunk does not exist,
+                * it is unexpected for a chunk existence query to target a
+                * session daemon that does not have an active
+                * connection/registry. This would indicate a protocol
+                * (or internal) error.
+                */
+               ERR("Failed to find trace chunk registry of sessiond {%s}",
+                               uuid_str);
+               ret = -1;
+               goto end;
+       }
+
+       ret = lttng_trace_chunk_registry_chunk_exists(
+                       element->trace_chunk_registry,
+                       session_id, chunk_id, chunk_exists);
+       trace_chunk_registry_ht_element_put(element);
+end:
+       return ret;
+}
index 39804679c811e7306be044faaecec7b579a7db0d..c62fd3e6fe8e1071b5a798b0699aca3be556bc32 100644 (file)
@@ -55,4 +55,9 @@ sessiond_trace_chunk_registry_get_chunk(
                const lttng_uuid sessiond_uuid,
                uint64_t session_id, uint64_t chunk_id);
 
+int sessiond_trace_chunk_registry_chunk_exists(
+               struct sessiond_trace_chunk_registry *sessiond_registry,
+               const lttng_uuid sessiond_uuid,
+               uint64_t session_id, uint64_t chunk_id, bool *chunk_exists);
+
 #endif /* SESSIOND_TRACE_CHUNK_REGISTRY_H */
index e1426f116ccc939c5f343343993ff4d70d3ea288..0086d748f0c6fe17fd54e87ccffdf7384f2ae447 100644 (file)
@@ -4772,12 +4772,11 @@ enum lttcomm_return_code lttng_consumer_trace_chunk_exists(
 {
        int ret;
        enum lttcomm_return_code ret_code;
-       struct lttng_trace_chunk *chunk;
        char relayd_id_buffer[MAX_INT_DEC_LEN(*relayd_id)];
        const char *relayd_id_str = "(none)";
        const bool is_local_trace = !relayd_id;
        struct consumer_relayd_sock_pair *relayd = NULL;
-       bool chunk_exists_remote;
+       bool chunk_exists_local, chunk_exists_remote;
 
        if (relayd_id) {
                int ret;
@@ -4795,13 +4794,19 @@ enum lttcomm_return_code lttng_consumer_trace_chunk_exists(
        DBG("Consumer trace chunk exists command: relayd_id = %s"
                        ", chunk_id = %" PRIu64, relayd_id_str,
                        chunk_id);
-       chunk = lttng_trace_chunk_registry_find_chunk(
+       ret = lttng_trace_chunk_registry_chunk_exists(
                        consumer_data.chunk_registry, session_id,
-                       chunk_id);
-       DBG("Trace chunk %s locally", chunk ? "exists" : "does not exist");
-       if (chunk) {
+                       chunk_id, &chunk_exists_local);
+       if (ret) {
+               /* Internal error. */
+               ERR("Failed to query the existence of a trace chunk");
+               ret_code = LTTCOMM_CONSUMERD_FATAL;
+               goto end_rcu_unlock;
+       }
+       DBG("Trace chunk %s locally",
+                       chunk_exists_local ? "exists" : "does not exist");
+       if (chunk_exists_local) {
                ret_code = LTTCOMM_CONSUMERD_TRACE_CHUNK_EXISTS_LOCAL;
-               lttng_trace_chunk_put(chunk);
                goto end;
        } else if (is_local_trace) {
                ret_code = LTTCOMM_CONSUMERD_UNKNOWN_TRACE_CHUNK;
index 23df91a3eca24a13ac64b0eb3a0a5d9b983d5ebf..afe942ba913d46d19f32f3ca1dc1d175a8fabad3 100644 (file)
@@ -83,6 +83,16 @@ lttng_trace_chunk_registry_find_chunk(
                const struct lttng_trace_chunk_registry *registry,
                uint64_t session_id, uint64_t chunk_id);
 
+/*
+ * Query the existence of a trace chunk by session_id and chunk_id.
+ *
+ * Returns 0 on success, a negative value on error.
+ */
+LTTNG_HIDDEN
+int lttng_trace_chunk_registry_chunk_exists(
+               const struct lttng_trace_chunk_registry *registry,
+               uint64_t session_id, uint64_t chunk_id, bool *chunk_exists);
+
 /*
  * Look-up an anonymous trace chunk by session_id.
  * A reference is acquired on behalf of the caller.
index 601d402e0843639ac9457269b4916d2ace8dce29..affe3ac2050186043d5abd0590173e2f3296bd11 100644 (file)
@@ -1358,6 +1358,41 @@ lttng_trace_chunk_registry_find_chunk(
                        session_id, &chunk_id);
 }
 
+LTTNG_HIDDEN
+int lttng_trace_chunk_registry_chunk_exists(
+               const struct lttng_trace_chunk_registry *registry,
+               uint64_t session_id, uint64_t chunk_id, bool *chunk_exists)
+{
+       int ret = 0;
+       const struct lttng_trace_chunk_registry_element target_element = {
+               .chunk.id.is_set = true,
+               .chunk.id.value = chunk_id,
+               .session_id = session_id,
+       };
+       const unsigned long element_hash =
+                       lttng_trace_chunk_registry_element_hash(
+                               &target_element);
+       struct cds_lfht_node *published_node;
+       struct cds_lfht_iter iter;
+
+       rcu_read_lock();
+       cds_lfht_lookup(registry->ht,
+                       element_hash,
+                       lttng_trace_chunk_registry_element_match,
+                       &target_element,
+                       &iter);
+       published_node = cds_lfht_iter_get_node(&iter);
+       if (!published_node) {
+               *chunk_exists = false;
+               goto end;
+       }
+
+       *chunk_exists = !cds_lfht_is_node_deleted(published_node);
+end:
+       rcu_read_unlock();
+       return ret;
+}
+
 LTTNG_HIDDEN
 struct lttng_trace_chunk *
 lttng_trace_chunk_registry_find_anonymous_chunk(
This page took 0.032376 seconds and 4 git commands to generate.