Fix: run-as thread deadlock on itself in restart error path
authorJonathan Rajotte <jonathan.rajotte-julien@efficios.com>
Wed, 16 Jan 2019 18:38:57 +0000 (13:38 -0500)
committerJérémie Galarneau <jeremie.galarneau@efficios.com>
Mon, 11 Feb 2019 22:35:48 +0000 (17:35 -0500)
The deadlock was found using this backtrace

Thread 5:
0  __lll_lock_wait () at ../sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:135
1  0x00007efc6b650023 in __GI___pthread_mutex_lock (mutex=mutex@entry=0x55fc37128400 <worker_lock>) at ../nptl/pthread_mutex_lock.c:78
2  0x000055fc36efbe05 in run_as_destroy_worker () at runas.c:1233
3  0x000055fc36efc2e7 in run_as_restart_worker (worker=<optimized out>) at runas.c:998
4  run_as (cmd=cmd@entry=RUN_AS_UNLINK, data=data@entry=0x7efc5b7fa630, ret_value=ret_value@entry=0x7efc5b7fa510, uid=uid@entry=1000, gid=gid@entry=1000) at runas.c:1033
5  0x000055fc36efc9ce in run_as_unlink (path=path@entry=0x7efc5b7fb690 "/home/joraj/lttng-traces/auto-20190116-111518/20190116T111729-0500-33/kernel/index/channel0_3.idx", uid=uid@entry=1000, gid=gid@entry=1000) at runas.c :1120
6  0x000055fc36ef7feb in utils_unlink_stream_file (path_name=path_name@entry=0x7efc5b7fc7e0 "/home/joraj/lttng-traces/auto-20190116-111518/20190116T111729-0500-33/kernel/index", file_name=file_name@entry=0x7efc500085d4 "channel0_3", size=size@entry=0, count=count@entry=0, uid=uid@entry=1000, gid=gid@entry=1000, suffix=0x55fc36f19b26 ".idx") at utils.c:929
7  0x000055fc36f01d4e in lttng_index_file_create (path_name=path_name@entry=0x7efc500087a0 "/home/joraj/lttng-traces/auto-20190116-111518/20190116T111729-0500-33/kernel", stream_name=stream_name@entry=0x7efc500085d4 "channel0_3", uid=1000, gid=1000, size=0, count=0, major=1, minor=1) at index.c:79
8  0x000055fc36ed9475 in rotate_local_stream (ctx=<optimized out>, stream=0x7efc50008460) at consumer.c:4105
9  0x000055fc36ed98b5 in lttng_consumer_rotate_stream (ctx=ctx@entry=0x55fc37428d80, stream=stream@entry=0x7efc50008460, rotated=rotated@entry=0x7efc5b7fdb27) at consumer.c:4181
10 0x000055fc36ee354e in lttng_kconsumer_read_subbuffer (stream=stream@entry=0x7efc50008460, ctx=ctx@entry=0x55fc37428d80, rotated=rotated@entry=0x7efc5b7fdb27) at kernel-consumer.c:1740
11 0x000055fc36ed7a30 in lttng_consumer_read_subbuffer (stream=0x7efc50008460, ctx=0x55fc37428d80) at consumer.c:3383
12 0x000055fc36ed4b74 in consumer_thread_data_poll (data=0x55fc37428d80) at consumer.c:2751
13 0x00007efc6b64d6db in start_thread (arg=0x7efc5b7fe700) at pthread_create.c:463
14 0x00007efc6af6488f in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95

The owner of the lock is itself:
  print worker_lock.__data.__owner
    $2 = 25725
  thread find 25725
    Thread 5 has target id 'Thread 0x7efc5b7fe700 (LWP 25725)'

The worker_lock is first taken in frame #4: run_as runas.c:1033

  pthread_mutex_lock(&worker_lock);
  if (use_clone()) {
...
    /*
     * If the worker thread crashed the errno is set to EIO. we log
     * the error and  start a new worker process.
     */
    if (ret == -1 && saved_errno == EIO) {
        DBG("Socket closed unexpectedly... "
        "Restarting the worker process");
->      ret = run_as_restart_worker(global_worker);
        if (ret == -1) {
          ERR("Failed to restart worker process.");
          goto err;
        }

Solution
========

Create run_as_restart_worker_no_lock which does not to take the lock on
execution.
Use run_as_restart_worker_no_lock at the run_as error path call site.
Use run_as_restart_worker_no_lock inside run_as_restart_worker while
holding the worker lock to provide identical behaviour to other call sites.

Signed-off-by: Jonathan Rajotte <jonathan.rajotte-julien@efficios.com>
Signed-off-by: Jérémie Galarneau <jeremie.galarneau@efficios.com>
src/common/runas.c

index 227429dd11247eaaf76c81895edef00ecc4de912..c5cf09bfccbe2cd40d819cbc467d42f371cb4d80 100644 (file)
@@ -994,6 +994,51 @@ error_procname_alloc:
        return ret;
 }
 
+static
+void run_as_destroy_worker_no_lock(void)
+{
+       struct run_as_worker *worker = global_worker;
+
+       DBG("Destroying run_as worker");
+       if (!worker) {
+               return;
+       }
+       /* Close unix socket */
+       DBG("Closing run_as worker socket");
+       if (lttcomm_close_unix_sock(worker->sockpair[0])) {
+               PERROR("close");
+       }
+       worker->sockpair[0] = -1;
+       /* Wait for worker. */
+       for (;;) {
+               int status;
+               pid_t wait_ret;
+
+               wait_ret = waitpid(worker->pid, &status, 0);
+               if (wait_ret < 0) {
+                       if (errno == EINTR) {
+                               continue;
+                       }
+                       PERROR("waitpid");
+                       break;
+               }
+
+               if (WIFEXITED(status)) {
+                       LOG(WEXITSTATUS(status) == 0 ? PRINT_DBG : PRINT_ERR,
+                                       DEFAULT_RUN_AS_WORKER_NAME " terminated with status code %d",
+                                       WEXITSTATUS(status));
+                       break;
+               } else if (WIFSIGNALED(status)) {
+                       ERR(DEFAULT_RUN_AS_WORKER_NAME " was killed by signal %d",
+                                       WTERMSIG(status));
+                       break;
+               }
+       }
+       free(worker->procname);
+       free(worker);
+       global_worker = NULL;
+}
+
 static
 int run_as_restart_worker(struct run_as_worker *worker)
 {
@@ -1003,7 +1048,7 @@ int run_as_restart_worker(struct run_as_worker *worker)
        procname = worker->procname;
 
        /* Close socket to run_as worker process and clean up the zombie process */
-       run_as_destroy_worker();
+       run_as_destroy_worker_no_lock();
 
        /* Create a new run_as worker process*/
        ret = run_as_create_worker_no_lock(procname, NULL, NULL);
@@ -1239,47 +1284,7 @@ int run_as_create_worker(const char *procname,
 LTTNG_HIDDEN
 void run_as_destroy_worker(void)
 {
-       struct run_as_worker *worker = global_worker;
-
-       DBG("Destroying run_as worker");
        pthread_mutex_lock(&worker_lock);
-       if (!worker) {
-               goto end;
-       }
-       /* Close unix socket */
-       DBG("Closing run_as worker socket");
-       if (lttcomm_close_unix_sock(worker->sockpair[0])) {
-               PERROR("close");
-       }
-       worker->sockpair[0] = -1;
-       /* Wait for worker. */
-       for (;;) {
-               int status;
-               pid_t wait_ret;
-
-               wait_ret = waitpid(worker->pid, &status, 0);
-               if (wait_ret < 0) {
-                       if (errno == EINTR) {
-                               continue;
-                       }
-                       PERROR("waitpid");
-                       break;
-               }
-
-               if (WIFEXITED(status)) {
-                       LOG(WEXITSTATUS(status) == 0 ? PRINT_DBG : PRINT_ERR,
-                                       DEFAULT_RUN_AS_WORKER_NAME " terminated with status code %d",
-                                       WEXITSTATUS(status));
-                       break;
-               } else if (WIFSIGNALED(status)) {
-                       ERR(DEFAULT_RUN_AS_WORKER_NAME " was killed by signal %d",
-                                       WTERMSIG(status));
-                       break;
-               }
-       }
-       free(worker->procname);
-       free(worker);
-       global_worker = NULL;
-end:
+       run_as_destroy_worker_no_lock();
        pthread_mutex_unlock(&worker_lock);
 }
This page took 0.02816 seconds and 4 git commands to generate.