fix: sched/tracing: Don't re-read p->state when emitting sched_switch event (v5.18)
Author:     Michael Jeanson <mjeanson@efficios.com>
AuthorDate: Mon, 4 Apr 2022 17:52:57 +0000 (13:52 -0400)
Commit:     Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
CommitDate: Tue, 26 Apr 2022 16:00:44 +0000 (12:00 -0400)
See upstream commit:

  commit fa2c3254d7cfff5f7a916ab928a562d1165f17bb
  Author: Valentin Schneider <valentin.schneider@arm.com>
  Date:   Thu Jan 20 16:25:19 2022 +0000

    sched/tracing: Don't re-read p->state when emitting sched_switch event

    As of commit

      c6e7bd7afaeb ("sched/core: Optimize ttwu() spinning on p->on_cpu")

    the following sequence becomes possible:

                          p->__state = TASK_INTERRUPTIBLE;
                          __schedule()
                            deactivate_task(p);
      ttwu()
        READ !p->on_rq
        p->__state=TASK_WAKING
                            trace_sched_switch()
                              __trace_sched_switch_state()
                                task_state_index()
                                  return 0;

    TASK_WAKING isn't in TASK_REPORT, so the task appears as TASK_RUNNING in
    the trace event.

    Prevent this by pushing the value read from __schedule() down the trace
    event.
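
In other words, the fix makes the scheduler hand the tracepoint a snapshot of the
state it acted on. The following standalone sketch is illustration only, not the
patch's code: task_struct, READ_ONCE() and trace_sched_switch() are stripped-down
userspace stand-ins, and only the four-argument prototype mirrors the new 5.18
tracepoint signature used in the diff below.

#include <stdbool.h>
#include <stdio.h>

/* Minimal stand-ins so the sketch compiles in userspace; the real kernel
 * task_struct and READ_ONCE() are of course much richer than this. */
struct task_struct { unsigned int __state; };
#define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))

#define TASK_INTERRUPTIBLE	0x0001
#define TASK_WAKING		0x0200

/* Stand-in for the tracepoint: with the fix it consumes the snapshot it
 * was handed instead of re-reading prev->__state on its own. */
static void trace_sched_switch(bool preempt, unsigned int prev_state,
			       struct task_struct *prev,
			       struct task_struct *next)
{
	(void)preempt;
	(void)next;
	printf("prev_state recorded: %#x (prev->__state meanwhile: %#x)\n",
	       prev_state, prev->__state);
}

int main(void)
{
	struct task_struct prev = { .__state = TASK_INTERRUPTIBLE };
	struct task_struct next = { .__state = 0 };

	/* __schedule() samples the state exactly once, up front... */
	unsigned int prev_state = READ_ONCE(prev.__state);

	/* ...so even when a concurrent ttwu() rewrites it (simulated here
	 * by a plain store)... */
	prev.__state = TASK_WAKING;

	/* ...the event still carries the value the scheduler acted on. */
	trace_sched_switch(false, prev_state, &prev, &next);
	return 0;
}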

Change-Id: I46743cd006be4b4d573cae2d77df7d6d16744d04
Signed-off-by: Michael Jeanson <mjeanson@efficios.com>
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
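
To make the "appears as TASK_RUNNING" part of the race concrete, here is a
standalone sketch (not from the patch; the TASK_* bit values and the fls()-style
index are assumptions mirroring include/linux/sched.h around v5.18) of what
task_state_index() computes when it races with ttwu() and reads TASK_WAKING.
The diff below adapts the LTTng probe so that this index is computed from the
prev_state value handed down by the scheduler instead of from a re-read p->state.

#include <stdio.h>

/* Task state bits as in include/linux/sched.h around v5.18 (assumed here,
 * not quoted from the patch). */
#define TASK_RUNNING		0x0000
#define TASK_INTERRUPTIBLE	0x0001
#define TASK_UNINTERRUPTIBLE	0x0002
#define __TASK_STOPPED		0x0004
#define __TASK_TRACED		0x0008
#define EXIT_DEAD		0x0010
#define EXIT_ZOMBIE		0x0020
#define TASK_PARKED		0x0040
#define TASK_WAKING		0x0200

#define TASK_REPORT	(TASK_RUNNING | TASK_INTERRUPTIBLE | \
			 TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \
			 __TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE | \
			 TASK_PARKED)

/* fls(): position of the most significant set bit, 0 if no bit is set. */
static unsigned int fls_like(unsigned int x)
{
	unsigned int r = 0;

	while (x) {
		r++;
		x >>= 1;
	}
	return r;
}

int main(void)
{
	/* Roughly what __task_state_index(TASK_WAKING, 0) computes. */
	unsigned int idx = fls_like(TASK_WAKING & TASK_REPORT);

	/* TASK_WAKING is filtered out by the TASK_REPORT mask, so the
	 * index is 0, i.e. the slot used for TASK_RUNNING ("R"). */
	printf("state index for TASK_WAKING: %u\n", idx);
	return 0;
}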
include/instrumentation/events/sched.h

index 91953a6f1ca993990cdd7034efb6aee388357b88..339bec945d3efd84db562934fcb358384d4b2eea 100644
 #ifndef _TRACE_SCHED_DEF_
 #define _TRACE_SCHED_DEF_
 
-#if (LTTNG_LINUX_VERSION_CODE >= LTTNG_KERNEL_VERSION(4,15,0))
+#if (LTTNG_LINUX_VERSION_CODE >= LTTNG_KERNEL_VERSION(5,18,0))
+
+static inline long __trace_sched_switch_state(bool preempt,
+               unsigned int prev_state,
+               struct task_struct *p)
+{
+        unsigned int state;
+
+#ifdef CONFIG_SCHED_DEBUG
+        BUG_ON(p != current);
+#endif /* CONFIG_SCHED_DEBUG */
+
+        /*
+         * Preemption ignores task state, therefore preempted tasks are always
+         * RUNNING (we will not have dequeued if state != RUNNING).
+         */
+        if (preempt)
+                return TASK_REPORT_MAX;
+
+        /*
+         * task_state_index() uses fls() and returns a value from 0-8 range.
+         * Decrement it by 1 (except TASK_RUNNING state i.e 0) before using
+         * it for left shift operation to get the correct task->state
+         * mapping.
+         */
+        state = __task_state_index(prev_state, p->exit_state);
+
+        return state ? (1 << (state - 1)) : state;
+}
+
+#elif (LTTNG_LINUX_VERSION_CODE >= LTTNG_KERNEL_VERSION(4,15,0))
 
 static inline long __trace_sched_switch_state(bool preempt, struct task_struct *p)
 {
@@ -321,43 +351,81 @@ LTTNG_TRACEPOINT_EVENT_INSTANCE(sched_wakeup_template, sched_wakeup_new,
 /*
  * Tracepoint for task switches, performed by the scheduler:
  */
+
+#if (LTTNG_LINUX_VERSION_CODE >= LTTNG_KERNEL_VERSION(5,18,0))
 LTTNG_TRACEPOINT_EVENT(sched_switch,
 
-#if (LTTNG_LINUX_VERSION_CODE >= LTTNG_KERNEL_VERSION(4,4,0))
        TP_PROTO(bool preempt,
-                struct task_struct *prev,
-                struct task_struct *next),
+               unsigned int prev_state,
+               struct task_struct *prev,
+               struct task_struct *next),
 
-       TP_ARGS(preempt, prev, next),
+       TP_ARGS(preempt, prev_state, prev, next),
+
+       TP_FIELDS(
+               ctf_array_text(char, prev_comm, prev->comm, TASK_COMM_LEN)
+               ctf_integer(pid_t, prev_tid, prev->pid)
+               ctf_integer(int, prev_prio, prev->prio - MAX_RT_PRIO)
+#ifdef CONFIG_LTTNG_EXPERIMENTAL_BITWISE_ENUM
+               ctf_enum(task_state, long, prev_state, __trace_sched_switch_state(preempt, prev_state, prev))
 #else
-       TP_PROTO(struct task_struct *prev,
+               ctf_integer(long, prev_state, __trace_sched_switch_state(preempt, prev_state, prev))
+#endif
+               ctf_array_text(char, next_comm, next->comm, TASK_COMM_LEN)
+               ctf_integer(pid_t, next_tid, next->pid)
+               ctf_integer(int, next_prio, next->prio - MAX_RT_PRIO)
+       )
+)
+
+#elif (LTTNG_LINUX_VERSION_CODE >= LTTNG_KERNEL_VERSION(4,4,0))
+
+LTTNG_TRACEPOINT_EVENT(sched_switch,
+
+       TP_PROTO(bool preempt,
+                struct task_struct *prev,
                 struct task_struct *next),
 
-       TP_ARGS(prev, next),
-#endif /* #if (LTTNG_LINUX_VERSION_CODE >= LTTNG_KERNEL_VERSION(4,4,0)) */
+       TP_ARGS(preempt, prev, next),
 
        TP_FIELDS(
                ctf_array_text(char, prev_comm, prev->comm, TASK_COMM_LEN)
                ctf_integer(pid_t, prev_tid, prev->pid)
                ctf_integer(int, prev_prio, prev->prio - MAX_RT_PRIO)
-#if (LTTNG_LINUX_VERSION_CODE >= LTTNG_KERNEL_VERSION(4,4,0))
 #ifdef CONFIG_LTTNG_EXPERIMENTAL_BITWISE_ENUM
                ctf_enum(task_state, long, prev_state, __trace_sched_switch_state(preempt, prev))
 #else
                ctf_integer(long, prev_state, __trace_sched_switch_state(preempt, prev))
 #endif
+               ctf_array_text(char, next_comm, next->comm, TASK_COMM_LEN)
+               ctf_integer(pid_t, next_tid, next->pid)
+               ctf_integer(int, next_prio, next->prio - MAX_RT_PRIO)
+       )
+)
+
 #else
+
+LTTNG_TRACEPOINT_EVENT(sched_switch,
+
+       TP_PROTO(struct task_struct *prev,
+                struct task_struct *next),
+
+       TP_ARGS(prev, next),
+
+       TP_FIELDS(
+               ctf_array_text(char, prev_comm, prev->comm, TASK_COMM_LEN)
+               ctf_integer(pid_t, prev_tid, prev->pid)
+               ctf_integer(int, prev_prio, prev->prio - MAX_RT_PRIO)
 #ifdef CONFIG_LTTNG_EXPERIMENTAL_BITWISE_ENUM
                ctf_enum(task_state, long, prev_state, __trace_sched_switch_state(prev))
 #else
                ctf_integer(long, prev_state, __trace_sched_switch_state(prev))
-#endif
 #endif
                ctf_array_text(char, next_comm, next->comm, TASK_COMM_LEN)
                ctf_integer(pid_t, next_tid, next->pid)
                ctf_integer(int, next_prio, next->prio - MAX_RT_PRIO)
        )
 )
+#endif
 
 /*
  * Tracepoint for a task being migrated:
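
As an aside for readers of the new __trace_sched_switch_state() above: the
expression state ? (1 << (state - 1)) : state undoes the fls() bias so the
recorded prev_state field is once again a task-state bit rather than an index.
A standalone sketch of that round trip, assuming the usual TASK_* bit
assignments (e.g. TASK_INTERRUPTIBLE == 0x1, TASK_UNINTERRUPTIBLE == 0x2):

#include <stdio.h>

/* Takes an fls()-style index as returned by __task_state_index():
 * 0 for RUNNING, 1 for INTERRUPTIBLE, 2 for UNINTERRUPTIBLE, ... */
static long index_to_state_bit(unsigned int state)
{
	/* Same expression as the probe: undo the fls() bias for non-zero
	 * indexes so the recorded value is the original state bit again. */
	return state ? (1 << (state - 1)) : state;
}

int main(void)
{
	/* 0 -> 0x0 (TASK_RUNNING), 1 -> 0x1 (TASK_INTERRUPTIBLE),
	 * 2 -> 0x2 (TASK_UNINTERRUPTIBLE), 3 -> 0x4 (__TASK_STOPPED), ... */
	for (unsigned int idx = 0; idx <= 3; idx++)
		printf("index %u -> prev_state field %#lx\n", idx,
		       index_to_state_bit(idx));
	return 0;
}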