Monotonic accurate time
The goal of this design is to provide a monotonic time that is:

- Readable from userspace without a system call
- Readable from an NMI handler
- Readable without disabling interrupts
- Readable without disabling preemption
- Derived from a single clock source (the most precise available: the TSC)
- Usable on architectures with a variable TSC frequency
Main difference from the wall time currently implemented in the Linux kernel:
the time update is done atomically instead of under a write seqlock. This is
what permits reading the time from an NMI handler and from userspace.
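
For contrast, a sketch of why the write seqlock gets in the way of NMI readers.
xtime_lock and xtime_ns here are stand-ins for the seqlock-protected wall time,
not the exact kernel code; the deadlock scenario is the point:

/* Reader side of a seqlock-protected time. If an NMI fires while the
 * writer on this CPU holds xtime_lock and the NMI handler calls this,
 * read_seqbegin() spins forever: the interrupted writer can never run
 * again to release the lock. The atomic double-buffer update used in
 * this design has no such write-side critical section. */
static seqlock_t xtime_lock = SEQLOCK_UNLOCKED;	/* stand-in */
static u64 xtime_ns;				/* stand-in */

u64 seqlock_read_time(void)
{
	unsigned int seq;
	u64 t;

	do {
		seq = read_seqbegin(&xtime_lock);
		t = xtime_ns;
	} while (read_seqretry(&xtime_lock, seq));
	return t;
}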
struct time_struct {
	struct time_info time_sel[2];	/* double buffer: one readable, one written */
	unsigned long update_count;	/* low bit selects the readable entry */
};

DECLARE_PERCPU(struct time_struct, cpu_time);
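
The layout of struct time_info is not spelled out above; a minimal sketch,
assuming exactly the fields the code below dereferences (it would have to be
defined before struct time_struct):

struct time_info {
	cycles_t tsc;		/* TSC value at the last update */
	u64 walltime;		/* walltime at the last update, in ns */
	unsigned int freq;	/* TSC frequency at the last update; the unit
				 * must be chosen so that a cycle delta
				 * divided by freq yields nanoseconds */
};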
/* Number of times the scheduler is called on each CPU */
DECLARE_PERCPU(unsigned long, sched_nr);
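
The read-side comments further down rely on schedule() bumping this counter
once per invocation; the hook itself is a single line (its exact placement
inside schedule() is an assumption):

	/* In schedule(), once per invocation (placement assumed): */
	per_cpu(sched_nr, smp_processor_id())++;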
/* On frequency change event */
void freq_change_cb(unsigned int new_freq)
{
	struct time_struct *this_cpu_time =
		&per_cpu(cpu_time, smp_processor_id());
	struct time_info *write_time, *current_time;

	write_time =
		&this_cpu_time->time_sel[(this_cpu_time->update_count + 1) & 1];
	current_time =
		&this_cpu_time->time_sel[this_cpu_time->update_count & 1];
	write_time->tsc = get_cycles();
	write_time->freq = new_freq;
	/* We cumulate the division imprecision. This is the downside of using
	 * the TSC with variable frequency as a time base. */
	write_time->walltime =
		current_time->walltime +
		(write_time->tsc - current_time->tsc) / current_time->freq;
	wmb();	/* commit the new time_info before publishing it */
	this_cpu_time->update_count++;
}
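
How freq_change_cb gets called is not shown above. One plausible hookup, as an
assumption, is the stock cpufreq transition notifier chain; note that cpufreq
reports frequencies in kHz, so the unit of new_freq would have to be reconciled
with the division above, and this assumes the notifier runs on the CPU whose
frequency changed:

#include <linux/cpufreq.h>

static int time_cpufreq_notifier(struct notifier_block *nb,
				 unsigned long val, void *data)
{
	struct cpufreq_freqs *freqs = data;

	/* Fold the new frequency in once the transition is complete. */
	if (val == CPUFREQ_POSTCHANGE)
		freq_change_cb(freqs->new);
	return NOTIFY_OK;
}

static struct notifier_block time_cpufreq_nb = {
	.notifier_call = time_cpufreq_notifier,
};

/* At init:
 *	cpufreq_register_notifier(&time_cpufreq_nb, CPUFREQ_TRANSITION_NOTIFIER);
 */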
/* At boot time */
void time_init(void)
{
	struct time_struct *this_cpu_time =
		&per_cpu(cpu_time, smp_processor_id());
	struct time_info *current_time;

	memset(this_cpu_time, 0, sizeof(*this_cpu_time));
	current_time = &this_cpu_time->time_sel[this_cpu_time->update_count & 1];
	/* Init current time */
	current_time->walltime = 0;
	/* Reset cpus to 0 ns, 0 tsc, start their tsc. */
	current_time->tsc = 0;
	current_time->freq = cpu_khz;	/* boot TSC frequency */
}
/* After a CPU comes back from hlt */
/* The trick is to sync all the other CPUs on the first CPU up when they come
 * up. If all CPUs are down, then there is no need to increment the walltime:
 * let's simply define the useful walltime on a machine as the time elapsed
 * while there is a CPU running. If we want, when no CPU is active, we can use
 * a lower resolution clock to somehow keep track of walltime. */
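
A sketch of that resynchronization, under stated assumptions: the helper name
cpu_resync_time is hypothetical, the boot-time TSC reset is assumed to keep
TSCs synchronized across CPUs, and the running CPU's time_info is read without
a retry loop for brevity:

/* Hypothetical helper: called on a CPU waking from hlt, with sync_cpu the
 * first CPU already up. Publishes a fresh time_info derived from sync_cpu's
 * current one, using the same double-buffer protocol as freq_change_cb.
 * The frequency is taken from the running CPU for simplicity. */
void cpu_resync_time(int sync_cpu)
{
	struct time_struct *remote = &per_cpu(cpu_time, sync_cpu);
	struct time_struct *this_cpu_time =
		&per_cpu(cpu_time, smp_processor_id());
	struct time_info *src, *write_time;

	src = &remote->time_sel[remote->update_count & 1];
	write_time =
		&this_cpu_time->time_sel[(this_cpu_time->update_count + 1) & 1];
	write_time->tsc = get_cycles();
	write_time->freq = src->freq;
	write_time->walltime = src->walltime +
		(write_time->tsc - src->tsc) / src->freq;
	wmb();	/* commit before publishing */
	this_cpu_time->update_count++;
}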
/* Read time from anywhere in the kernel. Returns the walltime, in ns. */
/* If the update_count changes while we read the context, the data may be
 * invalid. This would happen if we are scheduled out for a period of time long
 * enough to permit 2 frequency changes. We simply start the loop again if it
 * happens; we detect it by comparing the update_count running counter.
 * We detect preemption with a counter, sched_nr, incremented within
 * schedule(). This counter is readable by userspace through the vsyscall
 * page. */
u64 read_time(void)
{
	struct time_struct *this_cpu_time;
	struct time_info *current_time;
	unsigned long update_count, prev_sched_nr;
	u64 walltime;
	int cpu;

	for (;;) {
		cpu = _smp_processor_id();
		prev_sched_nr = per_cpu(sched_nr, cpu);
		barrier();
		if (cpu != _smp_processor_id())
			continue;	/* changed CPU between CPUID and
					 * reading the per-cpu data */
		this_cpu_time = &per_cpu(cpu_time, cpu);
		update_count = this_cpu_time->update_count;
		current_time = &this_cpu_time->time_sel[update_count & 1];
		walltime = current_time->walltime +
			(get_cycles() - current_time->tsc) /
			current_time->freq;
		barrier();
		if (per_cpu(sched_nr, cpu) != prev_sched_nr)
			continue;	/* been preempted */
		if (this_cpu_time->update_count == update_count)
			break;		/* snapshot is consistent */
	}
	return walltime;
}
/* Export all this data to userspace through the vsyscall page. Use a function
 * like read_time to read the walltime. The function can be implemented as-is
 * in userspace because it does not need to disable preemption. */
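
A userspace-side sketch of that read, under heavy assumptions: vsys_cpu_time
and vsys_sched_nr are hypothetical names for the read-only vsyscall mappings,
and the CPU number is taken from the initial APIC id via CPUID, which is also
what makes the migration check in read_time work:

#include <stdint.h>

struct time_info {
	uint64_t tsc;
	uint64_t walltime;	/* ns */
	unsigned int freq;
};
struct time_struct {
	struct time_info time_sel[2];
	unsigned long update_count;
};

/* Hypothetical: mapped read-only into every process by the kernel. */
extern volatile struct time_struct vsys_cpu_time[];
extern volatile unsigned long vsys_sched_nr[];

static inline uint64_t rdtsc(void)
{
	uint32_t lo, hi;

	asm volatile("rdtsc" : "=a" (lo), "=d" (hi));
	return ((uint64_t)hi << 32) | lo;
}

static inline int cur_cpu(void)
{
	uint32_t eax = 1, ebx, ecx = 0, edx;

	asm volatile("cpuid"
		     : "+a" (eax), "=b" (ebx), "+c" (ecx), "=d" (edx));
	return ebx >> 24;	/* initial APIC id */
}

uint64_t vsys_read_time(void)
{
	volatile struct time_struct *t;
	volatile struct time_info *cur;
	unsigned long update_count, prev_sched_nr;
	uint64_t walltime;
	int cpu;

	for (;;) {
		cpu = cur_cpu();
		prev_sched_nr = vsys_sched_nr[cpu];
		if (cpu != cur_cpu())
			continue;	/* migrated between the two reads */
		t = &vsys_cpu_time[cpu];
		update_count = t->update_count;
		cur = &t->time_sel[update_count & 1];
		walltime = cur->walltime +
			(rdtsc() - cur->tsc) / cur->freq;
		if (vsys_sched_nr[cpu] != prev_sched_nr)
			continue;	/* preempted during the read */
		if (t->update_count == update_count)
			return walltime;
	}
}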