Monotonic accurate time
The goal of this design is to provide a monotonic time that is:

- Readable from userspace without a system call
- Readable from an NMI handler
- Readable without disabling interrupts
- Readable without disabling preemption
- Derived from a single clock source (the most precise available: the TSC)
- Usable on architectures with a variable TSC frequency
Main difference from the wall time currently implemented in the Linux kernel:
the time update is done atomically instead of under a write seqlock. This is
what permits reading the time from an NMI handler and from userspace.
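
For contrast, a sketch of why the write seqlock gets in the way of NMI readers.
xtime_lock and xtime_ns here are stand-ins for the seqlock-protected wall time,
not the exact kernel code; the deadlock scenario is the point:

/* Reader side of a seqlock-protected time. If an NMI fires while the
 * writer on this CPU holds xtime_lock and the NMI handler calls this,
 * read_seqbegin() spins forever: the interrupted writer can never run
 * again to release the lock. The atomic double-buffer update used in
 * this design has no such write-side critical section. */
static seqlock_t xtime_lock = SEQLOCK_UNLOCKED;	/* stand-in */
static u64 xtime_ns;				/* stand-in */

u64 seqlock_read_time(void)
{
	unsigned int seq;
	u64 t;

	do {
		seq = read_seqbegin(&xtime_lock);
		t = xtime_ns;
	} while (read_seqretry(&xtime_lock, seq));
	return t;
}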
struct time_struct {
	struct time_info time_sel[2];	/* double buffer: one readable, one written */
	unsigned long update_count;	/* low bit selects the readable entry */
};

DECLARE_PERCPU(struct time_struct, cpu_time);
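
The layout of struct time_info is not spelled out above; a minimal sketch,
assuming exactly the fields the code below dereferences (it would have to be
defined before struct time_struct):

struct time_info {
	cycles_t tsc;		/* TSC value at the last update */
	u64 walltime;		/* walltime at the last update, in ns */
	unsigned int freq;	/* TSC frequency at the last update; the unit
				 * must be chosen so that a cycle delta
				 * divided by freq yields nanoseconds */
};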
/* Number of times the scheduler is called on each CPU */
DECLARE_PERCPU(unsigned long, sched_nr);
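
The read-side comments further down rely on schedule() bumping this counter
once per invocation; the hook itself is a single line (its exact placement
inside schedule() is an assumption):

	/* In schedule(), once per invocation (placement assumed): */
	per_cpu(sched_nr, smp_processor_id())++;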
/* On frequency change event */
void freq_change_cb(unsigned int new_freq)
{
	struct time_struct *this_cpu_time =
		&per_cpu(cpu_time, smp_processor_id());
	struct time_info *write_time, *current_time;

	write_time =
		&this_cpu_time->time_sel[(this_cpu_time->update_count + 1) & 1];
	current_time =
		&this_cpu_time->time_sel[this_cpu_time->update_count & 1];
	write_time->tsc = get_cycles();
	write_time->freq = new_freq;
	/* We cumulate the division imprecision. This is the downside of using
	 * the TSC with variable frequency as a time base. */
	write_time->walltime =
		current_time->walltime +
		(write_time->tsc - current_time->tsc) / current_time->freq;
	wmb();	/* commit the new time_info before publishing it */
	this_cpu_time->update_count++;
}
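
How freq_change_cb gets called is not shown above. One plausible hookup, as an
assumption, is the stock cpufreq transition notifier chain; note that cpufreq
reports frequencies in kHz, so the unit of new_freq would have to be reconciled
with the division above, and this assumes the notifier runs on the CPU whose
frequency changed:

#include <linux/cpufreq.h>

static int time_cpufreq_notifier(struct notifier_block *nb,
				 unsigned long val, void *data)
{
	struct cpufreq_freqs *freqs = data;

	/* Fold the new frequency in once the transition is complete. */
	if (val == CPUFREQ_POSTCHANGE)
		freq_change_cb(freqs->new);
	return NOTIFY_OK;
}

static struct notifier_block time_cpufreq_nb = {
	.notifier_call = time_cpufreq_notifier,
};

/* At init:
 *	cpufreq_register_notifier(&time_cpufreq_nb, CPUFREQ_TRANSITION_NOTIFIER);
 */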
/* At boot time */
void time_init(void)
{
	struct time_struct *this_cpu_time =
		&per_cpu(cpu_time, smp_processor_id());
	struct time_info *current_time;

	memset(this_cpu_time, 0, sizeof(*this_cpu_time));
	current_time = &this_cpu_time->time_sel[this_cpu_time->update_count & 1];
	/* Init current time */
	current_time->walltime = 0;
	/* Reset cpus to 0 ns, 0 tsc, start their tsc. */
	current_time->tsc = 0;
	current_time->freq = cpu_khz;	/* boot TSC frequency */
}
/* After a CPU comes back from hlt */
/* The trick is to sync all the other CPUs on the first CPU up when they come
 * up. If all CPUs are down, then there is no need to increment the walltime:
 * let's simply define the useful walltime on a machine as the time elapsed
 * while there is a CPU running. If we want, when no CPU is active, we can use
 * a lower resolution clock to somehow keep track of walltime. */
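
A sketch of that resynchronization, under stated assumptions: the helper name
cpu_resync_time is hypothetical, the boot-time TSC reset is assumed to keep
TSCs synchronized across CPUs, and the running CPU's time_info is read without
a retry loop for brevity:

/* Hypothetical helper: called on a CPU waking from hlt, with sync_cpu the
 * first CPU already up. Publishes a fresh time_info derived from sync_cpu's
 * current one, using the same double-buffer protocol as freq_change_cb.
 * The frequency is taken from the running CPU for simplicity. */
void cpu_resync_time(int sync_cpu)
{
	struct time_struct *remote = &per_cpu(cpu_time, sync_cpu);
	struct time_struct *this_cpu_time =
		&per_cpu(cpu_time, smp_processor_id());
	struct time_info *src, *write_time;

	src = &remote->time_sel[remote->update_count & 1];
	write_time =
		&this_cpu_time->time_sel[(this_cpu_time->update_count + 1) & 1];
	write_time->tsc = get_cycles();
	write_time->freq = src->freq;
	write_time->walltime = src->walltime +
		(write_time->tsc - src->tsc) / src->freq;
	wmb();	/* commit before publishing */
	this_cpu_time->update_count++;
}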
/* Read time from anywhere in the kernel. Returns the walltime, in ns. */
/* If the update_count changes while we read the context, the data may be
 * invalid. This would happen if we are scheduled out for a period of time long
 * enough to permit 2 frequency changes. We simply start the loop again if it
 * happens; we detect it by comparing the update_count running counter.
 * We detect preemption with a counter, sched_nr, incremented within
 * schedule(). This counter is readable by userspace through the vsyscall
 * page. */
u64 read_time(void)
{
	struct time_struct *this_cpu_time;
	struct time_info *current_time;
	unsigned long update_count, prev_sched_nr;
	u64 walltime;
	int cpu;

	for (;;) {
		cpu = _smp_processor_id();
		prev_sched_nr = per_cpu(sched_nr, cpu);
		barrier();
		if (cpu != _smp_processor_id())
			continue;	/* changed CPU between CPUID and
					 * reading the per-cpu data */
		this_cpu_time = &per_cpu(cpu_time, cpu);
		update_count = this_cpu_time->update_count;
		current_time = &this_cpu_time->time_sel[update_count & 1];
		walltime = current_time->walltime +
			(get_cycles() - current_time->tsc) /
			current_time->freq;
		barrier();
		if (per_cpu(sched_nr, cpu) != prev_sched_nr)
			continue;	/* been preempted */
		if (this_cpu_time->update_count == update_count)
			break;		/* snapshot is consistent */
	}
	return walltime;
}
/* Export all this data to userspace through the vsyscall page. Use a function
 * like read_time to read the walltime. The function can be implemented as-is
 * in userspace because it does not need to disable preemption. */
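
A userspace-side sketch of that read, under heavy assumptions: vsys_cpu_time
and vsys_sched_nr are hypothetical names for the read-only vsyscall mappings,
and the CPU number is taken from the initial APIC id via CPUID, which is also
what makes the migration check in read_time work:

#include <stdint.h>

struct time_info {
	uint64_t tsc;
	uint64_t walltime;	/* ns */
	unsigned int freq;
};
struct time_struct {
	struct time_info time_sel[2];
	unsigned long update_count;
};

/* Hypothetical: mapped read-only into every process by the kernel. */
extern volatile struct time_struct vsys_cpu_time[];
extern volatile unsigned long vsys_sched_nr[];

static inline uint64_t rdtsc(void)
{
	uint32_t lo, hi;

	asm volatile("rdtsc" : "=a" (lo), "=d" (hi));
	return ((uint64_t)hi << 32) | lo;
}

static inline int cur_cpu(void)
{
	uint32_t eax = 1, ebx, ecx = 0, edx;

	asm volatile("cpuid"
		     : "+a" (eax), "=b" (ebx), "+c" (ecx), "=d" (edx));
	return ebx >> 24;	/* initial APIC id */
}

uint64_t vsys_read_time(void)
{
	volatile struct time_struct *t;
	volatile struct time_info *cur;
	unsigned long update_count, prev_sched_nr;
	uint64_t walltime;
	int cpu;

	for (;;) {
		cpu = cur_cpu();
		prev_sched_nr = vsys_sched_nr[cpu];
		if (cpu != cur_cpu())
			continue;	/* migrated between the two reads */
		t = &vsys_cpu_time[cpu];
		update_count = t->update_count;
		cur = &t->time_sel[update_count & 1];
		walltime = cur->walltime +
			(rdtsc() - cur->tsc) / cur->freq;
		if (vsys_sched_nr[cpu] != prev_sched_nr)
			continue;	/* preempted during the read */
		if (t->update_count == update_count)
			return walltime;
	}
}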