Monotonic accurate time

The goal of this design is to provide a monotonic time:

Readable from userspace without a system call
Readable from an NMI handler
Readable without disabling interrupts
Readable without disabling preemption
Only one clock source (the most precise available: the TSC)
Support for architectures with variable TSC frequency

The main difference from the wall time currently implemented in the Linux
kernel is that the time update is done atomically instead of under a write
seqlock, which permits reading the time from an NMI handler and from
userspace.

struct time_info {
	u64 tsc;	/* TSC value at the last update */
	u64 freq;	/* TSC frequency at the last update */
	u64 walltime;	/* wall time at the last update, in ns */
};

struct time_struct {
	struct time_info time_sel[2];	/* double buffer, selected by the
					 * parity of update_count */
	long update_count;
};

DECLARE_PERCPU(struct time_struct, cpu_time);

/* Number of times the scheduler is called on each CPU */
DECLARE_PERCPU(unsigned long, sched_nr);

/* On frequency change event */
/* Called in irq context */
void freq_change_cb(unsigned int new_freq)
{
	struct time_struct *this_cpu_time =
		&per_cpu(cpu_time, smp_processor_id());
	struct time_info *write_time, *current_time;

	write_time =
		&this_cpu_time->time_sel[(this_cpu_time->update_count+1)&1];
	current_time =
		&this_cpu_time->time_sel[this_cpu_time->update_count&1];
	write_time->tsc = get_cycles();
	write_time->freq = new_freq;
	/* We accumulate the division imprecision. This is the downside of
	 * using the TSC with variable frequency as a time base. */
	write_time->walltime =
		current_time->walltime +
		(write_time->tsc - current_time->tsc) /
		current_time->freq;
	/* Make the new time_info visible before publishing it by flipping
	 * the parity of update_count. */
	wmb();
	this_cpu_time->update_count++;
}
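
To see why the division imprecision matters, here is a small standalone
sketch (not part of the design) that replays repeated updates with integer
division. It assumes freq is expressed in cycles per nanosecond, which is
what the division in freq_change_cb() implies; the numbers are made up.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t walltime = 0;	/* ns, updated with truncating division */
	double exact = 0.0;	/* the same accumulation, without truncation */
	uint64_t freq = 3;	/* assumed: 3 cycles per ns, i.e. 3 GHz */
	uint64_t delta = 1000001;	/* cycles between two updates */
	int i;

	for (i = 0; i < 1000; i++) {
		walltime += delta / freq;	/* loses up to 1 ns each time */
		exact += (double)delta / freq;
	}
	printf("truncated: %llu ns  exact: %.1f ns  drift: %.1f ns\n",
	       (unsigned long long)walltime, exact, exact - walltime);
	return 0;
}

With these made-up values each update truncates about 0.67 ns, so after 1000
frequency changes the accumulated error is in the hundreds of nanoseconds;
the drift only grows with the number of updates, it never shrinks.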


/* Init cpu freq */
void init_cpu_freq(void)
{
	struct time_struct *this_cpu_time =
		&per_cpu(cpu_time, smp_processor_id());
	struct time_info *current_time;

	memset(this_cpu_time, 0, sizeof(*this_cpu_time));
	current_time =
		&this_cpu_time->time_sel[this_cpu_time->update_count&1];
	/* Init current time */
	/* Get frequency */
	/* Reset cpus to 0 ns, 0 tsc, start their tsc. */
}


/* After a CPU comes back from hlt */
/* The trick is to sync the other CPUs on the first CPU that is up when they
 * come back up. If all CPUs are down, then there is no need to increment the
 * walltime: let's simply define the useful walltime on a machine as the time
 * elapsed while at least one CPU is running. If we want, we can use a lower
 * resolution clock to keep track of walltime while no CPU is active. */

void wake_from_hlt(void)
{
	/* TODO */
}
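
Since wake_from_hlt() is still a TODO, here is one possible shape for the
resynchronization described above. It is only a sketch: time_sync_cpu (the
first CPU that came up) and read_time_on() (read_time() executed on that CPU,
e.g. through an IPI) are hypothetical helpers, not part of this design.

extern unsigned int time_sync_cpu;		/* hypothetical: first CPU up */
extern u64 read_time_on(unsigned int cpu);	/* hypothetical: read_time()
						 * run on that CPU */

void wake_from_hlt(void)
{
	struct time_struct *this_cpu_time =
		&per_cpu(cpu_time, smp_processor_id());
	struct time_info *write_time, *current_time;
	u64 now;

	/* Walltime kept by the CPU that never went down. */
	now = read_time_on(time_sync_cpu);

	current_time =
		&this_cpu_time->time_sel[this_cpu_time->update_count&1];
	write_time =
		&this_cpu_time->time_sel[(this_cpu_time->update_count+1)&1];
	write_time->tsc = get_cycles();		/* new local TSC origin */
	write_time->freq = current_time->freq;	/* frequency is unchanged */
	write_time->walltime = now;
	wmb();
	this_cpu_time->update_count++;
}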


/* Read time from anywhere in the kernel. Returns the walltime, in ns. */
/* If the update_count changes while we read the time_info, it may be invalid.
 * This would happen if we are scheduled out for a period of time long enough
 * to permit 2 frequency changes. We simply start the loop again if it happens.
 * We detect it by comparing the update_count running counter.
 * We detect preemption by comparing sched_nr, a counter incremented on each
 * CPU within schedule(). This counter is readable by user space through the
 * vsyscall page. */
u64 read_time(void)
{
	u64 walltime;
	long update_count;
	struct time_struct *this_cpu_time;
	struct time_info *current_time;
	unsigned int cpu;
	unsigned long prev_sched_nr;

	for (;;) {
		cpu = _smp_processor_id();
		prev_sched_nr = per_cpu(sched_nr, cpu);
		if (cpu != _smp_processor_id())
			continue;	/* changed CPU between CPUID and
					 * reading sched_nr */
		this_cpu_time = &per_cpu(cpu_time, cpu);
		update_count = this_cpu_time->update_count;
		current_time = &this_cpu_time->time_sel[update_count&1];
		walltime = current_time->walltime +
			(get_cycles() - current_time->tsc) /
			current_time->freq;
		if (per_cpu(sched_nr, cpu) != prev_sched_nr)
			continue;	/* been preempted */
		if (this_cpu_time->update_count == update_count)
			break;		/* consistent snapshot */
	}
	return walltime;
}

/* Userspace */
/* Export all of this data to user space through the vsyscall page. Use a
 * function like read_time to read the walltime. It can be implemented as-is
 * in userspace because it does not need to disable preemption. */
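
As an illustration of what that vsyscall-exported reader could look like,
here is a hedged sketch. Every name in it (__vsyscall_cpu_time,
__vsyscall_sched_nr, vgetcpu(), rdtsc()) is an assumption made for the
example; the structure simply mirrors read_time() above.

/* Assumed layout: the kernel maps read-only per-CPU copies of cpu_time and
 * sched_nr into the vsyscall page under these illustrative names. */
extern const struct time_struct __vsyscall_cpu_time[];
extern const unsigned long __vsyscall_sched_nr[];
extern unsigned int vgetcpu(void);	/* assumed: cheap current-CPU lookup */
extern u64 rdtsc(void);			/* assumed: raw TSC read */

u64 user_read_time(void)
{
	const struct time_struct *this_cpu_time;
	const struct time_info *current_time;
	unsigned long prev_sched_nr;
	long update_count;
	unsigned int cpu;
	u64 walltime;

	for (;;) {
		cpu = vgetcpu();
		prev_sched_nr = __vsyscall_sched_nr[cpu];
		if (cpu != vgetcpu())
			continue;	/* migrated while reading sched_nr */
		this_cpu_time = &__vsyscall_cpu_time[cpu];
		update_count = this_cpu_time->update_count;
		current_time = &this_cpu_time->time_sel[update_count&1];
		walltime = current_time->walltime +
			(rdtsc() - current_time->tsc) / current_time->freq;
		if (__vsyscall_sched_nr[cpu] != prev_sched_nr)
			continue;	/* preempted, data may be stale */
		if (this_cpu_time->update_count == update_count)
			break;		/* consistent snapshot */
	}
	return walltime;
}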