Monotonic accurate time

The goal of this design is to provide a monotonic time that is:

- readable from userspace without a system call
- readable from an NMI handler
- readable without disabling interrupts
- readable without disabling preemption
- based on only one clock source (the most precise available: the TSC)
- supported on architectures with a variable TSC frequency

The main difference from the wall time currently implemented in the Linux
kernel is that the time update is done atomically instead of under a write
seqlock. This is what permits reading the time from an NMI handler and from
userspace.

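For contrast, here is a minimal sketch, not part of the design itself, of why
the standard write seqlock cannot be used here: if an NMI interrupts the
writer on its own CPU and the NMI handler reads the clock, the read loop can
never complete.

static seqlock_t time_lock = SEQLOCK_UNLOCKED;
static u64 protected_walltime;

/* Classic seqlock read side, shown only to illustrate the hazard. */
u64 seqlock_read_time(void)
{
        u64 t;
        unsigned int seq;

        do {
                seq = read_seqbegin(&time_lock);
                t = protected_walltime;
                /* If an NMI fires while this CPU holds write_seqlock()
                 * and its handler ends up here, read_seqretry() never
                 * succeeds: the sequence count stays odd and we spin
                 * forever. */
        } while (read_seqretry(&time_lock, seq));
        return t;
}
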
struct time_info {
        u64 tsc;
        u64 freq;       /* TSC frequency, in cycles per ns, so that a
                         * TSC delta divided by freq yields ns */
        u64 walltime;   /* ns */
};

struct time_struct {
        struct time_info time_sel[2];
        long update_count;
};

DEFINE_PER_CPU(struct time_struct, cpu_time);

/* Number of times the scheduler has been called on each CPU */
DEFINE_PER_CPU(unsigned long, sched_nr);

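As a sketch of how sched_nr would be kept up to date (the exact hook point
inside schedule() is an assumption):

/* Called on entry into schedule() : one increment per scheduler
 * invocation on this CPU. Preemption is already disabled there. */
static inline void account_sched_nr(void)
{
        per_cpu(sched_nr, smp_processor_id())++;
}
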
/* On frequency change event */
/* Called in irq context */
void freq_change_cb(unsigned int new_freq)
{
        struct time_struct *this_cpu_time =
                &per_cpu(cpu_time, smp_processor_id());
        struct time_info *write_time, *current_time;

        write_time =
                &this_cpu_time->time_sel[(this_cpu_time->update_count + 1) & 1];
        current_time =
                &this_cpu_time->time_sel[this_cpu_time->update_count & 1];
        write_time->tsc = get_cycles();
        write_time->freq = new_freq;
        /* We accumulate the division imprecision. This is the downside of
         * using a TSC with variable frequency as a time base. */
        write_time->walltime =
                current_time->walltime +
                (write_time->tsc - current_time->tsc) /
                current_time->freq;
        wmb();  /* commit the new time_info before publishing it */
        this_cpu_time->update_count++;
}

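One possible way to wire freq_change_cb() into the existing cpufreq
transition notifier chain; this glue is a sketch only, not part of the
design proper. Note that cpufreq reports frequencies in kHz, so a conversion
to the cycles-per-ns unit used above would still be needed (omitted here).

static int time_freq_notifier(struct notifier_block *nb, unsigned long val,
                void *data)
{
        struct cpufreq_freqs *freqs = data;

        /* freqs->new is in kHz ; conversion to cycles per ns omitted. */
        if (val == CPUFREQ_POSTCHANGE)
                freq_change_cb(freqs->new);
        return NOTIFY_OK;
}

static struct notifier_block time_freq_nb = {
        .notifier_call = time_freq_notifier,
};

/* At init time :
 * cpufreq_register_notifier(&time_freq_nb, CPUFREQ_TRANSITION_NOTIFIER);
 */
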
/* Init cpu freq */
void init_cpu_freq(void)
{
        struct time_struct *this_cpu_time =
                &per_cpu(cpu_time, smp_processor_id());
        struct time_info *current_time;

        memset(this_cpu_time, 0, sizeof(*this_cpu_time));
        current_time =
                &this_cpu_time->time_sel[this_cpu_time->update_count & 1];
        /* Init current time */
        /* Get frequency */
        /* Reset cpus to 0 ns, 0 tsc, start their tsc. */
}

/* After a CPU comes back from hlt */
/* The trick is to resynchronize each CPU, as it comes back up, on the first
 * CPU that is already up. If all CPUs are down, there is no need to
 * increment the walltime: let's simply define the useful walltime on a
 * machine as the time elapsed while at least one CPU is running. If we want,
 * we can use a lower resolution clock to somehow keep track of walltime
 * while no CPU is active. */

void wake_from_hlt(void)
{
        /* TODO */
}

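Purely as an assumption about how that TODO might be filled in, here is a
rough sketch: resynchronize on a reference CPU that stayed up, then publish
through the same double-buffer flip as freq_change_cb(). ref_cpu,
current_tsc_freq() and read_time_of() are hypothetical names, not existing
kernel interfaces.

void wake_from_hlt_sketch(void)
{
        struct time_struct *this_cpu_time =
                &per_cpu(cpu_time, smp_processor_id());
        struct time_info *write_time =
                &this_cpu_time->time_sel[(this_cpu_time->update_count + 1) & 1];

        write_time->tsc = get_cycles();
        /* current_tsc_freq() : hypothetical helper returning this CPU's
         * TSC frequency in cycles per ns. */
        write_time->freq = current_tsc_freq();
        /* read_time_of() : hypothetical cross-CPU variant of read_time(),
         * reading the clock of ref_cpu, the first CPU that came up. */
        write_time->walltime = read_time_of(ref_cpu);
        wmb();  /* publish, as in freq_change_cb() */
        this_cpu_time->update_count++;
}
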
/* Read time from anywhere in the kernel. Returns the walltime in ns. */
/* If update_count changes while we read the time_info, the value read may be
 * invalid. This would happen if we are scheduled out for a period of time
 * long enough to permit 2 frequency changes. We simply restart the read if
 * it happens, which we detect by re-checking the update_count running
 * counter. We detect preemption through the sched_nr counter, incremented
 * within schedule(). This counter is readable by userspace through the
 * vsyscall page. */
u64 read_time(void)
{
        u64 walltime;
        long update_count;
        struct time_struct *this_cpu_time;
        struct time_info *current_time;
        unsigned int cpu;
        long prev_sched_nr;

retry:
        cpu = _smp_processor_id();
        prev_sched_nr = per_cpu(sched_nr, cpu);
        if (cpu != _smp_processor_id())
                goto retry;     /* changed CPU between CPUID and getting
                                 * sched_nr */
        this_cpu_time = &per_cpu(cpu_time, cpu);
        update_count = this_cpu_time->update_count;
        rmb();  /* read update_count before the time_info it selects */
        current_time = &this_cpu_time->time_sel[update_count & 1];
        walltime = current_time->walltime +
                (get_cycles() - current_time->tsc) /
                current_time->freq;
        rmb();  /* read the time_info before re-checking the counters */
        if (per_cpu(sched_nr, cpu) != prev_sched_nr)
                goto retry;     /* been preempted */
        if (this_cpu_time->update_count != update_count)
                goto retry;     /* concurrent update */
        return walltime;
}

/* Userspace */
/* Export all this data to userspace through the vsyscall page. Use a
 * function like read_time to read the walltime. This function can be
 * implemented as-is in userspace because it doesn't need to disable
 * preemption. */

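As a sketch of what that userspace function could look like, assuming the
per-CPU cpu_time and sched_nr arrays are exported read-only in the vsyscall
page as vsys_cpu_time and vsys_sched_nr, and that a vgetcpu()-style helper
returns the current CPU number (all of these names are assumptions, as is
the use of lfence as the read-side barrier):

typedef unsigned long long u64;

/* struct time_info / struct time_struct definitions are assumed to be
 * shared with the kernel through a common header. */

/* x86 userspace TSC read. */
static inline u64 user_get_cycles(void)
{
        unsigned int lo, hi;
        asm volatile("rdtsc" : "=a" (lo), "=d" (hi));
        return ((u64)hi << 32) | lo;
}

u64 user_read_time(void)
{
        u64 walltime;
        long update_count, prev_sched_nr;
        unsigned int cpu;
        struct time_struct *t;
        struct time_info *current_time;

retry:
        cpu = vgetcpu();                        /* assumed helper */
        prev_sched_nr = vsys_sched_nr[cpu];
        if (cpu != vgetcpu())
                goto retry;     /* changed CPU */
        t = &vsys_cpu_time[cpu];
        update_count = t->update_count;
        asm volatile("lfence" ::: "memory");    /* read-side barrier */
        current_time = &t->time_sel[update_count & 1];
        walltime = current_time->walltime +
                (user_get_cycles() - current_time->tsc) /
                current_time->freq;
        asm volatile("lfence" ::: "memory");
        if (vsys_sched_nr[cpu] != prev_sched_nr)
                goto retry;     /* been preempted */
        if (t->update_count != update_count)
                goto retry;     /* concurrent update */
        return walltime;
}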