
Monotonic accurate time

The goal of this design is to provide a monotonic time:

Readable from userspace without a system call
Readable from an NMI handler
Readable without disabling interrupts
Readable without disabling preemption
Only one clock source (the most precise available: the TSC)
Support for architectures with a variable TSC frequency.

The main difference from the wall time currently implemented in the Linux
kernel is that the time update is done atomically instead of using a write
seqlock. This permits reading the time from an NMI handler and from userspace.

| 17 | struct time_info { |
| 18 | u64 tsc; |
| 19 | u64 freq; |
| 20 | u64 walltime; |
| 21 | } |
| 22 | |
| 23 | static struct time_struct { |
| 24 | struct time_info time_sel[2]; |
| 25 | long update_count; |
| 26 | } |
| 27 | |
| 28 | DECLARE_PERCPU(struct time_struct, cpu_time); |
| 29 | |
| 30 | /* On frequency change event */ |
| 31 | /* In irq context */ |
| 32 | void freq_change_cb(unsigned int new_freq) |
| 33 | { |
| 34 | struct time_struct this_cpu_time = |
| 35 | per_cpu(cpu_time, smp_processor_id()); |
| 36 | struct time_info *write_time, *current_time; |
| 37 | write_time = |
| 38 | this_cpu_time->time_sel[(this_cpu_time->update_count+1)&1]; |
| 39 | current_time = |
| 40 | this_cpu_time->time_sel[(this_cpu_time->update_count)&1]; |
| 41 | write_time->tsc = get_cycles(); |
| 42 | write_time->freq = new_freq; |
| 43 | /* We cumulate the division imprecision. This is the downside of using |
| 44 | * the TSC with variable frequency as a time base. */ |
| 45 | write_time->walltime = |
| 46 | current_time->walltime + |
| 47 | (write_time->tsc - current_time->tsc) / |
| 48 | current_time->freq; |
| 49 | wmb(); |
| 50 | this_cpu_time->update_count++; |
| 51 | } |
| 52 | |
| 53 | |
| 54 | /* Init cpu freq */ |
| 55 | init_cpu_freq() |
| 56 | { |
| 57 | struct time_struct this_cpu_time = |
| 58 | per_cpu(cpu_time, smp_processor_id()); |
| 59 | struct time_info *current_time; |
| 60 | memset(this_cpu_time, 0, sizeof(this_cpu_time)); |
| 61 | current_time = this_cpu_time->time_sel[this_cpu_time->update_count&1]; |
| 62 | /* Init current time */ |
| 63 | /* Get frequency */ |
| 64 | /* Reset cpus to 0 ns, 0 tsc, start their tsc. */ |
| 65 | } |
| 66 | |
| 67 | |
/* After a CPU comes back from hlt */
/* The trick is to sync all the other CPUs on the first CPU up when they come
 * up. If all CPUs are down, then there is no need to increment the walltime:
 * let's simply define the useful walltime on a machine as the time elapsed
 * while there is a CPU running. If we want, when no CPU is active, we can use
 * a lower-resolution clock to somehow keep track of walltime. */
| 74 | |
/* TODO: resynchronize this CPU's clock state against the first CPU already
 * up, per the design note above. */
void wake_from_hlt(void)
{
	/* TODO */
}
| 79 | |
| 80 | |
| 81 | |
| 82 | /* Read time from anywhere in the kernel. Return time in walltime. (ns) */ |
| 83 | /* If the update_count changes while we read the context, it may be invalid. |
| 84 | * This would happen if we are scheduled out for a period of time long enough to |
| 85 | * permit 2 frequency changes. We simply start the loop again if it happens. |
| 86 | * We detect it by comparing the update_count running counter. */ |
| 87 | u64 read_time(void) |
| 88 | { |
| 89 | u64 walltime; |
| 90 | long update_count; |
| 91 | struct time_struct this_cpu_time = |
| 92 | per_cpu(cpu_time, smp_processor_id()); |
| 93 | struct time_info *current_time; |
| 94 | do { |
| 95 | update_count = this_cpu_time->update_count; |
| 96 | current_time = this_cpu_time->time_sel[update_count&1]; |
| 97 | walltime = current_time->walltime + |
| 98 | (get_cycles() - current_time->tsc) / |
| 99 | current_time->freq; |
| 100 | } while(this_cpu_time->update_count != update_count); |
| 101 | return walltime; |
| 102 | } |
| 103 | |
/* Userspace */
/* Export all this data to user space through the vsyscall page. Use a function
 * like read_time() to read the walltime. That function can be implemented
 * as-is because it doesn't need to disable preemption. */
| 108 | |
| 109 | |
| 110 | |