#ifndef _LTT_LTT_RELAY_LOCKLESS_H
#define _LTT_LTT_RELAY_LOCKLESS_H

/*
 * ltt/ltt-relay-lockless.h
 *
 * (C) Copyright 2005-2008 - Mathieu Desnoyers (mathieu.desnoyers@polymtl.ca)
 *
 * LTTng lockless buffer space management (reader/writer).
 *
 * Author:
 *	Mathieu Desnoyers (mathieu.desnoyers@polymtl.ca)
 *
 * Inspired from LTT :
 *	Karim Yaghmour (karim@opersys.com)
 *	Tom Zanussi (zanussi@us.ibm.com)
 *	Bob Wisniewski (bob@watson.ibm.com)
 * And from K42 :
 *	Bob Wisniewski (bob@watson.ibm.com)
 *
 * Changelog:
 *	08/10/08, Cleanup.
 *	19/10/05, Complete lockless mechanism.
 *	27/05/05, Modular redesign and rewrite.
 *
 * Userspace reader semantic :
 * while (poll fd != POLLHUP) {
 *	- ioctl RELAY_GET_SUBBUF_SIZE
 *	while (1) {
 *		- ioctl GET_SUBBUF
 *		- splice 1 subbuffer worth of data to a pipe
 *		- splice the data from pipe to disk/network
 *		- ioctl PUT_SUBBUF, check error value
 *		  if err val < 0, previous subbuffer was corrupted.
 *	}
 * }
 *
 * Dual LGPL v2.1/GPL v2 license.
 */
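
/*
 * Illustrative userspace reader sketch (not part of the original header):
 * a minimal consumption loop following the semantic described above. The
 * ioctl request names are taken verbatim from that description and may not
 * be the literal ABI macro names; the buffer fd, output fd and error
 * handling are assumptions used for illustration only.
 */
#if 0
#define _GNU_SOURCE
#include <fcntl.h>
#include <sys/ioctl.h>
#include <unistd.h>

static int consume_buffer(int buf_fd, int out_fd)
{
	unsigned int sb_size;
	int pipe_fd[2];

	/* Sub-buffer size is fixed for the session: fetch it once. */
	if (ioctl(buf_fd, RELAY_GET_SUBBUF_SIZE, &sb_size) < 0)
		return -1;
	if (pipe(pipe_fd) < 0)
		return -1;
	for (;;) {
		if (ioctl(buf_fd, GET_SUBBUF) < 0)
			break;	/* No sub-buffer ready; go back to poll(). */
		/* Move one sub-buffer of data without copying it in userspace. */
		splice(buf_fd, NULL, pipe_fd[1], NULL, sb_size, SPLICE_F_MOVE);
		splice(pipe_fd[0], NULL, out_fd, NULL, sb_size, SPLICE_F_MOVE);
		if (ioctl(buf_fd, PUT_SUBBUF) < 0) {
			/* err val < 0: the previous sub-buffer was corrupted. */
			continue;
		}
	}
	close(pipe_fd[0]);
	close(pipe_fd[1]);
	return 0;
}
#endif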

#include <linux/cache.h>
#include <linux/time.h>
#include <linux/module.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/rcupdate.h>
#include <linux/timer.h>
#include <linux/sched.h>
#include <linux/bitops.h>
#include <linux/fs.h>
#include <linux/smp_lock.h>
#include <linux/debugfs.h>
#include <linux/stat.h>
#include <linux/cpu.h>
#include <linux/pipe_fs_i.h>
#include <linux/splice.h>
#include <asm/atomic.h>
#include <asm/local.h>

#include "ltt-tracer.h"
#include "ltt-relay.h"

#if 0
#define printk_dbg(fmt, args...) printk(fmt, args)
#else
#define printk_dbg(fmt, args...)
#endif

struct commit_counters {
	local_t cc;
	local_t cc_sb;			/* Incremented _once_ at sb switch */
	local_t events;			/* Event count */
};

/* LTTng lockless logging buffer info */
struct ltt_chanbuf {
	struct ltt_chanbuf_alloc a;	/* Parent. First field. */
	/* First 32 bytes cache-hot cacheline */
	local_t offset;			/* Current offset in the buffer */
	struct commit_counters *commit_count;
					/* Commit count per sub-buffer */
	atomic_long_t consumed;		/*
					 * Current offset in the buffer
					 * standard atomic access (shared)
					 */
	unsigned long last_tsc;		/*
					 * Last timestamp written in the buffer.
					 */
	/* End of first 32 bytes cacheline */
#ifdef LTT_VMCORE
	local_t *commit_seq;		/* Consecutive commits */
#endif
	atomic_long_t active_readers;	/*
					 * Active readers count
					 * standard atomic access (shared)
					 */
	local_t events_lost;
	local_t corrupted_subbuffers;
	spinlock_t full_lock;		/*
					 * buffer full condition spinlock, only
					 * for userspace tracing blocking mode
					 * synchronization with reader.
					 */
	wait_queue_head_t write_wait;	/*
					 * Wait queue for blocking user space
					 * writers
					 */
	wait_queue_head_t read_wait;	/* reader wait queue */
	unsigned int finalized;		/* buffer has been finalized */
	struct timer_list switch_timer;	/* timer for periodical switch */
};

/*
 * A switch is done during tracing or as a final flush after tracing (so it
 * won't write in the new sub-buffer).
 */
enum force_switch_mode { FORCE_ACTIVE, FORCE_FLUSH };

extern
int ltt_reserve_slot_lockless_slow(struct ltt_chan *chan,
				   struct ltt_trace *trace, size_t data_size,
				   int largest_align, int cpu,
				   struct ltt_chanbuf **ret_buf,
				   size_t *slot_size, long *buf_offset,
				   u64 *tsc, unsigned int *rflags);

extern void ltt_force_switch_lockless_slow(struct ltt_chanbuf *buf,
					   enum force_switch_mode mode);

/*
 * Last TSC comparison functions. Check if the current TSC overflows
 * LTT_TSC_BITS bits from the last TSC read. Reads and writes last_tsc
 * atomically.
 */

#if (BITS_PER_LONG == 32)
static __inline__ void save_last_tsc(struct ltt_chanbuf *buf, u64 tsc)
{
	buf->last_tsc = (unsigned long)(tsc >> LTT_TSC_BITS);
}

static __inline__ int last_tsc_overflow(struct ltt_chanbuf *buf, u64 tsc)
{
	unsigned long tsc_shifted = (unsigned long)(tsc >> LTT_TSC_BITS);

	if (unlikely(tsc_shifted - buf->last_tsc))
		return 1;
	else
		return 0;
}
#else
static __inline__ void save_last_tsc(struct ltt_chanbuf *buf, u64 tsc)
{
	buf->last_tsc = (unsigned long)tsc;
}

static __inline__ int last_tsc_overflow(struct ltt_chanbuf *buf, u64 tsc)
{
	if (unlikely((tsc - buf->last_tsc) >> LTT_TSC_BITS))
		return 1;
	else
		return 0;
}
#endif

extern
int ltt_chanbuf_create(struct ltt_chanbuf *buf, struct ltt_chan_alloc *chana,
		       int cpu);
extern void ltt_chanbuf_free(struct ltt_chanbuf *buf);
extern int ltt_chan_create(const char *base_filename, struct ltt_chan *chan,
			   struct dentry *parent, size_t sb_size, size_t n_sb,
			   int overwrite, struct ltt_trace *trace);
extern void ltt_chan_free(struct kref *kref);
extern void ltt_chan_remove_files(struct ltt_chan *chan);

/* Buffer access operations */

extern int ltt_chanbuf_open_read(struct ltt_chanbuf *buf);
extern void ltt_chanbuf_release_read(struct ltt_chanbuf *buf);
extern int ltt_chanbuf_get_subbuf(struct ltt_chanbuf *buf,
				  unsigned long *consumed);
extern int ltt_chanbuf_put_subbuf(struct ltt_chanbuf *buf,
				  unsigned long consumed);
extern void ltt_chan_start_switch_timer(struct ltt_chan *chan);
extern void ltt_chan_stop_switch_timer(struct ltt_chan *chan);

extern int ltt_relay_init(void);
extern void ltt_relay_exit(void);

static __inline__
unsigned long ltt_chanbuf_get_offset(struct ltt_chanbuf *buf)
{
	return local_read(&buf->offset);
}

static __inline__
unsigned long ltt_chanbuf_get_consumed(struct ltt_chanbuf *buf)
{
	return atomic_long_read(&buf->consumed);
}

static __inline__
int ltt_chanbuf_is_finalized(struct ltt_chanbuf *buf)
{
	return buf->finalized;
}

static __inline__
void ltt_reserve_push_reader(struct ltt_chanbuf *buf, struct ltt_chan *chan,
			     long offset)
{
	long consumed_old, consumed_new;

	do {
		consumed_old = atomic_long_read(&buf->consumed);
		/*
		 * If the buffer is in overwrite mode, push the reader consumed
		 * count if the write position has reached it and we are not
		 * at the first iteration (don't push the reader farther than
		 * the writer). This operation can be done concurrently by many
		 * writers in the same buffer; the writer at the farthest
		 * write position sub-buffer index in the buffer is the one
		 * which will win this loop.
		 * If the buffer is not in overwrite mode, pushing the reader
		 * only happens if a sub-buffer is corrupted.
		 */
		if (unlikely((SUBBUF_TRUNC(offset, chan)
			      - SUBBUF_TRUNC(consumed_old, chan))
			     >= chan->a.buf_size))
			consumed_new = SUBBUF_ALIGN(consumed_old, chan);
		else
			return;
	} while (unlikely(atomic_long_cmpxchg(&buf->consumed, consumed_old,
					      consumed_new) != consumed_old));
}

#ifdef LTT_VMCORE
static __inline__
void ltt_vmcore_check_deliver(struct ltt_chanbuf *buf, long commit_count,
			      long idx)
{
	local_set(&buf->commit_seq[idx], commit_count);
}
#else
static __inline__
void ltt_vmcore_check_deliver(struct ltt_chanbuf *buf, long commit_count,
			      long idx)
{
}
#endif

static __inline__
void ltt_check_deliver(struct ltt_chanbuf *buf, struct ltt_chan *chan,
		       long offset, long commit_count, long idx)
{
	long old_commit_count = commit_count - chan->a.sb_size;

	/* Check if all commits have been done */
	if (unlikely((BUFFER_TRUNC(offset, chan) >> chan->a.n_sb_order)
		     - (old_commit_count & chan->commit_count_mask) == 0)) {
		/*
		 * If we succeed in updating the cc_sb, we are delivering
		 * the subbuffer. Deals with concurrent updates of the "cc"
		 * value without adding an add_return atomic operation to the
		 * fast path.
		 */
		if (likely(local_cmpxchg(&buf->commit_count[idx].cc_sb,
					 old_commit_count, commit_count)
			   == old_commit_count)) {
			/*
			 * Set noref flag for this subbuffer.
			 */
			ltt_set_noref_flag(&buf->a, idx);
			ltt_vmcore_check_deliver(buf, commit_count, idx);
		}
	}
}

static __inline__
int ltt_poll_deliver(struct ltt_chanbuf *buf, struct ltt_chan *chan)
{
	long consumed_old, consumed_idx, commit_count, write_offset;

	consumed_old = atomic_long_read(&buf->consumed);
	consumed_idx = SUBBUF_INDEX(consumed_old, chan);
	commit_count = local_read(&buf->commit_count[consumed_idx].cc_sb);
	/*
	 * No memory barrier here, since we are only interested
	 * in a statistically correct polling result. The next poll will
	 * get the data if we are racing. The mb() that ensures correct
	 * memory order is in get_subbuf.
	 */
	write_offset = local_read(&buf->offset);

	/*
	 * Check that the subbuffer we are trying to consume has already
	 * been fully committed.
	 */
	if (((commit_count - chan->a.sb_size)
	     & chan->commit_count_mask)
	    - (BUFFER_TRUNC(consumed_old, chan)
	       >> chan->a.n_sb_order)
	    != 0)
		return 0;

	/*
	 * Check that we are not about to read the same subbuffer in
	 * which the writer head is.
	 */
	if ((SUBBUF_TRUNC(write_offset, chan)
	     - SUBBUF_TRUNC(consumed_old, chan))
	    == 0)
		return 0;

	return 1;
}

static __inline__
u32 get_read_sb_size(struct ltt_chanbuf *buf)
{
	struct ltt_subbuffer_header *header =
		(struct ltt_subbuffer_header *)
			ltt_relay_read_offset_address(&buf->a, 0);
	return header->sb_size;
}

/*
 * returns 0 if reserve ok, or 1 if the slow path must be taken.
 */
static __inline__
int ltt_relay_try_reserve(struct ltt_chanbuf *buf, struct ltt_chan *chan,
			  size_t data_size, u64 *tsc, unsigned int *rflags,
			  int largest_align, long *o_begin, long *o_end,
			  long *o_old, size_t *before_hdr_pad, size_t *size)
{
	*o_begin = local_read(&buf->offset);
	*o_old = *o_begin;

	*tsc = trace_clock_read64();

#ifdef LTT_VMCORE
	prefetch(&buf->commit_count[SUBBUF_INDEX(*o_begin, chan)]);
	prefetch(&buf->commit_seq[SUBBUF_INDEX(*o_begin, chan)]);
#else
	prefetchw(&buf->commit_count[SUBBUF_INDEX(*o_begin, chan)]);
#endif
	if (last_tsc_overflow(buf, *tsc))
		*rflags = LTT_RFLAG_ID_SIZE_TSC;

	if (unlikely(SUBBUF_OFFSET(*o_begin, chan) == 0))
		return 1;

	*size = ltt_get_header_size(chan, *o_begin, data_size, before_hdr_pad,
				    *rflags);
	*size += ltt_align(*o_begin + *size, largest_align) + data_size;
	if (unlikely((SUBBUF_OFFSET(*o_begin, chan) + *size) > chan->a.sb_size))
		return 1;

	/*
	 * Event fits in the current buffer and we are not on a switch
	 * boundary. It's safe to write.
	 */
	*o_end = *o_begin + *size;

	if (unlikely(SUBBUF_OFFSET(*o_end, chan) == 0))
		/*
		 * The offset_end will fall at the very beginning of the next
		 * subbuffer.
		 */
		return 1;

	return 0;
}

static __inline__
int ltt_reserve_slot(struct ltt_chan *chan,
		     struct ltt_trace *trace, size_t data_size,
		     int largest_align, int cpu,
		     struct ltt_chanbuf **ret_buf,
		     size_t *slot_size, long *buf_offset, u64 *tsc,
		     unsigned int *rflags)
{
	struct ltt_chanbuf *buf = *ret_buf = per_cpu_ptr(chan->a.buf, cpu);
	long o_begin, o_end, o_old;
	size_t before_hdr_pad;

	/*
	 * Perform retryable operations.
	 */
	if (unlikely(__get_cpu_var(ltt_nesting) > 4)) {
		local_inc(&buf->events_lost);
		return -EPERM;
	}

	if (unlikely(ltt_relay_try_reserve(buf, chan, data_size, tsc, rflags,
					   largest_align, &o_begin, &o_end,
					   &o_old, &before_hdr_pad, slot_size)))
		goto slow_path;

	if (unlikely(local_cmpxchg(&buf->offset, o_old, o_end) != o_old))
		goto slow_path;

	/*
	 * Atomically update last_tsc. This update races against concurrent
	 * atomic updates, but the race will always cause supplementary full TSC
	 * events, never the opposite (missing a full TSC event when it would be
	 * needed).
	 */
	save_last_tsc(buf, *tsc);

	/*
	 * Push the reader if necessary
	 */
	ltt_reserve_push_reader(buf, chan, o_end - 1);

	/*
	 * Clear noref flag for this subbuffer.
	 */
	ltt_clear_noref_flag(&buf->a, SUBBUF_INDEX(o_end - 1, chan));

	*buf_offset = o_begin + before_hdr_pad;
	return 0;
slow_path:
	return ltt_reserve_slot_lockless_slow(chan, trace, data_size,
					      largest_align, cpu, ret_buf,
					      slot_size, buf_offset, tsc,
					      rflags);
}

/*
 * Force a sub-buffer switch for a per-cpu buffer. This operation is
 * completely reentrant : can be called while tracing is active with
 * absolutely no lock held.
 *
 * Note, however, that as a local_cmpxchg is used for some atomic
 * operations, this function must be called from the CPU which owns the buffer
 * for an ACTIVE flush.
 */
static __inline__
void ltt_force_switch(struct ltt_chanbuf *buf, enum force_switch_mode mode)
{
	return ltt_force_switch_lockless_slow(buf, mode);
}
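
/*
 * Illustrative sketch (not part of the original header): honoring the
 * constraint above by running an ACTIVE flush on the CPU which owns the
 * buffer, e.g. through smp_call_function_single(). The helper name is an
 * assumption used for illustration only.
 */
#if 0
static void example_switch_current_cpu_buf(void *data)
{
	struct ltt_chan *chan = data;
	struct ltt_chanbuf *buf = per_cpu_ptr(chan->a.buf, smp_processor_id());

	ltt_force_switch(buf, FORCE_ACTIVE);
}

/* Caller: smp_call_function_single(cpu, example_switch_current_cpu_buf, chan, 1); */
#endif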

/*
 * For flight recording. Must be called after relay_commit.
 * This function increments the subbuffer's commit_seq counter each time the
 * commit count reaches back the reserve offset (modulo subbuffer size). It is
 * useful for crash dump.
 */
#ifdef LTT_VMCORE
static __inline__
void ltt_write_commit_counter(struct ltt_chanbuf *buf, struct ltt_chan *chan,
			      long idx, long buf_offset, long commit_count,
			      size_t data_size)
{
	long offset;
	long commit_seq_old;

	offset = buf_offset + data_size;

	/*
	 * SUBBUF_OFFSET includes commit_count_mask. We can simply
	 * compare the offsets within the subbuffer without caring about
	 * buffer full/empty mismatch because offset is never zero here
	 * (subbuffer header and event headers have non-zero length).
	 */
	if (unlikely(SUBBUF_OFFSET(offset - commit_count, chan)))
		return;

	commit_seq_old = local_read(&buf->commit_seq[idx]);
	while (commit_seq_old < commit_count)
		commit_seq_old = local_cmpxchg(&buf->commit_seq[idx],
					       commit_seq_old, commit_count);
}
#else
static __inline__
void ltt_write_commit_counter(struct ltt_chanbuf *buf, struct ltt_chan *chan,
			      long idx, long buf_offset, long commit_count,
			      size_t data_size)
{
}
#endif

/*
 * Atomic unordered slot commit. Increments the commit count in the
 * specified sub-buffer, and delivers it if necessary.
 *
 * Parameters:
 *
 * @buf: buffer.
 * @chan: channel.
 * @buf_offset: offset following the event header.
 * @data_size: size of the event data.
 * @slot_size: size of the reserved slot.
 */
static __inline__
void ltt_commit_slot(struct ltt_chanbuf *buf, struct ltt_chan *chan,
		     long buf_offset, size_t data_size, size_t slot_size)
{
	long offset_end = buf_offset;
	long endidx = SUBBUF_INDEX(offset_end - 1, chan);
	long commit_count;

#ifdef LTT_NO_IPI_BARRIER
	smp_wmb();
#else
	/*
	 * Must write slot data before incrementing commit count.
	 * This compiler barrier is upgraded into a smp_mb() by the IPI
	 * sent by get_subbuf().
	 */
	barrier();
#endif
	local_add(slot_size, &buf->commit_count[endidx].cc);
	local_inc(&buf->commit_count[endidx].events);
	/*
	 * The commit count read can race with concurrent OOO commit count
	 * updates. This is only needed for ltt_check_deliver (for non-polling
	 * delivery only) and for ltt_write_commit_counter. The race can only
	 * cause the counter to be read with the same value more than once,
	 * which could cause :
	 * - Multiple delivery for the same sub-buffer (which is handled
	 *   gracefully by the reader code) if the value is for a full
	 *   sub-buffer. It's important that we can never miss a sub-buffer
	 *   delivery. Re-reading the value after the local_add ensures this.
	 * - Reading a commit_count with a higher value than what was actually
	 *   added to it for the ltt_write_commit_counter call (again caused by
	 *   a concurrent committer). It does not matter, because this function
	 *   is interested in the fact that the commit count reaches back the
	 *   reserve offset for a specific sub-buffer, which is completely
	 *   independent of the order.
	 */
	commit_count = local_read(&buf->commit_count[endidx].cc);

	ltt_check_deliver(buf, chan, offset_end - 1, commit_count, endidx);
	/*
	 * Update data_size for each commit. It's needed only for extracting
	 * ltt buffers from vmcore, after crash.
	 */
	ltt_write_commit_counter(buf, chan, endidx, buf_offset,
				 commit_count, data_size);
}
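
/*
 * Illustrative sketch (not part of the original header): how a probe could
 * pair ltt_reserve_slot() with ltt_commit_slot(). The event header and
 * payload write helpers (ltt_write_event_header, ltt_relay_write) and their
 * signatures are assumptions used for illustration only; the real probe code
 * lives in the tracer, not in this header.
 */
#if 0
static void example_write_event(struct ltt_chan *chan, struct ltt_trace *trace,
				u16 eID, const void *payload, size_t data_size)
{
	struct ltt_chanbuf *buf;
	size_t slot_size;
	long buf_offset;
	u64 tsc;
	unsigned int rflags = 0;

	/* Fast-path reserve; it falls back to the lockless slow path itself. */
	if (ltt_reserve_slot(chan, trace, data_size, sizeof(long),
			     smp_processor_id(), &buf, &slot_size, &buf_offset,
			     &tsc, &rflags) < 0)
		return;		/* Event dropped, accounted in events_lost. */

	/*
	 * Write the event header, then the payload right after it. Both
	 * helpers are assumed stand-ins for the real primitives from
	 * ltt-tracer.h / ltt-relay.h; buf_offset ends up being the offset
	 * following the event header, as ltt_commit_slot() expects.
	 */
	buf_offset = ltt_write_event_header(&buf->a, &chan->a, buf_offset,
					    eID, data_size, tsc, rflags);
	ltt_relay_write(&buf->a, buf_offset, payload, data_size);

	/* Commit makes the slot visible for delivery and polling. */
	ltt_commit_slot(buf, chan, buf_offset, data_size, slot_size);
}
#endif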

#endif /* _LTT_LTT_RELAY_LOCKLESS_H */