/*
 * ring_buffer_iterator.c
 *
 * (C) Copyright 2010 - Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
 *
 * Ring buffer and channel iterators. Get each event of a channel in order.
 * Uses a prio heap for per-cpu buffers, giving an O(log(NR_CPUS)) algorithmic
 * complexity for the "get next event" operation.
 *
 * Author:
 *	Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
 *
 * Dual LGPL v2.1/GPL v2 license.
 */
#include "../../wrapper/ringbuffer/iterator.h"
#include <linux/jiffies.h>
#include <linux/delay.h>
#include <linux/module.h>
/*
 * Safety factor taking into account internal kernel interrupt latency.
 * Assuming a worst-case latency of 250ms.
 */
#define MAX_SYSTEM_LATENCY	250

/*
 * Maximum delta expected between trace clocks. At most 1 jiffy delta.
 */
#define MAX_CLOCK_DELTA		(jiffies_to_usecs(1) * 1000)
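
/*
 * Worked example (illustrative): with HZ=250, jiffies_to_usecs(1) is
 * 4000us, so MAX_CLOCK_DELTA evaluates to 4,000,000ns (4ms). With
 * HZ=1000, it evaluates to 1,000,000ns (1ms).
 */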
/**
 * lib_ring_buffer_get_next_record - Get the next record in a buffer.
 * @chan: channel
 * @buf: buffer
 *
 * Returns the size of the event read, -EAGAIN if the buffer is empty,
 * -ENODATA if the buffer is empty and finalized. The buffer must already be
 * opened for reading.
 */
ssize_t lib_ring_buffer_get_next_record(struct channel *chan,
					struct lib_ring_buffer *buf)
{
	const struct lib_ring_buffer_config *config = chan->backend.config;
	struct lib_ring_buffer_iter *iter = &buf->iter;
	int ret;

restart:
	switch (iter->state) {
	case ITER_GET_SUBBUF:
		ret = lib_ring_buffer_get_next_subbuf(buf);
		if (ret && !ACCESS_ONCE(buf->finalized)
		    && config->alloc == RING_BUFFER_ALLOC_GLOBAL) {
			/*
			 * Use "pull" scheme for global buffers. The reader
			 * itself flushes the buffer to "pull" data not visible
			 * to readers yet. Flush current subbuffer and re-try.
			 *
			 * Per-CPU buffers rather use a "push" scheme because
			 * the IPI needed to flush all CPU's buffers is too
			 * costly. In the "push" scheme, the reader waits for
			 * the writer periodic deferrable timer to flush the
			 * buffers (keeping track of a quiescent state
			 * timestamp). Therefore, the writer "pushes" data out
			 * of the buffers rather than letting the reader "pull"
			 * data from the buffer.
			 */
			lib_ring_buffer_switch_slow(buf, SWITCH_ACTIVE);
			ret = lib_ring_buffer_get_next_subbuf(buf);
		}
		if (ret)
			return ret;
		iter->consumed = buf->cons_snapshot;
		iter->data_size = lib_ring_buffer_get_read_data_size(config, buf);
		iter->read_offset = iter->consumed;
		/* skip the subbuffer header */
		iter->read_offset += config->cb.subbuffer_header_size();
		iter->state = ITER_TEST_RECORD;
		goto restart;
	case ITER_TEST_RECORD:
		if (iter->read_offset - iter->consumed >= iter->data_size) {
			iter->state = ITER_PUT_SUBBUF;
		} else {
			CHAN_WARN_ON(chan, !config->cb.record_get);
			config->cb.record_get(config, chan, buf,
					      iter->read_offset,
					      &iter->header_len,
					      &iter->payload_len,
					      &iter->timestamp);
			iter->read_offset += iter->header_len;
			subbuffer_consume_record(config, &buf->backend);
			iter->state = ITER_NEXT_RECORD;
			return iter->payload_len;
		}
		goto restart;
	case ITER_NEXT_RECORD:
		iter->read_offset += iter->payload_len;
		iter->state = ITER_TEST_RECORD;
		goto restart;
	case ITER_PUT_SUBBUF:
		lib_ring_buffer_put_next_subbuf(buf);
		iter->state = ITER_GET_SUBBUF;
		goto restart;
	default:
		CHAN_WARN_ON(chan, 1);	/* Should not happen */
		return -EPERM;
	}
}
EXPORT_SYMBOL_GPL(lib_ring_buffer_get_next_record);
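
/*
 * Usage sketch (illustrative only, not part of this API): draining a
 * single buffer in record order under the error-code contract documented
 * above. "demo_drain_buf" is a hypothetical name.
 *
 *	static void demo_drain_buf(struct channel *chan,
 *				   struct lib_ring_buffer *buf)
 *	{
 *		ssize_t len;
 *
 *		for (;;) {
 *			len = lib_ring_buffer_get_next_record(chan, buf);
 *			if (len == -EAGAIN)
 *				break;	-- buffer temporarily empty
 *			if (len == -ENODATA)
 *				break;	-- finalized: no more data, ever
 *			-- "len" payload bytes start at buf->iter.read_offset
 *		}
 *	}
 */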
static int buf_is_higher(void *a, void *b)
{
	struct lib_ring_buffer *bufa = a;
	struct lib_ring_buffer *bufb = b;

	/* Consider lowest timestamps to be at the top of the heap */
	return (bufa->iter.timestamp < bufb->iter.timestamp);
}
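
/*
 * Because ptr_heap keeps the "higher" element on top, inverting the
 * comparison turns it into a min-heap on timestamps: heap_maximum()
 * yields the buffer holding the oldest unread record.
 */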
static
void lib_ring_buffer_get_empty_buf_records(const struct lib_ring_buffer_config *config,
					   struct channel *chan)
{
	struct ptr_heap *heap = &chan->iter.heap;
	struct lib_ring_buffer *buf, *tmp;
	ssize_t len;

	list_for_each_entry_safe(buf, tmp, &chan->iter.empty_head,
				 iter.empty_node) {
		len = lib_ring_buffer_get_next_record(chan, buf);

		/*
		 * Deal with -EAGAIN and -ENODATA.
		 * len >= 0 means record contains data.
		 * -EBUSY should never happen, because we support only one
		 * reader.
		 */
		switch (len) {
		case -EAGAIN:
			/* Keep node in empty list */
			break;
		case -ENODATA:
			/*
			 * Buffer is finalized. Don't add to list of empty
			 * buffers, because it has no more data to provide,
			 * ever.
			 */
			list_del(&buf->iter.empty_node);
			break;
		case -EBUSY:
			CHAN_WARN_ON(chan, 1);
			break;
		default:
			/*
			 * Insert buffer into the heap, remove from empty
			 * buffer list. The heap should never overflow.
			 */
			CHAN_WARN_ON(chan, len < 0);
			list_del(&buf->iter.empty_node);
			CHAN_WARN_ON(chan, heap_insert(heap, buf) != NULL);
		}
	}
}
static
void lib_ring_buffer_wait_for_qs(const struct lib_ring_buffer_config *config,
				 struct channel *chan)
{
	u64 timestamp_qs;
	unsigned long wait_msecs;

	/*
	 * No need to wait if no empty buffers are present.
	 */
	if (list_empty(&chan->iter.empty_head))
		return;

	timestamp_qs = config->cb.ring_buffer_clock_read(chan);
	/*
	 * We need to consider previously empty buffers.
	 * Do a get next buf record on each of them. Add them to
	 * the heap if they have data. If at least one of them
	 * doesn't have data, we need to wait for
	 * switch_timer_interval + MAX_SYSTEM_LATENCY (so we are sure the
	 * buffers have been switched either by the timer or idle entry) and
	 * check them again, adding them if they have data.
	 */
	lib_ring_buffer_get_empty_buf_records(config, chan);

	/*
	 * No need to wait if no empty buffers are present.
	 */
	if (list_empty(&chan->iter.empty_head))
		goto end;

	/*
	 * We need to wait for the buffer switch timer to run. If the
	 * CPU is idle, idle entry performed the switch.
	 * TODO: we could optimize further by skipping the sleep if all
	 * empty buffers belong to idle or offline cpus.
	 */
	wait_msecs = jiffies_to_msecs(chan->switch_timer_interval);
	wait_msecs += MAX_SYSTEM_LATENCY;
	msleep(wait_msecs);
	lib_ring_buffer_get_empty_buf_records(config, chan);
	/*
	 * Any buffer still in the empty list here cannot possibly
	 * contain an event with a timestamp prior to "timestamp_qs".
	 * The new quiescent state timestamp is the one we grabbed
	 * before waiting for buffer data. It is therefore safe to
	 * ignore empty buffers up to last_qs timestamp for fusion
	 * merge.
	 */
end:
	chan->iter.last_qs = timestamp_qs;
}
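
/*
 * Worked example (illustrative, assuming a 100ms switch timer): a reader
 * finding empty buffers at time t0 samples timestamp_qs = t0, sleeps
 * about 100ms + MAX_SYSTEM_LATENCY = 350ms, then re-checks. Any buffer
 * still empty after the sleep cannot hold an event older than t0, so the
 * merge may safely ignore it up to last_qs = t0.
 */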
/**
 * channel_get_next_record - Get the next record in a channel.
 * @chan: channel
 * @ret_buf: the buffer in which the event is located (output)
 *
 * Returns the size of the new current event, -EAGAIN if all buffers are empty,
 * -ENODATA if all buffers are empty and finalized. The channel must already be
 * opened for reading.
 */
ssize_t channel_get_next_record(struct channel *chan,
				struct lib_ring_buffer **ret_buf)
{
	const struct lib_ring_buffer_config *config = chan->backend.config;
	struct lib_ring_buffer *buf;
	struct ptr_heap *heap;
	ssize_t len;

	if (config->alloc == RING_BUFFER_ALLOC_GLOBAL) {
		*ret_buf = channel_get_ring_buffer(config, chan, 0);
		return lib_ring_buffer_get_next_record(chan, *ret_buf);
	}

	heap = &chan->iter.heap;

	/*
	 * Get next record for topmost buffer.
	 */
	buf = heap_maximum(heap);
	if (buf) {
		len = lib_ring_buffer_get_next_record(chan, buf);
		/*
		 * Deal with -EAGAIN and -ENODATA.
		 * len >= 0 means record contains data.
		 */
		switch (len) {
		case -EAGAIN:
			buf->iter.timestamp = 0;
			list_add(&buf->iter.empty_node, &chan->iter.empty_head);
			/* Remove topmost buffer from the heap */
			CHAN_WARN_ON(chan, heap_remove(heap) != buf);
			break;
		case -ENODATA:
			/*
			 * Buffer is finalized. Remove buffer from heap and
			 * don't add to list of empty buffers, because it has
			 * no more data to provide, ever.
			 */
			CHAN_WARN_ON(chan, heap_remove(heap) != buf);
			break;
		case -EBUSY:
			CHAN_WARN_ON(chan, 1);
			break;
		default:
			/*
			 * Reinsert buffer into the heap. Note that heap can be
			 * partially empty, so we need to use
			 * heap_replace_max().
			 */
			CHAN_WARN_ON(chan, len < 0);
			CHAN_WARN_ON(chan, heap_replace_max(heap, buf) != buf);
			break;
		}
	}

	buf = heap_maximum(heap);
	if (!buf || buf->iter.timestamp > chan->iter.last_qs) {
		/*
		 * Deal with buffers previously showing no data.
		 * Add buffers containing data to the heap, update
		 * last_qs.
		 */
		lib_ring_buffer_wait_for_qs(config, chan);
	}

	*ret_buf = buf = heap_maximum(heap);
	if (buf) {
		/*
		 * If this warning triggers, you probably need to check your
		 * system interrupt latency. Typical causes: too much printk()
		 * output going to a serial console with interrupts off.
		 * Allow for MAX_CLOCK_DELTA ns timestamp delta going backward.
		 * Observed on SMP KVM setups with trace_clock().
		 */
		if (chan->iter.last_timestamp
		    > (buf->iter.timestamp + MAX_CLOCK_DELTA)) {
			printk(KERN_WARNING "ring_buffer: timestamps going "
			       "backward. Last time %llu ns, cpu %d, "
			       "current time %llu ns, cpu %d, "
			       "delta %llu ns.\n",
			       chan->iter.last_timestamp, chan->iter.last_cpu,
			       buf->iter.timestamp, buf->backend.cpu,
			       chan->iter.last_timestamp - buf->iter.timestamp);
			CHAN_WARN_ON(chan, 1);
		}
		chan->iter.last_timestamp = buf->iter.timestamp;
		chan->iter.last_cpu = buf->backend.cpu;
		return buf->iter.payload_len;
	} else {
		/* Heap is empty */
		if (list_empty(&chan->iter.empty_head))
			return -ENODATA;	/* All buffers finalized */
		else
			return -EAGAIN;		/* Temporarily empty */
	}
}
EXPORT_SYMBOL_GPL(channel_get_next_record);
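
/*
 * Usage sketch (illustrative only): timestamp-ordered consumption across
 * all buffers of a channel. "chan" is assumed to be a channel already
 * opened for reading.
 *
 *	struct lib_ring_buffer *buf;
 *	ssize_t len;
 *
 *	for (;;) {
 *		len = channel_get_next_record(chan, &buf);
 *		if (len == -ENODATA)
 *			break;		-- all buffers finalized
 *		if (len == -EAGAIN)
 *			continue;	-- or block until data is flushed
 *		-- "len" payload bytes start at buf->iter.read_offset
 *	}
 */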
static
void lib_ring_buffer_iterator_init(struct channel *chan,
				   struct lib_ring_buffer *buf)
{
	if (buf->iter.allocated)
		return;

	buf->iter.allocated = 1;
	if (chan->iter.read_open && !buf->iter.read_open) {
		CHAN_WARN_ON(chan, lib_ring_buffer_open_read(buf) != 0);
		buf->iter.read_open = 1;
	}

	/* Add to list of buffers without any current record */
	if (chan->backend.config->alloc == RING_BUFFER_ALLOC_PER_CPU)
		list_add(&buf->iter.empty_node, &chan->iter.empty_head);
}
#ifdef CONFIG_HOTPLUG_CPU
static
int __cpuinit channel_iterator_cpu_hotplug(struct notifier_block *nb,
					   unsigned long action,
					   void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;
	struct channel *chan = container_of(nb, struct channel,
					    hp_iter_notifier);
	struct lib_ring_buffer *buf = per_cpu_ptr(chan->backend.buf, cpu);
	const struct lib_ring_buffer_config *config = chan->backend.config;

	if (!chan->hp_iter_enable)
		return NOTIFY_DONE;

	CHAN_WARN_ON(chan, config->alloc == RING_BUFFER_ALLOC_GLOBAL);

	switch (action) {
	case CPU_DOWN_FAILED:
	case CPU_DOWN_FAILED_FROZEN:
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		lib_ring_buffer_iterator_init(chan, buf);
		return NOTIFY_OK;
	default:
		return NOTIFY_DONE;
	}
}
#endif
int channel_iterator_init(struct channel *chan)
{
	const struct lib_ring_buffer_config *config = chan->backend.config;
	struct lib_ring_buffer *buf;

	if (config->alloc == RING_BUFFER_ALLOC_PER_CPU) {
		int cpu, ret;

		INIT_LIST_HEAD(&chan->iter.empty_head);
		ret = heap_init(&chan->iter.heap,
				num_possible_cpus()
				* sizeof(struct lib_ring_buffer *),
				GFP_KERNEL, buf_is_higher);
		if (ret)
			return ret;
		/*
		 * In case of non-hotplug cpu, if the ring-buffer is allocated
		 * in early initcall, it will not be notified of secondary
		 * cpus. In that case, we need to allocate for all possible
		 * cpus.
		 */
#ifdef CONFIG_HOTPLUG_CPU
		chan->hp_iter_notifier.notifier_call =
			channel_iterator_cpu_hotplug;
		chan->hp_iter_notifier.priority = 10;
		register_cpu_notifier(&chan->hp_iter_notifier);
		get_online_cpus();
		for_each_online_cpu(cpu) {
			buf = per_cpu_ptr(chan->backend.buf, cpu);
			lib_ring_buffer_iterator_init(chan, buf);
		}
		chan->hp_iter_enable = 1;
		put_online_cpus();
#else
		for_each_possible_cpu(cpu) {
			buf = per_cpu_ptr(chan->backend.buf, cpu);
			lib_ring_buffer_iterator_init(chan, buf);
		}
#endif
	} else {
		buf = channel_get_ring_buffer(config, chan, 0);
		lib_ring_buffer_iterator_init(chan, buf);
	}
	return 0;
}
void channel_iterator_unregister_notifiers(struct channel *chan)
{
	const struct lib_ring_buffer_config *config = chan->backend.config;

	if (config->alloc == RING_BUFFER_ALLOC_PER_CPU) {
		chan->hp_iter_enable = 0;
		unregister_cpu_notifier(&chan->hp_iter_notifier);
	}
}
void channel_iterator_free(struct channel *chan)
{
	const struct lib_ring_buffer_config *config = chan->backend.config;

	if (config->alloc == RING_BUFFER_ALLOC_PER_CPU)
		heap_free(&chan->iter.heap);
}
int lib_ring_buffer_iterator_open(struct lib_ring_buffer *buf)
{
	struct channel *chan = buf->backend.chan;
	const struct lib_ring_buffer_config *config = chan->backend.config;

	CHAN_WARN_ON(chan, config->output != RING_BUFFER_ITERATOR);
	return lib_ring_buffer_open_read(buf);
}
EXPORT_SYMBOL_GPL(lib_ring_buffer_iterator_open);
/*
 * Note: Iterators must not be mixed with other types of outputs, because an
 * iterator can leave the buffer in "GET" state, which is not consistent with
 * other types of output (mmap, splice, raw data read).
 */
void lib_ring_buffer_iterator_release(struct lib_ring_buffer *buf)
{
	lib_ring_buffer_release_read(buf);
}
EXPORT_SYMBOL_GPL(lib_ring_buffer_iterator_release);
int channel_iterator_open(struct channel *chan)
{
	const struct lib_ring_buffer_config *config = chan->backend.config;
	struct lib_ring_buffer *buf;
	int ret = 0, cpu;

	CHAN_WARN_ON(chan, config->output != RING_BUFFER_ITERATOR);

	if (config->alloc == RING_BUFFER_ALLOC_PER_CPU) {
		get_online_cpus();
		/* Allow CPU hotplug to keep track of opened reader */
		chan->iter.read_open = 1;
		for_each_channel_cpu(cpu, chan) {
			buf = channel_get_ring_buffer(config, chan, cpu);
			ret = lib_ring_buffer_iterator_open(buf);
			if (ret)
				goto error;
			buf->iter.read_open = 1;
		}
		put_online_cpus();
	} else {
		buf = channel_get_ring_buffer(config, chan, 0);
		ret = lib_ring_buffer_iterator_open(buf);
	}
	return ret;
error:
	/* Error should always happen on CPU 0, hence no close is required. */
	CHAN_WARN_ON(chan, cpu != 0);
	put_online_cpus();
	return ret;
}
EXPORT_SYMBOL_GPL(channel_iterator_open);
void channel_iterator_release(struct channel *chan)
{
	const struct lib_ring_buffer_config *config = chan->backend.config;
	struct lib_ring_buffer *buf;
	int cpu;

	if (config->alloc == RING_BUFFER_ALLOC_PER_CPU) {
		get_online_cpus();
		for_each_channel_cpu(cpu, chan) {
			buf = channel_get_ring_buffer(config, chan, cpu);
			if (buf->iter.read_open) {
				lib_ring_buffer_iterator_release(buf);
				buf->iter.read_open = 0;
			}
		}
		chan->iter.read_open = 0;
		put_online_cpus();
	} else {
		buf = channel_get_ring_buffer(config, chan, 0);
		lib_ring_buffer_iterator_release(buf);
	}
}
EXPORT_SYMBOL_GPL(channel_iterator_release);
void lib_ring_buffer_iterator_reset(struct lib_ring_buffer *buf)
{
	struct channel *chan = buf->backend.chan;

	if (buf->iter.state != ITER_GET_SUBBUF)
		lib_ring_buffer_put_next_subbuf(buf);
	buf->iter.state = ITER_GET_SUBBUF;
	/* Remove from heap (if present). */
	if (heap_cherrypick(&chan->iter.heap, buf))
		list_add(&buf->iter.empty_node, &chan->iter.empty_head);
	buf->iter.timestamp = 0;
	buf->iter.header_len = 0;
	buf->iter.payload_len = 0;
	buf->iter.consumed = 0;
	buf->iter.read_offset = 0;
	buf->iter.data_size = 0;
	/* Don't reset allocated and read_open */
}
void channel_iterator_reset(struct channel *chan)
{
	const struct lib_ring_buffer_config *config = chan->backend.config;
	struct lib_ring_buffer *buf;
	int cpu;

	/* Empty heap, put into empty_head */
	while ((buf = heap_remove(&chan->iter.heap)) != NULL)
		list_add(&buf->iter.empty_node, &chan->iter.empty_head);

	for_each_channel_cpu(cpu, chan) {
		buf = channel_get_ring_buffer(config, chan, cpu);
		lib_ring_buffer_iterator_reset(buf);
	}
	/* Don't reset read_open */
	chan->iter.last_qs = 0;
	chan->iter.last_timestamp = 0;
	chan->iter.last_cpu = 0;
	chan->iter.len_left = 0;
}
/*
 * Ring buffer payload extraction read() implementation.
 */
static
ssize_t channel_ring_buffer_file_read(struct file *filp,
				      char __user *user_buf,
				      size_t count,
				      loff_t *ppos,
				      struct channel *chan,
				      struct lib_ring_buffer *buf,
				      int fusionmerge)
{
	const struct lib_ring_buffer_config *config = chan->backend.config;
	size_t read_count = 0, read_offset;
	ssize_t len;

	might_sleep();
	if (!access_ok(VERIFY_WRITE, user_buf, count))
		return -EFAULT;

	/* Finish copy of previous record */
	if (*ppos != 0) {
		if (read_count < count) {
			len = chan->iter.len_left;
			read_offset = *ppos;
			if (config->alloc == RING_BUFFER_ALLOC_PER_CPU
			    && fusionmerge)
				buf = heap_maximum(&chan->iter.heap);
			CHAN_WARN_ON(chan, !buf);
			goto skip_get_next;
		}
	}

	while (read_count < count) {
		size_t copy_len, space_left;

		if (fusionmerge)
			len = channel_get_next_record(chan, &buf);
		else
			len = lib_ring_buffer_get_next_record(chan, buf);
len_test:
		if (len < 0) {
			/*
			 * Check if buffer is finalized (end of file).
			 */
			if (len == -ENODATA) {
				/* A 0 read_count will tell about end of file */
				goto nodata;
			}

			if (filp->f_flags & O_NONBLOCK) {
				if (!read_count)
					read_count = -EAGAIN;
				goto nodata;
			} else {
				int error;

				/*
				 * No data available at the moment, return what
				 * we got.
				 */
				if (read_count)
					goto nodata;

				/*
				 * Wait for returned len to be >= 0 or
				 * -ENODATA.
				 */
				if (fusionmerge)
					error = wait_event_interruptible(
					  chan->read_wait,
					  ((len = channel_get_next_record(chan,
						&buf)), len != -EAGAIN));
				else
					error = wait_event_interruptible(
					  buf->read_wait,
					  ((len = lib_ring_buffer_get_next_record(
						  chan, buf)), len != -EAGAIN));
				CHAN_WARN_ON(chan, len == -EBUSY);
				if (error) {
					read_count = error;
					goto nodata;
				}
				CHAN_WARN_ON(chan, len < 0 && len != -ENODATA);
				goto len_test;
			}
		}
		read_offset = buf->iter.read_offset;
skip_get_next:
		space_left = count - read_count;
		if (len <= space_left) {
			copy_len = len;
			chan->iter.len_left = 0;
			*ppos = 0;
		} else {
			copy_len = space_left;
			chan->iter.len_left = len - copy_len;
			*ppos = read_offset + copy_len;
		}
		if (__lib_ring_buffer_copy_to_user(&buf->backend, read_offset,
						   &user_buf[read_count],
						   copy_len)) {
			/*
			 * Leave the len_left and ppos values at their current
			 * state, as we currently have a valid event to read.
			 */
			return -EFAULT;
		}
		read_count += copy_len;
	}
	return read_count;

nodata:
	*ppos = 0;
	chan->iter.len_left = 0;
	return read_count;
}
/**
 * lib_ring_buffer_file_read - Read buffer record payload.
 * @filp: file structure pointer.
 * @user_buf: user buffer to read data into.
 * @count: number of bytes to read.
 * @ppos: file read position.
 *
 * Returns a negative value on error, or the number of bytes read on success.
 * ppos is used to save the position _within the current record_ between calls
 * to read().
 */
static
ssize_t lib_ring_buffer_file_read(struct file *filp,
				  char __user *user_buf,
				  size_t count,
				  loff_t *ppos)
{
	struct inode *inode = filp->f_dentry->d_inode;
	struct lib_ring_buffer *buf = inode->i_private;
	struct channel *chan = buf->backend.chan;

	return channel_ring_buffer_file_read(filp, user_buf, count, ppos,
					     chan, buf, 0);
}
/**
 * channel_file_read - Read channel record payload.
 * @filp: file structure pointer.
 * @user_buf: user buffer to read data into.
 * @count: number of bytes to read.
 * @ppos: file read position.
 *
 * Returns a negative value on error, or the number of bytes read on success.
 * ppos is used to save the position _within the current record_ between calls
 * to read().
 */
static
ssize_t channel_file_read(struct file *filp,
			  char __user *user_buf,
			  size_t count,
			  loff_t *ppos)
{
	struct inode *inode = filp->f_dentry->d_inode;
	struct channel *chan = inode->i_private;
	const struct lib_ring_buffer_config *config = chan->backend.config;

	if (config->alloc == RING_BUFFER_ALLOC_PER_CPU)
		return channel_ring_buffer_file_read(filp, user_buf, count,
						     ppos, chan, NULL, 1);
	else {
		struct lib_ring_buffer *buf =
			channel_get_ring_buffer(config, chan, 0);

		return channel_ring_buffer_file_read(filp, user_buf, count,
						     ppos, chan, buf, 0);
	}
}
static
int lib_ring_buffer_file_open(struct inode *inode, struct file *file)
{
	struct lib_ring_buffer *buf = inode->i_private;
	int ret;

	ret = lib_ring_buffer_iterator_open(buf);
	if (ret)
		return ret;

	file->private_data = buf;
	ret = nonseekable_open(inode, file);
	if (ret)
		goto release_iter;
	return 0;

release_iter:
	lib_ring_buffer_iterator_release(buf);
	return ret;
}
static
int lib_ring_buffer_file_release(struct inode *inode, struct file *file)
{
	struct lib_ring_buffer *buf = inode->i_private;

	lib_ring_buffer_iterator_release(buf);
	return 0;
}
static
int channel_file_open(struct inode *inode, struct file *file)
{
	struct channel *chan = inode->i_private;
	int ret;

	ret = channel_iterator_open(chan);
	if (ret)
		return ret;

	file->private_data = chan;
	ret = nonseekable_open(inode, file);
	if (ret)
		goto release_iter;
	return 0;

release_iter:
	channel_iterator_release(chan);
	return ret;
}
static
int channel_file_release(struct inode *inode, struct file *file)
{
	struct channel *chan = inode->i_private;

	channel_iterator_release(chan);
	return 0;
}
const struct file_operations channel_payload_file_operations = {
	.open = channel_file_open,
	.release = channel_file_release,
	.read = channel_file_read,
	.llseek = lib_ring_buffer_no_llseek,
};
EXPORT_SYMBOL_GPL(channel_payload_file_operations);
const struct file_operations lib_ring_buffer_payload_file_operations = {
	.open = lib_ring_buffer_file_open,
	.release = lib_ring_buffer_file_release,
	.read = lib_ring_buffer_file_read,
	.llseek = lib_ring_buffer_no_llseek,
};
EXPORT_SYMBOL_GPL(lib_ring_buffer_payload_file_operations);
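
/*
 * Userspace usage sketch (illustrative only, not part of this file):
 * reading merged channel payload through a file wired to
 * channel_payload_file_operations. The path below is hypothetical; the
 * actual location depends on where the tracer creates the file.
 *
 *	int fd = open("/sys/kernel/debug/.../channel0", O_RDONLY);
 *	char page[4096];
 *	ssize_t r;
 *
 *	while ((r = read(fd, page, sizeof(page))) > 0)
 *		-- process r bytes of record payload, in timestamp order
 *	-- r == 0 means all buffers are finalized (end of file)
 *	close(fd);
 */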