From: Mathieu Desnoyers Date: Thu, 16 Jun 2011 19:29:28 +0000 (-0400) Subject: lib ring buffer initial import X-Git-Url: https://git.lttng.org./?a=commitdiff_plain;h=852c29366c62c78f78af261f0287afcd92eae6b5;p=ust.git lib ring buffer initial import Signed-off-by: Mathieu Desnoyers --- diff --git a/libringbuffer/api.h b/libringbuffer/api.h new file mode 100644 index 0000000..f8a1145 --- /dev/null +++ b/libringbuffer/api.h @@ -0,0 +1,25 @@ +#ifndef _LINUX_RING_BUFFER_API_H +#define _LINUX_RING_BUFFER_API_H + +/* + * linux/ringbuffer/api.h + * + * Copyright (C) 2010 - Mathieu Desnoyers "mathieu.desnoyers@efficios.com" + * + * Ring Buffer API. + * + * Dual LGPL v2.1/GPL v2 license. + */ + +#include "../../wrapper/ringbuffer/backend.h" +#include "../../wrapper/ringbuffer/frontend.h" +#include "../../wrapper/ringbuffer/vfs.h" + +/* + * ring_buffer_frontend_api.h contains static inline functions that depend on + * client static inlines. Hence the inclusion of this "api" header only + * within the client. + */ +#include "../../wrapper/ringbuffer/frontend_api.h" + +#endif /* _LINUX_RING_BUFFER_API_H */ diff --git a/libringbuffer/backend.h b/libringbuffer/backend.h new file mode 100644 index 0000000..47bc179 --- /dev/null +++ b/libringbuffer/backend.h @@ -0,0 +1,141 @@ +#ifndef _LINUX_RING_BUFFER_BACKEND_H +#define _LINUX_RING_BUFFER_BACKEND_H + +/* + * linux/ringbuffer/backend.h + * + * Copyright (C) 2008-2010 - Mathieu Desnoyers + * + * Ring buffer backend (API). + * + * Dual LGPL v2.1/GPL v2 license. + * + * Credits to Steven Rostedt for proposing to use an extra-subbuffer owned by + * the reader in flight recorder mode. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* Internal helpers */ +#include "../../wrapper/ringbuffer/backend_internal.h" +#include "../../wrapper/ringbuffer/frontend_internal.h" + +/* Ring buffer backend API */ + +/* Ring buffer backend access (read/write) */ + +extern size_t lib_ring_buffer_read(struct lib_ring_buffer_backend *bufb, + size_t offset, void *dest, size_t len); + +extern int __lib_ring_buffer_copy_to_user(struct lib_ring_buffer_backend *bufb, + size_t offset, void __user *dest, + size_t len); + +extern int lib_ring_buffer_read_cstr(struct lib_ring_buffer_backend *bufb, + size_t offset, void *dest, size_t len); + +extern struct page ** +lib_ring_buffer_read_get_page(struct lib_ring_buffer_backend *bufb, size_t offset, + void ***virt); + +/* + * Return the address where a given offset is located. + * Should be used to get the current subbuffer header pointer. Given we know + * it's never on a page boundary, it's safe to write directly to this address, + * as long as the write is never bigger than a page size. + */ +extern void * +lib_ring_buffer_offset_address(struct lib_ring_buffer_backend *bufb, + size_t offset); +extern void * +lib_ring_buffer_read_offset_address(struct lib_ring_buffer_backend *bufb, + size_t offset); + +/** + * lib_ring_buffer_write - write data to a buffer backend + * @config : ring buffer instance configuration + * @ctx: ring buffer context. (input arguments only) + * @src : source pointer to copy from + * @len : length of data to copy + * + * This function copies "len" bytes of data from a source pointer to a buffer + * backend, at the current context offset. This is more or less a buffer + * backend-specific memcpy() operation. Calls the slow path (_ring_buffer_write) + * if copy is crossing a page boundary. + */ +static inline +void lib_ring_buffer_write(const struct lib_ring_buffer_config *config, + struct lib_ring_buffer_ctx *ctx, + const void *src, size_t len) +{ + struct lib_ring_buffer_backend *bufb = &ctx->buf->backend; + struct channel_backend *chanb = &ctx->chan->backend; + size_t sbidx, index; + size_t offset = ctx->buf_offset; + ssize_t pagecpy; + struct lib_ring_buffer_backend_pages *rpages; + unsigned long sb_bindex, id; + + offset &= chanb->buf_size - 1; + sbidx = offset >> chanb->subbuf_size_order; + index = (offset & (chanb->subbuf_size - 1)) >> PAGE_SHIFT; + pagecpy = min_t(size_t, len, (-offset) & ~PAGE_MASK); + id = bufb->buf_wsb[sbidx].id; + sb_bindex = subbuffer_id_get_index(config, id); + rpages = bufb->array[sb_bindex]; + CHAN_WARN_ON(ctx->chan, + config->mode == RING_BUFFER_OVERWRITE + && subbuffer_id_is_noref(config, id)); + if (likely(pagecpy == len)) + lib_ring_buffer_do_copy(config, + rpages->p[index].virt + + (offset & ~PAGE_MASK), + src, len); + else + _lib_ring_buffer_write(bufb, offset, src, len, 0); + ctx->buf_offset += len; +} + +/* + * This accessor counts the number of unread records in a buffer. + * It only provides a consistent value if no reads not writes are performed + * concurrently. + */ +static inline +unsigned long lib_ring_buffer_get_records_unread( + const struct lib_ring_buffer_config *config, + struct lib_ring_buffer *buf) +{ + struct lib_ring_buffer_backend *bufb = &buf->backend; + struct lib_ring_buffer_backend_pages *pages; + unsigned long records_unread = 0, sb_bindex, id; + unsigned int i; + + for (i = 0; i < bufb->chan->backend.num_subbuf; i++) { + id = bufb->buf_wsb[i].id; + sb_bindex = subbuffer_id_get_index(config, id); + pages = bufb->array[sb_bindex]; + records_unread += v_read(config, &pages->records_unread); + } + if (config->mode == RING_BUFFER_OVERWRITE) { + id = bufb->buf_rsb.id; + sb_bindex = subbuffer_id_get_index(config, id); + pages = bufb->array[sb_bindex]; + records_unread += v_read(config, &pages->records_unread); + } + return records_unread; +} + +ssize_t lib_ring_buffer_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, unsigned int flags); +loff_t lib_ring_buffer_no_llseek(struct file *file, loff_t offset, int origin); + +#endif /* _LINUX_RING_BUFFER_BACKEND_H */ diff --git a/libringbuffer/backend_internal.h b/libringbuffer/backend_internal.h new file mode 100644 index 0000000..d6c1c82 --- /dev/null +++ b/libringbuffer/backend_internal.h @@ -0,0 +1,417 @@ +#ifndef _LINUX_RING_BUFFER_BACKEND_INTERNAL_H +#define _LINUX_RING_BUFFER_BACKEND_INTERNAL_H + +/* + * linux/ringbuffer/backend_internal.h + * + * Copyright (C) 2008-2010 - Mathieu Desnoyers + * + * Ring buffer backend (internal helpers). + * + * Dual LGPL v2.1/GPL v2 license. + */ + +#include "../../wrapper/ringbuffer/config.h" +#include "../../wrapper//ringbuffer/backend_types.h" +#include "../../wrapper/ringbuffer/frontend_types.h" +#include + +/* Ring buffer backend API presented to the frontend */ + +/* Ring buffer and channel backend create/free */ + +int lib_ring_buffer_backend_create(struct lib_ring_buffer_backend *bufb, + struct channel_backend *chan, int cpu); +void channel_backend_unregister_notifiers(struct channel_backend *chanb); +void lib_ring_buffer_backend_free(struct lib_ring_buffer_backend *bufb); +int channel_backend_init(struct channel_backend *chanb, + const char *name, + const struct lib_ring_buffer_config *config, + void *priv, size_t subbuf_size, + size_t num_subbuf); +void channel_backend_free(struct channel_backend *chanb); + +void lib_ring_buffer_backend_reset(struct lib_ring_buffer_backend *bufb); +void channel_backend_reset(struct channel_backend *chanb); + +int lib_ring_buffer_backend_init(void); +void lib_ring_buffer_backend_exit(void); + +extern void _lib_ring_buffer_write(struct lib_ring_buffer_backend *bufb, + size_t offset, const void *src, size_t len, + ssize_t pagecpy); + +/* + * Subbuffer ID bits for overwrite mode. Need to fit within a single word to be + * exchanged atomically. + * + * Top half word, except lowest bit, belongs to "offset", which is used to keep + * to count the produced buffers. For overwrite mode, this provides the + * consumer with the capacity to read subbuffers in order, handling the + * situation where producers would write up to 2^15 buffers (or 2^31 for 64-bit + * systems) concurrently with a single execution of get_subbuf (between offset + * sampling and subbuffer ID exchange). + */ + +#define HALF_ULONG_BITS (BITS_PER_LONG >> 1) + +#define SB_ID_OFFSET_SHIFT (HALF_ULONG_BITS + 1) +#define SB_ID_OFFSET_COUNT (1UL << SB_ID_OFFSET_SHIFT) +#define SB_ID_OFFSET_MASK (~(SB_ID_OFFSET_COUNT - 1)) +/* + * Lowest bit of top word half belongs to noref. Used only for overwrite mode. + */ +#define SB_ID_NOREF_SHIFT (SB_ID_OFFSET_SHIFT - 1) +#define SB_ID_NOREF_COUNT (1UL << SB_ID_NOREF_SHIFT) +#define SB_ID_NOREF_MASK SB_ID_NOREF_COUNT +/* + * In overwrite mode: lowest half of word is used for index. + * Limit of 2^16 subbuffers per buffer on 32-bit, 2^32 on 64-bit. + * In producer-consumer mode: whole word used for index. + */ +#define SB_ID_INDEX_SHIFT 0 +#define SB_ID_INDEX_COUNT (1UL << SB_ID_INDEX_SHIFT) +#define SB_ID_INDEX_MASK (SB_ID_NOREF_COUNT - 1) + +/* + * Construct the subbuffer id from offset, index and noref. Use only the index + * for producer-consumer mode (offset and noref are only used in overwrite + * mode). + */ +static inline +unsigned long subbuffer_id(const struct lib_ring_buffer_config *config, + unsigned long offset, unsigned long noref, + unsigned long index) +{ + if (config->mode == RING_BUFFER_OVERWRITE) + return (offset << SB_ID_OFFSET_SHIFT) + | (noref << SB_ID_NOREF_SHIFT) + | index; + else + return index; +} + +/* + * Compare offset with the offset contained within id. Return 1 if the offset + * bits are identical, else 0. + */ +static inline +int subbuffer_id_compare_offset(const struct lib_ring_buffer_config *config, + unsigned long id, unsigned long offset) +{ + return (id & SB_ID_OFFSET_MASK) == (offset << SB_ID_OFFSET_SHIFT); +} + +static inline +unsigned long subbuffer_id_get_index(const struct lib_ring_buffer_config *config, + unsigned long id) +{ + if (config->mode == RING_BUFFER_OVERWRITE) + return id & SB_ID_INDEX_MASK; + else + return id; +} + +static inline +unsigned long subbuffer_id_is_noref(const struct lib_ring_buffer_config *config, + unsigned long id) +{ + if (config->mode == RING_BUFFER_OVERWRITE) + return !!(id & SB_ID_NOREF_MASK); + else + return 1; +} + +/* + * Only used by reader on subbuffer ID it has exclusive access to. No volatile + * needed. + */ +static inline +void subbuffer_id_set_noref(const struct lib_ring_buffer_config *config, + unsigned long *id) +{ + if (config->mode == RING_BUFFER_OVERWRITE) + *id |= SB_ID_NOREF_MASK; +} + +static inline +void subbuffer_id_set_noref_offset(const struct lib_ring_buffer_config *config, + unsigned long *id, unsigned long offset) +{ + unsigned long tmp; + + if (config->mode == RING_BUFFER_OVERWRITE) { + tmp = *id; + tmp &= ~SB_ID_OFFSET_MASK; + tmp |= offset << SB_ID_OFFSET_SHIFT; + tmp |= SB_ID_NOREF_MASK; + /* Volatile store, read concurrently by readers. */ + ACCESS_ONCE(*id) = tmp; + } +} + +/* No volatile access, since already used locally */ +static inline +void subbuffer_id_clear_noref(const struct lib_ring_buffer_config *config, + unsigned long *id) +{ + if (config->mode == RING_BUFFER_OVERWRITE) + *id &= ~SB_ID_NOREF_MASK; +} + +/* + * For overwrite mode, cap the number of subbuffers per buffer to: + * 2^16 on 32-bit architectures + * 2^32 on 64-bit architectures + * This is required to fit in the index part of the ID. Return 0 on success, + * -EPERM on failure. + */ +static inline +int subbuffer_id_check_index(const struct lib_ring_buffer_config *config, + unsigned long num_subbuf) +{ + if (config->mode == RING_BUFFER_OVERWRITE) + return (num_subbuf > (1UL << HALF_ULONG_BITS)) ? -EPERM : 0; + else + return 0; +} + +static inline +void subbuffer_count_record(const struct lib_ring_buffer_config *config, + struct lib_ring_buffer_backend *bufb, + unsigned long idx) +{ + unsigned long sb_bindex; + + sb_bindex = subbuffer_id_get_index(config, bufb->buf_wsb[idx].id); + v_inc(config, &bufb->array[sb_bindex]->records_commit); +} + +/* + * Reader has exclusive subbuffer access for record consumption. No need to + * perform the decrement atomically. + */ +static inline +void subbuffer_consume_record(const struct lib_ring_buffer_config *config, + struct lib_ring_buffer_backend *bufb) +{ + unsigned long sb_bindex; + + sb_bindex = subbuffer_id_get_index(config, bufb->buf_rsb.id); + CHAN_WARN_ON(bufb->chan, + !v_read(config, &bufb->array[sb_bindex]->records_unread)); + /* Non-atomic decrement protected by exclusive subbuffer access */ + _v_dec(config, &bufb->array[sb_bindex]->records_unread); + v_inc(config, &bufb->records_read); +} + +static inline +unsigned long subbuffer_get_records_count( + const struct lib_ring_buffer_config *config, + struct lib_ring_buffer_backend *bufb, + unsigned long idx) +{ + unsigned long sb_bindex; + + sb_bindex = subbuffer_id_get_index(config, bufb->buf_wsb[idx].id); + return v_read(config, &bufb->array[sb_bindex]->records_commit); +} + +/* + * Must be executed at subbuffer delivery when the writer has _exclusive_ + * subbuffer access. See ring_buffer_check_deliver() for details. + * ring_buffer_get_records_count() must be called to get the records count + * before this function, because it resets the records_commit count. + */ +static inline +unsigned long subbuffer_count_records_overrun( + const struct lib_ring_buffer_config *config, + struct lib_ring_buffer_backend *bufb, + unsigned long idx) +{ + struct lib_ring_buffer_backend_pages *pages; + unsigned long overruns, sb_bindex; + + sb_bindex = subbuffer_id_get_index(config, bufb->buf_wsb[idx].id); + pages = bufb->array[sb_bindex]; + overruns = v_read(config, &pages->records_unread); + v_set(config, &pages->records_unread, + v_read(config, &pages->records_commit)); + v_set(config, &pages->records_commit, 0); + + return overruns; +} + +static inline +void subbuffer_set_data_size(const struct lib_ring_buffer_config *config, + struct lib_ring_buffer_backend *bufb, + unsigned long idx, + unsigned long data_size) +{ + struct lib_ring_buffer_backend_pages *pages; + unsigned long sb_bindex; + + sb_bindex = subbuffer_id_get_index(config, bufb->buf_wsb[idx].id); + pages = bufb->array[sb_bindex]; + pages->data_size = data_size; +} + +static inline +unsigned long subbuffer_get_read_data_size( + const struct lib_ring_buffer_config *config, + struct lib_ring_buffer_backend *bufb) +{ + struct lib_ring_buffer_backend_pages *pages; + unsigned long sb_bindex; + + sb_bindex = subbuffer_id_get_index(config, bufb->buf_rsb.id); + pages = bufb->array[sb_bindex]; + return pages->data_size; +} + +static inline +unsigned long subbuffer_get_data_size( + const struct lib_ring_buffer_config *config, + struct lib_ring_buffer_backend *bufb, + unsigned long idx) +{ + struct lib_ring_buffer_backend_pages *pages; + unsigned long sb_bindex; + + sb_bindex = subbuffer_id_get_index(config, bufb->buf_wsb[idx].id); + pages = bufb->array[sb_bindex]; + return pages->data_size; +} + +/** + * lib_ring_buffer_clear_noref - Clear the noref subbuffer flag, called by + * writer. + */ +static inline +void lib_ring_buffer_clear_noref(const struct lib_ring_buffer_config *config, + struct lib_ring_buffer_backend *bufb, + unsigned long idx) +{ + unsigned long id, new_id; + + if (config->mode != RING_BUFFER_OVERWRITE) + return; + + /* + * Performing a volatile access to read the sb_pages, because we want to + * read a coherent version of the pointer and the associated noref flag. + */ + id = ACCESS_ONCE(bufb->buf_wsb[idx].id); + for (;;) { + /* This check is called on the fast path for each record. */ + if (likely(!subbuffer_id_is_noref(config, id))) { + /* + * Store after load dependency ordering the writes to + * the subbuffer after load and test of the noref flag + * matches the memory barrier implied by the cmpxchg() + * in update_read_sb_index(). + */ + return; /* Already writing to this buffer */ + } + new_id = id; + subbuffer_id_clear_noref(config, &new_id); + new_id = cmpxchg(&bufb->buf_wsb[idx].id, id, new_id); + if (likely(new_id == id)) + break; + id = new_id; + } +} + +/** + * lib_ring_buffer_set_noref_offset - Set the noref subbuffer flag and offset, + * called by writer. + */ +static inline +void lib_ring_buffer_set_noref_offset(const struct lib_ring_buffer_config *config, + struct lib_ring_buffer_backend *bufb, + unsigned long idx, unsigned long offset) +{ + if (config->mode != RING_BUFFER_OVERWRITE) + return; + + /* + * Because ring_buffer_set_noref() is only called by a single thread + * (the one which updated the cc_sb value), there are no concurrent + * updates to take care of: other writers have not updated cc_sb, so + * they cannot set the noref flag, and concurrent readers cannot modify + * the pointer because the noref flag is not set yet. + * The smp_wmb() in ring_buffer_commit() takes care of ordering writes + * to the subbuffer before this set noref operation. + * subbuffer_set_noref() uses a volatile store to deal with concurrent + * readers of the noref flag. + */ + CHAN_WARN_ON(bufb->chan, + subbuffer_id_is_noref(config, bufb->buf_wsb[idx].id)); + /* + * Memory barrier that ensures counter stores are ordered before set + * noref and offset. + */ + smp_mb(); + subbuffer_id_set_noref_offset(config, &bufb->buf_wsb[idx].id, offset); +} + +/** + * update_read_sb_index - Read-side subbuffer index update. + */ +static inline +int update_read_sb_index(const struct lib_ring_buffer_config *config, + struct lib_ring_buffer_backend *bufb, + struct channel_backend *chanb, + unsigned long consumed_idx, + unsigned long consumed_count) +{ + unsigned long old_id, new_id; + + if (config->mode == RING_BUFFER_OVERWRITE) { + /* + * Exchange the target writer subbuffer with our own unused + * subbuffer. No need to use ACCESS_ONCE() here to read the + * old_wpage, because the value read will be confirmed by the + * following cmpxchg(). + */ + old_id = bufb->buf_wsb[consumed_idx].id; + if (unlikely(!subbuffer_id_is_noref(config, old_id))) + return -EAGAIN; + /* + * Make sure the offset count we are expecting matches the one + * indicated by the writer. + */ + if (unlikely(!subbuffer_id_compare_offset(config, old_id, + consumed_count))) + return -EAGAIN; + CHAN_WARN_ON(bufb->chan, + !subbuffer_id_is_noref(config, bufb->buf_rsb.id)); + subbuffer_id_set_noref_offset(config, &bufb->buf_rsb.id, + consumed_count); + new_id = cmpxchg(&bufb->buf_wsb[consumed_idx].id, old_id, + bufb->buf_rsb.id); + if (unlikely(old_id != new_id)) + return -EAGAIN; + bufb->buf_rsb.id = new_id; + } else { + /* No page exchange, use the writer page directly */ + bufb->buf_rsb.id = bufb->buf_wsb[consumed_idx].id; + } + return 0; +} + +/* + * Use the architecture-specific memcpy implementation for constant-sized + * inputs, but rely on an inline memcpy for length statically unknown. + * The function call to memcpy is just way too expensive for a fast path. + */ +#define lib_ring_buffer_do_copy(config, dest, src, len) \ +do { \ + size_t __len = (len); \ + if (__builtin_constant_p(len)) \ + memcpy(dest, src, __len); \ + else \ + inline_memcpy(dest, src, __len); \ +} while (0) + +#endif /* _LINUX_RING_BUFFER_BACKEND_INTERNAL_H */ diff --git a/libringbuffer/backend_types.h b/libringbuffer/backend_types.h new file mode 100644 index 0000000..1d301de --- /dev/null +++ b/libringbuffer/backend_types.h @@ -0,0 +1,80 @@ +#ifndef _LINUX_RING_BUFFER_BACKEND_TYPES_H +#define _LINUX_RING_BUFFER_BACKEND_TYPES_H + +/* + * linux/ringbuffer/backend_types.h + * + * Copyright (C) 2008-2010 - Mathieu Desnoyers + * + * Ring buffer backend (types). + * + * Dual LGPL v2.1/GPL v2 license. + */ + +#include +#include + +struct lib_ring_buffer_backend_page { + void *virt; /* page virtual address (cached) */ + struct page *page; /* pointer to page structure */ +}; + +struct lib_ring_buffer_backend_pages { + unsigned long mmap_offset; /* offset of the subbuffer in mmap */ + union v_atomic records_commit; /* current records committed count */ + union v_atomic records_unread; /* records to read */ + unsigned long data_size; /* Amount of data to read from subbuf */ + struct lib_ring_buffer_backend_page p[]; +}; + +struct lib_ring_buffer_backend_subbuffer { + /* Identifier for subbuf backend pages. Exchanged atomically. */ + unsigned long id; /* backend subbuffer identifier */ +}; + +/* + * Forward declaration of frontend-specific channel and ring_buffer. + */ +struct channel; +struct lib_ring_buffer; + +struct lib_ring_buffer_backend { + /* Array of ring_buffer_backend_subbuffer for writer */ + struct lib_ring_buffer_backend_subbuffer *buf_wsb; + /* ring_buffer_backend_subbuffer for reader */ + struct lib_ring_buffer_backend_subbuffer buf_rsb; + /* + * Pointer array of backend pages, for whole buffer. + * Indexed by ring_buffer_backend_subbuffer identifier (id) index. + */ + struct lib_ring_buffer_backend_pages **array; + unsigned int num_pages_per_subbuf; + + struct channel *chan; /* Associated channel */ + int cpu; /* This buffer's cpu. -1 if global. */ + union v_atomic records_read; /* Number of records read */ + unsigned int allocated:1; /* Bool: is buffer allocated ? */ +}; + +struct channel_backend { + unsigned long buf_size; /* Size of the buffer */ + unsigned long subbuf_size; /* Sub-buffer size */ + unsigned int subbuf_size_order; /* Order of sub-buffer size */ + unsigned int num_subbuf_order; /* + * Order of number of sub-buffers/buffer + * for writer. + */ + unsigned int buf_size_order; /* Order of buffer size */ + int extra_reader_sb:1; /* Bool: has extra reader subbuffer */ + struct lib_ring_buffer *buf; /* Channel per-cpu buffers */ + + unsigned long num_subbuf; /* Number of sub-buffers for writer */ + u64 start_tsc; /* Channel creation TSC value */ + void *priv; /* Client-specific information */ + struct notifier_block cpu_hp_notifier; /* CPU hotplug notifier */ + const struct lib_ring_buffer_config *config; /* Ring buffer configuration */ + cpumask_var_t cpumask; /* Allocated per-cpu buffers cpumask */ + char name[NAME_MAX]; /* Channel name */ +}; + +#endif /* _LINUX_RING_BUFFER_BACKEND_TYPES_H */ diff --git a/libringbuffer/frontend.h b/libringbuffer/frontend.h new file mode 100644 index 0000000..01af77a --- /dev/null +++ b/libringbuffer/frontend.h @@ -0,0 +1,228 @@ +#ifndef _LINUX_RING_BUFFER_FRONTEND_H +#define _LINUX_RING_BUFFER_FRONTEND_H + +/* + * linux/ringbuffer/frontend.h + * + * (C) Copyright 2005-2010 - Mathieu Desnoyers + * + * Ring Buffer Library Synchronization Header (API). + * + * Author: + * Mathieu Desnoyers + * + * See ring_buffer_frontend.c for more information on wait-free algorithms. + * + * Dual LGPL v2.1/GPL v2 license. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +/* Internal helpers */ +#include "../../wrapper/ringbuffer/frontend_internal.h" + +/* Buffer creation/removal and setup operations */ + +/* + * switch_timer_interval is the time interval (in us) to fill sub-buffers with + * padding to let readers get those sub-buffers. Used for live streaming. + * + * read_timer_interval is the time interval (in us) to wake up pending readers. + * + * buf_addr is a pointer the the beginning of the preallocated buffer contiguous + * address mapping. It is used only by RING_BUFFER_STATIC configuration. It can + * be set to NULL for other backends. + */ + +extern +struct channel *channel_create(const struct lib_ring_buffer_config *config, + const char *name, void *priv, + void *buf_addr, + size_t subbuf_size, size_t num_subbuf, + unsigned int switch_timer_interval, + unsigned int read_timer_interval); + +/* + * channel_destroy returns the private data pointer. It finalizes all channel's + * buffers, waits for readers to release all references, and destroys the + * channel. + */ +extern +void *channel_destroy(struct channel *chan); + + +/* Buffer read operations */ + +/* + * Iteration on channel cpumask needs to issue a read barrier to match the write + * barrier in cpu hotplug. It orders the cpumask read before read of per-cpu + * buffer data. The per-cpu buffer is never removed by cpu hotplug; teardown is + * only performed at channel destruction. + */ +#define for_each_channel_cpu(cpu, chan) \ + for ((cpu) = -1; \ + ({ (cpu) = cpumask_next(cpu, (chan)->backend.cpumask); \ + smp_read_barrier_depends(); (cpu) < nr_cpu_ids; });) + +extern struct lib_ring_buffer *channel_get_ring_buffer( + const struct lib_ring_buffer_config *config, + struct channel *chan, int cpu); +extern int lib_ring_buffer_open_read(struct lib_ring_buffer *buf); +extern void lib_ring_buffer_release_read(struct lib_ring_buffer *buf); + +/* + * Read sequence: snapshot, many get_subbuf/put_subbuf, move_consumer. + */ +extern int lib_ring_buffer_snapshot(struct lib_ring_buffer *buf, + unsigned long *consumed, + unsigned long *produced); +extern void lib_ring_buffer_move_consumer(struct lib_ring_buffer *buf, + unsigned long consumed_new); + +extern int lib_ring_buffer_get_subbuf(struct lib_ring_buffer *buf, + unsigned long consumed); +extern void lib_ring_buffer_put_subbuf(struct lib_ring_buffer *buf); + +/* + * lib_ring_buffer_get_next_subbuf/lib_ring_buffer_put_next_subbuf are helpers + * to read sub-buffers sequentially. + */ +static inline int lib_ring_buffer_get_next_subbuf(struct lib_ring_buffer *buf) +{ + int ret; + + ret = lib_ring_buffer_snapshot(buf, &buf->cons_snapshot, + &buf->prod_snapshot); + if (ret) + return ret; + ret = lib_ring_buffer_get_subbuf(buf, buf->cons_snapshot); + return ret; +} + +static inline void lib_ring_buffer_put_next_subbuf(struct lib_ring_buffer *buf) +{ + lib_ring_buffer_put_subbuf(buf); + lib_ring_buffer_move_consumer(buf, subbuf_align(buf->cons_snapshot, + buf->backend.chan)); +} + +extern void channel_reset(struct channel *chan); +extern void lib_ring_buffer_reset(struct lib_ring_buffer *buf); + +static inline +unsigned long lib_ring_buffer_get_offset(const struct lib_ring_buffer_config *config, + struct lib_ring_buffer *buf) +{ + return v_read(config, &buf->offset); +} + +static inline +unsigned long lib_ring_buffer_get_consumed(const struct lib_ring_buffer_config *config, + struct lib_ring_buffer *buf) +{ + return atomic_long_read(&buf->consumed); +} + +/* + * Must call lib_ring_buffer_is_finalized before reading counters (memory + * ordering enforced with respect to trace teardown). + */ +static inline +int lib_ring_buffer_is_finalized(const struct lib_ring_buffer_config *config, + struct lib_ring_buffer *buf) +{ + int finalized = ACCESS_ONCE(buf->finalized); + /* + * Read finalized before counters. + */ + smp_rmb(); + return finalized; +} + +static inline +int lib_ring_buffer_channel_is_finalized(const struct channel *chan) +{ + return chan->finalized; +} + +static inline +int lib_ring_buffer_channel_is_disabled(const struct channel *chan) +{ + return atomic_read(&chan->record_disabled); +} + +static inline +unsigned long lib_ring_buffer_get_read_data_size( + const struct lib_ring_buffer_config *config, + struct lib_ring_buffer *buf) +{ + return subbuffer_get_read_data_size(config, &buf->backend); +} + +static inline +unsigned long lib_ring_buffer_get_records_count( + const struct lib_ring_buffer_config *config, + struct lib_ring_buffer *buf) +{ + return v_read(config, &buf->records_count); +} + +static inline +unsigned long lib_ring_buffer_get_records_overrun( + const struct lib_ring_buffer_config *config, + struct lib_ring_buffer *buf) +{ + return v_read(config, &buf->records_overrun); +} + +static inline +unsigned long lib_ring_buffer_get_records_lost_full( + const struct lib_ring_buffer_config *config, + struct lib_ring_buffer *buf) +{ + return v_read(config, &buf->records_lost_full); +} + +static inline +unsigned long lib_ring_buffer_get_records_lost_wrap( + const struct lib_ring_buffer_config *config, + struct lib_ring_buffer *buf) +{ + return v_read(config, &buf->records_lost_wrap); +} + +static inline +unsigned long lib_ring_buffer_get_records_lost_big( + const struct lib_ring_buffer_config *config, + struct lib_ring_buffer *buf) +{ + return v_read(config, &buf->records_lost_big); +} + +static inline +unsigned long lib_ring_buffer_get_records_read( + const struct lib_ring_buffer_config *config, + struct lib_ring_buffer *buf) +{ + return v_read(config, &buf->backend.records_read); +} + +#endif /* _LINUX_RING_BUFFER_FRONTEND_H */ diff --git a/libringbuffer/frontend_api.h b/libringbuffer/frontend_api.h new file mode 100644 index 0000000..391e593 --- /dev/null +++ b/libringbuffer/frontend_api.h @@ -0,0 +1,358 @@ +#ifndef _LINUX_RING_BUFFER_FRONTEND_API_H +#define _LINUX_RING_BUFFER_FRONTEND_API_H + +/* + * linux/ringbuffer/frontend_api.h + * + * (C) Copyright 2005-2010 - Mathieu Desnoyers + * + * Ring Buffer Library Synchronization Header (buffer write API). + * + * Author: + * Mathieu Desnoyers + * + * See ring_buffer_frontend.c for more information on wait-free algorithms. + * See linux/ringbuffer/frontend.h for channel allocation and read-side API. + * + * Dual LGPL v2.1/GPL v2 license. + */ + +#include "../../wrapper/ringbuffer/frontend.h" +#include + +/** + * lib_ring_buffer_get_cpu - Precedes ring buffer reserve/commit. + * + * Disables preemption (acts as a RCU read-side critical section) and keeps a + * ring buffer nesting count as supplementary safety net to ensure tracer client + * code will never trigger an endless recursion. Returns the processor ID on + * success, -EPERM on failure (nesting count too high). + * + * asm volatile and "memory" clobber prevent the compiler from moving + * instructions out of the ring buffer nesting count. This is required to ensure + * that probe side-effects which can cause recursion (e.g. unforeseen traps, + * divisions by 0, ...) are triggered within the incremented nesting count + * section. + */ +static inline +int lib_ring_buffer_get_cpu(const struct lib_ring_buffer_config *config) +{ + int cpu, nesting; + + rcu_read_lock_sched_notrace(); + cpu = smp_processor_id(); + nesting = ++per_cpu(lib_ring_buffer_nesting, cpu); + barrier(); + + if (unlikely(nesting > 4)) { + WARN_ON_ONCE(1); + per_cpu(lib_ring_buffer_nesting, cpu)--; + rcu_read_unlock_sched_notrace(); + return -EPERM; + } else + return cpu; +} + +/** + * lib_ring_buffer_put_cpu - Follows ring buffer reserve/commit. + */ +static inline +void lib_ring_buffer_put_cpu(const struct lib_ring_buffer_config *config) +{ + barrier(); + __get_cpu_var(lib_ring_buffer_nesting)--; + rcu_read_unlock_sched_notrace(); +} + +/* + * lib_ring_buffer_try_reserve is called by lib_ring_buffer_reserve(). It is not + * part of the API per se. + * + * returns 0 if reserve ok, or 1 if the slow path must be taken. + */ +static inline +int lib_ring_buffer_try_reserve(const struct lib_ring_buffer_config *config, + struct lib_ring_buffer_ctx *ctx, + unsigned long *o_begin, unsigned long *o_end, + unsigned long *o_old, size_t *before_hdr_pad) +{ + struct channel *chan = ctx->chan; + struct lib_ring_buffer *buf = ctx->buf; + *o_begin = v_read(config, &buf->offset); + *o_old = *o_begin; + + ctx->tsc = lib_ring_buffer_clock_read(chan); + if ((int64_t) ctx->tsc == -EIO) + return 1; + + /* + * Prefetch cacheline for read because we have to read the previous + * commit counter to increment it and commit seq value to compare it to + * the commit counter. + */ + prefetch(&buf->commit_hot[subbuf_index(*o_begin, chan)]); + + if (last_tsc_overflow(config, buf, ctx->tsc)) + ctx->rflags |= RING_BUFFER_RFLAG_FULL_TSC; + + if (unlikely(subbuf_offset(*o_begin, chan) == 0)) + return 1; + + ctx->slot_size = record_header_size(config, chan, *o_begin, + before_hdr_pad, ctx); + ctx->slot_size += + lib_ring_buffer_align(*o_begin + ctx->slot_size, + ctx->largest_align) + ctx->data_size; + if (unlikely((subbuf_offset(*o_begin, chan) + ctx->slot_size) + > chan->backend.subbuf_size)) + return 1; + + /* + * Record fits in the current buffer and we are not on a switch + * boundary. It's safe to write. + */ + *o_end = *o_begin + ctx->slot_size; + + if (unlikely((subbuf_offset(*o_end, chan)) == 0)) + /* + * The offset_end will fall at the very beginning of the next + * subbuffer. + */ + return 1; + + return 0; +} + +/** + * lib_ring_buffer_reserve - Reserve space in a ring buffer. + * @config: ring buffer instance configuration. + * @ctx: ring buffer context. (input and output) Must be already initialized. + * + * Atomic wait-free slot reservation. The reserved space starts at the context + * "pre_offset". Its length is "slot_size". The associated time-stamp is "tsc". + * + * Return : + * 0 on success. + * -EAGAIN if channel is disabled. + * -ENOSPC if event size is too large for packet. + * -ENOBUFS if there is currently not enough space in buffer for the event. + * -EIO if data cannot be written into the buffer for any other reason. + */ + +static inline +int lib_ring_buffer_reserve(const struct lib_ring_buffer_config *config, + struct lib_ring_buffer_ctx *ctx) +{ + struct channel *chan = ctx->chan; + struct lib_ring_buffer *buf; + unsigned long o_begin, o_end, o_old; + size_t before_hdr_pad = 0; + + if (atomic_read(&chan->record_disabled)) + return -EAGAIN; + + if (config->alloc == RING_BUFFER_ALLOC_PER_CPU) + buf = per_cpu_ptr(chan->backend.buf, ctx->cpu); + else + buf = chan->backend.buf; + if (atomic_read(&buf->record_disabled)) + return -EAGAIN; + ctx->buf = buf; + + /* + * Perform retryable operations. + */ + if (unlikely(lib_ring_buffer_try_reserve(config, ctx, &o_begin, + &o_end, &o_old, &before_hdr_pad))) + goto slow_path; + + if (unlikely(v_cmpxchg(config, &ctx->buf->offset, o_old, o_end) + != o_old)) + goto slow_path; + + /* + * Atomically update last_tsc. This update races against concurrent + * atomic updates, but the race will always cause supplementary full TSC + * record headers, never the opposite (missing a full TSC record header + * when it would be needed). + */ + save_last_tsc(config, ctx->buf, ctx->tsc); + + /* + * Push the reader if necessary + */ + lib_ring_buffer_reserve_push_reader(ctx->buf, chan, o_end - 1); + + /* + * Clear noref flag for this subbuffer. + */ + lib_ring_buffer_clear_noref(config, &ctx->buf->backend, + subbuf_index(o_end - 1, chan)); + + ctx->pre_offset = o_begin; + ctx->buf_offset = o_begin + before_hdr_pad; + return 0; +slow_path: + return lib_ring_buffer_reserve_slow(ctx); +} + +/** + * lib_ring_buffer_switch - Perform a sub-buffer switch for a per-cpu buffer. + * @config: ring buffer instance configuration. + * @buf: buffer + * @mode: buffer switch mode (SWITCH_ACTIVE or SWITCH_FLUSH) + * + * This operation is completely reentrant : can be called while tracing is + * active with absolutely no lock held. + * + * Note, however, that as a v_cmpxchg is used for some atomic operations and + * requires to be executed locally for per-CPU buffers, this function must be + * called from the CPU which owns the buffer for a ACTIVE flush, with preemption + * disabled, for RING_BUFFER_SYNC_PER_CPU configuration. + */ +static inline +void lib_ring_buffer_switch(const struct lib_ring_buffer_config *config, + struct lib_ring_buffer *buf, enum switch_mode mode) +{ + lib_ring_buffer_switch_slow(buf, mode); +} + +/* See ring_buffer_frontend_api.h for lib_ring_buffer_reserve(). */ + +/** + * lib_ring_buffer_commit - Commit an record. + * @config: ring buffer instance configuration. + * @ctx: ring buffer context. (input arguments only) + * + * Atomic unordered slot commit. Increments the commit count in the + * specified sub-buffer, and delivers it if necessary. + */ +static inline +void lib_ring_buffer_commit(const struct lib_ring_buffer_config *config, + const struct lib_ring_buffer_ctx *ctx) +{ + struct channel *chan = ctx->chan; + struct lib_ring_buffer *buf = ctx->buf; + unsigned long offset_end = ctx->buf_offset; + unsigned long endidx = subbuf_index(offset_end - 1, chan); + unsigned long commit_count; + + /* + * Must count record before incrementing the commit count. + */ + subbuffer_count_record(config, &buf->backend, endidx); + + /* + * Order all writes to buffer before the commit count update that will + * determine that the subbuffer is full. + */ + if (config->ipi == RING_BUFFER_IPI_BARRIER) { + /* + * Must write slot data before incrementing commit count. This + * compiler barrier is upgraded into a smp_mb() by the IPI sent + * by get_subbuf(). + */ + barrier(); + } else + smp_wmb(); + + v_add(config, ctx->slot_size, &buf->commit_hot[endidx].cc); + + /* + * commit count read can race with concurrent OOO commit count updates. + * This is only needed for lib_ring_buffer_check_deliver (for + * non-polling delivery only) and for + * lib_ring_buffer_write_commit_counter. The race can only cause the + * counter to be read with the same value more than once, which could + * cause : + * - Multiple delivery for the same sub-buffer (which is handled + * gracefully by the reader code) if the value is for a full + * sub-buffer. It's important that we can never miss a sub-buffer + * delivery. Re-reading the value after the v_add ensures this. + * - Reading a commit_count with a higher value that what was actually + * added to it for the lib_ring_buffer_write_commit_counter call + * (again caused by a concurrent committer). It does not matter, + * because this function is interested in the fact that the commit + * count reaches back the reserve offset for a specific sub-buffer, + * which is completely independent of the order. + */ + commit_count = v_read(config, &buf->commit_hot[endidx].cc); + + lib_ring_buffer_check_deliver(config, buf, chan, offset_end - 1, + commit_count, endidx); + /* + * Update used size at each commit. It's needed only for extracting + * ring_buffer buffers from vmcore, after crash. + */ + lib_ring_buffer_write_commit_counter(config, buf, chan, endidx, + ctx->buf_offset, commit_count, + ctx->slot_size); +} + +/** + * lib_ring_buffer_try_discard_reserve - Try discarding a record. + * @config: ring buffer instance configuration. + * @ctx: ring buffer context. (input arguments only) + * + * Only succeeds if no other record has been written after the record to + * discard. If discard fails, the record must be committed to the buffer. + * + * Returns 0 upon success, -EPERM if the record cannot be discarded. + */ +static inline +int lib_ring_buffer_try_discard_reserve(const struct lib_ring_buffer_config *config, + const struct lib_ring_buffer_ctx *ctx) +{ + struct lib_ring_buffer *buf = ctx->buf; + unsigned long end_offset = ctx->pre_offset + ctx->slot_size; + + /* + * We need to ensure that if the cmpxchg succeeds and discards the + * record, the next record will record a full TSC, because it cannot + * rely on the last_tsc associated with the discarded record to detect + * overflows. The only way to ensure this is to set the last_tsc to 0 + * (assuming no 64-bit TSC overflow), which forces to write a 64-bit + * timestamp in the next record. + * + * Note: if discard fails, we must leave the TSC in the record header. + * It is needed to keep track of TSC overflows for the following + * records. + */ + save_last_tsc(config, buf, 0ULL); + + if (likely(v_cmpxchg(config, &buf->offset, end_offset, ctx->pre_offset) + != end_offset)) + return -EPERM; + else + return 0; +} + +static inline +void channel_record_disable(const struct lib_ring_buffer_config *config, + struct channel *chan) +{ + atomic_inc(&chan->record_disabled); +} + +static inline +void channel_record_enable(const struct lib_ring_buffer_config *config, + struct channel *chan) +{ + atomic_dec(&chan->record_disabled); +} + +static inline +void lib_ring_buffer_record_disable(const struct lib_ring_buffer_config *config, + struct lib_ring_buffer *buf) +{ + atomic_inc(&buf->record_disabled); +} + +static inline +void lib_ring_buffer_record_enable(const struct lib_ring_buffer_config *config, + struct lib_ring_buffer *buf) +{ + atomic_dec(&buf->record_disabled); +} + +#endif /* _LINUX_RING_BUFFER_FRONTEND_API_H */ diff --git a/libringbuffer/frontend_internal.h b/libringbuffer/frontend_internal.h new file mode 100644 index 0000000..3bd5721 --- /dev/null +++ b/libringbuffer/frontend_internal.h @@ -0,0 +1,424 @@ +#ifndef _LINUX_RING_BUFFER_FRONTEND_INTERNAL_H +#define _LINUX_RING_BUFFER_FRONTEND_INTERNAL_H + +/* + * linux/ringbuffer/frontend_internal.h + * + * (C) Copyright 2005-2010 - Mathieu Desnoyers + * + * Ring Buffer Library Synchronization Header (internal helpers). + * + * Author: + * Mathieu Desnoyers + * + * See ring_buffer_frontend.c for more information on wait-free algorithms. + * + * Dual LGPL v2.1/GPL v2 license. + */ + +#include "../../wrapper/ringbuffer/config.h" +#include "../../wrapper/ringbuffer/backend_types.h" +#include "../../wrapper/ringbuffer/frontend_types.h" +#include "../../lib/prio_heap/lttng_prio_heap.h" /* For per-CPU read-side iterator */ + +/* Buffer offset macros */ + +/* buf_trunc mask selects only the buffer number. */ +static inline +unsigned long buf_trunc(unsigned long offset, struct channel *chan) +{ + return offset & ~(chan->backend.buf_size - 1); + +} + +/* Select the buffer number value (counter). */ +static inline +unsigned long buf_trunc_val(unsigned long offset, struct channel *chan) +{ + return buf_trunc(offset, chan) >> chan->backend.buf_size_order; +} + +/* buf_offset mask selects only the offset within the current buffer. */ +static inline +unsigned long buf_offset(unsigned long offset, struct channel *chan) +{ + return offset & (chan->backend.buf_size - 1); +} + +/* subbuf_offset mask selects the offset within the current subbuffer. */ +static inline +unsigned long subbuf_offset(unsigned long offset, struct channel *chan) +{ + return offset & (chan->backend.subbuf_size - 1); +} + +/* subbuf_trunc mask selects the subbuffer number. */ +static inline +unsigned long subbuf_trunc(unsigned long offset, struct channel *chan) +{ + return offset & ~(chan->backend.subbuf_size - 1); +} + +/* subbuf_align aligns the offset to the next subbuffer. */ +static inline +unsigned long subbuf_align(unsigned long offset, struct channel *chan) +{ + return (offset + chan->backend.subbuf_size) + & ~(chan->backend.subbuf_size - 1); +} + +/* subbuf_index returns the index of the current subbuffer within the buffer. */ +static inline +unsigned long subbuf_index(unsigned long offset, struct channel *chan) +{ + return buf_offset(offset, chan) >> chan->backend.subbuf_size_order; +} + +/* + * Last TSC comparison functions. Check if the current TSC overflows tsc_bits + * bits from the last TSC read. When overflows are detected, the full 64-bit + * timestamp counter should be written in the record header. Reads and writes + * last_tsc atomically. + */ + +#if (BITS_PER_LONG == 32) +static inline +void save_last_tsc(const struct lib_ring_buffer_config *config, + struct lib_ring_buffer *buf, u64 tsc) +{ + if (config->tsc_bits == 0 || config->tsc_bits == 64) + return; + + /* + * Ensure the compiler performs this update in a single instruction. + */ + v_set(config, &buf->last_tsc, (unsigned long)(tsc >> config->tsc_bits)); +} + +static inline +int last_tsc_overflow(const struct lib_ring_buffer_config *config, + struct lib_ring_buffer *buf, u64 tsc) +{ + unsigned long tsc_shifted; + + if (config->tsc_bits == 0 || config->tsc_bits == 64) + return 0; + + tsc_shifted = (unsigned long)(tsc >> config->tsc_bits); + if (unlikely(tsc_shifted + - (unsigned long)v_read(config, &buf->last_tsc))) + return 1; + else + return 0; +} +#else +static inline +void save_last_tsc(const struct lib_ring_buffer_config *config, + struct lib_ring_buffer *buf, u64 tsc) +{ + if (config->tsc_bits == 0 || config->tsc_bits == 64) + return; + + v_set(config, &buf->last_tsc, (unsigned long)tsc); +} + +static inline +int last_tsc_overflow(const struct lib_ring_buffer_config *config, + struct lib_ring_buffer *buf, u64 tsc) +{ + if (config->tsc_bits == 0 || config->tsc_bits == 64) + return 0; + + if (unlikely((tsc - v_read(config, &buf->last_tsc)) + >> config->tsc_bits)) + return 1; + else + return 0; +} +#endif + +extern +int lib_ring_buffer_reserve_slow(struct lib_ring_buffer_ctx *ctx); + +extern +void lib_ring_buffer_switch_slow(struct lib_ring_buffer *buf, + enum switch_mode mode); + +/* Buffer write helpers */ + +static inline +void lib_ring_buffer_reserve_push_reader(struct lib_ring_buffer *buf, + struct channel *chan, + unsigned long offset) +{ + unsigned long consumed_old, consumed_new; + + do { + consumed_old = atomic_long_read(&buf->consumed); + /* + * If buffer is in overwrite mode, push the reader consumed + * count if the write position has reached it and we are not + * at the first iteration (don't push the reader farther than + * the writer). This operation can be done concurrently by many + * writers in the same buffer, the writer being at the farthest + * write position sub-buffer index in the buffer being the one + * which will win this loop. + */ + if (unlikely(subbuf_trunc(offset, chan) + - subbuf_trunc(consumed_old, chan) + >= chan->backend.buf_size)) + consumed_new = subbuf_align(consumed_old, chan); + else + return; + } while (unlikely(atomic_long_cmpxchg(&buf->consumed, consumed_old, + consumed_new) != consumed_old)); +} + +static inline +void lib_ring_buffer_vmcore_check_deliver(const struct lib_ring_buffer_config *config, + struct lib_ring_buffer *buf, + unsigned long commit_count, + unsigned long idx) +{ + if (config->oops == RING_BUFFER_OOPS_CONSISTENCY) + v_set(config, &buf->commit_hot[idx].seq, commit_count); +} + +static inline +int lib_ring_buffer_poll_deliver(const struct lib_ring_buffer_config *config, + struct lib_ring_buffer *buf, + struct channel *chan) +{ + unsigned long consumed_old, consumed_idx, commit_count, write_offset; + + consumed_old = atomic_long_read(&buf->consumed); + consumed_idx = subbuf_index(consumed_old, chan); + commit_count = v_read(config, &buf->commit_cold[consumed_idx].cc_sb); + /* + * No memory barrier here, since we are only interested + * in a statistically correct polling result. The next poll will + * get the data is we are racing. The mb() that ensures correct + * memory order is in get_subbuf. + */ + write_offset = v_read(config, &buf->offset); + + /* + * Check that the subbuffer we are trying to consume has been + * already fully committed. + */ + + if (((commit_count - chan->backend.subbuf_size) + & chan->commit_count_mask) + - (buf_trunc(consumed_old, chan) + >> chan->backend.num_subbuf_order) + != 0) + return 0; + + /* + * Check that we are not about to read the same subbuffer in + * which the writer head is. + */ + if (subbuf_trunc(write_offset, chan) - subbuf_trunc(consumed_old, chan) + == 0) + return 0; + + return 1; + +} + +static inline +int lib_ring_buffer_pending_data(const struct lib_ring_buffer_config *config, + struct lib_ring_buffer *buf, + struct channel *chan) +{ + return !!subbuf_offset(v_read(config, &buf->offset), chan); +} + +static inline +unsigned long lib_ring_buffer_get_data_size(const struct lib_ring_buffer_config *config, + struct lib_ring_buffer *buf, + unsigned long idx) +{ + return subbuffer_get_data_size(config, &buf->backend, idx); +} + +/* + * Check if all space reservation in a buffer have been committed. This helps + * knowing if an execution context is nested (for per-cpu buffers only). + * This is a very specific ftrace use-case, so we keep this as "internal" API. + */ +static inline +int lib_ring_buffer_reserve_committed(const struct lib_ring_buffer_config *config, + struct lib_ring_buffer *buf, + struct channel *chan) +{ + unsigned long offset, idx, commit_count; + + CHAN_WARN_ON(chan, config->alloc != RING_BUFFER_ALLOC_PER_CPU); + CHAN_WARN_ON(chan, config->sync != RING_BUFFER_SYNC_PER_CPU); + + /* + * Read offset and commit count in a loop so they are both read + * atomically wrt interrupts. By deal with interrupt concurrency by + * restarting both reads if the offset has been pushed. Note that given + * we only have to deal with interrupt concurrency here, an interrupt + * modifying the commit count will also modify "offset", so it is safe + * to only check for offset modifications. + */ + do { + offset = v_read(config, &buf->offset); + idx = subbuf_index(offset, chan); + commit_count = v_read(config, &buf->commit_hot[idx].cc); + } while (offset != v_read(config, &buf->offset)); + + return ((buf_trunc(offset, chan) >> chan->backend.num_subbuf_order) + - (commit_count & chan->commit_count_mask) == 0); +} + +static inline +void lib_ring_buffer_check_deliver(const struct lib_ring_buffer_config *config, + struct lib_ring_buffer *buf, + struct channel *chan, + unsigned long offset, + unsigned long commit_count, + unsigned long idx) +{ + unsigned long old_commit_count = commit_count + - chan->backend.subbuf_size; + u64 tsc; + + /* Check if all commits have been done */ + if (unlikely((buf_trunc(offset, chan) >> chan->backend.num_subbuf_order) + - (old_commit_count & chan->commit_count_mask) == 0)) { + /* + * If we succeeded at updating cc_sb below, we are the subbuffer + * writer delivering the subbuffer. Deals with concurrent + * updates of the "cc" value without adding a add_return atomic + * operation to the fast path. + * + * We are doing the delivery in two steps: + * - First, we cmpxchg() cc_sb to the new value + * old_commit_count + 1. This ensures that we are the only + * subbuffer user successfully filling the subbuffer, but we + * do _not_ set the cc_sb value to "commit_count" yet. + * Therefore, other writers that would wrap around the ring + * buffer and try to start writing to our subbuffer would + * have to drop records, because it would appear as + * non-filled. + * We therefore have exclusive access to the subbuffer control + * structures. This mutual exclusion with other writers is + * crucially important to perform record overruns count in + * flight recorder mode locklessly. + * - When we are ready to release the subbuffer (either for + * reading or for overrun by other writers), we simply set the + * cc_sb value to "commit_count" and perform delivery. + * + * The subbuffer size is least 2 bytes (minimum size: 1 page). + * This guarantees that old_commit_count + 1 != commit_count. + */ + if (likely(v_cmpxchg(config, &buf->commit_cold[idx].cc_sb, + old_commit_count, old_commit_count + 1) + == old_commit_count)) { + /* + * Start of exclusive subbuffer access. We are + * guaranteed to be the last writer in this subbuffer + * and any other writer trying to access this subbuffer + * in this state is required to drop records. + */ + tsc = config->cb.ring_buffer_clock_read(chan); + v_add(config, + subbuffer_get_records_count(config, + &buf->backend, idx), + &buf->records_count); + v_add(config, + subbuffer_count_records_overrun(config, + &buf->backend, + idx), + &buf->records_overrun); + config->cb.buffer_end(buf, tsc, idx, + lib_ring_buffer_get_data_size(config, + buf, + idx)); + + /* + * Set noref flag and offset for this subbuffer id. + * Contains a memory barrier that ensures counter stores + * are ordered before set noref and offset. + */ + lib_ring_buffer_set_noref_offset(config, &buf->backend, idx, + buf_trunc_val(offset, chan)); + + /* + * Order set_noref and record counter updates before the + * end of subbuffer exclusive access. Orders with + * respect to writers coming into the subbuffer after + * wrap around, and also order wrt concurrent readers. + */ + smp_mb(); + /* End of exclusive subbuffer access */ + v_set(config, &buf->commit_cold[idx].cc_sb, + commit_count); + lib_ring_buffer_vmcore_check_deliver(config, buf, + commit_count, idx); + + /* + * RING_BUFFER_WAKEUP_BY_WRITER wakeup is not lock-free. + */ + if (config->wakeup == RING_BUFFER_WAKEUP_BY_WRITER + && atomic_long_read(&buf->active_readers) + && lib_ring_buffer_poll_deliver(config, buf, chan)) { + wake_up_interruptible(&buf->read_wait); + wake_up_interruptible(&chan->read_wait); + } + + } + } +} + +/* + * lib_ring_buffer_write_commit_counter + * + * For flight recording. must be called after commit. + * This function increments the subbuffer's commit_seq counter each time the + * commit count reaches back the reserve offset (modulo subbuffer size). It is + * useful for crash dump. + */ +static inline +void lib_ring_buffer_write_commit_counter(const struct lib_ring_buffer_config *config, + struct lib_ring_buffer *buf, + struct channel *chan, + unsigned long idx, + unsigned long buf_offset, + unsigned long commit_count, + size_t slot_size) +{ + unsigned long offset, commit_seq_old; + + if (config->oops != RING_BUFFER_OOPS_CONSISTENCY) + return; + + offset = buf_offset + slot_size; + + /* + * subbuf_offset includes commit_count_mask. We can simply + * compare the offsets within the subbuffer without caring about + * buffer full/empty mismatch because offset is never zero here + * (subbuffer header and record headers have non-zero length). + */ + if (unlikely(subbuf_offset(offset - commit_count, chan))) + return; + + commit_seq_old = v_read(config, &buf->commit_hot[idx].seq); + while ((long) (commit_seq_old - commit_count) < 0) + commit_seq_old = v_cmpxchg(config, &buf->commit_hot[idx].seq, + commit_seq_old, commit_count); +} + +extern int lib_ring_buffer_create(struct lib_ring_buffer *buf, + struct channel_backend *chanb, int cpu); +extern void lib_ring_buffer_free(struct lib_ring_buffer *buf); + +/* Keep track of trap nesting inside ring buffer code */ +DECLARE_PER_CPU(unsigned int, lib_ring_buffer_nesting); + +#endif /* _LINUX_RING_BUFFER_FRONTEND_INTERNAL_H */ diff --git a/libringbuffer/frontend_types.h b/libringbuffer/frontend_types.h new file mode 100644 index 0000000..fe208b6 --- /dev/null +++ b/libringbuffer/frontend_types.h @@ -0,0 +1,174 @@ +#ifndef _LINUX_RING_BUFFER_FRONTEND_TYPES_H +#define _LINUX_RING_BUFFER_FRONTEND_TYPES_H + +/* + * linux/ringbuffer/frontend_types.h + * + * (C) Copyright 2005-2010 - Mathieu Desnoyers + * + * Ring Buffer Library Synchronization Header (types). + * + * Author: + * Mathieu Desnoyers + * + * See ring_buffer_frontend.c for more information on wait-free algorithms. + * + * Dual LGPL v2.1/GPL v2 license. + */ + +#include +#include "../../wrapper/ringbuffer/config.h" +#include "../../wrapper/ringbuffer/backend_types.h" +#include "../../lib/prio_heap/lttng_prio_heap.h" /* For per-CPU read-side iterator */ + +/* + * A switch is done during tracing or as a final flush after tracing (so it + * won't write in the new sub-buffer). + */ +enum switch_mode { SWITCH_ACTIVE, SWITCH_FLUSH }; + +/* channel-level read-side iterator */ +struct channel_iter { + /* Prio heap of buffers. Lowest timestamps at the top. */ + struct lttng_ptr_heap heap; /* Heap of struct lib_ring_buffer ptrs */ + struct list_head empty_head; /* Empty buffers linked-list head */ + int read_open; /* Opened for reading ? */ + u64 last_qs; /* Last quiescent state timestamp */ + u64 last_timestamp; /* Last timestamp (for WARN_ON) */ + int last_cpu; /* Last timestamp cpu */ + /* + * read() file operation state. + */ + unsigned long len_left; +}; + +/* channel: collection of per-cpu ring buffers. */ +struct channel { + atomic_t record_disabled; + unsigned long commit_count_mask; /* + * Commit count mask, removing + * the MSBs corresponding to + * bits used to represent the + * subbuffer index. + */ + + struct channel_backend backend; /* Associated backend */ + + unsigned long switch_timer_interval; /* Buffer flush (jiffies) */ + unsigned long read_timer_interval; /* Reader wakeup (jiffies) */ + struct notifier_block cpu_hp_notifier; /* CPU hotplug notifier */ + struct notifier_block tick_nohz_notifier; /* CPU nohz notifier */ + struct notifier_block hp_iter_notifier; /* hotplug iterator notifier */ + int cpu_hp_enable:1; /* Enable CPU hotplug notif. */ + int hp_iter_enable:1; /* Enable hp iter notif. */ + wait_queue_head_t read_wait; /* reader wait queue */ + wait_queue_head_t hp_wait; /* CPU hotplug wait queue */ + int finalized; /* Has channel been finalized */ + struct channel_iter iter; /* Channel read-side iterator */ + struct kref ref; /* Reference count */ +}; + +/* Per-subbuffer commit counters used on the hot path */ +struct commit_counters_hot { + union v_atomic cc; /* Commit counter */ + union v_atomic seq; /* Consecutive commits */ +}; + +/* Per-subbuffer commit counters used only on cold paths */ +struct commit_counters_cold { + union v_atomic cc_sb; /* Incremented _once_ at sb switch */ +}; + +/* Per-buffer read iterator */ +struct lib_ring_buffer_iter { + u64 timestamp; /* Current record timestamp */ + size_t header_len; /* Current record header length */ + size_t payload_len; /* Current record payload length */ + + struct list_head empty_node; /* Linked list of empty buffers */ + unsigned long consumed, read_offset, data_size; + enum { + ITER_GET_SUBBUF = 0, + ITER_TEST_RECORD, + ITER_NEXT_RECORD, + ITER_PUT_SUBBUF, + } state; + int allocated:1; + int read_open:1; /* Opened for reading ? */ +}; + +/* ring buffer state */ +struct lib_ring_buffer { + /* First 32 bytes cache-hot cacheline */ + union v_atomic offset; /* Current offset in the buffer */ + struct commit_counters_hot *commit_hot; + /* Commit count per sub-buffer */ + atomic_long_t consumed; /* + * Current offset in the buffer + * standard atomic access (shared) + */ + atomic_t record_disabled; + /* End of first 32 bytes cacheline */ + union v_atomic last_tsc; /* + * Last timestamp written in the buffer. + */ + + struct lib_ring_buffer_backend backend; /* Associated backend */ + + struct commit_counters_cold *commit_cold; + /* Commit count per sub-buffer */ + atomic_long_t active_readers; /* + * Active readers count + * standard atomic access (shared) + */ + /* Dropped records */ + union v_atomic records_lost_full; /* Buffer full */ + union v_atomic records_lost_wrap; /* Nested wrap-around */ + union v_atomic records_lost_big; /* Events too big */ + union v_atomic records_count; /* Number of records written */ + union v_atomic records_overrun; /* Number of overwritten records */ + wait_queue_head_t read_wait; /* reader buffer-level wait queue */ + int finalized; /* buffer has been finalized */ + struct timer_list switch_timer; /* timer for periodical switch */ + struct timer_list read_timer; /* timer for read poll */ + raw_spinlock_t raw_tick_nohz_spinlock; /* nohz entry lock/trylock */ + struct lib_ring_buffer_iter iter; /* read-side iterator */ + unsigned long get_subbuf_consumed; /* Read-side consumed */ + unsigned long prod_snapshot; /* Producer count snapshot */ + unsigned long cons_snapshot; /* Consumer count snapshot */ + int get_subbuf:1; /* Sub-buffer being held by reader */ + int switch_timer_enabled:1; /* Protected by ring_buffer_nohz_lock */ + int read_timer_enabled:1; /* Protected by ring_buffer_nohz_lock */ +}; + +static inline +void *channel_get_private(struct channel *chan) +{ + return chan->backend.priv; +} + +/* + * Issue warnings and disable channels upon internal error. + * Can receive struct lib_ring_buffer or struct lib_ring_buffer_backend + * parameters. + */ +#define CHAN_WARN_ON(c, cond) \ + ({ \ + struct channel *__chan; \ + int _____ret = unlikely(cond); \ + if (_____ret) { \ + if (__same_type(*(c), struct channel_backend)) \ + __chan = container_of((void *) (c), \ + struct channel, \ + backend); \ + else if (__same_type(*(c), struct channel)) \ + __chan = (void *) (c); \ + else \ + BUG_ON(1); \ + atomic_inc(&__chan->record_disabled); \ + WARN_ON(1); \ + } \ + _____ret; \ + }) + +#endif /* _LINUX_RING_BUFFER_FRONTEND_TYPES_H */ diff --git a/libringbuffer/iterator.h b/libringbuffer/iterator.h new file mode 100644 index 0000000..f2bd50d --- /dev/null +++ b/libringbuffer/iterator.h @@ -0,0 +1,70 @@ +#ifndef _LINUX_RING_BUFFER_ITERATOR_H +#define _LINUX_RING_BUFFER_ITERATOR_H + +/* + * linux/ringbuffer/iterator.h + * + * (C) Copyright 2010 - Mathieu Desnoyers + * + * Ring buffer and channel iterators. + * + * Author: + * Mathieu Desnoyers + * + * Dual LGPL v2.1/GPL v2 license. + */ + +#include "../../wrapper/ringbuffer/backend.h" +#include "../../wrapper/ringbuffer/frontend.h" + +/* + * lib_ring_buffer_get_next_record advances the buffer read position to the next + * record. It returns either the size of the next record, -EAGAIN if there is + * currently no data available, or -ENODATA if no data is available and buffer + * is finalized. + */ +extern ssize_t lib_ring_buffer_get_next_record(struct channel *chan, + struct lib_ring_buffer *buf); + +/* + * channel_get_next_record advances the buffer read position to the next record. + * It returns either the size of the next record, -EAGAIN if there is currently + * no data available, or -ENODATA if no data is available and buffer is + * finalized. + * Returns the current buffer in ret_buf. + */ +extern ssize_t channel_get_next_record(struct channel *chan, + struct lib_ring_buffer **ret_buf); + +/** + * read_current_record - copy the buffer current record into dest. + * @buf: ring buffer + * @dest: destination where the record should be copied + * + * dest should be large enough to contain the record. Returns the number of + * bytes copied. + */ +static inline size_t read_current_record(struct lib_ring_buffer *buf, void *dest) +{ + return lib_ring_buffer_read(&buf->backend, buf->iter.read_offset, + dest, buf->iter.payload_len); +} + +extern int lib_ring_buffer_iterator_open(struct lib_ring_buffer *buf); +extern void lib_ring_buffer_iterator_release(struct lib_ring_buffer *buf); +extern int channel_iterator_open(struct channel *chan); +extern void channel_iterator_release(struct channel *chan); + +extern const struct file_operations channel_payload_file_operations; +extern const struct file_operations lib_ring_buffer_payload_file_operations; + +/* + * Used internally. + */ +int channel_iterator_init(struct channel *chan); +void channel_iterator_unregister_notifiers(struct channel *chan); +void channel_iterator_free(struct channel *chan); +void channel_iterator_reset(struct channel *chan); +void lib_ring_buffer_iterator_reset(struct lib_ring_buffer *buf); + +#endif /* _LINUX_RING_BUFFER_ITERATOR_H */ diff --git a/libringbuffer/nohz.h b/libringbuffer/nohz.h new file mode 100644 index 0000000..3c31072 --- /dev/null +++ b/libringbuffer/nohz.h @@ -0,0 +1,30 @@ +#ifndef _LINUX_RING_BUFFER_NOHZ_H +#define _LINUX_RING_BUFFER_NOHZ_H + +/* + * ringbuffer/nohz.h + * + * Copyright (C) 2011 Mathieu Desnoyers + * + * Dual LGPL v2.1/GPL v2 license. + */ + +#ifdef CONFIG_LIB_RING_BUFFER +void lib_ring_buffer_tick_nohz_flush(void); +void lib_ring_buffer_tick_nohz_stop(void); +void lib_ring_buffer_tick_nohz_restart(void); +#else +static inline void lib_ring_buffer_tick_nohz_flush(void) +{ +} + +static inline void lib_ring_buffer_tick_nohz_stop(void) +{ +} + +static inline void lib_ring_buffer_tick_nohz_restart(void) +{ +} +#endif + +#endif /* _LINUX_RING_BUFFER_NOHZ_H */ diff --git a/libringbuffer/ring_buffer_backend.c b/libringbuffer/ring_buffer_backend.c new file mode 100644 index 0000000..332a1d2 --- /dev/null +++ b/libringbuffer/ring_buffer_backend.c @@ -0,0 +1,754 @@ +/* + * ring_buffer_backend.c + * + * Copyright (C) 2005-2010 - Mathieu Desnoyers + * + * Dual LGPL v2.1/GPL v2 license. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../../wrapper/vmalloc.h" /* for wrapper_vmalloc_sync_all() */ +#include "../../wrapper/ringbuffer/config.h" +#include "../../wrapper/ringbuffer/backend.h" +#include "../../wrapper/ringbuffer/frontend.h" + +/** + * lib_ring_buffer_backend_allocate - allocate a channel buffer + * @config: ring buffer instance configuration + * @buf: the buffer struct + * @size: total size of the buffer + * @num_subbuf: number of subbuffers + * @extra_reader_sb: need extra subbuffer for reader + */ +static +int lib_ring_buffer_backend_allocate(const struct lib_ring_buffer_config *config, + struct lib_ring_buffer_backend *bufb, + size_t size, size_t num_subbuf, + int extra_reader_sb) +{ + struct channel_backend *chanb = &bufb->chan->backend; + unsigned long j, num_pages, num_pages_per_subbuf, page_idx = 0; + unsigned long subbuf_size, mmap_offset = 0; + unsigned long num_subbuf_alloc; + struct page **pages; + void **virt; + unsigned long i; + + num_pages = size >> PAGE_SHIFT; + num_pages_per_subbuf = num_pages >> get_count_order(num_subbuf); + subbuf_size = chanb->subbuf_size; + num_subbuf_alloc = num_subbuf; + + if (extra_reader_sb) { + num_pages += num_pages_per_subbuf; /* Add pages for reader */ + num_subbuf_alloc++; + } + + pages = kmalloc_node(ALIGN(sizeof(*pages) * num_pages, + 1 << INTERNODE_CACHE_SHIFT), + GFP_KERNEL, cpu_to_node(max(bufb->cpu, 0))); + if (unlikely(!pages)) + goto pages_error; + + virt = kmalloc_node(ALIGN(sizeof(*virt) * num_pages, + 1 << INTERNODE_CACHE_SHIFT), + GFP_KERNEL, cpu_to_node(max(bufb->cpu, 0))); + if (unlikely(!virt)) + goto virt_error; + + bufb->array = kmalloc_node(ALIGN(sizeof(*bufb->array) + * num_subbuf_alloc, + 1 << INTERNODE_CACHE_SHIFT), + GFP_KERNEL, cpu_to_node(max(bufb->cpu, 0))); + if (unlikely(!bufb->array)) + goto array_error; + + for (i = 0; i < num_pages; i++) { + pages[i] = alloc_pages_node(cpu_to_node(max(bufb->cpu, 0)), + GFP_KERNEL | __GFP_ZERO, 0); + if (unlikely(!pages[i])) + goto depopulate; + virt[i] = page_address(pages[i]); + } + bufb->num_pages_per_subbuf = num_pages_per_subbuf; + + /* Allocate backend pages array elements */ + for (i = 0; i < num_subbuf_alloc; i++) { + bufb->array[i] = + kzalloc_node(ALIGN( + sizeof(struct lib_ring_buffer_backend_pages) + + sizeof(struct lib_ring_buffer_backend_page) + * num_pages_per_subbuf, + 1 << INTERNODE_CACHE_SHIFT), + GFP_KERNEL, cpu_to_node(max(bufb->cpu, 0))); + if (!bufb->array[i]) + goto free_array; + } + + /* Allocate write-side subbuffer table */ + bufb->buf_wsb = kzalloc_node(ALIGN( + sizeof(struct lib_ring_buffer_backend_subbuffer) + * num_subbuf, + 1 << INTERNODE_CACHE_SHIFT), + GFP_KERNEL, cpu_to_node(max(bufb->cpu, 0))); + if (unlikely(!bufb->buf_wsb)) + goto free_array; + + for (i = 0; i < num_subbuf; i++) + bufb->buf_wsb[i].id = subbuffer_id(config, 0, 1, i); + + /* Assign read-side subbuffer table */ + if (extra_reader_sb) + bufb->buf_rsb.id = subbuffer_id(config, 0, 1, + num_subbuf_alloc - 1); + else + bufb->buf_rsb.id = subbuffer_id(config, 0, 1, 0); + + /* Assign pages to page index */ + for (i = 0; i < num_subbuf_alloc; i++) { + for (j = 0; j < num_pages_per_subbuf; j++) { + CHAN_WARN_ON(chanb, page_idx > num_pages); + bufb->array[i]->p[j].virt = virt[page_idx]; + bufb->array[i]->p[j].page = pages[page_idx]; + page_idx++; + } + if (config->output == RING_BUFFER_MMAP) { + bufb->array[i]->mmap_offset = mmap_offset; + mmap_offset += subbuf_size; + } + } + + /* + * If kmalloc ever uses vmalloc underneath, make sure the buffer pages + * will not fault. + */ + wrapper_vmalloc_sync_all(); + kfree(virt); + kfree(pages); + return 0; + +free_array: + for (i = 0; (i < num_subbuf_alloc && bufb->array[i]); i++) + kfree(bufb->array[i]); +depopulate: + /* Free all allocated pages */ + for (i = 0; (i < num_pages && pages[i]); i++) + __free_page(pages[i]); + kfree(bufb->array); +array_error: + kfree(virt); +virt_error: + kfree(pages); +pages_error: + return -ENOMEM; +} + +int lib_ring_buffer_backend_create(struct lib_ring_buffer_backend *bufb, + struct channel_backend *chanb, int cpu) +{ + const struct lib_ring_buffer_config *config = chanb->config; + + bufb->chan = container_of(chanb, struct channel, backend); + bufb->cpu = cpu; + + return lib_ring_buffer_backend_allocate(config, bufb, chanb->buf_size, + chanb->num_subbuf, + chanb->extra_reader_sb); +} + +void lib_ring_buffer_backend_free(struct lib_ring_buffer_backend *bufb) +{ + struct channel_backend *chanb = &bufb->chan->backend; + unsigned long i, j, num_subbuf_alloc; + + num_subbuf_alloc = chanb->num_subbuf; + if (chanb->extra_reader_sb) + num_subbuf_alloc++; + + kfree(bufb->buf_wsb); + for (i = 0; i < num_subbuf_alloc; i++) { + for (j = 0; j < bufb->num_pages_per_subbuf; j++) + __free_page(bufb->array[i]->p[j].page); + kfree(bufb->array[i]); + } + kfree(bufb->array); + bufb->allocated = 0; +} + +void lib_ring_buffer_backend_reset(struct lib_ring_buffer_backend *bufb) +{ + struct channel_backend *chanb = &bufb->chan->backend; + const struct lib_ring_buffer_config *config = chanb->config; + unsigned long num_subbuf_alloc; + unsigned int i; + + num_subbuf_alloc = chanb->num_subbuf; + if (chanb->extra_reader_sb) + num_subbuf_alloc++; + + for (i = 0; i < chanb->num_subbuf; i++) + bufb->buf_wsb[i].id = subbuffer_id(config, 0, 1, i); + if (chanb->extra_reader_sb) + bufb->buf_rsb.id = subbuffer_id(config, 0, 1, + num_subbuf_alloc - 1); + else + bufb->buf_rsb.id = subbuffer_id(config, 0, 1, 0); + + for (i = 0; i < num_subbuf_alloc; i++) { + /* Don't reset mmap_offset */ + v_set(config, &bufb->array[i]->records_commit, 0); + v_set(config, &bufb->array[i]->records_unread, 0); + bufb->array[i]->data_size = 0; + /* Don't reset backend page and virt addresses */ + } + /* Don't reset num_pages_per_subbuf, cpu, allocated */ + v_set(config, &bufb->records_read, 0); +} + +/* + * The frontend is responsible for also calling ring_buffer_backend_reset for + * each buffer when calling channel_backend_reset. + */ +void channel_backend_reset(struct channel_backend *chanb) +{ + struct channel *chan = container_of(chanb, struct channel, backend); + const struct lib_ring_buffer_config *config = chanb->config; + + /* + * Don't reset buf_size, subbuf_size, subbuf_size_order, + * num_subbuf_order, buf_size_order, extra_reader_sb, num_subbuf, + * priv, notifiers, config, cpumask and name. + */ + chanb->start_tsc = config->cb.ring_buffer_clock_read(chan); +} + +#ifdef CONFIG_HOTPLUG_CPU +/** + * lib_ring_buffer_cpu_hp_callback - CPU hotplug callback + * @nb: notifier block + * @action: hotplug action to take + * @hcpu: CPU number + * + * Returns the success/failure of the operation. (%NOTIFY_OK, %NOTIFY_BAD) + */ +static +int __cpuinit lib_ring_buffer_cpu_hp_callback(struct notifier_block *nb, + unsigned long action, + void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + struct channel_backend *chanb = container_of(nb, struct channel_backend, + cpu_hp_notifier); + const struct lib_ring_buffer_config *config = chanb->config; + struct lib_ring_buffer *buf; + int ret; + + CHAN_WARN_ON(chanb, config->alloc == RING_BUFFER_ALLOC_GLOBAL); + + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + buf = per_cpu_ptr(chanb->buf, cpu); + ret = lib_ring_buffer_create(buf, chanb, cpu); + if (ret) { + printk(KERN_ERR + "ring_buffer_cpu_hp_callback: cpu %d " + "buffer creation failed\n", cpu); + return NOTIFY_BAD; + } + break; + case CPU_DEAD: + case CPU_DEAD_FROZEN: + /* No need to do a buffer switch here, because it will happen + * when tracing is stopped, or will be done by switch timer CPU + * DEAD callback. */ + break; + } + return NOTIFY_OK; +} +#endif + +/** + * channel_backend_init - initialize a channel backend + * @chanb: channel backend + * @name: channel name + * @config: client ring buffer configuration + * @priv: client private data + * @parent: dentry of parent directory, %NULL for root directory + * @subbuf_size: size of sub-buffers (> PAGE_SIZE, power of 2) + * @num_subbuf: number of sub-buffers (power of 2) + * + * Returns channel pointer if successful, %NULL otherwise. + * + * Creates per-cpu channel buffers using the sizes and attributes + * specified. The created channel buffer files will be named + * name_0...name_N-1. File permissions will be %S_IRUSR. + * + * Called with CPU hotplug disabled. + */ +int channel_backend_init(struct channel_backend *chanb, + const char *name, + const struct lib_ring_buffer_config *config, + void *priv, size_t subbuf_size, size_t num_subbuf) +{ + struct channel *chan = container_of(chanb, struct channel, backend); + unsigned int i; + int ret; + + if (!name) + return -EPERM; + + if (!(subbuf_size && num_subbuf)) + return -EPERM; + + /* Check that the subbuffer size is larger than a page. */ + if (subbuf_size < PAGE_SIZE) + return -EINVAL; + + /* + * Make sure the number of subbuffers and subbuffer size are power of 2. + */ + CHAN_WARN_ON(chanb, hweight32(subbuf_size) != 1); + CHAN_WARN_ON(chanb, hweight32(num_subbuf) != 1); + + ret = subbuffer_id_check_index(config, num_subbuf); + if (ret) + return ret; + + chanb->priv = priv; + chanb->buf_size = num_subbuf * subbuf_size; + chanb->subbuf_size = subbuf_size; + chanb->buf_size_order = get_count_order(chanb->buf_size); + chanb->subbuf_size_order = get_count_order(subbuf_size); + chanb->num_subbuf_order = get_count_order(num_subbuf); + chanb->extra_reader_sb = + (config->mode == RING_BUFFER_OVERWRITE) ? 1 : 0; + chanb->num_subbuf = num_subbuf; + strlcpy(chanb->name, name, NAME_MAX); + chanb->config = config; + + if (config->alloc == RING_BUFFER_ALLOC_PER_CPU) { + if (!zalloc_cpumask_var(&chanb->cpumask, GFP_KERNEL)) + return -ENOMEM; + } + + if (config->alloc == RING_BUFFER_ALLOC_PER_CPU) { + /* Allocating the buffer per-cpu structures */ + chanb->buf = alloc_percpu(struct lib_ring_buffer); + if (!chanb->buf) + goto free_cpumask; + + /* + * In case of non-hotplug cpu, if the ring-buffer is allocated + * in early initcall, it will not be notified of secondary cpus. + * In that off case, we need to allocate for all possible cpus. + */ +#ifdef CONFIG_HOTPLUG_CPU + /* + * buf->backend.allocated test takes care of concurrent CPU + * hotplug. + * Priority higher than frontend, so we create the ring buffer + * before we start the timer. + */ + chanb->cpu_hp_notifier.notifier_call = + lib_ring_buffer_cpu_hp_callback; + chanb->cpu_hp_notifier.priority = 5; + register_hotcpu_notifier(&chanb->cpu_hp_notifier); + + get_online_cpus(); + for_each_online_cpu(i) { + ret = lib_ring_buffer_create(per_cpu_ptr(chanb->buf, i), + chanb, i); + if (ret) + goto free_bufs; /* cpu hotplug locked */ + } + put_online_cpus(); +#else + for_each_possible_cpu(i) { + ret = lib_ring_buffer_create(per_cpu_ptr(chanb->buf, i), + chanb, i); + if (ret) + goto free_bufs; /* cpu hotplug locked */ + } +#endif + } else { + chanb->buf = kzalloc(sizeof(struct lib_ring_buffer), GFP_KERNEL); + if (!chanb->buf) + goto free_cpumask; + ret = lib_ring_buffer_create(chanb->buf, chanb, -1); + if (ret) + goto free_bufs; + } + chanb->start_tsc = config->cb.ring_buffer_clock_read(chan); + + return 0; + +free_bufs: + if (config->alloc == RING_BUFFER_ALLOC_PER_CPU) { + for_each_possible_cpu(i) { + struct lib_ring_buffer *buf = per_cpu_ptr(chanb->buf, i); + + if (!buf->backend.allocated) + continue; + lib_ring_buffer_free(buf); + } +#ifdef CONFIG_HOTPLUG_CPU + put_online_cpus(); +#endif + free_percpu(chanb->buf); + } else + kfree(chanb->buf); +free_cpumask: + if (config->alloc == RING_BUFFER_ALLOC_PER_CPU) + free_cpumask_var(chanb->cpumask); + return -ENOMEM; +} + +/** + * channel_backend_unregister_notifiers - unregister notifiers + * @chan: the channel + * + * Holds CPU hotplug. + */ +void channel_backend_unregister_notifiers(struct channel_backend *chanb) +{ + const struct lib_ring_buffer_config *config = chanb->config; + + if (config->alloc == RING_BUFFER_ALLOC_PER_CPU) + unregister_hotcpu_notifier(&chanb->cpu_hp_notifier); +} + +/** + * channel_backend_free - destroy the channel + * @chan: the channel + * + * Destroy all channel buffers and frees the channel. + */ +void channel_backend_free(struct channel_backend *chanb) +{ + const struct lib_ring_buffer_config *config = chanb->config; + unsigned int i; + + if (config->alloc == RING_BUFFER_ALLOC_PER_CPU) { + for_each_possible_cpu(i) { + struct lib_ring_buffer *buf = per_cpu_ptr(chanb->buf, i); + + if (!buf->backend.allocated) + continue; + lib_ring_buffer_free(buf); + } + free_cpumask_var(chanb->cpumask); + free_percpu(chanb->buf); + } else { + struct lib_ring_buffer *buf = chanb->buf; + + CHAN_WARN_ON(chanb, !buf->backend.allocated); + lib_ring_buffer_free(buf); + kfree(buf); + } +} + +/** + * lib_ring_buffer_write - write data to a ring_buffer buffer. + * @bufb : buffer backend + * @offset : offset within the buffer + * @src : source address + * @len : length to write + * @pagecpy : page size copied so far + */ +void _lib_ring_buffer_write(struct lib_ring_buffer_backend *bufb, size_t offset, + const void *src, size_t len, ssize_t pagecpy) +{ + struct channel_backend *chanb = &bufb->chan->backend; + const struct lib_ring_buffer_config *config = chanb->config; + size_t sbidx, index; + struct lib_ring_buffer_backend_pages *rpages; + unsigned long sb_bindex, id; + + do { + len -= pagecpy; + src += pagecpy; + offset += pagecpy; + sbidx = offset >> chanb->subbuf_size_order; + index = (offset & (chanb->subbuf_size - 1)) >> PAGE_SHIFT; + + /* + * Underlying layer should never ask for writes across + * subbuffers. + */ + CHAN_WARN_ON(chanb, offset >= chanb->buf_size); + + pagecpy = min_t(size_t, len, PAGE_SIZE - (offset & ~PAGE_MASK)); + id = bufb->buf_wsb[sbidx].id; + sb_bindex = subbuffer_id_get_index(config, id); + rpages = bufb->array[sb_bindex]; + CHAN_WARN_ON(chanb, config->mode == RING_BUFFER_OVERWRITE + && subbuffer_id_is_noref(config, id)); + lib_ring_buffer_do_copy(config, + rpages->p[index].virt + + (offset & ~PAGE_MASK), + src, pagecpy); + } while (unlikely(len != pagecpy)); +} +EXPORT_SYMBOL_GPL(_lib_ring_buffer_write); + +/** + * lib_ring_buffer_read - read data from ring_buffer_buffer. + * @bufb : buffer backend + * @offset : offset within the buffer + * @dest : destination address + * @len : length to copy to destination + * + * Should be protected by get_subbuf/put_subbuf. + * Returns the length copied. + */ +size_t lib_ring_buffer_read(struct lib_ring_buffer_backend *bufb, size_t offset, + void *dest, size_t len) +{ + struct channel_backend *chanb = &bufb->chan->backend; + const struct lib_ring_buffer_config *config = chanb->config; + size_t index; + ssize_t pagecpy, orig_len; + struct lib_ring_buffer_backend_pages *rpages; + unsigned long sb_bindex, id; + + orig_len = len; + offset &= chanb->buf_size - 1; + index = (offset & (chanb->subbuf_size - 1)) >> PAGE_SHIFT; + if (unlikely(!len)) + return 0; + for (;;) { + pagecpy = min_t(size_t, len, PAGE_SIZE - (offset & ~PAGE_MASK)); + id = bufb->buf_rsb.id; + sb_bindex = subbuffer_id_get_index(config, id); + rpages = bufb->array[sb_bindex]; + CHAN_WARN_ON(chanb, config->mode == RING_BUFFER_OVERWRITE + && subbuffer_id_is_noref(config, id)); + memcpy(dest, rpages->p[index].virt + (offset & ~PAGE_MASK), + pagecpy); + len -= pagecpy; + if (likely(!len)) + break; + dest += pagecpy; + offset += pagecpy; + index = (offset & (chanb->subbuf_size - 1)) >> PAGE_SHIFT; + /* + * Underlying layer should never ask for reads across + * subbuffers. + */ + CHAN_WARN_ON(chanb, offset >= chanb->buf_size); + } + return orig_len; +} +EXPORT_SYMBOL_GPL(lib_ring_buffer_read); + +/** + * __lib_ring_buffer_copy_to_user - read data from ring_buffer to userspace + * @bufb : buffer backend + * @offset : offset within the buffer + * @dest : destination userspace address + * @len : length to copy to destination + * + * Should be protected by get_subbuf/put_subbuf. + * access_ok() must have been performed on dest addresses prior to call this + * function. + * Returns -EFAULT on error, 0 if ok. + */ +int __lib_ring_buffer_copy_to_user(struct lib_ring_buffer_backend *bufb, + size_t offset, void __user *dest, size_t len) +{ + struct channel_backend *chanb = &bufb->chan->backend; + const struct lib_ring_buffer_config *config = chanb->config; + size_t index; + ssize_t pagecpy, orig_len; + struct lib_ring_buffer_backend_pages *rpages; + unsigned long sb_bindex, id; + + orig_len = len; + offset &= chanb->buf_size - 1; + index = (offset & (chanb->subbuf_size - 1)) >> PAGE_SHIFT; + if (unlikely(!len)) + return 0; + for (;;) { + pagecpy = min_t(size_t, len, PAGE_SIZE - (offset & ~PAGE_MASK)); + id = bufb->buf_rsb.id; + sb_bindex = subbuffer_id_get_index(config, id); + rpages = bufb->array[sb_bindex]; + CHAN_WARN_ON(chanb, config->mode == RING_BUFFER_OVERWRITE + && subbuffer_id_is_noref(config, id)); + if (__copy_to_user(dest, + rpages->p[index].virt + (offset & ~PAGE_MASK), + pagecpy)) + return -EFAULT; + len -= pagecpy; + if (likely(!len)) + break; + dest += pagecpy; + offset += pagecpy; + index = (offset & (chanb->subbuf_size - 1)) >> PAGE_SHIFT; + /* + * Underlying layer should never ask for reads across + * subbuffers. + */ + CHAN_WARN_ON(chanb, offset >= chanb->buf_size); + } + return 0; +} +EXPORT_SYMBOL_GPL(__lib_ring_buffer_copy_to_user); + +/** + * lib_ring_buffer_read_cstr - read a C-style string from ring_buffer. + * @bufb : buffer backend + * @offset : offset within the buffer + * @dest : destination address + * @len : destination's length + * + * return string's length + * Should be protected by get_subbuf/put_subbuf. + */ +int lib_ring_buffer_read_cstr(struct lib_ring_buffer_backend *bufb, size_t offset, + void *dest, size_t len) +{ + struct channel_backend *chanb = &bufb->chan->backend; + const struct lib_ring_buffer_config *config = chanb->config; + size_t index; + ssize_t pagecpy, pagelen, strpagelen, orig_offset; + char *str; + struct lib_ring_buffer_backend_pages *rpages; + unsigned long sb_bindex, id; + + offset &= chanb->buf_size - 1; + index = (offset & (chanb->subbuf_size - 1)) >> PAGE_SHIFT; + orig_offset = offset; + for (;;) { + id = bufb->buf_rsb.id; + sb_bindex = subbuffer_id_get_index(config, id); + rpages = bufb->array[sb_bindex]; + CHAN_WARN_ON(chanb, config->mode == RING_BUFFER_OVERWRITE + && subbuffer_id_is_noref(config, id)); + str = (char *)rpages->p[index].virt + (offset & ~PAGE_MASK); + pagelen = PAGE_SIZE - (offset & ~PAGE_MASK); + strpagelen = strnlen(str, pagelen); + if (len) { + pagecpy = min_t(size_t, len, strpagelen); + if (dest) { + memcpy(dest, str, pagecpy); + dest += pagecpy; + } + len -= pagecpy; + } + offset += strpagelen; + index = (offset & (chanb->subbuf_size - 1)) >> PAGE_SHIFT; + if (strpagelen < pagelen) + break; + /* + * Underlying layer should never ask for reads across + * subbuffers. + */ + CHAN_WARN_ON(chanb, offset >= chanb->buf_size); + } + if (dest && len) + ((char *)dest)[0] = 0; + return offset - orig_offset; +} +EXPORT_SYMBOL_GPL(lib_ring_buffer_read_cstr); + +/** + * lib_ring_buffer_read_get_page - Get a whole page to read from + * @bufb : buffer backend + * @offset : offset within the buffer + * @virt : pointer to page address (output) + * + * Should be protected by get_subbuf/put_subbuf. + * Returns the pointer to the page struct pointer. + */ +struct page **lib_ring_buffer_read_get_page(struct lib_ring_buffer_backend *bufb, + size_t offset, void ***virt) +{ + size_t index; + struct lib_ring_buffer_backend_pages *rpages; + struct channel_backend *chanb = &bufb->chan->backend; + const struct lib_ring_buffer_config *config = chanb->config; + unsigned long sb_bindex, id; + + offset &= chanb->buf_size - 1; + index = (offset & (chanb->subbuf_size - 1)) >> PAGE_SHIFT; + id = bufb->buf_rsb.id; + sb_bindex = subbuffer_id_get_index(config, id); + rpages = bufb->array[sb_bindex]; + CHAN_WARN_ON(chanb, config->mode == RING_BUFFER_OVERWRITE + && subbuffer_id_is_noref(config, id)); + *virt = &rpages->p[index].virt; + return &rpages->p[index].page; +} +EXPORT_SYMBOL_GPL(lib_ring_buffer_read_get_page); + +/** + * lib_ring_buffer_read_offset_address - get address of a buffer location + * @bufb : buffer backend + * @offset : offset within the buffer. + * + * Return the address where a given offset is located (for read). + * Should be used to get the current subbuffer header pointer. Given we know + * it's never on a page boundary, it's safe to write directly to this address, + * as long as the write is never bigger than a page size. + */ +void *lib_ring_buffer_read_offset_address(struct lib_ring_buffer_backend *bufb, + size_t offset) +{ + size_t index; + struct lib_ring_buffer_backend_pages *rpages; + struct channel_backend *chanb = &bufb->chan->backend; + const struct lib_ring_buffer_config *config = chanb->config; + unsigned long sb_bindex, id; + + offset &= chanb->buf_size - 1; + index = (offset & (chanb->subbuf_size - 1)) >> PAGE_SHIFT; + id = bufb->buf_rsb.id; + sb_bindex = subbuffer_id_get_index(config, id); + rpages = bufb->array[sb_bindex]; + CHAN_WARN_ON(chanb, config->mode == RING_BUFFER_OVERWRITE + && subbuffer_id_is_noref(config, id)); + return rpages->p[index].virt + (offset & ~PAGE_MASK); +} +EXPORT_SYMBOL_GPL(lib_ring_buffer_read_offset_address); + +/** + * lib_ring_buffer_offset_address - get address of a location within the buffer + * @bufb : buffer backend + * @offset : offset within the buffer. + * + * Return the address where a given offset is located. + * Should be used to get the current subbuffer header pointer. Given we know + * it's always at the beginning of a page, it's safe to write directly to this + * address, as long as the write is never bigger than a page size. + */ +void *lib_ring_buffer_offset_address(struct lib_ring_buffer_backend *bufb, + size_t offset) +{ + size_t sbidx, index; + struct lib_ring_buffer_backend_pages *rpages; + struct channel_backend *chanb = &bufb->chan->backend; + const struct lib_ring_buffer_config *config = chanb->config; + unsigned long sb_bindex, id; + + offset &= chanb->buf_size - 1; + sbidx = offset >> chanb->subbuf_size_order; + index = (offset & (chanb->subbuf_size - 1)) >> PAGE_SHIFT; + id = bufb->buf_wsb[sbidx].id; + sb_bindex = subbuffer_id_get_index(config, id); + rpages = bufb->array[sb_bindex]; + CHAN_WARN_ON(chanb, config->mode == RING_BUFFER_OVERWRITE + && subbuffer_id_is_noref(config, id)); + return rpages->p[index].virt + (offset & ~PAGE_MASK); +} +EXPORT_SYMBOL_GPL(lib_ring_buffer_offset_address); diff --git a/libringbuffer/ring_buffer_frontend.c b/libringbuffer/ring_buffer_frontend.c new file mode 100644 index 0000000..ecc72ab --- /dev/null +++ b/libringbuffer/ring_buffer_frontend.c @@ -0,0 +1,1701 @@ +/* + * ring_buffer_frontend.c + * + * (C) Copyright 2005-2010 - Mathieu Desnoyers + * + * Ring buffer wait-free buffer synchronization. Producer-consumer and flight + * recorder (overwrite) modes. See thesis: + * + * Desnoyers, Mathieu (2009), "Low-Impact Operating System Tracing", Ph.D. + * dissertation, Ecole Polytechnique de Montreal. + * http://www.lttng.org/pub/thesis/desnoyers-dissertation-2009-12.pdf + * + * - Algorithm presentation in Chapter 5: + * "Lockless Multi-Core High-Throughput Buffering". + * - Algorithm formal verification in Section 8.6: + * "Formal verification of LTTng" + * + * Author: + * Mathieu Desnoyers + * + * Inspired from LTT and RelayFS: + * Karim Yaghmour + * Tom Zanussi + * Bob Wisniewski + * And from K42 : + * Bob Wisniewski + * + * Buffer reader semantic : + * + * - get_subbuf_size + * while buffer is not finalized and empty + * - get_subbuf + * - if return value != 0, continue + * - splice one subbuffer worth of data to a pipe + * - splice the data from pipe to disk/network + * - put_subbuf + * + * Dual LGPL v2.1/GPL v2 license. + */ + +#include +#include +#include + +#include "../../wrapper/ringbuffer/config.h" +#include "../../wrapper/ringbuffer/backend.h" +#include "../../wrapper/ringbuffer/frontend.h" +#include "../../wrapper/ringbuffer/iterator.h" +#include "../../wrapper/ringbuffer/nohz.h" + +/* + * Internal structure representing offsets to use at a sub-buffer switch. + */ +struct switch_offsets { + unsigned long begin, end, old; + size_t pre_header_padding, size; + unsigned int switch_new_start:1, switch_new_end:1, switch_old_start:1, + switch_old_end:1; +}; + +#ifdef CONFIG_NO_HZ +enum tick_nohz_val { + TICK_NOHZ_STOP, + TICK_NOHZ_FLUSH, + TICK_NOHZ_RESTART, +}; + +static ATOMIC_NOTIFIER_HEAD(tick_nohz_notifier); +#endif /* CONFIG_NO_HZ */ + +static DEFINE_PER_CPU(spinlock_t, ring_buffer_nohz_lock); + +DEFINE_PER_CPU(unsigned int, lib_ring_buffer_nesting); +EXPORT_PER_CPU_SYMBOL(lib_ring_buffer_nesting); + +static +void lib_ring_buffer_print_errors(struct channel *chan, + struct lib_ring_buffer *buf, int cpu); + +/* + * Must be called under cpu hotplug protection. + */ +void lib_ring_buffer_free(struct lib_ring_buffer *buf) +{ + struct channel *chan = buf->backend.chan; + + lib_ring_buffer_print_errors(chan, buf, buf->backend.cpu); + kfree(buf->commit_hot); + kfree(buf->commit_cold); + + lib_ring_buffer_backend_free(&buf->backend); +} + +/** + * lib_ring_buffer_reset - Reset ring buffer to initial values. + * @buf: Ring buffer. + * + * Effectively empty the ring buffer. Should be called when the buffer is not + * used for writing. The ring buffer can be opened for reading, but the reader + * should not be using the iterator concurrently with reset. The previous + * current iterator record is reset. + */ +void lib_ring_buffer_reset(struct lib_ring_buffer *buf) +{ + struct channel *chan = buf->backend.chan; + const struct lib_ring_buffer_config *config = chan->backend.config; + unsigned int i; + + /* + * Reset iterator first. It will put the subbuffer if it currently holds + * it. + */ + lib_ring_buffer_iterator_reset(buf); + v_set(config, &buf->offset, 0); + for (i = 0; i < chan->backend.num_subbuf; i++) { + v_set(config, &buf->commit_hot[i].cc, 0); + v_set(config, &buf->commit_hot[i].seq, 0); + v_set(config, &buf->commit_cold[i].cc_sb, 0); + } + atomic_long_set(&buf->consumed, 0); + atomic_set(&buf->record_disabled, 0); + v_set(config, &buf->last_tsc, 0); + lib_ring_buffer_backend_reset(&buf->backend); + /* Don't reset number of active readers */ + v_set(config, &buf->records_lost_full, 0); + v_set(config, &buf->records_lost_wrap, 0); + v_set(config, &buf->records_lost_big, 0); + v_set(config, &buf->records_count, 0); + v_set(config, &buf->records_overrun, 0); + buf->finalized = 0; +} +EXPORT_SYMBOL_GPL(lib_ring_buffer_reset); + +/** + * channel_reset - Reset channel to initial values. + * @chan: Channel. + * + * Effectively empty the channel. Should be called when the channel is not used + * for writing. The channel can be opened for reading, but the reader should not + * be using the iterator concurrently with reset. The previous current iterator + * record is reset. + */ +void channel_reset(struct channel *chan) +{ + /* + * Reset iterators first. Will put the subbuffer if held for reading. + */ + channel_iterator_reset(chan); + atomic_set(&chan->record_disabled, 0); + /* Don't reset commit_count_mask, still valid */ + channel_backend_reset(&chan->backend); + /* Don't reset switch/read timer interval */ + /* Don't reset notifiers and notifier enable bits */ + /* Don't reset reader reference count */ +} +EXPORT_SYMBOL_GPL(channel_reset); + +/* + * Must be called under cpu hotplug protection. + */ +int lib_ring_buffer_create(struct lib_ring_buffer *buf, + struct channel_backend *chanb, int cpu) +{ + const struct lib_ring_buffer_config *config = chanb->config; + struct channel *chan = container_of(chanb, struct channel, backend); + void *priv = chanb->priv; + unsigned int num_subbuf; + size_t subbuf_header_size; + u64 tsc; + int ret; + + /* Test for cpu hotplug */ + if (buf->backend.allocated) + return 0; + + /* + * Paranoia: per cpu dynamic allocation is not officially documented as + * zeroing the memory, so let's do it here too, just in case. + */ + memset(buf, 0, sizeof(*buf)); + + ret = lib_ring_buffer_backend_create(&buf->backend, &chan->backend, cpu); + if (ret) + return ret; + + buf->commit_hot = + kzalloc_node(ALIGN(sizeof(*buf->commit_hot) + * chan->backend.num_subbuf, + 1 << INTERNODE_CACHE_SHIFT), + GFP_KERNEL, cpu_to_node(max(cpu, 0))); + if (!buf->commit_hot) { + ret = -ENOMEM; + goto free_chanbuf; + } + + buf->commit_cold = + kzalloc_node(ALIGN(sizeof(*buf->commit_cold) + * chan->backend.num_subbuf, + 1 << INTERNODE_CACHE_SHIFT), + GFP_KERNEL, cpu_to_node(max(cpu, 0))); + if (!buf->commit_cold) { + ret = -ENOMEM; + goto free_commit; + } + + num_subbuf = chan->backend.num_subbuf; + init_waitqueue_head(&buf->read_wait); + raw_spin_lock_init(&buf->raw_tick_nohz_spinlock); + + /* + * Write the subbuffer header for first subbuffer so we know the total + * duration of data gathering. + */ + subbuf_header_size = config->cb.subbuffer_header_size(); + v_set(config, &buf->offset, subbuf_header_size); + subbuffer_id_clear_noref(config, &buf->backend.buf_wsb[0].id); + tsc = config->cb.ring_buffer_clock_read(buf->backend.chan); + config->cb.buffer_begin(buf, tsc, 0); + v_add(config, subbuf_header_size, &buf->commit_hot[0].cc); + + if (config->cb.buffer_create) { + ret = config->cb.buffer_create(buf, priv, cpu, chanb->name); + if (ret) + goto free_init; + } + + /* + * Ensure the buffer is ready before setting it to allocated and setting + * the cpumask. + * Used for cpu hotplug vs cpumask iteration. + */ + smp_wmb(); + buf->backend.allocated = 1; + + if (config->alloc == RING_BUFFER_ALLOC_PER_CPU) { + CHAN_WARN_ON(chan, cpumask_test_cpu(cpu, + chan->backend.cpumask)); + cpumask_set_cpu(cpu, chan->backend.cpumask); + } + + return 0; + + /* Error handling */ +free_init: + kfree(buf->commit_cold); +free_commit: + kfree(buf->commit_hot); +free_chanbuf: + lib_ring_buffer_backend_free(&buf->backend); + return ret; +} + +static void switch_buffer_timer(unsigned long data) +{ + struct lib_ring_buffer *buf = (struct lib_ring_buffer *)data; + struct channel *chan = buf->backend.chan; + const struct lib_ring_buffer_config *config = chan->backend.config; + + /* + * Only flush buffers periodically if readers are active. + */ + if (atomic_long_read(&buf->active_readers)) + lib_ring_buffer_switch_slow(buf, SWITCH_ACTIVE); + + if (config->alloc == RING_BUFFER_ALLOC_PER_CPU) + mod_timer_pinned(&buf->switch_timer, + jiffies + chan->switch_timer_interval); + else + mod_timer(&buf->switch_timer, + jiffies + chan->switch_timer_interval); +} + +/* + * Called with ring_buffer_nohz_lock held for per-cpu buffers. + */ +static void lib_ring_buffer_start_switch_timer(struct lib_ring_buffer *buf) +{ + struct channel *chan = buf->backend.chan; + const struct lib_ring_buffer_config *config = chan->backend.config; + + if (!chan->switch_timer_interval || buf->switch_timer_enabled) + return; + init_timer(&buf->switch_timer); + buf->switch_timer.function = switch_buffer_timer; + buf->switch_timer.expires = jiffies + chan->switch_timer_interval; + buf->switch_timer.data = (unsigned long)buf; + if (config->alloc == RING_BUFFER_ALLOC_PER_CPU) + add_timer_on(&buf->switch_timer, buf->backend.cpu); + else + add_timer(&buf->switch_timer); + buf->switch_timer_enabled = 1; +} + +/* + * Called with ring_buffer_nohz_lock held for per-cpu buffers. + */ +static void lib_ring_buffer_stop_switch_timer(struct lib_ring_buffer *buf) +{ + struct channel *chan = buf->backend.chan; + + if (!chan->switch_timer_interval || !buf->switch_timer_enabled) + return; + + del_timer_sync(&buf->switch_timer); + buf->switch_timer_enabled = 0; +} + +/* + * Polling timer to check the channels for data. + */ +static void read_buffer_timer(unsigned long data) +{ + struct lib_ring_buffer *buf = (struct lib_ring_buffer *)data; + struct channel *chan = buf->backend.chan; + const struct lib_ring_buffer_config *config = chan->backend.config; + + CHAN_WARN_ON(chan, !buf->backend.allocated); + + if (atomic_long_read(&buf->active_readers) + && lib_ring_buffer_poll_deliver(config, buf, chan)) { + wake_up_interruptible(&buf->read_wait); + wake_up_interruptible(&chan->read_wait); + } + + if (config->alloc == RING_BUFFER_ALLOC_PER_CPU) + mod_timer_pinned(&buf->read_timer, + jiffies + chan->read_timer_interval); + else + mod_timer(&buf->read_timer, + jiffies + chan->read_timer_interval); +} + +/* + * Called with ring_buffer_nohz_lock held for per-cpu buffers. + */ +static void lib_ring_buffer_start_read_timer(struct lib_ring_buffer *buf) +{ + struct channel *chan = buf->backend.chan; + const struct lib_ring_buffer_config *config = chan->backend.config; + + if (config->wakeup != RING_BUFFER_WAKEUP_BY_TIMER + || !chan->read_timer_interval + || buf->read_timer_enabled) + return; + + init_timer(&buf->read_timer); + buf->read_timer.function = read_buffer_timer; + buf->read_timer.expires = jiffies + chan->read_timer_interval; + buf->read_timer.data = (unsigned long)buf; + + if (config->alloc == RING_BUFFER_ALLOC_PER_CPU) + add_timer_on(&buf->read_timer, buf->backend.cpu); + else + add_timer(&buf->read_timer); + buf->read_timer_enabled = 1; +} + +/* + * Called with ring_buffer_nohz_lock held for per-cpu buffers. + */ +static void lib_ring_buffer_stop_read_timer(struct lib_ring_buffer *buf) +{ + struct channel *chan = buf->backend.chan; + const struct lib_ring_buffer_config *config = chan->backend.config; + + if (config->wakeup != RING_BUFFER_WAKEUP_BY_TIMER + || !chan->read_timer_interval + || !buf->read_timer_enabled) + return; + + del_timer_sync(&buf->read_timer); + /* + * do one more check to catch data that has been written in the last + * timer period. + */ + if (lib_ring_buffer_poll_deliver(config, buf, chan)) { + wake_up_interruptible(&buf->read_wait); + wake_up_interruptible(&chan->read_wait); + } + buf->read_timer_enabled = 0; +} + +#ifdef CONFIG_HOTPLUG_CPU +/** + * lib_ring_buffer_cpu_hp_callback - CPU hotplug callback + * @nb: notifier block + * @action: hotplug action to take + * @hcpu: CPU number + * + * Returns the success/failure of the operation. (%NOTIFY_OK, %NOTIFY_BAD) + */ +static +int __cpuinit lib_ring_buffer_cpu_hp_callback(struct notifier_block *nb, + unsigned long action, + void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + struct channel *chan = container_of(nb, struct channel, + cpu_hp_notifier); + struct lib_ring_buffer *buf = per_cpu_ptr(chan->backend.buf, cpu); + const struct lib_ring_buffer_config *config = chan->backend.config; + + if (!chan->cpu_hp_enable) + return NOTIFY_DONE; + + CHAN_WARN_ON(chan, config->alloc == RING_BUFFER_ALLOC_GLOBAL); + + switch (action) { + case CPU_DOWN_FAILED: + case CPU_DOWN_FAILED_FROZEN: + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + wake_up_interruptible(&chan->hp_wait); + lib_ring_buffer_start_switch_timer(buf); + lib_ring_buffer_start_read_timer(buf); + return NOTIFY_OK; + + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + lib_ring_buffer_stop_switch_timer(buf); + lib_ring_buffer_stop_read_timer(buf); + return NOTIFY_OK; + + case CPU_DEAD: + case CPU_DEAD_FROZEN: + /* + * Performing a buffer switch on a remote CPU. Performed by + * the CPU responsible for doing the hotunplug after the target + * CPU stopped running completely. Ensures that all data + * from that remote CPU is flushed. + */ + lib_ring_buffer_switch_slow(buf, SWITCH_ACTIVE); + return NOTIFY_OK; + + default: + return NOTIFY_DONE; + } +} +#endif + +#if defined(CONFIG_NO_HZ) && defined(CONFIG_LIB_RING_BUFFER) +/* + * For per-cpu buffers, call the reader wakeups before switching the buffer, so + * that wake-up-tracing generated events are flushed before going idle (in + * tick_nohz). We test if the spinlock is locked to deal with the race where + * readers try to sample the ring buffer before we perform the switch. We let + * the readers retry in that case. If there is data in the buffer, the wake up + * is going to forbid the CPU running the reader thread from going idle. + */ +static int notrace ring_buffer_tick_nohz_callback(struct notifier_block *nb, + unsigned long val, + void *data) +{ + struct channel *chan = container_of(nb, struct channel, + tick_nohz_notifier); + const struct lib_ring_buffer_config *config = chan->backend.config; + struct lib_ring_buffer *buf; + int cpu = smp_processor_id(); + + if (config->alloc != RING_BUFFER_ALLOC_PER_CPU) { + /* + * We don't support keeping the system idle with global buffers + * and streaming active. In order to do so, we would need to + * sample a non-nohz-cpumask racelessly with the nohz updates + * without adding synchronization overhead to nohz. Leave this + * use-case out for now. + */ + return 0; + } + + buf = channel_get_ring_buffer(config, chan, cpu); + switch (val) { + case TICK_NOHZ_FLUSH: + raw_spin_lock(&buf->raw_tick_nohz_spinlock); + if (config->wakeup == RING_BUFFER_WAKEUP_BY_TIMER + && chan->read_timer_interval + && atomic_long_read(&buf->active_readers) + && (lib_ring_buffer_poll_deliver(config, buf, chan) + || lib_ring_buffer_pending_data(config, buf, chan))) { + wake_up_interruptible(&buf->read_wait); + wake_up_interruptible(&chan->read_wait); + } + if (chan->switch_timer_interval) + lib_ring_buffer_switch_slow(buf, SWITCH_ACTIVE); + raw_spin_unlock(&buf->raw_tick_nohz_spinlock); + break; + case TICK_NOHZ_STOP: + spin_lock(&__get_cpu_var(ring_buffer_nohz_lock)); + lib_ring_buffer_stop_switch_timer(buf); + lib_ring_buffer_stop_read_timer(buf); + spin_unlock(&__get_cpu_var(ring_buffer_nohz_lock)); + break; + case TICK_NOHZ_RESTART: + spin_lock(&__get_cpu_var(ring_buffer_nohz_lock)); + lib_ring_buffer_start_read_timer(buf); + lib_ring_buffer_start_switch_timer(buf); + spin_unlock(&__get_cpu_var(ring_buffer_nohz_lock)); + break; + } + + return 0; +} + +void notrace lib_ring_buffer_tick_nohz_flush(void) +{ + atomic_notifier_call_chain(&tick_nohz_notifier, TICK_NOHZ_FLUSH, + NULL); +} + +void notrace lib_ring_buffer_tick_nohz_stop(void) +{ + atomic_notifier_call_chain(&tick_nohz_notifier, TICK_NOHZ_STOP, + NULL); +} + +void notrace lib_ring_buffer_tick_nohz_restart(void) +{ + atomic_notifier_call_chain(&tick_nohz_notifier, TICK_NOHZ_RESTART, + NULL); +} +#endif /* defined(CONFIG_NO_HZ) && defined(CONFIG_LIB_RING_BUFFER) */ + +/* + * Holds CPU hotplug. + */ +static void channel_unregister_notifiers(struct channel *chan) +{ + const struct lib_ring_buffer_config *config = chan->backend.config; + int cpu; + + channel_iterator_unregister_notifiers(chan); + if (config->alloc == RING_BUFFER_ALLOC_PER_CPU) { +#ifdef CONFIG_NO_HZ + /* + * Remove the nohz notifier first, so we are certain we stop + * the timers. + */ + atomic_notifier_chain_unregister(&tick_nohz_notifier, + &chan->tick_nohz_notifier); + /* + * ring_buffer_nohz_lock will not be needed below, because + * we just removed the notifiers, which were the only source of + * concurrency. + */ +#endif /* CONFIG_NO_HZ */ +#ifdef CONFIG_HOTPLUG_CPU + get_online_cpus(); + chan->cpu_hp_enable = 0; + for_each_online_cpu(cpu) { + struct lib_ring_buffer *buf = per_cpu_ptr(chan->backend.buf, + cpu); + lib_ring_buffer_stop_switch_timer(buf); + lib_ring_buffer_stop_read_timer(buf); + } + put_online_cpus(); + unregister_cpu_notifier(&chan->cpu_hp_notifier); +#else + for_each_possible_cpu(cpu) { + struct lib_ring_buffer *buf = per_cpu_ptr(chan->backend.buf, + cpu); + lib_ring_buffer_stop_switch_timer(buf); + lib_ring_buffer_stop_read_timer(buf); + } +#endif + } else { + struct lib_ring_buffer *buf = chan->backend.buf; + + lib_ring_buffer_stop_switch_timer(buf); + lib_ring_buffer_stop_read_timer(buf); + } + channel_backend_unregister_notifiers(&chan->backend); +} + +static void channel_free(struct channel *chan) +{ + channel_iterator_free(chan); + channel_backend_free(&chan->backend); + kfree(chan); +} + +/** + * channel_create - Create channel. + * @config: ring buffer instance configuration + * @name: name of the channel + * @priv: ring buffer client private data + * @buf_addr: pointer the the beginning of the preallocated buffer contiguous + * address mapping. It is used only by RING_BUFFER_STATIC + * configuration. It can be set to NULL for other backends. + * @subbuf_size: subbuffer size + * @num_subbuf: number of subbuffers + * @switch_timer_interval: Time interval (in us) to fill sub-buffers with + * padding to let readers get those sub-buffers. + * Used for live streaming. + * @read_timer_interval: Time interval (in us) to wake up pending readers. + * + * Holds cpu hotplug. + * Returns NULL on failure. + */ +struct channel *channel_create(const struct lib_ring_buffer_config *config, + const char *name, void *priv, void *buf_addr, + size_t subbuf_size, + size_t num_subbuf, unsigned int switch_timer_interval, + unsigned int read_timer_interval) +{ + int ret, cpu; + struct channel *chan; + + if (lib_ring_buffer_check_config(config, switch_timer_interval, + read_timer_interval)) + return NULL; + + chan = kzalloc(sizeof(struct channel), GFP_KERNEL); + if (!chan) + return NULL; + + ret = channel_backend_init(&chan->backend, name, config, priv, + subbuf_size, num_subbuf); + if (ret) + goto error; + + ret = channel_iterator_init(chan); + if (ret) + goto error_free_backend; + + chan->commit_count_mask = (~0UL >> chan->backend.num_subbuf_order); + chan->switch_timer_interval = usecs_to_jiffies(switch_timer_interval); + chan->read_timer_interval = usecs_to_jiffies(read_timer_interval); + kref_init(&chan->ref); + init_waitqueue_head(&chan->read_wait); + init_waitqueue_head(&chan->hp_wait); + + if (config->alloc == RING_BUFFER_ALLOC_PER_CPU) { +#if defined(CONFIG_NO_HZ) && defined(CONFIG_LIB_RING_BUFFER) + /* Only benefit from NO_HZ idle with per-cpu buffers for now. */ + chan->tick_nohz_notifier.notifier_call = + ring_buffer_tick_nohz_callback; + chan->tick_nohz_notifier.priority = ~0U; + atomic_notifier_chain_register(&tick_nohz_notifier, + &chan->tick_nohz_notifier); +#endif /* defined(CONFIG_NO_HZ) && defined(CONFIG_LIB_RING_BUFFER) */ + + /* + * In case of non-hotplug cpu, if the ring-buffer is allocated + * in early initcall, it will not be notified of secondary cpus. + * In that off case, we need to allocate for all possible cpus. + */ +#ifdef CONFIG_HOTPLUG_CPU + chan->cpu_hp_notifier.notifier_call = + lib_ring_buffer_cpu_hp_callback; + chan->cpu_hp_notifier.priority = 6; + register_cpu_notifier(&chan->cpu_hp_notifier); + + get_online_cpus(); + for_each_online_cpu(cpu) { + struct lib_ring_buffer *buf = per_cpu_ptr(chan->backend.buf, + cpu); + spin_lock(&per_cpu(ring_buffer_nohz_lock, cpu)); + lib_ring_buffer_start_switch_timer(buf); + lib_ring_buffer_start_read_timer(buf); + spin_unlock(&per_cpu(ring_buffer_nohz_lock, cpu)); + } + chan->cpu_hp_enable = 1; + put_online_cpus(); +#else + for_each_possible_cpu(cpu) { + struct lib_ring_buffer *buf = per_cpu_ptr(chan->backend.buf, + cpu); + spin_lock(&per_cpu(ring_buffer_nohz_lock, cpu)); + lib_ring_buffer_start_switch_timer(buf); + lib_ring_buffer_start_read_timer(buf); + spin_unlock(&per_cpu(ring_buffer_nohz_lock, cpu)); + } +#endif + } else { + struct lib_ring_buffer *buf = chan->backend.buf; + + lib_ring_buffer_start_switch_timer(buf); + lib_ring_buffer_start_read_timer(buf); + } + + return chan; + +error_free_backend: + channel_backend_free(&chan->backend); +error: + kfree(chan); + return NULL; +} +EXPORT_SYMBOL_GPL(channel_create); + +static +void channel_release(struct kref *kref) +{ + struct channel *chan = container_of(kref, struct channel, ref); + channel_free(chan); +} + +/** + * channel_destroy - Finalize, wait for q.s. and destroy channel. + * @chan: channel to destroy + * + * Holds cpu hotplug. + * Call "destroy" callback, finalize channels, wait for readers to release their + * reference, then destroy ring buffer data. Note that when readers have + * completed data consumption of finalized channels, get_subbuf() will return + * -ENODATA. They should release their handle at that point. + * Returns the private data pointer. + */ +void *channel_destroy(struct channel *chan) +{ + int cpu; + const struct lib_ring_buffer_config *config = chan->backend.config; + void *priv; + + channel_unregister_notifiers(chan); + + if (config->alloc == RING_BUFFER_ALLOC_PER_CPU) { + /* + * No need to hold cpu hotplug, because all notifiers have been + * unregistered. + */ + for_each_channel_cpu(cpu, chan) { + struct lib_ring_buffer *buf = per_cpu_ptr(chan->backend.buf, + cpu); + + if (config->cb.buffer_finalize) + config->cb.buffer_finalize(buf, + chan->backend.priv, + cpu); + if (buf->backend.allocated) + lib_ring_buffer_switch_slow(buf, SWITCH_FLUSH); + /* + * Perform flush before writing to finalized. + */ + smp_wmb(); + ACCESS_ONCE(buf->finalized) = 1; + wake_up_interruptible(&buf->read_wait); + } + } else { + struct lib_ring_buffer *buf = chan->backend.buf; + + if (config->cb.buffer_finalize) + config->cb.buffer_finalize(buf, chan->backend.priv, -1); + if (buf->backend.allocated) + lib_ring_buffer_switch_slow(buf, SWITCH_FLUSH); + /* + * Perform flush before writing to finalized. + */ + smp_wmb(); + ACCESS_ONCE(buf->finalized) = 1; + wake_up_interruptible(&buf->read_wait); + } + ACCESS_ONCE(chan->finalized) = 1; + wake_up_interruptible(&chan->hp_wait); + wake_up_interruptible(&chan->read_wait); + kref_put(&chan->ref, channel_release); + priv = chan->backend.priv; + return priv; +} +EXPORT_SYMBOL_GPL(channel_destroy); + +struct lib_ring_buffer *channel_get_ring_buffer( + const struct lib_ring_buffer_config *config, + struct channel *chan, int cpu) +{ + if (config->alloc == RING_BUFFER_ALLOC_GLOBAL) + return chan->backend.buf; + else + return per_cpu_ptr(chan->backend.buf, cpu); +} +EXPORT_SYMBOL_GPL(channel_get_ring_buffer); + +int lib_ring_buffer_open_read(struct lib_ring_buffer *buf) +{ + struct channel *chan = buf->backend.chan; + + if (!atomic_long_add_unless(&buf->active_readers, 1, 1)) + return -EBUSY; + kref_get(&chan->ref); + smp_mb__after_atomic_inc(); + return 0; +} +EXPORT_SYMBOL_GPL(lib_ring_buffer_open_read); + +void lib_ring_buffer_release_read(struct lib_ring_buffer *buf) +{ + struct channel *chan = buf->backend.chan; + + CHAN_WARN_ON(chan, atomic_long_read(&buf->active_readers) != 1); + smp_mb__before_atomic_dec(); + atomic_long_dec(&buf->active_readers); + kref_put(&chan->ref, channel_release); +} +EXPORT_SYMBOL_GPL(lib_ring_buffer_release_read); + +/* + * Promote compiler barrier to a smp_mb(). + * For the specific ring buffer case, this IPI call should be removed if the + * architecture does not reorder writes. This should eventually be provided by + * a separate architecture-specific infrastructure. + */ +static void remote_mb(void *info) +{ + smp_mb(); +} + +/** + * lib_ring_buffer_snapshot - save subbuffer position snapshot (for read) + * @buf: ring buffer + * @consumed: consumed count indicating the position where to read + * @produced: produced count, indicates position when to stop reading + * + * Returns -ENODATA if buffer is finalized, -EAGAIN if there is currently no + * data to read at consumed position, or 0 if the get operation succeeds. + * Busy-loop trying to get data if the tick_nohz sequence lock is held. + */ + +int lib_ring_buffer_snapshot(struct lib_ring_buffer *buf, + unsigned long *consumed, unsigned long *produced) +{ + struct channel *chan = buf->backend.chan; + const struct lib_ring_buffer_config *config = chan->backend.config; + unsigned long consumed_cur, write_offset; + int finalized; + +retry: + finalized = ACCESS_ONCE(buf->finalized); + /* + * Read finalized before counters. + */ + smp_rmb(); + consumed_cur = atomic_long_read(&buf->consumed); + /* + * No need to issue a memory barrier between consumed count read and + * write offset read, because consumed count can only change + * concurrently in overwrite mode, and we keep a sequence counter + * identifier derived from the write offset to check we are getting + * the same sub-buffer we are expecting (the sub-buffers are atomically + * "tagged" upon writes, tags are checked upon read). + */ + write_offset = v_read(config, &buf->offset); + + /* + * Check that we are not about to read the same subbuffer in + * which the writer head is. + */ + if (subbuf_trunc(write_offset, chan) - subbuf_trunc(consumed_cur, chan) + == 0) + goto nodata; + + *consumed = consumed_cur; + *produced = subbuf_trunc(write_offset, chan); + + return 0; + +nodata: + /* + * The memory barriers __wait_event()/wake_up_interruptible() take care + * of "raw_spin_is_locked" memory ordering. + */ + if (finalized) + return -ENODATA; + else if (raw_spin_is_locked(&buf->raw_tick_nohz_spinlock)) + goto retry; + else + return -EAGAIN; +} +EXPORT_SYMBOL_GPL(lib_ring_buffer_snapshot); + +/** + * lib_ring_buffer_put_snapshot - move consumed counter forward + * @buf: ring buffer + * @consumed_new: new consumed count value + */ +void lib_ring_buffer_move_consumer(struct lib_ring_buffer *buf, + unsigned long consumed_new) +{ + struct lib_ring_buffer_backend *bufb = &buf->backend; + struct channel *chan = bufb->chan; + unsigned long consumed; + + CHAN_WARN_ON(chan, atomic_long_read(&buf->active_readers) != 1); + + /* + * Only push the consumed value forward. + * If the consumed cmpxchg fails, this is because we have been pushed by + * the writer in flight recorder mode. + */ + consumed = atomic_long_read(&buf->consumed); + while ((long) consumed - (long) consumed_new < 0) + consumed = atomic_long_cmpxchg(&buf->consumed, consumed, + consumed_new); +} +EXPORT_SYMBOL_GPL(lib_ring_buffer_move_consumer); + +/** + * lib_ring_buffer_get_subbuf - get exclusive access to subbuffer for reading + * @buf: ring buffer + * @consumed: consumed count indicating the position where to read + * + * Returns -ENODATA if buffer is finalized, -EAGAIN if there is currently no + * data to read at consumed position, or 0 if the get operation succeeds. + * Busy-loop trying to get data if the tick_nohz sequence lock is held. + */ +int lib_ring_buffer_get_subbuf(struct lib_ring_buffer *buf, + unsigned long consumed) +{ + struct channel *chan = buf->backend.chan; + const struct lib_ring_buffer_config *config = chan->backend.config; + unsigned long consumed_cur, consumed_idx, commit_count, write_offset; + int ret; + int finalized; + +retry: + finalized = ACCESS_ONCE(buf->finalized); + /* + * Read finalized before counters. + */ + smp_rmb(); + consumed_cur = atomic_long_read(&buf->consumed); + consumed_idx = subbuf_index(consumed, chan); + commit_count = v_read(config, &buf->commit_cold[consumed_idx].cc_sb); + /* + * Make sure we read the commit count before reading the buffer + * data and the write offset. Correct consumed offset ordering + * wrt commit count is insured by the use of cmpxchg to update + * the consumed offset. + * smp_call_function_single can fail if the remote CPU is offline, + * this is OK because then there is no wmb to execute there. + * If our thread is executing on the same CPU as the on the buffers + * belongs to, we don't have to synchronize it at all. If we are + * migrated, the scheduler will take care of the memory barriers. + * Normally, smp_call_function_single() should ensure program order when + * executing the remote function, which implies that it surrounds the + * function execution with : + * smp_mb() + * send IPI + * csd_lock_wait + * recv IPI + * smp_mb() + * exec. function + * smp_mb() + * csd unlock + * smp_mb() + * + * However, smp_call_function_single() does not seem to clearly execute + * such barriers. It depends on spinlock semantic to provide the barrier + * before executing the IPI and, when busy-looping, csd_lock_wait only + * executes smp_mb() when it has to wait for the other CPU. + * + * I don't trust this code. Therefore, let's add the smp_mb() sequence + * required ourself, even if duplicated. It has no performance impact + * anyway. + * + * smp_mb() is needed because smp_rmb() and smp_wmb() only order read vs + * read and write vs write. They do not ensure core synchronization. We + * really have to ensure total order between the 3 barriers running on + * the 2 CPUs. + */ + if (config->ipi == RING_BUFFER_IPI_BARRIER) { + if (config->sync == RING_BUFFER_SYNC_PER_CPU + && config->alloc == RING_BUFFER_ALLOC_PER_CPU) { + if (raw_smp_processor_id() != buf->backend.cpu) { + /* Total order with IPI handler smp_mb() */ + smp_mb(); + smp_call_function_single(buf->backend.cpu, + remote_mb, NULL, 1); + /* Total order with IPI handler smp_mb() */ + smp_mb(); + } + } else { + /* Total order with IPI handler smp_mb() */ + smp_mb(); + smp_call_function(remote_mb, NULL, 1); + /* Total order with IPI handler smp_mb() */ + smp_mb(); + } + } else { + /* + * Local rmb to match the remote wmb to read the commit count + * before the buffer data and the write offset. + */ + smp_rmb(); + } + + write_offset = v_read(config, &buf->offset); + + /* + * Check that the buffer we are getting is after or at consumed_cur + * position. + */ + if ((long) subbuf_trunc(consumed, chan) + - (long) subbuf_trunc(consumed_cur, chan) < 0) + goto nodata; + + /* + * Check that the subbuffer we are trying to consume has been + * already fully committed. + */ + if (((commit_count - chan->backend.subbuf_size) + & chan->commit_count_mask) + - (buf_trunc(consumed_cur, chan) + >> chan->backend.num_subbuf_order) + != 0) + goto nodata; + + /* + * Check that we are not about to read the same subbuffer in + * which the writer head is. + */ + if (subbuf_trunc(write_offset, chan) - subbuf_trunc(consumed_cur, chan) + == 0) + goto nodata; + + /* + * Failure to get the subbuffer causes a busy-loop retry without going + * to a wait queue. These are caused by short-lived race windows where + * the writer is getting access to a subbuffer we were trying to get + * access to. Also checks that the "consumed" buffer count we are + * looking for matches the one contained in the subbuffer id. + */ + ret = update_read_sb_index(config, &buf->backend, &chan->backend, + consumed_idx, buf_trunc_val(consumed, chan)); + if (ret) + goto retry; + subbuffer_id_clear_noref(config, &buf->backend.buf_rsb.id); + + buf->get_subbuf_consumed = consumed; + buf->get_subbuf = 1; + + return 0; + +nodata: + /* + * The memory barriers __wait_event()/wake_up_interruptible() take care + * of "raw_spin_is_locked" memory ordering. + */ + if (finalized) + return -ENODATA; + else if (raw_spin_is_locked(&buf->raw_tick_nohz_spinlock)) + goto retry; + else + return -EAGAIN; +} +EXPORT_SYMBOL_GPL(lib_ring_buffer_get_subbuf); + +/** + * lib_ring_buffer_put_subbuf - release exclusive subbuffer access + * @buf: ring buffer + */ +void lib_ring_buffer_put_subbuf(struct lib_ring_buffer *buf) +{ + struct lib_ring_buffer_backend *bufb = &buf->backend; + struct channel *chan = bufb->chan; + const struct lib_ring_buffer_config *config = chan->backend.config; + unsigned long read_sb_bindex, consumed_idx, consumed; + + CHAN_WARN_ON(chan, atomic_long_read(&buf->active_readers) != 1); + + if (!buf->get_subbuf) { + /* + * Reader puts a subbuffer it did not get. + */ + CHAN_WARN_ON(chan, 1); + return; + } + consumed = buf->get_subbuf_consumed; + buf->get_subbuf = 0; + + /* + * Clear the records_unread counter. (overruns counter) + * Can still be non-zero if a file reader simply grabbed the data + * without using iterators. + * Can be below zero if an iterator is used on a snapshot more than + * once. + */ + read_sb_bindex = subbuffer_id_get_index(config, bufb->buf_rsb.id); + v_add(config, v_read(config, + &bufb->array[read_sb_bindex]->records_unread), + &bufb->records_read); + v_set(config, &bufb->array[read_sb_bindex]->records_unread, 0); + CHAN_WARN_ON(chan, config->mode == RING_BUFFER_OVERWRITE + && subbuffer_id_is_noref(config, bufb->buf_rsb.id)); + subbuffer_id_set_noref(config, &bufb->buf_rsb.id); + + /* + * Exchange the reader subbuffer with the one we put in its place in the + * writer subbuffer table. Expect the original consumed count. If + * update_read_sb_index fails, this is because the writer updated the + * subbuffer concurrently. We should therefore keep the subbuffer we + * currently have: it has become invalid to try reading this sub-buffer + * consumed count value anyway. + */ + consumed_idx = subbuf_index(consumed, chan); + update_read_sb_index(config, &buf->backend, &chan->backend, + consumed_idx, buf_trunc_val(consumed, chan)); + /* + * update_read_sb_index return value ignored. Don't exchange sub-buffer + * if the writer concurrently updated it. + */ +} +EXPORT_SYMBOL_GPL(lib_ring_buffer_put_subbuf); + +/* + * cons_offset is an iterator on all subbuffer offsets between the reader + * position and the writer position. (inclusive) + */ +static +void lib_ring_buffer_print_subbuffer_errors(struct lib_ring_buffer *buf, + struct channel *chan, + unsigned long cons_offset, + int cpu) +{ + const struct lib_ring_buffer_config *config = chan->backend.config; + unsigned long cons_idx, commit_count, commit_count_sb; + + cons_idx = subbuf_index(cons_offset, chan); + commit_count = v_read(config, &buf->commit_hot[cons_idx].cc); + commit_count_sb = v_read(config, &buf->commit_cold[cons_idx].cc_sb); + + if (subbuf_offset(commit_count, chan) != 0) + printk(KERN_WARNING + "ring buffer %s, cpu %d: " + "commit count in subbuffer %lu,\n" + "expecting multiples of %lu bytes\n" + " [ %lu bytes committed, %lu bytes reader-visible ]\n", + chan->backend.name, cpu, cons_idx, + chan->backend.subbuf_size, + commit_count, commit_count_sb); + + printk(KERN_DEBUG "ring buffer: %s, cpu %d: %lu bytes committed\n", + chan->backend.name, cpu, commit_count); +} + +static +void lib_ring_buffer_print_buffer_errors(struct lib_ring_buffer *buf, + struct channel *chan, + void *priv, int cpu) +{ + const struct lib_ring_buffer_config *config = chan->backend.config; + unsigned long write_offset, cons_offset; + + /* + * Can be called in the error path of allocation when + * trans_channel_data is not yet set. + */ + if (!chan) + return; + /* + * No need to order commit_count, write_offset and cons_offset reads + * because we execute at teardown when no more writer nor reader + * references are left. + */ + write_offset = v_read(config, &buf->offset); + cons_offset = atomic_long_read(&buf->consumed); + if (write_offset != cons_offset) + printk(KERN_WARNING + "ring buffer %s, cpu %d: " + "non-consumed data\n" + " [ %lu bytes written, %lu bytes read ]\n", + chan->backend.name, cpu, write_offset, cons_offset); + + for (cons_offset = atomic_long_read(&buf->consumed); + (long) (subbuf_trunc((unsigned long) v_read(config, &buf->offset), + chan) + - cons_offset) > 0; + cons_offset = subbuf_align(cons_offset, chan)) + lib_ring_buffer_print_subbuffer_errors(buf, chan, cons_offset, + cpu); +} + +static +void lib_ring_buffer_print_errors(struct channel *chan, + struct lib_ring_buffer *buf, int cpu) +{ + const struct lib_ring_buffer_config *config = chan->backend.config; + void *priv = chan->backend.priv; + + printk(KERN_DEBUG "ring buffer %s, cpu %d: %lu records written, " + "%lu records overrun\n", + chan->backend.name, cpu, + v_read(config, &buf->records_count), + v_read(config, &buf->records_overrun)); + + if (v_read(config, &buf->records_lost_full) + || v_read(config, &buf->records_lost_wrap) + || v_read(config, &buf->records_lost_big)) + printk(KERN_WARNING + "ring buffer %s, cpu %d: records were lost. Caused by:\n" + " [ %lu buffer full, %lu nest buffer wrap-around, " + "%lu event too big ]\n", + chan->backend.name, cpu, + v_read(config, &buf->records_lost_full), + v_read(config, &buf->records_lost_wrap), + v_read(config, &buf->records_lost_big)); + + lib_ring_buffer_print_buffer_errors(buf, chan, priv, cpu); +} + +/* + * lib_ring_buffer_switch_old_start: Populate old subbuffer header. + * + * Only executed when the buffer is finalized, in SWITCH_FLUSH. + */ +static +void lib_ring_buffer_switch_old_start(struct lib_ring_buffer *buf, + struct channel *chan, + struct switch_offsets *offsets, + u64 tsc) +{ + const struct lib_ring_buffer_config *config = chan->backend.config; + unsigned long oldidx = subbuf_index(offsets->old, chan); + unsigned long commit_count; + + config->cb.buffer_begin(buf, tsc, oldidx); + + /* + * Order all writes to buffer before the commit count update that will + * determine that the subbuffer is full. + */ + if (config->ipi == RING_BUFFER_IPI_BARRIER) { + /* + * Must write slot data before incrementing commit count. This + * compiler barrier is upgraded into a smp_mb() by the IPI sent + * by get_subbuf(). + */ + barrier(); + } else + smp_wmb(); + v_add(config, config->cb.subbuffer_header_size(), + &buf->commit_hot[oldidx].cc); + commit_count = v_read(config, &buf->commit_hot[oldidx].cc); + /* Check if the written buffer has to be delivered */ + lib_ring_buffer_check_deliver(config, buf, chan, offsets->old, + commit_count, oldidx); + lib_ring_buffer_write_commit_counter(config, buf, chan, oldidx, + offsets->old, commit_count, + config->cb.subbuffer_header_size()); +} + +/* + * lib_ring_buffer_switch_old_end: switch old subbuffer + * + * Note : offset_old should never be 0 here. It is ok, because we never perform + * buffer switch on an empty subbuffer in SWITCH_ACTIVE mode. The caller + * increments the offset_old value when doing a SWITCH_FLUSH on an empty + * subbuffer. + */ +static +void lib_ring_buffer_switch_old_end(struct lib_ring_buffer *buf, + struct channel *chan, + struct switch_offsets *offsets, + u64 tsc) +{ + const struct lib_ring_buffer_config *config = chan->backend.config; + unsigned long oldidx = subbuf_index(offsets->old - 1, chan); + unsigned long commit_count, padding_size, data_size; + + data_size = subbuf_offset(offsets->old - 1, chan) + 1; + padding_size = chan->backend.subbuf_size - data_size; + subbuffer_set_data_size(config, &buf->backend, oldidx, data_size); + + /* + * Order all writes to buffer before the commit count update that will + * determine that the subbuffer is full. + */ + if (config->ipi == RING_BUFFER_IPI_BARRIER) { + /* + * Must write slot data before incrementing commit count. This + * compiler barrier is upgraded into a smp_mb() by the IPI sent + * by get_subbuf(). + */ + barrier(); + } else + smp_wmb(); + v_add(config, padding_size, &buf->commit_hot[oldidx].cc); + commit_count = v_read(config, &buf->commit_hot[oldidx].cc); + lib_ring_buffer_check_deliver(config, buf, chan, offsets->old - 1, + commit_count, oldidx); + lib_ring_buffer_write_commit_counter(config, buf, chan, oldidx, + offsets->old, commit_count, + padding_size); +} + +/* + * lib_ring_buffer_switch_new_start: Populate new subbuffer. + * + * This code can be executed unordered : writers may already have written to the + * sub-buffer before this code gets executed, caution. The commit makes sure + * that this code is executed before the deliver of this sub-buffer. + */ +static +void lib_ring_buffer_switch_new_start(struct lib_ring_buffer *buf, + struct channel *chan, + struct switch_offsets *offsets, + u64 tsc) +{ + const struct lib_ring_buffer_config *config = chan->backend.config; + unsigned long beginidx = subbuf_index(offsets->begin, chan); + unsigned long commit_count; + + config->cb.buffer_begin(buf, tsc, beginidx); + + /* + * Order all writes to buffer before the commit count update that will + * determine that the subbuffer is full. + */ + if (config->ipi == RING_BUFFER_IPI_BARRIER) { + /* + * Must write slot data before incrementing commit count. This + * compiler barrier is upgraded into a smp_mb() by the IPI sent + * by get_subbuf(). + */ + barrier(); + } else + smp_wmb(); + v_add(config, config->cb.subbuffer_header_size(), + &buf->commit_hot[beginidx].cc); + commit_count = v_read(config, &buf->commit_hot[beginidx].cc); + /* Check if the written buffer has to be delivered */ + lib_ring_buffer_check_deliver(config, buf, chan, offsets->begin, + commit_count, beginidx); + lib_ring_buffer_write_commit_counter(config, buf, chan, beginidx, + offsets->begin, commit_count, + config->cb.subbuffer_header_size()); +} + +/* + * lib_ring_buffer_switch_new_end: finish switching current subbuffer + * + * The only remaining threads could be the ones with pending commits. They will + * have to do the deliver themselves. + */ +static +void lib_ring_buffer_switch_new_end(struct lib_ring_buffer *buf, + struct channel *chan, + struct switch_offsets *offsets, + u64 tsc) +{ + const struct lib_ring_buffer_config *config = chan->backend.config; + unsigned long endidx = subbuf_index(offsets->end - 1, chan); + unsigned long commit_count, padding_size, data_size; + + data_size = subbuf_offset(offsets->end - 1, chan) + 1; + padding_size = chan->backend.subbuf_size - data_size; + subbuffer_set_data_size(config, &buf->backend, endidx, data_size); + + /* + * Order all writes to buffer before the commit count update that will + * determine that the subbuffer is full. + */ + if (config->ipi == RING_BUFFER_IPI_BARRIER) { + /* + * Must write slot data before incrementing commit count. This + * compiler barrier is upgraded into a smp_mb() by the IPI sent + * by get_subbuf(). + */ + barrier(); + } else + smp_wmb(); + v_add(config, padding_size, &buf->commit_hot[endidx].cc); + commit_count = v_read(config, &buf->commit_hot[endidx].cc); + lib_ring_buffer_check_deliver(config, buf, chan, offsets->end - 1, + commit_count, endidx); + lib_ring_buffer_write_commit_counter(config, buf, chan, endidx, + offsets->end, commit_count, + padding_size); +} + +/* + * Returns : + * 0 if ok + * !0 if execution must be aborted. + */ +static +int lib_ring_buffer_try_switch_slow(enum switch_mode mode, + struct lib_ring_buffer *buf, + struct channel *chan, + struct switch_offsets *offsets, + u64 *tsc) +{ + const struct lib_ring_buffer_config *config = chan->backend.config; + unsigned long off; + + offsets->begin = v_read(config, &buf->offset); + offsets->old = offsets->begin; + offsets->switch_old_start = 0; + off = subbuf_offset(offsets->begin, chan); + + *tsc = config->cb.ring_buffer_clock_read(chan); + + /* + * Ensure we flush the header of an empty subbuffer when doing the + * finalize (SWITCH_FLUSH). This ensures that we end up knowing the + * total data gathering duration even if there were no records saved + * after the last buffer switch. + * In SWITCH_ACTIVE mode, switch the buffer when it contains events. + * SWITCH_ACTIVE only flushes the current subbuffer, dealing with end of + * subbuffer header as appropriate. + * The next record that reserves space will be responsible for + * populating the following subbuffer header. We choose not to populate + * the next subbuffer header here because we want to be able to use + * SWITCH_ACTIVE for periodical buffer flush and CPU tick_nohz stop + * buffer flush, which must guarantee that all the buffer content + * (records and header timestamps) are visible to the reader. This is + * required for quiescence guarantees for the fusion merge. + */ + if (mode == SWITCH_FLUSH || off > 0) { + if (unlikely(off == 0)) { + /* + * The client does not save any header information. + * Don't switch empty subbuffer on finalize, because it + * is invalid to deliver a completely empty subbuffer. + */ + if (!config->cb.subbuffer_header_size()) + return -1; + /* + * Need to write the subbuffer start header on finalize. + */ + offsets->switch_old_start = 1; + } + offsets->begin = subbuf_align(offsets->begin, chan); + } else + return -1; /* we do not have to switch : buffer is empty */ + /* Note: old points to the next subbuf at offset 0 */ + offsets->end = offsets->begin; + return 0; +} + +/* + * Force a sub-buffer switch. This operation is completely reentrant : can be + * called while tracing is active with absolutely no lock held. + * + * Note, however, that as a v_cmpxchg is used for some atomic + * operations, this function must be called from the CPU which owns the buffer + * for a ACTIVE flush. + */ +void lib_ring_buffer_switch_slow(struct lib_ring_buffer *buf, enum switch_mode mode) +{ + struct channel *chan = buf->backend.chan; + const struct lib_ring_buffer_config *config = chan->backend.config; + struct switch_offsets offsets; + unsigned long oldidx; + u64 tsc; + + offsets.size = 0; + + /* + * Perform retryable operations. + */ + do { + if (lib_ring_buffer_try_switch_slow(mode, buf, chan, &offsets, + &tsc)) + return; /* Switch not needed */ + } while (v_cmpxchg(config, &buf->offset, offsets.old, offsets.end) + != offsets.old); + + /* + * Atomically update last_tsc. This update races against concurrent + * atomic updates, but the race will always cause supplementary full TSC + * records, never the opposite (missing a full TSC record when it would + * be needed). + */ + save_last_tsc(config, buf, tsc); + + /* + * Push the reader if necessary + */ + lib_ring_buffer_reserve_push_reader(buf, chan, offsets.old); + + oldidx = subbuf_index(offsets.old, chan); + lib_ring_buffer_clear_noref(config, &buf->backend, oldidx); + + /* + * May need to populate header start on SWITCH_FLUSH. + */ + if (offsets.switch_old_start) { + lib_ring_buffer_switch_old_start(buf, chan, &offsets, tsc); + offsets.old += config->cb.subbuffer_header_size(); + } + + /* + * Switch old subbuffer. + */ + lib_ring_buffer_switch_old_end(buf, chan, &offsets, tsc); +} +EXPORT_SYMBOL_GPL(lib_ring_buffer_switch_slow); + +/* + * Returns : + * 0 if ok + * -ENOSPC if event size is too large for packet. + * -ENOBUFS if there is currently not enough space in buffer for the event. + * -EIO if data cannot be written into the buffer for any other reason. + */ +static +int lib_ring_buffer_try_reserve_slow(struct lib_ring_buffer *buf, + struct channel *chan, + struct switch_offsets *offsets, + struct lib_ring_buffer_ctx *ctx) +{ + const struct lib_ring_buffer_config *config = chan->backend.config; + unsigned long reserve_commit_diff; + + offsets->begin = v_read(config, &buf->offset); + offsets->old = offsets->begin; + offsets->switch_new_start = 0; + offsets->switch_new_end = 0; + offsets->switch_old_end = 0; + offsets->pre_header_padding = 0; + + ctx->tsc = config->cb.ring_buffer_clock_read(chan); + if ((int64_t) ctx->tsc == -EIO) + return -EIO; + + if (last_tsc_overflow(config, buf, ctx->tsc)) + ctx->rflags |= RING_BUFFER_RFLAG_FULL_TSC; + + if (unlikely(subbuf_offset(offsets->begin, ctx->chan) == 0)) { + offsets->switch_new_start = 1; /* For offsets->begin */ + } else { + offsets->size = config->cb.record_header_size(config, chan, + offsets->begin, + &offsets->pre_header_padding, + ctx); + offsets->size += + lib_ring_buffer_align(offsets->begin + offsets->size, + ctx->largest_align) + + ctx->data_size; + if (unlikely(subbuf_offset(offsets->begin, chan) + + offsets->size > chan->backend.subbuf_size)) { + offsets->switch_old_end = 1; /* For offsets->old */ + offsets->switch_new_start = 1; /* For offsets->begin */ + } + } + if (unlikely(offsets->switch_new_start)) { + unsigned long sb_index; + + /* + * We are typically not filling the previous buffer completely. + */ + if (likely(offsets->switch_old_end)) + offsets->begin = subbuf_align(offsets->begin, chan); + offsets->begin = offsets->begin + + config->cb.subbuffer_header_size(); + /* Test new buffer integrity */ + sb_index = subbuf_index(offsets->begin, chan); + reserve_commit_diff = + (buf_trunc(offsets->begin, chan) + >> chan->backend.num_subbuf_order) + - ((unsigned long) v_read(config, + &buf->commit_cold[sb_index].cc_sb) + & chan->commit_count_mask); + if (likely(reserve_commit_diff == 0)) { + /* Next subbuffer not being written to. */ + if (unlikely(config->mode != RING_BUFFER_OVERWRITE && + subbuf_trunc(offsets->begin, chan) + - subbuf_trunc((unsigned long) + atomic_long_read(&buf->consumed), chan) + >= chan->backend.buf_size)) { + /* + * We do not overwrite non consumed buffers + * and we are full : record is lost. + */ + v_inc(config, &buf->records_lost_full); + return -ENOBUFS; + } else { + /* + * Next subbuffer not being written to, and we + * are either in overwrite mode or the buffer is + * not full. It's safe to write in this new + * subbuffer. + */ + } + } else { + /* + * Next subbuffer reserve offset does not match the + * commit offset. Drop record in producer-consumer and + * overwrite mode. Caused by either a writer OOPS or too + * many nested writes over a reserve/commit pair. + */ + v_inc(config, &buf->records_lost_wrap); + return -EIO; + } + offsets->size = + config->cb.record_header_size(config, chan, + offsets->begin, + &offsets->pre_header_padding, + ctx); + offsets->size += + lib_ring_buffer_align(offsets->begin + offsets->size, + ctx->largest_align) + + ctx->data_size; + if (unlikely(subbuf_offset(offsets->begin, chan) + + offsets->size > chan->backend.subbuf_size)) { + /* + * Record too big for subbuffers, report error, don't + * complete the sub-buffer switch. + */ + v_inc(config, &buf->records_lost_big); + return -ENOSPC; + } else { + /* + * We just made a successful buffer switch and the + * record fits in the new subbuffer. Let's write. + */ + } + } else { + /* + * Record fits in the current buffer and we are not on a switch + * boundary. It's safe to write. + */ + } + offsets->end = offsets->begin + offsets->size; + + if (unlikely(subbuf_offset(offsets->end, chan) == 0)) { + /* + * The offset_end will fall at the very beginning of the next + * subbuffer. + */ + offsets->switch_new_end = 1; /* For offsets->begin */ + } + return 0; +} + +/** + * lib_ring_buffer_reserve_slow - Atomic slot reservation in a buffer. + * @ctx: ring buffer context. + * + * Return : -NOBUFS if not enough space, -ENOSPC if event size too large, + * -EIO for other errors, else returns 0. + * It will take care of sub-buffer switching. + */ +int lib_ring_buffer_reserve_slow(struct lib_ring_buffer_ctx *ctx) +{ + struct channel *chan = ctx->chan; + const struct lib_ring_buffer_config *config = chan->backend.config; + struct lib_ring_buffer *buf; + struct switch_offsets offsets; + int ret; + + if (config->alloc == RING_BUFFER_ALLOC_PER_CPU) + buf = per_cpu_ptr(chan->backend.buf, ctx->cpu); + else + buf = chan->backend.buf; + ctx->buf = buf; + + offsets.size = 0; + + do { + ret = lib_ring_buffer_try_reserve_slow(buf, chan, &offsets, + ctx); + if (unlikely(ret)) + return ret; + } while (unlikely(v_cmpxchg(config, &buf->offset, offsets.old, + offsets.end) + != offsets.old)); + + /* + * Atomically update last_tsc. This update races against concurrent + * atomic updates, but the race will always cause supplementary full TSC + * records, never the opposite (missing a full TSC record when it would + * be needed). + */ + save_last_tsc(config, buf, ctx->tsc); + + /* + * Push the reader if necessary + */ + lib_ring_buffer_reserve_push_reader(buf, chan, offsets.end - 1); + + /* + * Clear noref flag for this subbuffer. + */ + lib_ring_buffer_clear_noref(config, &buf->backend, + subbuf_index(offsets.end - 1, chan)); + + /* + * Switch old subbuffer if needed. + */ + if (unlikely(offsets.switch_old_end)) { + lib_ring_buffer_clear_noref(config, &buf->backend, + subbuf_index(offsets.old - 1, chan)); + lib_ring_buffer_switch_old_end(buf, chan, &offsets, ctx->tsc); + } + + /* + * Populate new subbuffer. + */ + if (unlikely(offsets.switch_new_start)) + lib_ring_buffer_switch_new_start(buf, chan, &offsets, ctx->tsc); + + if (unlikely(offsets.switch_new_end)) + lib_ring_buffer_switch_new_end(buf, chan, &offsets, ctx->tsc); + + ctx->slot_size = offsets.size; + ctx->pre_offset = offsets.begin; + ctx->buf_offset = offsets.begin + offsets.pre_header_padding; + return 0; +} +EXPORT_SYMBOL_GPL(lib_ring_buffer_reserve_slow); diff --git a/libringbuffer/ring_buffer_iterator.c b/libringbuffer/ring_buffer_iterator.c new file mode 100644 index 0000000..df8d485 --- /dev/null +++ b/libringbuffer/ring_buffer_iterator.c @@ -0,0 +1,796 @@ +/* + * ring_buffer_iterator.c + * + * (C) Copyright 2010 - Mathieu Desnoyers + * + * Ring buffer and channel iterators. Get each event of a channel in order. Uses + * a prio heap for per-cpu buffers, giving a O(log(NR_CPUS)) algorithmic + * complexity for the "get next event" operation. + * + * Author: + * Mathieu Desnoyers + * + * Dual LGPL v2.1/GPL v2 license. + */ + +#include "../../wrapper/ringbuffer/iterator.h" +#include +#include +#include + +/* + * Safety factor taking into account internal kernel interrupt latency. + * Assuming 250ms worse-case latency. + */ +#define MAX_SYSTEM_LATENCY 250 + +/* + * Maximum delta expected between trace clocks. At most 1 jiffy delta. + */ +#define MAX_CLOCK_DELTA (jiffies_to_usecs(1) * 1000) + +/** + * lib_ring_buffer_get_next_record - Get the next record in a buffer. + * @chan: channel + * @buf: buffer + * + * Returns the size of the event read, -EAGAIN if buffer is empty, -ENODATA if + * buffer is empty and finalized. The buffer must already be opened for reading. + */ +ssize_t lib_ring_buffer_get_next_record(struct channel *chan, + struct lib_ring_buffer *buf) +{ + const struct lib_ring_buffer_config *config = chan->backend.config; + struct lib_ring_buffer_iter *iter = &buf->iter; + int ret; + +restart: + switch (iter->state) { + case ITER_GET_SUBBUF: + ret = lib_ring_buffer_get_next_subbuf(buf); + if (ret && !ACCESS_ONCE(buf->finalized) + && config->alloc == RING_BUFFER_ALLOC_GLOBAL) { + /* + * Use "pull" scheme for global buffers. The reader + * itself flushes the buffer to "pull" data not visible + * to readers yet. Flush current subbuffer and re-try. + * + * Per-CPU buffers rather use a "push" scheme because + * the IPI needed to flush all CPU's buffers is too + * costly. In the "push" scheme, the reader waits for + * the writer periodic deferrable timer to flush the + * buffers (keeping track of a quiescent state + * timestamp). Therefore, the writer "pushes" data out + * of the buffers rather than letting the reader "pull" + * data from the buffer. + */ + lib_ring_buffer_switch_slow(buf, SWITCH_ACTIVE); + ret = lib_ring_buffer_get_next_subbuf(buf); + } + if (ret) + return ret; + iter->consumed = buf->cons_snapshot; + iter->data_size = lib_ring_buffer_get_read_data_size(config, buf); + iter->read_offset = iter->consumed; + /* skip header */ + iter->read_offset += config->cb.subbuffer_header_size(); + iter->state = ITER_TEST_RECORD; + goto restart; + case ITER_TEST_RECORD: + if (iter->read_offset - iter->consumed >= iter->data_size) { + iter->state = ITER_PUT_SUBBUF; + } else { + CHAN_WARN_ON(chan, !config->cb.record_get); + config->cb.record_get(config, chan, buf, + iter->read_offset, + &iter->header_len, + &iter->payload_len, + &iter->timestamp); + iter->read_offset += iter->header_len; + subbuffer_consume_record(config, &buf->backend); + iter->state = ITER_NEXT_RECORD; + return iter->payload_len; + } + goto restart; + case ITER_NEXT_RECORD: + iter->read_offset += iter->payload_len; + iter->state = ITER_TEST_RECORD; + goto restart; + case ITER_PUT_SUBBUF: + lib_ring_buffer_put_next_subbuf(buf); + iter->state = ITER_GET_SUBBUF; + goto restart; + default: + CHAN_WARN_ON(chan, 1); /* Should not happen */ + return -EPERM; + } +} +EXPORT_SYMBOL_GPL(lib_ring_buffer_get_next_record); + +static int buf_is_higher(void *a, void *b) +{ + struct lib_ring_buffer *bufa = a; + struct lib_ring_buffer *bufb = b; + + /* Consider lowest timestamps to be at the top of the heap */ + return (bufa->iter.timestamp < bufb->iter.timestamp); +} + +static +void lib_ring_buffer_get_empty_buf_records(const struct lib_ring_buffer_config *config, + struct channel *chan) +{ + struct lttng_ptr_heap *heap = &chan->iter.heap; + struct lib_ring_buffer *buf, *tmp; + ssize_t len; + + list_for_each_entry_safe(buf, tmp, &chan->iter.empty_head, + iter.empty_node) { + len = lib_ring_buffer_get_next_record(chan, buf); + + /* + * Deal with -EAGAIN and -ENODATA. + * len >= 0 means record contains data. + * -EBUSY should never happen, because we support only one + * reader. + */ + switch (len) { + case -EAGAIN: + /* Keep node in empty list */ + break; + case -ENODATA: + /* + * Buffer is finalized. Don't add to list of empty + * buffer, because it has no more data to provide, ever. + */ + list_del(&buf->iter.empty_node); + break; + case -EBUSY: + CHAN_WARN_ON(chan, 1); + break; + default: + /* + * Insert buffer into the heap, remove from empty buffer + * list. + */ + CHAN_WARN_ON(chan, len < 0); + list_del(&buf->iter.empty_node); + CHAN_WARN_ON(chan, lttng_heap_insert(heap, buf)); + } + } +} + +static +void lib_ring_buffer_wait_for_qs(const struct lib_ring_buffer_config *config, + struct channel *chan) +{ + u64 timestamp_qs; + unsigned long wait_msecs; + + /* + * No need to wait if no empty buffers are present. + */ + if (list_empty(&chan->iter.empty_head)) + return; + + timestamp_qs = config->cb.ring_buffer_clock_read(chan); + /* + * We need to consider previously empty buffers. + * Do a get next buf record on each of them. Add them to + * the heap if they have data. If at least one of them + * don't have data, we need to wait for + * switch_timer_interval + MAX_SYSTEM_LATENCY (so we are sure the + * buffers have been switched either by the timer or idle entry) and + * check them again, adding them if they have data. + */ + lib_ring_buffer_get_empty_buf_records(config, chan); + + /* + * No need to wait if no empty buffers are present. + */ + if (list_empty(&chan->iter.empty_head)) + return; + + /* + * We need to wait for the buffer switch timer to run. If the + * CPU is idle, idle entry performed the switch. + * TODO: we could optimize further by skipping the sleep if all + * empty buffers belong to idle or offline cpus. + */ + wait_msecs = jiffies_to_msecs(chan->switch_timer_interval); + wait_msecs += MAX_SYSTEM_LATENCY; + msleep(wait_msecs); + lib_ring_buffer_get_empty_buf_records(config, chan); + /* + * Any buffer still in the empty list here cannot possibly + * contain an event with a timestamp prior to "timestamp_qs". + * The new quiescent state timestamp is the one we grabbed + * before waiting for buffer data. It is therefore safe to + * ignore empty buffers up to last_qs timestamp for fusion + * merge. + */ + chan->iter.last_qs = timestamp_qs; +} + +/** + * channel_get_next_record - Get the next record in a channel. + * @chan: channel + * @ret_buf: the buffer in which the event is located (output) + * + * Returns the size of new current event, -EAGAIN if all buffers are empty, + * -ENODATA if all buffers are empty and finalized. The channel must already be + * opened for reading. + */ + +ssize_t channel_get_next_record(struct channel *chan, + struct lib_ring_buffer **ret_buf) +{ + const struct lib_ring_buffer_config *config = chan->backend.config; + struct lib_ring_buffer *buf; + struct lttng_ptr_heap *heap; + ssize_t len; + + if (config->alloc == RING_BUFFER_ALLOC_GLOBAL) { + *ret_buf = channel_get_ring_buffer(config, chan, 0); + return lib_ring_buffer_get_next_record(chan, *ret_buf); + } + + heap = &chan->iter.heap; + + /* + * get next record for topmost buffer. + */ + buf = lttng_heap_maximum(heap); + if (buf) { + len = lib_ring_buffer_get_next_record(chan, buf); + /* + * Deal with -EAGAIN and -ENODATA. + * len >= 0 means record contains data. + */ + switch (len) { + case -EAGAIN: + buf->iter.timestamp = 0; + list_add(&buf->iter.empty_node, &chan->iter.empty_head); + /* Remove topmost buffer from the heap */ + CHAN_WARN_ON(chan, lttng_heap_remove(heap) != buf); + break; + case -ENODATA: + /* + * Buffer is finalized. Remove buffer from heap and + * don't add to list of empty buffer, because it has no + * more data to provide, ever. + */ + CHAN_WARN_ON(chan, lttng_heap_remove(heap) != buf); + break; + case -EBUSY: + CHAN_WARN_ON(chan, 1); + break; + default: + /* + * Reinsert buffer into the heap. Note that heap can be + * partially empty, so we need to use + * lttng_heap_replace_max(). + */ + CHAN_WARN_ON(chan, len < 0); + CHAN_WARN_ON(chan, lttng_heap_replace_max(heap, buf) != buf); + break; + } + } + + buf = lttng_heap_maximum(heap); + if (!buf || buf->iter.timestamp > chan->iter.last_qs) { + /* + * Deal with buffers previously showing no data. + * Add buffers containing data to the heap, update + * last_qs. + */ + lib_ring_buffer_wait_for_qs(config, chan); + } + + *ret_buf = buf = lttng_heap_maximum(heap); + if (buf) { + /* + * If this warning triggers, you probably need to check your + * system interrupt latency. Typical causes: too many printk() + * output going to a serial console with interrupts off. + * Allow for MAX_CLOCK_DELTA ns timestamp delta going backward. + * Observed on SMP KVM setups with trace_clock(). + */ + if (chan->iter.last_timestamp + > (buf->iter.timestamp + MAX_CLOCK_DELTA)) { + printk(KERN_WARNING "ring_buffer: timestamps going " + "backward. Last time %llu ns, cpu %d, " + "current time %llu ns, cpu %d, " + "delta %llu ns.\n", + chan->iter.last_timestamp, chan->iter.last_cpu, + buf->iter.timestamp, buf->backend.cpu, + chan->iter.last_timestamp - buf->iter.timestamp); + CHAN_WARN_ON(chan, 1); + } + chan->iter.last_timestamp = buf->iter.timestamp; + chan->iter.last_cpu = buf->backend.cpu; + return buf->iter.payload_len; + } else { + /* Heap is empty */ + if (list_empty(&chan->iter.empty_head)) + return -ENODATA; /* All buffers finalized */ + else + return -EAGAIN; /* Temporarily empty */ + } +} +EXPORT_SYMBOL_GPL(channel_get_next_record); + +static +void lib_ring_buffer_iterator_init(struct channel *chan, struct lib_ring_buffer *buf) +{ + if (buf->iter.allocated) + return; + + buf->iter.allocated = 1; + if (chan->iter.read_open && !buf->iter.read_open) { + CHAN_WARN_ON(chan, lib_ring_buffer_open_read(buf) != 0); + buf->iter.read_open = 1; + } + + /* Add to list of buffers without any current record */ + if (chan->backend.config->alloc == RING_BUFFER_ALLOC_PER_CPU) + list_add(&buf->iter.empty_node, &chan->iter.empty_head); +} + +#ifdef CONFIG_HOTPLUG_CPU +static +int __cpuinit channel_iterator_cpu_hotplug(struct notifier_block *nb, + unsigned long action, + void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + struct channel *chan = container_of(nb, struct channel, + hp_iter_notifier); + struct lib_ring_buffer *buf = per_cpu_ptr(chan->backend.buf, cpu); + const struct lib_ring_buffer_config *config = chan->backend.config; + + if (!chan->hp_iter_enable) + return NOTIFY_DONE; + + CHAN_WARN_ON(chan, config->alloc == RING_BUFFER_ALLOC_GLOBAL); + + switch (action) { + case CPU_DOWN_FAILED: + case CPU_DOWN_FAILED_FROZEN: + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + lib_ring_buffer_iterator_init(chan, buf); + return NOTIFY_OK; + default: + return NOTIFY_DONE; + } +} +#endif + +int channel_iterator_init(struct channel *chan) +{ + const struct lib_ring_buffer_config *config = chan->backend.config; + struct lib_ring_buffer *buf; + + if (config->alloc == RING_BUFFER_ALLOC_PER_CPU) { + int cpu, ret; + + INIT_LIST_HEAD(&chan->iter.empty_head); + ret = lttng_heap_init(&chan->iter.heap, + num_possible_cpus(), + GFP_KERNEL, buf_is_higher); + if (ret) + return ret; + /* + * In case of non-hotplug cpu, if the ring-buffer is allocated + * in early initcall, it will not be notified of secondary cpus. + * In that off case, we need to allocate for all possible cpus. + */ +#ifdef CONFIG_HOTPLUG_CPU + chan->hp_iter_notifier.notifier_call = + channel_iterator_cpu_hotplug; + chan->hp_iter_notifier.priority = 10; + register_cpu_notifier(&chan->hp_iter_notifier); + get_online_cpus(); + for_each_online_cpu(cpu) { + buf = per_cpu_ptr(chan->backend.buf, cpu); + lib_ring_buffer_iterator_init(chan, buf); + } + chan->hp_iter_enable = 1; + put_online_cpus(); +#else + for_each_possible_cpu(cpu) { + buf = per_cpu_ptr(chan->backend.buf, cpu); + lib_ring_buffer_iterator_init(chan, buf); + } +#endif + } else { + buf = channel_get_ring_buffer(config, chan, 0); + lib_ring_buffer_iterator_init(chan, buf); + } + return 0; +} + +void channel_iterator_unregister_notifiers(struct channel *chan) +{ + const struct lib_ring_buffer_config *config = chan->backend.config; + + if (config->alloc == RING_BUFFER_ALLOC_PER_CPU) { + chan->hp_iter_enable = 0; + unregister_cpu_notifier(&chan->hp_iter_notifier); + } +} + +void channel_iterator_free(struct channel *chan) +{ + const struct lib_ring_buffer_config *config = chan->backend.config; + + if (config->alloc == RING_BUFFER_ALLOC_PER_CPU) + lttng_heap_free(&chan->iter.heap); +} + +int lib_ring_buffer_iterator_open(struct lib_ring_buffer *buf) +{ + struct channel *chan = buf->backend.chan; + const struct lib_ring_buffer_config *config = chan->backend.config; + CHAN_WARN_ON(chan, config->output != RING_BUFFER_ITERATOR); + return lib_ring_buffer_open_read(buf); +} +EXPORT_SYMBOL_GPL(lib_ring_buffer_iterator_open); + +/* + * Note: Iterators must not be mixed with other types of outputs, because an + * iterator can leave the buffer in "GET" state, which is not consistent with + * other types of output (mmap, splice, raw data read). + */ +void lib_ring_buffer_iterator_release(struct lib_ring_buffer *buf) +{ + lib_ring_buffer_release_read(buf); +} +EXPORT_SYMBOL_GPL(lib_ring_buffer_iterator_release); + +int channel_iterator_open(struct channel *chan) +{ + const struct lib_ring_buffer_config *config = chan->backend.config; + struct lib_ring_buffer *buf; + int ret = 0, cpu; + + CHAN_WARN_ON(chan, config->output != RING_BUFFER_ITERATOR); + + if (config->alloc == RING_BUFFER_ALLOC_PER_CPU) { + get_online_cpus(); + /* Allow CPU hotplug to keep track of opened reader */ + chan->iter.read_open = 1; + for_each_channel_cpu(cpu, chan) { + buf = channel_get_ring_buffer(config, chan, cpu); + ret = lib_ring_buffer_iterator_open(buf); + if (ret) + goto error; + buf->iter.read_open = 1; + } + put_online_cpus(); + } else { + buf = channel_get_ring_buffer(config, chan, 0); + ret = lib_ring_buffer_iterator_open(buf); + } + return ret; +error: + /* Error should always happen on CPU 0, hence no close is required. */ + CHAN_WARN_ON(chan, cpu != 0); + put_online_cpus(); + return ret; +} +EXPORT_SYMBOL_GPL(channel_iterator_open); + +void channel_iterator_release(struct channel *chan) +{ + const struct lib_ring_buffer_config *config = chan->backend.config; + struct lib_ring_buffer *buf; + int cpu; + + if (config->alloc == RING_BUFFER_ALLOC_PER_CPU) { + get_online_cpus(); + for_each_channel_cpu(cpu, chan) { + buf = channel_get_ring_buffer(config, chan, cpu); + if (buf->iter.read_open) { + lib_ring_buffer_iterator_release(buf); + buf->iter.read_open = 0; + } + } + chan->iter.read_open = 0; + put_online_cpus(); + } else { + buf = channel_get_ring_buffer(config, chan, 0); + lib_ring_buffer_iterator_release(buf); + } +} +EXPORT_SYMBOL_GPL(channel_iterator_release); + +void lib_ring_buffer_iterator_reset(struct lib_ring_buffer *buf) +{ + struct channel *chan = buf->backend.chan; + + if (buf->iter.state != ITER_GET_SUBBUF) + lib_ring_buffer_put_next_subbuf(buf); + buf->iter.state = ITER_GET_SUBBUF; + /* Remove from heap (if present). */ + if (lttng_heap_cherrypick(&chan->iter.heap, buf)) + list_add(&buf->iter.empty_node, &chan->iter.empty_head); + buf->iter.timestamp = 0; + buf->iter.header_len = 0; + buf->iter.payload_len = 0; + buf->iter.consumed = 0; + buf->iter.read_offset = 0; + buf->iter.data_size = 0; + /* Don't reset allocated and read_open */ +} + +void channel_iterator_reset(struct channel *chan) +{ + const struct lib_ring_buffer_config *config = chan->backend.config; + struct lib_ring_buffer *buf; + int cpu; + + /* Empty heap, put into empty_head */ + while ((buf = lttng_heap_remove(&chan->iter.heap)) != NULL) + list_add(&buf->iter.empty_node, &chan->iter.empty_head); + + for_each_channel_cpu(cpu, chan) { + buf = channel_get_ring_buffer(config, chan, cpu); + lib_ring_buffer_iterator_reset(buf); + } + /* Don't reset read_open */ + chan->iter.last_qs = 0; + chan->iter.last_timestamp = 0; + chan->iter.last_cpu = 0; + chan->iter.len_left = 0; +} + +/* + * Ring buffer payload extraction read() implementation. + */ +static +ssize_t channel_ring_buffer_file_read(struct file *filp, + char __user *user_buf, + size_t count, + loff_t *ppos, + struct channel *chan, + struct lib_ring_buffer *buf, + int fusionmerge) +{ + const struct lib_ring_buffer_config *config = chan->backend.config; + size_t read_count = 0, read_offset; + ssize_t len; + + might_sleep(); + if (!access_ok(VERIFY_WRITE, user_buf, count)) + return -EFAULT; + + /* Finish copy of previous record */ + if (*ppos != 0) { + if (read_count < count) { + len = chan->iter.len_left; + read_offset = *ppos; + if (config->alloc == RING_BUFFER_ALLOC_PER_CPU + && fusionmerge) + buf = lttng_heap_maximum(&chan->iter.heap); + CHAN_WARN_ON(chan, !buf); + goto skip_get_next; + } + } + + while (read_count < count) { + size_t copy_len, space_left; + + if (fusionmerge) + len = channel_get_next_record(chan, &buf); + else + len = lib_ring_buffer_get_next_record(chan, buf); +len_test: + if (len < 0) { + /* + * Check if buffer is finalized (end of file). + */ + if (len == -ENODATA) { + /* A 0 read_count will tell about end of file */ + goto nodata; + } + if (filp->f_flags & O_NONBLOCK) { + if (!read_count) + read_count = -EAGAIN; + goto nodata; + } else { + int error; + + /* + * No data available at the moment, return what + * we got. + */ + if (read_count) + goto nodata; + + /* + * Wait for returned len to be >= 0 or -ENODATA. + */ + if (fusionmerge) + error = wait_event_interruptible( + chan->read_wait, + ((len = channel_get_next_record(chan, + &buf)), len != -EAGAIN)); + else + error = wait_event_interruptible( + buf->read_wait, + ((len = lib_ring_buffer_get_next_record( + chan, buf)), len != -EAGAIN)); + CHAN_WARN_ON(chan, len == -EBUSY); + if (error) { + read_count = error; + goto nodata; + } + CHAN_WARN_ON(chan, len < 0 && len != -ENODATA); + goto len_test; + } + } + read_offset = buf->iter.read_offset; +skip_get_next: + space_left = count - read_count; + if (len <= space_left) { + copy_len = len; + chan->iter.len_left = 0; + *ppos = 0; + } else { + copy_len = space_left; + chan->iter.len_left = len - copy_len; + *ppos = read_offset + copy_len; + } + if (__lib_ring_buffer_copy_to_user(&buf->backend, read_offset, + &user_buf[read_count], + copy_len)) { + /* + * Leave the len_left and ppos values at their current + * state, as we currently have a valid event to read. + */ + return -EFAULT; + } + read_count += copy_len; + }; + return read_count; + +nodata: + *ppos = 0; + chan->iter.len_left = 0; + return read_count; +} + +/** + * lib_ring_buffer_file_read - Read buffer record payload. + * @filp: file structure pointer. + * @buffer: user buffer to read data into. + * @count: number of bytes to read. + * @ppos: file read position. + * + * Returns a negative value on error, or the number of bytes read on success. + * ppos is used to save the position _within the current record_ between calls + * to read(). + */ +static +ssize_t lib_ring_buffer_file_read(struct file *filp, + char __user *user_buf, + size_t count, + loff_t *ppos) +{ + struct inode *inode = filp->f_dentry->d_inode; + struct lib_ring_buffer *buf = inode->i_private; + struct channel *chan = buf->backend.chan; + + return channel_ring_buffer_file_read(filp, user_buf, count, ppos, + chan, buf, 0); +} + +/** + * channel_file_read - Read channel record payload. + * @filp: file structure pointer. + * @buffer: user buffer to read data into. + * @count: number of bytes to read. + * @ppos: file read position. + * + * Returns a negative value on error, or the number of bytes read on success. + * ppos is used to save the position _within the current record_ between calls + * to read(). + */ +static +ssize_t channel_file_read(struct file *filp, + char __user *user_buf, + size_t count, + loff_t *ppos) +{ + struct inode *inode = filp->f_dentry->d_inode; + struct channel *chan = inode->i_private; + const struct lib_ring_buffer_config *config = chan->backend.config; + + if (config->alloc == RING_BUFFER_ALLOC_PER_CPU) + return channel_ring_buffer_file_read(filp, user_buf, count, + ppos, chan, NULL, 1); + else { + struct lib_ring_buffer *buf = + channel_get_ring_buffer(config, chan, 0); + return channel_ring_buffer_file_read(filp, user_buf, count, + ppos, chan, buf, 0); + } +} + +static +int lib_ring_buffer_file_open(struct inode *inode, struct file *file) +{ + struct lib_ring_buffer *buf = inode->i_private; + int ret; + + ret = lib_ring_buffer_iterator_open(buf); + if (ret) + return ret; + + file->private_data = buf; + ret = nonseekable_open(inode, file); + if (ret) + goto release_iter; + return 0; + +release_iter: + lib_ring_buffer_iterator_release(buf); + return ret; +} + +static +int lib_ring_buffer_file_release(struct inode *inode, struct file *file) +{ + struct lib_ring_buffer *buf = inode->i_private; + + lib_ring_buffer_iterator_release(buf); + return 0; +} + +static +int channel_file_open(struct inode *inode, struct file *file) +{ + struct channel *chan = inode->i_private; + int ret; + + ret = channel_iterator_open(chan); + if (ret) + return ret; + + file->private_data = chan; + ret = nonseekable_open(inode, file); + if (ret) + goto release_iter; + return 0; + +release_iter: + channel_iterator_release(chan); + return ret; +} + +static +int channel_file_release(struct inode *inode, struct file *file) +{ + struct channel *chan = inode->i_private; + + channel_iterator_release(chan); + return 0; +} + +const struct file_operations channel_payload_file_operations = { + .open = channel_file_open, + .release = channel_file_release, + .read = channel_file_read, + .llseek = lib_ring_buffer_no_llseek, +}; +EXPORT_SYMBOL_GPL(channel_payload_file_operations); + +const struct file_operations lib_ring_buffer_payload_file_operations = { + .open = lib_ring_buffer_file_open, + .release = lib_ring_buffer_file_release, + .read = lib_ring_buffer_file_read, + .llseek = lib_ring_buffer_no_llseek, +}; +EXPORT_SYMBOL_GPL(lib_ring_buffer_payload_file_operations); diff --git a/libringbuffer/ring_buffer_vfs.c b/libringbuffer/ring_buffer_vfs.c new file mode 100644 index 0000000..9786d35 --- /dev/null +++ b/libringbuffer/ring_buffer_vfs.c @@ -0,0 +1,380 @@ +/* + * ring_buffer_vfs.c + * + * Copyright (C) 2009-2010 - Mathieu Desnoyers + * + * Ring Buffer VFS file operations. + * + * Dual LGPL v2.1/GPL v2 license. + */ + +#include +#include +#include + +#include "../../wrapper/ringbuffer/backend.h" +#include "../../wrapper/ringbuffer/frontend.h" +#include "../../wrapper/ringbuffer/vfs.h" +#include "../../wrapper/poll.h" + +static int put_ulong(unsigned long val, unsigned long arg) +{ + return put_user(val, (unsigned long __user *)arg); +} + +#ifdef CONFIG_COMPAT +static int compat_put_ulong(compat_ulong_t val, unsigned long arg) +{ + return put_user(val, (compat_ulong_t __user *)compat_ptr(arg)); +} +#endif + +/** + * lib_ring_buffer_open - ring buffer open file operation + * @inode: opened inode + * @file: opened file + * + * Open implementation. Makes sure only one open instance of a buffer is + * done at a given moment. + */ +int lib_ring_buffer_open(struct inode *inode, struct file *file) +{ + struct lib_ring_buffer *buf = inode->i_private; + int ret; + + ret = lib_ring_buffer_open_read(buf); + if (ret) + return ret; + + file->private_data = buf; + ret = nonseekable_open(inode, file); + if (ret) + goto release_read; + return 0; + +release_read: + lib_ring_buffer_release_read(buf); + return ret; +} + +/** + * lib_ring_buffer_release - ring buffer release file operation + * @inode: opened inode + * @file: opened file + * + * Release implementation. + */ +int lib_ring_buffer_release(struct inode *inode, struct file *file) +{ + struct lib_ring_buffer *buf = file->private_data; + + lib_ring_buffer_release_read(buf); + + return 0; +} + +/** + * lib_ring_buffer_poll - ring buffer poll file operation + * @filp: the file + * @wait: poll table + * + * Poll implementation. + */ +unsigned int lib_ring_buffer_poll(struct file *filp, poll_table *wait) +{ + unsigned int mask = 0; + struct lib_ring_buffer *buf = filp->private_data; + struct channel *chan = buf->backend.chan; + const struct lib_ring_buffer_config *config = chan->backend.config; + int finalized, disabled; + + if (filp->f_mode & FMODE_READ) { + poll_wait_set_exclusive(wait); + poll_wait(filp, &buf->read_wait, wait); + + finalized = lib_ring_buffer_is_finalized(config, buf); + disabled = lib_ring_buffer_channel_is_disabled(chan); + + /* + * lib_ring_buffer_is_finalized() contains a smp_rmb() ordering + * finalized load before offsets loads. + */ + WARN_ON(atomic_long_read(&buf->active_readers) != 1); +retry: + if (disabled) + return POLLERR; + + if (subbuf_trunc(lib_ring_buffer_get_offset(config, buf), chan) + - subbuf_trunc(lib_ring_buffer_get_consumed(config, buf), chan) + == 0) { + if (finalized) + return POLLHUP; + else { + /* + * The memory barriers + * __wait_event()/wake_up_interruptible() take + * care of "raw_spin_is_locked" memory ordering. + */ + if (raw_spin_is_locked(&buf->raw_tick_nohz_spinlock)) + goto retry; + else + return 0; + } + } else { + if (subbuf_trunc(lib_ring_buffer_get_offset(config, buf), + chan) + - subbuf_trunc(lib_ring_buffer_get_consumed(config, buf), + chan) + >= chan->backend.buf_size) + return POLLPRI | POLLRDBAND; + else + return POLLIN | POLLRDNORM; + } + } + return mask; +} + +/** + * lib_ring_buffer_ioctl - control ring buffer reader synchronization + * + * @filp: the file + * @cmd: the command + * @arg: command arg + * + * This ioctl implements commands necessary for producer/consumer + * and flight recorder reader interaction : + * RING_BUFFER_GET_NEXT_SUBBUF + * Get the next sub-buffer that can be read. It never blocks. + * RING_BUFFER_PUT_NEXT_SUBBUF + * Release the currently read sub-buffer. + * RING_BUFFER_GET_SUBBUF_SIZE + * returns the size of the current sub-buffer. + * RING_BUFFER_GET_MAX_SUBBUF_SIZE + * returns the maximum size for sub-buffers. + * RING_BUFFER_GET_NUM_SUBBUF + * returns the number of reader-visible sub-buffers in the per cpu + * channel (for mmap). + * RING_BUFFER_GET_MMAP_READ_OFFSET + * returns the offset of the subbuffer belonging to the reader. + * Should only be used for mmap clients. + */ +long lib_ring_buffer_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct lib_ring_buffer *buf = filp->private_data; + struct channel *chan = buf->backend.chan; + const struct lib_ring_buffer_config *config = chan->backend.config; + + if (lib_ring_buffer_channel_is_disabled(chan)) + return -EIO; + + switch (cmd) { + case RING_BUFFER_SNAPSHOT: + return lib_ring_buffer_snapshot(buf, &buf->cons_snapshot, + &buf->prod_snapshot); + case RING_BUFFER_SNAPSHOT_GET_CONSUMED: + return put_ulong(buf->cons_snapshot, arg); + case RING_BUFFER_SNAPSHOT_GET_PRODUCED: + return put_ulong(buf->prod_snapshot, arg); + case RING_BUFFER_GET_SUBBUF: + { + unsigned long uconsume; + long ret; + + ret = get_user(uconsume, (unsigned long __user *) arg); + if (ret) + return ret; /* will return -EFAULT */ + ret = lib_ring_buffer_get_subbuf(buf, uconsume); + if (!ret) { + /* Set file position to zero at each successful "get" */ + filp->f_pos = 0; + } + return ret; + } + case RING_BUFFER_PUT_SUBBUF: + lib_ring_buffer_put_subbuf(buf); + return 0; + + case RING_BUFFER_GET_NEXT_SUBBUF: + { + long ret; + + ret = lib_ring_buffer_get_next_subbuf(buf); + if (!ret) { + /* Set file position to zero at each successful "get" */ + filp->f_pos = 0; + } + return ret; + } + case RING_BUFFER_PUT_NEXT_SUBBUF: + lib_ring_buffer_put_next_subbuf(buf); + return 0; + case RING_BUFFER_GET_SUBBUF_SIZE: + return put_ulong(lib_ring_buffer_get_read_data_size(config, buf), + arg); + case RING_BUFFER_GET_PADDED_SUBBUF_SIZE: + { + unsigned long size; + + size = lib_ring_buffer_get_read_data_size(config, buf); + size = PAGE_ALIGN(size); + return put_ulong(size, arg); + } + case RING_BUFFER_GET_MAX_SUBBUF_SIZE: + return put_ulong(chan->backend.subbuf_size, arg); + case RING_BUFFER_GET_MMAP_LEN: + { + unsigned long mmap_buf_len; + + if (config->output != RING_BUFFER_MMAP) + return -EINVAL; + mmap_buf_len = chan->backend.buf_size; + if (chan->backend.extra_reader_sb) + mmap_buf_len += chan->backend.subbuf_size; + if (mmap_buf_len > INT_MAX) + return -EFBIG; + return put_ulong(mmap_buf_len, arg); + } + case RING_BUFFER_GET_MMAP_READ_OFFSET: + { + unsigned long sb_bindex; + + if (config->output != RING_BUFFER_MMAP) + return -EINVAL; + sb_bindex = subbuffer_id_get_index(config, + buf->backend.buf_rsb.id); + return put_ulong(buf->backend.array[sb_bindex]->mmap_offset, + arg); + } + default: + return -ENOIOCTLCMD; + } +} + +#ifdef CONFIG_COMPAT +long lib_ring_buffer_compat_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) +{ + struct lib_ring_buffer *buf = filp->private_data; + struct channel *chan = buf->backend.chan; + const struct lib_ring_buffer_config *config = chan->backend.config; + + if (lib_ring_buffer_channel_is_disabled(chan)) + return -EIO; + + switch (cmd) { + case RING_BUFFER_SNAPSHOT: + return lib_ring_buffer_snapshot(buf, &buf->cons_snapshot, + &buf->prod_snapshot); + case RING_BUFFER_SNAPSHOT_GET_CONSUMED: + return compat_put_ulong(buf->cons_snapshot, arg); + case RING_BUFFER_SNAPSHOT_GET_PRODUCED: + return compat_put_ulong(buf->prod_snapshot, arg); + case RING_BUFFER_GET_SUBBUF: + { + __u32 uconsume; + unsigned long consume; + long ret; + + ret = get_user(uconsume, (__u32 __user *) arg); + if (ret) + return ret; /* will return -EFAULT */ + consume = buf->cons_snapshot; + consume &= ~0xFFFFFFFFL; + consume |= uconsume; + ret = lib_ring_buffer_get_subbuf(buf, consume); + if (!ret) { + /* Set file position to zero at each successful "get" */ + filp->f_pos = 0; + } + return ret; + } + case RING_BUFFER_PUT_SUBBUF: + lib_ring_buffer_put_subbuf(buf); + return 0; + + case RING_BUFFER_GET_NEXT_SUBBUF: + { + long ret; + + ret = lib_ring_buffer_get_next_subbuf(buf); + if (!ret) { + /* Set file position to zero at each successful "get" */ + filp->f_pos = 0; + } + return ret; + } + case RING_BUFFER_PUT_NEXT_SUBBUF: + lib_ring_buffer_put_next_subbuf(buf); + return 0; + case RING_BUFFER_GET_SUBBUF_SIZE: + { + unsigned long data_size; + + data_size = lib_ring_buffer_get_read_data_size(config, buf); + if (data_size > UINT_MAX) + return -EFBIG; + return put_ulong(data_size, arg); + } + case RING_BUFFER_GET_PADDED_SUBBUF_SIZE: + { + unsigned long size; + + size = lib_ring_buffer_get_read_data_size(config, buf); + size = PAGE_ALIGN(size); + if (size > UINT_MAX) + return -EFBIG; + return put_ulong(size, arg); + } + case RING_BUFFER_GET_MAX_SUBBUF_SIZE: + if (chan->backend.subbuf_size > UINT_MAX) + return -EFBIG; + return put_ulong(chan->backend.subbuf_size, arg); + case RING_BUFFER_GET_MMAP_LEN: + { + unsigned long mmap_buf_len; + + if (config->output != RING_BUFFER_MMAP) + return -EINVAL; + mmap_buf_len = chan->backend.buf_size; + if (chan->backend.extra_reader_sb) + mmap_buf_len += chan->backend.subbuf_size; + if (mmap_buf_len > UINT_MAX) + return -EFBIG; + return put_ulong(mmap_buf_len, arg); + } + case RING_BUFFER_GET_MMAP_READ_OFFSET: + { + unsigned long sb_bindex, read_offset; + + if (config->output != RING_BUFFER_MMAP) + return -EINVAL; + sb_bindex = subbuffer_id_get_index(config, + buf->backend.buf_rsb.id); + read_offset = buf->backend.array[sb_bindex]->mmap_offset; + if (read_offset > UINT_MAX) + return -EINVAL; + return put_ulong(read_offset, arg); + } + default: + return -ENOIOCTLCMD; + } +} +#endif + +const struct file_operations lib_ring_buffer_file_operations = { + .open = lib_ring_buffer_open, + .release = lib_ring_buffer_release, + .poll = lib_ring_buffer_poll, + .splice_read = lib_ring_buffer_splice_read, + .mmap = lib_ring_buffer_mmap, + .unlocked_ioctl = lib_ring_buffer_ioctl, + .llseek = lib_ring_buffer_no_llseek, +#ifdef CONFIG_COMPAT + .compat_ioctl = lib_ring_buffer_compat_ioctl, +#endif +}; +EXPORT_SYMBOL_GPL(lib_ring_buffer_file_operations); + +MODULE_LICENSE("GPL and additional rights"); +MODULE_AUTHOR("Mathieu Desnoyers"); +MODULE_DESCRIPTION("Ring Buffer Library VFS"); diff --git a/libringbuffer/vatomic.h b/libringbuffer/vatomic.h new file mode 100644 index 0000000..b944dd6 --- /dev/null +++ b/libringbuffer/vatomic.h @@ -0,0 +1,85 @@ +#ifndef _LINUX_RING_BUFFER_VATOMIC_H +#define _LINUX_RING_BUFFER_VATOMIC_H + +/* + * linux/ringbuffer/vatomic.h + * + * Copyright (C) 2010 - Mathieu Desnoyers + * + * Dual LGPL v2.1/GPL v2 license. + */ + +#include +#include + +/* + * Same data type (long) accessed differently depending on configuration. + * v field is for non-atomic access (protected by mutual exclusion). + * In the fast-path, the ring_buffer_config structure is constant, so the + * compiler can statically select the appropriate branch. + * local_t is used for per-cpu and per-thread buffers. + * atomic_long_t is used for globally shared buffers. + */ +union v_atomic { + local_t l; + atomic_long_t a; + long v; +}; + +static inline +long v_read(const struct lib_ring_buffer_config *config, union v_atomic *v_a) +{ + if (config->sync == RING_BUFFER_SYNC_PER_CPU) + return local_read(&v_a->l); + else + return atomic_long_read(&v_a->a); +} + +static inline +void v_set(const struct lib_ring_buffer_config *config, union v_atomic *v_a, + long v) +{ + if (config->sync == RING_BUFFER_SYNC_PER_CPU) + local_set(&v_a->l, v); + else + atomic_long_set(&v_a->a, v); +} + +static inline +void v_add(const struct lib_ring_buffer_config *config, long v, union v_atomic *v_a) +{ + if (config->sync == RING_BUFFER_SYNC_PER_CPU) + local_add(v, &v_a->l); + else + atomic_long_add(v, &v_a->a); +} + +static inline +void v_inc(const struct lib_ring_buffer_config *config, union v_atomic *v_a) +{ + if (config->sync == RING_BUFFER_SYNC_PER_CPU) + local_inc(&v_a->l); + else + atomic_long_inc(&v_a->a); +} + +/* + * Non-atomic decrement. Only used by reader, apply to reader-owned subbuffer. + */ +static inline +void _v_dec(const struct lib_ring_buffer_config *config, union v_atomic *v_a) +{ + --v_a->v; +} + +static inline +long v_cmpxchg(const struct lib_ring_buffer_config *config, union v_atomic *v_a, + long old, long _new) +{ + if (config->sync == RING_BUFFER_SYNC_PER_CPU) + return local_cmpxchg(&v_a->l, old, _new); + else + return atomic_long_cmpxchg(&v_a->a, old, _new); +} + +#endif /* _LINUX_RING_BUFFER_VATOMIC_H */ diff --git a/libringbuffer/vfs.h b/libringbuffer/vfs.h new file mode 100644 index 0000000..0a09637 --- /dev/null +++ b/libringbuffer/vfs.h @@ -0,0 +1,87 @@ +#ifndef _LINUX_RING_BUFFER_VFS_H +#define _LINUX_RING_BUFFER_VFS_H + +/* + * linux/ringbuffer/vfs.h + * + * (C) Copyright 2005-2010 - Mathieu Desnoyers + * + * Wait-free ring buffer VFS file operations. + * + * Author: + * Mathieu Desnoyers + * + * Dual LGPL v2.1/GPL v2 license. + */ + +#include +#include + +/* VFS API */ + +extern const struct file_operations lib_ring_buffer_file_operations; + +/* + * Internal file operations. + */ + +int lib_ring_buffer_open(struct inode *inode, struct file *file); +int lib_ring_buffer_release(struct inode *inode, struct file *file); +unsigned int lib_ring_buffer_poll(struct file *filp, poll_table *wait); +ssize_t lib_ring_buffer_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags); +int lib_ring_buffer_mmap(struct file *filp, struct vm_area_struct *vma); + +/* Ring Buffer ioctl() and ioctl numbers */ +long lib_ring_buffer_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); +#ifdef CONFIG_COMPAT +long lib_ring_buffer_compat_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg); +#endif + +/* + * Use RING_BUFFER_GET_NEXT_SUBBUF / RING_BUFFER_PUT_NEXT_SUBBUF to read and + * consume sub-buffers sequentially. + * + * Reading sub-buffers without consuming them can be performed with: + * + * RING_BUFFER_SNAPSHOT + * RING_BUFFER_SNAPSHOT_GET_CONSUMED + * RING_BUFFER_SNAPSHOT_GET_PRODUCED + * + * to get the offset range to consume, and then by passing each sub-buffer + * offset to RING_BUFFER_GET_SUBBUF, read the sub-buffer, and then release it + * with RING_BUFFER_PUT_SUBBUF. + * + * Note that the "snapshot" API can be used to read the sub-buffer in reverse + * order, which is useful for flight recorder snapshots. + */ + +/* Get a snapshot of the current ring buffer producer and consumer positions */ +#define RING_BUFFER_SNAPSHOT _IO(0xF6, 0x00) +/* Get the consumer position (iteration start) */ +#define RING_BUFFER_SNAPSHOT_GET_CONSUMED _IOR(0xF6, 0x01, unsigned long) +/* Get the producer position (iteration end) */ +#define RING_BUFFER_SNAPSHOT_GET_PRODUCED _IOR(0xF6, 0x02, unsigned long) +/* Get exclusive read access to the specified sub-buffer position */ +#define RING_BUFFER_GET_SUBBUF _IOW(0xF6, 0x03, unsigned long) +/* Release exclusive sub-buffer access */ +#define RING_BUFFER_PUT_SUBBUF _IO(0xF6, 0x04) + +/* Get exclusive read access to the next sub-buffer that can be read. */ +#define RING_BUFFER_GET_NEXT_SUBBUF _IO(0xF6, 0x05) +/* Release exclusive sub-buffer access, move consumer forward. */ +#define RING_BUFFER_PUT_NEXT_SUBBUF _IO(0xF6, 0x06) +/* returns the size of the current sub-buffer, without padding (for mmap). */ +#define RING_BUFFER_GET_SUBBUF_SIZE _IOR(0xF6, 0x07, unsigned long) +/* returns the size of the current sub-buffer, with padding (for splice). */ +#define RING_BUFFER_GET_PADDED_SUBBUF_SIZE _IOR(0xF6, 0x08, unsigned long) +/* returns the maximum size for sub-buffers. */ +#define RING_BUFFER_GET_MAX_SUBBUF_SIZE _IOR(0xF6, 0x09, unsigned long) +/* returns the length to mmap. */ +#define RING_BUFFER_GET_MMAP_LEN _IOR(0xF6, 0x0A, unsigned long) +/* returns the offset of the subbuffer belonging to the mmap reader. */ +#define RING_BUFFER_GET_MMAP_READ_OFFSET _IOR(0xF6, 0x0B, unsigned long) + +#endif /* _LINUX_RING_BUFFER_VFS_H */