This patch is based on the kvmalloc helpers introduced in kernel 4.12.
It will gracefully fail over memory allocations of more than one page to
vmalloc for systems under high memory pressure or fragmentation.
See Linux kernel commit:
commit a7c3e901a46ff54c016d040847eda598a9e3e653
Author: Michal Hocko <mhocko@suse.com>
Date: Mon May 8 15:57:09 2017 -0700
mm: introduce kv[mz]alloc helpers
Patch series "kvmalloc", v5.
There are many open coded kmalloc with vmalloc fallback instances in the
tree. Most of them are not careful enough or simply do not care about
the underlying semantic of the kmalloc/page allocator which means that
a) some vmalloc fallbacks are basically unreachable because the kmalloc
part will keep retrying until it succeeds b) the page allocator can
invoke really disruptive steps like the OOM killer to move forward
which doesn't sound appropriate when we consider that the vmalloc
fallback is available.
As it can be seen implementing kvmalloc requires quite an intimate
knowledge of the page allocator and the memory reclaim internals which
strongly suggests that a helper should be implemented in the memory
subsystem proper.
Most callers, I could find, have been converted to use the helper
instead. This is patch 6. There are some more relying on __GFP_REPEAT
in the networking stack which I have converted as well and Eric Dumazet
was not opposed [2] to convert them as well.
[1] http://lkml.kernel.org/r/20170130094940.13546-1-mhocko@kernel.org
[2] http://lkml.kernel.org/r/1485273626.16328.301.camel@edumazet-glaptop3.roam.corp.google.com
This patch (of 9):
Using kmalloc with the vmalloc fallback for larger allocations is a
common pattern in the kernel code. Yet we do not have any common helper
for that and so users have invented their own helpers. Some of them are
really creative when doing so. Let's just add kv[mz]alloc and make sure
it is implemented properly. This implementation makes sure to not make
a large memory pressure for > PAGE_SIZE requests (__GFP_NORETRY) and also
to not warn about allocation failures. This also rules out the OOM
killer as the vmalloc is a more appropriate fallback than a disruptive
user visible action.
Signed-off-by: Michael Jeanson <mjeanson@efficios.com>
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
#include <linux/slab.h>
#include <lib/prio_heap/lttng_prio_heap.h>
+#include <wrapper/vmalloc.h>
#ifdef DEBUG_HEAP
void lttng_check_heap(const struct lttng_ptr_heap *heap)
return 0;
heap->alloc_len = max_t(size_t, new_len, heap->alloc_len << 1);
- new_ptrs = kmalloc(heap->alloc_len * sizeof(void *), heap->gfpmask);
+ new_ptrs = lttng_kvmalloc(heap->alloc_len * sizeof(void *), heap->gfpmask);
if (!new_ptrs)
return -ENOMEM;
if (heap->ptrs)
memcpy(new_ptrs, heap->ptrs, heap->len * sizeof(void *));
- kfree(heap->ptrs);
+ lttng_kvfree(heap->ptrs);
heap->ptrs = new_ptrs;
return 0;
}
void lttng_heap_free(struct lttng_ptr_heap *heap)
{
- kfree(heap->ptrs);
+ lttng_kvfree(heap->ptrs);
}
static void heapify(struct lttng_ptr_heap *heap, size_t i)
if (unlikely(!pages))
goto pages_error;
- bufb->array = kmalloc_node(ALIGN(sizeof(*bufb->array)
+ bufb->array = lttng_kvmalloc_node(ALIGN(sizeof(*bufb->array)
* num_subbuf_alloc,
1 << INTERNODE_CACHE_SHIFT),
GFP_KERNEL | __GFP_NOWARN,
/* Allocate backend pages array elements */
for (i = 0; i < num_subbuf_alloc; i++) {
bufb->array[i] =
- kzalloc_node(ALIGN(
+ lttng_kvzalloc_node(ALIGN(
sizeof(struct lib_ring_buffer_backend_pages) +
sizeof(struct lib_ring_buffer_backend_page)
* num_pages_per_subbuf,
}
/* Allocate write-side subbuffer table */
- bufb->buf_wsb = kzalloc_node(ALIGN(
+ bufb->buf_wsb = lttng_kvzalloc_node(ALIGN(
sizeof(struct lib_ring_buffer_backend_subbuffer)
* num_subbuf,
1 << INTERNODE_CACHE_SHIFT),
bufb->buf_rsb.id = subbuffer_id(config, 0, 1, 0);
/* Allocate subbuffer packet counter table */
- bufb->buf_cnt = kzalloc_node(ALIGN(
+ bufb->buf_cnt = lttng_kvzalloc_node(ALIGN(
sizeof(struct lib_ring_buffer_backend_counts)
* num_subbuf,
1 << INTERNODE_CACHE_SHIFT),
return 0;
free_wsb:
- kfree(bufb->buf_wsb);
+ lttng_kvfree(bufb->buf_wsb);
free_array:
for (i = 0; (i < num_subbuf_alloc && bufb->array[i]); i++)
- kfree(bufb->array[i]);
+ lttng_kvfree(bufb->array[i]);
depopulate:
/* Free all allocated pages */
for (i = 0; (i < num_pages && pages[i]); i++)
__free_page(pages[i]);
- kfree(bufb->array);
+ lttng_kvfree(bufb->array);
array_error:
vfree(pages);
pages_error:
if (chanb->extra_reader_sb)
num_subbuf_alloc++;
- kfree(bufb->buf_wsb);
- kfree(bufb->buf_cnt);
+ lttng_kvfree(bufb->buf_wsb);
+ lttng_kvfree(bufb->buf_cnt);
for (i = 0; i < num_subbuf_alloc; i++) {
for (j = 0; j < bufb->num_pages_per_subbuf; j++)
__free_page(pfn_to_page(bufb->array[i]->p[j].pfn));
- kfree(bufb->array[i]);
+ lttng_kvfree(bufb->array[i]);
}
- kfree(bufb->array);
+ lttng_kvfree(bufb->array);
bufb->allocated = 0;
}
#include <wrapper/kref.h>
#include <wrapper/percpu-defs.h>
#include <wrapper/timer.h>
+#include <wrapper/vmalloc.h>
/*
* Internal structure representing offsets to use at a sub-buffer switch.
struct channel *chan = buf->backend.chan;
lib_ring_buffer_print_errors(chan, buf, buf->backend.cpu);
- kfree(buf->commit_hot);
- kfree(buf->commit_cold);
+ lttng_kvfree(buf->commit_hot);
+ lttng_kvfree(buf->commit_cold);
lib_ring_buffer_backend_free(&buf->backend);
}
return ret;
buf->commit_hot =
- kzalloc_node(ALIGN(sizeof(*buf->commit_hot)
+ lttng_kvzalloc_node(ALIGN(sizeof(*buf->commit_hot)
* chan->backend.num_subbuf,
1 << INTERNODE_CACHE_SHIFT),
GFP_KERNEL | __GFP_NOWARN,
}
buf->commit_cold =
- kzalloc_node(ALIGN(sizeof(*buf->commit_cold)
+ lttng_kvzalloc_node(ALIGN(sizeof(*buf->commit_cold)
* chan->backend.num_subbuf,
1 << INTERNODE_CACHE_SHIFT),
GFP_KERNEL | __GFP_NOWARN,
/* Error handling */
free_init:
- kfree(buf->commit_cold);
+ lttng_kvfree(buf->commit_cold);
free_commit:
- kfree(buf->commit_hot);
+ lttng_kvfree(buf->commit_hot);
free_chanbuf:
lib_ring_buffer_backend_free(&buf->backend);
return ret;
#endif /* #else #if (LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0)) */
kfree(field->event_field.name);
kfree(field->u.perf_counter->attr);
- kfree(events);
+ lttng_kvfree(events);
kfree(field->u.perf_counter);
}
int ret;
char *name_alloc;
- events = kzalloc(num_possible_cpus() * sizeof(*events), GFP_KERNEL);
+ events = lttng_kvzalloc(num_possible_cpus() * sizeof(*events), GFP_KERNEL);
if (!events)
return -ENOMEM;
error_alloc_perf_field:
kfree(attr);
error_attr:
- kfree(events);
+ lttng_kvfree(events);
return ret;
}
struct lttng_ctx_field *new_fields;
ctx->allocated_fields = max_t(size_t, 1, 2 * ctx->allocated_fields);
- new_fields = kzalloc(ctx->allocated_fields * sizeof(struct lttng_ctx_field), GFP_KERNEL);
+ new_fields = lttng_kvzalloc(ctx->allocated_fields * sizeof(struct lttng_ctx_field), GFP_KERNEL);
if (!new_fields)
return NULL;
if (ctx->fields)
memcpy(new_fields, ctx->fields, sizeof(*ctx->fields) * ctx->nr_fields);
- kfree(ctx->fields);
+ lttng_kvfree(ctx->fields);
ctx->fields = new_fields;
}
field = &ctx->fields[ctx->nr_fields];
if (ctx->fields[i].destroy)
ctx->fields[i].destroy(&ctx->fields[i]);
}
- kfree(ctx->fields);
+ lttng_kvfree(ctx->fields);
kfree(ctx);
}
int i;
mutex_lock(&sessions_mutex);
- session = kzalloc(sizeof(struct lttng_session), GFP_KERNEL);
+ session = lttng_kvzalloc(sizeof(struct lttng_session), GFP_KERNEL);
if (!session)
goto err;
INIT_LIST_HEAD(&session->chan);
err_free_cache:
kfree(metadata_cache);
err_free_session:
- kfree(session);
+ lttng_kvfree(session);
err:
mutex_unlock(&sessions_mutex);
return NULL;
kref_put(&session->metadata_cache->refcount, metadata_cache_destroy);
list_del(&session->list);
mutex_unlock(&sessions_mutex);
- kfree(session);
+ lttng_kvfree(session);
}
int lttng_session_statedump(struct lttng_session *session)
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
+#include <linux/version.h>
+#include <linux/vmalloc.h>
+
#ifdef CONFIG_KALLSYMS
#include <linux/kallsyms.h>
}
#else
-#include <linux/vmalloc.h>
-
static inline
void wrapper_vmalloc_sync_all(void)
{
}
#endif
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4,12,0))
+/*
+ * lttng_kvmalloc_node - wrapper around the kernel's kvmalloc_node().
+ *
+ * Forwards @size, @flags and @node to kvmalloc_node(). When the returned
+ * address lies in the vmalloc area, wrapper_vmalloc_sync_all() is called
+ * before returning, so that the tracing fast path does not trigger
+ * recursive page faults.
+ *
+ * Returns whatever kvmalloc_node() returned.
+ */
+static inline
+void *lttng_kvmalloc_node(unsigned long size, gfp_t flags, int node)
+{
+	void *ptr = kvmalloc_node(size, flags, node);
+
+	/* Anything that is not a vmalloc address is handed back as is. */
+	if (!is_vmalloc_addr(ptr))
+		return ptr;
+
+	wrapper_vmalloc_sync_all();
+	return ptr;
+}
+
+/*
+ * Zeroing variant of lttng_kvmalloc_node(): same arguments, with
+ * __GFP_ZERO added to @flags.
+ */
+static inline
+void *lttng_kvzalloc_node(unsigned long size, gfp_t flags, int node)
+{
+	const gfp_t zeroing_flags = flags | __GFP_ZERO;
+
+	return lttng_kvmalloc_node(size, zeroing_flags, node);
+}
+
+/*
+ * Convenience form of lttng_kvmalloc_node() that passes NUMA_NO_NODE as
+ * the node argument.
+ */
+static inline
+void *lttng_kvmalloc(unsigned long size, gfp_t flags)
+{
+	const int node = NUMA_NO_NODE;
+
+	return lttng_kvmalloc_node(size, flags, node);
+}
+
+/*
+ * Zeroing allocation with NUMA_NO_NODE as the node argument: adds
+ * __GFP_ZERO to @flags and defers to lttng_kvmalloc_node().
+ */
+static inline
+void *lttng_kvzalloc(unsigned long size, gfp_t flags)
+{
+	return lttng_kvmalloc_node(size, flags | __GFP_ZERO, NUMA_NO_NODE);
+}
+
+/*
+ * Release memory obtained from the lttng_kv[mz]alloc*() helpers by
+ * handing @addr straight to the kernel's kvfree().
+ */
+static inline
+void lttng_kvfree(const void *addr)
+{
+	kvfree(addr);
+}
+
+#else
+
+#include <linux/slab.h>
+#include <linux/mm.h>
+
+/*
+ * kallsyms wrapper of __vmalloc_node with a fallback to kmalloc_node.
+ *
+ * __vmalloc_node is not exported, so when CONFIG_KALLSYMS is set it is
+ * looked up by name and called with all arguments forwarded unchanged.
+ * When CONFIG_KALLSYMS is not set, or the lookup yields no address, the
+ * request is served by kmalloc_node(@size, @gfp_mask, @node) instead;
+ * @align, @prot and @caller are unused on that path.
+ *
+ * Returns the address of the allocation, or NULL if it failed.
+ */
+static inline
+void *__lttng_vmalloc_node_fallback(unsigned long size, unsigned long align,
+		gfp_t gfp_mask, pgprot_t prot, int node, void *caller)
+{
+#ifdef CONFIG_KALLSYMS
+	/*
+	 * If we have KALLSYMS, get __vmalloc_node which is not exported.
+	 */
+	void *(*lttng__vmalloc_node)(unsigned long size, unsigned long align,
+			gfp_t gfp_mask, pgprot_t prot, int node, void *caller);
+
+	lttng__vmalloc_node = (void *) kallsyms_lookup_funcptr("__vmalloc_node");
+	/* Never call through a pointer the lookup could not resolve. */
+	if (lttng__vmalloc_node)
+		return lttng__vmalloc_node(size, align, gfp_mask, prot, node, caller);
+#endif
+	/*
+	 * No usable __vmalloc_node: fall back to kmalloc_node. The caller
+	 * ORs __GFP_HIGHMEM into the mask for the vmalloc path; that flag
+	 * is not valid for slab allocations, so strip it here.
+	 */
+	return kmalloc_node(size, gfp_mask & ~__GFP_HIGHMEM, node);
+}
+
+/**
+ * lttng_kvmalloc_node - attempt to allocate physically contiguous memory, but upon
+ * failure, fall back to non-contiguous (vmalloc) allocation.
+ * @size: size of the request.
+ * @flags: gfp mask for the allocation - must be compatible with GFP_KERNEL.
+ * @node: node passed to the allocators, or NUMA_NO_NODE.
+ *
+ * Uses kmalloc to get the memory but if the allocation fails then falls back
+ * to the vmalloc allocator. Use lttng_kvfree to free the memory.
+ *
+ * Reclaim modifiers - __GFP_NORETRY, __GFP_REPEAT and __GFP_NOFAIL are not supported
+ *
+ * Returns the address of the allocation, or NULL if every attempt failed.
+ */
+static inline
+void *lttng_kvmalloc_node(unsigned long size, gfp_t flags, int node)
+{
+	void *ret;
+
+	/*
+	 * vmalloc uses GFP_KERNEL for some internal allocations (e.g page tables)
+	 * so the given set of flags has to be compatible.
+	 */
+	WARN_ON_ONCE((flags & GFP_KERNEL) != GFP_KERNEL);
+
+	/*
+	 * If the allocation fits in a single page, do not fallback.
+	 */
+	if (size <= PAGE_SIZE)
+		return kmalloc_node(size, flags, node);
+
+	/*
+	 * Make sure that larger requests are not too disruptive - no OOM
+	 * killer and no allocation failure warnings as we have a fallback
+	 */
+	ret = kmalloc_node(size, flags | __GFP_NOWARN | __GFP_NORETRY, node);
+	if (ret)
+		return ret;
+
+	if (node == NUMA_NO_NODE) {
+		/*
+		 * If no node was specified, use __vmalloc which is
+		 * always exported.
+		 */
+		ret = __vmalloc(size, flags | __GFP_HIGHMEM, PAGE_KERNEL);
+	} else {
+		/*
+		 * Otherwise, we need to select a node but __vmalloc_node
+		 * is not exported, use this fallback wrapper which uses
+		 * kallsyms if available or falls back to kmalloc_node.
+		 */
+		ret = __lttng_vmalloc_node_fallback(size, 1,
+				flags | __GFP_HIGHMEM, PAGE_KERNEL, node,
+				__builtin_return_address(0));
+	}
+
+	/*
+	 * Make sure we don't trigger recursive page faults in the
+	 * tracing fast path. Only needed when the fallback really
+	 * returned a vmalloc address: skip the sync when it failed or
+	 * handed back kmalloc memory.
+	 */
+	if (is_vmalloc_addr(ret))
+		wrapper_vmalloc_sync_all();
+
+	return ret;
+}
+
+/*
+ * Zeroing variant of lttng_kvmalloc_node(): same arguments, with
+ * __GFP_ZERO added to @flags.
+ */
+static inline
+void *lttng_kvzalloc_node(unsigned long size, gfp_t flags, int node)
+{
+	const gfp_t zeroing_flags = flags | __GFP_ZERO;
+
+	return lttng_kvmalloc_node(size, zeroing_flags, node);
+}
+
+/*
+ * Convenience form of lttng_kvmalloc_node() that passes NUMA_NO_NODE as
+ * the node argument.
+ */
+static inline
+void *lttng_kvmalloc(unsigned long size, gfp_t flags)
+{
+	const int node = NUMA_NO_NODE;
+
+	return lttng_kvmalloc_node(size, flags, node);
+}
+
+/*
+ * Zeroing allocation with NUMA_NO_NODE as the node argument: adds
+ * __GFP_ZERO to @flags and defers to lttng_kvmalloc_node().
+ */
+static inline
+void *lttng_kvzalloc(unsigned long size, gfp_t flags)
+{
+	return lttng_kvmalloc_node(size, flags | __GFP_ZERO, NUMA_NO_NODE);
+}
+
+/*
+ * Release memory obtained from the lttng_kv[mz]alloc*() helpers.
+ *
+ * The address itself decides which allocator gets it back: vmalloc
+ * addresses go to vfree(), everything else goes to kfree().
+ */
+static inline
+void lttng_kvfree(const void *addr)
+{
+	if (!is_vmalloc_addr(addr)) {
+		kfree(addr);
+		return;
+	}
+
+	vfree(addr);
+}
+#endif
+
#endif /* _LTTNG_WRAPPER_VMALLOC_H */