Problem Statement
-----------------
commit
4d4838bad480 ("Use MAP_POPULATE to reduce pagefault when available")
was first introduced in tag v2.11.0 and never backported to stable
branches. Its purpose was to reduce the tracer fast-path latency caused
by handling minor page faults the first time a given application writes
to each page of the ring buffer after the buffer is mapped. The discussion
thread leading to this commit can be found here [1]. When using
LTTng-UST for diagnosing real-time applications with very strict
constraints, this added latency is unwanted.
That commit introduced the MAP_POPULATE flag when mapping the ring
buffer pages, which causes the kernel to pre-populate the page table
entries (PTE).
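As an illustration only (the function name below is hypothetical; the
actual LTTng-UST call sites appear in the shm code further down and use
the LTTNG_MAP_POPULATE compatibility define rather than MAP_POPULATE
directly), the mapping difference boils down to:

    #include <stdbool.h>
    #include <stddef.h>
    #include <sys/mman.h>

    /* MAP_POPULATE is Linux-specific: it pre-faults the PTE at mmap() time. */
    static void *map_ring_buffer_pages(int shmfd, size_t len, bool populate)
    {
            int flags = MAP_SHARED;

            if (populate)
                    flags |= MAP_POPULATE;
            /*
             * Without MAP_POPULATE, each page takes a minor fault the
             * first time the tracer writes to it.
             */
            return mmap(NULL, len, PROT_READ | PROT_WRITE, flags, shmfd, 0);
    }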
Pre-populating the PTE has, however, unintended consequences in the following scenarios:
* Short-lived applications which write very little to the ring buffer end
up taking more time to start, because of the time it takes to
pre-populate all the ring buffer pages, even though they typically won't
be used by the application.
* Containerized workloads using cpusets also end up with longer
application startup times than strictly required, and populate the
PTE for ring buffers of CPUs which are not present in the cpuset.
There are, therefore, two sets of irreconcilable requirements:
short-lived and containerized workloads benefit from lazily populating
the PTE, whereas real-time workloads benefit from pre-populating them.
This therefore calls for a tunable environment variable that lets the
end user choose the behavior for each application.
Solution
--------
Allow users to specify whether they want to pre-populate
shared memory pages within the application with an environment
variable.
LTTNG_UST_MAP_POPULATE_POLICY
If set, override the policy used to populate shared memory pages within the
application. The expected values are:
none
Do not pre-populate any pages, take minor faults on first access while
tracing.
cpu_possible
Pre-populate pages for all possible CPUs in the system, as listed by
/sys/devices/system/cpu/possible.
Default: none. If the policy is unknown, use the default.
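For example, starting a real-time application with
LTTNG_UST_MAP_POPULATE_POLICY=cpu_possible set in its environment causes
its tracing shared memory pages to be pre-populated when they are mapped,
whereas leaving the variable unset (or set to "none") keeps the lazy
behavior.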
Choice of the default
---------------------
Given that users with strict real-time constraints already have to set up
their tracing with specific options (see the "--read-timer"
lttng-enable-channel(1) option [2]), it makes sense that the default
is to lazily populate the ring buffer PTE, and to require users with
real-time constraints to explicitly enable pre-population through an
environment variable.
Effect on default behavior
--------------------------
The default behavior for ring buffer PTE mapping will be changing across
LTTng-UST versions in the following way:
- 2.10 and earlier: lazily populate PTE,
- 2.11-2.13: pre-populate PTE,
- 2.14: lazily populate PTE.
LTTng-UST 2.14 will revert to the 2.10 lazy-populate scheme by
default.
[1] https://lists.lttng.org/pipermail/lttng-dev/2019-July/thread.html#29094
[2] https://lttng.org/docs/v2.13/#doc-channel-timers
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Change-Id: I6743b08cd1fe0d956caaf6aad63005555bb9640e
documentation under
https://github.com/lttng/lttng-ust/tree/v{lttng_version}/doc/examples/getcpu-override[`examples/getcpu-override`].
+`LTTNG_UST_MAP_POPULATE_POLICY`::
++
+--
+If set, override the policy used to populate shared memory pages
+within the application. The expected values are:
+
+`none`:::
+ Do not pre-populate any pages, take minor faults on first access
+ while tracing.
+
+`cpu_possible`:::
+ Pre-populate pages for all possible CPUs in the system, as
+ listed by `/sys/devices/system/cpu/possible`.
+--
++
+Default: `none`. If the policy is unknown, use the default.
+
`LTTNG_UST_REGISTER_TIMEOUT`::
Waiting time for the _registration done_ session daemon command
before proceeding to execute the main program (milliseconds).
logging.h \
smp.c \
smp.h \
+ populate.c \
+ populate.h \
strutils.c \
strutils.h \
utils.c \
#include "common/bitmap.h"
#include "common/smp.h"
+#include "common/populate.h"
#include "shm.h"
static size_t lttng_counter_get_dimension_nr_elements(struct lib_counter_dimension *dimension)
if (counter->is_daemon) {
/* Allocate and clear shared memory. */
shm_object = lttng_counter_shm_object_table_alloc(counter->object_table,
- shm_length, LTTNG_COUNTER_SHM_OBJECT_SHM, shm_fd, cpu);
+ shm_length, LTTNG_COUNTER_SHM_OBJECT_SHM, shm_fd, cpu,
+ lttng_ust_map_populate_cpu_is_enabled(cpu));
if (!shm_object)
return -ENOMEM;
} else {
/* Map pre-existing shared memory. */
shm_object = lttng_counter_shm_object_table_append_shm(counter->object_table,
- shm_fd, shm_length);
+ shm_fd, shm_length, lttng_ust_map_populate_cpu_is_enabled(cpu));
if (!shm_object)
return -ENOMEM;
}
int cpu, ret;
int nr_handles = 0;
int nr_cpus = get_possible_cpus_array_len();
+ bool populate = lttng_ust_map_populate_is_enabled();
if (validate_args(config, nr_dimensions, max_nr_elem,
global_sum_step, global_counter_fd, nr_counter_cpu_fds,
counter_cpu_fds))
return NULL;
- counter = zmalloc(sizeof(struct lib_counter));
+ counter = zmalloc_populate(sizeof(struct lib_counter), populate);
if (!counter)
return NULL;
counter->global_counters.shm_fd = -1;
if (lttng_counter_set_global_sum_step(counter, global_sum_step))
goto error_sum_step;
counter->nr_dimensions = nr_dimensions;
- counter->dimensions = zmalloc(nr_dimensions * sizeof(*counter->dimensions));
+ counter->dimensions = zmalloc_populate(nr_dimensions * sizeof(*counter->dimensions), populate);
if (!counter->dimensions)
goto error_dimensions;
for (dimension = 0; dimension < nr_dimensions; dimension++)
counter->dimensions[dimension].max_nr_elem = max_nr_elem[dimension];
if (config->alloc & COUNTER_ALLOC_PER_CPU) {
- counter->percpu_counters = zmalloc(sizeof(struct lib_counter_layout) * nr_cpus);
+ counter->percpu_counters = zmalloc_populate(sizeof(struct lib_counter_layout) * nr_cpus, populate);
if (!counter->percpu_counters)
goto error_alloc_percpu;
for_each_possible_cpu(cpu)
if (config->alloc & COUNTER_ALLOC_PER_CPU)
nr_handles += nr_cpus;
/* Allocate table for global and per-cpu counters. */
- counter->object_table = lttng_counter_shm_object_table_create(nr_handles);
+ counter->object_table = lttng_counter_shm_object_table_create(nr_handles, populate);
if (!counter->object_table)
goto error_alloc_object_table;
return ret;
}
-struct lttng_counter_shm_object_table *lttng_counter_shm_object_table_create(size_t max_nb_obj)
+struct lttng_counter_shm_object_table *lttng_counter_shm_object_table_create(size_t max_nb_obj, bool populate)
{
struct lttng_counter_shm_object_table *table;
- table = zmalloc(sizeof(struct lttng_counter_shm_object_table) +
- max_nb_obj * sizeof(table->objects[0]));
+ table = zmalloc_populate(sizeof(struct lttng_counter_shm_object_table) +
+ max_nb_obj * sizeof(table->objects[0]), populate);
if (!table)
return NULL;
table->size = max_nb_obj;
static
struct lttng_counter_shm_object *_lttng_counter_shm_object_table_alloc_shm(struct lttng_counter_shm_object_table *table,
size_t memory_map_size,
- int cpu_fd)
+ int cpu_fd, bool populate)
{
- int shmfd, ret;
struct lttng_counter_shm_object *obj;
+ int flags = MAP_SHARED;
+ int shmfd, ret;
char *memory_map;
if (cpu_fd < 0)
obj->shm_fd_ownership = 0;
obj->shm_fd = shmfd;
+ if (populate)
+ flags |= LTTNG_MAP_POPULATE;
/* memory_map: mmap */
memory_map = mmap(NULL, memory_map_size, PROT_READ | PROT_WRITE,
- MAP_SHARED | LTTNG_MAP_POPULATE, shmfd, 0);
+ flags, shmfd, 0);
if (memory_map == MAP_FAILED) {
PERROR("mmap");
goto error_mmap;
static
struct lttng_counter_shm_object *_lttng_counter_shm_object_table_alloc_mem(struct lttng_counter_shm_object_table *table,
- size_t memory_map_size)
+ size_t memory_map_size, bool populate)
{
struct lttng_counter_shm_object *obj;
void *memory_map;
return NULL;
obj = &table->objects[table->allocated_len];
- memory_map = zmalloc(memory_map_size);
+ memory_map = zmalloc_populate(memory_map_size, populate);
if (!memory_map)
goto alloc_error;
size_t memory_map_size,
enum lttng_counter_shm_object_type type,
int cpu_fd,
- int cpu)
+ int cpu,
+ bool populate)
#else
struct lttng_counter_shm_object *lttng_counter_shm_object_table_alloc(struct lttng_counter_shm_object_table *table,
size_t memory_map_size,
enum lttng_counter_shm_object_type type,
int cpu_fd,
- int cpu __attribute__((unused)))
+ int cpu __attribute__((unused)),
+ bool populate)
#endif
{
struct lttng_counter_shm_object *shm_object;
switch (type) {
case LTTNG_COUNTER_SHM_OBJECT_SHM:
shm_object = _lttng_counter_shm_object_table_alloc_shm(table, memory_map_size,
- cpu_fd);
+ cpu_fd, populate);
break;
case LTTNG_COUNTER_SHM_OBJECT_MEM:
- shm_object = _lttng_counter_shm_object_table_alloc_mem(table, memory_map_size);
+ shm_object = _lttng_counter_shm_object_table_alloc_mem(table, memory_map_size,
+ populate);
break;
default:
assert(0);
}
struct lttng_counter_shm_object *lttng_counter_shm_object_table_append_shm(struct lttng_counter_shm_object_table *table,
- int shm_fd,
- size_t memory_map_size)
+ int shm_fd, size_t memory_map_size, bool populate)
{
struct lttng_counter_shm_object *obj;
+ int flags = MAP_SHARED;
char *memory_map;
if (table->allocated_len >= table->size)
obj->shm_fd = shm_fd;
obj->shm_fd_ownership = 1;
+ if (populate)
+ flags |= LTTNG_MAP_POPULATE;
/* memory_map: mmap */
memory_map = mmap(NULL, memory_map_size, PROT_READ | PROT_WRITE,
- MAP_SHARED | LTTNG_MAP_POPULATE, shm_fd, 0);
+ flags, shm_fd, 0);
if (memory_map == MAP_FAILED) {
PERROR("mmap");
goto error_mmap;
#include <stddef.h>
#include <stdint.h>
#include <unistd.h>
+#include <stdbool.h>
#include "common/logging.h"
#include <urcu/compiler.h>
#include "shm_types.h"
#define lttng_counter_set_shmp(ref, src) _lttng_counter_set_shmp(&(ref)._ref, src)
-struct lttng_counter_shm_object_table *lttng_counter_shm_object_table_create(size_t max_nb_obj)
+struct lttng_counter_shm_object_table *lttng_counter_shm_object_table_create(size_t max_nb_obj, bool populate)
__attribute__((visibility("hidden")));
struct lttng_counter_shm_object *lttng_counter_shm_object_table_alloc(struct lttng_counter_shm_object_table *table,
size_t memory_map_size,
enum lttng_counter_shm_object_type type,
const int cpu_fd,
- int cpu)
+ int cpu, bool populate)
__attribute__((visibility("hidden")));
struct lttng_counter_shm_object *lttng_counter_shm_object_table_append_shm(struct lttng_counter_shm_object_table *table,
- int shm_fd, size_t memory_map_size)
+ int shm_fd, size_t memory_map_size, bool populate)
__attribute__((visibility("hidden")));
/* mem ownership is passed to lttng_counter_shm_object_table_append_mem(). */
/* Env. var. which can be used in setuid/setgid executables. */
{ "LTTNG_UST_WITHOUT_BADDR_STATEDUMP", LTTNG_ENV_NOT_SECURE, NULL, },
{ "LTTNG_UST_REGISTER_TIMEOUT", LTTNG_ENV_NOT_SECURE, NULL, },
+ { "LTTNG_UST_MAP_POPULATE_POLICY", LTTNG_ENV_NOT_SECURE, NULL, },
/* Env. var. which are not fetched in setuid/setgid executables. */
{ "LTTNG_UST_CLOCK_PLUGIN", LTTNG_ENV_SECURE, NULL, },
#define _UST_COMMON_MACROS_H
#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
#include <lttng/ust-arch.h>
+/*
+ * calloc() does not always populate the page table for the allocated
+ * memory. Optionally enforce page table population.
+ */
+static inline
+void *zmalloc_populate(size_t len, bool populate)
+ __attribute__((always_inline));
+static inline
+void *zmalloc_populate(size_t len, bool populate)
+{
+ if (populate) {
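+ /*
+  * Writing zeroes over the allocation touches every page, which
+  * forces the kernel to populate the page table entries, whereas
+  * calloc() may hand back zero pages that are only mapped lazily.
+  */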
+ void *ret = malloc(len);
+ if (ret == NULL)
+ return ret;
+ bzero(ret, len);
+ return ret;
+ } else {
+ return calloc(len, 1);
+ }
+}
+
/*
* Memory allocation zeroed
*/
static inline
void *zmalloc(size_t len)
{
- return calloc(len, 1);
+ return zmalloc_populate(len, false);
}
#define max_t(type, x, y) \
--- /dev/null
+/*
+ * SPDX-License-Identifier: MIT
+ *
+ * Copyright (C) 2024 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+ */
+
+#define _LGPL_SOURCE
+#include "common/getenv.h"
+#include "common/logging.h"
+#include "common/populate.h"
+
+enum populate_policy {
+ POPULATE_UNSET,
+
+ POPULATE_NONE,
+ POPULATE_CPU_POSSIBLE,
+
+ POPULATE_UNKNOWN,
+};
+
+static enum populate_policy map_populate_policy = POPULATE_UNSET;
+
+static void init_map_populate_policy(void)
+{
+ const char *populate_env_str;
+
+ if (map_populate_policy != POPULATE_UNSET)
+ return;
+
+ populate_env_str = lttng_ust_getenv("LTTNG_UST_MAP_POPULATE_POLICY");
+ if (!populate_env_str) {
+ map_populate_policy = POPULATE_NONE;
+ return;
+ }
+ if (!strcmp(populate_env_str, "none")) {
+ map_populate_policy = POPULATE_NONE;
+ } else if (!strcmp(populate_env_str, "cpu_possible")) {
+ map_populate_policy = POPULATE_CPU_POSSIBLE;
+ } else {
+ /*
+ * populate_env_str is an untrusted environment variable
+ * input (can be provided to setuid/setgid binaries), so
+ * don't even try to print it.
+ */
+ WARN("Unknown policy for LTTNG_UST_MAP_POPULATE_POLICY environment variable.");
+ map_populate_policy = POPULATE_UNKNOWN;
+ }
+}
+
+/*
+ * Return the shared page populate policy for global pages. Returns true
+ * if shared memory pages should be pre-populated, false otherwise.
+ */
+bool lttng_ust_map_populate_is_enabled(void)
+{
+ init_map_populate_policy();
+
+ switch (map_populate_policy) {
+ case POPULATE_UNKNOWN: /* Fall-through */
+ case POPULATE_NONE:
+ return false;
+ case POPULATE_CPU_POSSIBLE:
+ return true;
+ default:
+ abort();
+ }
+ return false;
+}
+
+/*
+ * Return the shared page populate policy based on the @cpu number
+ * provided as input. Returns true if shared memory pages should be
+ * pre-populated, false otherwise.
+ *
+ * The @cpu argument is currently unused except for negative value
+ * validation. It is present to eventually match cpu affinity or cpu
+ * online masks if those features are added in the future.
+ */
+bool lttng_ust_map_populate_cpu_is_enabled(int cpu)
+{
+ /* Reject invalid cpu number. */
+ if (cpu < 0)
+ return false;
+
+ return lttng_ust_map_populate_is_enabled();
+}
--- /dev/null
+/*
+ * SPDX-License-Identifier: MIT
+ *
+ * Copyright (C) 2024 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+ */
+
+#ifndef _UST_COMMON_POPULATE_H
+#define _UST_COMMON_POPULATE_H
+
+#include <stdbool.h>
+
+bool lttng_ust_map_populate_cpu_is_enabled(int cpu)
+ __attribute__((visibility("hidden")));
+
+bool lttng_ust_map_populate_is_enabled(void)
+ __attribute__((visibility("hidden")));
+
+#endif /* _UST_COMMON_POPULATE_H */
#include "common/smp.h"
#include "shm.h"
#include "common/align.h"
+#include "common/populate.h"
/**
* lib_ring_buffer_backend_allocate - allocate a channel buffer
struct shm_object *shmobj;
shmobj = shm_object_table_alloc(handle->table, shmsize,
- SHM_OBJECT_SHM, stream_fds[i], i);
+ SHM_OBJECT_SHM, stream_fds[i], i,
+ lttng_ust_map_populate_cpu_is_enabled(i));
if (!shmobj)
goto end;
align_shm(shmobj, __alignof__(struct lttng_ust_ring_buffer));
struct lttng_ust_ring_buffer *buf;
shmobj = shm_object_table_alloc(handle->table, shmsize,
- SHM_OBJECT_SHM, stream_fds[0], -1);
+ SHM_OBJECT_SHM, stream_fds[0], -1,
+ lttng_ust_map_populate_is_enabled());
if (!shmobj)
goto end;
align_shm(shmobj, __alignof__(struct lttng_ust_ring_buffer));
#include "shm.h"
#include "rb-init.h"
#include "common/compat/errno.h" /* For ENODATA */
+#include "common/populate.h"
/* Print DBG() messages about events lost only every 1048576 hits */
#define DBG_PRINT_NR_LOST (1UL << 20)
struct shm_object *shmobj;
unsigned int nr_streams;
int64_t blocking_timeout_ms;
+ bool populate = lttng_ust_map_populate_is_enabled();
if (config->alloc == RING_BUFFER_ALLOC_PER_CPU)
nr_streams = get_possible_cpus_array_len();
read_timer_interval))
return NULL;
- handle = zmalloc(sizeof(struct lttng_ust_shm_handle));
+ handle = zmalloc_populate(sizeof(struct lttng_ust_shm_handle), populate);
if (!handle)
return NULL;
/* Allocate table for channel + per-cpu buffers */
- handle->table = shm_object_table_create(1 + get_possible_cpus_array_len());
+ handle->table = shm_object_table_create(1 + get_possible_cpus_array_len(), populate);
if (!handle->table)
goto error_table_alloc;
/* Allocate normal memory for channel (not shared) */
shmobj = shm_object_table_alloc(handle->table, shmsize, SHM_OBJECT_MEM,
- -1, -1);
+ -1, -1, populate);
if (!shmobj)
goto error_append;
/* struct lttng_ust_ring_buffer_channel is at object 0, offset 0 (hardcoded) */
{
struct lttng_ust_shm_handle *handle;
struct shm_object *object;
+ bool populate = lttng_ust_map_populate_is_enabled();
- handle = zmalloc(sizeof(struct lttng_ust_shm_handle));
+ handle = zmalloc_populate(sizeof(struct lttng_ust_shm_handle), populate);
if (!handle)
return NULL;
/* Allocate table for channel + per-cpu buffers */
- handle->table = shm_object_table_create(1 + get_possible_cpus_array_len());
+ handle->table = shm_object_table_create(1 + get_possible_cpus_array_len(), populate);
if (!handle->table)
goto error_table_alloc;
/* Add channel object */
/* Add stream object */
object = shm_object_table_append_shm(handle->table,
shm_fd, wakeup_fd, stream_nr,
- memory_map_size);
+ memory_map_size, lttng_ust_map_populate_cpu_is_enabled(stream_nr));
if (!object)
return -EINVAL;
return 0;
return ret;
}
-struct shm_object_table *shm_object_table_create(size_t max_nb_obj)
+struct shm_object_table *shm_object_table_create(size_t max_nb_obj, bool populate)
{
struct shm_object_table *table;
- table = zmalloc(sizeof(struct shm_object_table) +
- max_nb_obj * sizeof(table->objects[0]));
+ table = zmalloc_populate(sizeof(struct shm_object_table) +
+ max_nb_obj * sizeof(table->objects[0]), populate);
if (!table)
return NULL;
table->size = max_nb_obj;
static
struct shm_object *_shm_object_table_alloc_shm(struct shm_object_table *table,
size_t memory_map_size,
- int stream_fd)
+ int stream_fd,
+ bool populate)
{
int shmfd, waitfd[2], ret, i;
+ int flags = MAP_SHARED;
struct shm_object *obj;
char *memory_map;
obj->shm_fd_ownership = 0;
obj->shm_fd = shmfd;
+ if (populate)
+ flags |= LTTNG_MAP_POPULATE;
/* memory_map: mmap */
memory_map = mmap(NULL, memory_map_size, PROT_READ | PROT_WRITE,
- MAP_SHARED | LTTNG_MAP_POPULATE, shmfd, 0);
+ flags, shmfd, 0);
if (memory_map == MAP_FAILED) {
PERROR("mmap");
goto error_mmap;
static
struct shm_object *_shm_object_table_alloc_mem(struct shm_object_table *table,
- size_t memory_map_size)
+ size_t memory_map_size, bool populate)
{
struct shm_object *obj;
void *memory_map;
return NULL;
obj = &table->objects[table->allocated_len];
- memory_map = zmalloc(memory_map_size);
+ memory_map = zmalloc_populate(memory_map_size, populate);
if (!memory_map)
goto alloc_error;
size_t memory_map_size,
enum shm_object_type type,
int stream_fd,
- int cpu)
+ int cpu,
+ bool populate)
#else
struct shm_object *shm_object_table_alloc(struct shm_object_table *table,
size_t memory_map_size,
enum shm_object_type type,
int stream_fd,
- int cpu __attribute__((unused)))
+ int cpu __attribute__((unused)),
+ bool populate)
#endif
{
struct shm_object *shm_object;
switch (type) {
case SHM_OBJECT_SHM:
shm_object = _shm_object_table_alloc_shm(table, memory_map_size,
- stream_fd);
+ stream_fd, populate);
break;
case SHM_OBJECT_MEM:
- shm_object = _shm_object_table_alloc_mem(table, memory_map_size);
+ shm_object = _shm_object_table_alloc_mem(table, memory_map_size,
+ populate);
break;
default:
assert(0);
struct shm_object *shm_object_table_append_shm(struct shm_object_table *table,
int shm_fd, int wakeup_fd, uint32_t stream_nr,
- size_t memory_map_size)
+ size_t memory_map_size, bool populate)
{
+ int flags = MAP_SHARED;
struct shm_object *obj;
char *memory_map;
int ret;
goto error_fcntl;
}
+ if (populate)
+ flags |= LTTNG_MAP_POPULATE;
/* memory_map: mmap */
memory_map = mmap(NULL, memory_map_size, PROT_READ | PROT_WRITE,
- MAP_SHARED | LTTNG_MAP_POPULATE, shm_fd, 0);
+ flags, shm_fd, 0);
if (memory_map == MAP_FAILED) {
PERROR("mmap");
goto error_mmap;
#define set_shmp(ref, src) _set_shmp(&(ref)._ref, src)
-struct shm_object_table *shm_object_table_create(size_t max_nb_obj)
+struct shm_object_table *shm_object_table_create(size_t max_nb_obj, bool populate)
__attribute__((visibility("hidden")));
struct shm_object *shm_object_table_alloc(struct shm_object_table *table,
size_t memory_map_size,
enum shm_object_type type,
const int stream_fd,
- int cpu)
+ int cpu, bool populate)
__attribute__((visibility("hidden")));
struct shm_object *shm_object_table_append_shm(struct shm_object_table *table,
int shm_fd, int wakeup_fd, uint32_t stream_nr,
- size_t memory_map_size)
+ size_t memory_map_size, bool populate)
__attribute__((visibility("hidden")));
/* mem ownership is passed to shm_object_table_append_mem(). */
ok(shmfd > 0, "Open a POSIX shm fd");
/* Create a dummy shm object table to test the allocation function */
- table = shm_object_table_create(1);
+ table = shm_object_table_create(1, false);
ok(table, "Create a shm object table");
assert(table);
/* This function sets the initial size of the shm with ftruncate and zeros it */
- shmobj = shm_object_table_alloc(table, shmsize, SHM_OBJECT_SHM, shmfd, -1);
+ shmobj = shm_object_table_alloc(table, shmsize, SHM_OBJECT_SHM, shmfd, -1, false);
ok(shmobj, "Allocate the shm object table");
assert(shmobj);