From: Michael Jeanson Date: Wed, 20 Jul 2022 18:49:56 +0000 (-0400) Subject: fix: num_possible_cpus() with hot-unplugged CPUs X-Git-Tag: v2.13.4~10 X-Git-Url: https://git.lttng.org./?a=commitdiff_plain;h=022c53ed5418c34694d021a3a84a42dc10fbf6de;p=lttng-ust.git fix: num_possible_cpus() with hot-unplugged CPUs We rely on sysconf(_SC_NPROCESSORS_CONF) to get the maximum possible number of CPUs that can be attached to the system for the lifetime of an application. We use this value to allocate an array of per-CPU buffers that is indexed by the numerical id of the CPUs. As such, we expect that the highest possible CPU id would be one less than the number returned by sysconf(_SC_NPROCESSORS_CONF), which is unfortunately not always the case and can vary across libc implementations and versions. Glibc up to 2.35 will count the number of "cpuX" directories in "/sys/devices/system/cpu", which doesn't include CPUs that were hot-unplugged. This information is, however, provided by the kernel in "/sys/devices/system/cpu/possible" in the form of a mask listing all the CPUs that could possibly be hot-plugged in the system. This patch changes the implementation of num_possible_cpus() to first try parsing the possible CPU mask to extract the highest possible value and, if this fails, fall back to the previous behavior. 
Change-Id: I1a3cb1a446154ec443a391d6689cb7d4165726fd Signed-off-by: Michael Jeanson Signed-off-by: Mathieu Desnoyers --- diff --git a/src/common/smp.c b/src/common/smp.c index 95f6dd14..a4346591 100644 --- a/src/common/smp.c +++ b/src/common/smp.c @@ -6,20 +6,25 @@ */ #define _LGPL_SOURCE +#include +#include +#include +#include #include #include +#include #include +#include "common/align.h" +#include "common/logging.h" #include "common/smp.h" static int num_possible_cpus_cache; #if (defined(__GLIBC__) || defined( __UCLIBC__)) -static void _get_num_possible_cpus(void) +int get_num_possible_cpus_fallback(void) { - int result; - /* On Linux, when some processors are offline * _SC_NPROCESSORS_CONF counts the offline * processors, whereas _SC_NPROCESSORS_ONLN @@ -28,10 +33,7 @@ static void _get_num_possible_cpus(void) * this sysconf, in which case the arrays * indexed by processor would overflow. */ - result = sysconf(_SC_NPROCESSORS_CONF); - if (result == -1) - return; - num_possible_cpus_cache = result; + return sysconf(_SC_NPROCESSORS_CONF); } #else @@ -54,9 +56,9 @@ static void _get_num_possible_cpus(void) #define __max(a,b) ((a)>(b)?(a):(b)) -static void _get_num_possible_cpus(void) +int get_num_possible_cpus_fallback(void) { - int result, count = 0; + int count = 0; DIR *cpudir; struct dirent *entry; @@ -87,22 +89,135 @@ end: /* * Get the sysconf value as a fallback. Keep the highest number. */ - result = __max(sysconf(_SC_NPROCESSORS_CONF), count); + return __max(sysconf(_SC_NPROCESSORS_CONF), count); +} +#endif + +/* + * Get the CPU possible mask string from sysfs. + * + * buf: the buffer where the mask will be read. + * max_bytes: the maximum number of bytes to write in the buffer. + * + * Returns the number of bytes read or -1 on error. 
+ */ +int get_possible_cpu_mask_from_sysfs(char *buf, size_t max_bytes) +{ + ssize_t bytes_read = 0; + size_t total_bytes_read = 0; + int fd = 0; + + if (buf == NULL) + return -1; + + fd = open("/sys/devices/system/cpu/possible", O_RDONLY); + if (fd < 0) + return -1; + + do { + bytes_read = read(fd, buf + total_bytes_read, + max_bytes - total_bytes_read); + + if (bytes_read < 0) { + if (errno == EINTR) { + continue; /* retry operation */ + } else { + return -1; + } + } + + total_bytes_read += bytes_read; + assert(total_bytes_read <= max_bytes); + } while (max_bytes > total_bytes_read && bytes_read > 0); + + if (close(fd)) + PERROR("close"); + + /* + * Make sure the mask read is a null terminated string. + */ + if (total_bytes_read < max_bytes) + buf[total_bytes_read] = '\0'; + else + buf[max_bytes - 1] = '\0'; + + return total_bytes_read; +} + +/* + * Get the number of CPUs from the possible cpu mask. + * + * pmask: the mask to parse. + * len: the len of the mask excluding '\0'. + * + * Returns the number of possible CPUs from the mask or 0 on error. + */ +int get_num_possible_cpus_from_mask(const char *pmask, size_t len) +{ + ssize_t i; + unsigned long cpu_index; + char *endptr; + + /* We need at least one char to read */ + if (len < 1) + goto error; + + /* Start from the end to read the last CPU index. */ + for (i = len - 1; i > 0; i--) { + /* Break when we hit the first separator. */ + if ((pmask[i] == ',') || (pmask[i] == '-')) { + i++; + break; + } + } + + cpu_index = strtoul(&pmask[i], &endptr, 10); /* - * If both methods failed, don't store the value. + * If we read a CPU index, increment it by one to return a number of + * CPUs. */ - if (result < 1) + if ((&pmask[i] != endptr) && (cpu_index < INT_MAX)) + return (int) cpu_index + 1; + +error: + return 0; +} + +static void _get_num_possible_cpus(void) +{ + int ret; + int buf_len = LTTNG_UST_PAGE_SIZE; + char buf[buf_len]; + + /* Get the possible cpu mask from sysfs, fallback to sysconf. 
*/ + ret = get_possible_cpu_mask_from_sysfs((char *) &buf, buf_len); + if (ret <= 0) + goto fallback; + + /* Parse the possible cpu mask, on failure fallback to sysconf. */ + ret = get_num_possible_cpus_from_mask((char *) &buf, ret); + if (ret > 0) + goto end; + +fallback: + /* Fallback to sysconf. */ + ret = get_num_possible_cpus_fallback(); + +end: + /* If all methods failed, don't store the value. */ + if (ret < 1) return; - num_possible_cpus_cache = result; + + num_possible_cpus_cache = ret; } -#endif /* * Returns the total number of CPUs in the system. If the cache is not yet - * initialized, get the value from the system through sysconf and cache it. + * initialized, get the value from "/sys/devices/system/cpu/possible" or + * fallback to sysconf and cache it. * - * If the sysconf call fails, don't populate the cache and return 0. + * If all methods fail, don't populate the cache and return 0. */ int num_possible_cpus(void) { diff --git a/src/common/smp.h b/src/common/smp.h index 5f81094f..b6dd5707 100644 --- a/src/common/smp.h +++ b/src/common/smp.h @@ -7,11 +7,43 @@ #ifndef _UST_COMMON_SMP_H #define _UST_COMMON_SMP_H +/* + * Get the CPU possible mask string from sysfs. + * + * buf: the buffer where the mask will be read. + * max_bytes: the maximum number of bytes to write in the buffer. + * + * Returns the number of bytes read or -1 on error. + */ +int get_possible_cpu_mask_from_sysfs(char *buf, size_t max_bytes) + __attribute__((visibility("hidden"))); + +/* + * Get the number of possible CPUs in the system from either + * sysconf(_SC_NPROCESSORS_CONF) or some other mechanism depending on the libc. + * + * Returns the number of possible CPUs in the system or 0 on error. + */ +int get_num_possible_cpus_fallback(void) + __attribute__((visibility("hidden"))); + +/* + * Get the number of CPUs from the possible cpu mask. + * + * pmask: the mask to parse. + * len: the len of the mask excluding '\0'. 
+ * + * Returns the number of possible CPUs from the mask or 0 on error. + */ +int get_num_possible_cpus_from_mask(const char *pmask, size_t len) + __attribute__((visibility("hidden"))); + /* * Returns the total number of CPUs in the system. If the cache is not yet - * initialized, get the value from the system through sysconf and cache it. + * initialized, get the value from "/sys/devices/system/cpu/possible" or + * fallback to sysconf and cache it. * - * If the sysconf call fails, don't populate the cache and return 0. + * If all methods fail, don't populate the cache and return 0. */ int num_possible_cpus(void) __attribute__((visibility("hidden")));