From: Mathieu Desnoyers Date: Wed, 24 Aug 2016 20:02:02 +0000 (-0400) Subject: RCU Judy Array (rcuja) implementation X-Git-Url: https://git.lttng.org./?a=commitdiff_plain;h=db61f215ed34cf68d4a7e8ea7cde9bfea19f476f;p=urcu.git RCU Judy Array (rcuja) implementation Signed-off-by: Mathieu Desnoyers --- diff --git a/doc/Makefile.am b/doc/Makefile.am index cf2e902..6db23b9 100644 --- a/doc/Makefile.am +++ b/doc/Makefile.am @@ -3,4 +3,5 @@ SUBDIRS = examples dist_doc_DATA = rcu-api.md \ cds-api.md \ solaris-build.md \ - uatomic-api.md + uatomic-api.md \ + rcuja-design.txt diff --git a/doc/rcuja-design.txt b/doc/rcuja-design.txt new file mode 100644 index 0000000..09c9150 --- /dev/null +++ b/doc/rcuja-design.txt @@ -0,0 +1,448 @@ +RCU Judy Array Design +Mathieu Desnoyers +March 8, 2012 + +Initial ideas based on the released Judy Shop Manual +(http://judy.sourceforge.net/). Judy was invented by Doug Baskins and +implemented by Hewlett-Packard. + +Thresholds and RCU-specific analysis is introduced in this document. + +Advantages of using Judy Array (compressed nodes) for RCU tree: +- no rebalancing +- no transplant +- RCU-friendly! +- favor cache-line alignment of structures + +Disadvantage: +- updates that need to reallocate nodes are slower than, e.g. non-rcu + red-black trees. + +Choice: Using 256 entries intermediate nodes (index can be represented +on 8 bits): 4 levels on 32-bit, 8 levels on 64-bit + + +* Node types (from less dense node to most dense) + + +- empty node: + +Parent pointer is NULL. + + +- Type A: sequential search in value and pointer arrays + ++ Add/removal just needs to update value and pointer array, single-entry + (non-RCU...). For RCU, we might need to update the entire node anyway. +- Requires sequential search through all value array for lookup fail + test. + +Filled at 3 entries max 64-bit +8 bits indicating number of children +Array of 8-bit values followed by array of associated pointers. +64-bit: 1 byte + 3 bytes + 4 bytes pad + 3*8 = 32 bytes + +-> up to this point on 64-bit, sequential lookup and pointer read fit in +a 32-byte cache line. + - lookup fail&success: 1 cache-line. + +Filled at 6 entries max 32-bit, 7 entries max 64-bit +8 bits indicating number of children +Array of 8-bit values followed by array of associated pointers. +32-bit: 1 byte + 6 bytes + 1 byte pad + 6*4bytes = 32 bytes +64-bit: 1 byte + 7 bytes + 7*8 = 64 bytes + +-> up to this point on 32-bit, sequential lookup and pointer read fit in +a 32-byte cache line. + - lookup fail&success: 1 cache-line. + +Filled at 12 entries max 32-bit, 14 entries max 64-bit +8 bits indicating number of children +Array of 8-bit values followed by array of associated pointers. +32-bit: 1 byte + 12 bytes + 3 bytes pad + 12*4bytes = 64 bytes +64-bit: 1 byte + 14 bytes + 1 byte pad + 14*8 = 128 bytes + +Filled at 25 entries max 32-bit, 28 entries max 64-bit +8 bits indicating number of children +Array of 8-bit values followed by array of associated pointers. +32-bit: 1 byte + 25 bytes + 2 bytes pad + 25*4bytes = 128 bytes +64-bit: 1 byte + 28 bytes + 3 bytes pad + 28*8 = 256 bytes + +---> up to this point, on both 32-bit and 64-bit, the sequential lookup +in values array fits in a 32-byte cache line. + - lookup failure: 1 cache line. + - lookup success: 2 cache lines. + +The two below are listed for completeness sake, but because they require +2 32-byte cache lines for lookup, these are deemed inappropriate. + +Filled at 51 entries max 32-bit, 56 entries max 64-bit +8 bits indicating number of children +Array of 8-bit values followed by array of associated pointers. +32-bit: 1 byte + 51 bytes + 51*4bytes = 256 bytes +64-bit: 1 byte + 56 bytes + 7 bytes pad + 56*8 = 512 bytes + +Filled at 102 entries max 32-bit, 113 entries max 64-bit +8 bits indicating number of children +Array of 8-bit values followed by array of associated pointers. +32-bit: 1 byte + 102 bytes + 1 byte pad + 102*4bytes = 512 bytes +64-bit: 1 byte + 113 bytes + 6 bytes pad + 113*8 = 1024 bytes + + +- Type B: pools of values and pointers arrays + +Pools of values and pointers arrays. Each pool values array is 32-bytes +in size (so it fits in a L1 cacheline). Each pool begins with an 8-bit +integer, which is the number of children in this pool, followed by an +array of 8-bit values, padding, and an array of pointers. Values and +pointer arrays are associated as in Type A. + +The entries of a node are associated to their respective pool based +on their index position. + ++ Allows lookup failure to use 1 32-byte cache-line only. (1 cacheline) + lookup success: 2 cache lines. + ++ Allows in-place updates without reallocation, except when a pool is + full. (this was not possible with bitmap-based nodes) +- If one pool exhausts its space, we need to increase the node size. + Therefore, for very dense populations, we will end up using the + pigeon-hole node type sooner, thus consuming more space. + +Pool configuration: + +Per pool, filled at 25 entries (32-bit), 28 entries (64-bit) +32-bit: 1 byte + 25 bytes + 2 bytes pad + 25*4bytes = 128 bytes +64-bit: 1 byte + 28 bytes + 3 bytes pad + 28*8 = 256 bytes + +Total up to 50 entries (32-bit), 56 entries (64-bit) +2 pools: 32-bit = 256 bytes +2 pools: 64-bit = 512 bytes + +Total up to 100 entries (32-bit), 112 entries (64-bit) +4 pools: 32-bit = 512 bytes +4 pools: 32-bit = 1024 bytes + + +* Choice of pool configuration distribution: + +We have pools of either 2 or 4 linear arrays. Their total size is +between 256 bytes (32-bit 2 arrays) and 1024 bytes (64-bit 4 arrays). + +Alignment on 256 bytes means that we can spare the 8 least significant +bits of the pointers. Given that the type selection already uses 3 bits, +we have 7 bits left. + +Alignment on 512 bytes -> 8 bits left. + +We can therefore encode which bit, or which two bits, are used as +distribution selection. We can use this technique to reequilibrate pools +if they become unbalanced (e.g. all children are within one of the two +linear arrays). + +Assuming that finding the exact sub-pool usage maximum for any given +distribution is NP complete (not proven). + +Taking into account sub-class size unbalance (tested programmatically by +randomly taking N entries from 256, calculating the distribution for +each bit (number of nodes for which bit is one/zero), and calculating +the difference in number of nodes for each bit, choosing the minimum +difference -- for millions of runs). + +We start from the "ideal" population size (with no unbalance), and do a +fixed point to find the appropriate population size. + +tot entries subclass extra items largest linear array (stat. approx.) +--------------------------------------------------------------------- +48 entries: 1 (98%) 24+1=25 (target ~50/2=25) +54 entries: 1 (97%) 27+1=28 (target ~56/2=28) + +Note: there exists rare worse cases where the unbalance is larger, but +it happens _very_ rarely. But need to provide a fallback if the subclass +does not fit, but it does not need to be efficient. + + +For pool of size 4, we need to approximate what is the maximum unbalance +we can get for choice of distributions grouped by pairs of bits. + +tot entries subclass extra items largest linear array (stat. approx.) +--------------------------------------------------------------------- +92 entries: 2 (99%) 23+2=25 (target: ~100/4=25) +104 entries: 2 (99%) 26+2=28 (target: ~112/4=28) + + +Note: there exists rare worse cases where the unbalance is larger, but +it happens _very_ rarely. But need to provide a fallback if the subclass +does not fit, but it does not need to be efficient. + + +* Population "does not fit" and distribution fallback + +When adding a child to a distribution node, if the child does not fit, +we recalculate the best distribution. If it does not fit in that +distribution neither, we need to expand the node type. + +When removing a child, if the node child count is brought to the number +of entries expected to statistically fit in the lower order node, we try +to shrink. However, if we notice that the distribution does not actually +fit in that shrinked node, we abort the shrink operation. If shrink +fails, we keep a counter of insertion/removal operations on the node +before we allow the shrink to be attempted again. + + +- Type C: pigeon-hole array + +Filled at 47.2%/48.8% or more (32-bit: 121 entries+, 64-bit: 125 entries+) +Array of children node pointers. Pointers NULL if no child at index. +32-bit: 4*256 = 1024 bytes +64-bit: 8*256 = 2048 bytes + + +* Analysis of the thresholds: + +Analysis of number of cache-lines touched for each node, per-node-type, +depending on the number of children per node, as we increment the number +of children from 0 to 256. Through this, we choose number of children +thresholds at which it is worthwhile to use a different node type. + +Per node: + +- ALWAYS 1 cache line hit for lookup failure (all cases) + +32-bit + +- Unexisting + +0 children + +- Type A: sequential search in value and pointer arrays +- 1 cache line hit for lookup success +- 32 bytes storage + +up to 6 children + +- 2 cache line hit for lookup success +- 64 bytes storage + +up to 12 children + +- 128 bytes storage + +up to 25 children + +- Type B: pool + +- 256 bytes storage + +up to 50 children + +- 512 bytes storage +up to 100 children + +- Type C: pigeon-hole array +- 1 cache line hit for lookup success +- 1024 bytes storage + +up to 256 children + + +64-bit + +- Unexisting + +0 children + +- Type A: sequential search in value and pointer arrays +- 1 cache line hit for lookup success +- 32 bytes storage + +up to 3 children + +- 2 cache line hit for lookup success +- 64 bytes storage + +up to 7 children + +- 128 bytes storage + +up to 14 children + +- 256 bytes storage + +up to 28 children + +- Type B: pool + +- 512 bytes storage +up to 56 children + +- 1024 bytes storage +up to 112 children + +- Type C: pigeon-hole array +- 1 cache line hit for lookup success +- 2048 bytes storage + +up to 256 children + + +* Analysis of node type encoding and node pointers: + +Lookups are _always_ from the top of the tree going down. This +facilitates RCU replacement as we only keep track of pointers going +downward. + +Type of node encoded in the parent's pointer. Need to reserve 2 +least-significant bits. + +Types of children: + +enum child_type { + RCU_JA_LINEAR = 0, /* Type A */ + /* 32-bit: 1 to 25 children, 8 to 128 bytes */ + /* 64-bit: 1 to 28 children, 16 to 256 bytes */ + RCU_JA_POOL = 1, /* Type B */ + /* 32-bit: 26 to 100 children, 256 to 512 bytes */ + /* 64-bit: 29 to 112 children, 512 to 1024 bytes */ + RCU_JA_PIGEON = 2, /* Type C */ + /* 32-bit: 101 to 256 children, 1024 bytes */ + /* 64-bit: 113 to 256 children, 2048 bytes */ + /* Leaf nodes are implicit from their height in the tree */ +}; + +If entire pointer is NULL, children is empty. + + +* Lookup and Update Algorithms + +Let's propose a quite simple scheme that uses a mutex on nodes to manage +update concurrency. It's certainly not optimal in terms of concurrency +management within a node, but it has the advantage of being simple to +implement and understand. + +We need to keep a count of the number of children nodes (for each node), +to keep track of when the node type thresholds are reached. It would be +important to put an hysteresis loop so we don't change between node +types too often for a loop on add/removal of the same node. + +We acquire locks from child to parent, nested. We take all locks +required to perform a given update in the tree (but no more) to keep it +consistent with respect to number of children per node. + +If check for node being gc'd (always under node lock) fails, we simply +need to release the lock and lookup the node again. + + +- Leaf lookup + +rcu_read_lock() + +RCU-lookup each level of the tree. If level is not populated, fail. +Until we reach the leaf node. + +rcu_read_unlock() + + +- Leaf insertion + +A) Lookup + +rcu_read_lock() +RCU-lookup insert position. Find location in tree where nodes are +missing for this insertion. If leaf is already present, insert fails, +releasing the rcu read lock. The insert location consists of a parent +node to which we want to attach a new node. + +B) Lock + +RCU-lookup parent node. Take the parent lock. If the parent needs to be +reallocated to make room for this insertion, RCU-lookup parent-parent +node and take the parent-parent lock. For each lock taken, check if +node is being gc'd. If gc'd, release lock, re-RCU-lookup this node, and +retry. + +C) Create + +Construct the whole branch from the new topmost intermediate node down +to the new leaf node we are inserting. + +D) Populate: + - If parent node reallocation is needed: + Reallocate the parent node, adding the new branch to it, and + increment its node count. + set gc flag in old nodes. + call_rcu free for all old nodes. + Populate new parent node with rcu_assign_pointer. + - Else: + Increment parent node count. + Use rcu_assign_pointer to populate this new branch into the parent + node. + +E) Locks + +Release parent and (if taken) parent-parent locks. +rcu_read_unlock() + + +- Leaf removal + +A) Lookup + +rcu_read_lock() +RCU-lookup leaf to remove. If leaf is missing, fail and release rcu +read lock. + +B) Lock + +RCU-lookup parent. Take the parent lock. If the parent needs to be +reallocated because it would be too large for the decremented number of +children, RCU-lookup parent-parent and take the parent-parent lock. Do +so recursively until no node reallocation is needed, or until root is +reached. + +For each lock taken, check if node is being gc'd. If gc'd, release lock, +re-RCU-lookup this node, and retry. + +C) Create + +The branch (or portion of branch) consisting of taken locks necessarily +has a simple node removal or update as operation to do on its top node. + +If the operation is a node removal, then, necessarily, the entire branch +under the node removal operation will simply disappear. No node +allocation is needed. + +Else, if the operation is a child node reallocation, the child node will +necessarily do a node removal. So _its_ entire child branch will +disappear. So reallocate this child node without the removed branch +(remember to decrement its nr children count). + +D) Populate + +No reallocation case: simply set the appropriate child pointer in the +topmost locked node to NULL. Decrement its nr children count. + +Reallocation case: set the child pointer in the topmost locked node to +the newly allocated node. +set old nodes gc flag. +call_rcu free for all old nodes. + +E) Locks + +Release all locks. +rcu_read_unlock() + + +For the various types of nodes: + +- sequential search (type A) + - RCU replacement: mutex + - Entry update: mutex + +- bitmap followed by pointer array (type B) + - RCU replacement: mutex + - Entry update: mutex + +- pigeon hole array (type C) + - RCU replacement: mutex + - Entry update: mutex diff --git a/include/Makefile.am b/include/Makefile.am index 2f68aa2..c6d312a 100644 --- a/include/Makefile.am +++ b/include/Makefile.am @@ -11,7 +11,7 @@ nobase_dist_include_HEADERS = urcu/compiler.h urcu/hlist.h urcu/list.h \ urcu/static/urcu.h urcu/static/urcu-pointer.h \ urcu/static/urcu-qsbr.h urcu/static/wfcqueue.h \ urcu/static/wfqueue.h urcu/static/wfstack.h \ - urcu/tls-compat.h urcu/debug.h + urcu/tls-compat.h urcu/debug.h urcu/rcuja.h # Don't distribute generated headers nobase_nodist_include_HEADERS = urcu/arch.h urcu/uatomic.h urcu/config.h diff --git a/include/urcu/rcuja.h b/include/urcu/rcuja.h new file mode 100644 index 0000000..82e272b --- /dev/null +++ b/include/urcu/rcuja.h @@ -0,0 +1,234 @@ +#ifndef _URCU_RCUJA_H +#define _URCU_RCUJA_H + +/* + * urcu/rcuja.h + * + * Userspace RCU library - RCU Judy Array + * + * Copyright 2012-2013 - Mathieu Desnoyers + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + * + * Include this file _after_ including your URCU flavor. + */ + +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Duplicate nodes with the same key are chained into a singly-linked + * list. The last item of this list has a NULL next pointer. + */ +struct cds_ja_node { + struct cds_ja_node *next; +}; + +struct cds_ja; + +/* + * cds_ja_node_init - initialize a judy array node + * @node: the node to initialize. + * + * This function is kept to be eventually used for debugging purposes + * (detection of memory corruption). + */ +static inline +void cds_ja_node_init(struct cds_ja_node *node) +{ +} + +/* + * Note: key UINT64_MAX is reserved internally for iteration. + */ + +/* + * cds_ja_lookup - look up by key. + * @ja: the Judy array. + * @key: key to look up. + * + * Returns the first node of a duplicate chain if a match is found, else + * returns NULL. + * A RCU read-side lock should be held across call to this function and + * use of its return value. + */ +struct cds_ja_node *cds_ja_lookup(struct cds_ja *ja, uint64_t key); + +/* + * cds_ja_lookup_below_equal - look up first node with key <= @key. + * @ja: the Judy array. + * @key: key to look up. + * @result_key: key found. + * + * Returns the first node of a duplicate chain if a node is present in + * the tree which has a key below or equal to @key, else returns NULL. + * A RCU read-side lock should be held across call to this function and + * use of its return value. + */ +struct cds_ja_node *cds_ja_lookup_below_equal(struct cds_ja *ja, + uint64_t key, uint64_t *result_key); + +/* + * cds_ja_lookup_above_equal - look up first node with key >= @key. + * @ja: the Judy array. + * @key: key to look up. + * @result_key: key found. + * + * Returns the first node of a duplicate chain if a node is present in + * the tree which has a key above or equal to @key, else returns NULL. + * A RCU read-side lock should be held across call to this function and + * use of its return value. + */ +struct cds_ja_node *cds_ja_lookup_above_equal(struct cds_ja *ja, + uint64_t key, uint64_t *result_key); + +/* + * cds_ja_add - Add @node at @key, allowing duplicates. + * @ja: the Judy array. + * @key: key at which @node should be added. + * @node: node to add. + * + * Returns 0 on success, negative error value on error. + * A RCU read-side lock should be held across call to this function. + */ +int cds_ja_add(struct cds_ja *ja, uint64_t key, + struct cds_ja_node *node); + +/* + * cds_ja_add_unique - Add @node at @key, without duplicates. + * @ja: the Judy array. + * @key: key at which @node should be added. + * @node: node to add. + * + * Returns @node if successfully added, else returns the already + * existing node (acts as a RCU lookup). + * A RCU read-side lock should be held across call to this function and + * use of its return value. + */ +struct cds_ja_node *cds_ja_add_unique(struct cds_ja *ja, uint64_t key, + struct cds_ja_node *node); + +/* + * cds_ja_del - Remove @node at @key. + * @ja: the Judy array. + * @key: key at which @node is expected. + * @node: node to remove. + * + * Returns 0 on success, negative error value on error. + * A RCU read-side lock should be held across call to this function. + */ +int cds_ja_del(struct cds_ja *ja, uint64_t key, + struct cds_ja_node *node); + +struct cds_ja *_cds_ja_new(unsigned int key_bits, + const struct rcu_flavor_struct *flavor); + +/* + * cds_ja_new - Create a Judy array. + * @key_bits: Number of bits for key. + * + * Returns non-NULL pointer on success, else NULL on error. @key_bits + * needs to be multiple of 8, either: 8, 16, 24, 32, 40, 48, 56, or 64. + */ +static inline +struct cds_ja *cds_ja_new(unsigned int key_bits) +{ + return _cds_ja_new(key_bits, &rcu_flavor); +} + +/* + * cds_ja_destroy - Destroy a Judy array. + * @ja: the Judy array. + * + * Returns 0 on success, negative error value on error. + * There should be no more concurrent add, delete, nor look-up performed + * on the Judy array while it is being destroyed (ensured by the caller). + * RCU read-side lock should _not_ be held when calling this function, + * however, QSBR threads need to be online. + */ +int cds_ja_destroy(struct cds_ja *ja); + +/* + * cds_ja_for_each_duplicate_rcu: Iterate through duplicates. + * @pos: struct cds_ja_node *, start of duplicate list and loop cursor. + * + * Iterate through duplicates returned by cds_ja_lookup*() + * This must be done while rcu_read_lock() is held. + * Receives a struct cds_ja_node * as parameter, which is used as start + * of duplicate list and loop cursor. + * _NOT_ safe against node removal within iteration. + */ +#define cds_ja_for_each_duplicate_rcu(pos) \ + for (; (pos) != NULL; (pos) = rcu_dereference((pos)->next)) + +/* + * cds_ja_for_each_duplicate_safe: Iterate through duplicates. + * @pos: struct cds_ja_node *, start of duplicate list and loop cursor. + * @p: struct cds_ja_node *, temporary pointer to next. + * + * Iterate through duplicates returned by cds_ja_lookup*(). + * Safe against node removal within iteration. + * This must be done while rcu_read_lock() is held. + */ +#define cds_ja_for_each_duplicate_safe_rcu(pos, p) \ + for (; (pos) != NULL ? \ + ((p) = rcu_dereference((pos)->next), 1) : 0; \ + (pos) = (p)) + +/* + * cds_ja_for_each_key_rcu: Iterate over all keys in ascending order. + * @ja: Judy array on which iteration should be done. + * @key: Key cursor, needs to be a uint64_t. + * @pos: struct cds_ja_node *, used as loop cursor. + * + * Iterate over all keys of a RCU Judy array (_not_ duplicates) in + * ascending order. + * This must be done while rcu_read_lock() is held. + * Safe against node removal during iteration. + */ +#define cds_ja_for_each_key_rcu(ja, key, pos) \ + for ((key) = 0; \ + ((key) != UINT64_MAX ? \ + ((pos) = cds_ja_lookup_above_equal(ja, key, &(key))) : 0); \ + (key)++) + +/* + * cds_ja_for_each_key_prev_rcu: Iterate over all keys in descending order. + * @ja: Judy array on which iteration should be done. + * @key: Key cursor, needs to be a uint64_t. + * @pos: struct cds_ja_node *, used as loop cursor. + * + * Iterate over all keys of a RCU Judy array (_not_ duplicates) in + * descending order. + * This must be done while rcu_read_lock() is held. + * Safe against node removal during iteration. + */ +#define cds_ja_for_each_key_prev_rcu(ja, key, pos) \ + for ((key) = UINT64_MAX - 1; \ + ((key) != UINT64_MAX ? \ + ((pos) = cds_ja_lookup_below_equal(ja, key, &(key))) : 0); \ + (key)--) + +#ifdef __cplusplus +} +#endif + +#endif /* _URCU_RCUJA_H */ diff --git a/src/Makefile.am b/src/Makefile.am index 6a2fd7a..0cd41e1 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -26,6 +26,8 @@ COMPAT+=compat_futex.c RCULFHASH = rculfhash.c rculfhash-mm-order.c rculfhash-mm-chunk.c \ rculfhash-mm-mmap.c +RCUJA = rcuja.c rcuja-shadow-nodes.c rcuja-internal.h + lib_LTLIBRARIES = liburcu-common.la \ liburcu.la liburcu-qsbr.la \ liburcu-mb.la liburcu-signal.la liburcu-bp.la \ @@ -55,7 +57,7 @@ liburcu_bp_la_SOURCES = urcu-bp.c urcu-pointer.c $(COMPAT) liburcu_bp_la_LIBADD = liburcu-common.la liburcu_cds_la_SOURCES = rculfqueue.c rculfstack.c lfstack.c \ - $(RCULFHASH) $(COMPAT) + $(RCULFHASH) $(RCUJA) $(COMPAT) liburcu_cds_la_LIBADD = liburcu-common.la pkgconfigdir = $(libdir)/pkgconfig diff --git a/src/rcuja-internal.h b/src/rcuja-internal.h new file mode 100644 index 0000000..6ffaeb1 --- /dev/null +++ b/src/rcuja-internal.h @@ -0,0 +1,277 @@ +#ifndef _URCU_RCUJA_INTERNAL_H +#define _URCU_RCUJA_INTERNAL_H + +/* + * rcuja/rcuja-internal.h + * + * Userspace RCU library - RCU Judy Array Internal Header + * + * Copyright (C) 2000 - 2002 Hewlett-Packard Company + * Copyright 2012-2013 - Mathieu Desnoyers + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include +#include +#include +#include +#include + +/* + * Number of least significant pointer bits reserved to represent the + * child type. + */ +#define JA_TYPE_BITS 3 +#define JA_TYPE_MAX_NR (1UL << JA_TYPE_BITS) +#define JA_TYPE_MASK (JA_TYPE_MAX_NR - 1) +#define JA_PTR_MASK (~JA_TYPE_MASK) + +#define JA_ENTRY_PER_NODE 256UL +#define JA_LOG2_BITS_PER_BYTE 3U +#define JA_BITS_PER_BYTE (1U << JA_LOG2_BITS_PER_BYTE) + +#define JA_POOL_1D_MASK ((JA_BITS_PER_BYTE - 1) << JA_TYPE_BITS) +#define JA_POOL_2D_MASK (JA_POOL_1D_MASK << JA_LOG2_BITS_PER_BYTE) + +#define JA_MAX_DEPTH 9 /* Maximum depth, including leafs */ + +/* + * Entry for NULL node is at index 8 of the table. It is never encoded + * in flags. + */ +#define NODE_INDEX_NULL 8 + +/* + * Number of removals needed on a fallback node before we try to shrink + * it. + */ +#define JA_FALLBACK_REMOVAL_COUNT 8 + +/* Never declared. Opaque type used to store flagged node pointers. */ +struct cds_ja_inode_flag; +struct cds_ja_inode; + +/* + * Shadow node contains mutex and call_rcu head associated with a node. + */ +struct cds_ja_shadow_node { + struct cds_lfht_node ht_node; /* hash table node */ + struct cds_ja_inode_flag *node_flag; /* reverse mapping and hash table key */ + /* + * mutual exclusion on all nodes belonging to the same tree + * position (e.g. both nodes before and after recompaction + * use the same lock). + */ + pthread_mutex_t *lock; + unsigned int nr_child; /* number of children in node */ + struct rcu_head head; /* for deferred node and shadow node reclaim */ + int fallback_removal_count; /* removals left keeping fallback */ + int level; /* level in the tree */ + struct cds_ja *ja; /* toplevel judy array */ +}; + +struct cds_ja { + struct cds_ja_inode_flag *root; + unsigned int tree_depth; + uint64_t key_max; + /* + * We use a hash table to associate node keys to their + * respective shadow node. This helps reducing lookup hot path + * cache footprint, especially for very small nodes. + */ + struct cds_lfht *ht; + unsigned long nr_fallback; /* Number of fallback nodes used */ + + /* For debugging */ + unsigned long node_fallback_count_distribution[JA_ENTRY_PER_NODE]; + unsigned long nr_nodes_allocated, nr_nodes_freed; +}; + +static inline +struct cds_ja_inode_flag *ja_node_flag(struct cds_ja_inode *node, + unsigned long type) +{ + assert(type < (1UL << JA_TYPE_BITS)); + return (struct cds_ja_inode_flag *) (((unsigned long) node) | type); +} + +static inline +struct cds_ja_inode_flag *ja_node_flag_pool_1d(struct cds_ja_inode *node, + unsigned long type, unsigned long bitsel) +{ + assert(type < (1UL << JA_TYPE_BITS)); + assert(bitsel < JA_BITS_PER_BYTE); + return (struct cds_ja_inode_flag *) (((unsigned long) node) | (bitsel << JA_TYPE_BITS) | type); +} + +static inline +struct cds_ja_inode_flag *ja_node_flag_pool_2d(struct cds_ja_inode *node, + unsigned long type, unsigned int bitsel[2]) +{ + assert(type < (1UL << JA_TYPE_BITS)); + assert(bitsel[0] < JA_BITS_PER_BYTE); + assert(bitsel[1] < JA_BITS_PER_BYTE); + return (struct cds_ja_inode_flag *) (((unsigned long) node) | (bitsel[0] << (JA_TYPE_BITS + JA_LOG2_BITS_PER_BYTE)) | (bitsel[1] << JA_TYPE_BITS) | type); +} + +static inline +unsigned long ja_node_pool_1d_bitsel(struct cds_ja_inode_flag *node) +{ + return ((unsigned long) node & JA_POOL_1D_MASK) >> JA_TYPE_BITS; +} + +static inline +void ja_node_pool_2d_bitsel(struct cds_ja_inode_flag *node, unsigned long *bits) +{ + bits[0] = ((unsigned long) node & JA_POOL_2D_MASK) >> (JA_TYPE_BITS + JA_LOG2_BITS_PER_BYTE); + bits[1] = ((unsigned long) node & JA_POOL_1D_MASK) >> JA_TYPE_BITS; +} + +/* Hardcoded pool indexes for fast path */ +#define RCU_JA_POOL_IDX_5 5 +#define RCU_JA_POOL_IDX_6 6 +static inline +struct cds_ja_inode *ja_node_ptr(struct cds_ja_inode_flag *node) +{ + unsigned long v, type_idx; + + if (!node) + return NULL; /* RCU_JA_NULL */ + v = (unsigned long) node; + type_idx = v & JA_TYPE_MASK; + + switch (type_idx) { + case RCU_JA_POOL_IDX_5: + v &= ~(JA_POOL_1D_MASK | JA_TYPE_MASK); + break; + case RCU_JA_POOL_IDX_6: + v &= ~(JA_POOL_2D_MASK | JA_POOL_1D_MASK | JA_TYPE_MASK); + break; + default: + /* RCU_JA_LINEAR or RCU_JA_PIGEON */ + v &= JA_PTR_MASK; + break; + } + return (struct cds_ja_inode *) v; +} + +__attribute__((visibility("protected"))) +unsigned long ja_node_type(struct cds_ja_inode_flag *node); + +__attribute__((visibility("protected"))) +void rcuja_free_all_children(struct cds_ja_shadow_node *shadow_node, + struct cds_ja_inode_flag *node_flag); + +__attribute__((visibility("protected"))) +struct cds_ja_shadow_node *rcuja_shadow_lookup_lock(struct cds_lfht *ht, + struct cds_ja_inode_flag *node_flag); + +__attribute__((visibility("protected"))) +void rcuja_shadow_unlock(struct cds_ja_shadow_node *shadow_node); + +__attribute__((visibility("protected"))) +struct cds_ja_shadow_node *rcuja_shadow_set(struct cds_lfht *ht, + struct cds_ja_inode_flag *new_node_flag, + struct cds_ja_shadow_node *inherit_from, + struct cds_ja *ja, int level); + +/* rcuja_shadow_clear flags */ +enum { + RCUJA_SHADOW_CLEAR_FREE_NODE = (1U << 0), + RCUJA_SHADOW_CLEAR_FREE_LOCK = (1U << 1), +}; + +__attribute__((visibility("protected"))) +int rcuja_shadow_clear(struct cds_lfht *ht, + struct cds_ja_inode_flag *node_flag, + struct cds_ja_shadow_node *shadow_node, + unsigned int flags); + +__attribute__((visibility("protected"))) +void rcuja_shadow_prune(struct cds_lfht *ht, + unsigned int flags); + +__attribute__((visibility("protected"))) +struct cds_lfht *rcuja_create_ht(const struct rcu_flavor_struct *flavor); + +__attribute__((visibility("protected"))) +int rcuja_delete_ht(struct cds_lfht *ht); + +__attribute__((visibility("protected"))) +void free_cds_ja_node(struct cds_ja *ja, struct cds_ja_inode *node); + +/* + * Iterate through duplicates returned by cds_ja_lookup*() + * Receives a struct cds_ja_node * as parameter, which is used as start + * of duplicate list and loop cursor. + */ +#define cds_ja_for_each_duplicate(pos) \ + for (; (pos) != NULL; (pos) = (pos)->next) + +//#define DEBUG +//#define DEBUG_COUNTERS + +#ifdef __linux__ +#include +#endif + +#if defined(_syscall0) +_syscall0(pid_t, gettid) +#elif defined(__NR_gettid) +static inline pid_t gettid(void) +{ + return syscall(__NR_gettid); +} +#else +#warning "use pid as tid" +static inline pid_t gettid(void) +{ + return getpid(); +} +#endif + +#ifdef DEBUG +#define dbg_printf(fmt, args...) \ + fprintf(stderr, "[debug rcuja %lu %s()@%s:%u] " fmt, \ + (unsigned long) gettid(), __func__, \ + __FILE__, __LINE__, ## args) + +#else +#define dbg_printf(fmt, args...) \ +do { \ + /* do nothing but check printf format */ \ + if (0) \ + fprintf(stderr, "[debug rcuja %lu %s()@%s:%u] " fmt, \ + (unsigned long) gettid(), __func__, \ + __FILE__, __LINE__, ## args); \ +} while (0) +#endif + +#ifdef DEBUG_COUNTERS +static inline +int ja_debug_counters(void) +{ + return 1; +} +#else +static inline +int ja_debug_counters(void) +{ + return 0; +} +#endif + +#endif /* _URCU_RCUJA_INTERNAL_H */ diff --git a/src/rcuja-shadow-nodes.c b/src/rcuja-shadow-nodes.c new file mode 100644 index 0000000..19cd143 --- /dev/null +++ b/src/rcuja-shadow-nodes.c @@ -0,0 +1,443 @@ +/* + * rcuja/rcuja-hashtable.c + * + * Userspace RCU library - RCU Judy Array Shadow Node Hash Table + * + * Copyright 2012-2013 - Mathieu Desnoyers + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define _LGPL_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rcuja-internal.h" + +static unsigned long hash_seed; + +/* + * Hash function + * Source: http://burtleburtle.net/bob/c/lookup3.c + * Originally Public Domain + */ + +#define rot(x, k) (((x) << (k)) | ((x) >> (32 - (k)))) + +#define mix(a, b, c) \ +do { \ + a -= c; a ^= rot(c, 4); c += b; \ + b -= a; b ^= rot(a, 6); a += c; \ + c -= b; c ^= rot(b, 8); b += a; \ + a -= c; a ^= rot(c, 16); c += b; \ + b -= a; b ^= rot(a, 19); a += c; \ + c -= b; c ^= rot(b, 4); b += a; \ +} while (0) + +#define final(a, b, c) \ +{ \ + c ^= b; c -= rot(b, 14); \ + a ^= c; a -= rot(c, 11); \ + b ^= a; b -= rot(a, 25); \ + c ^= b; c -= rot(b, 16); \ + a ^= c; a -= rot(c, 4);\ + b ^= a; b -= rot(a, 14); \ + c ^= b; c -= rot(b, 24); \ +} + +static inline __attribute__((unused)) +uint32_t hash_u32( + const uint32_t *k, /* the key, an array of uint32_t values */ + size_t length, /* the length of the key, in uint32_ts */ + uint32_t initval) /* the previous hash, or an arbitrary value */ +{ + uint32_t a, b, c; + + /* Set up the internal state */ + a = b = c = 0xdeadbeef + (((uint32_t) length) << 2) + initval; + + /*----------------------------------------- handle most of the key */ + while (length > 3) { + a += k[0]; + b += k[1]; + c += k[2]; + mix(a, b, c); + length -= 3; + k += 3; + } + + /*----------------------------------- handle the last 3 uint32_t's */ + switch (length) { /* all the case statements fall through */ + case 3: c += k[2]; + case 2: b += k[1]; + case 1: a += k[0]; + final(a, b, c); + case 0: /* case 0: nothing left to add */ + break; + } + /*---------------------------------------------- report the result */ + return c; +} + +static inline +void hashword2( + const uint32_t *k, /* the key, an array of uint32_t values */ + size_t length, /* the length of the key, in uint32_ts */ + uint32_t *pc, /* IN: seed OUT: primary hash value */ + uint32_t *pb) /* IN: more seed OUT: secondary hash value */ +{ + uint32_t a, b, c; + + /* Set up the internal state */ + a = b = c = 0xdeadbeef + ((uint32_t) (length << 2)) + *pc; + c += *pb; + + /*----------------------------------------- handle most of the key */ + while (length > 3) { + a += k[0]; + b += k[1]; + c += k[2]; + mix(a, b, c); + length -= 3; + k += 3; + } + + /*----------------------------------- handle the last 3 uint32_t's */ + switch (length) { /* all the case statements fall through */ + case 3: c += k[2]; + case 2: b += k[1]; + case 1: a += k[0]; + final(a, b, c); + case 0: /* case 0: nothing left to add */ + break; + } + /*---------------------------------------------- report the result */ + *pc = c; + *pb = b; +} + +#if (CAA_BITS_PER_LONG == 32) +static +unsigned long hash_pointer(const void *_key, unsigned long seed) +{ + unsigned int key = (unsigned int) _key; + + return hash_u32(&key, 1, seed); +} +#else +static +unsigned long hash_pointer(const void *_key, unsigned long seed) +{ + union { + uint64_t v64; + uint32_t v32[2]; + } v; + union { + uint64_t v64; + uint32_t v32[2]; + } key; + + v.v64 = (uint64_t) seed; + key.v64 = (uint64_t) _key; + hashword2(key.v32, 2, &v.v32[0], &v.v32[1]); + return v.v64; +} +#endif + +static +int match_pointer(struct cds_lfht_node *node, const void *key) +{ + struct cds_ja_shadow_node *shadow = + caa_container_of(node, struct cds_ja_shadow_node, ht_node); + + return (key == shadow->node_flag); +} + +__attribute__((visibility("protected"))) +struct cds_ja_shadow_node *rcuja_shadow_lookup_lock(struct cds_lfht *ht, + struct cds_ja_inode_flag *node_flag) +{ + struct cds_lfht_iter iter; + struct cds_lfht_node *lookup_node; + struct cds_ja_shadow_node *shadow_node; + const struct rcu_flavor_struct *flavor; + int ret; + + flavor = cds_lfht_rcu_flavor(ht); + flavor->read_lock(); + cds_lfht_lookup(ht, hash_pointer(node_flag, hash_seed), + match_pointer, node_flag, &iter); + + lookup_node = cds_lfht_iter_get_node(&iter); + if (!lookup_node) { + shadow_node = NULL; + goto rcu_unlock; + } + shadow_node = caa_container_of(lookup_node, + struct cds_ja_shadow_node, ht_node); + dbg_printf("Lock %p\n", shadow_node->lock); + ret = pthread_mutex_lock(shadow_node->lock); + assert(!ret); + if (cds_lfht_is_node_deleted(lookup_node)) { + ret = pthread_mutex_unlock(shadow_node->lock); + assert(!ret); + shadow_node = NULL; + } +rcu_unlock: + flavor->read_unlock(); + return shadow_node; +} + +__attribute__((visibility("protected"))) +void rcuja_shadow_unlock(struct cds_ja_shadow_node *shadow_node) +{ + int ret; + + dbg_printf("Unlock %p\n", shadow_node->lock); + ret = pthread_mutex_unlock(shadow_node->lock); + assert(!ret); +} + +__attribute__((visibility("protected"))) +struct cds_ja_shadow_node *rcuja_shadow_set(struct cds_lfht *ht, + struct cds_ja_inode_flag *new_node_flag, + struct cds_ja_shadow_node *inherit_from, + struct cds_ja *ja, int level) +{ + struct cds_ja_shadow_node *shadow_node; + struct cds_lfht_node *ret_node; + const struct rcu_flavor_struct *flavor; + + shadow_node = calloc(sizeof(*shadow_node), 1); + if (!shadow_node) + return NULL; + + shadow_node->node_flag = new_node_flag; + shadow_node->ja = ja; + /* + * Lock can be inherited from previous node at this position. + */ + if (inherit_from) { + shadow_node->lock = inherit_from->lock; + shadow_node->level = inherit_from->level; + } else { + shadow_node->lock = calloc(sizeof(*shadow_node->lock), 1); + if (!shadow_node->lock) { + free(shadow_node); + return NULL; + } + pthread_mutex_init(shadow_node->lock, NULL); + shadow_node->level = level; + } + + flavor = cds_lfht_rcu_flavor(ht); + flavor->read_lock(); + ret_node = cds_lfht_add_unique(ht, + hash_pointer(new_node_flag, hash_seed), + match_pointer, + new_node_flag, + &shadow_node->ht_node); + flavor->read_unlock(); + + if (ret_node != &shadow_node->ht_node) { + free(shadow_node); + return NULL; + } + return shadow_node; +} + +static +void free_shadow_node(struct rcu_head *head) +{ + struct cds_ja_shadow_node *shadow_node = + caa_container_of(head, struct cds_ja_shadow_node, head); + free(shadow_node); +} + +static +void free_shadow_node_and_node(struct rcu_head *head) +{ + struct cds_ja_shadow_node *shadow_node = + caa_container_of(head, struct cds_ja_shadow_node, head); + free_cds_ja_node(shadow_node->ja, ja_node_ptr(shadow_node->node_flag)); + free(shadow_node); +} + +static +void free_shadow_node_and_lock(struct rcu_head *head) +{ + struct cds_ja_shadow_node *shadow_node = + caa_container_of(head, struct cds_ja_shadow_node, head); + free(shadow_node->lock); + free(shadow_node); +} + +static +void free_shadow_node_and_node_and_lock(struct rcu_head *head) +{ + struct cds_ja_shadow_node *shadow_node = + caa_container_of(head, struct cds_ja_shadow_node, head); + assert(shadow_node->level); + free_cds_ja_node(shadow_node->ja, ja_node_ptr(shadow_node->node_flag)); + free(shadow_node->lock); + free(shadow_node); +} + +__attribute__((visibility("protected"))) +int rcuja_shadow_clear(struct cds_lfht *ht, + struct cds_ja_inode_flag *node_flag, + struct cds_ja_shadow_node *shadow_node, + unsigned int flags) +{ + struct cds_lfht_iter iter; + struct cds_lfht_node *lookup_node; + const struct rcu_flavor_struct *flavor; + int ret, lockret; + int lookup_shadow = 0; + + flavor = cds_lfht_rcu_flavor(ht); + flavor->read_lock(); + + cds_lfht_lookup(ht, hash_pointer(node_flag, hash_seed), + match_pointer, node_flag, &iter); + lookup_node = cds_lfht_iter_get_node(&iter); + if (!lookup_node) { + ret = -ENOENT; + goto rcu_unlock; + } + + if (!shadow_node) { + shadow_node = caa_container_of(lookup_node, + struct cds_ja_shadow_node, ht_node); + lockret = pthread_mutex_lock(shadow_node->lock); + assert(!lockret); + lookup_shadow = 1; + } + + /* + * Holding the mutex across deletion, and by also re-checking if + * the node is deleted with mutex held at lookup ensure that we + * don't let RCU JA use a node being removed. + */ + ret = cds_lfht_del(ht, lookup_node); + if (ret) + goto unlock; + if ((flags & RCUJA_SHADOW_CLEAR_FREE_NODE) + && shadow_node->level) { + if (flags & RCUJA_SHADOW_CLEAR_FREE_LOCK) { + flavor->update_call_rcu(&shadow_node->head, + free_shadow_node_and_node_and_lock); + } else { + flavor->update_call_rcu(&shadow_node->head, + free_shadow_node_and_node); + } + } else { + if (flags & RCUJA_SHADOW_CLEAR_FREE_LOCK) { + flavor->update_call_rcu(&shadow_node->head, + free_shadow_node_and_lock); + } else { + flavor->update_call_rcu(&shadow_node->head, + free_shadow_node); + } + } +unlock: + if (lookup_shadow) { + lockret = pthread_mutex_unlock(shadow_node->lock); + assert(!lockret); + } +rcu_unlock: + flavor->read_unlock(); + + return ret; +} + +/* + * Delete all shadow nodes and nodes from hash table, along with their + * associated lock. + */ +__attribute__((visibility("protected"))) +void rcuja_shadow_prune(struct cds_lfht *ht, + unsigned int flags) +{ + const struct rcu_flavor_struct *flavor; + struct cds_ja_shadow_node *shadow_node; + struct cds_lfht_iter iter; + int ret, lockret; + + flavor = cds_lfht_rcu_flavor(ht); + /* + * Read-side lock is needed to ensure hash table node existence + * vs concurrent resize. + */ + flavor->read_lock(); + cds_lfht_for_each_entry(ht, &iter, shadow_node, ht_node) { + lockret = pthread_mutex_lock(shadow_node->lock); + assert(!lockret); + + ret = cds_lfht_del(ht, &shadow_node->ht_node); + if (ret) + goto unlock; + if ((flags & RCUJA_SHADOW_CLEAR_FREE_NODE) + && shadow_node->level) { + if (flags & RCUJA_SHADOW_CLEAR_FREE_LOCK) { + flavor->update_call_rcu(&shadow_node->head, + free_shadow_node_and_node_and_lock); + } else { + flavor->update_call_rcu(&shadow_node->head, + free_shadow_node_and_node); + } + } else { + if (flags & RCUJA_SHADOW_CLEAR_FREE_LOCK) { + flavor->update_call_rcu(&shadow_node->head, + free_shadow_node_and_lock); + } else { + flavor->update_call_rcu(&shadow_node->head, + free_shadow_node); + } + } + unlock: + lockret = pthread_mutex_unlock(shadow_node->lock); + assert(!lockret); + } + flavor->read_unlock(); +} + +__attribute__((visibility("protected"))) +struct cds_lfht *rcuja_create_ht(const struct rcu_flavor_struct *flavor) +{ + return _cds_lfht_new(1, 1, 0, + CDS_LFHT_AUTO_RESIZE | CDS_LFHT_ACCOUNTING, + NULL, flavor, NULL); +} + +__attribute__((visibility("protected"))) +int rcuja_delete_ht(struct cds_lfht *ht) +{ + return cds_lfht_destroy(ht, NULL); +} + +__attribute__((constructor)) +void rcuja_ht_init(void) +{ + hash_seed = (unsigned long) time(NULL); +} diff --git a/src/rcuja.c b/src/rcuja.c new file mode 100644 index 0000000..92d8d9b --- /dev/null +++ b/src/rcuja.c @@ -0,0 +1,2757 @@ +/* + * rcuja/rcuja.c + * + * Userspace RCU library - RCU Judy Array + * + * Copyright (C) 2000 - 2002 Hewlett-Packard Company + * Copyright 2012-2013 - Mathieu Desnoyers + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define _LGPL_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rcuja-internal.h" + +#ifndef abs +#define abs_int(a) ((int) (a) > 0 ? (int) (a) : -((int) (a))) +#endif + +enum cds_ja_type_class { + RCU_JA_LINEAR = 0, /* Type A */ + /* 32-bit: 1 to 25 children, 8 to 128 bytes */ + /* 64-bit: 1 to 28 children, 16 to 256 bytes */ + RCU_JA_POOL = 1, /* Type B */ + /* 32-bit: 26 to 100 children, 256 to 512 bytes */ + /* 64-bit: 29 to 112 children, 512 to 1024 bytes */ + RCU_JA_PIGEON = 2, /* Type C */ + /* 32-bit: 101 to 256 children, 1024 bytes */ + /* 64-bit: 113 to 256 children, 2048 bytes */ + /* Leaf nodes are implicit from their height in the tree */ + RCU_JA_NR_TYPES, + + RCU_JA_NULL, /* not an encoded type, but keeps code regular */ +}; + +struct cds_ja_type { + enum cds_ja_type_class type_class; + uint16_t min_child; /* minimum number of children: 1 to 256 */ + uint16_t max_child; /* maximum number of children: 1 to 256 */ + uint16_t max_linear_child; /* per-pool max nr. children: 1 to 256 */ + uint16_t order; /* node size is (1 << order), in bytes */ + uint16_t nr_pool_order; /* number of pools */ + uint16_t pool_size_order; /* pool size */ +}; + +/* + * Iteration on the array to find the right node size for the number of + * children stops when it reaches .max_child == 256 (this is the largest + * possible node size, which contains 256 children). + * The min_child overlaps with the previous max_child to provide an + * hysteresis loop to reallocation for patterns of cyclic add/removal + * within the same node. + * The node the index within the following arrays is represented on 3 + * bits. It identifies the node type, min/max number of children, and + * the size order. + * The max_child values for the RCU_JA_POOL below result from + * statistical approximation: over million populations, the max_child + * covers between 97% and 99% of the populations generated. Therefore, a + * fallback should exist to cover the rare extreme population unbalance + * cases, but it will not have a major impact on speed nor space + * consumption, since those are rare cases. + */ + +#if (CAA_BITS_PER_LONG < 64) +/* 32-bit pointers */ +enum { + ja_type_0_max_child = 1, + ja_type_1_max_child = 3, + ja_type_2_max_child = 6, + ja_type_3_max_child = 12, + ja_type_4_max_child = 25, + ja_type_5_max_child = 48, + ja_type_6_max_child = 92, + ja_type_7_max_child = 256, + ja_type_8_max_child = 0, /* NULL */ +}; + +enum { + ja_type_0_max_linear_child = 1, + ja_type_1_max_linear_child = 3, + ja_type_2_max_linear_child = 6, + ja_type_3_max_linear_child = 12, + ja_type_4_max_linear_child = 25, + ja_type_5_max_linear_child = 24, + ja_type_6_max_linear_child = 23, +}; + +enum { + ja_type_5_nr_pool_order = 1, + ja_type_6_nr_pool_order = 2, +}; + +const struct cds_ja_type ja_types[] = { + { .type_class = RCU_JA_LINEAR, .min_child = 1, .max_child = ja_type_0_max_child, .max_linear_child = ja_type_0_max_linear_child, .order = 3, }, + { .type_class = RCU_JA_LINEAR, .min_child = 1, .max_child = ja_type_1_max_child, .max_linear_child = ja_type_1_max_linear_child, .order = 4, }, + { .type_class = RCU_JA_LINEAR, .min_child = 3, .max_child = ja_type_2_max_child, .max_linear_child = ja_type_2_max_linear_child, .order = 5, }, + { .type_class = RCU_JA_LINEAR, .min_child = 4, .max_child = ja_type_3_max_child, .max_linear_child = ja_type_3_max_linear_child, .order = 6, }, + { .type_class = RCU_JA_LINEAR, .min_child = 10, .max_child = ja_type_4_max_child, .max_linear_child = ja_type_4_max_linear_child, .order = 7, }, + + /* Pools may fill sooner than max_child */ + /* This pool is hardcoded at index 5. See ja_node_ptr(). */ + { .type_class = RCU_JA_POOL, .min_child = 20, .max_child = ja_type_5_max_child, .max_linear_child = ja_type_5_max_linear_child, .order = 8, .nr_pool_order = ja_type_5_nr_pool_order, .pool_size_order = 7, }, + /* This pool is hardcoded at index 6. See ja_node_ptr(). */ + { .type_class = RCU_JA_POOL, .min_child = 45, .max_child = ja_type_6_max_child, .max_linear_child = ja_type_6_max_linear_child, .order = 9, .nr_pool_order = ja_type_6_nr_pool_order, .pool_size_order = 7, }, + + /* + * Upon node removal below min_child, if child pool is filled + * beyond capacity, we roll back to pigeon. + */ + { .type_class = RCU_JA_PIGEON, .min_child = 83, .max_child = ja_type_7_max_child, .order = 10, }, + + { .type_class = RCU_JA_NULL, .min_child = 0, .max_child = ja_type_8_max_child, }, +}; +#else /* !(CAA_BITS_PER_LONG < 64) */ +/* 64-bit pointers */ +enum { + ja_type_0_max_child = 1, + ja_type_1_max_child = 3, + ja_type_2_max_child = 7, + ja_type_3_max_child = 14, + ja_type_4_max_child = 28, + ja_type_5_max_child = 54, + ja_type_6_max_child = 104, + ja_type_7_max_child = 256, + ja_type_8_max_child = 256, +}; + +enum { + ja_type_0_max_linear_child = 1, + ja_type_1_max_linear_child = 3, + ja_type_2_max_linear_child = 7, + ja_type_3_max_linear_child = 14, + ja_type_4_max_linear_child = 28, + ja_type_5_max_linear_child = 27, + ja_type_6_max_linear_child = 26, +}; + +enum { + ja_type_5_nr_pool_order = 1, + ja_type_6_nr_pool_order = 2, +}; + +const struct cds_ja_type ja_types[] = { + { .type_class = RCU_JA_LINEAR, .min_child = 1, .max_child = ja_type_0_max_child, .max_linear_child = ja_type_0_max_linear_child, .order = 4, }, + { .type_class = RCU_JA_LINEAR, .min_child = 1, .max_child = ja_type_1_max_child, .max_linear_child = ja_type_1_max_linear_child, .order = 5, }, + { .type_class = RCU_JA_LINEAR, .min_child = 3, .max_child = ja_type_2_max_child, .max_linear_child = ja_type_2_max_linear_child, .order = 6, }, + { .type_class = RCU_JA_LINEAR, .min_child = 5, .max_child = ja_type_3_max_child, .max_linear_child = ja_type_3_max_linear_child, .order = 7, }, + { .type_class = RCU_JA_LINEAR, .min_child = 10, .max_child = ja_type_4_max_child, .max_linear_child = ja_type_4_max_linear_child, .order = 8, }, + + /* Pools may fill sooner than max_child. */ + /* This pool is hardcoded at index 5. See ja_node_ptr(). */ + { .type_class = RCU_JA_POOL, .min_child = 22, .max_child = ja_type_5_max_child, .max_linear_child = ja_type_5_max_linear_child, .order = 9, .nr_pool_order = ja_type_5_nr_pool_order, .pool_size_order = 8, }, + /* This pool is hardcoded at index 6. See ja_node_ptr(). */ + { .type_class = RCU_JA_POOL, .min_child = 51, .max_child = ja_type_6_max_child, .max_linear_child = ja_type_6_max_linear_child, .order = 10, .nr_pool_order = ja_type_6_nr_pool_order, .pool_size_order = 8, }, + + /* + * Upon node removal below min_child, if child pool is filled + * beyond capacity, we roll back to pigeon. + */ + { .type_class = RCU_JA_PIGEON, .min_child = 95, .max_child = ja_type_7_max_child, .order = 11, }, + + { .type_class = RCU_JA_NULL, .min_child = 0, .max_child = ja_type_8_max_child, }, +}; +#endif /* !(BITS_PER_LONG < 64) */ + +static inline __attribute__((unused)) +void static_array_size_check(void) +{ + CAA_BUILD_BUG_ON(CAA_ARRAY_SIZE(ja_types) < JA_TYPE_MAX_NR); +} + +/* + * The cds_ja_node contains the compressed node data needed for + * read-side. For linear and pool node configurations, it starts with a + * byte counting the number of children in the node. Then, the + * node-specific data is placed. + * The node mutex, if any is needed, protecting concurrent updated of + * each node is placed in a separate hash table indexed by node address. + * For the pigeon configuration, the number of children is also kept in + * a separate hash table, indexed by node address, because it is only + * required for updates. + */ + +#define DECLARE_LINEAR_NODE(index) \ + struct { \ + uint8_t nr_child; \ + uint8_t child_value[ja_type_## index ##_max_linear_child]; \ + struct cds_ja_inode_flag *child_ptr[ja_type_## index ##_max_linear_child]; \ + } + +#define DECLARE_POOL_NODE(index) \ + struct { \ + struct { \ + uint8_t nr_child; \ + uint8_t child_value[ja_type_## index ##_max_linear_child]; \ + struct cds_ja_inode_flag *child_ptr[ja_type_## index ##_max_linear_child]; \ + } linear[1U << ja_type_## index ##_nr_pool_order]; \ + } + +struct cds_ja_inode { + union { + /* Linear configuration */ + DECLARE_LINEAR_NODE(0) conf_0; + DECLARE_LINEAR_NODE(1) conf_1; + DECLARE_LINEAR_NODE(2) conf_2; + DECLARE_LINEAR_NODE(3) conf_3; + DECLARE_LINEAR_NODE(4) conf_4; + + /* Pool configuration */ + DECLARE_POOL_NODE(5) conf_5; + DECLARE_POOL_NODE(6) conf_6; + + /* Pigeon configuration */ + struct { + struct cds_ja_inode_flag *child[ja_type_7_max_child]; + } conf_7; + /* data aliasing nodes for computed accesses */ + uint8_t data[sizeof(struct cds_ja_inode_flag *) * ja_type_7_max_child]; + } u; +}; + +enum ja_recompact { + JA_RECOMPACT_ADD_SAME, + JA_RECOMPACT_ADD_NEXT, + JA_RECOMPACT_DEL, +}; + +enum ja_lookup_inequality { + JA_LOOKUP_BE, + JA_LOOKUP_AE, +}; + +enum ja_direction { + JA_LEFT, + JA_RIGHT, + JA_LEFTMOST, + JA_RIGHTMOST, +}; + +static +struct cds_ja_inode *_ja_node_mask_ptr(struct cds_ja_inode_flag *node) +{ + return (struct cds_ja_inode *) (((unsigned long) node) & JA_PTR_MASK); +} + +unsigned long ja_node_type(struct cds_ja_inode_flag *node) +{ + unsigned long type; + + if (_ja_node_mask_ptr(node) == NULL) { + return NODE_INDEX_NULL; + } + type = (unsigned int) ((unsigned long) node & JA_TYPE_MASK); + assert(type < (1UL << JA_TYPE_BITS)); + return type; +} + +static +struct cds_ja_inode *alloc_cds_ja_node(struct cds_ja *ja, + const struct cds_ja_type *ja_type) +{ + size_t len = 1U << ja_type->order; + void *p; + int ret; + + ret = posix_memalign(&p, len, len); + if (ret || !p) { + return NULL; + } + memset(p, 0, len); + if (ja_debug_counters()) + uatomic_inc(&ja->nr_nodes_allocated); + return p; +} + +void free_cds_ja_node(struct cds_ja *ja, struct cds_ja_inode *node) +{ + free(node); + if (ja_debug_counters() && node) + uatomic_inc(&ja->nr_nodes_freed); +} + +#define __JA_ALIGN_MASK(v, mask) (((v) + (mask)) & ~(mask)) +#define JA_ALIGN(v, align) __JA_ALIGN_MASK(v, (typeof(v)) (align) - 1) +#define __JA_FLOOR_MASK(v, mask) ((v) & ~(mask)) +#define JA_FLOOR(v, align) __JA_FLOOR_MASK(v, (typeof(v)) (align) - 1) + +static +uint8_t *align_ptr_size(uint8_t *ptr) +{ + return (uint8_t *) JA_ALIGN((unsigned long) ptr, sizeof(void *)); +} + +static +uint8_t ja_linear_node_get_nr_child(const struct cds_ja_type *type, + struct cds_ja_inode *node) +{ + assert(type->type_class == RCU_JA_LINEAR || type->type_class == RCU_JA_POOL); + return rcu_dereference(node->u.data[0]); +} + +/* + * The order in which values and pointers are does does not matter: if + * a value is missing, we return NULL. If a value is there, but its + * associated pointers is still NULL, we return NULL too. + */ +static +struct cds_ja_inode_flag *ja_linear_node_get_nth(const struct cds_ja_type *type, + struct cds_ja_inode *node, + struct cds_ja_inode_flag ***node_flag_ptr, + uint8_t n) +{ + uint8_t nr_child; + uint8_t *values; + struct cds_ja_inode_flag **pointers; + struct cds_ja_inode_flag *ptr; + unsigned int i; + + assert(type->type_class == RCU_JA_LINEAR || type->type_class == RCU_JA_POOL); + + nr_child = ja_linear_node_get_nr_child(type, node); + cmm_smp_rmb(); /* read nr_child before values and pointers */ + assert(nr_child <= type->max_linear_child); + assert(type->type_class != RCU_JA_LINEAR || nr_child >= type->min_child); + + values = &node->u.data[1]; + for (i = 0; i < nr_child; i++) { + if (CMM_LOAD_SHARED(values[i]) == n) + break; + } + if (i >= nr_child) { + if (caa_unlikely(node_flag_ptr)) + *node_flag_ptr = NULL; + return NULL; + } + pointers = (struct cds_ja_inode_flag **) align_ptr_size(&values[type->max_linear_child]); + ptr = rcu_dereference(pointers[i]); + if (caa_unlikely(node_flag_ptr)) + *node_flag_ptr = &pointers[i]; + return ptr; +} + +static +struct cds_ja_inode_flag *ja_linear_node_get_direction(const struct cds_ja_type *type, + struct cds_ja_inode *node, + int n, uint8_t *result_key, + enum ja_direction dir) +{ + uint8_t nr_child; + uint8_t *values; + struct cds_ja_inode_flag **pointers; + struct cds_ja_inode_flag *ptr, *match_ptr = NULL; + unsigned int i; + int match_v; + + assert(type->type_class == RCU_JA_LINEAR || type->type_class == RCU_JA_POOL); + assert(dir == JA_LEFT || dir == JA_RIGHT); + + if (dir == JA_LEFT) { + match_v = -1; + } else { + match_v = JA_ENTRY_PER_NODE; + } + + nr_child = ja_linear_node_get_nr_child(type, node); + cmm_smp_rmb(); /* read nr_child before values and pointers */ + assert(nr_child <= type->max_linear_child); + assert(type->type_class != RCU_JA_LINEAR || nr_child >= type->min_child); + + values = &node->u.data[1]; + pointers = (struct cds_ja_inode_flag **) align_ptr_size(&values[type->max_linear_child]); + for (i = 0; i < nr_child; i++) { + unsigned int v; + + v = CMM_LOAD_SHARED(values[i]); + ptr = CMM_LOAD_SHARED(pointers[i]); + if (!ptr) + continue; + if (dir == JA_LEFT) { + if ((int) v < n && (int) v > match_v) { + match_v = v; + match_ptr = ptr; + } + } else { + if ((int) v > n && (int) v < match_v) { + match_v = v; + match_ptr = ptr; + } + } + } + + if (!match_ptr) { + return NULL; + } + assert(match_v >= 0 && match_v < JA_ENTRY_PER_NODE); + + *result_key = (uint8_t) match_v; + return rcu_dereference(match_ptr); +} + +static +void ja_linear_node_get_ith_pos(const struct cds_ja_type *type, + struct cds_ja_inode *node, + uint8_t i, + uint8_t *v, + struct cds_ja_inode_flag **iter) +{ + uint8_t *values; + struct cds_ja_inode_flag **pointers; + + assert(type->type_class == RCU_JA_LINEAR || type->type_class == RCU_JA_POOL); + assert(i < ja_linear_node_get_nr_child(type, node)); + + values = &node->u.data[1]; + *v = values[i]; + pointers = (struct cds_ja_inode_flag **) align_ptr_size(&values[type->max_linear_child]); + *iter = pointers[i]; +} + +static +struct cds_ja_inode_flag *ja_pool_node_get_nth(const struct cds_ja_type *type, + struct cds_ja_inode *node, + struct cds_ja_inode_flag *node_flag, + struct cds_ja_inode_flag ***node_flag_ptr, + uint8_t n) +{ + struct cds_ja_inode *linear; + + assert(type->type_class == RCU_JA_POOL); + + switch (type->nr_pool_order) { + case 1: + { + unsigned long bitsel, index; + + bitsel = ja_node_pool_1d_bitsel(node_flag); + assert(bitsel < CHAR_BIT); + index = ((unsigned long) n >> bitsel) & 0x1; + linear = (struct cds_ja_inode *) &node->u.data[index << type->pool_size_order]; + break; + } + case 2: + { + unsigned long bitsel[2], index[2], rindex; + + ja_node_pool_2d_bitsel(node_flag, bitsel); + assert(bitsel[0] < CHAR_BIT); + assert(bitsel[1] < CHAR_BIT); + index[0] = ((unsigned long) n >> bitsel[0]) & 0x1; + index[0] <<= 1; + index[1] = ((unsigned long) n >> bitsel[1]) & 0x1; + rindex = index[0] | index[1]; + linear = (struct cds_ja_inode *) &node->u.data[rindex << type->pool_size_order]; + break; + } + default: + linear = NULL; + assert(0); + } + return ja_linear_node_get_nth(type, linear, node_flag_ptr, n); +} + +static +struct cds_ja_inode *ja_pool_node_get_ith_pool(const struct cds_ja_type *type, + struct cds_ja_inode *node, + uint8_t i) +{ + assert(type->type_class == RCU_JA_POOL); + return (struct cds_ja_inode *) + &node->u.data[(unsigned int) i << type->pool_size_order]; +} + +static +struct cds_ja_inode_flag *ja_pool_node_get_direction(const struct cds_ja_type *type, + struct cds_ja_inode *node, + int n, uint8_t *result_key, + enum ja_direction dir) +{ + unsigned int pool_nr; + int match_v; + struct cds_ja_inode_flag *match_node_flag = NULL; + + assert(type->type_class == RCU_JA_POOL); + assert(dir == JA_LEFT || dir == JA_RIGHT); + + if (dir == JA_LEFT) { + match_v = -1; + } else { + match_v = JA_ENTRY_PER_NODE; + } + + for (pool_nr = 0; pool_nr < (1U << type->nr_pool_order); pool_nr++) { + struct cds_ja_inode *pool = + ja_pool_node_get_ith_pool(type, + node, pool_nr); + uint8_t nr_child = + ja_linear_node_get_nr_child(type, pool); + unsigned int j; + + for (j = 0; j < nr_child; j++) { + struct cds_ja_inode_flag *iter; + uint8_t v; + + ja_linear_node_get_ith_pos(type, pool, + j, &v, &iter); + if (!iter) + continue; + if (dir == JA_LEFT) { + if ((int) v < n && (int) v > match_v) { + match_v = v; + match_node_flag = iter; + } + } else { + if ((int) v > n && (int) v < match_v) { + match_v = v; + match_node_flag = iter; + } + } + } + } + if (match_node_flag) + *result_key = (uint8_t) match_v; + return match_node_flag; +} + +static +struct cds_ja_inode_flag *ja_pigeon_node_get_nth(const struct cds_ja_type *type, + struct cds_ja_inode *node, + struct cds_ja_inode_flag ***node_flag_ptr, + uint8_t n) +{ + struct cds_ja_inode_flag **child_node_flag_ptr; + struct cds_ja_inode_flag *child_node_flag; + + assert(type->type_class == RCU_JA_PIGEON); + child_node_flag_ptr = &((struct cds_ja_inode_flag **) node->u.data)[n]; + child_node_flag = rcu_dereference(*child_node_flag_ptr); + dbg_printf("ja_pigeon_node_get_nth child_node_flag_ptr %p\n", + child_node_flag_ptr); + if (caa_unlikely(node_flag_ptr)) + *node_flag_ptr = child_node_flag_ptr; + return child_node_flag; +} + +static +struct cds_ja_inode_flag *ja_pigeon_node_get_direction(const struct cds_ja_type *type, + struct cds_ja_inode *node, + int n, uint8_t *result_key, + enum ja_direction dir) +{ + struct cds_ja_inode_flag **child_node_flag_ptr; + struct cds_ja_inode_flag *child_node_flag; + int i; + + assert(type->type_class == RCU_JA_PIGEON); + assert(dir == JA_LEFT || dir == JA_RIGHT); + + if (dir == JA_LEFT) { + /* n - 1 is first value left of n */ + for (i = n - 1; i >= 0; i--) { + child_node_flag_ptr = &((struct cds_ja_inode_flag **) node->u.data)[i]; + child_node_flag = rcu_dereference(*child_node_flag_ptr); + if (child_node_flag) { + dbg_printf("ja_pigeon_node_get_left child_node_flag %p\n", + child_node_flag); + *result_key = (uint8_t) i; + return child_node_flag; + } + } + } else { + /* n + 1 is first value right of n */ + for (i = n + 1; i < JA_ENTRY_PER_NODE; i++) { + child_node_flag_ptr = &((struct cds_ja_inode_flag **) node->u.data)[i]; + child_node_flag = rcu_dereference(*child_node_flag_ptr); + if (child_node_flag) { + dbg_printf("ja_pigeon_node_get_right child_node_flag %p\n", + child_node_flag); + *result_key = (uint8_t) i; + return child_node_flag; + } + } + } + return NULL; +} + +static +struct cds_ja_inode_flag *ja_pigeon_node_get_ith_pos(const struct cds_ja_type *type, + struct cds_ja_inode *node, + uint8_t i) +{ + return ja_pigeon_node_get_nth(type, node, NULL, i); +} + +/* + * ja_node_get_nth: get nth item from a node. + * node_flag is already rcu_dereference'd. + */ +static +struct cds_ja_inode_flag *ja_node_get_nth(struct cds_ja_inode_flag *node_flag, + struct cds_ja_inode_flag ***node_flag_ptr, + uint8_t n) +{ + unsigned int type_index; + struct cds_ja_inode *node; + const struct cds_ja_type *type; + + node = ja_node_ptr(node_flag); + assert(node != NULL); + type_index = ja_node_type(node_flag); + type = &ja_types[type_index]; + + switch (type->type_class) { + case RCU_JA_LINEAR: + return ja_linear_node_get_nth(type, node, + node_flag_ptr, n); + case RCU_JA_POOL: + return ja_pool_node_get_nth(type, node, node_flag, + node_flag_ptr, n); + case RCU_JA_PIGEON: + return ja_pigeon_node_get_nth(type, node, + node_flag_ptr, n); + default: + assert(0); + return (void *) -1UL; + } +} + +static +struct cds_ja_inode_flag *ja_node_get_direction(struct cds_ja_inode_flag *node_flag, + int n, uint8_t *result_key, + enum ja_direction dir) +{ + unsigned int type_index; + struct cds_ja_inode *node; + const struct cds_ja_type *type; + + node = ja_node_ptr(node_flag); + assert(node != NULL); + type_index = ja_node_type(node_flag); + type = &ja_types[type_index]; + + switch (type->type_class) { + case RCU_JA_LINEAR: + return ja_linear_node_get_direction(type, node, n, result_key, dir); + case RCU_JA_POOL: + return ja_pool_node_get_direction(type, node, n, result_key, dir); + case RCU_JA_PIGEON: + return ja_pigeon_node_get_direction(type, node, n, result_key, dir); + default: + assert(0); + return (void *) -1UL; + } +} + +static +struct cds_ja_inode_flag *ja_node_get_leftright(struct cds_ja_inode_flag *node_flag, + unsigned int n, uint8_t *result_key, + enum ja_direction dir) +{ + return ja_node_get_direction(node_flag, n, result_key, dir); +} + +static +struct cds_ja_inode_flag *ja_node_get_minmax(struct cds_ja_inode_flag *node_flag, + uint8_t *result_key, + enum ja_direction dir) +{ + switch (dir) { + case JA_LEFTMOST: + return ja_node_get_direction(node_flag, + -1, result_key, JA_RIGHT); + case JA_RIGHTMOST: + return ja_node_get_direction(node_flag, + JA_ENTRY_PER_NODE, result_key, JA_LEFT); + default: + assert(0); + } +} + +static +int ja_linear_node_set_nth(const struct cds_ja_type *type, + struct cds_ja_inode *node, + struct cds_ja_shadow_node *shadow_node, + uint8_t n, + struct cds_ja_inode_flag *child_node_flag) +{ + uint8_t nr_child; + uint8_t *values, *nr_child_ptr; + struct cds_ja_inode_flag **pointers; + unsigned int i, unused = 0; + + assert(type->type_class == RCU_JA_LINEAR || type->type_class == RCU_JA_POOL); + + nr_child_ptr = &node->u.data[0]; + dbg_printf("linear set nth: n %u, nr_child_ptr %p\n", + (unsigned int) n, nr_child_ptr); + nr_child = *nr_child_ptr; + assert(nr_child <= type->max_linear_child); + + values = &node->u.data[1]; + pointers = (struct cds_ja_inode_flag **) align_ptr_size(&values[type->max_linear_child]); + /* Check if node value is already populated */ + for (i = 0; i < nr_child; i++) { + if (values[i] == n) { + if (pointers[i]) + return -EEXIST; + else + break; + } else { + if (!pointers[i]) + unused++; + } + } + if (i == nr_child && nr_child >= type->max_linear_child) { + if (unused) + return -ERANGE; /* recompact node */ + else + return -ENOSPC; /* No space left in this node type */ + } + + assert(pointers[i] == NULL); + rcu_assign_pointer(pointers[i], child_node_flag); + /* If we expanded the nr_child, increment it */ + if (i == nr_child) { + CMM_STORE_SHARED(values[nr_child], n); + /* write pointer and value before nr_child */ + cmm_smp_wmb(); + CMM_STORE_SHARED(*nr_child_ptr, nr_child + 1); + } + shadow_node->nr_child++; + dbg_printf("linear set nth: %u child, shadow: %u child, for node %p shadow %p\n", + (unsigned int) CMM_LOAD_SHARED(*nr_child_ptr), + (unsigned int) shadow_node->nr_child, + node, shadow_node); + + return 0; +} + +static +int ja_pool_node_set_nth(const struct cds_ja_type *type, + struct cds_ja_inode *node, + struct cds_ja_inode_flag *node_flag, + struct cds_ja_shadow_node *shadow_node, + uint8_t n, + struct cds_ja_inode_flag *child_node_flag) +{ + struct cds_ja_inode *linear; + + assert(type->type_class == RCU_JA_POOL); + + switch (type->nr_pool_order) { + case 1: + { + unsigned long bitsel, index; + + bitsel = ja_node_pool_1d_bitsel(node_flag); + assert(bitsel < CHAR_BIT); + index = ((unsigned long) n >> bitsel) & 0x1; + linear = (struct cds_ja_inode *) &node->u.data[index << type->pool_size_order]; + break; + } + case 2: + { + unsigned long bitsel[2], index[2], rindex; + + ja_node_pool_2d_bitsel(node_flag, bitsel); + assert(bitsel[0] < CHAR_BIT); + assert(bitsel[1] < CHAR_BIT); + index[0] = ((unsigned long) n >> bitsel[0]) & 0x1; + index[0] <<= 1; + index[1] = ((unsigned long) n >> bitsel[1]) & 0x1; + rindex = index[0] | index[1]; + linear = (struct cds_ja_inode *) &node->u.data[rindex << type->pool_size_order]; + break; + } + default: + linear = NULL; + assert(0); + } + + return ja_linear_node_set_nth(type, linear, shadow_node, + n, child_node_flag); +} + +static +int ja_pigeon_node_set_nth(const struct cds_ja_type *type, + struct cds_ja_inode *node, + struct cds_ja_shadow_node *shadow_node, + uint8_t n, + struct cds_ja_inode_flag *child_node_flag) +{ + struct cds_ja_inode_flag **ptr; + + assert(type->type_class == RCU_JA_PIGEON); + ptr = &((struct cds_ja_inode_flag **) node->u.data)[n]; + if (*ptr) + return -EEXIST; + rcu_assign_pointer(*ptr, child_node_flag); + shadow_node->nr_child++; + return 0; +} + +/* + * _ja_node_set_nth: set nth item within a node. Return an error + * (negative error value) if it is already there. + */ +static +int _ja_node_set_nth(const struct cds_ja_type *type, + struct cds_ja_inode *node, + struct cds_ja_inode_flag *node_flag, + struct cds_ja_shadow_node *shadow_node, + uint8_t n, + struct cds_ja_inode_flag *child_node_flag) +{ + switch (type->type_class) { + case RCU_JA_LINEAR: + return ja_linear_node_set_nth(type, node, shadow_node, n, + child_node_flag); + case RCU_JA_POOL: + return ja_pool_node_set_nth(type, node, node_flag, shadow_node, n, + child_node_flag); + case RCU_JA_PIGEON: + return ja_pigeon_node_set_nth(type, node, shadow_node, n, + child_node_flag); + case RCU_JA_NULL: + return -ENOSPC; + default: + assert(0); + return -EINVAL; + } + + return 0; +} + +static +int ja_linear_node_clear_ptr(const struct cds_ja_type *type, + struct cds_ja_inode *node, + struct cds_ja_shadow_node *shadow_node, + struct cds_ja_inode_flag **node_flag_ptr) +{ + uint8_t nr_child; + uint8_t *nr_child_ptr; + + assert(type->type_class == RCU_JA_LINEAR || type->type_class == RCU_JA_POOL); + + nr_child_ptr = &node->u.data[0]; + nr_child = *nr_child_ptr; + assert(nr_child <= type->max_linear_child); + + if (type->type_class == RCU_JA_LINEAR) { + assert(!shadow_node->fallback_removal_count); + if (shadow_node->nr_child <= type->min_child) { + /* We need to try recompacting the node */ + return -EFBIG; + } + } + dbg_printf("linear clear ptr: nr_child_ptr %p\n", nr_child_ptr); + assert(*node_flag_ptr != NULL); + rcu_assign_pointer(*node_flag_ptr, NULL); + /* + * Value and nr_child are never changed (would cause ABA issue). + * Instead, we leave the pointer to NULL and recompact the node + * once in a while. It is allowed to set a NULL pointer to a new + * value without recompaction though. + * Only update the shadow node accounting. + */ + shadow_node->nr_child--; + dbg_printf("linear clear ptr: %u child, shadow: %u child, for node %p shadow %p\n", + (unsigned int) CMM_LOAD_SHARED(*nr_child_ptr), + (unsigned int) shadow_node->nr_child, + node, shadow_node); + return 0; +} + +static +int ja_pool_node_clear_ptr(const struct cds_ja_type *type, + struct cds_ja_inode *node, + struct cds_ja_inode_flag *node_flag, + struct cds_ja_shadow_node *shadow_node, + struct cds_ja_inode_flag **node_flag_ptr, + uint8_t n) +{ + struct cds_ja_inode *linear; + + assert(type->type_class == RCU_JA_POOL); + + if (shadow_node->fallback_removal_count) { + shadow_node->fallback_removal_count--; + } else { + /* We should try recompacting the node */ + if (shadow_node->nr_child <= type->min_child) + return -EFBIG; + } + + switch (type->nr_pool_order) { + case 1: + { + unsigned long bitsel, index; + + bitsel = ja_node_pool_1d_bitsel(node_flag); + assert(bitsel < CHAR_BIT); + index = ((unsigned long) n >> bitsel) & type->nr_pool_order; + linear = (struct cds_ja_inode *) &node->u.data[index << type->pool_size_order]; + break; + } + case 2: + { + unsigned long bitsel[2], index[2], rindex; + + ja_node_pool_2d_bitsel(node_flag, bitsel); + assert(bitsel[0] < CHAR_BIT); + assert(bitsel[1] < CHAR_BIT); + index[0] = ((unsigned long) n >> bitsel[0]) & 0x1; + index[0] <<= 1; + index[1] = ((unsigned long) n >> bitsel[1]) & 0x1; + rindex = index[0] | index[1]; + linear = (struct cds_ja_inode *) &node->u.data[rindex << type->pool_size_order]; + break; + } + default: + linear = NULL; + assert(0); + } + + return ja_linear_node_clear_ptr(type, linear, shadow_node, node_flag_ptr); +} + +static +int ja_pigeon_node_clear_ptr(const struct cds_ja_type *type, + struct cds_ja_inode *node, + struct cds_ja_shadow_node *shadow_node, + struct cds_ja_inode_flag **node_flag_ptr) +{ + assert(type->type_class == RCU_JA_PIGEON); + + if (shadow_node->fallback_removal_count) { + shadow_node->fallback_removal_count--; + } else { + /* We should try recompacting the node */ + if (shadow_node->nr_child <= type->min_child) + return -EFBIG; + } + dbg_printf("ja_pigeon_node_clear_ptr: clearing ptr: %p\n", *node_flag_ptr); + rcu_assign_pointer(*node_flag_ptr, NULL); + shadow_node->nr_child--; + return 0; +} + +/* + * _ja_node_clear_ptr: clear ptr item within a node. Return an error + * (negative error value) if it is not found (-ENOENT). + */ +static +int _ja_node_clear_ptr(const struct cds_ja_type *type, + struct cds_ja_inode *node, + struct cds_ja_inode_flag *node_flag, + struct cds_ja_shadow_node *shadow_node, + struct cds_ja_inode_flag **node_flag_ptr, + uint8_t n) +{ + switch (type->type_class) { + case RCU_JA_LINEAR: + return ja_linear_node_clear_ptr(type, node, shadow_node, node_flag_ptr); + case RCU_JA_POOL: + return ja_pool_node_clear_ptr(type, node, node_flag, shadow_node, node_flag_ptr, n); + case RCU_JA_PIGEON: + return ja_pigeon_node_clear_ptr(type, node, shadow_node, node_flag_ptr); + case RCU_JA_NULL: + return -ENOENT; + default: + assert(0); + return -EINVAL; + } + + return 0; +} + +/* + * Calculate bit distribution. Returns the bit (0 to 7) that splits the + * distribution in two sub-distributions containing as much elements one + * compared to the other. + */ +static +unsigned int ja_node_sum_distribution_1d(enum ja_recompact mode, + struct cds_ja *ja, + unsigned int type_index, + const struct cds_ja_type *type, + struct cds_ja_inode *node, + struct cds_ja_shadow_node *shadow_node, + uint8_t n, + struct cds_ja_inode_flag *child_node_flag, + struct cds_ja_inode_flag **nullify_node_flag_ptr) +{ + uint8_t nr_one[JA_BITS_PER_BYTE]; + unsigned int bitsel = 0, bit_i, overall_best_distance = UINT_MAX; + unsigned int distrib_nr_child = 0; + + memset(nr_one, 0, sizeof(nr_one)); + + switch (type->type_class) { + case RCU_JA_LINEAR: + { + uint8_t nr_child = + ja_linear_node_get_nr_child(type, node); + unsigned int i; + + for (i = 0; i < nr_child; i++) { + struct cds_ja_inode_flag *iter; + uint8_t v; + + ja_linear_node_get_ith_pos(type, node, i, &v, &iter); + if (!iter) + continue; + if (mode == JA_RECOMPACT_DEL && *nullify_node_flag_ptr == iter) + continue; + for (bit_i = 0; bit_i < JA_BITS_PER_BYTE; bit_i++) { + if (v & (1U << bit_i)) + nr_one[bit_i]++; + } + distrib_nr_child++; + } + break; + } + case RCU_JA_POOL: + { + unsigned int pool_nr; + + for (pool_nr = 0; pool_nr < (1U << type->nr_pool_order); pool_nr++) { + struct cds_ja_inode *pool = + ja_pool_node_get_ith_pool(type, + node, pool_nr); + uint8_t nr_child = + ja_linear_node_get_nr_child(type, pool); + unsigned int j; + + for (j = 0; j < nr_child; j++) { + struct cds_ja_inode_flag *iter; + uint8_t v; + + ja_linear_node_get_ith_pos(type, pool, + j, &v, &iter); + if (!iter) + continue; + if (mode == JA_RECOMPACT_DEL && *nullify_node_flag_ptr == iter) + continue; + for (bit_i = 0; bit_i < JA_BITS_PER_BYTE; bit_i++) { + if (v & (1U << bit_i)) + nr_one[bit_i]++; + } + distrib_nr_child++; + } + } + break; + } + case RCU_JA_PIGEON: + { + unsigned int i; + + assert(mode == JA_RECOMPACT_DEL); + for (i = 0; i < JA_ENTRY_PER_NODE; i++) { + struct cds_ja_inode_flag *iter; + + iter = ja_pigeon_node_get_ith_pos(type, node, i); + if (!iter) + continue; + if (mode == JA_RECOMPACT_DEL && *nullify_node_flag_ptr == iter) + continue; + for (bit_i = 0; bit_i < JA_BITS_PER_BYTE; bit_i++) { + if (i & (1U << bit_i)) + nr_one[bit_i]++; + } + distrib_nr_child++; + } + break; + } + case RCU_JA_NULL: + assert(mode == JA_RECOMPACT_ADD_NEXT); + break; + default: + assert(0); + break; + } + + if (mode == JA_RECOMPACT_ADD_NEXT || mode == JA_RECOMPACT_ADD_SAME) { + for (bit_i = 0; bit_i < JA_BITS_PER_BYTE; bit_i++) { + if (n & (1U << bit_i)) + nr_one[bit_i]++; + } + distrib_nr_child++; + } + + /* + * The best bit selector is that for which the number of ones is + * closest to half of the number of children in the + * distribution. We calculate the distance using the double of + * the sub-distribution sizes to eliminate truncation error. + */ + for (bit_i = 0; bit_i < JA_BITS_PER_BYTE; bit_i++) { + unsigned int distance_to_best; + + distance_to_best = abs_int(((unsigned int) nr_one[bit_i] << 1U) - distrib_nr_child); + if (distance_to_best < overall_best_distance) { + overall_best_distance = distance_to_best; + bitsel = bit_i; + } + } + dbg_printf("1 dimension pool bit selection: (%u)\n", bitsel); + return bitsel; +} + +/* + * Calculate bit distribution in two dimensions. Returns the two bits + * (each 0 to 7) that splits the distribution in four sub-distributions + * containing as much elements one compared to the other. + */ +static +void ja_node_sum_distribution_2d(enum ja_recompact mode, + struct cds_ja *ja, + unsigned int type_index, + const struct cds_ja_type *type, + struct cds_ja_inode *node, + struct cds_ja_shadow_node *shadow_node, + uint8_t n, + struct cds_ja_inode_flag *child_node_flag, + struct cds_ja_inode_flag **nullify_node_flag_ptr, + unsigned int *_bitsel) +{ + uint8_t nr_2d_11[JA_BITS_PER_BYTE][JA_BITS_PER_BYTE], + nr_2d_10[JA_BITS_PER_BYTE][JA_BITS_PER_BYTE], + nr_2d_01[JA_BITS_PER_BYTE][JA_BITS_PER_BYTE], + nr_2d_00[JA_BITS_PER_BYTE][JA_BITS_PER_BYTE]; + unsigned int bitsel[2] = { 0, 1 }; + unsigned int bit_i, bit_j; + int overall_best_distance = INT_MAX; + unsigned int distrib_nr_child = 0; + + memset(nr_2d_11, 0, sizeof(nr_2d_11)); + memset(nr_2d_10, 0, sizeof(nr_2d_10)); + memset(nr_2d_01, 0, sizeof(nr_2d_01)); + memset(nr_2d_00, 0, sizeof(nr_2d_00)); + + switch (type->type_class) { + case RCU_JA_LINEAR: + { + uint8_t nr_child = + ja_linear_node_get_nr_child(type, node); + unsigned int i; + + for (i = 0; i < nr_child; i++) { + struct cds_ja_inode_flag *iter; + uint8_t v; + + ja_linear_node_get_ith_pos(type, node, i, &v, &iter); + if (!iter) + continue; + if (mode == JA_RECOMPACT_DEL && *nullify_node_flag_ptr == iter) + continue; + for (bit_i = 0; bit_i < JA_BITS_PER_BYTE; bit_i++) { + for (bit_j = 0; bit_j < bit_i; bit_j++) { + if (v & (1U << bit_i)) { + if (v & (1U << bit_j)) { + nr_2d_11[bit_i][bit_j]++; + } else { + nr_2d_10[bit_i][bit_j]++; + } + } else { + if (v & (1U << bit_j)) { + nr_2d_01[bit_i][bit_j]++; + } else { + nr_2d_00[bit_i][bit_j]++; + } + } + } + } + distrib_nr_child++; + } + break; + } + case RCU_JA_POOL: + { + unsigned int pool_nr; + + for (pool_nr = 0; pool_nr < (1U << type->nr_pool_order); pool_nr++) { + struct cds_ja_inode *pool = + ja_pool_node_get_ith_pool(type, + node, pool_nr); + uint8_t nr_child = + ja_linear_node_get_nr_child(type, pool); + unsigned int j; + + for (j = 0; j < nr_child; j++) { + struct cds_ja_inode_flag *iter; + uint8_t v; + + ja_linear_node_get_ith_pos(type, pool, + j, &v, &iter); + if (!iter) + continue; + if (mode == JA_RECOMPACT_DEL && *nullify_node_flag_ptr == iter) + continue; + for (bit_i = 0; bit_i < JA_BITS_PER_BYTE; bit_i++) { + for (bit_j = 0; bit_j < bit_i; bit_j++) { + if (v & (1U << bit_i)) { + if (v & (1U << bit_j)) { + nr_2d_11[bit_i][bit_j]++; + } else { + nr_2d_10[bit_i][bit_j]++; + } + } else { + if (v & (1U << bit_j)) { + nr_2d_01[bit_i][bit_j]++; + } else { + nr_2d_00[bit_i][bit_j]++; + } + } + } + } + distrib_nr_child++; + } + } + break; + } + case RCU_JA_PIGEON: + { + unsigned int i; + + assert(mode == JA_RECOMPACT_DEL); + for (i = 0; i < JA_ENTRY_PER_NODE; i++) { + struct cds_ja_inode_flag *iter; + + iter = ja_pigeon_node_get_ith_pos(type, node, i); + if (!iter) + continue; + if (mode == JA_RECOMPACT_DEL && *nullify_node_flag_ptr == iter) + continue; + for (bit_i = 0; bit_i < JA_BITS_PER_BYTE; bit_i++) { + for (bit_j = 0; bit_j < bit_i; bit_j++) { + if (i & (1U << bit_i)) { + if (i & (1U << bit_j)) { + nr_2d_11[bit_i][bit_j]++; + } else { + nr_2d_10[bit_i][bit_j]++; + } + } else { + if (i & (1U << bit_j)) { + nr_2d_01[bit_i][bit_j]++; + } else { + nr_2d_00[bit_i][bit_j]++; + } + } + } + } + distrib_nr_child++; + } + break; + } + case RCU_JA_NULL: + assert(mode == JA_RECOMPACT_ADD_NEXT); + break; + default: + assert(0); + break; + } + + if (mode == JA_RECOMPACT_ADD_NEXT || mode == JA_RECOMPACT_ADD_SAME) { + for (bit_i = 0; bit_i < JA_BITS_PER_BYTE; bit_i++) { + for (bit_j = 0; bit_j < bit_i; bit_j++) { + if (n & (1U << bit_i)) { + if (n & (1U << bit_j)) { + nr_2d_11[bit_i][bit_j]++; + } else { + nr_2d_10[bit_i][bit_j]++; + } + } else { + if (n & (1U << bit_j)) { + nr_2d_01[bit_i][bit_j]++; + } else { + nr_2d_00[bit_i][bit_j]++; + } + } + } + } + distrib_nr_child++; + } + + /* + * The best bit selector is that for which the number of nodes + * in each sub-class is closest to one-fourth of the number of + * children in the distribution. We calculate the distance using + * 4 times the size of the sub-distribution to eliminate + * truncation error. + */ + for (bit_i = 0; bit_i < JA_BITS_PER_BYTE; bit_i++) { + for (bit_j = 0; bit_j < bit_i; bit_j++) { + int distance_to_best[4]; + + distance_to_best[0] = ((unsigned int) nr_2d_11[bit_i][bit_j] << 2U) - distrib_nr_child; + distance_to_best[1] = ((unsigned int) nr_2d_10[bit_i][bit_j] << 2U) - distrib_nr_child; + distance_to_best[2] = ((unsigned int) nr_2d_01[bit_i][bit_j] << 2U) - distrib_nr_child; + distance_to_best[3] = ((unsigned int) nr_2d_00[bit_i][bit_j] << 2U) - distrib_nr_child; + + /* Consider worse distance above best */ + if (distance_to_best[1] > 0 && distance_to_best[1] > distance_to_best[0]) + distance_to_best[0] = distance_to_best[1]; + if (distance_to_best[2] > 0 && distance_to_best[2] > distance_to_best[0]) + distance_to_best[0] = distance_to_best[2]; + if (distance_to_best[3] > 0 && distance_to_best[3] > distance_to_best[0]) + distance_to_best[0] = distance_to_best[3]; + + /* + * If our worse distance is better than overall, + * we become new best candidate. + */ + if (distance_to_best[0] < overall_best_distance) { + overall_best_distance = distance_to_best[0]; + bitsel[0] = bit_i; + bitsel[1] = bit_j; + } + } + } + + dbg_printf("2 dimensions pool bit selection: (%u,%u)\n", bitsel[0], bitsel[1]); + + /* Return our bit selection */ + _bitsel[0] = bitsel[0]; + _bitsel[1] = bitsel[1]; +} + +static +unsigned int find_nearest_type_index(unsigned int type_index, + unsigned int nr_nodes) +{ + const struct cds_ja_type *type; + + assert(type_index != NODE_INDEX_NULL); + if (nr_nodes == 0) + return NODE_INDEX_NULL; + for (;;) { + type = &ja_types[type_index]; + if (nr_nodes < type->min_child) + type_index--; + else if (nr_nodes > type->max_child) + type_index++; + else + break; + } + return type_index; +} + +/* + * ja_node_recompact_add: recompact a node, adding a new child. + * Return 0 on success, -EAGAIN if need to retry, or other negative + * error value otherwise. + */ +static +int ja_node_recompact(enum ja_recompact mode, + struct cds_ja *ja, + unsigned int old_type_index, + const struct cds_ja_type *old_type, + struct cds_ja_inode *old_node, + struct cds_ja_shadow_node *shadow_node, + struct cds_ja_inode_flag **old_node_flag_ptr, uint8_t n, + struct cds_ja_inode_flag *child_node_flag, + struct cds_ja_inode_flag **nullify_node_flag_ptr, + int level) +{ + unsigned int new_type_index; + struct cds_ja_inode *new_node; + struct cds_ja_shadow_node *new_shadow_node = NULL; + const struct cds_ja_type *new_type; + struct cds_ja_inode_flag *new_node_flag, *old_node_flag; + int ret; + int fallback = 0; + + old_node_flag = *old_node_flag_ptr; + + /* + * Need to find nearest type index even for ADD_SAME, because + * this recompaction, when applied to linear nodes, will garbage + * collect dummy (NULL) entries, and can therefore cause a few + * linear representations to be skipped. + */ + switch (mode) { + case JA_RECOMPACT_ADD_SAME: + new_type_index = find_nearest_type_index(old_type_index, + shadow_node->nr_child + 1); + dbg_printf("Recompact for node with %u children\n", + shadow_node->nr_child + 1); + break; + case JA_RECOMPACT_ADD_NEXT: + if (!shadow_node || old_type_index == NODE_INDEX_NULL) { + new_type_index = 0; + dbg_printf("Recompact for NULL\n"); + } else { + new_type_index = find_nearest_type_index(old_type_index, + shadow_node->nr_child + 1); + dbg_printf("Recompact for node with %u children\n", + shadow_node->nr_child + 1); + } + break; + case JA_RECOMPACT_DEL: + new_type_index = find_nearest_type_index(old_type_index, + shadow_node->nr_child - 1); + dbg_printf("Recompact for node with %u children\n", + shadow_node->nr_child - 1); + break; + default: + assert(0); + } + +retry: /* for fallback */ + dbg_printf("Recompact from type %d to type %d\n", + old_type_index, new_type_index); + new_type = &ja_types[new_type_index]; + if (new_type_index != NODE_INDEX_NULL) { + new_node = alloc_cds_ja_node(ja, new_type); + if (!new_node) + return -ENOMEM; + + if (new_type->type_class == RCU_JA_POOL) { + switch (new_type->nr_pool_order) { + case 1: + { + unsigned int node_distrib_bitsel; + + node_distrib_bitsel = + ja_node_sum_distribution_1d(mode, ja, + old_type_index, old_type, + old_node, shadow_node, + n, child_node_flag, + nullify_node_flag_ptr); + assert(!((unsigned long) new_node & JA_POOL_1D_MASK)); + new_node_flag = ja_node_flag_pool_1d(new_node, + new_type_index, node_distrib_bitsel); + break; + } + case 2: + { + unsigned int node_distrib_bitsel[2]; + + ja_node_sum_distribution_2d(mode, ja, + old_type_index, old_type, + old_node, shadow_node, + n, child_node_flag, + nullify_node_flag_ptr, + node_distrib_bitsel); + assert(!((unsigned long) new_node & JA_POOL_1D_MASK)); + assert(!((unsigned long) new_node & JA_POOL_2D_MASK)); + new_node_flag = ja_node_flag_pool_2d(new_node, + new_type_index, node_distrib_bitsel); + break; + } + default: + assert(0); + } + } else { + new_node_flag = ja_node_flag(new_node, new_type_index); + } + + dbg_printf("Recompact inherit lock from %p\n", shadow_node); + new_shadow_node = rcuja_shadow_set(ja->ht, new_node_flag, shadow_node, ja, level); + if (!new_shadow_node) { + free_cds_ja_node(ja, new_node); + return -ENOMEM; + } + if (fallback) + new_shadow_node->fallback_removal_count = + JA_FALLBACK_REMOVAL_COUNT; + } else { + new_node = NULL; + new_node_flag = NULL; + } + + assert(mode != JA_RECOMPACT_ADD_NEXT || old_type->type_class != RCU_JA_PIGEON); + + if (new_type_index == NODE_INDEX_NULL) + goto skip_copy; + + switch (old_type->type_class) { + case RCU_JA_LINEAR: + { + uint8_t nr_child = + ja_linear_node_get_nr_child(old_type, old_node); + unsigned int i; + + for (i = 0; i < nr_child; i++) { + struct cds_ja_inode_flag *iter; + uint8_t v; + + ja_linear_node_get_ith_pos(old_type, old_node, i, &v, &iter); + if (!iter) + continue; + if (mode == JA_RECOMPACT_DEL && *nullify_node_flag_ptr == iter) + continue; + ret = _ja_node_set_nth(new_type, new_node, new_node_flag, + new_shadow_node, + v, iter); + if (new_type->type_class == RCU_JA_POOL && ret) { + goto fallback_toosmall; + } + assert(!ret); + } + break; + } + case RCU_JA_POOL: + { + unsigned int pool_nr; + + for (pool_nr = 0; pool_nr < (1U << old_type->nr_pool_order); pool_nr++) { + struct cds_ja_inode *pool = + ja_pool_node_get_ith_pool(old_type, + old_node, pool_nr); + uint8_t nr_child = + ja_linear_node_get_nr_child(old_type, pool); + unsigned int j; + + for (j = 0; j < nr_child; j++) { + struct cds_ja_inode_flag *iter; + uint8_t v; + + ja_linear_node_get_ith_pos(old_type, pool, + j, &v, &iter); + if (!iter) + continue; + if (mode == JA_RECOMPACT_DEL && *nullify_node_flag_ptr == iter) + continue; + ret = _ja_node_set_nth(new_type, new_node, new_node_flag, + new_shadow_node, + v, iter); + if (new_type->type_class == RCU_JA_POOL + && ret) { + goto fallback_toosmall; + } + assert(!ret); + } + } + break; + } + case RCU_JA_NULL: + assert(mode == JA_RECOMPACT_ADD_NEXT); + break; + case RCU_JA_PIGEON: + { + unsigned int i; + + assert(mode == JA_RECOMPACT_DEL); + for (i = 0; i < JA_ENTRY_PER_NODE; i++) { + struct cds_ja_inode_flag *iter; + + iter = ja_pigeon_node_get_ith_pos(old_type, old_node, i); + if (!iter) + continue; + if (mode == JA_RECOMPACT_DEL && *nullify_node_flag_ptr == iter) + continue; + ret = _ja_node_set_nth(new_type, new_node, new_node_flag, + new_shadow_node, + i, iter); + if (new_type->type_class == RCU_JA_POOL && ret) { + goto fallback_toosmall; + } + assert(!ret); + } + break; + } + default: + assert(0); + ret = -EINVAL; + goto end; + } +skip_copy: + + if (mode == JA_RECOMPACT_ADD_NEXT || mode == JA_RECOMPACT_ADD_SAME) { + /* add node */ + ret = _ja_node_set_nth(new_type, new_node, new_node_flag, + new_shadow_node, + n, child_node_flag); + if (new_type->type_class == RCU_JA_POOL && ret) { + goto fallback_toosmall; + } + assert(!ret); + } + + if (fallback) { + dbg_printf("Using fallback for %u children, node type index: %u, mode %s\n", + new_shadow_node->nr_child, old_type_index, mode == JA_RECOMPACT_ADD_NEXT ? "add_next" : + (mode == JA_RECOMPACT_DEL ? "del" : "add_same")); + if (ja_debug_counters()) + uatomic_inc(&ja->node_fallback_count_distribution[new_shadow_node->nr_child]); + } + + /* Return pointer to new recompacted node through old_node_flag_ptr */ + *old_node_flag_ptr = new_node_flag; + if (old_node) { + int flags; + + flags = RCUJA_SHADOW_CLEAR_FREE_NODE; + /* + * It is OK to free the lock associated with a node + * going to NULL, since we are holding the parent lock. + * This synchronizes removal with re-add of that node. + */ + if (new_type_index == NODE_INDEX_NULL) + flags |= RCUJA_SHADOW_CLEAR_FREE_LOCK; + ret = rcuja_shadow_clear(ja->ht, old_node_flag, shadow_node, + flags); + assert(!ret); + } + + ret = 0; +end: + return ret; + +fallback_toosmall: + /* fallback if next pool is too small */ + assert(new_shadow_node); + ret = rcuja_shadow_clear(ja->ht, new_node_flag, new_shadow_node, + RCUJA_SHADOW_CLEAR_FREE_NODE); + assert(!ret); + + switch (mode) { + case JA_RECOMPACT_ADD_SAME: + /* + * JA_RECOMPACT_ADD_SAME is only triggered if a linear + * node within a pool has unused entries. It should + * therefore _never_ be too small. + */ + assert(0); + + /* Fall-through */ + case JA_RECOMPACT_ADD_NEXT: + { + const struct cds_ja_type *next_type; + + /* + * Recompaction attempt on add failed. Should only + * happen if target node type is pool. Caused by + * hard-to-split distribution. Recompact using the next + * distribution size. + */ + assert(new_type->type_class == RCU_JA_POOL); + next_type = &ja_types[new_type_index + 1]; + /* + * Try going to the next pool size if our population + * fits within its range. This is not flagged as a + * fallback. + */ + if (shadow_node->nr_child + 1 >= next_type->min_child + && shadow_node->nr_child + 1 <= next_type->max_child) { + new_type_index++; + goto retry; + } else { + new_type_index++; + dbg_printf("Add fallback to type %d\n", new_type_index); + if (ja_debug_counters()) + uatomic_inc(&ja->nr_fallback); + fallback = 1; + goto retry; + } + break; + } + case JA_RECOMPACT_DEL: + /* + * Recompaction attempt on delete failed. Should only + * happen if target node type is pool. This is caused by + * a hard-to-split distribution. Recompact on same node + * size, but flag current node as "fallback" to ensure + * we don't attempt recompaction before some activity + * has reshuffled our node. + */ + assert(new_type->type_class == RCU_JA_POOL); + new_type_index = old_type_index; + dbg_printf("Delete fallback keeping type %d\n", new_type_index); + uatomic_inc(&ja->nr_fallback); + fallback = 1; + goto retry; + default: + assert(0); + return -EINVAL; + } + + /* + * Last resort fallback: pigeon. + */ + new_type_index = (1UL << JA_TYPE_BITS) - 1; + dbg_printf("Fallback to type %d\n", new_type_index); + uatomic_inc(&ja->nr_fallback); + fallback = 1; + goto retry; +} + +/* + * Return 0 on success, -EAGAIN if need to retry, or other negative + * error value otherwise. + */ +static +int ja_node_set_nth(struct cds_ja *ja, + struct cds_ja_inode_flag **node_flag, uint8_t n, + struct cds_ja_inode_flag *child_node_flag, + struct cds_ja_shadow_node *shadow_node, + int level) +{ + int ret; + unsigned int type_index; + const struct cds_ja_type *type; + struct cds_ja_inode *node; + + dbg_printf("ja_node_set_nth for n=%u, node %p, shadow %p\n", + (unsigned int) n, ja_node_ptr(*node_flag), shadow_node); + + node = ja_node_ptr(*node_flag); + type_index = ja_node_type(*node_flag); + type = &ja_types[type_index]; + ret = _ja_node_set_nth(type, node, *node_flag, shadow_node, + n, child_node_flag); + switch (ret) { + case -ENOSPC: + /* Not enough space in node, need to recompact to next type. */ + ret = ja_node_recompact(JA_RECOMPACT_ADD_NEXT, ja, type_index, type, node, + shadow_node, node_flag, n, child_node_flag, NULL, level); + break; + case -ERANGE: + /* Node needs to be recompacted. */ + ret = ja_node_recompact(JA_RECOMPACT_ADD_SAME, ja, type_index, type, node, + shadow_node, node_flag, n, child_node_flag, NULL, level); + break; + } + return ret; +} + +/* + * Return 0 on success, -EAGAIN if need to retry, or other negative + * error value otherwise. + */ +static +int ja_node_clear_ptr(struct cds_ja *ja, + struct cds_ja_inode_flag **node_flag_ptr, /* Pointer to location to nullify */ + struct cds_ja_inode_flag **parent_node_flag_ptr, /* Address of parent ptr in its parent */ + struct cds_ja_shadow_node *shadow_node, /* of parent */ + uint8_t n, int level) +{ + int ret; + unsigned int type_index; + const struct cds_ja_type *type; + struct cds_ja_inode *node; + + dbg_printf("ja_node_clear_ptr for node %p, shadow %p, target ptr %p\n", + ja_node_ptr(*parent_node_flag_ptr), shadow_node, node_flag_ptr); + + node = ja_node_ptr(*parent_node_flag_ptr); + type_index = ja_node_type(*parent_node_flag_ptr); + type = &ja_types[type_index]; + ret = _ja_node_clear_ptr(type, node, *parent_node_flag_ptr, shadow_node, node_flag_ptr, n); + if (ret == -EFBIG) { + /* Should try recompaction. */ + ret = ja_node_recompact(JA_RECOMPACT_DEL, ja, type_index, type, node, + shadow_node, parent_node_flag_ptr, n, NULL, + node_flag_ptr, level); + } + return ret; +} + +struct cds_ja_node *cds_ja_lookup(struct cds_ja *ja, uint64_t key) +{ + unsigned int tree_depth, i; + struct cds_ja_inode_flag *node_flag; + + if (caa_unlikely(key > ja->key_max || key == UINT64_MAX)) + return NULL; + tree_depth = ja->tree_depth; + node_flag = rcu_dereference(ja->root); + + /* level 0: root node */ + if (!ja_node_ptr(node_flag)) + return NULL; + + for (i = 1; i < tree_depth; i++) { + uint8_t iter_key; + + iter_key = (uint8_t) (key >> (JA_BITS_PER_BYTE * (tree_depth - i - 1))); + node_flag = ja_node_get_nth(node_flag, NULL, iter_key); + dbg_printf("cds_ja_lookup iter key lookup %u finds node_flag %p\n", + (unsigned int) iter_key, node_flag); + if (!ja_node_ptr(node_flag)) + return NULL; + } + + /* Last level lookup succeded. We got an actual match. */ + return (struct cds_ja_node *) node_flag; +} + +static +struct cds_ja_node *cds_ja_lookup_inequality(struct cds_ja *ja, uint64_t key, + uint64_t *result_key, enum ja_lookup_inequality mode) +{ + int tree_depth, level; + struct cds_ja_inode_flag *node_flag, *cur_node_depth[JA_MAX_DEPTH]; + uint8_t cur_key[JA_MAX_DEPTH]; + uint64_t _result_key = 0; + enum ja_direction dir; + + switch (mode) { + case JA_LOOKUP_BE: + case JA_LOOKUP_AE: + if (caa_unlikely(key > ja->key_max || key == UINT64_MAX)) + return NULL; + break; + default: + return NULL; + } + + memset(cur_node_depth, 0, sizeof(cur_node_depth)); + memset(cur_key, 0, sizeof(cur_key)); + tree_depth = ja->tree_depth; + node_flag = rcu_dereference(ja->root); + cur_node_depth[0] = node_flag; + + /* level 0: root node */ + if (!ja_node_ptr(node_flag)) + return NULL; + + for (level = 1; level < tree_depth; level++) { + uint8_t iter_key; + + iter_key = (uint8_t) (key >> (JA_BITS_PER_BYTE * (tree_depth - level - 1))); + node_flag = ja_node_get_nth(node_flag, NULL, iter_key); + if (!ja_node_ptr(node_flag)) + break; + cur_key[level - 1] = iter_key; + cur_node_depth[level] = node_flag; + dbg_printf("cds_ja_lookup_inequality iter key lookup %u finds node_flag %p\n", + (unsigned int) iter_key, node_flag); + } + + if (level == tree_depth) { + /* Last level lookup succeded. We got an equal match. */ + if (result_key) + *result_key = key; + return (struct cds_ja_node *) node_flag; + } + + /* + * Find highest value left/right of current node. + * Current node is cur_node_depth[level]. + * Start at current level. If we cannot find any key left/right + * of ours, go one level up, seek highest value left/right of + * current (recursively), and when we find one, get the + * rightmost/leftmost child of its rightmost/leftmost child + * (recursively). + */ + switch (mode) { + case JA_LOOKUP_BE: + dir = JA_LEFT; + break; + case JA_LOOKUP_AE: + dir = JA_RIGHT; + break; + default: + assert(0); + } + for (; level > 0; level--) { + uint8_t iter_key; + + iter_key = (uint8_t) (key >> (JA_BITS_PER_BYTE * (tree_depth - level - 1))); + node_flag = ja_node_get_leftright(cur_node_depth[level - 1], + iter_key, &cur_key[level - 1], dir); + dbg_printf("cds_ja_lookup_inequality find sibling from %u at %u finds node_flag %p\n", + (unsigned int) iter_key, (unsigned int) cur_key[level - 1], + node_flag); + /* If found left/right sibling, find rightmost/leftmost child. */ + if (ja_node_ptr(node_flag)) + break; + } + + if (!level) { + /* Reached the root and could not find a left/right sibling. */ + return NULL; + } + + level++; + + /* + * From this point, we are guaranteed to be able to find a + * "below than"/"above than" match. ja_attach_node() and + * ja_detach_node() both guarantee that it is not possible for a + * lookup to reach a dead-end. + */ + + /* + * Find rightmost/leftmost child of rightmost/leftmost child + * (recursively). + */ + switch (mode) { + case JA_LOOKUP_BE: + dir = JA_RIGHTMOST; + break; + case JA_LOOKUP_AE: + dir = JA_LEFTMOST; + break; + default: + assert(0); + } + for (; level < tree_depth; level++) { + node_flag = ja_node_get_minmax(node_flag, &cur_key[level - 1], dir); + dbg_printf("cds_ja_lookup_inequality find minmax at %u finds node_flag %p\n", + (unsigned int) cur_key[level - 1], + node_flag); + if (!ja_node_ptr(node_flag)) + break; + } + + assert(level == tree_depth); + + if (result_key) { + for (level = 1; level < tree_depth; level++) { + _result_key |= ((uint64_t) cur_key[level - 1]) + << (JA_BITS_PER_BYTE * (tree_depth - level - 1)); + } + *result_key = _result_key; + } + return (struct cds_ja_node *) node_flag; +} + +struct cds_ja_node *cds_ja_lookup_below_equal(struct cds_ja *ja, + uint64_t key, uint64_t *result_key) +{ + dbg_printf("cds_ja_lookup_below_equal key %" PRIu64 "\n", key); + return cds_ja_lookup_inequality(ja, key, result_key, JA_LOOKUP_BE); +} + +struct cds_ja_node *cds_ja_lookup_above_equal(struct cds_ja *ja, + uint64_t key, uint64_t *result_key) +{ + dbg_printf("cds_ja_lookup_above_equal key %" PRIu64 "\n", key); + return cds_ja_lookup_inequality(ja, key, result_key, JA_LOOKUP_AE); +} + +/* + * We reached an unpopulated node. Create it and the children we need, + * and then attach the entire branch to the current node. This may + * trigger recompaction of the current node. Locks needed: node lock + * (for add), and, possibly, parent node lock (to update pointer due to + * node recompaction). + * + * First take node lock, check if recompaction is needed, then take + * parent lock (if needed). Then we can proceed to create the new + * branch. Publish the new branch, and release locks. + * TODO: we currently always take the parent lock even when not needed. + * + * ja_attach_node() ensures that a lookup will _never_ see a branch that + * leads to a dead-end: before attaching a branch, the entire content of + * the new branch is populated, thus creating a cluster, before + * attaching the cluster to the rest of the tree, thus making it visible + * to lookups. + */ +static +int ja_attach_node(struct cds_ja *ja, + struct cds_ja_inode_flag **attach_node_flag_ptr, + struct cds_ja_inode_flag *attach_node_flag, + struct cds_ja_inode_flag *parent_attach_node_flag, + struct cds_ja_inode_flag **old_node_flag_ptr, + struct cds_ja_inode_flag *old_node_flag, + uint64_t key, + unsigned int level, + struct cds_ja_node *child_node) +{ + struct cds_ja_shadow_node *shadow_node = NULL, + *parent_shadow_node = NULL; + struct cds_ja_inode_flag *iter_node_flag, *iter_dest_node_flag; + int ret, i; + struct cds_ja_inode_flag *created_nodes[JA_MAX_DEPTH]; + int nr_created_nodes = 0; + + dbg_printf("Attach node at level %u (old_node_flag %p, attach_node_flag_ptr %p attach_node_flag %p, parent_attach_node_flag %p)\n", + level, old_node_flag, attach_node_flag_ptr, attach_node_flag, parent_attach_node_flag); + + assert(!old_node_flag); + if (attach_node_flag) { + shadow_node = rcuja_shadow_lookup_lock(ja->ht, attach_node_flag); + if (!shadow_node) { + ret = -EAGAIN; + goto end; + } + } + if (parent_attach_node_flag) { + parent_shadow_node = rcuja_shadow_lookup_lock(ja->ht, + parent_attach_node_flag); + if (!parent_shadow_node) { + ret = -EAGAIN; + goto unlock_shadow; + } + } + + if (old_node_flag_ptr && ja_node_ptr(*old_node_flag_ptr)) { + /* + * Target node has been updated between RCU lookup and + * lock acquisition. We need to re-try lookup and + * attach. + */ + ret = -EAGAIN; + goto unlock_parent; + } + + /* + * Perform a lookup query to handle the case where + * old_node_flag_ptr is NULL. We cannot use it to check if the + * node has been populated between RCU lookup and mutex + * acquisition. + */ + if (!old_node_flag_ptr) { + uint8_t iter_key; + struct cds_ja_inode_flag *lookup_node_flag; + struct cds_ja_inode_flag **lookup_node_flag_ptr; + + iter_key = (uint8_t) (key >> (JA_BITS_PER_BYTE * (ja->tree_depth - level))); + lookup_node_flag = ja_node_get_nth(attach_node_flag, + &lookup_node_flag_ptr, + iter_key); + if (lookup_node_flag) { + ret = -EEXIST; + goto unlock_parent; + } + } + + if (attach_node_flag_ptr && ja_node_ptr(*attach_node_flag_ptr) != + ja_node_ptr(attach_node_flag)) { + /* + * Target node has been updated between RCU lookup and + * lock acquisition. We need to re-try lookup and + * attach. + */ + ret = -EAGAIN; + goto unlock_parent; + } + + /* Create new branch, starting from bottom */ + iter_node_flag = (struct cds_ja_inode_flag *) child_node; + + for (i = ja->tree_depth - 1; i >= (int) level; i--) { + uint8_t iter_key; + + iter_key = (uint8_t) (key >> (JA_BITS_PER_BYTE * (ja->tree_depth - i - 1))); + dbg_printf("branch creation level %d, key %u\n", + i, (unsigned int) iter_key); + iter_dest_node_flag = NULL; + ret = ja_node_set_nth(ja, &iter_dest_node_flag, + iter_key, + iter_node_flag, + NULL, i); + if (ret) { + dbg_printf("branch creation error %d\n", ret); + goto check_error; + } + created_nodes[nr_created_nodes++] = iter_dest_node_flag; + iter_node_flag = iter_dest_node_flag; + } + assert(level > 0); + + /* Publish branch */ + if (level == 1) { + /* + * Attaching to root node. + */ + rcu_assign_pointer(ja->root, iter_node_flag); + } else { + uint8_t iter_key; + + iter_key = (uint8_t) (key >> (JA_BITS_PER_BYTE * (ja->tree_depth - level))); + dbg_printf("publish branch at level %d, key %u\n", + level - 1, (unsigned int) iter_key); + /* We need to use set_nth on the previous level. */ + iter_dest_node_flag = attach_node_flag; + ret = ja_node_set_nth(ja, &iter_dest_node_flag, + iter_key, + iter_node_flag, + shadow_node, level - 1); + if (ret) { + dbg_printf("branch publish error %d\n", ret); + goto check_error; + } + /* + * Attach branch + */ + rcu_assign_pointer(*attach_node_flag_ptr, iter_dest_node_flag); + } + + /* Success */ + ret = 0; + +check_error: + if (ret) { + for (i = 0; i < nr_created_nodes; i++) { + int tmpret; + int flags; + + flags = RCUJA_SHADOW_CLEAR_FREE_LOCK; + if (i) + flags |= RCUJA_SHADOW_CLEAR_FREE_NODE; + tmpret = rcuja_shadow_clear(ja->ht, + created_nodes[i], + NULL, + flags); + assert(!tmpret); + } + } +unlock_parent: + if (parent_shadow_node) + rcuja_shadow_unlock(parent_shadow_node); +unlock_shadow: + if (shadow_node) + rcuja_shadow_unlock(shadow_node); +end: + return ret; +} + +/* + * Lock the parent containing the pointer to list of duplicates, and add + * node to this list. Failure can happen if concurrent update changes + * the parent before we get the lock. We return -EAGAIN in that case. + * Return 0 on success, negative error value on failure. + */ +static +int ja_chain_node(struct cds_ja *ja, + struct cds_ja_inode_flag *parent_node_flag, + struct cds_ja_inode_flag **node_flag_ptr, + struct cds_ja_inode_flag *node_flag, + struct cds_ja_node *last_node, + struct cds_ja_node *node) +{ + struct cds_ja_shadow_node *shadow_node; + struct cds_ja_node *iter_node; + int ret = 0, found = 0; + + shadow_node = rcuja_shadow_lookup_lock(ja->ht, parent_node_flag); + if (!shadow_node) { + return -EAGAIN; + } + /* + * Ensure that previous node is still there at end of list. + */ + iter_node = (struct cds_ja_node *) ja_node_ptr(node_flag); + if ((struct cds_ja_node *) ja_node_ptr(*node_flag_ptr) != iter_node) { + ret = -EAGAIN; + goto end; + } + cds_ja_for_each_duplicate(iter_node) { + if (iter_node == last_node) + found = 1; + } + if (!found) { + ret = -EAGAIN; + goto end; + } + /* + * Add node to tail of list to ensure that RCU traversals will + * always see either the prior node or the newly added if + * executed concurrently with a sequence of add followed by del + * on the same key. Safe against concurrent RCU read traversals. + */ + node->next = NULL; + rcu_assign_pointer(last_node->next, node); +end: + rcuja_shadow_unlock(shadow_node); + return ret; +} + +static +int _cds_ja_add(struct cds_ja *ja, uint64_t key, + struct cds_ja_node *node, + struct cds_ja_node **unique_node_ret) +{ + unsigned int tree_depth, i; + struct cds_ja_inode_flag *attach_node_flag, + *parent_node_flag, + *parent2_node_flag, + *node_flag, + *parent_attach_node_flag; + struct cds_ja_inode_flag **attach_node_flag_ptr, + **parent_node_flag_ptr, + **node_flag_ptr; + int ret; + + if (caa_unlikely(key > ja->key_max || key == UINT64_MAX)) { + return -EINVAL; + } + tree_depth = ja->tree_depth; + +retry: + dbg_printf("cds_ja_add attempt: key %" PRIu64 ", node %p\n", + key, node); + parent2_node_flag = NULL; + parent_node_flag = + (struct cds_ja_inode_flag *) &ja->root; /* Use root ptr address as key for mutex */ + parent_node_flag_ptr = NULL; + node_flag = rcu_dereference(ja->root); + node_flag_ptr = &ja->root; + + /* Iterate on all internal levels */ + for (i = 1; i < tree_depth; i++) { + uint8_t iter_key; + + if (!ja_node_ptr(node_flag)) + break; + dbg_printf("cds_ja_add iter parent2_node_flag %p parent_node_flag %p node_flag_ptr %p node_flag %p\n", + parent2_node_flag, parent_node_flag, node_flag_ptr, node_flag); + iter_key = (uint8_t) (key >> (JA_BITS_PER_BYTE * (tree_depth - i - 1))); + parent2_node_flag = parent_node_flag; + parent_node_flag = node_flag; + parent_node_flag_ptr = node_flag_ptr; + node_flag = ja_node_get_nth(node_flag, + &node_flag_ptr, + iter_key); + } + + /* + * We reached either bottom of tree or internal NULL node, + * simply add node to last internal level, or chain it if key is + * already present. + */ + if (!ja_node_ptr(node_flag)) { + dbg_printf("cds_ja_add NULL parent2_node_flag %p parent_node_flag %p node_flag_ptr %p node_flag %p\n", + parent2_node_flag, parent_node_flag, node_flag_ptr, node_flag); + + attach_node_flag = parent_node_flag; + attach_node_flag_ptr = parent_node_flag_ptr; + parent_attach_node_flag = parent2_node_flag; + + ret = ja_attach_node(ja, attach_node_flag_ptr, + attach_node_flag, + parent_attach_node_flag, + node_flag_ptr, + node_flag, + key, i, node); + } else { + struct cds_ja_node *iter_node, *last_node = NULL; + + if (unique_node_ret) { + *unique_node_ret = (struct cds_ja_node *) ja_node_ptr(node_flag); + return -EEXIST; + } + + /* Find last duplicate */ + iter_node = (struct cds_ja_node *) ja_node_ptr(node_flag); + cds_ja_for_each_duplicate_rcu(iter_node) + last_node = iter_node; + + dbg_printf("cds_ja_add duplicate parent2_node_flag %p parent_node_flag %p node_flag_ptr %p node_flag %p\n", + parent2_node_flag, parent_node_flag, node_flag_ptr, node_flag); + + attach_node_flag = node_flag; + attach_node_flag_ptr = node_flag_ptr; + parent_attach_node_flag = parent_node_flag; + + ret = ja_chain_node(ja, + parent_attach_node_flag, + attach_node_flag_ptr, + attach_node_flag, + last_node, + node); + } + if (ret == -EAGAIN || ret == -EEXIST) + goto retry; + + return ret; +} + +int cds_ja_add(struct cds_ja *ja, uint64_t key, + struct cds_ja_node *node) +{ + return _cds_ja_add(ja, key, node, NULL); +} + +struct cds_ja_node *cds_ja_add_unique(struct cds_ja *ja, uint64_t key, + struct cds_ja_node *node) +{ + int ret; + struct cds_ja_node *ret_node; + + ret = _cds_ja_add(ja, key, node, &ret_node); + if (ret == -EEXIST) + return ret_node; + else + return node; +} + +/* + * Note: there is no need to lookup the pointer address associated with + * each node's nth item after taking the lock: it's already been done by + * cds_ja_del while holding the rcu read-side lock, and our node rules + * ensure that when a match value -> pointer is found in a node, it is + * _NEVER_ changed for that node without recompaction, and recompaction + * reallocates the node. + * However, when a child is removed from "linear" nodes, its pointer + * is set to NULL. We therefore check, while holding the locks, if this + * pointer is NULL, and return -ENOENT to the caller if it is the case. + * + * ja_detach_node() ensures that a lookup will _never_ see a branch that + * leads to a dead-end: when removing branch, it makes sure to perform + * the "cut" at the highest node that has only one child, effectively + * replacing it with a NULL pointer. + */ +static +int ja_detach_node(struct cds_ja *ja, + struct cds_ja_inode_flag **snapshot, + struct cds_ja_inode_flag ***snapshot_ptr, + uint8_t *snapshot_n, + int nr_snapshot, + uint64_t key, + struct cds_ja_node *node) +{ + struct cds_ja_shadow_node *shadow_nodes[JA_MAX_DEPTH]; + struct cds_ja_inode_flag **node_flag_ptr = NULL, + *parent_node_flag = NULL, + **parent_node_flag_ptr = NULL; + struct cds_ja_inode_flag *iter_node_flag; + int ret, i, nr_shadow = 0, nr_clear = 0, nr_branch = 0; + uint8_t n = 0; + + assert(nr_snapshot == ja->tree_depth + 1); + + /* + * From the last internal level node going up, get the node + * lock, check if the node has only one child left. If it is the + * case, we continue iterating upward. When we reach a node + * which has more that one child left, we lock the parent, and + * proceed to the node deletion (removing its children too). + */ + for (i = nr_snapshot - 2; i >= 1; i--) { + struct cds_ja_shadow_node *shadow_node; + + shadow_node = rcuja_shadow_lookup_lock(ja->ht, + snapshot[i]); + if (!shadow_node) { + ret = -EAGAIN; + goto end; + } + shadow_nodes[nr_shadow++] = shadow_node; + + /* + * Check if node has been removed between RCU + * lookup and lock acquisition. + */ + assert(snapshot_ptr[i + 1]); + if (ja_node_ptr(*snapshot_ptr[i + 1]) + != ja_node_ptr(snapshot[i + 1])) { + ret = -ENOENT; + goto end; + } + + assert(shadow_node->nr_child > 0); + if (shadow_node->nr_child == 1 && i > 1) + nr_clear++; + nr_branch++; + if (shadow_node->nr_child > 1 || i == 1) { + /* Lock parent and break */ + shadow_node = rcuja_shadow_lookup_lock(ja->ht, + snapshot[i - 1]); + if (!shadow_node) { + ret = -EAGAIN; + goto end; + } + shadow_nodes[nr_shadow++] = shadow_node; + + /* + * Check if node has been removed between RCU + * lookup and lock acquisition. + */ + assert(snapshot_ptr[i]); + if (ja_node_ptr(*snapshot_ptr[i]) + != ja_node_ptr(snapshot[i])) { + ret = -ENOENT; + goto end; + } + + node_flag_ptr = snapshot_ptr[i + 1]; + n = snapshot_n[i + 1]; + parent_node_flag_ptr = snapshot_ptr[i]; + parent_node_flag = snapshot[i]; + + if (i > 1) { + /* + * Lock parent's parent, in case we need + * to recompact parent. + */ + shadow_node = rcuja_shadow_lookup_lock(ja->ht, + snapshot[i - 2]); + if (!shadow_node) { + ret = -EAGAIN; + goto end; + } + shadow_nodes[nr_shadow++] = shadow_node; + + /* + * Check if node has been removed between RCU + * lookup and lock acquisition. + */ + assert(snapshot_ptr[i - 1]); + if (ja_node_ptr(*snapshot_ptr[i - 1]) + != ja_node_ptr(snapshot[i - 1])) { + ret = -ENOENT; + goto end; + } + } + + break; + } + } + + /* + * At this point, we want to delete all nodes that are about to + * be removed from shadow_nodes (except the last one, which is + * either the root or the parent of the upmost node with 1 + * child). OK to free lock here, because RCU read lock is held, + * and free only performed in call_rcu. + */ + + for (i = 0; i < nr_clear; i++) { + ret = rcuja_shadow_clear(ja->ht, + shadow_nodes[i]->node_flag, + shadow_nodes[i], + RCUJA_SHADOW_CLEAR_FREE_NODE + | RCUJA_SHADOW_CLEAR_FREE_LOCK); + assert(!ret); + } + + iter_node_flag = parent_node_flag; + /* Remove from parent */ + if (nr_branch < 1) + abort(); /* Should never happen. */ + ret = ja_node_clear_ptr(ja, + node_flag_ptr, /* Pointer to location to nullify */ + &iter_node_flag, /* Old new parent ptr in its parent */ + shadow_nodes[nr_branch - 1], /* of parent */ + n, nr_branch - 1); + if (ret) + goto end; + + dbg_printf("ja_detach_node: publish %p instead of %p\n", + iter_node_flag, *parent_node_flag_ptr); + /* Update address of parent ptr in its parent */ + rcu_assign_pointer(*parent_node_flag_ptr, iter_node_flag); + +end: + for (i = 0; i < nr_shadow; i++) + rcuja_shadow_unlock(shadow_nodes[i]); + return ret; +} + +static +int ja_unchain_node(struct cds_ja *ja, + struct cds_ja_inode_flag *parent_node_flag, + struct cds_ja_inode_flag **node_flag_ptr, + struct cds_ja_inode_flag *node_flag, + struct cds_ja_node *node) +{ + struct cds_ja_shadow_node *shadow_node; + struct cds_ja_node *iter_node, **iter_node_ptr, **prev_node_ptr = NULL; + int ret = 0, count = 0, found = 0; + + shadow_node = rcuja_shadow_lookup_lock(ja->ht, parent_node_flag); + if (!shadow_node) + return -EAGAIN; + if (ja_node_ptr(*node_flag_ptr) != ja_node_ptr(node_flag)) { + ret = -EAGAIN; + goto end; + } + /* + * Find the previous node's next pointer pointing to our node, + * so we can update it. Retry if another thread removed all but + * one of duplicates since check (this check was performed + * without lock). Ensure that the node we are about to remove is + * still in the list (while holding lock). No need for RCU + * traversal here since we hold the lock on the parent. + */ + iter_node_ptr = (struct cds_ja_node **) node_flag_ptr; + iter_node = (struct cds_ja_node *) ja_node_ptr(node_flag); + cds_ja_for_each_duplicate(iter_node) { + count++; + if (iter_node == node) { + prev_node_ptr = iter_node_ptr; + found++; + } + iter_node_ptr = &iter_node->next; + } + assert(found <= 1); + if (!found || count == 1) { + ret = -EAGAIN; + goto end; + } + CMM_STORE_SHARED(*prev_node_ptr, node->next); + /* + * Validate that we indeed removed the node from linked list. + */ + assert(ja_node_ptr(*node_flag_ptr) != (struct cds_ja_inode *) node); +end: + rcuja_shadow_unlock(shadow_node); + return ret; +} + +/* + * Called with RCU read lock held. + */ +int cds_ja_del(struct cds_ja *ja, uint64_t key, + struct cds_ja_node *node) +{ + unsigned int tree_depth, i; + struct cds_ja_inode_flag *snapshot[JA_MAX_DEPTH]; + struct cds_ja_inode_flag **snapshot_ptr[JA_MAX_DEPTH]; + uint8_t snapshot_n[JA_MAX_DEPTH]; + struct cds_ja_inode_flag *node_flag; + struct cds_ja_inode_flag **prev_node_flag_ptr, + **node_flag_ptr; + int nr_snapshot; + int ret; + + if (caa_unlikely(key > ja->key_max || key == UINT64_MAX)) + return -EINVAL; + tree_depth = ja->tree_depth; + +retry: + nr_snapshot = 0; + dbg_printf("cds_ja_del attempt: key %" PRIu64 ", node %p\n", + key, node); + + /* snapshot for level 0 is only for shadow node lookup */ + snapshot_n[0] = 0; + snapshot_n[1] = 0; + snapshot_ptr[nr_snapshot] = NULL; + snapshot[nr_snapshot++] = (struct cds_ja_inode_flag *) &ja->root; + node_flag = rcu_dereference(ja->root); + prev_node_flag_ptr = &ja->root; + node_flag_ptr = &ja->root; + + /* Iterate on all internal levels */ + for (i = 1; i < tree_depth; i++) { + uint8_t iter_key; + + dbg_printf("cds_ja_del iter node_flag %p\n", + node_flag); + if (!ja_node_ptr(node_flag)) { + return -ENOENT; + } + iter_key = (uint8_t) (key >> (JA_BITS_PER_BYTE * (tree_depth - i - 1))); + snapshot_n[nr_snapshot + 1] = iter_key; + snapshot_ptr[nr_snapshot] = prev_node_flag_ptr; + snapshot[nr_snapshot++] = node_flag; + node_flag = ja_node_get_nth(node_flag, + &node_flag_ptr, + iter_key); + if (node_flag) + prev_node_flag_ptr = node_flag_ptr; + dbg_printf("cds_ja_del iter key lookup %u finds node_flag %p, prev_node_flag_ptr %p\n", + (unsigned int) iter_key, node_flag, + prev_node_flag_ptr); + } + /* + * We reached bottom of tree, try to find the node we are trying + * to remove. Fail if we cannot find it. + */ + if (!ja_node_ptr(node_flag)) { + dbg_printf("cds_ja_del: no node found for key %" PRIu64 "\n", + key); + return -ENOENT; + } else { + struct cds_ja_node *iter_node, *match = NULL; + int count = 0; + + iter_node = (struct cds_ja_node *) ja_node_ptr(node_flag); + cds_ja_for_each_duplicate_rcu(iter_node) { + dbg_printf("cds_ja_del: compare %p with iter_node %p\n", node, iter_node); + if (iter_node == node) + match = iter_node; + count++; + } + + if (!match) { + dbg_printf("cds_ja_del: no node match for node %p key %" PRIu64 "\n", node, key); + return -ENOENT; + } + assert(count > 0); + if (count == 1) { + /* + * Removing last of duplicates. Last snapshot + * does not have a shadow node (external leafs). + */ + snapshot_ptr[nr_snapshot] = prev_node_flag_ptr; + snapshot[nr_snapshot++] = node_flag; + ret = ja_detach_node(ja, snapshot, snapshot_ptr, + snapshot_n, nr_snapshot, key, node); + } else { + ret = ja_unchain_node(ja, snapshot[nr_snapshot - 1], + node_flag_ptr, node_flag, match); + } + } + /* + * Explanation of -ENOENT handling: caused by concurrent delete + * between RCU lookup and actual removal. Need to re-do the + * lookup and removal attempt. + */ + if (ret == -EAGAIN || ret == -ENOENT) + goto retry; + return ret; +} + +struct cds_ja *_cds_ja_new(unsigned int key_bits, + const struct rcu_flavor_struct *flavor) +{ + struct cds_ja *ja; + int ret; + struct cds_ja_shadow_node *root_shadow_node; + + ja = calloc(sizeof(*ja), 1); + if (!ja) + goto ja_error; + + switch (key_bits) { + case 8: + case 16: + case 24: + case 32: + case 40: + case 48: + case 56: + ja->key_max = (1ULL << key_bits) - 1; + break; + case 64: + ja->key_max = UINT64_MAX; + break; + default: + goto check_error; + } + + /* ja->root is NULL */ + /* tree_depth 0 is for pointer to root node */ + ja->tree_depth = (key_bits >> JA_LOG2_BITS_PER_BYTE) + 1; + assert(ja->tree_depth <= JA_MAX_DEPTH); + ja->ht = rcuja_create_ht(flavor); + if (!ja->ht) + goto ht_error; + + /* + * Note: we should not free this node until judy array destroy. + */ + root_shadow_node = rcuja_shadow_set(ja->ht, + (struct cds_ja_inode_flag *) &ja->root, + NULL, ja, 0); + if (!root_shadow_node) { + ret = -ENOMEM; + goto ht_node_error; + } + + return ja; + +ht_node_error: + ret = rcuja_delete_ht(ja->ht); + assert(!ret); +ht_error: +check_error: + free(ja); +ja_error: + return NULL; +} + +static +void print_debug_fallback_distribution(struct cds_ja *ja) +{ + int i; + + fprintf(stderr, "Fallback node distribution:\n"); + for (i = 0; i < JA_ENTRY_PER_NODE; i++) { + if (!ja->node_fallback_count_distribution[i]) + continue; + fprintf(stderr, " %3u: %4lu\n", + i, ja->node_fallback_count_distribution[i]); + } +} + +static +int ja_final_checks(struct cds_ja *ja) +{ + double fallback_ratio; + unsigned long na, nf, nr_fallback; + int ret = 0; + + if (!ja_debug_counters()) + return 0; + + fallback_ratio = (double) uatomic_read(&ja->nr_fallback); + fallback_ratio /= (double) uatomic_read(&ja->nr_nodes_allocated); + nr_fallback = uatomic_read(&ja->nr_fallback); + if (nr_fallback) + fprintf(stderr, + "[warning] RCU Judy Array used %lu fallback node(s) (ratio: %g)\n", + uatomic_read(&ja->nr_fallback), + fallback_ratio); + + na = uatomic_read(&ja->nr_nodes_allocated); + nf = uatomic_read(&ja->nr_nodes_freed); + dbg_printf("Nodes allocated: %lu, Nodes freed: %lu.\n", na, nf); + if (nr_fallback) + print_debug_fallback_distribution(ja); + + if (na != nf) { + fprintf(stderr, "[error] Judy array leaked %ld nodes. Allocated: %lu, freed: %lu.\n", + (long) na - nf, na, nf); + ret = -1; + } + return ret; +} + +/* + * There should be no more concurrent add, delete, nor look-up performed + * on the Judy array while it is being destroyed (ensured by the + * caller). + */ +int cds_ja_destroy(struct cds_ja *ja) +{ + const struct rcu_flavor_struct *flavor; + int ret; + + flavor = cds_lfht_rcu_flavor(ja->ht); + rcuja_shadow_prune(ja->ht, + RCUJA_SHADOW_CLEAR_FREE_NODE | RCUJA_SHADOW_CLEAR_FREE_LOCK); + flavor->thread_offline(); + ret = rcuja_delete_ht(ja->ht); + if (ret) + return ret; + + /* Wait for in-flight call_rcu free to complete. */ + flavor->barrier(); + + flavor->thread_online(); + ret = ja_final_checks(ja); + free(ja); + return ret; +} diff --git a/tests/regression/Makefile.am b/tests/regression/Makefile.am index aa7a00c..c5c6198 100644 --- a/tests/regression/Makefile.am +++ b/tests/regression/Makefile.am @@ -11,7 +11,8 @@ noinst_PROGRAMS = test_urcu_fork \ rcutorture_urcu_signal \ rcutorture_urcu_mb \ rcutorture_urcu_bp \ - rcutorture_urcu_qsbr + rcutorture_urcu_qsbr \ + test_urcu_ja noinst_HEADERS = rcutorture.h @@ -47,6 +48,10 @@ rcutorture_urcu_bp_SOURCES = urcutorture.c rcutorture_urcu_bp_CFLAGS = -DRCU_BP $(AM_CFLAGS) rcutorture_urcu_bp_LDADD = $(URCU_BP_LIB) $(TAP_LIB) +test_urcu_ja_SOURCES = test_urcu_ja.c test_urcu_ja.h +test_urcu_ja_CFLAGS = -DRCU_MEMBARRIER $(AM_CFLAGS) +test_urcu_ja_LDADD = $(URCU_QSBR_LIB) $(URCU_CDS_LIB) $(TAP_LIB) + urcutorture.c: ../common/api.h all-local: diff --git a/tests/regression/test_urcu_ja.c b/tests/regression/test_urcu_ja.c new file mode 100644 index 0000000..0141648 --- /dev/null +++ b/tests/regression/test_urcu_ja.c @@ -0,0 +1,1339 @@ +/* + * test_urcu_ja.c + * + * Userspace RCU library - test program + * + * Copyright 2009-2012 - Mathieu Desnoyers + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include "test_urcu_ja.h" +#include "../common/debug-yield.h" +#include +#include + +DEFINE_URCU_TLS(unsigned int, rand_lookup); +DEFINE_URCU_TLS(unsigned long, nr_add); +DEFINE_URCU_TLS(unsigned long, nr_addexist); +DEFINE_URCU_TLS(unsigned long, nr_del); +DEFINE_URCU_TLS(unsigned long, nr_delnoent); +DEFINE_URCU_TLS(unsigned long, lookup_fail); +DEFINE_URCU_TLS(unsigned long, lookup_ok); + +struct cds_ja *test_ja; + +volatile int test_go, test_stop; + +unsigned long wdelay; + +unsigned long duration; + +/* read-side C.S. duration, in loops */ +unsigned long rduration; + +unsigned long init_populate; +int add_only; + +unsigned long init_pool_offset, lookup_pool_offset, write_pool_offset; +unsigned long init_pool_size = DEFAULT_RAND_POOL, + lookup_pool_size = DEFAULT_RAND_POOL, + write_pool_size = DEFAULT_RAND_POOL; +int validate_lookup; +int sanity_test; +unsigned int key_bits = 32; + +int count_pipe[2]; + +int verbose_mode; + +unsigned int cpu_affinities[NR_CPUS]; +unsigned int next_aff = 0; +int use_affinity = 0; + +pthread_mutex_t affinity_mutex = PTHREAD_MUTEX_INITIALIZER; + +DEFINE_URCU_TLS(unsigned long long, nr_writes); +DEFINE_URCU_TLS(unsigned long long, nr_reads); + +unsigned int nr_readers; +unsigned int nr_writers; + +static unsigned int add_ratio = 50; +static uint64_t key_mul = 1ULL; + +static int add_unique, add_replace; + +static pthread_mutex_t rcu_copy_mutex = PTHREAD_MUTEX_INITIALIZER; + +static int leak_detection; +static unsigned long test_nodes_allocated, test_nodes_freed; + +void set_affinity(void) +{ + cpu_set_t mask; + int cpu; + int ret; + + if (!use_affinity) + return; + +#if HAVE_SCHED_SETAFFINITY + ret = pthread_mutex_lock(&affinity_mutex); + if (ret) { + perror("Error in pthread mutex lock"); + exit(-1); + } + cpu = cpu_affinities[next_aff++]; + ret = pthread_mutex_unlock(&affinity_mutex); + if (ret) { + perror("Error in pthread mutex unlock"); + exit(-1); + } + CPU_ZERO(&mask); + CPU_SET(cpu, &mask); +#if SCHED_SETAFFINITY_ARGS == 2 + sched_setaffinity(0, &mask); +#else + sched_setaffinity(0, sizeof(mask), &mask); +#endif +#endif /* HAVE_SCHED_SETAFFINITY */ +} + +void rcu_copy_mutex_lock(void) +{ + int ret; + ret = pthread_mutex_lock(&rcu_copy_mutex); + if (ret) { + perror("Error in pthread mutex lock"); + exit(-1); + } +} + +void rcu_copy_mutex_unlock(void) +{ + int ret; + + ret = pthread_mutex_unlock(&rcu_copy_mutex); + if (ret) { + perror("Error in pthread mutex unlock"); + exit(-1); + } +} + +static +struct ja_test_node *node_alloc(void) +{ + struct ja_test_node *node; + + node = calloc(sizeof(*node), 1); + if (leak_detection && node) + uatomic_inc(&test_nodes_allocated); + return node; +} + +static +void free_test_node(struct ja_test_node *node) +{ + poison_free(node); + if (leak_detection) + uatomic_inc(&test_nodes_freed); +} + +static +void free_test_node_cb(struct rcu_head *head) +{ + struct ja_test_node *node = + caa_container_of(head, struct ja_test_node, head); + free_test_node(node); +} + +static +void rcu_free_test_node(struct ja_test_node *test_node) +{ + call_rcu(&test_node->head, free_test_node_cb); +} + +static +void free_node(struct cds_ja_node *node) +{ + struct ja_test_node *test_node = to_test_node(node); + + free_test_node(test_node); +} + +#if 0 +static +void test_delete_all_nodes(struct cds_lfht *ht) +{ + struct cds_lfht_iter iter; + struct lfht_test_node *node; + unsigned long count = 0; + + cds_lfht_for_each_entry(ht, &iter, node, node) { + int ret; + + ret = cds_lfht_del(test_ht, cds_lfht_iter_get_node(&iter)); + assert(!ret); + call_rcu(&node->head, free_node_cb); + count++; + } + printf("deleted %lu nodes.\n", count); +} +#endif + +void show_usage(int argc, char **argv) +{ + printf("Usage : %s nr_readers nr_writers duration (s)\n", argv[0]); +#ifdef DEBUG_YIELD + printf(" [-r] [-w] (yield reader and/or writer)\n"); +#endif + printf(" [-d delay] (writer period (us))\n"); + printf(" [-c duration] (reader C.S. duration (in loops))\n"); + printf(" [-v] (verbose output)\n"); + printf(" [-a cpu#] [-a cpu#]... (affinity)\n"); + printf(" [-u] Add unique keys.\n"); + printf(" [-s] Replace existing keys.\n"); +printf(" [not -u nor -s] Add entries (supports redundant keys).\n"); + printf(" [-r ratio] Add ratio (in %% of add+removal).\n"); + printf(" [-k] Populate init nodes.\n"); + printf(" [-R offset] Lookup pool offset.\n"); + printf(" [-S offset] Write pool offset.\n"); + printf(" [-T offset] Init pool offset.\n"); + printf(" [-M size] Lookup pool size.\n"); + printf(" [-N size] Write pool size.\n"); + printf(" [-O size] Init pool size.\n"); + printf(" [-V] Validate lookups of init values (use with filled init pool, same lookup range, with different write range).\n"); + printf(" [-t] Do sanity test.\n"); + printf(" [-B] Key bits for multithread test (default: 32).\n"); + printf(" [-m factor] Key multiplication factor.\n"); + printf(" [-l] Memory leak detection.\n"); + printf("\n\n"); +} + +static +int test_free_all_nodes(struct cds_ja *ja) +{ + uint64_t key; + struct cds_ja_node *ja_node; + int ret = 0; + + rcu_read_lock(); + cds_ja_for_each_key_rcu(test_ja, key, ja_node) { + struct cds_ja_node *tmp_node; + + cds_ja_for_each_duplicate_safe_rcu(ja_node, tmp_node) { + ret = cds_ja_del(test_ja, key, ja_node); + if (ret) { + fprintf(stderr, "Error (%d) removing node %" PRIu64 "\n", ret, key); + goto end; + } + /* Alone using Judy array, OK to free now */ + free_node(ja_node); + } + } +end: + rcu_read_unlock(); + return ret; +} + +static +int test_8bit_key(void) +{ + int ret, i; + uint64_t key; + uint64_t ka[] = { 5, 17, 100, 222 }; + uint64_t ka_test_offset = 5; + struct cds_ja_node *ja_node; + + /* Test with 8-bit key */ + test_ja = cds_ja_new(8); + if (!test_ja) { + printf("Error allocating judy array.\n"); + return -1; + } + + /* Add keys */ + printf("Test #1: add keys (8-bit).\n"); + for (key = 0; key < 200; key++) { + struct ja_test_node *node = node_alloc(); + + ja_test_node_init(node, key); + rcu_read_lock(); + ret = cds_ja_add(test_ja, key, &node->node); + rcu_read_unlock(); + if (ret) { + fprintf(stderr, "Error (%d) adding node %" PRIu64 "\n", + ret, key); + assert(0); + } + } + printf("OK\n"); + + printf("Test #2: successful key lookup (8-bit).\n"); + for (key = 0; key < 200; key++) { + rcu_read_lock(); + ja_node = cds_ja_lookup(test_ja, key); + if (!ja_node) { + fprintf(stderr, "Error lookup node %" PRIu64 "\n", key); + assert(0); + } + rcu_read_unlock(); + } + printf("OK\n"); + printf("Test #3: unsuccessful key lookup (8-bit).\n"); + for (key = 200; key < 240; key++) { + rcu_read_lock(); + ja_node = cds_ja_lookup(test_ja, key); + if (ja_node) { + fprintf(stderr, + "Error unexpected lookup node %" PRIu64 "\n", + key); + assert(0); + } + rcu_read_unlock(); + } + printf("OK\n"); + printf("Test #4: remove keys (8-bit).\n"); + for (key = 0; key < 200; key++) { + struct ja_test_node *node; + + rcu_read_lock(); + ja_node = cds_ja_lookup(test_ja, key); + if (!ja_node) { + fprintf(stderr, "Error lookup node %" PRIu64 "\n", key); + assert(0); + } + node = caa_container_of(ja_node, struct ja_test_node, node); + ret = cds_ja_del(test_ja, key, &node->node); + if (ret) { + fprintf(stderr, "Error (%d) removing node %" PRIu64 "\n", ret, key); + assert(0); + } + rcu_free_test_node(node); + ja_node = cds_ja_lookup(test_ja, key); + if (ja_node) { + fprintf(stderr, "Error lookup %" PRIu64 ": %p (after delete) failed. Node is not expected.\n", key, ja_node); + assert(0); + } + rcu_read_unlock(); + } + printf("OK\n"); + + printf("Test #5: lookup below/above equal (8-bit).\n"); + + for (i = 0; i < CAA_ARRAY_SIZE(ka); i++) { + struct ja_test_node *node = node_alloc(); + + key = ka[i]; + ja_test_node_init(node, key); + rcu_read_lock(); + ret = cds_ja_add(test_ja, key, &node->node); + rcu_read_unlock(); + if (ret) { + fprintf(stderr, "Error (%d) adding node %" PRIu64 "\n", + ret, key); + assert(0); + } + } + + for (i = 0; i < CAA_ARRAY_SIZE(ka); i++) { + struct ja_test_node *node; + uint64_t result_key; + + key = ka[i] + ka_test_offset; + rcu_read_lock(); + ja_node = cds_ja_lookup_below_equal(test_ja, key, &result_key); + if (!ja_node) { + fprintf(stderr, "Error lookup below equal. Cannot find expected key %" PRIu64" below or equal to %" PRIu64 ".\n", + ka[i], key); + assert(0); + } + node = caa_container_of(ja_node, struct ja_test_node, node); + if (node->key != ka[i] || result_key != ka[i]) { + fprintf(stderr, "Error lookup below equal. Expecting key %" PRIu64 " below or equal to %" PRIu64 ", but found %" PRIu64 "/%" PRIu64" instead.\n", + ka[i], key, node->key, result_key); + assert(0); + } + rcu_read_unlock(); + } + + for (i = 0; i < CAA_ARRAY_SIZE(ka); i++) { + struct ja_test_node *node; + uint64_t result_key; + + key = ka[i] - ka_test_offset; + rcu_read_lock(); + ja_node = cds_ja_lookup_above_equal(test_ja, key, &result_key); + if (!ja_node) { + fprintf(stderr, "Error lookup above equal. Cannot find expected key %" PRIu64" below or equal to %" PRIu64 ".\n", + ka[i], key); + assert(0); + } + node = caa_container_of(ja_node, struct ja_test_node, node); + if (node->key != ka[i] || result_key != ka[i]) { + fprintf(stderr, "Error lookup above equal. Expecting key %" PRIu64 " below or equal to %" PRIu64 ", but found %" PRIu64 "/%" PRIu64" instead.\n", + ka[i], key, node->key, result_key); + assert(0); + } + rcu_read_unlock(); + } + + for (i = 0; i < CAA_ARRAY_SIZE(ka); i++) { + struct ja_test_node *node; + uint64_t result_key; + + key = ka[i]; /* without offset */ + rcu_read_lock(); + ja_node = cds_ja_lookup_below_equal(test_ja, key, &result_key); + if (!ja_node) { + fprintf(stderr, "Error lookup below equal. Cannot find expected key %" PRIu64" below or equal to %" PRIu64 ".\n", + ka[i], key); + assert(0); + } + node = caa_container_of(ja_node, struct ja_test_node, node); + if (node->key != ka[i] || result_key != ka[i]) { + fprintf(stderr, "Error lookup below equal. Expecting key %" PRIu64 " below or equal to %" PRIu64 ", but found %" PRIu64 "/%" PRIu64" instead.\n", + ka[i], key, node->key, result_key); + assert(0); + } + + ja_node = cds_ja_lookup_above_equal(test_ja, key, &result_key); + if (!ja_node) { + fprintf(stderr, "Error lookup above equal. Cannot find expected key %" PRIu64" below or equal to %" PRIu64 ".\n", + ka[i], key); + assert(0); + } + node = caa_container_of(ja_node, struct ja_test_node, node); + if (node->key != ka[i] || result_key != ka[i]) { + fprintf(stderr, "Error lookup above equal. Expecting key %" PRIu64 " below or equal to %" PRIu64 ", but found %" PRIu64 "/%" PRIu64" instead.\n", + ka[i], key, node->key, result_key); + assert(0); + } + rcu_read_unlock(); + } + + printf("OK\n"); + + ret = test_free_all_nodes(test_ja); + if (ret) { + fprintf(stderr, "Error freeing all nodes\n"); + return -1; + } + + ret = cds_ja_destroy(test_ja); + if (ret) { + fprintf(stderr, "Error destroying judy array\n"); + return -1; + } + return 0; +} + +static +int test_16bit_key(void) +{ + int ret, i; + uint64_t key; + uint64_t ka[] = { 105, 206, 4000, 4111, 59990, 65435 }; + uint64_t ka_test_offset = 100; + struct cds_ja_node *ja_node; + + /* Test with 16-bit key */ + test_ja = cds_ja_new(16); + if (!test_ja) { + printf("Error allocating judy array.\n"); + return -1; + } + + /* Add keys */ + printf("Test #1: add keys (16-bit).\n"); + for (key = 0; key < 10000; key++) { + //for (key = 0; key < 65536; key+=256) { + struct ja_test_node *node = node_alloc(); + + ja_test_node_init(node, key); + rcu_read_lock(); + ret = cds_ja_add(test_ja, key, &node->node); + rcu_read_unlock(); + if (ret) { + fprintf(stderr, "Error (%d) adding node %" PRIu64 "\n", + ret, key); + assert(0); + } + } + printf("OK\n"); + + printf("Test #2: successful key lookup (16-bit).\n"); + for (key = 0; key < 10000; key++) { + //for (key = 0; key < 65536; key+=256) { + struct cds_ja_node *ja_node; + + rcu_read_lock(); + ja_node = cds_ja_lookup(test_ja, key); + if (!ja_node) { + fprintf(stderr, "Error lookup node %" PRIu64 "\n", key); + assert(0); + } + rcu_read_unlock(); + } + printf("OK\n"); + printf("Test #3: unsuccessful key lookup (16-bit).\n"); + for (key = 11000; key <= 11002; key++) { + struct cds_ja_node *ja_node; + + rcu_read_lock(); + ja_node = cds_ja_lookup(test_ja, key); + if (ja_node) { + fprintf(stderr, + "Error unexpected lookup node %" PRIu64 "\n", + key); + assert(0); + } + rcu_read_unlock(); + } + printf("OK\n"); + printf("Test #4: remove keys (16-bit).\n"); + for (key = 0; key < 10000; key++) { + //for (key = 0; key < 65536; key+=256) { + struct ja_test_node *node; + + rcu_read_lock(); + ja_node = cds_ja_lookup(test_ja, key); + if (!ja_node) { + fprintf(stderr, "Error lookup node %" PRIu64 "\n", key); + assert(0); + } + node = caa_container_of(ja_node, struct ja_test_node, node); + ret = cds_ja_del(test_ja, key, &node->node); + if (ret) { + fprintf(stderr, "Error (%d) removing node %" PRIu64 "\n", ret, key); + assert(0); + } + rcu_free_test_node(node); + ja_node = cds_ja_lookup(test_ja, key); + if (ja_node) { + fprintf(stderr, "Error lookup %" PRIu64 ": %p (after delete) failed. Node is not expected.\n", key, ja_node); + assert(0); + } + rcu_read_unlock(); + } + printf("OK\n"); + + printf("Test #5: lookup below/above equal (16-bit).\n"); + + for (i = 0; i < CAA_ARRAY_SIZE(ka); i++) { + struct ja_test_node *node = node_alloc(); + + key = ka[i]; + ja_test_node_init(node, key); + rcu_read_lock(); + ret = cds_ja_add(test_ja, key, &node->node); + rcu_read_unlock(); + if (ret) { + fprintf(stderr, "Error (%d) adding node %" PRIu64 "\n", + ret, key); + assert(0); + } + } + + for (i = 0; i < CAA_ARRAY_SIZE(ka); i++) { + struct ja_test_node *node; + uint64_t result_key; + + key = ka[i] + ka_test_offset; + rcu_read_lock(); + ja_node = cds_ja_lookup_below_equal(test_ja, key, &result_key); + if (!ja_node) { + fprintf(stderr, "Error lookup below equal. Cannot find expected key %" PRIu64" below or equal to %" PRIu64 ".\n", + ka[i], key); + assert(0); + } + node = caa_container_of(ja_node, struct ja_test_node, node); + if (node->key != ka[i] || result_key != ka[i]) { + fprintf(stderr, "Error lookup below equal. Expecting key %" PRIu64 " below or equal to %" PRIu64 ", but found %" PRIu64 "/%" PRIu64" instead.\n", + ka[i], key, node->key, result_key); + assert(0); + } + rcu_read_unlock(); + } + + for (i = 0; i < CAA_ARRAY_SIZE(ka); i++) { + struct ja_test_node *node; + uint64_t result_key; + + key = ka[i] - ka_test_offset; + rcu_read_lock(); + ja_node = cds_ja_lookup_above_equal(test_ja, key, &result_key); + if (!ja_node) { + fprintf(stderr, "Error lookup above equal. Cannot find expected key %" PRIu64" above or equal to %" PRIu64 ".\n", + ka[i], key); + assert(0); + } + node = caa_container_of(ja_node, struct ja_test_node, node); + if (node->key != ka[i] || result_key != ka[i]) { + fprintf(stderr, "Error lookup above equal. Expecting key %" PRIu64 " above or equal to %" PRIu64 ", but found %" PRIu64 "/%" PRIu64" instead.\n", + ka[i], key, node->key, result_key); + assert(0); + } + rcu_read_unlock(); + } + + for (i = 0; i < CAA_ARRAY_SIZE(ka); i++) { + struct ja_test_node *node; + uint64_t result_key; + + key = ka[i]; /* without offset */ + rcu_read_lock(); + ja_node = cds_ja_lookup_below_equal(test_ja, key, &result_key); + if (!ja_node) { + fprintf(stderr, "Error lookup below equal. Cannot find expected key %" PRIu64" below or equal to %" PRIu64 ".\n", + ka[i], key); + assert(0); + } + node = caa_container_of(ja_node, struct ja_test_node, node); + if (node->key != ka[i] || result_key != ka[i]) { + fprintf(stderr, "Error lookup below equal. Expecting key %" PRIu64 " below or equal to %" PRIu64 ", but found %" PRIu64 "/%" PRIu64" instead.\n", + ka[i], key, node->key, result_key); + assert(0); + } + + ja_node = cds_ja_lookup_above_equal(test_ja, key, &result_key); + if (!ja_node) { + fprintf(stderr, "Error lookup above equal. Cannot find expected key %" PRIu64" above or equal to %" PRIu64 ".\n", + ka[i], key); + assert(0); + } + node = caa_container_of(ja_node, struct ja_test_node, node); + if (node->key != ka[i] || result_key != ka[i]) { + fprintf(stderr, "Error lookup above equal. Expecting key %" PRIu64 " above or equal to %" PRIu64 ", but found %" PRIu64 "/%" PRIu64" instead.\n", + ka[i], key, node->key, result_key); + assert(0); + } + rcu_read_unlock(); + } + + printf("OK\n"); + + ret = test_free_all_nodes(test_ja); + if (ret) { + fprintf(stderr, "Error freeing all nodes\n"); + return -1; + } + + ret = cds_ja_destroy(test_ja); + if (ret) { + fprintf(stderr, "Error destroying judy array\n"); + return -1; + } + return 0; +} + +/* + * nr_dup is number of nodes per key. + */ +static +int test_sparse_key(unsigned int bits, int nr_dup) +{ + uint64_t key, max_key; + int zerocount, i, ret; + struct cds_ja_node *ja_node; + + if (bits == 64) + max_key = UINT64_MAX; + else + max_key = (1ULL << bits) - 1; + + printf("Sparse key test begins for %u-bit keys\n", bits); + /* Test with 16-bit key */ + test_ja = cds_ja_new(bits); + if (!test_ja) { + printf("Error allocating judy array.\n"); + return -1; + } + + /* Add keys */ + printf("Test #1: add keys (%u-bit).\n", bits); + for (i = 0; i < nr_dup; i++) { + zerocount = 0; + for (key = 0; key <= max_key && (key != 0 || zerocount < 1); key += 1ULL << (bits - 8)) { + struct ja_test_node *node = node_alloc(); + + ja_test_node_init(node, key); + rcu_read_lock(); + ret = cds_ja_add(test_ja, key, &node->node); + rcu_read_unlock(); + if (ret) { + fprintf(stderr, "Error (%d) adding node %" PRIu64 "\n", + ret, key); + assert(0); + } + if (key == 0) + zerocount++; + } + } + printf("OK\n"); + + printf("Test #2: successful key lookup (%u-bit).\n", bits); + zerocount = 0; + for (key = 0; key <= max_key && (key != 0 || zerocount < 1); key += 1ULL << (bits - 8)) { + struct ja_test_node *node; + int count = 0; + + rcu_read_lock(); + ja_node = cds_ja_lookup(test_ja, key); + if (!ja_node) { + fprintf(stderr, "Error lookup node %" PRIu64 "\n", key); + assert(0); + } + cds_ja_for_each_duplicate_rcu(ja_node) { + count++; + } + if (count != nr_dup) { + fprintf(stderr, "Unexpected number of match for key %" PRIu64 ", expected %d, got %d.\n", key, nr_dup, count); + } + rcu_read_unlock(); + if (key == 0) + zerocount++; + } + printf("OK\n"); + if (bits > 8) { + printf("Test #3: unsuccessful key lookup (%u-bit).\n", bits); + zerocount = 0; + for (key = 0; key <= max_key && (key != 0 || zerocount < 1); key += 1ULL << (bits - 8)) { + rcu_read_lock(); + ja_node = cds_ja_lookup(test_ja, key + 42); + if (ja_node) { + fprintf(stderr, + "Error unexpected lookup node %" PRIu64 "\n", + key + 42); + assert(0); + } + rcu_read_unlock(); + if (key == 0) + zerocount++; + } + printf("OK\n"); + } + printf("Test #4: remove keys (%u-bit).\n", bits); + zerocount = 0; + for (key = 0; key <= max_key && (key != 0 || zerocount < 1); key += 1ULL << (bits - 8)) { + int count = 0; + + rcu_read_lock(); + ja_node = cds_ja_lookup(test_ja, key); + + cds_ja_for_each_duplicate_rcu(ja_node) { + struct cds_ja_node *test_ja_node; + struct ja_test_node *node; + + count++; + node = caa_container_of(ja_node, + struct ja_test_node, node); + ret = cds_ja_del(test_ja, key, &node->node); + if (ret) { + fprintf(stderr, "Error (%d) removing node %" PRIu64 "\n", ret, key); + assert(0); + } + rcu_free_test_node(node); + test_ja_node = cds_ja_lookup(test_ja, key); + if (count < nr_dup && !test_ja_node) { + fprintf(stderr, "Error: no node found after deletion of some nodes of a key\n"); + assert(0); + } + } + ja_node = cds_ja_lookup(test_ja, key); + if (ja_node) { + fprintf(stderr, "Error lookup %" PRIu64 ": %p (after delete) failed. Node is not expected.\n", key, ja_node); + assert(0); + } + rcu_read_unlock(); + if (key == 0) + zerocount++; + } + printf("OK\n"); + + ret = test_free_all_nodes(test_ja); + if (ret) { + fprintf(stderr, "Error freeing all nodes\n"); + return -1; + } + + ret = cds_ja_destroy(test_ja); + if (ret) { + fprintf(stderr, "Error destroying judy array\n"); + return -1; + } + printf("Test ends\n"); + + return 0; +} + +static +int do_sanity_test(void) +{ + int i, j, ret; + + printf("Sanity test start.\n"); + + for (i = 0; i < 3; i++) { + ret = test_8bit_key(); + if (ret) { + return ret; + } + rcu_quiescent_state(); + } + ret = test_16bit_key(); + if (ret) { + return ret; + } + rcu_quiescent_state(); + + /* key bits */ + for (i = 8; i <= 64; i *= 2) { + /* nr of nodes per key */ + for (j = 1; j < 4; j++) { + ret = test_sparse_key(i, j); + if (ret) { + return ret; + } + rcu_quiescent_state(); + } + } + printf("Sanity test end.\n"); + + return 0; +} + +enum urcu_ja_addremove { + AR_RANDOM = 0, + AR_ADD = 1, + AR_REMOVE = -1, +}; /* 1: add, -1 remove, 0: random */ + +static enum urcu_ja_addremove addremove; /* 1: add, -1 remove, 0: random */ + +static +void test_ja_rw_sigusr1_handler(int signo) +{ + switch (addremove) { + case AR_ADD: + printf("Add/Remove: random.\n"); + addremove = AR_RANDOM; + break; + case AR_RANDOM: + printf("Add/Remove: remove only.\n"); + addremove = AR_REMOVE; + break; + case AR_REMOVE: + printf("Add/Remove: add only.\n"); + addremove = AR_ADD; + break; + } +} + +static +void *test_ja_rw_thr_reader(void *_count) +{ + unsigned long long *count = _count; + struct cds_ja_node *ja_node; + uint64_t key; + + printf_verbose("thread_begin %s, tid %lu\n", + "reader", urcu_get_thread_id()); + + URCU_TLS(rand_lookup) = urcu_get_thread_id() ^ time(NULL); + + set_affinity(); + + rcu_register_thread(); + + while (!test_go) + { + } + cmm_smp_mb(); + + for (;;) { + rcu_read_lock(); + + /* note: only looking up ulong keys */ + key = ((unsigned long) rand_r(&URCU_TLS(rand_lookup)) % lookup_pool_size) + lookup_pool_offset; + key *= key_mul; + ja_node = cds_ja_lookup(test_ja, key); + if (!ja_node) { + if (validate_lookup) { + printf("[ERROR] Lookup cannot find initial node.\n"); + exit(-1); + } + URCU_TLS(lookup_fail)++; + } else { + URCU_TLS(lookup_ok)++; + } + rcu_debug_yield_read(); + if (caa_unlikely(rduration)) + loop_sleep(rduration); + rcu_read_unlock(); + URCU_TLS(nr_reads)++; + if (caa_unlikely(!test_duration_read())) + break; + if (caa_unlikely((URCU_TLS(nr_reads) & ((1 << 10) - 1)) == 0)) + rcu_quiescent_state(); + } + + rcu_unregister_thread(); + + *count = URCU_TLS(nr_reads); + printf_verbose("thread_end %s, tid %lu\n", + "reader", urcu_get_thread_id()); + printf_verbose("readid : %lx, lookupfail %lu, lookupok %lu\n", + pthread_self(), URCU_TLS(lookup_fail), + URCU_TLS(lookup_ok)); + return ((void*)1); +} + +static +int is_add(void) +{ + return ((unsigned int) rand_r(&URCU_TLS(rand_lookup)) % 100) < add_ratio; +} + +static +void *test_ja_rw_thr_writer(void *_count) +{ + struct wr_count *count = _count; + uint64_t key; + int ret; + + printf_verbose("thread_begin %s, tid %lu\n", + "writer", urcu_get_thread_id()); + + URCU_TLS(rand_lookup) = urcu_get_thread_id() ^ time(NULL); + + set_affinity(); + + rcu_register_thread(); + + while (!test_go) + { + } + cmm_smp_mb(); + + for (;;) { + if ((addremove == AR_ADD) + || (addremove == AR_RANDOM && is_add())) { + struct ja_test_node *node = node_alloc(); + struct cds_ja_node *ret_node; + + /* note: only inserting ulong keys */ + key = ((unsigned long) rand_r(&URCU_TLS(rand_lookup)) % write_pool_size) + write_pool_offset; + key *= key_mul; + ja_test_node_init(node, key); + rcu_read_lock(); + if (add_unique) { + ret_node = cds_ja_add_unique(test_ja, key, &node->node); + if (ret_node != &node->node) { + free_test_node(node); + URCU_TLS(nr_addexist)++; + } else { + URCU_TLS(nr_add)++; + } + } else if (add_replace) { + assert(0); /* not implemented yet. */ + } else { + ret = cds_ja_add(test_ja, key, &node->node); + if (ret) { + fprintf(stderr, "Error in cds_ja_add: %d\n", ret); + free_test_node(node); + } else { + URCU_TLS(nr_add)++; + } + } + rcu_read_unlock(); + } else { + struct cds_ja_node *ja_node; + struct ja_test_node *node; + + /* May delete */ + /* note: only deleting ulong keys */ + key = ((unsigned long) rand_r(&URCU_TLS(rand_lookup)) % write_pool_size) + write_pool_offset; + key *= key_mul; + + rcu_read_lock(); + + ja_node = cds_ja_lookup(test_ja, key); + /* Remove first entry */ + if (ja_node) { + node = caa_container_of(ja_node, + struct ja_test_node, node); + ret = cds_ja_del(test_ja, key, &node->node); + if (!ret) { + rcu_free_test_node(node); + URCU_TLS(nr_del)++; + } else { + URCU_TLS(nr_delnoent)++; + } + } else { + URCU_TLS(nr_delnoent)++; + } + rcu_read_unlock(); + } + + URCU_TLS(nr_writes)++; + if (caa_unlikely(!test_duration_write())) + break; + if (caa_unlikely(wdelay)) + loop_sleep(wdelay); + if (caa_unlikely((URCU_TLS(nr_writes) & ((1 << 10) - 1)) == 0)) + rcu_quiescent_state(); + } + + rcu_unregister_thread(); + + printf_verbose("thread_end %s, tid %lu\n", + "writer", urcu_get_thread_id()); + printf_verbose("info id %lx: nr_add %lu, nr_addexist %lu, nr_del %lu, " + "nr_delnoent %lu\n", pthread_self(), URCU_TLS(nr_add), + URCU_TLS(nr_addexist), URCU_TLS(nr_del), + URCU_TLS(nr_delnoent)); + count->update_ops = URCU_TLS(nr_writes); + count->add = URCU_TLS(nr_add); + count->add_exist = URCU_TLS(nr_addexist); + count->remove = URCU_TLS(nr_del); + return ((void*)2); +} + +static +int do_mt_populate_ja(void) +{ + uint64_t iter; + int ret; + + if (!init_populate) + return 0; + + printf("Starting rw test\n"); + + for (iter = init_pool_offset; iter < init_pool_offset + init_pool_size; iter++) { + struct ja_test_node *node = node_alloc(); + uint64_t key; + + /* note: only inserting ulong keys */ + key = (unsigned long) iter; + key *= key_mul; + ja_test_node_init(node, key); + rcu_read_lock(); + ret = cds_ja_add(test_ja, key, &node->node); + URCU_TLS(nr_add)++; + URCU_TLS(nr_writes)++; + rcu_read_unlock(); + /* Hash table resize only occurs in call_rcu thread */ + if (!(iter % 100)) + rcu_quiescent_state(); + if (ret) { + fprintf(stderr, "Error (%d) adding node %" PRIu64 "\n", + ret, key); + assert(0); + } + } + return 0; +} + +static +int do_mt_test(void) +{ + pthread_t *tid_reader, *tid_writer; + void *tret; + int ret, i, err; + unsigned long long *count_reader; + struct wr_count *count_writer; + unsigned long long tot_reads = 0, tot_writes = 0, + tot_add = 0, tot_add_exist = 0, tot_remove = 0; + unsigned int remain; + + tid_reader = malloc(sizeof(*tid_reader) * nr_readers); + tid_writer = malloc(sizeof(*tid_writer) * nr_writers); + count_reader = malloc(sizeof(*count_reader) * nr_readers); + count_writer = malloc(sizeof(*count_writer) * nr_writers); + + printf("Allocating Judy Array for %u-bit keys\n", key_bits); + test_ja = cds_ja_new(key_bits); + if (!test_ja) { + printf("Error allocating judy array.\n"); + ret = -1; + goto end; + } + + do_mt_populate_ja(); + + next_aff = 0; + + for (i = 0; i < nr_readers; i++) { + err = pthread_create(&tid_reader[i], + NULL, test_ja_rw_thr_reader, + &count_reader[i]); + if (err != 0) + exit(1); + } + for (i = 0; i < nr_writers; i++) { + err = pthread_create(&tid_writer[i], + NULL, test_ja_rw_thr_writer, + &count_writer[i]); + if (err != 0) + exit(1); + } + + cmm_smp_mb(); + + test_go = 1; + + rcu_thread_offline_qsbr(); + + remain = duration; + do { + remain = sleep(remain); + } while (remain > 0); + + test_stop = 1; + + for (i = 0; i < nr_readers; i++) { + err = pthread_join(tid_reader[i], &tret); + if (err != 0) + exit(1); + tot_reads += count_reader[i]; + } + for (i = 0; i < nr_writers; i++) { + err = pthread_join(tid_writer[i], &tret); + if (err != 0) + exit(1); + tot_writes += count_writer[i].update_ops; + tot_add += count_writer[i].add; + tot_add_exist += count_writer[i].add_exist; + tot_remove += count_writer[i].remove; + } + rcu_thread_online_qsbr(); + + ret = test_free_all_nodes(test_ja); + if (ret) { + fprintf(stderr, "Error freeing all nodes\n"); + return -1; + } + + ret = cds_ja_destroy(test_ja); + if (ret) { + fprintf(stderr, "Error destroying judy array\n"); + goto end; + } + + free(tid_reader); + free(tid_writer); + free(count_reader); + free(count_writer); + ret = 0; +end: + return ret; +} + +static +int check_memory_leaks(void) +{ + unsigned long na, nf; + + na = uatomic_read(&test_nodes_allocated); + nf = uatomic_read(&test_nodes_freed); + if (na != nf) { + fprintf(stderr, "Memory leak of %ld test nodes detected. Allocated: %lu, freed: %lu\n", + na - nf, na, nf); + return -1; + } + return 0; +} + +int main(int argc, char **argv) +{ + int i, j, a, ret, err; + uint64_t key; + struct sigaction act; + + if (argc < 4) { + show_usage(argc, argv); + return -1; + } + + err = sscanf(argv[1], "%u", &nr_readers); + if (err != 1) { + show_usage(argc, argv); + return -1; + } + + err = sscanf(argv[2], "%u", &nr_writers); + if (err != 1) { + show_usage(argc, argv); + return -1; + } + + err = sscanf(argv[3], "%lu", &duration); + if (err != 1) { + show_usage(argc, argv); + return -1; + } + + for (i = 4; i < argc; i++) { + if (argv[i][0] != '-') + continue; + switch (argv[i][1]) { +#ifdef DEBUG_YIELD + case 'r': + yield_active |= YIELD_READ; + break; + case 'w': + yield_active |= YIELD_WRITE; + break; +#endif + case 'a': + if (argc < i + 2) { + show_usage(argc, argv); + return -1; + } + a = atoi(argv[++i]); + cpu_affinities[next_aff++] = a; + use_affinity = 1; + printf_verbose("Adding CPU %d affinity\n", a); + break; + case 'c': + if (argc < i + 2) { + show_usage(argc, argv); + return -1; + } + rduration = atol(argv[++i]); + break; + case 'd': + if (argc < i + 2) { + show_usage(argc, argv); + return -1; + } + wdelay = atol(argv[++i]); + break; + case 'v': + verbose_mode = 1; + break; + case 'r': + add_ratio = atoi(argv[++i]); + break; + case 'k': + init_populate = 1; + break; + case 'R': + lookup_pool_offset = atol(argv[++i]); + break; + case 'S': + write_pool_offset = atol(argv[++i]); + break; + case 'T': + init_pool_offset = atol(argv[++i]); + break; + case 'M': + lookup_pool_size = atol(argv[++i]); + break; + case 'N': + write_pool_size = atol(argv[++i]); + break; + case 'O': + init_pool_size = atol(argv[++i]); + break; + case 'V': + validate_lookup = 1; + break; + case 't': + sanity_test = 1; + break; + case 'B': + key_bits = atol(argv[++i]); + break; + case 'm': + key_mul = atoll(argv[++i]); + break; + case 'u': + add_unique = 1; + break; + case 's': + add_replace = 1; + break; + case 'l': + leak_detection = 1; + break; + } + } + + printf_verbose("running test for %lu seconds, %u readers, %u writers.\n", + duration, nr_readers, nr_writers); + printf_verbose("Writer delay : %lu loops.\n", wdelay); + printf_verbose("Reader duration : %lu loops.\n", rduration); + printf_verbose("Add ratio: %u%%.\n", add_ratio); + printf_verbose("Mode:%s%s.\n", + " add/remove", + add_unique ? " uniquify" : ( add_replace ? " replace" : " insert")); + printf_verbose("Key multiplication factor: %" PRIu64 ".\n", key_mul); + printf_verbose("Init pool size offset %lu size %lu.\n", + init_pool_offset, init_pool_size); + printf_verbose("Lookup pool size offset %lu size %lu.\n", + lookup_pool_offset, lookup_pool_size); + printf_verbose("Update pool size offset %lu size %lu.\n", + write_pool_offset, write_pool_size); + if (validate_lookup) + printf_verbose("Validating lookups.\n"); + if (leak_detection) + printf_verbose("Memory leak dection activated.\n"); + printf_verbose("thread %-6s, tid %lu\n", + "main", urcu_get_thread_id()); + + memset(&act, 0, sizeof(act)); + ret = sigemptyset(&act.sa_mask); + if (ret == -1) { + perror("sigemptyset"); + return -1; + } + act.sa_handler = test_ja_rw_sigusr1_handler; + act.sa_flags = SA_RESTART; + ret = sigaction(SIGUSR1, &act, NULL); + if (ret == -1) { + perror("sigaction"); + return -1; + } + + err = create_all_cpu_call_rcu_data(0); + if (err) { + printf("Per-CPU call_rcu() worker threads unavailable. Using default global worker thread.\n"); + } + + rcu_register_thread(); + + if (sanity_test) { + ret = do_sanity_test(); + } else { + ret = do_mt_test(); + } + + /* Wait for in-flight call_rcu free to complete for leak detection */ + rcu_barrier(); + + ret |= check_memory_leaks(); + + rcu_unregister_thread(); + free_all_cpu_call_rcu_data(); + + if (ret) { + printf("Test ended with error: %d\n", ret); + } + return ret; +} diff --git a/tests/regression/test_urcu_ja.h b/tests/regression/test_urcu_ja.h new file mode 100644 index 0000000..092945d --- /dev/null +++ b/tests/regression/test_urcu_ja.h @@ -0,0 +1,175 @@ +#ifndef _TEST_URCU_JA_H +#define _TEST_URCU_JA_H + +/* + * test_urcu_ja.h + * + * Userspace RCU library - test program + * + * Copyright 2009-2012 - Mathieu Desnoyers + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "thread-id.h" + +#define DEFAULT_RAND_POOL 1000000 + +/* Make this big enough to include the POWER5+ L3 cacheline size of 256B */ +#define CACHE_LINE_SIZE 4096 + +/* hardcoded number of CPUs */ +#define NR_CPUS 16384 + +#ifdef POISON_FREE +#define poison_free(ptr) \ + do { \ + memset(ptr, 0x42, sizeof(*(ptr))); \ + free(ptr); \ + } while (0) +#else +#define poison_free(ptr) free(ptr) +#endif + +#ifndef DYNAMIC_LINK_TEST +#define _LGPL_SOURCE +#else +#define debug_yield_read() +#endif +#include +#include +#include + +struct wr_count { + unsigned long update_ops; + unsigned long add; + unsigned long add_exist; + unsigned long remove; +}; + +extern DECLARE_URCU_TLS(unsigned int, rand_lookup); +extern DECLARE_URCU_TLS(unsigned long, nr_add); +extern DECLARE_URCU_TLS(unsigned long, nr_addexist); +extern DECLARE_URCU_TLS(unsigned long, nr_del); +extern DECLARE_URCU_TLS(unsigned long, nr_delnoent); +extern DECLARE_URCU_TLS(unsigned long, lookup_fail); +extern DECLARE_URCU_TLS(unsigned long, lookup_ok); + +extern struct cds_ja *test_ja; + +struct ja_test_node { + struct cds_ja_node node; + uint64_t key; /* for testing */ + struct rcu_head head; /* delayed reclaim */ +}; + +static inline struct ja_test_node * +to_test_node(struct cds_ja_node *node) +{ + return caa_container_of(node, struct ja_test_node, node); +} + +static inline +void ja_test_node_init(struct ja_test_node *node, uint64_t key) +{ + cds_ja_node_init(&node->node); + node->key = key; +} + +extern volatile int test_go, test_stop; + +extern unsigned long wdelay; + +extern unsigned long duration; + +/* read-side C.S. duration, in loops */ +extern unsigned long rduration; + +extern unsigned long init_populate; +extern int add_only; + +extern unsigned long init_pool_offset, lookup_pool_offset, write_pool_offset; +extern unsigned long init_pool_size, + lookup_pool_size, + write_pool_size; +extern int validate_lookup; + +extern int count_pipe[2]; + +static inline void loop_sleep(unsigned long l) +{ + while(l-- != 0) + caa_cpu_relax(); +} + +extern int verbose_mode; + +#define printf_verbose(fmt, args...) \ + do { \ + if (verbose_mode) \ + printf(fmt, ## args); \ + } while (0) + +extern unsigned int cpu_affinities[NR_CPUS]; +extern unsigned int next_aff; +extern int use_affinity; + +extern pthread_mutex_t affinity_mutex; + +#ifndef HAVE_CPU_SET_T +typedef unsigned long cpu_set_t; +# define CPU_ZERO(cpuset) do { *(cpuset) = 0; } while(0) +# define CPU_SET(cpu, cpuset) do { *(cpuset) |= (1UL << (cpu)); } while(0) +#endif + +void set_affinity(void); + +/* + * returns 0 if test should end. + */ +static inline int test_duration_write(void) +{ + return !test_stop; +} + +static inline int test_duration_read(void) +{ + return !test_stop; +} + +extern DECLARE_URCU_TLS(unsigned long long, nr_writes); +extern DECLARE_URCU_TLS(unsigned long long, nr_reads); + +extern unsigned int nr_readers; +extern unsigned int nr_writers; + +void rcu_copy_mutex_lock(void); +void rcu_copy_mutex_unlock(void); + +#endif /* _TEST_URCU_JA_H */ diff --git a/tests/utils/Makefile.am b/tests/utils/Makefile.am index 440ac38..a26b038 100644 --- a/tests/utils/Makefile.am +++ b/tests/utils/Makefile.am @@ -3,6 +3,9 @@ AM_CFLAGS = -I$(top_srcdir)/include -I$(top_builddir)/include -I$(top_srcdir)/sr noinst_LIBRARIES = libtap.a libtap_a_SOURCES = tap.c tap.h +noinst_PROGRAMS = rcuja-testpop +rcuja_testpop_SOURCES = rcuja-testpop.c + SCRIPT_LIST = tap.sh dist_noinst_SCRIPTS = $(SCRIPT_LIST) diff --git a/tests/utils/rcuja-testpop.c b/tests/utils/rcuja-testpop.c new file mode 100644 index 0000000..76dbb03 --- /dev/null +++ b/tests/utils/rcuja-testpop.c @@ -0,0 +1,333 @@ +/* + * rcuja/testpop.c + * + * Userspace RCU library - RCU Judy Array population size test + * + * Copyright 2012-2013 - Mathieu Desnoyers + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * This program generates random populations, and shows the largest + * sub-class generated, as well as the distribution of sub-class size + * for the largest sub-class of each population. + */ + +#include +#include +#include +#include +#include +#include +#include + +static int sel_pool_len = 50; /* default */ +static int nr_distrib = 2; /* default */ +//#define SEL_POOL_LEN 100 +//#define NR_POOLS 10000000ULL + +static uint8_t pool[256]; +static uint8_t nr_one[8]; +static uint8_t nr_2d_11[8][8]; +static uint8_t nr_2d_10[8][8]; +static uint8_t nr_2d_01[8][8]; +static uint8_t nr_2d_00[8][8]; +static int global_max_minsubclass_len = 0; + +static unsigned int subclass_len_distrib[256]; + +static int verbose; + +static +uint8_t random_char(void) +{ + return (uint8_t) random(); +} + +static +void print_pool(void) +{ + int i; + + printf("pool: "); + for (i = 0; i < sel_pool_len; i++) { + printf("%d ", (int) pool[i]); + } + printf("\n"); +} + +static +void gen_pool(void) +{ + uint8_t src_pool[256]; + int i; + int nr_left = 256; + + memset(pool, 0, sizeof(pool)); + for (i = 0; i < 256; i++) + src_pool[i] = (uint8_t) i; + for (i = 0; i < sel_pool_len; i++) { + int sel; + + sel = random_char() % nr_left; + pool[i] = src_pool[sel]; + src_pool[sel] = src_pool[nr_left - 1]; + nr_left--; + } +} + +static +void count_pool(void) +{ + int i; + + memset(nr_one, 0, sizeof(nr_one)); + memset(nr_2d_11, 0, sizeof(nr_2d_11)); + memset(nr_2d_10, 0, sizeof(nr_2d_10)); + memset(nr_2d_01, 0, sizeof(nr_2d_01)); + memset(nr_2d_00, 0, sizeof(nr_2d_00)); + + for (i = 0; i < sel_pool_len; i++) { + if (nr_distrib == 2) { + int j; + + for (j = 0; j < 8; j++) { + if (pool[i] & (1U << j)) + nr_one[j]++; + } + } + + if (nr_distrib == 4) { + int bit_i, bit_j; + + for (bit_i = 0; bit_i < 8; bit_i++) { + for (bit_j = 0; bit_j < bit_i; bit_j++) { + if (pool[i] & (1U << bit_i)) { + if (pool[i] & (1U << bit_j)) { + nr_2d_11[bit_i][bit_j]++; + } else { + nr_2d_10[bit_i][bit_j]++; + } + } else { + if (pool[i] & (1U << bit_j)) { + nr_2d_01[bit_i][bit_j]++; + } else { + nr_2d_00[bit_i][bit_j]++; + } + } + } + } + } + } +} + +static +void print_count(void) +{ + int i; + + printf("pool distribution:\n"); + + if (nr_distrib == 2) { + printf(" 0 1\n"); + printf("----------\n"); + for (i = 0; i < 8; i++) { + printf("%3d %3d\n", + sel_pool_len - nr_one[i], nr_one[i]); + } + } + + if (nr_distrib == 4) { + /* TODO */ + } + printf("\n"); +} + +static +void stat_count_1d(void) +{ + unsigned int overall_best_distance = UINT_MAX; + unsigned int overall_minsubclass_len; + int i; + + for (i = 0; i < 8; i++) { + int distance_to_best; + + distance_to_best = ((unsigned int) nr_one[i] << 1U) - sel_pool_len; + if (distance_to_best < 0) + distance_to_best = -distance_to_best; + if (distance_to_best < overall_best_distance) { + overall_best_distance = distance_to_best; + } + } + overall_minsubclass_len = (overall_best_distance + sel_pool_len) >> 1UL; + if (overall_minsubclass_len > global_max_minsubclass_len) { + global_max_minsubclass_len = overall_minsubclass_len; + } + subclass_len_distrib[overall_minsubclass_len]++; +} + +static +void stat_count_2d(void) +{ + int overall_best_distance = INT_MAX; + unsigned int overall_minsubclass_len = 0; + int bit_i, bit_j; + + for (bit_i = 0; bit_i < 8; bit_i++) { + for (bit_j = 0; bit_j < bit_i; bit_j++) { + int distance_to_best[4], subclass_len[4]; + + distance_to_best[0] = ((unsigned int) nr_2d_11[bit_i][bit_j] << 2U) - sel_pool_len; + distance_to_best[1] = ((unsigned int) nr_2d_10[bit_i][bit_j] << 2U) - sel_pool_len; + distance_to_best[2] = ((unsigned int) nr_2d_01[bit_i][bit_j] << 2U) - sel_pool_len; + distance_to_best[3] = ((unsigned int) nr_2d_00[bit_i][bit_j] << 2U) - sel_pool_len; + + subclass_len[0] = nr_2d_11[bit_i][bit_j]; + subclass_len[1] = nr_2d_10[bit_i][bit_j]; + subclass_len[2] = nr_2d_01[bit_i][bit_j]; + subclass_len[3] = nr_2d_00[bit_i][bit_j]; + + /* Consider worse distance above best */ + if (distance_to_best[1] > 0 && distance_to_best[1] > distance_to_best[0]) { + distance_to_best[0] = distance_to_best[1]; + subclass_len[0] = subclass_len[1]; + } + if (distance_to_best[2] > 0 && distance_to_best[2] > distance_to_best[0]) { + distance_to_best[0] = distance_to_best[2]; + subclass_len[0] = subclass_len[2]; + } + if (distance_to_best[3] > 0 && distance_to_best[3] > distance_to_best[0]) { + distance_to_best[0] = distance_to_best[3]; + subclass_len[0] = subclass_len[3]; + } + + /* + * If our worse distance is better than overall, + * we become new best candidate. + */ + if (distance_to_best[0] < overall_best_distance) { + overall_best_distance = distance_to_best[0]; + overall_minsubclass_len = subclass_len[0]; + } + } + } + if (overall_minsubclass_len > global_max_minsubclass_len) { + global_max_minsubclass_len = overall_minsubclass_len; + } + subclass_len_distrib[overall_minsubclass_len]++; +} + +static +void stat_count(void) +{ + switch (nr_distrib) { + case 2: + stat_count_1d(); + break; + case 4: + stat_count_2d(); + break; + default: + assert(0); + break; + } +} + +static +void print_distrib(void) +{ + int i; + unsigned long long tot = 0; + + for (i = 0; i < 256; i++) { + tot += subclass_len_distrib[i]; + } + if (tot == 0) + return; + printf("Distribution:\n"); + for (i = 0; i < 256; i++) { + if (!subclass_len_distrib[i]) + continue; + printf("(%u, %u, %llu%%) ", + i, subclass_len_distrib[i], + 100 * (unsigned long long) subclass_len_distrib[i] / tot); + } + printf("\n"); +} + +static +void print_stat(uint64_t i) +{ + printf("after %llu pools, global_max_minsubclass_len: %d\n", + (unsigned long long) i, global_max_minsubclass_len); + print_distrib(); +} + +int main(int argc, char **argv) +{ + uint64_t i = 0; + + srandom(time(NULL)); + + if (argc > 1) { + sel_pool_len = atoi(argv[1]); + if (sel_pool_len > 256 || sel_pool_len < 1) { + printf("Wrong pool len\n"); + return -1; + } + } + printf("pool len: %d\n", sel_pool_len); + + if (argc > 2) { + nr_distrib = atoi(argv[2]); + if (nr_distrib > 256 || nr_distrib < 1) { + printf("Wrong number of distributions\n"); + return -1; + } + } + + if (argc > 3) { + if (!strcmp(argv[3], "-v")) { + verbose = 1; + } + } + + printf("pool distributions: %d\n", nr_distrib); + + if (nr_distrib != 2 && nr_distrib != 4) { + printf("Wrong number of distributions. Only 2 and 4 supported.\n"); + return -1; + } + + //for (i = 0; i < NR_POOLS; i++) { + while (1) { + gen_pool(); + count_pool(); + if (verbose) { + print_pool(); + print_count(); + } + stat_count(); + if (!(i % 100000ULL)) + print_stat(i); + i++; + } + print_stat(i); + print_distrib(); + + return 0; +}