Here are benchmarks on batching of synchronize_rcu(), and it leads to
very interesting scalability improvement and speedups, e.g., on a
24-core AMD, with a write-heavy scenario (4 readers threads, 20 updater
threads, each updater using synchronize_rcu()):
* Serialized grace periods:
./test_urcu 4 20 20
SUMMARY ./test_urcu testdur 20 nr_readers 4
rdur 0 wdur 0 nr_writers 20 wdelay 0
nr_reads
714598368 nr_writes
5032889 nr_ops
719631257
* Batched grace periods:
./test_urcu 4 20 20
SUMMARY ./test_urcu testdur 20 nr_readers 4
rdur 0 wdur 0 nr_writers 20 wdelay 0
nr_reads
611848168 nr_writes
9877965 nr_ops
621726133
For a
9877965/
5032889 = 1.96 speedup for 20 updaters.
Of course, we can see that readers have slowed down, probably due to
increased update traffic, given there is no change to the read-side code
whatsoever.
Now let's see the penality of managing the stack for single-updater.
With 4 readers, single updater:
* Serialized grace periods :
./test_urcu 4 1 20
SUMMARY ./test_urcu testdur 20 nr_readers 4
rdur 0 wdur 0 nr_writers 1 wdelay 0
nr_reads
241959144 nr_writes
11146189 nr_ops
253105333
SUMMARY ./test_urcu testdur 20 nr_readers 4
rdur 0 wdur 0 nr_writers 1 wdelay 0
nr_reads
257131080 nr_writes
12310537 nr_ops
269441617
SUMMARY ./test_urcu testdur 20 nr_readers 4
rdur 0 wdur 0 nr_writers 1 wdelay 0
nr_reads
259973359 nr_writes
12203025 nr_ops
272176384
* Batched grace periods :
SUMMARY ./test_urcu testdur 20 nr_readers 4
rdur 0 wdur 0 nr_writers 1 wdelay 0
nr_reads
298926555 nr_writes
14018748 nr_ops
312945303
SUMMARY ./test_urcu testdur 20 nr_readers 4
rdur 0 wdur 0 nr_writers 1 wdelay 0
nr_reads
272411290 nr_writes
12832166 nr_ops
285243456
SUMMARY ./test_urcu testdur 20 nr_readers 4
rdur 0 wdur 0 nr_writers 1 wdelay 0
nr_reads
267511858 nr_writes
12822026 nr_ops
280333884
Serialized vs batched seems to similar, batched possibly even slightly
faster, but this is probably caused by NUMA affinity.
More benchmark results:
* Serialized synchronize_rcu() -- test_urcu (mb)
./test_urcu 4 1 20
SUMMARY ./test_urcu testdur 20 nr_readers 4 rdur 0 wdur 0 nr_writers 1 wdelay 0 nr_reads
222512859 nr_writes
10723654 nr_ops
233236513
./test_urcu 4 20 20
SUMMARY ./test_urcu testdur 20 nr_readers 4 rdur 0 wdur 0 nr_writers 20 wdelay 0 nr_reads
722096653 nr_writes
5012429 nr_ops
727109082
./test_urcu 12 12 20
SUMMARY ./test_urcu testdur 20 nr_readers 12 rdur 0 wdur 0 nr_writers 12 wdelay 0 nr_reads
1822868768 nr_writes
2300787 nr_ops
1825169555
./test_urcu 16 8 20
SUMMARY ./test_urcu testdur 20 nr_readers 16 rdur 0 wdur 0 nr_writers 8 wdelay 0 nr_reads
2355908375 nr_writes
1604850 nr_ops
2357513225
./test_urcu 20 4 20
SUMMARY ./test_urcu testdur 20 nr_readers 20 rdur 0 wdur 0 nr_writers 4 wdelay 0 nr_reads
3003457459 nr_writes
1074828 nr_ops
3004532287
./test_urcu 20 3 20
SUMMARY ./test_urcu testdur 20 nr_readers 20 rdur 0 wdur 0 nr_writers 3 wdelay 0 nr_reads
2956972543 nr_writes
1036556 nr_ops
2958009099
./test_urcu 20 2 20
SUMMARY ./test_urcu testdur 20 nr_readers 20 rdur 0 wdur 0 nr_writers 2 wdelay 0 nr_reads
2890178860 nr_writes
1030095 nr_ops
2891208955
./test_urcu 20 1 20
SUMMARY ./test_urcu testdur 20 nr_readers 20 rdur 0 wdur 0 nr_writers 1 wdelay 0 nr_reads
3017482290 nr_writes 783420 nr_ops
3018265710
* Batched synchronize_rcu() -- test_urcu (mb)
./test_urcu 4 1 20
SUMMARY ./test_urcu testdur 20 nr_readers 4 rdur 0 wdur 0 nr_writers 1 wdelay 0 nr_reads
271476751 nr_writes
12858885 nr_ops
284335636
./test_urcu 4 20 20
SUMMARY ./test_urcu testdur 20 nr_readers 4 rdur 0 wdur 0 nr_writers 20 wdelay 0 nr_reads
608488583 nr_writes
10080610 nr_ops
618569193
./test_urcu 12 12 20
SUMMARY ./test_urcu testdur 20 nr_readers 12 rdur 0 wdur 0 nr_writers 12 wdelay 0 nr_reads
1260044362 nr_writes
7957711 nr_ops
1268002073
./test_urcu 16 8 20
SUMMARY ./test_urcu testdur 20 nr_readers 16 rdur 0 wdur 0 nr_writers 8 wdelay 0 nr_reads
2048890674 nr_writes
5440985 nr_ops
2054331659
./test_urcu 20 4 20
SUMMARY ./test_urcu testdur 20 nr_readers 20 rdur 0 wdur 0 nr_writers 4 wdelay 0 nr_reads
2819267217 nr_writes
3093008 nr_ops
2822360225
./test_urcu 20 3 20
SUMMARY ./test_urcu testdur 20 nr_readers 20 rdur 0 wdur 0 nr_writers 3 wdelay 0 nr_reads
3067795320 nr_writes
2817760 nr_ops
3070613080
./test_urcu 20 2 20
SUMMARY ./test_urcu testdur 20 nr_readers 20 rdur 0 wdur 0 nr_writers 2 wdelay 0 nr_reads
3116770603 nr_writes
2404242 nr_ops
3119174845
./test_urcu 20 1 20
SUMMARY ./test_urcu testdur 20 nr_readers 20 rdur 0 wdur 0 nr_writers 1 wdelay 0 nr_reads
2238534130 nr_writes
3737588 nr_ops
2242271718
* Serialized synchronize_rcu() -- test_urcu_signal
./test_urcu_signal 4 1 20
SUMMARY ./test_urcu_signal testdur 20 nr_readers 4 rdur 0 wdur 0 nr_writers 1 wdelay 0 nr_reads
16063309841 nr_writes 9217 nr_ops
16063319058
./test_urcu_signal 4 20 20
SUMMARY ./test_urcu_signal testdur 20 nr_readers 4 rdur 0 wdur 0 nr_writers 20 wdelay 0 nr_reads
16065183739 nr_writes 9182 nr_ops
16065192921
./test_urcu_signal 12 12 20
SUMMARY ./test_urcu_signal testdur 20 nr_readers 12 rdur 0 wdur 0 nr_writers 12 wdelay 0 nr_reads
48028512672 nr_writes 8890 nr_ops
48028521562
./test_urcu_signal 16 8 20
SUMMARY ./test_urcu_signal testdur 20 nr_readers 16 rdur 0 wdur 0 nr_writers 8 wdelay 0 nr_reads
64001589198 nr_writes 8756 nr_ops
64001597954
./test_urcu_signal 20 4 20
SUMMARY ./test_urcu_signal testdur 20 nr_readers 20 rdur 0 wdur 0 nr_writers 4 wdelay 0 nr_reads
79907434070 nr_writes 9068 nr_ops
79907443138
./test_urcu_signal 20 3 20
SUMMARY ./test_urcu_signal testdur 20 nr_readers 20 rdur 0 wdur 0 nr_writers 3 wdelay 0 nr_reads
79987250839 nr_writes 8589 nr_ops
79987259428
./test_urcu_signal 20 2 20
SUMMARY ./test_urcu_signal testdur 20 nr_readers 20 rdur 0 wdur 0 nr_writers 2 wdelay 0 nr_reads
79749947176 nr_writes 8596 nr_ops
79749955772
./test_urcu_signal 20 1 20
SUMMARY ./test_urcu_signal testdur 20 nr_readers 20 rdur 0 wdur 0 nr_writers 1 wdelay 0 nr_reads
79751023090 nr_writes 8624 nr_ops
79751031714
* Batched synchronize_rcu() -- test_urcu_signal
./test_urcu_signal 4 1 20
SUMMARY ./test_urcu_signal testdur 20 nr_readers 4 rdur 0 wdur 0 nr_writers 1 wdelay 0 nr_reads
15739087241 nr_writes 9218 nr_ops
15739096459
./test_urcu_signal 4 20 20
SUMMARY ./test_urcu_signal testdur 20 nr_readers 4 rdur 0 wdur 0 nr_writers 20 wdelay 0 nr_reads
15662135806 nr_writes 94833 nr_ops
15662230639
./test_urcu_signal 12 12 20
SUMMARY ./test_urcu_signal testdur 20 nr_readers 12 rdur 0 wdur 0 nr_writers 12 wdelay 0 nr_reads
46634363289 nr_writes 56903 nr_ops
46634420192
./test_urcu_signal 16 8 20
SUMMARY ./test_urcu_signal testdur 20 nr_readers 16 rdur 0 wdur 0 nr_writers 8 wdelay 0 nr_reads
62263951759 nr_writes 39058 nr_ops
62263990817
./test_urcu_signal 20 4 20
SUMMARY ./test_urcu_signal testdur 20 nr_readers 20 rdur 0 wdur 0 nr_writers 4 wdelay 0 nr_reads
77799768623 nr_writes 21065 nr_ops
77799789688
./test_urcu_signal 20 3 20
SUMMARY ./test_urcu_signal testdur 20 nr_readers 20 rdur 0 wdur 0 nr_writers 3 wdelay 0 nr_reads
76408008440 nr_writes 17026 nr_ops
76408025466
./test_urcu_signal 20 2 20
SUMMARY ./test_urcu_signal testdur 20 nr_readers 20 rdur 0 wdur 0 nr_writers 2 wdelay 0 nr_reads
77868927424 nr_writes 12630 nr_ops
77868940054
./test_urcu_signal 20 1 20
SUMMARY ./test_urcu_signal testdur 20 nr_readers 20 rdur 0 wdur 0 nr_writers 1 wdelay 0 nr_reads
77293186844 nr_writes 8680 nr_ops
77293195524
CC: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
CC: Lai Jiangshan <laijs@cn.fujitsu.com>
CC: Alan Stern <stern@rowland.harvard.edu>
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
#include "urcu/tls-compat.h"
#include "urcu-die.h"
+#include "urcu-wait.h"
/* Do not #define _LGPL_SOURCE to ensure we can emit the wrapper symbols */
#undef _LGPL_SOURCE
static CDS_LIST_HEAD(registry);
+/*
+ * Queue keeping threads awaiting to wait for a grace period. Contains
+ * struct gp_waiters_thread objects.
+ */
+static DEFINE_URCU_WAIT_QUEUE(gp_waiters);
+
static void mutex_lock(pthread_mutex_t *mutex)
{
int ret;
{
CDS_LIST_HEAD(cur_snap_readers);
CDS_LIST_HEAD(qsreaders);
+ DEFINE_URCU_WAIT_NODE(wait, URCU_WAIT_WAITING);
+ struct urcu_waiters waiters;
+
+ /*
+ * Add ourself to gp_waiters queue of threads awaiting to wait
+ * for a grace period. Proceed to perform the grace period only
+ * if we are the first thread added into the queue.
+ * The implicit memory barrier before urcu_wait_add()
+ * orders prior memory accesses of threads put into the wait
+ * queue before their insertion into the wait queue.
+ */
+ if (urcu_wait_add(&gp_waiters, &wait) != 0) {
+ /* Not first in queue: will be awakened by another thread. */
+ urcu_adaptative_busy_wait(&wait);
+ /* Order following memory accesses after grace period. */
+ cmm_smp_mb();
+ return;
+ }
+ /* We won't need to wake ourself up */
+ urcu_wait_set_state(&wait, URCU_WAIT_RUNNING);
mutex_lock(&rcu_gp_lock);
+ /*
+ * Move all waiters into our local queue.
+ */
+ urcu_move_waiters(&waiters, &gp_waiters);
+
if (cds_list_empty(®istry))
goto out;
smp_mb_master(RCU_MB_GROUP);
out:
mutex_unlock(&rcu_gp_lock);
+
+ /*
+ * Wakeup waiters only after we have completed the grace period
+ * and have ensured the memory barriers at the end of the grace
+ * period have been issued.
+ */
+ urcu_wake_all_waiters(&waiters);
}
/*