Source-Changes-HG archive
[src/trunk]: src/sys Import thmap -- a concurrent trie-hash map, combining th...
details: https://anonhg.NetBSD.org/src/rev/5c1cdd3b64ad
branches: trunk
changeset: 446741:5c1cdd3b64ad
user: rmind <rmind%NetBSD.org@localhost>
date: Sun Dec 16 14:06:56 2018 +0000
description:
Import thmap -- a concurrent trie-hash map, combining elements of
hashing and the radix trie. It supports lock-free lookups and concurrent
inserts/deletes. It is designed to be optimal as a general-purpose
*concurrent* associative array.
Upstream: https://github.com/rmind/thmap
Discussed on tech-kern@
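For orientation, a minimal usage sketch follows. It is not part of the commit;
the call names are assumed to follow the upstream thmap library, and the
authoritative kernel prototypes are in the new sys/sys/thmap.h, which is
truncated out of the diff below.

/*
 * Hypothetical usage sketch -- not part of this commit.  The prototypes
 * are assumed to match the upstream thmap library; see sys/sys/thmap.h
 * for the authoritative kernel interface.
 */
#include <sys/thmap.h>

void
thmap_usage_sketch(void)
{
	/* Create a map with the default kernel (kmem) allocator. */
	thmap_t *map = thmap_create(0, NULL, 0);
	static int val = 42;
	int *p;

	thmap_put(map, "key", 3, &val);		/* concurrent insert */
	p = thmap_get(map, "key", 3);		/* lock-free lookup */
	thmap_del(map, "key", 3);		/* concurrent delete */
	(void)p;
	thmap_destroy(map);
}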
diffstat:
sys/kern/files.kern | 3 +-
sys/kern/subr_thmap.c | 934 ++++++++++++++++++++++++++++
sys/rump/librump/rumpkern/Makefile.rumpkern | 3 +-
sys/sys/thmap.h | 62 +
4 files changed, 1000 insertions(+), 2 deletions(-)
diffs (truncated from 1038 to 300 lines):
diff -r 6db077adc6bd -r 5c1cdd3b64ad sys/kern/files.kern
--- a/sys/kern/files.kern Sun Dec 16 14:04:14 2018 +0000
+++ b/sys/kern/files.kern Sun Dec 16 14:06:56 2018 +0000
@@ -1,4 +1,4 @@
-# $NetBSD: files.kern,v 1.27 2018/12/03 00:11:02 christos Exp $
+# $NetBSD: files.kern,v 1.28 2018/12/16 14:06:56 rmind Exp $
#
# kernel sources
@@ -142,6 +142,7 @@
file kern/subr_specificdata.c kern
file kern/subr_tftproot.c tftproot
file kern/subr_time.c kern
+file kern/subr_thmap.c kern
file kern/subr_userconf.c userconf
file kern/subr_vmem.c kern
file kern/subr_workqueue.c kern
diff -r 6db077adc6bd -r 5c1cdd3b64ad sys/kern/subr_thmap.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/sys/kern/subr_thmap.c Sun Dec 16 14:06:56 2018 +0000
@@ -0,0 +1,934 @@
+/*-
+ * Copyright (c) 2018 Mindaugas Rasiukevicius <rmind at noxt eu>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Upstream: https://github.com/rmind/thmap/
+ */
+
+/*
+ * Concurrent trie-hash map.
+ *
+ * The data structure is conceptually a radix trie on hashed keys.
+ * Keys are hashed using a 32-bit function. The root level is a special
+ * case: it is managed using the compare-and-swap (CAS) atomic operation
+ * and has a fanout of 64. The subsequent levels are constructed using
+ * intermediate nodes with a fanout of 16 (using 4 bits). As more levels
+ * are created, more blocks of the 32-bit hash value might be generated
+ * by incrementing the seed parameter of the hash function.
+ *
+ * Concurrency
+ *
+ * - READERS: Descending is simply walking through the slot values of
+ * the intermediate nodes. It is lock-free as there is no intermediate
+ * state: the slot is either empty or has a pointer to the child node.
+ * The main assumptions here are the following:
+ *
+ * i) modifications must preserve consistency with respect to the
+ * readers, i.e. the readers can only see valid node values;
+ *
+ * ii) any invalid view must "fail" the reads, e.g. by making them
+ * re-try from the root; this is the case for deletions and is achieved
+ * using the NODE_DELETED flag.
+ *
+ * iii) node destruction must be synchronised with the readers,
+ * e.g. by using epoch-based reclamation or other techniques.
+ *
+ * - WRITERS AND LOCKING: Each intermediate node has a spin-lock (which
+ * is implemented using the NODE_LOCKED bit) -- it provides mutual
+ * exclusion amongst concurrent writers. The lock order for the nodes
+ * is "bottom-up", i.e. they are locked as we ascend the trie. A key
+ * constraint here is that the parent pointer never changes.
+ *
+ * - DELETES: In addition to the writers' locking, the deletion keeps the
+ * intermediate nodes in a valid state and sets the NODE_DELETED flag,
+ * to indicate that the readers must re-start the walk from the root.
+ * As the levels are collapsed, NODE_DELETED gets propagated up-tree.
+ * The leaf nodes just stay as-is until they are reclaimed.
+ *
+ * - ROOT LEVEL: The root level is a special case, as it is implemented
+ * as an array (rather than an intermediate node). The root-level slot can
+ * only be set using CAS and it can only be set to a valid intermediate
+ * node. The root-level slot can only be cleared when the node it points
+ * at becomes empty, is locked and marked as NODE_DELETED (this causes
+ * the insert/delete operations to re-try until the slot is set to NULL).
+ *
+ * References:
+ *
+ * W. Litwin, 1981, Trie Hashing.
+ * Proceedings of the 1981 ACM SIGMOD, p. 19-29
+ * https://dl.acm.org/citation.cfm?id=582322
+ *
+ * P. L. Lehman and S. B. Yao.
+ * Efficient locking for concurrent operations on B-trees.
+ * ACM TODS, 6(4):650-670, 1981
+ * https://www.csd.uoc.gr/~hy460/pdf/p650-lehman.pdf
+ */
+
+#ifdef _KERNEL
+#include <sys/cdefs.h>
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/thmap.h>
+#include <sys/kmem.h>
+#include <sys/lock.h>
+#include <sys/atomic.h>
+#include <sys/hash.h>
+#else
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <inttypes.h>
+#include <string.h>
+#include <limits.h>
+
+#include "thmap.h"
+#include "utils.h"
+#endif
+
+/*
+ * NetBSD kernel wrappers
+ */
+#ifdef _KERNEL
+#define ASSERT KASSERT
+#define DEBUG 1
+#define atomic_thread_fence(x) x
+#define memory_order_stores membar_producer()
+#define memory_order_loads membar_consumer()
+#define atomic_cas_32_p(p, e, n) (atomic_cas_32((p), (e), (n)) == (e))
+#define atomic_cas_ptr_p(p, e, n) \
+ (atomic_cas_ptr((p), (void *)(e), (void *)(n)) == (e))
+#define atomic_exchange atomic_swap_ptr
+#define murmurhash3 murmurhash2
+#endif
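One non-obvious trick in the wrapper block above: atomic_thread_fence(x)
expands to its argument, so a C11-style fence in the portable code turns into
the corresponding NetBSD membar. An expansion trace (illustration only, not
part of the diff):

/*
 * Illustration only -- how the wrappers above expand in the kernel build:
 *
 *   atomic_thread_fence(memory_order_stores)
 *       -> memory_order_stores          (atomic_thread_fence(x) yields x)
 *       -> membar_producer()
 *
 *   atomic_thread_fence(memory_order_loads)
 *       -> memory_order_loads
 *       -> membar_consumer()
 */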
+
+/*
+ * The root level fanout is 64 (indexed by the last 6 bits of the hash
+ * value XORed with the length). Each subsequent level, represented by
+ * intermediate nodes, has a fanout of 16 (using 4 bits).
+ *
+ * The hash function produces 32-bit values.
+ */
+
+#define HASHVAL_BITS (32)
+#define HASHVAL_MOD (HASHVAL_BITS - 1)
+#define HASHVAL_SHIFT (5)
+
+#define ROOT_BITS (6)
+#define ROOT_SIZE (1 << ROOT_BITS)
+#define ROOT_MASK (ROOT_SIZE - 1)
+#define ROOT_MSBITS (HASHVAL_BITS - ROOT_BITS)
+
+#define LEVEL_BITS (4)
+#define LEVEL_SIZE (1 << LEVEL_BITS)
+#define LEVEL_MASK (LEVEL_SIZE - 1)
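To make the arithmetic concrete, here is a standalone sketch of how these
constants carve up a hash value (illustrative only, not the code from this
file; the exact root-slot computation, including the XOR with the key length,
is as described in the comment above):

/*
 * Illustrative sketch -- not part of this diff.  Six bits index the
 * 64-slot root array; each level below the root consumes a 4-bit block
 * of the 32-bit hash, so one hash value yields at most 32 / 4 = 8 level
 * blocks.  Once exhausted, the real code rehashes the key with an
 * incremented seed to obtain further blocks.
 */
#include <stdint.h>

static unsigned
level_block_sketch(uint32_t hashval, unsigned level)
{
	/* The 4-bit block for the given level (LEVEL_BITS = 4). */
	return (hashval >> (level * 4)) & 0xf;
}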
+
+/*
+ * Instead of raw pointers, we use offsets from the base address.
+ * This accommodates the use of this data structure in shared memory,
+ * where mappings can be in different address spaces.
+ *
+ * The pointers must be aligned, since pointer tagging is used to
+ * differentiate the intermediate nodes from leaves. We reserve the
+ * least significant bit.
+ */
+typedef uintptr_t thmap_ptr_t;
+
+#define THMAP_NULL ((thmap_ptr_t)0)
+
+#define THMAP_LEAF_BIT (0x1)
+
+#define THMAP_ALIGNED_P(p) (((uintptr_t)(p) & 3) == 0)
+#define THMAP_ALIGN(p) ((uintptr_t)(p) & ~(uintptr_t)3)
+#define THMAP_INODE_P(p) (((uintptr_t)(p) & THMAP_LEAF_BIT) == 0)
+
+#define THMAP_GETPTR(th, p) ((void *)((th)->baseptr + (uintptr_t)(p)))
+#define THMAP_GETOFF(th, p) ((thmap_ptr_t)((uintptr_t)(p) - (th)->baseptr))
+#define THMAP_NODE(th, p) THMAP_GETPTR(th, THMAP_ALIGN(p))
+
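To make the offset-plus-tag scheme concrete, here is a self-contained sketch
(illustrative only, not part of this diff) of storing and recovering a tagged
leaf offset:

/*
 * Illustrative sketch -- not part of this diff.  Mirrors THMAP_GETOFF,
 * THMAP_LEAF_BIT and THMAP_NODE above: slots hold base-relative offsets,
 * and the least significant bit marks a leaf.
 */
#include <assert.h>
#include <stdint.h>

static uintptr_t
tag_leaf_sketch(uintptr_t baseptr, const void *leaf)
{
	uintptr_t off = (uintptr_t)leaf - baseptr;

	assert((off & 3) == 0);		/* allocations must be aligned */
	return off | 0x1;		/* mark this slot value as a leaf */
}

static void *
untag_sketch(uintptr_t baseptr, uintptr_t slotval)
{
	/* Strip the tag bits and turn the offset back into a pointer. */
	return (void *)(baseptr + (slotval & ~(uintptr_t)3));
}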
+/*
+ * State field.
+ */
+
+#define NODE_LOCKED (1U << 31) // lock (writers)
+#define NODE_DELETED (1U << 30) // node deleted
+#define NODE_COUNT(s) ((s) & 0x3fffffff) // slot count mask
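Purely as an illustration (not part of this diff), the packed state word
decomposes as follows:

/*
 * Illustrative sketch -- not part of this diff.  The 32-bit state word
 * packs the writer lock bit (bit 31), the deleted flag (bit 30) and the
 * used-slot count (the low 30 bits).
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static void
print_state_sketch(uint32_t state)
{
	const bool locked = (state & (1U << 31)) != 0;	/* NODE_LOCKED */
	const bool deleted = (state & (1U << 30)) != 0;	/* NODE_DELETED */
	const unsigned count = state & 0x3fffffffU;	/* NODE_COUNT() */

	printf("locked=%d deleted=%d slots=%u\n", locked, deleted, count);
}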
+
+/*
+ * There are two types of nodes:
+ * - Intermediate nodes -- arrays pointing to another level or a leaf;
+ * - Leaves, which store a key-value pair.
+ */
+
+typedef struct {
+ uint32_t state;
+ thmap_ptr_t parent;
+ thmap_ptr_t slots[LEVEL_SIZE];
+} thmap_inode_t;
+
+#define THMAP_INODE_LEN sizeof(thmap_inode_t)
+
+typedef struct {
+ thmap_ptr_t key;
+ size_t len;
+ void * val;
+} thmap_leaf_t;
+
+typedef struct {
+ unsigned rslot; // root-level slot index
+ unsigned level; // current level in the tree
+ unsigned hashidx; // current hash index (block of bits)
+ uint32_t hashval; // current hash value
+} thmap_query_t;
+
+typedef struct {
+ uintptr_t addr;
+ size_t len;
+ void * next;
+} thmap_gc_t;
+
+#define THMAP_ROOT_LEN (sizeof(thmap_ptr_t) * ROOT_SIZE)
+
+struct thmap {
+ uintptr_t baseptr;
+ thmap_ptr_t * root;
+ unsigned flags;
+ const thmap_ops_t *ops;
+ thmap_gc_t * gc_list;
+};
+
+static void stage_mem_gc(thmap_t *, uintptr_t, size_t);
+
+/*
+ * A few low-level helper routines.
+ */
+
+static uintptr_t
+alloc_wrapper(size_t len)
+{
+ return (uintptr_t)kmem_intr_alloc(len, KM_SLEEP);
+}
+
+static void
+free_wrapper(uintptr_t addr, size_t len)
+{
+ kmem_intr_free((void *)addr, len);
+}
+
+static const thmap_ops_t thmap_default_ops = {
+ .alloc = alloc_wrapper,
+ .free = free_wrapper
+};
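The ops table is what allows the map to use a non-default allocator, e.g. one
backed by shared memory. A hedged sketch of supplying custom ops follows (the
arena functions are hypothetical; only the thmap_ops_t field layout is taken
from thmap_default_ops above):

/*
 * Illustrative sketch -- not part of this diff.  A consumer may pass its
 * own thmap_ops_t; the (hypothetical) arena allocator stands in for a
 * shared-memory-backed one.
 */
static uintptr_t
arena_alloc_sketch(size_t len)
{
	/* ... return the address of 'len' bytes carved from an arena ... */
	(void)len;
	return 0;	/* placeholder */
}

static void
arena_free_sketch(uintptr_t addr, size_t len)
{
	/* ... return the block at 'addr' to the arena ... */
	(void)addr;
	(void)len;
}

static const thmap_ops_t arena_ops_sketch = {
	.alloc = arena_alloc_sketch,
	.free = arena_free_sketch
};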
+
+/*
+ * NODE LOCKING.
+ */
+
+#ifdef DEBUG
+static inline bool
+node_locked_p(const thmap_inode_t *node)
+{
+ return (node->state & NODE_LOCKED) != 0;
+}
+#endif
+
+static void
+lock_node(thmap_inode_t *node)
+{
+ unsigned bcount = SPINLOCK_BACKOFF_MIN;
+ uint32_t s;
+again:
+ s = node->state;
+ if (s & NODE_LOCKED) {
+ SPINLOCK_BACKOFF(bcount);
+ goto again;
+ }
+ /*
+ * CAS will issue a full memory fence for us.
+ *
+ * WARNING: for optimisation purposes, callers rely on us
+ * issuing a load and store fence.
+ */
+ if (!atomic_cas_32_p(&node->state, s, s | NODE_LOCKED)) {
+ bcount = SPINLOCK_BACKOFF_MIN;
+ goto again;
+ }
+}
+
+static void