From cd7664aea7022902e08d26ef91a1a88421fde3c6 Mon Sep 17 00:00:00 2001 From: John Naylor Date: Mon, 23 Jan 2023 18:00:20 +0700 Subject: [PATCH v22 20/22] Do some rewriting and proofreading of comments In passing, change one ternary operator to if/else. --- src/include/lib/radixtree.h | 160 +++++++++++++++++++++--------------- 1 file changed, 92 insertions(+), 68 deletions(-) diff --git a/src/include/lib/radixtree.h b/src/include/lib/radixtree.h index 5927437034..7fcd212ea4 100644 --- a/src/include/lib/radixtree.h +++ b/src/include/lib/radixtree.h @@ -9,25 +9,38 @@ * types, each with a different numbers of elements. Depending on the number of * children, the appropriate node type is used. * - * There are some differences from the proposed implementation. For instance, - * there is not support for path compression and lazy path expansion. The radix - * tree supports fixed length of the key so we don't expect the tree level - * wouldn't be high. + * WIP: notes about traditional radix tree trading off span vs height... * - * Both the key and the value are 64-bit unsigned integer. The inner nodes and - * the leaf nodes have slightly different structure: for inner tree nodes, - * shift > 0, store the pointer to its child node as the value. The leaf nodes, - * shift == 0, have the 64-bit unsigned integer that is specified by the user as - * the value. The paper refers to this technique as "Multi-value leaves". We - * choose it to avoid an additional pointer traversal. It is the reason this code - * currently does not support variable-length keys. + * There are two kinds of nodes, inner nodes and leaves. Inner nodes + * map partial keys to child pointers. * - * XXX: Most functions in this file have two variants for inner nodes and leaf - * nodes, therefore there are duplication codes. While this sometimes makes the - * code maintenance tricky, this reduces branch prediction misses when judging - * whether the node is a inner node of a leaf node. 
+ * The ART paper mentions three ways to implement leaves: * - * XXX: the radix tree node never be shrunk. + * "- Single-value leaves: The values are stored using an addi- + * tional leaf node type which stores one value. + * - Multi-value leaves: The values are stored in one of four + * different leaf node types, which mirror the structure of + * inner nodes, but contain values instead of pointers. + * - Combined pointer/value slots: If values fit into point- + * ers, no separate node types are necessary. Instead, each + * pointer storage location in an inner node can either + * store a pointer or a value." + * + * We chose "multi-value leaves" to avoid the additional pointer traversal + * required by "single-value leaves". + * + * For simplicity, the key is assumed to be a 64-bit unsigned integer. The + * tree doesn't need to contain paths where the highest bytes of all keys + * are zero. That way, the tree's height adapts to the distribution of keys. + * + * TODO: In the future it might be worthwhile to offer configurability of + * leaf implementation for different use cases. Single-value leaves would + * give more flexibility in key type, including variable-length keys. + * + * There are some optimizations not yet implemented, particularly path + * compression and lazy path expansion. + * + * WIP: the radix tree nodes don't shrink. * * To generate a radix tree and associated functions for a use case several * macros have to be #define'ed before this file is included. Including @@ -42,11 +55,11 @@ * - RT_DEFINE - if defined function definitions are generated * - RT_SCOPE - in which scope (e.g. extern, static inline) do function * declarations reside - * - RT_SHMEM - if defined, the radix tree is created in the DSA area - * so that multiple processes can access it simultaneously. * - RT_VALUE_TYPE - the type of the value. 
* * Optional parameters: + * - RT_SHMEM - if defined, the radix tree is created in the DSA area + * so that multiple processes can access it simultaneously. * - RT_DEBUG - if defined add stats tracking and debugging functions * * Interface @@ -54,9 +67,6 @@ * * RT_CREATE - Create a new, empty radix tree * RT_FREE - Free the radix tree - * RT_ATTACH - Attach to the radix tree - * RT_DETACH - Detach from the radix tree - * RT_GET_HANDLE - Return the handle of the radix tree * RT_SEARCH - Search a key-value pair * RT_SET - Set a key-value pair * RT_BEGIN_ITERATE - Begin iterating through all key-value pairs @@ -64,11 +74,12 @@ * RT_END_ITER - End iteration * RT_MEMORY_USAGE - Get the memory usage * - * RT_CREATE() creates an empty radix tree in the given memory context - * and memory contexts for all kinds of radix tree node under the memory context. + * Interface for Shared Memory + * --------- * - * RT_ITERATE_NEXT() ensures returning key-value pairs in the ascending - * order of the key. + * RT_ATTACH - Attach to the radix tree + * RT_DETACH - Detach from the radix tree + * RT_GET_HANDLE - Return the handle of the radix tree * * Optional Interface * --------- @@ -360,13 +371,23 @@ typedef struct RT_NODE #define RT_INVALID_PTR_ALLOC NULL #endif +/* + * Inner nodes and leaf nodes have analogous structure. To distinguish + * them at runtime, we take advantage of the fact that the key chunk + * is accessed by shifting: Inner tree nodes (shift > 0) store the + * pointer to their child node in the slot. In leaf nodes (shift == 0), + * the slot contains the value corresponding to the key. + */ #define RT_NODE_IS_LEAF(n) (((RT_PTR_LOCAL) (n))->shift == 0) + #define RT_NODE_MUST_GROW(node) \ ((node)->base.n.count == (node)->base.n.fanout) -/* Base type of each node kinds for leaf and inner nodes */ -/* The base types must be a be able to accommodate the largest size -class for variable-sized node kinds*/ +/* + * Base type of each node kind for leaf and inner nodes. 
+ * The base types must be able to accommodate the largest size + * class for variable-sized node kinds. + */ typedef struct RT_NODE_BASE_3 { RT_NODE n; @@ -384,9 +405,9 @@ typedef struct RT_NODE_BASE_32 } RT_NODE_BASE_32; /* - * node-125 uses slot_idx array, an array of RT_NODE_MAX_SLOTS length, typically - * 256, to store indexes into a second array that contains up to 125 values (or - * child pointers in inner nodes). + * node-125 uses slot_idx array, an array of RT_NODE_MAX_SLOTS length + * to store indexes into a second array that contains the values (or + * child pointers). */ typedef struct RT_NODE_BASE_125 { @@ -407,15 +428,8 @@ typedef struct RT_NODE_BASE_256 /* * Inner and leaf nodes. * - * Theres are separate for two main reasons: - * - * 1) the value type might be different than something fitting into a pointer - * width type - * 2) Need to represent non-existing values in a key-type independent way. - * - * 1) is clearly worth being concerned about, but it's not clear 2) is as - * good. It might be better to just indicate non-existing entries the same way - * in inner nodes. + * These are separate because the value type might be different than + * something fitting into a pointer-width type. */ typedef struct RT_NODE_INNER_3 { @@ -466,8 +480,10 @@ typedef struct RT_NODE_LEAF_125 } RT_NODE_LEAF_125; /* - * node-256 is the largest node type. This node has RT_NODE_MAX_SLOTS length array + * node-256 is the largest node type. This node has an array * for directly storing values (or child pointers in inner nodes). + * Unlike other node kinds, its array size is by definition + * fixed. */ typedef struct RT_NODE_INNER_256 { @@ -481,7 +497,10 @@ typedef struct RT_NODE_LEAF_256 { RT_NODE_BASE_256 base; - /* isset is a bitmap to track which slot is in use */ + /* + * Unlike with inner256, zero is a valid value here, so we use a + * bitmap to track which slot is in use. 
+ */ bitmapword isset[BM_IDX(RT_NODE_MAX_SLOTS)]; /* Slots for 256 values */ @@ -570,7 +589,8 @@ static const RT_SIZE_CLASS_ELEM RT_SIZE_CLASS_INFO[] = { #define RT_RADIX_TREE_MAGIC 0x54A48167 #endif -/* A radix tree with nodes */ +/* Contains the actual tree and ancillary info */ +// WIP: this name is a bit strange typedef struct RT_RADIX_TREE_CONTROL { #ifdef RT_SHMEM @@ -588,7 +608,7 @@ typedef struct RT_RADIX_TREE_CONTROL #endif } RT_RADIX_TREE_CONTROL; -/* A radix tree with nodes */ +/* Entry point for allocating and accessing the tree */ typedef struct RT_RADIX_TREE { MemoryContext context; @@ -613,15 +633,15 @@ typedef struct RT_RADIX_TREE * RT_NODE_ITER struct is used to track the iteration within a node. * * RT_ITER is the struct for iteration of the radix tree, and uses RT_NODE_ITER - * in order to track the iteration of each level. During the iteration, we also + * in order to track the iteration of each level. During iteration, we also * construct the key whenever updating the node iteration information, e.g., when * advancing the current index within the node or when moving to the next node * at the same level. -+ * -+ * XXX: Currently we allow only one process to do iteration. Therefore, rt_node_iter -+ * has the local pointers to nodes, rather than RT_PTR_ALLOC. -+ * We need either a safeguard to disallow other processes to begin the iteration -+ * while one process is doing or to allow multiple processes to do the iteration. + * + * XXX: Currently we allow only one process to do iteration. Therefore, rt_node_iter + * has the local pointers to nodes, rather than RT_PTR_ALLOC. + * We need either a safeguard to disallow other processes to begin the iteration + * while one process is doing or to allow multiple processes to do the iteration. 
*/ typedef struct RT_NODE_ITER { @@ -637,7 +657,7 @@ typedef struct RT_ITER RT_NODE_ITER stack[RT_MAX_LEVEL]; int stack_len; - /* The key is being constructed during the iteration */ + /* The key is constructed during iteration */ uint64 key; } RT_ITER; @@ -672,8 +692,8 @@ RT_PTR_ALLOC_IS_VALID(RT_PTR_ALLOC ptr) } /* - * Return index of the first element in 'base' that equals 'key'. Return -1 - * if there is no such element. + * Return index of the first element in the node's chunk array that equals + * 'chunk'. Return -1 if there is no such element. */ static inline int RT_NODE_3_SEARCH_EQ(RT_NODE_BASE_3 *node, uint8 chunk) @@ -693,7 +713,8 @@ RT_NODE_3_SEARCH_EQ(RT_NODE_BASE_3 *node, uint8 chunk) } /* - * Return index of the chunk to insert into chunks in the given node. + * Return index of the chunk and slot arrays for inserting into the node, + * such that the chunk array remains ordered. */ static inline int RT_NODE_3_GET_INSERTPOS(RT_NODE_BASE_3 *node, uint8 chunk) @@ -744,7 +765,7 @@ RT_NODE_32_SEARCH_EQ(RT_NODE_BASE_32 *node, uint8 chunk) /* replicate the search key */ spread_chunk = vector8_broadcast(chunk); - /* compare to the 32 keys stored in the node */ + /* compare to all 32 keys stored in the node */ vector8_load(&haystack1, &node->chunks[0]); vector8_load(&haystack2, &node->chunks[sizeof(Vector8)]); cmp1 = vector8_eq(spread_chunk, haystack1); @@ -768,7 +789,7 @@ RT_NODE_32_SEARCH_EQ(RT_NODE_BASE_32 *node, uint8 chunk) } /* - * Return index of the node's chunk array to insert into, + * Return index of the chunk and slot arrays for inserting into the node, * such that the chunk array remains ordered. */ static inline int @@ -809,7 +830,7 @@ RT_NODE_32_GET_INSERTPOS(RT_NODE_BASE_32 *node, uint8 chunk) * This is a bit more complicated than RT_NODE_32_SEARCH_EQ(), because * no unsigned uint8 comparison instruction exists, at least for SSE2. So * we need to play some trickery using vector8_min() to effectively get - * <=. 
There'll never be any equal elements in the current uses, but that's + * <=. There'll never be any equal elements in current uses, but that's * what we get here... */ spread_chunk = vector8_broadcast(chunk); @@ -834,6 +855,7 @@ RT_NODE_32_GET_INSERTPOS(RT_NODE_BASE_32 *node, uint8 chunk) #endif } + /* * Functions to manipulate both chunks array and children/values array. * These are used for node-3 and node-32. @@ -993,18 +1015,19 @@ RT_NODE_LEAF_256_DELETE(RT_NODE_LEAF_256 *node, uint8 chunk) } /* - * Return the shift that is satisfied to store the given key. + * Return the largest shift that will allow storing the given key. */ static inline int RT_KEY_GET_SHIFT(uint64 key) { - return (key == 0) - ? 0 - : (pg_leftmost_one_pos64(key) / RT_NODE_SPAN) * RT_NODE_SPAN; + if (key == 0) + return 0; + else + return (pg_leftmost_one_pos64(key) / RT_NODE_SPAN) * RT_NODE_SPAN; } /* - * Return the max value stored in a node with the given shift. + * Return the max value that can be stored in the tree with the given shift. */ static uint64 RT_SHIFT_GET_MAX_VAL(int shift) @@ -1155,6 +1178,7 @@ RT_FREE_NODE(RT_RADIX_TREE *tree, RT_PTR_ALLOC allocnode) #endif } +/* Update the parent's pointer when growing a node */ static inline void RT_NODE_UPDATE_INNER(RT_PTR_LOCAL node, uint64 key, RT_PTR_ALLOC new_child) { @@ -1182,7 +1206,7 @@ RT_REPLACE_NODE(RT_RADIX_TREE *tree, RT_PTR_LOCAL parent, if (parent == old_child) { - /* Replace the root node with the new large node */ + /* Replace the root node with the new larger node */ tree->ctl->root = new_child; } else @@ -1192,8 +1216,8 @@ RT_REPLACE_NODE(RT_RADIX_TREE *tree, RT_PTR_LOCAL parent, } /* - * The radix tree doesn't sufficient height. Extend the radix tree so it can - * store the key. + * The radix tree doesn't have sufficient height. Extend the radix tree so + * it can store the key. 
*/ static void RT_EXTEND(RT_RADIX_TREE *tree, uint64 key) @@ -1337,7 +1361,7 @@ RT_NODE_INSERT_INNER(RT_RADIX_TREE *tree, RT_PTR_LOCAL parent, RT_PTR_ALLOC stor #undef RT_NODE_LEVEL_INNER } -/* Like, RT_NODE_INSERT_INNER, but for leaf nodes */ +/* Like RT_NODE_INSERT_INNER, but for leaf nodes */ static bool RT_NODE_INSERT_LEAF(RT_RADIX_TREE *tree, RT_PTR_LOCAL parent, RT_PTR_ALLOC stored_node, RT_PTR_LOCAL node, uint64 key, RT_VALUE_TYPE value) @@ -1377,7 +1401,7 @@ RT_CREATE(MemoryContext ctx) #else tree->ctl = (RT_RADIX_TREE_CONTROL *) palloc0(sizeof(RT_RADIX_TREE_CONTROL)); - /* Create the slab allocator for each size class */ + /* Create a slab context for each size class */ for (int i = 0; i < RT_SIZE_CLASS_COUNT; i++) { RT_SIZE_CLASS_ELEM size_class = RT_SIZE_CLASS_INFO[i]; @@ -1570,7 +1594,7 @@ RT_SET(RT_RADIX_TREE *tree, uint64 key, RT_VALUE_TYPE value) parent = RT_PTR_GET_LOCAL(tree, stored_child); shift = parent->shift; - /* Descend the tree until a leaf node */ + /* Descend the tree until we reach a leaf node */ while (shift >= 0) { RT_PTR_ALLOC new_child; -- 2.39.0