From b85513ab0f8654df36aa913f4b29b626e652943f Mon Sep 17 00:00:00 2001 From: Masahiko Sawada Date: Thu, 27 Oct 2022 14:02:00 +0900 Subject: [PATCH v10 6/7] PoC: DSA support for radix tree. --- .../bench_radix_tree--1.0.sql | 2 + contrib/bench_radix_tree/bench_radix_tree.c | 12 +- src/backend/lib/radixtree.c | 483 +++++++++++++----- src/backend/utils/mmgr/dsa.c | 12 + src/include/lib/radixtree.h | 8 +- src/include/utils/dsa.h | 1 + .../expected/test_radixtree.out | 17 + .../modules/test_radixtree/test_radixtree.c | 100 ++-- 8 files changed, 482 insertions(+), 153 deletions(-) diff --git a/contrib/bench_radix_tree/bench_radix_tree--1.0.sql b/contrib/bench_radix_tree/bench_radix_tree--1.0.sql index e0205b364e..b5f731f329 100644 --- a/contrib/bench_radix_tree/bench_radix_tree--1.0.sql +++ b/contrib/bench_radix_tree/bench_radix_tree--1.0.sql @@ -7,6 +7,7 @@ create function bench_shuffle_search( minblk int4, maxblk int4, random_block bool DEFAULT false, +shared bool DEFAULT false, OUT nkeys int8, OUT rt_mem_allocated int8, OUT array_mem_allocated int8, @@ -23,6 +24,7 @@ create function bench_seq_search( minblk int4, maxblk int4, random_block bool DEFAULT false, +shared bool DEFAULT false, OUT nkeys int8, OUT rt_mem_allocated int8, OUT array_mem_allocated int8, diff --git a/contrib/bench_radix_tree/bench_radix_tree.c b/contrib/bench_radix_tree/bench_radix_tree.c index 70ca989118..225a1b3bb1 100644 --- a/contrib/bench_radix_tree/bench_radix_tree.c +++ b/contrib/bench_radix_tree/bench_radix_tree.c @@ -15,6 +15,7 @@ #include "lib/radixtree.h" #include #include "miscadmin.h" +#include "storage/lwlock.h" #include "utils/timestamp.h" PG_MODULE_MAGIC; @@ -150,7 +151,9 @@ bench_search(FunctionCallInfo fcinfo, bool shuffle) BlockNumber minblk = PG_GETARG_INT32(0); BlockNumber maxblk = PG_GETARG_INT32(1); bool random_block = PG_GETARG_BOOL(2); + bool shared = PG_GETARG_BOOL(3); radix_tree *rt = NULL; + dsa_area *dsa = NULL; uint64 ntids; uint64 key; uint64 last_key = PG_UINT64_MAX; 
@@ -172,8 +175,11 @@ bench_search(FunctionCallInfo fcinfo, bool shuffle) tids = generate_tids(minblk, maxblk, TIDS_PER_BLOCK_FOR_LOAD, &ntids, random_block); + if (shared) + dsa = dsa_create(LWLockNewTrancheId()); + /* measure the load time of the radix tree */ - rt = rt_create(CurrentMemoryContext); + rt = rt_create(CurrentMemoryContext, dsa); start_time = GetCurrentTimestamp(); for (int i = 0; i < ntids; i++) { @@ -324,7 +330,7 @@ bench_load_random_int(PG_FUNCTION_ARGS) elog(ERROR, "return type must be a row type"); pg_prng_seed(&state, 0); - rt = rt_create(CurrentMemoryContext); + rt = rt_create(CurrentMemoryContext, NULL); start_time = GetCurrentTimestamp(); for (uint64 i = 0; i < cnt; i++) @@ -450,7 +456,7 @@ bench_fixed_height_search(PG_FUNCTION_ARGS) if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) elog(ERROR, "return type must be a row type"); - rt = rt_create(CurrentMemoryContext); + rt = rt_create(CurrentMemoryContext, NULL); start_time = GetCurrentTimestamp(); diff --git a/src/backend/lib/radixtree.c b/src/backend/lib/radixtree.c index 08d580a899..1f2bb95e24 100644 --- a/src/backend/lib/radixtree.c +++ b/src/backend/lib/radixtree.c @@ -22,6 +22,15 @@ * choose it to avoid an additional pointer traversal. It is the reason this code * currently does not support variable-length keys. * + * If a DSA area is passed to rt_create(), the radix tree is created in that + * area so that multiple processes can access it simultaneously. The process + * that created the shared radix tree must pass both the DSA area given to + * rt_create() and the handle of the radix tree, fetched by rt_get_handle(), + * to the other processes so that they can attach to it with rt_attach(). + * + * XXX: The shared radix tree is still a PoC, as it has no locking support. + * Also, it supports only single-process iteration.
+ * * XXX: Most functions in this file have two variants for inner nodes and leaf * nodes, therefore there are duplication codes. While this sometimes makes the * code maintenance tricky, this reduces branch prediction misses when judging @@ -34,6 +43,9 @@ * * rt_create - Create a new, empty radix tree * rt_free - Free the radix tree + * rt_attach - Attach to the radix tree + * rt_detach - Detach from the radix tree + * rt_get_handle - Return the handle of the radix tree * rt_search - Search a key-value pair * rt_set - Set a key-value pair * rt_delete - Delete a key-value pair @@ -64,6 +76,7 @@ #include "miscadmin.h" #include "port/pg_bitutils.h" #include "port/pg_lfind.h" +#include "utils/dsa.h" #include "utils/memutils.h" /* The number of bits encoded in one tree level */ @@ -384,6 +397,11 @@ static rt_node_kind_info_elem rt_node_kind_info[RT_NODE_KIND_COUNT] = { * construct the key whenever updating the node iteration information, e.g., when * advancing the current index within the node or when moving to the next node * at the same level. + * + * XXX: Currently only one process may iterate at a time. Therefore, rt_node_iter + * holds local pointers to nodes rather than rt_node_ptr. + * We need either a safeguard that prevents other processes from starting an + * iteration while one is in progress, or support for concurrent iteration.
*/ typedef struct rt_node_iter { @@ -403,23 +421,43 @@ struct rt_iter uint64 key; }; -/* A radix tree with nodes */ -struct radix_tree +/* A magic value used to identify our radix tree */ +#define RADIXTREE_MAGIC 0x54A48167 + +/* Control information for an radix tree */ +typedef struct radix_tree_control { - MemoryContext context; + rt_handle handle; + uint32 magic; + /* Root node */ rt_pointer root; - uint64 max_val; - uint64 num_keys; - MemoryContextData *inner_slabs[RT_NODE_KIND_COUNT]; - MemoryContextData *leaf_slabs[RT_NODE_KIND_COUNT]; + pg_atomic_uint64 max_val; + pg_atomic_uint64 num_keys; /* statistics */ #ifdef RT_DEBUG int32 cnt[RT_NODE_KIND_COUNT]; #endif +} radix_tree_control; + +/* A radix tree with nodes */ +struct radix_tree +{ + MemoryContext context; + + /* control object in either backend-local memory or DSA */ + radix_tree_control *ctl; + + /* used only when the radix tree is shared */ + dsa_area *area; + + /* used only when the radix tree is private */ + MemoryContextData *inner_slabs[RT_NODE_KIND_COUNT]; + MemoryContextData *leaf_slabs[RT_NODE_KIND_COUNT]; }; +#define RadixTreeIsShared(rt) ((rt)->area != NULL) static void rt_new_root(radix_tree *tree, uint64 key); static rt_node_ptr rt_alloc_node(radix_tree *tree, int kind, uint8 shift, uint8 chunk, @@ -446,24 +484,31 @@ static void rt_verify_node(rt_node_ptr node); /* Decode and encode function of rt_pointer */ static inline rt_node * -rt_pointer_decode(rt_pointer encoded) +rt_pointer_decode(radix_tree *tree, rt_pointer encoded) { - return (rt_node *) RTPointerUnTagKind(encoded); + encoded = RTPointerUnTagKind(encoded); + + if (RadixTreeIsShared(tree)) + return (rt_node *) dsa_get_address(tree->area, encoded); + else + return (rt_node *) encoded; } static inline rt_pointer -rt_pointer_encode(rt_node *decoded, uint8 kind) +rt_pointer_encode(rt_pointer decoded, uint8 kind) { + Assert((decoded & RT_POINTER_KIND_MASK) == 0); + return (rt_pointer) RTPointerTagKind(decoded, kind); } /* Return a 
rt_pointer created from the given encoded pointer */ static inline rt_node_ptr -rt_node_ptr_encoded(rt_pointer encoded) +rt_node_ptr_encoded(radix_tree *tree, rt_pointer encoded) { return (rt_node_ptr) { .encoded = encoded, - .decoded = rt_pointer_decode(encoded) + .decoded = rt_pointer_decode(tree, encoded) }; } @@ -908,8 +953,8 @@ rt_new_root(radix_tree *tree, uint64 key) rt_node_ptr node; node = rt_alloc_node(tree, RT_NODE_KIND_4, shift, 0, shift > 0); - tree->max_val = shift_get_max_val(shift); - tree->root = node.encoded; + pg_atomic_write_u64(&tree->ctl->max_val, shift_get_max_val(shift)); + tree->ctl->root = node.encoded; } /* @@ -918,16 +963,35 @@ rt_new_root(radix_tree *tree, uint64 key) static rt_node_ptr rt_alloc_node(radix_tree *tree, int kind, uint8 shift, uint8 chunk, bool inner) { - rt_node_ptr newnode; + rt_node_ptr newnode; + + if (tree->area != NULL) + { + dsa_pointer dp; + + if (inner) + dp = dsa_allocate0(tree->area, rt_node_kind_info[kind].inner_size); + else + dp = dsa_allocate0(tree->area, rt_node_kind_info[kind].leaf_size); - if (inner) - newnode.decoded = (rt_node *) MemoryContextAllocZero(tree->inner_slabs[kind], - rt_node_kind_info[kind].inner_size); + newnode.encoded = rt_pointer_encode((rt_pointer) dp, kind); + newnode.decoded = (rt_node *) dsa_get_address(tree->area, dp); + } else - newnode.decoded = (rt_node *) MemoryContextAllocZero(tree->leaf_slabs[kind], - rt_node_kind_info[kind].leaf_size); + { + rt_node *new; + + if (inner) + new = (rt_node *) MemoryContextAllocZero(tree->inner_slabs[kind], + rt_node_kind_info[kind].inner_size); + else + new = (rt_node *) MemoryContextAllocZero(tree->leaf_slabs[kind], + rt_node_kind_info[kind].leaf_size); + + newnode.encoded = rt_pointer_encode((rt_pointer) new, kind); + newnode.decoded = new; + } - newnode.encoded = rt_pointer_encode(newnode.decoded, kind); NODE_SHIFT(newnode) = shift; NODE_CHUNK(newnode) = chunk; @@ -941,7 +1005,7 @@ rt_alloc_node(radix_tree *tree, int kind, uint8 shift, uint8 
chunk, bool inner) #ifdef RT_DEBUG /* update the statistics */ - tree->cnt[kind]++; + tree->ctl->cnt[kind]++; #endif return newnode; @@ -968,16 +1032,19 @@ static void rt_free_node(radix_tree *tree, rt_node_ptr node) { /* If we're deleting the root node, make the tree empty */ - if (tree->root == node.encoded) - tree->root = InvalidRTPointer; + if (tree->ctl->root == node.encoded) + tree->ctl->root = InvalidRTPointer; #ifdef RT_DEBUG /* update the statistics */ - tree->cnt[NODE_KIND(node)]--; - Assert(tree->cnt[NODE_KIND(node)] >= 0); + tree->ctl->cnt[NODE_KIND(node)]--; + Assert(tree->ctl->cnt[NODE_KIND(node)] >= 0); #endif - pfree(node.decoded); + if (RadixTreeIsShared(tree)) + dsa_free(tree->area, (dsa_pointer) RTPointerUnTagKind(node.encoded)); + else + pfree(node.decoded); } /* @@ -993,7 +1060,7 @@ rt_replace_node(radix_tree *tree, rt_node_ptr parent, rt_node_ptr old_child, if (rt_node_ptr_eq(&parent, &old_child)) { /* Replace the root node with the new large node */ - tree->root = new_child.encoded; + tree->ctl->root = new_child.encoded; } else { @@ -1015,7 +1082,7 @@ static void rt_extend(radix_tree *tree, uint64 key) { int target_shift; - rt_node *root = rt_pointer_decode(tree->root); + rt_node *root = rt_pointer_decode(tree, tree->ctl->root); int shift = root->shift + RT_NODE_SPAN; target_shift = key_get_shift(key); @@ -1031,15 +1098,15 @@ rt_extend(radix_tree *tree, uint64 key) n4->base.n.count = 1; n4->base.chunks[0] = 0; - n4->children[0] = tree->root; + n4->children[0] = tree->ctl->root; root->chunk = 0; - tree->root = node.encoded; + tree->ctl->root = node.encoded; shift += RT_NODE_SPAN; } - tree->max_val = shift_get_max_val(target_shift); + pg_atomic_write_u64(&tree->ctl->max_val, shift_get_max_val(target_shift)); } /* @@ -1068,7 +1135,7 @@ rt_set_extend(radix_tree *tree, uint64 key, uint64 value, rt_node_ptr parent, } rt_node_insert_leaf(tree, parent, node, key, value); - tree->num_keys++; + pg_atomic_add_fetch_u64(&tree->ctl->num_keys, 1); } /* @@ 
-1079,8 +1146,7 @@ rt_set_extend(radix_tree *tree, uint64 key, uint64 value, rt_node_ptr parent, * pointer is set to child_p. */ static inline bool -rt_node_search_inner(rt_node_ptr node, uint64 key, rt_action action, - rt_pointer *child_p) +rt_node_search_inner(rt_node_ptr node, uint64 key, rt_action action, rt_pointer *child_p) { uint8 chunk = RT_GET_KEY_CHUNK(key, NODE_SHIFT(node)); bool found = false; @@ -1115,6 +1181,7 @@ rt_node_search_inner(rt_node_ptr node, uint64 key, rt_action action, break; found = true; + if (action == RT_ACTION_FIND) child = n32->children[idx]; else /* RT_ACTION_DELETE */ @@ -1604,33 +1671,50 @@ rt_node_insert_leaf(radix_tree *tree, rt_node_ptr parent, rt_node_ptr node, * Create the radix tree in the given memory context and return it. */ radix_tree * -rt_create(MemoryContext ctx) +rt_create(MemoryContext ctx, dsa_area *area) { radix_tree *tree; MemoryContext old_ctx; old_ctx = MemoryContextSwitchTo(ctx); - tree = palloc(sizeof(radix_tree)); + tree = (radix_tree *) palloc0(sizeof(radix_tree)); tree->context = ctx; - tree->root = InvalidRTPointer; - tree->max_val = 0; - tree->num_keys = 0; + + if (area != NULL) + { + dsa_pointer dp; + + tree->area = area; + dp = dsa_allocate0(area, sizeof(radix_tree_control)); + tree->ctl = (radix_tree_control *) dsa_get_address(area, dp); + tree->ctl->handle = (rt_handle) dp; + } + else + { + tree->ctl = (radix_tree_control *) palloc0(sizeof(radix_tree_control)); + tree->ctl->handle = InvalidDsaPointer; + } + + tree->ctl->magic = RADIXTREE_MAGIC; + tree->ctl->root = InvalidRTPointer; + pg_atomic_init_u64(&tree->ctl->max_val, 0); + pg_atomic_init_u64(&tree->ctl->num_keys, 0); /* Create the slab allocator for each size class */ - for (int i = 0; i < RT_NODE_KIND_COUNT; i++) + if (area == NULL) { - tree->inner_slabs[i] = SlabContextCreate(ctx, - rt_node_kind_info[i].name, - rt_node_kind_info[i].inner_blocksize, - rt_node_kind_info[i].inner_size); - tree->leaf_slabs[i] = SlabContextCreate(ctx, - 
rt_node_kind_info[i].name, - rt_node_kind_info[i].leaf_blocksize, - rt_node_kind_info[i].leaf_size); -#ifdef RT_DEBUG - tree->cnt[i] = 0; -#endif + for (int i = 0; i < RT_NODE_KIND_COUNT; i++) + { + tree->inner_slabs[i] = SlabContextCreate(ctx, + rt_node_kind_info[i].name, + rt_node_kind_info[i].inner_blocksize, + rt_node_kind_info[i].inner_size); + tree->leaf_slabs[i] = SlabContextCreate(ctx, + rt_node_kind_info[i].name, + rt_node_kind_info[i].leaf_blocksize, + rt_node_kind_info[i].leaf_size); + } } MemoryContextSwitchTo(old_ctx); @@ -1638,16 +1722,160 @@ rt_create(MemoryContext ctx) return tree; } +/* + * Get a handle that can be used by other processes to attach to this radix + * tree. + */ +rt_handle +rt_get_handle(radix_tree *tree) +{ + Assert(RadixTreeIsShared(tree)); + Assert(tree->ctl->magic == RADIXTREE_MAGIC); + + return tree->ctl->handle; +} + +/* + * Attach to an existing radix tree using a handle. The returned object is + * allocated in backend-local memory using the CurrentMemoryContext. + */ +radix_tree * +rt_attach(dsa_area *area, rt_handle handle) +{ + radix_tree *tree; + + /* + * Allocate the backend-local object and remember the context it lives in; + * rt_begin_iterate() switches to tree->context, so it must not stay NULL. + */ + tree = (radix_tree *) palloc0(sizeof(radix_tree)); + tree->context = CurrentMemoryContext; + + /* Find the control object in shared memory and set up the local tree */ + tree->area = area; + tree->ctl = (radix_tree_control *) dsa_get_address(area, handle); + Assert(tree->ctl->magic == RADIXTREE_MAGIC); + + return tree; +} + +/* + * Detach from a radix tree. This frees backend-local resources associated + * with the radix tree, but the radix tree will continue to exist until + * it is explicitly freed. + */ +void +rt_detach(radix_tree *tree) +{ + Assert(RadixTreeIsShared(tree)); + Assert(tree->ctl->magic == RADIXTREE_MAGIC); + + pfree(tree); +} + +/* + * Recursively free all nodes allocated to the dsa area.
+ */ +static void +rt_free_recurse(radix_tree *tree, rt_pointer ptr) +{ + rt_node_ptr node = rt_node_ptr_encoded(tree, ptr); + + Assert(RadixTreeIsShared(tree)); + + /* The leaf node doesn't have child pointers, so free it */ + if (NODE_IS_LEAF(node)) + { + dsa_free(tree->area, RTPointerUnTagKind(node.encoded)); + return; + } + + switch (NODE_KIND(node)) + { + case RT_NODE_KIND_4: + { + rt_node_inner_4 *n4 = (rt_node_inner_4 *) node.decoded; + + /* Free all children recursively */ + for (int i = 0; i < NODE_COUNT(node); i++) + rt_free_recurse(tree, n4->children[i]); + + break; + } + case RT_NODE_KIND_32: + { + rt_node_inner_32 *n32 = (rt_node_inner_32 *) node.decoded; + + /* Free all children recursively */ + for (int i = 0; i < NODE_COUNT(node); i++) + rt_free_recurse(tree, n32->children[i]); + + break; + } + case RT_NODE_KIND_128: + { + rt_node_inner_128 *n128 = (rt_node_inner_128 *) node.decoded; + + /* Free all children recursively */ + for (int i = 0; i < RT_NODE_MAX_SLOTS; i++) + { + if (!node_128_is_chunk_used((rt_node_base_128 *) n128, i)) + continue; + + rt_free_recurse(tree, node_inner_128_get_child(n128, i)); + } + break; + } + case RT_NODE_KIND_256: + { + rt_node_inner_256 *n256 = (rt_node_inner_256 *) node.decoded; + + /* Free all children recursively */ + for (int i = 0; i < RT_NODE_MAX_SLOTS; i++) + { + if (!node_inner_256_is_chunk_used(n256, i)) + continue; + + rt_free_recurse(tree, node_inner_256_get_child(n256, i)); + } + break; + } + } + + /* Free the inner node itself */ + dsa_free(tree->area, RTPointerUnTagKind(node.encoded)); +} + /* * Free the given radix tree. 
*/ void rt_free(radix_tree *tree) { - for (int i = 0; i < RT_NODE_KIND_COUNT; i++) + Assert(!RadixTreeIsShared(tree) || tree->ctl->magic == RADIXTREE_MAGIC); + + if (RadixTreeIsShared(tree)) { - MemoryContextDelete(tree->inner_slabs[i]); - MemoryContextDelete(tree->leaf_slabs[i]); + /* Free all memory used for radix tree nodes */ + if (RTPointerIsValid(tree->ctl->root)) + rt_free_recurse(tree, tree->ctl->root); + + /* + * Vandalize the control block to help catch programming error where + * other backends access the memory formerly occupied by this radix tree. + */ + tree->ctl->magic = 0; + dsa_free(tree->area, tree->ctl->handle); + } + else + { + /* Free all memory used for radix tree nodes */ + for (int i = 0; i < RT_NODE_KIND_COUNT; i++) + { + MemoryContextDelete(tree->inner_slabs[i]); + MemoryContextDelete(tree->leaf_slabs[i]); + } + pfree(tree->ctl); } pfree(tree); @@ -1665,16 +1893,18 @@ rt_set(radix_tree *tree, uint64 key, uint64 value) rt_node_ptr node; rt_node_ptr parent; + Assert(!RadixTreeIsShared(tree) || tree->ctl->magic == RADIXTREE_MAGIC); + /* Empty tree, create the root */ - if (!RTPointerIsValid(tree->root)) + if (!RTPointerIsValid(tree->ctl->root)) rt_new_root(tree, key); /* Extend the tree if necessary */ - if (key > tree->max_val) + if (key > pg_atomic_read_u64(&tree->ctl->max_val)) rt_extend(tree, key); /* Descend the tree until a leaf node */ - node = parent = rt_node_ptr_encoded(tree->root); + node = parent = rt_node_ptr_encoded(tree, tree->ctl->root); shift = NODE_SHIFT(node); while (shift >= 0) { @@ -1690,7 +1920,7 @@ rt_set(radix_tree *tree, uint64 key, uint64 value) } parent = node; - node = rt_node_ptr_encoded(child); + node = rt_node_ptr_encoded(tree, child); shift -= RT_NODE_SPAN; } @@ -1698,7 +1928,7 @@ rt_set(radix_tree *tree, uint64 key, uint64 value) /* Update the statistics */ if (!updated) - tree->num_keys++; + pg_atomic_add_fetch_u64(&tree->ctl->num_keys, 1); return updated; } @@ -1714,12 +1944,14 @@ rt_search(radix_tree *tree, 
uint64 key, uint64 *value_p) rt_node_ptr node; int shift; + Assert(!RadixTreeIsShared(tree) || tree->ctl->magic == RADIXTREE_MAGIC); Assert(value_p != NULL); - if (!RTPointerIsValid(tree->root) || key > tree->max_val) + if (!RTPointerIsValid(tree->ctl->root) || + key > pg_atomic_read_u64(&tree->ctl->max_val)) return false; - node = rt_node_ptr_encoded(tree->root); + node = rt_node_ptr_encoded(tree, tree->ctl->root); shift = NODE_SHIFT(node); /* Descend the tree until a leaf node */ @@ -1733,7 +1965,7 @@ rt_search(radix_tree *tree, uint64 key, uint64 *value_p) if (!rt_node_search_inner(node, key, RT_ACTION_FIND, &child)) return false; - node = rt_node_ptr_encoded(child); + node = rt_node_ptr_encoded(tree, child); shift -= RT_NODE_SPAN; } @@ -1753,14 +1985,17 @@ rt_delete(radix_tree *tree, uint64 key) int level; bool deleted; - if (!tree->root || key > tree->max_val) + Assert(!RadixTreeIsShared(tree) || tree->ctl->magic == RADIXTREE_MAGIC); + + if (!RTPointerIsValid(tree->ctl->root) || + key > pg_atomic_read_u64(&tree->ctl->max_val)) return false; /* * Descend the tree to search the key while building a stack of nodes we * visited. */ - node = rt_node_ptr_encoded(tree->root); + node = rt_node_ptr_encoded(tree, tree->ctl->root); shift = NODE_SHIFT(node); level = -1; while (shift > 0) @@ -1773,7 +2008,7 @@ rt_delete(radix_tree *tree, uint64 key) if (!rt_node_search_inner(node, key, RT_ACTION_FIND, &child)) return false; - node = rt_node_ptr_encoded(child); + node = rt_node_ptr_encoded(tree, child); shift -= RT_NODE_SPAN; } @@ -1788,7 +2023,7 @@ rt_delete(radix_tree *tree, uint64 key) } /* Found the key to delete. 
Update the statistics */ - tree->num_keys--; + pg_atomic_sub_fetch_u64(&tree->ctl->num_keys, 1); /* * Return if the leaf node still has keys and we don't need to delete the @@ -1822,8 +2057,8 @@ rt_delete(radix_tree *tree, uint64 key) */ if (level == 0) { - tree->root = InvalidRTPointer; - tree->max_val = 0; + tree->ctl->root = InvalidRTPointer; + pg_atomic_write_u64(&tree->ctl->max_val, 0); } return true; @@ -1838,6 +2073,8 @@ rt_begin_iterate(radix_tree *tree) rt_iter *iter; int top_level; + Assert(!RadixTreeIsShared(tree) || tree->ctl->magic == RADIXTREE_MAGIC); + old_ctx = MemoryContextSwitchTo(tree->context); iter = (rt_iter *) palloc0(sizeof(rt_iter)); @@ -1847,7 +2084,7 @@ rt_begin_iterate(radix_tree *tree) - if (!RTPointerIsValid(iter->tree)) + if (!RTPointerIsValid(tree->ctl->root)) return iter; - root = rt_node_ptr_encoded(iter->tree->root); + root = rt_node_ptr_encoded(tree, iter->tree->ctl->root); top_level = NODE_SHIFT(root) / RT_NODE_SPAN; iter->stack_len = top_level; @@ -1898,6 +2135,8 @@ rt_update_iter_stack(rt_iter *iter, rt_node_ptr from_node, int from) bool rt_iterate_next(rt_iter *iter, uint64 *key_p, uint64 *value_p) { + Assert(!RadixTreeIsShared(iter->tree) || iter->tree->ctl->magic == RADIXTREE_MAGIC); + /* Empty tree */ - if (!iter->tree) + if (!RTPointerIsValid(iter->tree->ctl->root)) return false; @@ -2043,7 +2282,7 @@ rt_node_inner_iterate_next(rt_iter *iter, rt_node_iter *node_iter, rt_node_ptr * if (found) { rt_iter_update_key(iter, key_chunk, NODE_SHIFT(node)); - *child_p = rt_node_ptr_encoded(child); + *child_p = rt_node_ptr_encoded(iter->tree, child); } return found; @@ -2146,7 +2385,7 @@ rt_node_leaf_iterate_next(rt_iter *iter, rt_node_iter *node_iter, uint64 *value_ uint64 rt_num_entries(radix_tree *tree) { - return tree->num_keys; + return pg_atomic_read_u64(&tree->ctl->num_keys); } /* @@ -2155,12 +2394,19 @@ rt_num_entries(radix_tree *tree) uint64 rt_memory_usage(radix_tree *tree) { - Size total = sizeof(radix_tree); + Size total = sizeof(radix_tree) + sizeof(radix_tree_control); - for (int i = 0; i < RT_NODE_KIND_COUNT; 
i++) + Assert(!RadixTreeIsShared(tree) || tree->ctl->magic == RADIXTREE_MAGIC); + + if (RadixTreeIsShared(tree)) + total = dsa_get_total_size(tree->area); + else { - total += MemoryContextMemAllocated(tree->inner_slabs[i], true); - total += MemoryContextMemAllocated(tree->leaf_slabs[i], true); + for (int i = 0; i < RT_NODE_KIND_COUNT; i++) + { + total += MemoryContextMemAllocated(tree->inner_slabs[i], true); + total += MemoryContextMemAllocated(tree->leaf_slabs[i], true); + } } return total; @@ -2244,19 +2490,19 @@ rt_verify_node(rt_node_ptr node) void rt_stats(radix_tree *tree) { - rt_node_ptr root = rt_node_ptr_encoded(tree->root); + rt_node_ptr root = rt_node_ptr_encoded(tree, tree->ctl->root); ereport(LOG, (errmsg("num_keys = %lu, height = %u, n4 = %u, n32 = %u, n128 = %u, n256 = %u", - tree->num_keys, + pg_atomic_read_u64(&tree->ctl->num_keys), NODE_SHIFT(root) / RT_NODE_SPAN, - tree->cnt[0], - tree->cnt[1], - tree->cnt[2], - tree->cnt[3]))); + tree->ctl->cnt[0], + tree->ctl->cnt[1], + tree->ctl->cnt[2], + tree->ctl->cnt[3]))); } static void -rt_dump_node(rt_node_ptr node, int level, bool recurse) +rt_dump_node(radix_tree *tree, rt_node_ptr node, int level, bool recurse) { rt_node *n = node.decoded; char space[128] = {0}; @@ -2292,7 +2538,7 @@ rt_dump_node(rt_node_ptr node, int level, bool recurse) space, n4->base.chunks[i]); if (recurse) - rt_dump_node(rt_node_ptr_encoded(n4->children[i]), + rt_dump_node(tree, rt_node_ptr_encoded(tree, n4->children[i]), level + 1, recurse); else fprintf(stderr, "\n"); @@ -2320,7 +2566,7 @@ rt_dump_node(rt_node_ptr node, int level, bool recurse) if (recurse) { - rt_dump_node(rt_node_ptr_encoded(n32->children[i]), + rt_dump_node(tree, rt_node_ptr_encoded(tree, n32->children[i]), level + 1, recurse); } else @@ -2373,7 +2619,9 @@ rt_dump_node(rt_node_ptr node, int level, bool recurse) space, i); if (recurse) - rt_dump_node(rt_node_ptr_encoded(node_inner_128_get_child(n128, i)), + rt_dump_node(tree, + rt_node_ptr_encoded(tree, + 
node_inner_128_get_child(n128, i)), level + 1, recurse); else fprintf(stderr, "\n"); @@ -2406,7 +2654,9 @@ rt_dump_node(rt_node_ptr node, int level, bool recurse) space, i); if (recurse) - rt_dump_node(rt_node_ptr_encoded(node_inner_256_get_child(n256, i)), + rt_dump_node(tree, + rt_node_ptr_encoded(tree, + node_inner_256_get_child(n256, i)), level + 1, recurse); else fprintf(stderr, "\n"); @@ -2417,6 +2667,27 @@ rt_dump_node(rt_node_ptr node, int level, bool recurse) } } +void +rt_dump(radix_tree *tree) +{ + for (int i = 0; i < RT_NODE_KIND_COUNT; i++) + fprintf(stderr, "%s\tinner_size%lu\tinner_blocksize %lu\tleaf_size %lu\tleaf_blocksize %lu\n", + rt_node_kind_info[i].name, + rt_node_kind_info[i].inner_size, + rt_node_kind_info[i].inner_blocksize, + rt_node_kind_info[i].leaf_size, + rt_node_kind_info[i].leaf_blocksize); + fprintf(stderr, "max_val = %lu\n", pg_atomic_read_u64(&tree->ctl->max_val)); + + if (!RTPointerIsValid(tree->ctl->root)) + { + fprintf(stderr, "empty tree\n"); + return; + } + + rt_dump_node(tree, rt_node_ptr_encoded(tree, tree->ctl->root), 0, true); +} + void rt_dump_search(radix_tree *tree, uint64 key) { @@ -2425,28 +2696,30 @@ rt_dump_search(radix_tree *tree, uint64 key) int level = 0; elog(NOTICE, "-----------------------------------------------------------"); - elog(NOTICE, "max_val = %lu (0x%lX)", tree->max_val, tree->max_val); + elog(NOTICE, "max_val = %lu (0x%lX)", + pg_atomic_read_u64(&tree->ctl->max_val), + pg_atomic_read_u64(&tree->ctl->max_val)); - if (!RTPointerIsValid(tree->root)) + if (!RTPointerIsValid(tree->ctl->root)) { elog(NOTICE, "tree is empty"); return; } - if (key > tree->max_val) + if (key > pg_atomic_read_u64(&tree->ctl->max_val)) { elog(NOTICE, "key %lu (0x%lX) is larger than max val", key, key); return; } - node = rt_node_ptr_encoded(tree->root); + node = rt_node_ptr_encoded(tree, tree->ctl->root); shift = NODE_SHIFT(node); while (shift >= 0) { rt_pointer child; - rt_dump_node(node, level, false); + rt_dump_node(tree, node, level, 
false); if (NODE_IS_LEAF(node)) { @@ -2461,33 +2734,9 @@ rt_dump_search(radix_tree *tree, uint64 key) if (!rt_node_search_inner(node, key, RT_ACTION_FIND, &child)) break; - node = rt_node_ptr_encoded(child); + node = rt_node_ptr_encoded(tree, child); shift -= RT_NODE_SPAN; level++; } } - -void -rt_dump(radix_tree *tree) -{ - rt_node_ptr root; - - for (int i = 0; i < RT_NODE_KIND_COUNT; i++) - fprintf(stderr, "%s\tinner_size%lu\tinner_blocksize %lu\tleaf_size %lu\tleaf_blocksize %lu\n", - rt_node_kind_info[i].name, - rt_node_kind_info[i].inner_size, - rt_node_kind_info[i].inner_blocksize, - rt_node_kind_info[i].leaf_size, - rt_node_kind_info[i].leaf_blocksize); - fprintf(stderr, "max_val = %lu\n", tree->max_val); - - if (!RTPointerIsValid(tree->root)) - { - fprintf(stderr, "empty tree\n"); - return; - } - - root = rt_node_ptr_encoded(tree->root); - rt_dump_node(root, 0, true); -} #endif diff --git a/src/backend/utils/mmgr/dsa.c b/src/backend/utils/mmgr/dsa.c index 82376fde2d..ad169882af 100644 --- a/src/backend/utils/mmgr/dsa.c +++ b/src/backend/utils/mmgr/dsa.c @@ -1024,6 +1024,18 @@ dsa_set_size_limit(dsa_area *area, size_t limit) LWLockRelease(DSA_AREA_LOCK(area)); } +size_t +dsa_get_total_size(dsa_area *area) +{ + size_t size; + + LWLockAcquire(DSA_AREA_LOCK(area), LW_SHARED); + size = area->control->total_segment_size; + LWLockRelease(DSA_AREA_LOCK(area)); + + return size; +} + /* * Aggressively free all spare memory in the hope of returning DSM segments to * the operating system. 
diff --git a/src/include/lib/radixtree.h b/src/include/lib/radixtree.h index d5d7668617..68a11df970 100644 --- a/src/include/lib/radixtree.h +++ b/src/include/lib/radixtree.h @@ -14,18 +14,24 @@ #define RADIXTREE_H #include "postgres.h" +#include "utils/dsa.h" #define RT_DEBUG 1 typedef struct radix_tree radix_tree; typedef struct rt_iter rt_iter; +typedef dsa_pointer rt_handle; -extern radix_tree *rt_create(MemoryContext ctx); +extern radix_tree *rt_create(MemoryContext ctx, dsa_area *dsa); extern void rt_free(radix_tree *tree); extern bool rt_search(radix_tree *tree, uint64 key, uint64 *val_p); extern bool rt_set(radix_tree *tree, uint64 key, uint64 val); extern rt_iter *rt_begin_iterate(radix_tree *tree); +extern rt_handle rt_get_handle(radix_tree *tree); +extern radix_tree *rt_attach(dsa_area *dsa, rt_handle handle); +extern void rt_detach(radix_tree *tree); + extern bool rt_iterate_next(rt_iter *iter, uint64 *key_p, uint64 *value_p); extern void rt_end_iterate(rt_iter *iter); extern bool rt_delete(radix_tree *tree, uint64 key); diff --git a/src/include/utils/dsa.h b/src/include/utils/dsa.h index 405606fe2f..dad06adecc 100644 --- a/src/include/utils/dsa.h +++ b/src/include/utils/dsa.h @@ -117,6 +117,7 @@ extern dsa_handle dsa_get_handle(dsa_area *area); extern dsa_pointer dsa_allocate_extended(dsa_area *area, size_t size, int flags); extern void dsa_free(dsa_area *area, dsa_pointer dp); extern void *dsa_get_address(dsa_area *area, dsa_pointer dp); +extern size_t dsa_get_total_size(dsa_area *area); extern void dsa_trim(dsa_area *area); extern void dsa_dump(dsa_area *area); diff --git a/src/test/modules/test_radixtree/expected/test_radixtree.out b/src/test/modules/test_radixtree/expected/test_radixtree.out index cc6970c87c..a0ff1e1c77 100644 --- a/src/test/modules/test_radixtree/expected/test_radixtree.out +++ b/src/test/modules/test_radixtree/expected/test_radixtree.out @@ -5,21 +5,38 @@ CREATE EXTENSION test_radixtree; -- SELECT test_radixtree(); NOTICE: testing 
radix tree node types with shift "0" +NOTICE: testing radix tree node types with shift "0" +NOTICE: testing radix tree node types with shift "8" NOTICE: testing radix tree node types with shift "8" NOTICE: testing radix tree node types with shift "16" +NOTICE: testing radix tree node types with shift "16" NOTICE: testing radix tree node types with shift "24" +NOTICE: testing radix tree node types with shift "24" +NOTICE: testing radix tree node types with shift "32" NOTICE: testing radix tree node types with shift "32" NOTICE: testing radix tree node types with shift "40" +NOTICE: testing radix tree node types with shift "40" +NOTICE: testing radix tree node types with shift "48" NOTICE: testing radix tree node types with shift "48" NOTICE: testing radix tree node types with shift "56" +NOTICE: testing radix tree node types with shift "56" +NOTICE: testing radix tree with pattern "all ones" NOTICE: testing radix tree with pattern "all ones" NOTICE: testing radix tree with pattern "alternating bits" +NOTICE: testing radix tree with pattern "alternating bits" +NOTICE: testing radix tree with pattern "clusters of ten" NOTICE: testing radix tree with pattern "clusters of ten" NOTICE: testing radix tree with pattern "clusters of hundred" +NOTICE: testing radix tree with pattern "clusters of hundred" +NOTICE: testing radix tree with pattern "one-every-64k" NOTICE: testing radix tree with pattern "one-every-64k" NOTICE: testing radix tree with pattern "sparse" +NOTICE: testing radix tree with pattern "sparse" +NOTICE: testing radix tree with pattern "single values, distance > 2^32" NOTICE: testing radix tree with pattern "single values, distance > 2^32" NOTICE: testing radix tree with pattern "clusters, distance > 2^32" +NOTICE: testing radix tree with pattern "clusters, distance > 2^32" +NOTICE: testing radix tree with pattern "clusters, distance > 2^60" NOTICE: testing radix tree with pattern "clusters, distance > 2^60" test_radixtree ---------------- diff --git 
a/src/test/modules/test_radixtree/test_radixtree.c b/src/test/modules/test_radixtree/test_radixtree.c index cb3596755d..a948cba4ec 100644 --- a/src/test/modules/test_radixtree/test_radixtree.c +++ b/src/test/modules/test_radixtree/test_radixtree.c @@ -19,6 +19,7 @@ #include "nodes/bitmapset.h" #include "storage/block.h" #include "storage/itemptr.h" +#include "storage/lwlock.h" #include "utils/memutils.h" #include "utils/timestamp.h" @@ -111,7 +112,7 @@ test_empty(void) radix_tree *radixtree; uint64 dummy; - radixtree = rt_create(CurrentMemoryContext); + radixtree = rt_create(CurrentMemoryContext, NULL); if (rt_search(radixtree, 0, &dummy)) elog(ERROR, "rt_search on empty tree returned true"); @@ -217,14 +218,10 @@ test_node_types_delete(radix_tree *radixtree, uint8 shift) * level. */ static void -test_node_types(uint8 shift) +do_test_node_types(radix_tree *radixtree, uint8 shift) { - radix_tree *radixtree; - elog(NOTICE, "testing radix tree node types with shift \"%d\"", shift); - radixtree = rt_create(CurrentMemoryContext); - /* * Insert and search entries for every node type at the 'shift' level, * then delete all entries to make it empty, and insert and search entries @@ -233,19 +230,39 @@ test_node_types(uint8 shift) test_node_types_insert(radixtree, shift); test_node_types_delete(radixtree, shift); test_node_types_insert(radixtree, shift); +} - rt_free(radixtree); +static void +test_node_types(void) +{ + int tranche_id = LWLockNewTrancheId(); + + for (int shift = 0; shift <= (64 - 8); shift += 8) + { + radix_tree *tree; + dsa_area *dsa; + + /* Test the local radix tree */ + tree = rt_create(CurrentMemoryContext, NULL); + do_test_node_types(tree, shift); + rt_free(tree); + + /* Test the shared radix tree */ + dsa = dsa_create(tranche_id); + tree = rt_create(CurrentMemoryContext, dsa); + do_test_node_types(tree, shift); + rt_free(tree); + dsa_detach(dsa); + } } /* * Test with a repeating pattern, defined by the 'spec'. 
*/ static void -test_pattern(const test_spec * spec) +do_test_pattern(radix_tree *radixtree, const test_spec * spec) { - radix_tree *radixtree; rt_iter *iter; - MemoryContext radixtree_ctx; TimestampTz starttime; TimestampTz endtime; uint64 n; @@ -271,18 +288,6 @@ test_pattern(const test_spec * spec) pattern_values[pattern_num_values++] = i; } - /* - * Allocate the radix tree. - * - * Allocate it in a separate memory context, so that we can print its - * memory usage easily. - */ - radixtree_ctx = AllocSetContextCreate(CurrentMemoryContext, - "radixtree test", - ALLOCSET_SMALL_SIZES); - MemoryContextSetIdentifier(radixtree_ctx, spec->test_name); - radixtree = rt_create(radixtree_ctx); - /* * Add values to the set. */ @@ -336,8 +341,6 @@ test_pattern(const test_spec * spec) mem_usage = rt_memory_usage(radixtree); fprintf(stderr, "rt_memory_usage() reported " UINT64_FORMAT " (%0.2f bytes / integer)\n", mem_usage, (double) mem_usage / spec->num_values); - - MemoryContextStats(radixtree_ctx); } /* Check that rt_num_entries works */ @@ -484,21 +487,54 @@ test_pattern(const test_spec * spec) if ((nbefore - ndeleted) != nafter) elog(ERROR, "rt_num_entries returned " UINT64_FORMAT ", expected " UINT64_FORMAT "after " UINT64_FORMAT " deletion", nafter, (nbefore - ndeleted), ndeleted); +} + +static void +test_patterns(void) +{ + int tranche_id = LWLockNewTrancheId(); + + /* Test different test patterns, with lots of entries */ + for (int i = 0; i < lengthof(test_specs); i++) + { + radix_tree *tree; + MemoryContext radixtree_ctx; + dsa_area *dsa; + const test_spec *spec = &test_specs[i]; - MemoryContextDelete(radixtree_ctx); + /* + * Allocate the radix tree. + * + * Allocate it in a separate memory context, so that we can print its + * memory usage easily. 
+ */ + radixtree_ctx = AllocSetContextCreate(CurrentMemoryContext, + "radixtree test", + ALLOCSET_SMALL_SIZES); + MemoryContextSetIdentifier(radixtree_ctx, spec->test_name); + + /* Test the local radix tree */ + tree = rt_create(radixtree_ctx, NULL); + do_test_pattern(tree, spec); + rt_free(tree); + MemoryContextReset(radixtree_ctx); + + /* Test the shared radix tree */ + dsa = dsa_create(tranche_id); + tree = rt_create(radixtree_ctx, dsa); + do_test_pattern(tree, spec); + rt_free(tree); + dsa_detach(dsa); + MemoryContextDelete(radixtree_ctx); + } } Datum test_radixtree(PG_FUNCTION_ARGS) { test_empty(); - - for (int shift = 0; shift <= (64 - 8); shift += 8) - test_node_types(shift); - - /* Test different test patterns, with lots of entries */ - for (int i = 0; i < lengthof(test_specs); i++) - test_pattern(&test_specs[i]); + test_node_types(); + test_patterns(); PG_RETURN_VOID(); } -- 2.31.1