From 6cd239b14d521f2f1377730874c27b4eb9281217 Mon Sep 17 00:00:00 2001 From: Masahiko Sawada Date: Wed, 14 Sep 2022 12:38:51 +0000 Subject: [POC PATCH 2/3] Add radix implementation. --- src/backend/lib/Makefile | 1 + src/backend/lib/radixtree.c | 2439 +++++++++++++++++ src/include/lib/radixtree.h | 42 + src/test/modules/Makefile | 1 + src/test/modules/test_radixtree/.gitignore | 4 + src/test/modules/test_radixtree/Makefile | 23 + src/test/modules/test_radixtree/README | 7 + .../expected/test_radixtree.out | 28 + .../test_radixtree/sql/test_radixtree.sql | 7 + .../test_radixtree/test_radixtree--1.0.sql | 8 + .../modules/test_radixtree/test_radixtree.c | 504 ++++ .../test_radixtree/test_radixtree.control | 4 + 12 files changed, 3068 insertions(+) create mode 100644 src/backend/lib/radixtree.c create mode 100644 src/include/lib/radixtree.h create mode 100644 src/test/modules/test_radixtree/.gitignore create mode 100644 src/test/modules/test_radixtree/Makefile create mode 100644 src/test/modules/test_radixtree/README create mode 100644 src/test/modules/test_radixtree/expected/test_radixtree.out create mode 100644 src/test/modules/test_radixtree/sql/test_radixtree.sql create mode 100644 src/test/modules/test_radixtree/test_radixtree--1.0.sql create mode 100644 src/test/modules/test_radixtree/test_radixtree.c create mode 100644 src/test/modules/test_radixtree/test_radixtree.control diff --git a/src/backend/lib/Makefile b/src/backend/lib/Makefile index 9dad31398a..4c1db794b6 100644 --- a/src/backend/lib/Makefile +++ b/src/backend/lib/Makefile @@ -22,6 +22,7 @@ OBJS = \ integerset.o \ knapsack.o \ pairingheap.o \ + radixtree.o \ rbtree.o \ include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/lib/radixtree.c b/src/backend/lib/radixtree.c new file mode 100644 index 0000000000..93c81b843f --- /dev/null +++ b/src/backend/lib/radixtree.c @@ -0,0 +1,2439 @@ +/*------------------------------------------------------------------------- + * + * radixtree.c + * 
Implementation for adaptive radix tree.
+ *
+ * This module employs the idea from the paper "The Adaptive Radix Tree: ARTful
+ * Indexing for Main-Memory Databases" by Viktor Leis, Alfons Kemper, and Thomas
+ * Neumann, 2013. The radix tree uses adaptive node sizes, a small number of node
+ * types, each with a different number of elements. Depending on the number of
+ * children, the appropriate node type is used.
+ *
+ * There are some differences from the proposed implementation. For instance,
+ * this radix tree module utilizes AVX2 instructions, enabling us to use a 256-bit
+ * wide SIMD vector, whereas a 128-bit wide SIMD vector is used in the paper.
+ * Also, there is no support for path compression and lazy path expansion. The
+ * radix tree supports a fixed key length, so we don't expect the tree to
+ * become very high.
+ *
+ * Both the key and the value are 64-bit unsigned integers. The inner nodes and
+ * the leaf nodes have slightly different structures: inner tree nodes,
+ * shift > 0, store the pointer to their child node as the value. The leaf nodes,
+ * shift == 0, have the 64-bit unsigned integer that is specified by the user as
+ * the value. The paper refers to this technique as "Multi-value leaves". We
+ * choose it to avoid an additional pointer traversal. It is the reason this code
+ * currently does not support variable-length keys.
+ *
+ * XXX: Most functions in this file have two variants, one for inner nodes and
+ * one for leaf nodes, so there is duplicated code. While this sometimes makes
+ * code maintenance tricky, it reduces branch prediction misses when judging
+ * whether the node is an inner node or a leaf node.
+ *
+ * XXX: radix tree nodes are never shrunk.
+ *
+ * Interface
+ * ---------
+ *
+ * rt_create - Create a new, empty radix tree
+ * rt_free - Free the radix tree
+ * rt_search - Search a key-value pair
+ * rt_set - Set a key-value pair
+ * rt_delete - Delete a key-value pair
+ * rt_begin_iterate - Begin iterating through all key-value pairs
+ * rt_iterate_next - Return next key-value pair, if any
+ * rt_end_iter - End iteration
+ * rt_memory_usage - Get the memory usage
+ * rt_num_entries - Get the number of key-value pairs
+ *
+ * rt_create() creates an empty radix tree in the given memory context, and
+ * also creates the memory contexts for all kinds of radix tree nodes under
+ * that memory context.
+ *
+ * rt_iterate_next() returns the key-value pairs in ascending order of the
+ * key.
+ *
+ * Copyright (c) 2022, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/backend/lib/radixtree.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "miscadmin.h"
+#include "port/pg_bitutils.h"
+#include "port/pg_lfind.h"
+#include "utils/memutils.h"
+#include "lib/radixtree.h"
+#include "lib/stringinfo.h"
+
+/* The number of bits encoded in one tree level */
+#define RT_NODE_SPAN BITS_PER_BYTE
+
+/* The maximum number of slots in a node */
+#define RT_NODE_MAX_SLOTS (1 << RT_NODE_SPAN)
+
+/*
+ * Return the number of bytes of is-set bitmap needed to cover nslots slots;
+ * used by nodes indexed by array lookup. (Despite the name, the macro yields
+ * a byte count: nslots divided by the number of bits per byte.)
+ */ +#define RT_NODE_NSLOTS_BITS(nslots) ((nslots) / (sizeof(uint8) * BITS_PER_BYTE)) + +/* Mask for extracting a chunk from the key */ +#define RT_CHUNK_MASK ((1 << RT_NODE_SPAN) - 1) + +/* Maximum shift the radix tree uses */ +#define RT_MAX_SHIFT key_get_shift(UINT64_MAX) + +/* Tree level the radix tree uses */ +#define RT_MAX_LEVEL ((sizeof(uint64) * BITS_PER_BYTE) / RT_NODE_SPAN) + +/* Invalid index used in node-128 */ +#define RT_NODE_128_INVALID_IDX 0xFF + +/* Get a chunk from the key */ +#define RT_GET_KEY_CHUNK(key, shift) ((uint8) (((key) >> (shift)) & RT_CHUNK_MASK)) + +/* + * Mapping from the value to the bit in is-set bitmap in the node-256. + */ +#define RT_NODE_BITMAP_BYTE(v) ((v) / BITS_PER_BYTE) +#define RT_NODE_BITMAP_BIT(v) (UINT64CONST(1) << ((v) % RT_NODE_SPAN)) + +/* Enum used rt_node_search() */ +typedef enum +{ + RT_ACTION_FIND = 0, /* find the key-value */ + RT_ACTION_DELETE, /* delete the key-value */ +} rt_action; + +/* Base type for all nodes types */ +typedef struct rt_node +{ + /* The number of children and the node kind */ + uint16 info; + + /* + * Shift indicates which part of the key space is represented by this + * node. That is, the key is shifted by 'shift' and the lowest + * RT_NODE_SPAN bits are then represented in chunk. + */ + uint8 shift; + uint8 chunk; +} rt_node; + +/* + * Flags and masks for 'info'. + * + * The lowest 9 bits of 'info' represent the number of children in the node, and + * the next 2 bits are node kind. + */ +#define RT_NODE_INFO_COUNT_BITS 9 +#define RT_NODE_INFO_KIND_BITS 2 +#define RT_NODE_INFO_COUNT_MASK ((1 << RT_NODE_INFO_COUNT_BITS) - 1) +#define RT_NODE_INFO_KIND_MASK ((1 << RT_NODE_INFO_KIND_BITS) - 1) + +/* + * Supported radix tree node kinds. + * + * XXX: These are currently not well chosen. To reduce memory fragmentation + * smaller class should optimally fit neatly into the next larger class + * (except perhaps at the lowest end). 
Right now its + * 40/40 -> 296/286 -> 1288/1304 -> 2056/2088 bytes for inner nodes and + * leaf nodes, respectively, leading to large amount of allocator padding + * with aset.c. Hence the use of slab. + * + * XXX: need to have node-1 until there is no path compression optimization? + * + * XXX: need to explain why we choose these node types based on benchmark + * results etc. + */ +#define RT_NODE_KIND_4 0x00 +#define RT_NODE_KIND_32 0x01 +#define RT_NODE_KIND_128 0x02 +#define RT_NODE_KIND_256 0x03 +#define RT_NODE_KIND_COUNT 4 + +/* Macros to access the count and the kind in 'info' */ +#define NODE_GET_COUNT(n) (((rt_node *) (n))->info & RT_NODE_INFO_COUNT_MASK) +#define NODE_GET_KIND(n) \ + (((((rt_node* ) (n))->info) >> RT_NODE_INFO_COUNT_BITS) & RT_NODE_INFO_KIND_MASK) +#define NODE_INCREMENT_COUNT(n) \ + { \ + ((rt_node *) (n))->info++; \ + Assert(NODE_GET_COUNT(n) <= rt_node_kind_info[NODE_GET_KIND(n)].fanout); \ + } while (0) +#define NODE_DECREMENT_COUNT(n) \ + { \ + ((rt_node *) (n))->info--; \ + Assert(NODE_GET_COUNT(n) >= 0); \ + } while(0) +#define NODE_SET_COUNT(n, count) \ + { \ + ((rt_node *) (n))->info &= ~RT_NODE_INFO_COUNT_MASK; \ + ((rt_node *) (n))->info |= (count); \ + } while (0) +#define NODE_SET_KIND(n, kind) \ + { \ + ((rt_node *) (n))->info &= ~RT_NODE_INFO_KIND_MASK; \ + ((rt_node *) (n))->info |= ((kind) << RT_NODE_INFO_COUNT_BITS); \ + } while (0) +#define NODE_IS_LEAF(n) (((rt_node *) (n))->shift == 0) +#define NODE_IS_EMPTY(n) (NODE_GET_COUNT(((rt_node *) (n))) == 0) +#define NODE_HAS_FREE_SLOT(n) \ + (NODE_GET_COUNT(n) < rt_node_kind_info[NODE_GET_KIND(n)].fanout) + +/* Base type of each node kinds for leaf and inner nodes */ +typedef struct rt_node_base_4 +{ + rt_node n; + + /* 4 children, for key chunks */ + uint8 chunks[4]; +} rt_node_base_4; + +typedef struct rt_node_base32 +{ + rt_node n; + + /* 32 children, for key chunks */ + uint8 chunks[32]; +} rt_node_base_32; + +/* + * node-128 uses slot_idx array, an array of 
RT_NODE_MAX_SLOTS length, typically + * 256, to store indexes into a second array that contains up to 128 values (or + * child pointers in inner nodes). + */ +typedef struct rt_node_base128 +{ + rt_node n; + + /* The index of slots for each fanout */ + uint8 slot_idxs[RT_NODE_MAX_SLOTS]; +} rt_node_base_128; + +typedef struct rt_node_base256 +{ + rt_node n; +} rt_node_base_256; + +/* + * Inner and leaf nodes. + * + * There are separate from inner node size classes for two main reasons: + * + * 1) the value type might be different than something fitting into a pointer + * width type + * 2) Need to represent non-existing values in a key-type independent way. + * + * 1) is clearly worth being concerned about, but it's not clear 2) is as + * good. It might be better to just indicate non-existing entries the same way + * in inner nodes. + */ +typedef struct rt_node_inner_4 +{ + rt_node_base_4 base; + + /* 4 children, for key chunks */ + rt_node *children[4]; +} rt_node_inner_4; + +typedef struct rt_node_leaf_4 +{ + rt_node_base_4 base; + + /* 4 values, for key chunks */ + uint64 values[4]; +} rt_node_leaf_4; + +typedef struct rt_node_inner_32 +{ + rt_node_base_32 base; + + /* 32 children, for key chunks */ + rt_node *children[32]; +} rt_node_inner_32; + +typedef struct rt_node_leaf_32 +{ + rt_node_base_32 base; + + /* 32 values, for key chunks */ + uint64 values[32]; +} rt_node_leaf_32; + +typedef struct rt_node_inner_128 +{ + rt_node_base_128 base; + + /* Slots for 128 children */ + rt_node *children[128]; +} rt_node_inner_128; + +typedef struct rt_node_leaf_128 +{ + rt_node_base_128 base; + + /* isset is a bitmap to track which slot is in use */ + uint8 isset[RT_NODE_NSLOTS_BITS(128)]; + + /* Slots for 128 values */ + uint64 values[128]; +} rt_node_leaf_128; + +/* + * node-256 is the largest node type. This node has RT_NODE_MAX_SLOTS length array + * for directly storing values (or child pointers in inner nodes). 
+ */ +typedef struct rt_node_inner_256 +{ + rt_node_base_256 base; + + /* Slots for 256 children */ + rt_node *children[RT_NODE_MAX_SLOTS]; +} rt_node_inner_256; + +typedef struct rt_node_leaf_256 +{ + rt_node_base_256 base; + + /* isset is a bitmap to track which slot is in use */ + uint8 isset[RT_NODE_NSLOTS_BITS(RT_NODE_MAX_SLOTS)]; + + /* Slots for 256 values */ + uint64 values[RT_NODE_MAX_SLOTS]; +} rt_node_leaf_256; + +/* Information of each size kinds */ +typedef struct rt_node_kind_info_elem +{ + const char *name; + int fanout; + + /* slab chunk size */ + Size inner_size; + Size leaf_size; + + /* slab block size */ + Size inner_blocksize; + Size leaf_blocksize; +} rt_node_kind_info_elem; + +/* + * Calculate the slab blocksize so that we can allocate at least 32 chunks + * from the block. + */ +#define NODE_SLAB_BLOCK_SIZE(size) \ + Max((SLAB_DEFAULT_BLOCK_SIZE / (size)) * size, (size) * 32) +static rt_node_kind_info_elem rt_node_kind_info[RT_NODE_KIND_COUNT] = { + + [RT_NODE_KIND_4] = { + .name = "radix tree node 4", + .fanout = 4, + .inner_size = sizeof(rt_node_inner_4), + .leaf_size = sizeof(rt_node_leaf_4), + .inner_blocksize = NODE_SLAB_BLOCK_SIZE(sizeof(rt_node_inner_4)), + .leaf_blocksize = NODE_SLAB_BLOCK_SIZE(sizeof(rt_node_leaf_4)), + }, + [RT_NODE_KIND_32] = { + .name = "radix tree node 32", + .fanout = 32, + .inner_size = sizeof(rt_node_inner_32), + .leaf_size = sizeof(rt_node_leaf_32), + .inner_blocksize = NODE_SLAB_BLOCK_SIZE(sizeof(rt_node_inner_32)), + .leaf_blocksize = NODE_SLAB_BLOCK_SIZE(sizeof(rt_node_leaf_32)), + }, + [RT_NODE_KIND_128] = { + .name = "radix tree node 128", + .fanout = 128, + .inner_size = sizeof(rt_node_inner_128), + .leaf_size = sizeof(rt_node_leaf_128), + .inner_blocksize = NODE_SLAB_BLOCK_SIZE(sizeof(rt_node_inner_128)), + .leaf_blocksize = NODE_SLAB_BLOCK_SIZE(sizeof(rt_node_leaf_128)), + }, + [RT_NODE_KIND_256] = { + .name = "radix tree node 256", + .fanout = 256, + .inner_size = sizeof(rt_node_inner_256), + 
.leaf_size = sizeof(rt_node_leaf_256), + .inner_blocksize = NODE_SLAB_BLOCK_SIZE(sizeof(rt_node_inner_256)), + .leaf_blocksize = NODE_SLAB_BLOCK_SIZE(sizeof(rt_node_leaf_256)), + }, +}; + +/* + * Iteration support. + * + * Iterating the radix tree returns each pair of key and value in the ascending + * order of the key. To support this, the we iterate nodes of each level. + * + * rt_node_iter struct is used to track the iteration within a node. + * + * rt_iter is the struct for iteration of the radix tree, and uses rt_node_iter + * in order to track the iteration of each level. During the iteration, we also + * construct the key whenever updating the node iteration information, e.g., when + * advancing the current index within the node or when moving to the next node + * at the same level. + */ +typedef struct rt_node_iter +{ + rt_node *node; /* current node being iterated */ + int current_idx; /* current position. -1 for initial value */ +} rt_node_iter; + +struct rt_iter +{ + radix_tree *tree; + + /* Track the iteration on nodes of each level */ + rt_node_iter stack[RT_MAX_LEVEL]; + int stack_len; + + /* The key is being constructed during the iteration */ + uint64 key; +}; + +/* A radix tree with nodes */ +struct radix_tree +{ + MemoryContext context; + + rt_node *root; + uint64 max_val; + uint64 num_keys; + + MemoryContextData *inner_slabs[RT_NODE_KIND_COUNT]; + MemoryContextData *leaf_slabs[RT_NODE_KIND_COUNT]; + + /* statistics */ +#ifdef RT_DEBUG + int32 cnt[RT_NODE_KIND_COUNT]; +#endif +}; + +static void rt_new_root(radix_tree *tree, uint64 key); +static rt_node *rt_alloc_node(radix_tree *tree, int kind, uint8 shift, uint8 chunk, + bool inner); +static void rt_free_node(radix_tree *tree, rt_node *node); +static void rt_extend(radix_tree *tree, uint64 key); +static inline bool rt_node_search_inner(rt_node *node, uint64 key, rt_action action, + rt_node **child_p); +static inline bool rt_node_search_leaf(rt_node *node, uint64 key, rt_action action, + uint64 
*value_p); +static rt_node *rt_node_add_new_child(radix_tree *tree, rt_node *parent, + rt_node *node, uint64 key); +static bool rt_node_insert_inner(radix_tree *tree, rt_node *parent, rt_node *node, + uint64 key, rt_node *child); +static bool rt_node_insert_leaf(radix_tree *tree, rt_node *parent, rt_node *node, + uint64 key, uint64 value); +static inline rt_node *rt_node_inner_iterate_next(rt_iter *iter, rt_node_iter *node_iter); +static inline bool rt_node_leaf_iterate_next(rt_iter *iter, rt_node_iter *node_iter, + uint64 *value_p); +static void rt_update_iter_stack(rt_iter *iter, int from); +static void rt_update_node_iter(rt_iter *iter, rt_node_iter *node_iter, + rt_node *node); +static inline void rt_iter_update_key(rt_iter *iter, uint8 chunk, uint8 shift); + +/* verification (available only with assertion) */ +static void rt_verify_node(rt_node *node); + +/* + * Return index of the first element in 'base' that equals 'key'. Return -1 + * if there is no such element. + */ +static inline int +node_4_search_eq(rt_node_base_4 *node, uint8 chunk) +{ + int idx = -1; + + for (int i = 0; i < NODE_GET_COUNT(node); i++) + { + if (node->chunks[i] == chunk) + { + idx = i; + break; + } + } + + return idx; +} + +/* + * Return index of the first element in chunks in the given node that is greater + * than or equal to 'key'. Return -1 if there is no such element. + */ +static inline int +node_4_search_ge(rt_node_base_4 * node, uint8 chunk) +{ + int idx = -1; + + for (int i = 0; i < NODE_GET_COUNT(node); i++) + { + if (node->chunks[i] >= chunk) + { + idx = i; + break; + } + } + + return idx; +} + +/* + * Return index of the first element in 'base' that equals 'key'. Return -1 + * if there is no such element. 
+ */ +static inline int +node_32_search_eq(rt_node_base_32 *node, uint8 chunk) +{ + int count = NODE_GET_COUNT(node); +#ifndef USE_NO_SIMD + Vector8 spread_chunk; + Vector8 haystack1; + Vector8 haystack2; + Vector8 cmp1; + Vector8 cmp2; + uint32 bitfield; + int index_simd = -1; +#endif + +#if defined(USE_NO_SIMD) || defined(USE_ASSERT_CHECKING) + int index = -1; + for (int i = 0; i < count; i++) + { + if (node->chunks[i] == chunk) + { + index = i; + break; + } + } +#endif + +#ifndef USE_NO_SIMD + spread_chunk = vector8_broadcast(chunk); + vector8_load(&haystack1, &node->chunks[0]); + vector8_load(&haystack2, &node->chunks[sizeof(Vector8)]); + cmp1 = vector8_eq(spread_chunk, haystack1); + cmp2 = vector8_eq(spread_chunk, haystack2); + /* XXX: should not to use vector8_highbit_mask */ + bitfield = vector8_highbit_mask(cmp1) | (vector8_highbit_mask(cmp2) << sizeof(Vector8)); + bitfield &= ((UINT64CONST(1) << count) - 1); + + if (bitfield) + index_simd = pg_rightmost_one_pos32(bitfield); + + Assert(index_simd == index); + return index_simd; +#else + return index; +#endif +} + +/* + * Return index of the first element in chunks in the given node that is greater + * than or equal to 'key'. Return -1 if there is no such element. 
+ */ +static inline int +node_32_search_ge(rt_node_base_32 *node, uint8 chunk) +{ + int count = NODE_GET_COUNT(node); +#ifndef USE_NO_SIMD + Vector8 spread_chunk; + Vector8 haystack1; + Vector8 haystack2; + Vector8 cmp1; + Vector8 cmp2; + Vector8 min1; + Vector8 min2; + uint32 bitfield; + int index_simd = -1; +#endif + +#if defined(USE_NO_SIMD) || defined(USE_ASSERT_CHECKING) + int index = -1; + for (int i = 0; i < count; i++) + { + if (node->chunks[i] >= chunk) + { + index = i; + break; + } + } +#endif + +#ifndef USE_NO_SIMD + spread_chunk = vector8_broadcast(chunk); + vector8_load(&haystack1, &node->chunks[0]); + vector8_load(&haystack2, &node->chunks[sizeof(Vector8)]); + min1 = vector8_min(spread_chunk, haystack1); + min2 = vector8_min(spread_chunk, haystack2); + cmp1 = vector8_eq(spread_chunk, min1); + cmp2 = vector8_eq(spread_chunk, min2); + bitfield = vector8_highbit_mask(cmp1) | (vector8_highbit_mask(cmp2) << sizeof(Vector8)); + bitfield &= ((UINT64CONST(1) << count) - 1); + + if (bitfield) + index_simd = pg_rightmost_one_pos32(bitfield); + + Assert(index_simd == index); + return index_simd; +#else + return index; +#endif +} + +/* + * Functions to manipulate both chunks array and children/values array. + * These are used for node-4 and node-32. 
+ */ + +/* Shift the elements right at 'idx' */ +static inline void +chunk_children_array_shift(uint8 *chunks, rt_node **children, int count, int idx) +{ + memmove(&(chunks[idx + 1]), &(chunks[idx]), sizeof(uint8) * (count - idx)); + memmove(&(children[idx + 1]), &(children[idx]), sizeof(rt_node *) * (count - idx)); +} + +static inline void +chunk_values_array_shift(uint8 *chunks, uint64 *values, int count, int idx) +{ + memmove(&(chunks[idx + 1]), &(chunks[idx]), sizeof(uint8) * (count - idx)); + memmove(&(values[idx + 1]), &(values[idx]), sizeof(uint64 *) * (count - idx)); +} + +/* Delete the element at 'idx' */ +static inline void +chunk_children_array_delete(uint8 *chunks, rt_node **children, int count, int idx) +{ + memmove(&(chunks[idx]), &(chunks[idx + 1]), sizeof(uint8) * (count - idx - 1)); + memmove(&(children[idx]), &(children[idx + 1]), sizeof(rt_node *) * (count - idx - 1)); +} + +static inline void +chunk_values_array_delete(uint8 *chunks, uint64 *values, int count, int idx) +{ + memmove(&(chunks[idx]), &(chunks[idx + 1]), sizeof(uint8) * (count - idx - 1)); + memmove(&(values[idx]), &(values[idx + 1]), sizeof(uint64) * (count - idx - 1)); +} + +/* Copy both chunks and children/values arrays */ +static inline void +chunk_children_array_copy(uint8 *src_chunks, rt_node **src_children, + uint8 *dst_chunks, rt_node **dst_children, int count) +{ + memcpy(dst_chunks, src_chunks, sizeof(uint8) * count); + memcpy(dst_children, src_children, sizeof(rt_node *) * count); +} + +static inline void +chunk_values_array_copy(uint8 *src_chunks, uint64 *src_values, + uint8 *dst_chunks, uint64 *dst_values, int count) +{ + memcpy(dst_chunks, src_chunks, sizeof(uint8) * count); + memcpy(dst_values, src_values, sizeof(uint64) * count); +} + +/* Functions to manipulate inner and leaf node-128 */ + +/* Does the given chunk in the node has the value? 
*/ +static inline bool +node_128_is_chunk_used(rt_node_base_128 *node, uint8 chunk) +{ + return node->slot_idxs[chunk] != RT_NODE_128_INVALID_IDX; +} + +/* Is the slot in the node used? */ +static inline bool +node_inner_128_is_slot_used(rt_node_inner_128 *node, uint8 slot) +{ + Assert(!NODE_IS_LEAF(node)); + return (node->children[slot] != NULL); +} + +static inline bool +node_leaf_128_is_slot_used(rt_node_leaf_128 *node, uint8 slot) +{ + Assert(NODE_IS_LEAF(node)); + return ((node->isset[RT_NODE_BITMAP_BYTE(slot)] & RT_NODE_BITMAP_BIT(slot)) != 0); +} + +static inline rt_node * +node_inner_128_get_child(rt_node_inner_128 *node, uint8 chunk) +{ + Assert(!NODE_IS_LEAF(node)); + return node->children[node->base.slot_idxs[chunk]]; +} + +static inline uint64 +node_leaf_128_get_value(rt_node_leaf_128 *node, uint8 chunk) +{ + Assert(NODE_IS_LEAF(node)); + Assert(((rt_node_base_128 *) node)->slot_idxs[chunk] != RT_NODE_128_INVALID_IDX); + return node->values[node->base.slot_idxs[chunk]]; +} + +/* Delete the chunk in the node */ +static void +node_inner_128_delete(rt_node_inner_128 *node, uint8 chunk) +{ + Assert(!NODE_IS_LEAF(node)); + node->base.slot_idxs[chunk] = RT_NODE_128_INVALID_IDX; +} + +/* Delete the chunk in the node */ +static void +node_leaf_128_delete(rt_node_leaf_128 *node, uint8 chunk) +{ + int slotpos = node->base.slot_idxs[chunk]; + + Assert(NODE_IS_LEAF(node)); + node->isset[RT_NODE_BITMAP_BYTE(slotpos)] &= ~(RT_NODE_BITMAP_BIT(slotpos)); + node->base.slot_idxs[chunk] = RT_NODE_128_INVALID_IDX; +} + +static int +node_inner_128_find_unused_slot(rt_node_inner_128 *node, uint8 chunk) +{ + int slotpos = 0; + + Assert(!NODE_IS_LEAF(node)); + while (node_inner_128_is_slot_used(node, slotpos)) + slotpos++; + + return slotpos; +} + +/* Return an unused slot in node-128 */ +static int +node_leaf_128_find_unused_slot(rt_node_leaf_128 *node, uint8 chunk) +{ + int slotpos; + + Assert(NODE_IS_LEAF(node)); + + /* + * Find an unused slot. 
We iterate over the isset bitmap per byte then + * check each bit. + */ + for (slotpos = 0; slotpos < RT_NODE_NSLOTS_BITS(128); slotpos++) + { + if (node->isset[slotpos] < 0xFF) + break; + } + Assert(slotpos < RT_NODE_NSLOTS_BITS(128)); + + slotpos *= BITS_PER_BYTE; + while (node_leaf_128_is_slot_used(node, slotpos)) + slotpos++; + + return slotpos; +} + +static inline void +node_inner_128_insert(rt_node_inner_128 *node, uint8 chunk, rt_node *child) +{ + int slotpos; + + Assert(!NODE_IS_LEAF(node)); + + /* find unused slot */ + slotpos = node_inner_128_find_unused_slot(node, chunk); + + node->base.slot_idxs[chunk] = slotpos; + node->children[slotpos] = child; +} + +/* Set the slot at the corresponding chunk */ +static inline void +node_leaf_128_insert(rt_node_leaf_128 *node, uint8 chunk, uint64 value) +{ + int slotpos; + + Assert(NODE_IS_LEAF(node)); + + /* find unused slot */ + slotpos = node_leaf_128_find_unused_slot(node, chunk); + + node->base.slot_idxs[chunk] = slotpos; + node->isset[RT_NODE_BITMAP_BYTE(slotpos)] |= RT_NODE_BITMAP_BIT(slotpos); + node->values[slotpos] = value; +} + +/* Update the child corresponding to 'chunk' to 'child' */ +static inline void +ndoe_inner_128_update(rt_node_inner_128 *node, uint8 chunk, rt_node *child) +{ + Assert(!NODE_IS_LEAF(node)); + node->children[node->base.slot_idxs[chunk]] = child; +} + +/* Update the value corresponding to 'chunk' to 'value' */ +static inline void +ndoe_leaf_128_update(rt_node_leaf_128 *node, uint8 chunk, uint64 value) +{ + Assert(NODE_IS_LEAF(node)); + node->values[node->base.slot_idxs[chunk]] = value; +} + +/* Functions to manipulate inner and leaf node-256 */ + +/* Return true if the slot corresponding to the given chunk is in use */ +static inline bool +node_inner_256_is_chunk_used(rt_node_inner_256 *node, uint8 chunk) +{ + Assert(!NODE_IS_LEAF(node)); + return (node->children[chunk] != NULL); +} + +static inline bool +node_leaf_256_is_chunk_used(rt_node_leaf_256 *node, uint8 chunk) +{ + 
Assert(NODE_IS_LEAF(node)); + return (node->isset[RT_NODE_BITMAP_BYTE(chunk)] & RT_NODE_BITMAP_BIT(chunk)) != 0; +} + +static inline rt_node * +node_inner_256_get_child(rt_node_inner_256 *node, uint8 chunk) +{ + Assert(!NODE_IS_LEAF(node)); + Assert(node_inner_256_is_chunk_used(node, chunk)); + return node->children[chunk]; +} + +static inline uint64 +node_leaf_256_get_value(rt_node_leaf_256 *node, uint8 chunk) +{ + Assert(NODE_IS_LEAF(node)); + Assert(node_leaf_256_is_chunk_used(node, chunk)); + return node->values[chunk]; +} + +/* Set the child in the node-256 */ +static inline void +node_inner_256_set(rt_node_inner_256 *node, uint8 chunk, rt_node *child) +{ + Assert(!NODE_IS_LEAF(node)); + node->children[chunk] = child; +} + +/* Set the value in the node-256 */ +static inline void +node_leaf_256_set(rt_node_leaf_256 *node, uint8 chunk, uint64 value) +{ + Assert(NODE_IS_LEAF(node)); + node->isset[RT_NODE_BITMAP_BYTE(chunk)] |= RT_NODE_BITMAP_BIT(chunk); + node->values[chunk] = value; +} + +/* Set the slot at the given chunk position */ +static inline void +node_inner_256_delete(rt_node_inner_256 *node, uint8 chunk) +{ + Assert(!NODE_IS_LEAF(node)); + node->children[chunk] = NULL; +} + +static inline void +node_leaf_256_delete(rt_node_leaf_256 *node, uint8 chunk) +{ + Assert(NODE_IS_LEAF(node)); + node->isset[RT_NODE_BITMAP_BYTE(chunk)] &= ~(RT_NODE_BITMAP_BIT(chunk)); +} + +/* + * Return the shift that is satisfied to store the given key. + */ +static inline int +key_get_shift(uint64 key) +{ + return (key == 0) + ? 0 + : (pg_leftmost_one_pos64(key) / RT_NODE_SPAN) * RT_NODE_SPAN; +} + +/* + * Return the max value stored in a node with the given shift. + */ +static uint64 +shift_get_max_val(int shift) +{ + if (shift == RT_MAX_SHIFT) + return UINT64_MAX; + + return (UINT64CONST(1) << (shift + RT_NODE_SPAN)) - 1; +} + +/* + * Create a new node as the root. Subordinate nodes will be created during + * the insertion. 
+ */ +static void +rt_new_root(radix_tree *tree, uint64 key) +{ + int shift = key_get_shift(key); + rt_node *node; + + node = (rt_node *) rt_alloc_node(tree, RT_NODE_KIND_4, shift, 0, + shift > 0); + tree->max_val = shift_get_max_val(shift); + tree->root = node; +} + +/* + * Allocate a new node with the given node kind. + */ +static rt_node * +rt_alloc_node(radix_tree *tree, int kind, uint8 shift, uint8 chunk, bool inner) +{ + rt_node *newnode; + + if (inner) + newnode = (rt_node *) MemoryContextAllocZero(tree->inner_slabs[kind], + rt_node_kind_info[kind].inner_size); + else + newnode = (rt_node *) MemoryContextAllocZero(tree->leaf_slabs[kind], + rt_node_kind_info[kind].leaf_size); + + NODE_SET_KIND(newnode, kind); + newnode->shift = shift; + newnode->chunk = chunk; + + /* Initialize slot_idxs to invalid values */ + if (kind == RT_NODE_KIND_128) + { + rt_node_base_128 *n128 = (rt_node_base_128 *) newnode; + + memset(n128->slot_idxs, RT_NODE_128_INVALID_IDX, sizeof(n128->slot_idxs)); + } + +#ifdef RT_DEBUG + /* update the statistics */ + tree->cnt[kind]++; +#endif + + return newnode; +} + +static rt_node * +rt_copy_node(radix_tree *tree, rt_node *node, int new_kind) +{ + rt_node *newnode; + + newnode = rt_alloc_node(tree, new_kind, node->shift, node->chunk, + node->shift > 0); + NODE_SET_COUNT(newnode, NODE_GET_COUNT(node)); + + return newnode; +} + +/* Free the given node */ +static void +rt_free_node(radix_tree *tree, rt_node *node) +{ + /* If we're deleting the root node, make the tree empty */ + if (tree->root == node) + tree->root = NULL; + +#ifdef RT_DEBUG + /* update the statistics */ + tree->cnt[NODE_GET_KIND(node)]--; + Assert(tree->cnt[NODE_GET_KIND(node)] >= 0); +#endif + + pfree(node); +} + +/* + * Replace old_child with new_child, and free the old one. 
+ */ +static void +rt_replace_node(radix_tree *tree, rt_node *parent, rt_node *old_child, + rt_node *new_child, uint64 key) +{ + Assert(old_child->chunk == new_child->chunk); + Assert(old_child->shift == new_child->shift); + + if (parent == old_child) + { + /* Replace the root node with the new large node */ + tree->root = new_child; + } + else + { + bool replaced PG_USED_FOR_ASSERTS_ONLY; + + replaced = rt_node_insert_inner(tree, NULL, parent, key, new_child); + Assert(replaced); + } + + rt_free_node(tree, old_child); +} + +/* + * The radix tree doesn't sufficient height. Extend the radix tree so it can + * store the key. + */ +static void +rt_extend(radix_tree *tree, uint64 key) +{ + int target_shift; + int shift = tree->root->shift + RT_NODE_SPAN; + + target_shift = key_get_shift(key); + + /* Grow tree from 'shift' to 'target_shift' */ + while (shift <= target_shift) + { + rt_node_inner_4 *node; + + node = (rt_node_inner_4 *) rt_alloc_node(tree, RT_NODE_KIND_4, + shift, 0, true); + NODE_SET_COUNT(node, 1); + node->base.chunks[0] = 0; + node->children[0] = tree->root; + + tree->root->chunk = 0; + tree->root = (rt_node *) node; + + shift += RT_NODE_SPAN; + } + + tree->max_val = shift_get_max_val(target_shift); +} + +/* + * Search for the child pointer corresponding to 'key' in the given node, and + * do the specified 'action'. + * + * Return true if the key is found, otherwise return false. On success, the child + * pointer is set to child_p. 
+ */ +static inline bool +rt_node_search_inner(rt_node *node, uint64 key, rt_action action, rt_node **child_p) +{ + uint8 chunk = RT_GET_KEY_CHUNK(key, node->shift); + bool found = false; + rt_node *child = NULL; + + switch (NODE_GET_KIND(node)) + { + case RT_NODE_KIND_4: + { + rt_node_inner_4 *n4 = (rt_node_inner_4 *) node; + int idx = node_4_search_eq((rt_node_base_4 *) n4, chunk); + + if (idx < 0) + break; + + found = true; + + if (action == RT_ACTION_FIND) + child = n4->children[idx]; + else /* RT_ACTION_DELETE */ + chunk_children_array_delete(n4->base.chunks, n4->children, + NODE_GET_COUNT(n4), idx); + + break; + } + case RT_NODE_KIND_32: + { + rt_node_inner_32 *n32 = (rt_node_inner_32 *) node; + int idx = node_32_search_eq((rt_node_base_32 *) n32, chunk); + + if (idx < 0) + break; + + found = true; + if (action == RT_ACTION_FIND) + child = n32->children[idx]; + else /* RT_ACTION_DELETE */ + chunk_children_array_delete(n32->base.chunks, n32->children, + NODE_GET_COUNT(n32), idx); + break; + } + case RT_NODE_KIND_128: + { + rt_node_inner_128 *n128 = (rt_node_inner_128 *) node; + + if (!node_128_is_chunk_used((rt_node_base_128 *) n128, chunk)) + break; + + found = true; + + if (action == RT_ACTION_FIND) + child = node_inner_128_get_child(n128, chunk); + else /* RT_ACTION_DELETE */ + node_inner_128_delete(n128, chunk); + + break; + } + case RT_NODE_KIND_256: + { + rt_node_inner_256 *n256 = (rt_node_inner_256 *) node; + + if (!node_inner_256_is_chunk_used(n256, chunk)) + break; + + found = true; + if (action == RT_ACTION_FIND) + child = node_inner_256_get_child(n256, chunk); + else /* RT_ACTION_DELETE */ + node_inner_256_delete(n256, chunk); + + break; + } + } + + /* update statistics */ + if (action == RT_ACTION_DELETE && found) + NODE_DECREMENT_COUNT(node); + + if (found && child_p) + *child_p = child; + + return found; +} + +/* + * Search for the value corresponding to 'key' in the given node, and do the + * specified 'action'. 
+ * + * Return true if the key is found, otherwise return false. On success, the pointer + * to the value is set to value_p. + */ +static inline bool +rt_node_search_leaf(rt_node *node, uint64 key, rt_action action, uint64 *value_p) +{ + uint8 chunk = RT_GET_KEY_CHUNK(key, node->shift); + bool found = false; + uint64 value = 0; + + switch (NODE_GET_KIND(node)) + { + case RT_NODE_KIND_4: + { + rt_node_leaf_4 *n4 = (rt_node_leaf_4 *) node; + int idx = node_4_search_eq((rt_node_base_4 *) n4, chunk); + + if (idx < 0) + break; + + found = true; + + if (action == RT_ACTION_FIND) + value = n4->values[idx]; + else /* RT_ACTION_DELETE */ + chunk_values_array_delete(n4->base.chunks, (uint64 *) n4->values, + NODE_GET_COUNT(n4), idx); + + break; + } + case RT_NODE_KIND_32: + { + rt_node_leaf_32 *n32 = (rt_node_leaf_32 *) node; + int idx = node_32_search_eq((rt_node_base_32 *) n32, chunk); + + if (idx < 0) + break; + + found = true; + if (action == RT_ACTION_FIND) + value = n32->values[idx]; + else /* RT_ACTION_DELETE */ + chunk_values_array_delete(n32->base.chunks, (uint64 *) n32->values, + NODE_GET_COUNT(n32), idx); + break; + } + case RT_NODE_KIND_128: + { + rt_node_leaf_128 *n128 = (rt_node_leaf_128 *) node; + + if (!node_128_is_chunk_used((rt_node_base_128 *) n128, chunk)) + break; + + found = true; + + if (action == RT_ACTION_FIND) + value = node_leaf_128_get_value(n128, chunk); + else /* RT_ACTION_DELETE */ + node_leaf_128_delete(n128, chunk); + + break; + } + case RT_NODE_KIND_256: + { + rt_node_leaf_256 *n256 = (rt_node_leaf_256 *) node; + + if (!node_leaf_256_is_chunk_used(n256, chunk)) + break; + + found = true; + if (action == RT_ACTION_FIND) + value = node_leaf_256_get_value(n256, chunk); + else /* RT_ACTION_DELETE */ + node_leaf_256_delete(n256, chunk); + + break; + } + } + + /* update statistics */ + if (action == RT_ACTION_DELETE && found) + NODE_DECREMENT_COUNT(node); + + if (found && value_p) + *value_p = value; + + return found; +} + +/* Insert a new child to 
'node' */ +static rt_node * +rt_node_add_new_child(radix_tree *tree, rt_node *parent, rt_node *node, uint64 key) +{ + uint8 newshift = node->shift - RT_NODE_SPAN; + rt_node *newchild; + + Assert(!NODE_IS_LEAF(node)); + + newchild = rt_alloc_node(tree, RT_NODE_KIND_4, newshift, + RT_GET_KEY_CHUNK(key, node->shift), + newshift > 0); + + rt_node_insert_inner(tree, parent, node, key, newchild); + + return (rt_node *) newchild; +} + +/* Insert the child to the inner node */ +static bool +rt_node_insert_inner(radix_tree *tree, rt_node *parent, rt_node *node, uint64 key, + rt_node *child) +{ + uint8 chunk = RT_GET_KEY_CHUNK(key, node->shift); + bool chunk_exists = false; + + Assert(!NODE_IS_LEAF(node)); + + switch (NODE_GET_KIND(node)) + { + case RT_NODE_KIND_4: + { + rt_node_inner_4 *n4 = (rt_node_inner_4 *) node; + rt_node_inner_32 *new32; + int idx; + + idx = node_4_search_eq((rt_node_base_4 *) n4, chunk); + if (idx != -1) + { + /* found the existing chunk */ + chunk_exists = true; + n4->children[idx] = child; + break; + } + + if (likely(NODE_HAS_FREE_SLOT(n4))) + { + int insertpos = node_4_search_ge((rt_node_base_4 *) n4, chunk); + uint16 count = NODE_GET_COUNT(n4); + + if (insertpos < 0) + insertpos = count; /* insert to the tail */ + + /* shift chunks and children */ + if (count != 0 && insertpos < count) + chunk_children_array_shift(n4->base.chunks, n4->children, + count, insertpos); + + n4->base.chunks[insertpos] = chunk; + n4->children[insertpos] = child; + break; + } + + /* grow node from 4 to 32 */ + new32 = (rt_node_inner_32 *) rt_copy_node(tree, (rt_node *) n4, + RT_NODE_KIND_32); + chunk_children_array_copy(n4->base.chunks, n4->children, + new32->base.chunks, new32->children, + NODE_GET_COUNT(n4)); + + rt_replace_node(tree, parent, (rt_node *) n4, (rt_node *) new32, + key); + node = (rt_node *) new32; + } + /* FALLTHROUGH */ + case RT_NODE_KIND_32: + { + rt_node_inner_32 *n32 = (rt_node_inner_32 *) node; + rt_node_inner_128 *new128; + + int idx; + + idx = 
node_32_search_eq((rt_node_base_32 *) n32, chunk); + if (idx != -1) + { + /* found the existing chunk */ + chunk_exists = true; + n32->children[idx] = child; + break; + } + + if (likely(NODE_HAS_FREE_SLOT(n32))) + { + int insertpos = node_32_search_ge((rt_node_base_32 *) n32, chunk); + int16 count = NODE_GET_COUNT(n32); + + if (insertpos < 0) + insertpos = count; /* insert to the tail */ + + if (count != 0 && insertpos < count) + chunk_children_array_shift(n32->base.chunks, n32->children, + count, insertpos); + + n32->base.chunks[insertpos] = chunk; + n32->children[insertpos] = child; + break; + } + + /* grow node from 32 to 128 */ + new128 = (rt_node_inner_128 *) rt_copy_node(tree, (rt_node *) n32, + RT_NODE_KIND_128); + for (int i = 0; i < NODE_GET_COUNT(n32); i++) + node_inner_128_insert(new128, n32->base.chunks[i], n32->children[i]); + + rt_replace_node(tree, parent, (rt_node *) n32, (rt_node *) new128, + key); + node = (rt_node *) new128; + } + /* FALLTHROUGH */ + case RT_NODE_KIND_128: + { + rt_node_inner_128 *n128 = (rt_node_inner_128 *) node; + rt_node_inner_256 *new256; + int cnt = 0; + + if (node_128_is_chunk_used((rt_node_base_128 *) n128, chunk)) + { + /* found the existing chunk */ + chunk_exists = true; + ndoe_inner_128_update(n128, chunk, child); + break; + } + + if (likely(NODE_HAS_FREE_SLOT(n128))) + { + node_inner_128_insert(n128, chunk, child); + break; + } + + /* grow node from 128 to 256 */ + new256 = (rt_node_inner_256 *) rt_copy_node(tree, (rt_node *) n128, + RT_NODE_KIND_256); + for (int i = 0; i < RT_NODE_MAX_SLOTS && cnt < NODE_GET_COUNT(n128); i++) + { + if (!node_128_is_chunk_used((rt_node_base_128 *) n128, i)) + continue; + + node_inner_256_set(new256, i, node_inner_128_get_child(n128, i)); + cnt++; + } + + rt_replace_node(tree, parent, (rt_node *) n128, (rt_node *) new256, + key); + node = (rt_node *) new256; + } + /* FALLTHROUGH */ + case RT_NODE_KIND_256: + { + rt_node_inner_256 *n256 = (rt_node_inner_256 *) node; + + chunk_exists = 
node_inner_256_is_chunk_used(n256, chunk); + Assert(chunk_exists || NODE_HAS_FREE_SLOT(n256)); + + node_inner_256_set(n256, chunk, child); + break; + } + } + + /* Update statistics */ + if (!chunk_exists) + NODE_INCREMENT_COUNT(node); + + /* + * Done. Finally, verify the chunk and value is inserted or replaced + * properly in the node. + */ + rt_verify_node(node); + + return chunk_exists; +} + +/* Insert the value to the leaf node */ +static bool +rt_node_insert_leaf(radix_tree *tree, rt_node *parent, rt_node *node, + uint64 key, uint64 value) +{ + uint8 chunk = RT_GET_KEY_CHUNK(key, node->shift); + bool chunk_exists = false; + + Assert(NODE_IS_LEAF(node)); + + switch (NODE_GET_KIND(node)) + { + case RT_NODE_KIND_4: + { + rt_node_leaf_4 *n4 = (rt_node_leaf_4 *) node; + rt_node_leaf_32 *new32; + int idx; + + idx = node_4_search_eq((rt_node_base_4 *) n4, chunk); + if (idx != -1) + { + /* found the existing chunk */ + chunk_exists = true; + n4->values[idx] = value; + break; + } + + if (likely(NODE_HAS_FREE_SLOT(n4))) + { + int insertpos = node_4_search_ge((rt_node_base_4 *) n4, chunk); + int count = NODE_GET_COUNT(n4); + + if (insertpos < 0) + insertpos = count; /* insert to the tail */ + + /* shift chunks and values */ + if (count != 0 && insertpos < count) + chunk_values_array_shift(n4->base.chunks, n4->values, + count, insertpos); + + n4->base.chunks[insertpos] = chunk; + n4->values[insertpos] = value; + break; + } + + /* grow node from 4 to 32 */ + new32 = (rt_node_leaf_32 *) rt_copy_node(tree, (rt_node *) n4, + RT_NODE_KIND_32); + chunk_values_array_copy(n4->base.chunks, n4->values, + new32->base.chunks, new32->values, + NODE_GET_COUNT(n4)); + + rt_replace_node(tree, parent, (rt_node *) n4, (rt_node *) new32, + key); + node = (rt_node *) new32; + } + /* FALLTHROUGH */ + case RT_NODE_KIND_32: + { + rt_node_leaf_32 *n32 = (rt_node_leaf_32 *) node; + rt_node_leaf_128 *new128; + int idx; + + idx = node_32_search_eq((rt_node_base_32 *) n32, chunk); + if (idx != -1) + 
{ + /* found the existing chunk */ + chunk_exists = true; + n32->values[idx] = value; + break; + } + + if (likely(NODE_HAS_FREE_SLOT(n32))) + { + int insertpos = node_32_search_ge((rt_node_base_32 *) n32, chunk); + int count = NODE_GET_COUNT(n32); + + if (insertpos < 0) + insertpos = count; /* insert to the tail */ + + if (count != 0 && insertpos < count) + chunk_values_array_shift(n32->base.chunks, n32->values, + count, insertpos); + + n32->base.chunks[insertpos] = chunk; + n32->values[insertpos] = value; + break; + } + + /* grow node from 32 to 128 */ + new128 = (rt_node_leaf_128 *) rt_copy_node(tree, (rt_node *) n32, + RT_NODE_KIND_128); + for (int i = 0; i < NODE_GET_COUNT(n32); i++) + node_leaf_128_insert(new128, n32->base.chunks[i], n32->values[i]); + + rt_replace_node(tree, parent, (rt_node *) n32, (rt_node *) new128, + key); + node = (rt_node *) new128; + } + /* FALLTHROUGH */ + case RT_NODE_KIND_128: + { + rt_node_leaf_128 *n128 = (rt_node_leaf_128 *) node; + rt_node_leaf_256 *new256; + int cnt = 0; + + if (node_128_is_chunk_used((rt_node_base_128 *) n128, chunk)) + { + /* found the existing chunk */ + chunk_exists = true; + ndoe_leaf_128_update(n128, chunk, value); + break; + } + + if (likely(NODE_HAS_FREE_SLOT(n128))) + { + node_leaf_128_insert(n128, chunk, value); + break; + } + + /* grow node from 128 to 256 */ + new256 = (rt_node_leaf_256 *) rt_copy_node(tree, (rt_node *) n128, + RT_NODE_KIND_256); + for (int i = 0; i < RT_NODE_MAX_SLOTS && cnt < NODE_GET_COUNT(n128); i++) + { + if (!node_128_is_chunk_used((rt_node_base_128 *) n128, i)) + continue; + + node_leaf_256_set(new256, i, node_leaf_128_get_value(n128, i)); + cnt++; + } + + rt_replace_node(tree, parent, (rt_node *) n128, (rt_node *) new256, + key); + node = (rt_node *) new256; + } + /* FALLTHROUGH */ + case RT_NODE_KIND_256: + { + rt_node_leaf_256 *n256 = (rt_node_leaf_256 *) node; + + chunk_exists = node_leaf_256_is_chunk_used(n256, chunk); + Assert(chunk_exists || NODE_HAS_FREE_SLOT(n256)); 
+ + node_leaf_256_set(n256, chunk, value); + break; + } + } + + /* Update statistics */ + if (!chunk_exists) + NODE_INCREMENT_COUNT(node); + + /* + * Done. Finally, verify the chunk and value is inserted or replaced + * properly in the node. + */ + rt_verify_node(node); + + return chunk_exists; +} + +/* + * Create the radix tree in the given memory context and return it. + */ +radix_tree * +rt_create(MemoryContext ctx) +{ + radix_tree *tree; + MemoryContext old_ctx; + + old_ctx = MemoryContextSwitchTo(ctx); + + tree = palloc(sizeof(radix_tree)); + tree->context = ctx; + tree->root = NULL; + tree->max_val = 0; + tree->num_keys = 0; + + /* Create the slab allocator for each size class */ + for (int i = 0; i < RT_NODE_KIND_COUNT; i++) + { + tree->inner_slabs[i] = SlabContextCreate(ctx, + rt_node_kind_info[i].name, + rt_node_kind_info[i].inner_blocksize, + rt_node_kind_info[i].inner_size); + tree->leaf_slabs[i] = SlabContextCreate(ctx, + rt_node_kind_info[i].name, + rt_node_kind_info[i].leaf_blocksize, + rt_node_kind_info[i].leaf_size); +#ifdef RT_DEBUG + tree->cnt[i] = 0; +#endif + } + + MemoryContextSwitchTo(old_ctx); + + return tree; +} + +/* + * Free the given radix tree. + */ +void +rt_free(radix_tree *tree) +{ + for (int i = 0; i < RT_NODE_KIND_COUNT; i++) + { + MemoryContextDelete(tree->inner_slabs[i]); + MemoryContextDelete(tree->leaf_slabs[i]); + } + + pfree(tree); +} + +/* + * Set key to value. If the entry already exists, we update its value to 'value' + * and return true. Returns false if entry doesn't yet exist. 
+ */ +bool +rt_set(radix_tree *tree, uint64 key, uint64 value) +{ + int shift; + bool updated; + rt_node *node; + rt_node *parent = tree->root; + + /* Empty tree, create the root */ + if (!tree->root) + rt_new_root(tree, key); + + /* Extend the tree if necessary */ + if (key > tree->max_val) + rt_extend(tree, key); + + Assert(tree->root); + + shift = tree->root->shift; + node = tree->root; + + /* Descend the tree until a leaf node */ + while (shift >= 0) + { + rt_node *child; + + if (NODE_IS_LEAF(node)) + break; + + if (!rt_node_search_inner(node, key, RT_ACTION_FIND, &child)) + child = rt_node_add_new_child(tree, parent, node, key); + + Assert(child); + + parent = node; + node = child; + shift -= RT_NODE_SPAN; + } + + /* arrived at a leaf */ + Assert(NODE_IS_LEAF(node)); + + updated = rt_node_insert_leaf(tree, parent, node, key, value); + + /* Update the statistics */ + if (!updated) + tree->num_keys++; + + return updated; +} + +/* + * Search the given key in the radix tree. Return true if there is the key, + * otherwise return false. On success, we set the value to *val_p so it must + * not be NULL. + */ +bool +rt_search(radix_tree *tree, uint64 key, uint64 *value_p) +{ + rt_node *node; + int shift; + + Assert(value_p != NULL); + + if (!tree->root || key > tree->max_val) + return false; + + node = tree->root; + shift = tree->root->shift; + + /* Descend the tree until a leaf node */ + while (shift >= 0) + { + rt_node *child; + + if (NODE_IS_LEAF(node)) + break; + + if (!rt_node_search_inner(node, key, RT_ACTION_FIND, &child)) + return false; + + node = child; + shift -= RT_NODE_SPAN; + } + + /* We reached at a leaf node, so search the corresponding slot */ + Assert(NODE_IS_LEAF(node)); + if (!rt_node_search_leaf(node, key, RT_ACTION_FIND, value_p)) + return false; + + return true; +} + +/* + * Delete the given key from the radix tree. Return true if the key is found (and + * deleted), otherwise do nothing and return false. 
+ */ +bool +rt_delete(radix_tree *tree, uint64 key) +{ + rt_node *node; + int shift; + rt_node *stack[RT_MAX_LEVEL] = {0}; + int level; + + if (!tree->root || key > tree->max_val) + return false; + + /* + * Descend the tree to search the key while building a stack of nodes + * we visited. + */ + node = tree->root; + shift = tree->root->shift; + level = 0; + while (shift >= 0) + { + rt_node *child; + + /* Push the current node to the stack */ + stack[level] = node; + + if (NODE_IS_LEAF(node)) + break; + + if (!rt_node_search_inner(node, key, RT_ACTION_FIND, &child)) + return false; + + node = child; + shift -= RT_NODE_SPAN; + level++; + } + + Assert(NODE_IS_LEAF(node)); + + /* there is no key to delete */ + if (!rt_node_search_leaf(node, key, RT_ACTION_FIND, NULL)) + return false; + + /* Update the statistics */ + tree->num_keys--; + + /* + * Delete the key from the leaf node and recursively delete the key in + * inner nodes if necessary. + */ + Assert(NODE_IS_LEAF(stack[level])); + while (level >= 0) + { + rt_node *node = stack[level--]; + + if (NODE_IS_LEAF(node)) + rt_node_search_leaf(node, key, RT_ACTION_DELETE, NULL); + else + rt_node_search_inner(node, key, RT_ACTION_DELETE, NULL); + + /* If the node didn't become empty, we stop deleting the key */ + if (!NODE_IS_EMPTY(node)) + break; + + /* The node became empty */ + rt_free_node(tree, node); + } + + /* + * If we eventually deleted the root node while recursively deleting empty + * nodes, we make the tree empty. 
+ */ + if (level == 0) + { + tree->root = NULL; + tree->max_val = 0; + } + + return true;; +} + +/* Create and return the iterator for the given radix tree */ +rt_iter * +rt_begin_iterate(radix_tree *tree) +{ + MemoryContext old_ctx; + rt_iter *iter; + int top_level; + + old_ctx = MemoryContextSwitchTo(tree->context); + + iter = (rt_iter *) palloc0(sizeof(rt_iter)); + iter->tree = tree; + + /* empty tree */ + if (!iter->tree) + return iter; + + top_level = iter->tree->root->shift / RT_NODE_SPAN; + + iter->stack_len = top_level; + iter->stack[top_level].node = iter->tree->root; + iter->stack[top_level].current_idx = -1; + + /* + * Descend to the left most leaf node from the root. The key is being + * constructed while descending to the leaf. + */ + rt_update_iter_stack(iter, top_level); + + MemoryContextSwitchTo(old_ctx); + + return iter; +} + +/* + * Update the stack of the radix tree node while descending to the leaf from + * the 'from' level. + */ +static void +rt_update_iter_stack(rt_iter *iter, int from) +{ + rt_node *node = iter->stack[from].node; + int level = from; + + for (;;) + { + rt_node_iter *node_iter = &(iter->stack[level--]); + + /* Set the node to this level */ + rt_update_node_iter(iter, node_iter, node); + + /* Finish if we reached to the leaf node */ + if (NODE_IS_LEAF(node)) + break; + + /* Advance to the next slot in the node */ + node = rt_node_inner_iterate_next(iter, node_iter); + + /* + * Since we always get the first slot in the node, we have to found + * the slot. + */ + Assert(node); + } +} + +/* + * Return true with setting key_p and value_p if there is next key. Otherwise, + * return false. 
+ */ +bool +rt_iterate_next(rt_iter *iter, uint64 *key_p, uint64 *value_p) +{ + /* Empty tree */ + if (!iter->tree) + return false; + + for (;;) + { + rt_node_iter *node_iter; + rt_node *child = NULL; + uint64 value; + int level; + bool found; + + found = rt_node_leaf_iterate_next(iter, &(iter->stack[0]), &value); + + if (found) + { + *key_p = iter->key; + *value_p = value; + return true; + } + + /* + * Iterate node at each level from the level=1 inner node until + * we find the next value to return. + */ + for (level = 1; level <= iter->stack_len; level++) + { + child = rt_node_inner_iterate_next(iter, &(iter->stack[level])); + + if (child) + break; + } + + /* We could not find any new key-value pair, the iteration finished */ + if (!child) + return false; + + /* + * We have advanced slots more than one nodes including both the lead + * node and inner nodes. So we update the stack by descending to + * the left most leaf node from this level. + */ + node_iter = &(iter->stack[level - 1]); + rt_update_node_iter(iter, node_iter, child); + rt_update_iter_stack(iter, level - 1); + } + + return false; +} + +void +rt_end_iterate(rt_iter *iter) +{ + pfree(iter); +} + +static inline void +rt_iter_update_key(rt_iter *iter, uint8 chunk, uint8 shift) +{ + iter->key &= ~(((uint64) RT_CHUNK_MASK) << shift); + iter->key |= (((uint64) chunk) << shift); +} + +/* + * Advance the slot in the inner node. Return the child if exists, otherwise + * null. 
+ */ +static inline rt_node * +rt_node_inner_iterate_next(rt_iter *iter, rt_node_iter *node_iter) +{ + rt_node *child = NULL; + bool found = false; + uint8 key_chunk; + + switch (NODE_GET_KIND(node_iter->node)) + { + case RT_NODE_KIND_4: + { + rt_node_inner_4 *n4 = (rt_node_inner_4 *) node_iter->node; + + node_iter->current_idx++; + if (node_iter->current_idx >= NODE_GET_COUNT(n4)) + break; + + child = n4->children[node_iter->current_idx]; + key_chunk = n4->base.chunks[node_iter->current_idx]; + found = true; + break; + } + case RT_NODE_KIND_32: + { + rt_node_inner_32 *n32 = (rt_node_inner_32 *) node_iter->node; + + node_iter->current_idx++; + if (node_iter->current_idx >= NODE_GET_COUNT(n32)) + break; + + child = n32->children[node_iter->current_idx]; + key_chunk = n32->base.chunks[node_iter->current_idx]; + found = true; + break; + } + case RT_NODE_KIND_128: + { + rt_node_inner_128 *n128 = (rt_node_inner_128 *) node_iter->node; + int i; + + for (i = node_iter->current_idx + 1; i < 256; i++) + { + if (node_128_is_chunk_used((rt_node_base_128 *) n128, i)) + break; + } + + if (i >= 256) + break; + + node_iter->current_idx = i; + child = node_inner_128_get_child(n128, i); + key_chunk = i; + found = true; + break; + } + case RT_NODE_KIND_256: + { + rt_node_inner_256 *n256 = (rt_node_inner_256 *) node_iter->node; + int i; + for (i = node_iter->current_idx + 1; i < 256; i++) + { + if (node_inner_256_is_chunk_used(n256, i)) + break; + } + + if (i >= 256) + break; + + node_iter->current_idx = i; + child = node_inner_256_get_child(n256, i); + key_chunk = i; + found = true; + break; + } + } + + if (found) + rt_iter_update_key(iter, key_chunk, node_iter->node->shift); + + return child; +} + +/* + * Advance the slot in the leaf node. On success, return true and the value + * is set to value_p, otherwise return false. 
+ */ +static inline bool +rt_node_leaf_iterate_next(rt_iter *iter, rt_node_iter *node_iter, + uint64 *value_p) +{ + rt_node *node = node_iter->node; + bool found = false; + uint64 value; + uint8 key_chunk; + + switch (NODE_GET_KIND(node)) + { + case RT_NODE_KIND_4: + { + rt_node_leaf_4 *n4 = (rt_node_leaf_4 *) node_iter->node; + + node_iter->current_idx++; + if (node_iter->current_idx >= NODE_GET_COUNT(n4)) + break; + + value = n4->values[node_iter->current_idx]; + key_chunk = n4->base.chunks[node_iter->current_idx]; + found = true; + break; + } + case RT_NODE_KIND_32: + { + rt_node_leaf_32 *n32 = (rt_node_leaf_32 *) node_iter->node; + + node_iter->current_idx++; + if (node_iter->current_idx >= NODE_GET_COUNT(n32)) + break; + + value = n32->values[node_iter->current_idx]; + key_chunk = n32->base.chunks[node_iter->current_idx]; + found = true; + break; + } + case RT_NODE_KIND_128: + { + rt_node_leaf_128 *n128 = (rt_node_leaf_128 *) node_iter->node; + int i; + + for (i = node_iter->current_idx + 1; i < 256; i++) + { + if (node_128_is_chunk_used((rt_node_base_128 *) n128, i)) + break; + } + + if (i >= 256) + break; + + node_iter->current_idx = i; + value = node_leaf_128_get_value(n128, i); + key_chunk = i; + found = true; + break; + } + case RT_NODE_KIND_256: + { + rt_node_leaf_256 *n256 = (rt_node_leaf_256 *) node_iter->node; + int i; + for (i = node_iter->current_idx + 1; i < 256; i++) + { + if (node_leaf_256_is_chunk_used(n256, i)) + break; + } + + if (i >= 256) + break; + + node_iter->current_idx = i; + value = node_leaf_256_get_value(n256, i); + key_chunk = i; + found = true; + break; + } + } + + if (found) + { + rt_iter_update_key(iter, key_chunk, node_iter->node->shift); + *value_p = value; + } + + return found; +} + +/* + * Set the node to the node_iter so we can begin the iteration of the node. + * Also, we update the part of the key by the chunk of the given node. 
+ */ +static void +rt_update_node_iter(rt_iter *iter, rt_node_iter *node_iter, + rt_node *node) +{ + node_iter->node = node; + node_iter->current_idx = -1; + + rt_iter_update_key(iter, node->chunk, node->shift + RT_NODE_SPAN); +} + +/* + * Return the number of keys in the radix tree. + */ +uint64 +rt_num_entries(radix_tree *tree) +{ + return tree->num_keys; +} + +/* + * Return the statistics of the amount of memory used by the radix tree. + */ +uint64 +rt_memory_usage(radix_tree *tree) +{ + Size total = 0; + + for (int i = 0; i < RT_NODE_KIND_COUNT; i++) + { + total += MemoryContextMemAllocated(tree->inner_slabs[i], true); + total += MemoryContextMemAllocated(tree->leaf_slabs[i], true); + } + + return total; +} + +/* + * Verify the radix tree node. + */ +static void +rt_verify_node(rt_node *node) +{ +#ifdef USE_ASSERT_CHECKING + Assert(NODE_GET_COUNT(node) >= 0); + + switch (NODE_GET_KIND(node)) + { + case RT_NODE_KIND_4: + { + rt_node_base_4 *n4 = (rt_node_base_4 *) node; + + for (int i = 1; i < NODE_GET_COUNT(n4); i++) + Assert(n4->chunks[i - 1] < n4->chunks[i]); + + break; + } + case RT_NODE_KIND_32: + { + rt_node_base_32 *n32 = (rt_node_base_32 *) node; + + for (int i = 1; i < NODE_GET_COUNT(n32); i++) + Assert(n32->chunks[i - 1] < n32->chunks[i]); + + break; + } + case RT_NODE_KIND_128: + { + rt_node_base_128 *n128 = (rt_node_base_128 *) node; + int cnt = 0; + + for (int i = 0; i < RT_NODE_MAX_SLOTS; i++) + { + if (!node_128_is_chunk_used(n128, i)) + continue; + + /* Check if the corresponding slot is used */ + if (NODE_IS_LEAF(node)) + Assert(node_leaf_128_is_slot_used((rt_node_leaf_128 *) node, + n128->slot_idxs[i])); + else + Assert(node_inner_128_is_slot_used((rt_node_inner_128 *) node, + n128->slot_idxs[i])); + + cnt++; + } + + Assert(NODE_GET_COUNT(n128) == cnt); + break; + } + case RT_NODE_KIND_256: + { + if (NODE_IS_LEAF(node)) + { + rt_node_leaf_256 *n256 = (rt_node_leaf_256 *) node; + int cnt = 0; + + for (int i = 0; i < 
RT_NODE_NSLOTS_BITS(RT_NODE_MAX_SLOTS); i++) + cnt += pg_popcount32(n256->isset[i]); + + /* Check if the number of used chunk matches */ + Assert(NODE_GET_COUNT(n256) == cnt); + + break; + } + } + } +#endif +} + +/***************** DEBUG FUNCTIONS *****************/ +#ifdef RT_DEBUG +void +rt_stats(radix_tree *tree) +{ + ereport(LOG, (errmsg("num_keys = %lu, height = %u, n4 = %u, n32 = %u, n128 = %u, n256 = %u", + tree->num_keys, + tree->root->shift / RT_NODE_SPAN, + tree->cnt[0], + tree->cnt[1], + tree->cnt[2], + tree->cnt[3]))); +} + +static void +rt_dump_node(rt_node *node, int level, bool recurse) +{ + char space[128] = {0}; + + fprintf(stderr, "[%s] kind %d, count %u, shift %u, chunk 0x%X:\n", + NODE_IS_LEAF(node) ? "LEAF" : "INNR", + (NODE_GET_KIND(node) == RT_NODE_KIND_4) ? 4 : + (NODE_GET_KIND(node) == RT_NODE_KIND_32) ? 32 : + (NODE_GET_KIND(node) == RT_NODE_KIND_128) ? 128 : 256, + NODE_GET_COUNT(node), node->shift, node->chunk); + + if (level > 0) + sprintf(space, "%*c", level * 4, ' '); + + switch (NODE_GET_KIND(node)) + { + case RT_NODE_KIND_4: + { + for (int i = 0; i < NODE_GET_COUNT(node); i++) + { + if (NODE_IS_LEAF(node)) + { + rt_node_leaf_4 *n4 = (rt_node_leaf_4 *) node; + + fprintf(stderr, "%schunk 0x%X value 0x%lX\n", + space, n4->base.chunks[i], n4->values[i]); + } + else + { + rt_node_inner_4 *n4 = (rt_node_inner_4 *) node; + + fprintf(stderr, "%schunk 0x%X ->", + space, n4->base.chunks[i]); + + if (recurse) + rt_dump_node(n4->children[i], level + 1, recurse); + else + fprintf(stderr, "\n"); + } + } + break; + } + case RT_NODE_KIND_32: + { + for (int i = 0; i < NODE_GET_COUNT(node); i++) + { + if (NODE_IS_LEAF(node)) + { + rt_node_leaf_32 *n32 = (rt_node_leaf_32 *) node; + + fprintf(stderr, "%schunk 0x%X value 0x%lX\n", + space, n32->base.chunks[i], n32->values[i]); + } + else + { + rt_node_inner_32 *n32 = (rt_node_inner_32 *) node; + + fprintf(stderr, "%schunk 0x%X ->", + space, n32->base.chunks[i]); + + if (recurse) + { + 
rt_dump_node(n32->children[i], level + 1, recurse); + } + else + fprintf(stderr, "\n"); + } + } + break; + } + case RT_NODE_KIND_128: + { + rt_node_base_128 *b128 = (rt_node_base_128 *) node; + + fprintf(stderr, "slot_idxs "); + for (int i = 0; i < 256; i++) + { + if (!node_128_is_chunk_used(b128, i)) + continue; + + fprintf(stderr, " [%d]=%d, ", i, b128->slot_idxs[i]); + } + if (NODE_IS_LEAF(node)) + { + rt_node_leaf_128 *n = (rt_node_leaf_128 *) node; + + fprintf(stderr, ", isset-bitmap:"); + for (int i = 0; i < 16; i++) + { + fprintf(stderr, "%X ", (uint8) n->isset[i]); + } + fprintf(stderr, "\n"); + } + + for (int i = 0; i < 256; i++) + { + if (!node_128_is_chunk_used(b128, i)) + continue; + + if (NODE_IS_LEAF(node)) + { + rt_node_leaf_128 *n128 = (rt_node_leaf_128 *) b128; + + fprintf(stderr, "%schunk 0x%X value 0x%lX\n", + space, i, node_leaf_128_get_value(n128, i)); + } + else + { + rt_node_inner_128 *n128 = (rt_node_inner_128 *) b128; + + fprintf(stderr, "%schunk 0x%X ->", + space, i); + + if (recurse) + rt_dump_node(node_inner_128_get_child(n128, i), + level + 1, recurse); + else + fprintf(stderr, "\n"); + } + } + break; + } + case RT_NODE_KIND_256: + { + for (int i = 0; i < 256; i++) + { + if (NODE_IS_LEAF(node)) + { + rt_node_leaf_256 *n256 = (rt_node_leaf_256 *) node; + + if (!node_leaf_256_is_chunk_used(n256, i)) + continue; + + fprintf(stderr, "%schunk 0x%X value 0x%lX\n", + space, i, node_leaf_256_get_value(n256, i)); + } + else + { + rt_node_inner_256 *n256 = (rt_node_inner_256 *) node; + + if (!node_inner_256_is_chunk_used(n256, i)) + continue; + + fprintf(stderr, "%schunk 0x%X ->", + space, i); + + if (recurse) + rt_dump_node(node_inner_256_get_child(n256, i), level + 1, + recurse); + else + fprintf(stderr, "\n"); + } + } + break; + } + } +} + +void +rt_dump_search(radix_tree *tree, uint64 key) +{ + rt_node *node; + int shift; + int level = 0; + + elog(NOTICE, "-----------------------------------------------------------"); + elog(NOTICE, "max_val 
= %lu (0x%lX)", tree->max_val, tree->max_val); + + if (!tree->root) + { + elog(NOTICE, "tree is empty"); + return; + } + + if (key > tree->max_val) + { + elog(NOTICE, "key %lu (0x%lX) is larger than max val", + key, key); + return; + } + + node = tree->root; + shift = tree->root->shift; + while (shift >= 0) + { + rt_node *child; + + rt_dump_node(node, level, false); + + if (NODE_IS_LEAF(node)) + { + uint64 dummy; + + /* We reached at a leaf node, find the corresponding slot */ + rt_node_search_leaf(node, key, RT_ACTION_FIND, &dummy); + + break; + } + + if (!rt_node_search_inner(node, key, RT_ACTION_FIND, &child)) + break; + + node = child; + shift -= RT_NODE_SPAN; + level++; + } +} + +void +rt_dump(radix_tree *tree) +{ + for (int i = 0; i < RT_NODE_KIND_COUNT; i++) + fprintf(stderr, "%s\tinner_size%lu\tinner_blocksize %lu\tleaf_size %lu\tleaf_blocksize %lu\n", + rt_node_kind_info[i].name, + rt_node_kind_info[i].inner_size, + rt_node_kind_info[i].inner_blocksize, + rt_node_kind_info[i].leaf_size, + rt_node_kind_info[i].leaf_blocksize); + fprintf(stderr, "max_val = %lu\n", tree->max_val); + + if (!tree->root) + { + fprintf(stderr, "empty tree\n"); + return; + } + + rt_dump_node(tree->root, 0, true); +} +#endif diff --git a/src/include/lib/radixtree.h b/src/include/lib/radixtree.h new file mode 100644 index 0000000000..d5d7668617 --- /dev/null +++ b/src/include/lib/radixtree.h @@ -0,0 +1,42 @@ +/*------------------------------------------------------------------------- + * + * radixtree.h + * Interface for radix tree. 
+ * + * Copyright (c) 2022, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/include/lib/radixtree.h + * + *------------------------------------------------------------------------- + */ +#ifndef RADIXTREE_H +#define RADIXTREE_H + +#include "postgres.h" + +#define RT_DEBUG 1 + +typedef struct radix_tree radix_tree; +typedef struct rt_iter rt_iter; + +extern radix_tree *rt_create(MemoryContext ctx); +extern void rt_free(radix_tree *tree); +extern bool rt_search(radix_tree *tree, uint64 key, uint64 *val_p); +extern bool rt_set(radix_tree *tree, uint64 key, uint64 val); +extern rt_iter *rt_begin_iterate(radix_tree *tree); + +extern bool rt_iterate_next(rt_iter *iter, uint64 *key_p, uint64 *value_p); +extern void rt_end_iterate(rt_iter *iter); +extern bool rt_delete(radix_tree *tree, uint64 key); + +extern uint64 rt_memory_usage(radix_tree *tree); +extern uint64 rt_num_entries(radix_tree *tree); + +#ifdef RT_DEBUG +extern void rt_dump(radix_tree *tree); +extern void rt_dump_search(radix_tree *tree, uint64 key); +extern void rt_stats(radix_tree *tree); +#endif + +#endif /* RADIXTREE_H */ diff --git a/src/test/modules/Makefile b/src/test/modules/Makefile index 7b3f292965..e587cabe13 100644 --- a/src/test/modules/Makefile +++ b/src/test/modules/Makefile @@ -26,6 +26,7 @@ SUBDIRS = \ test_parser \ test_pg_dump \ test_predtest \ + test_radixtree \ test_rbtree \ test_regex \ test_rls_hooks \ diff --git a/src/test/modules/test_radixtree/.gitignore b/src/test/modules/test_radixtree/.gitignore new file mode 100644 index 0000000000..5dcb3ff972 --- /dev/null +++ b/src/test/modules/test_radixtree/.gitignore @@ -0,0 +1,4 @@ +# Generated subdirectories +/log/ +/results/ +/tmp_check/ diff --git a/src/test/modules/test_radixtree/Makefile b/src/test/modules/test_radixtree/Makefile new file mode 100644 index 0000000000..da06b93da3 --- /dev/null +++ b/src/test/modules/test_radixtree/Makefile @@ -0,0 +1,23 @@ +# src/test/modules/test_radixtree/Makefile + +MODULE_big = 
test_radixtree
+OBJS = \
+	$(WIN32RES) \
+	test_radixtree.o
+PGFILEDESC = "test_radixtree - test code for src/backend/lib/radixtree.c"
+
+EXTENSION = test_radixtree
+DATA = test_radixtree--1.0.sql
+
+REGRESS = test_radixtree
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = src/test/modules/test_radixtree
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
diff --git a/src/test/modules/test_radixtree/README b/src/test/modules/test_radixtree/README
new file mode 100644
index 0000000000..a8b271869a
--- /dev/null
+++ b/src/test/modules/test_radixtree/README
@@ -0,0 +1,7 @@
+test_radixtree contains unit tests for testing the radix tree implementation
+in src/backend/lib/radixtree.c.
+
+The tests verify the correctness of the implementation, but they can also be
+used as a micro-benchmark. If you set the 'rt_test_stats' flag in
+test_radixtree.c, the tests will print extra information about execution time
+and memory usage.
diff --git a/src/test/modules/test_radixtree/expected/test_radixtree.out b/src/test/modules/test_radixtree/expected/test_radixtree.out
new file mode 100644
index 0000000000..cc6970c87c
--- /dev/null
+++ b/src/test/modules/test_radixtree/expected/test_radixtree.out
@@ -0,0 +1,28 @@
+CREATE EXTENSION test_radixtree;
+--
+-- All the logic is in the test_radixtree() function. It will throw
+-- an error if something fails.
+-- +SELECT test_radixtree(); +NOTICE: testing radix tree node types with shift "0" +NOTICE: testing radix tree node types with shift "8" +NOTICE: testing radix tree node types with shift "16" +NOTICE: testing radix tree node types with shift "24" +NOTICE: testing radix tree node types with shift "32" +NOTICE: testing radix tree node types with shift "40" +NOTICE: testing radix tree node types with shift "48" +NOTICE: testing radix tree node types with shift "56" +NOTICE: testing radix tree with pattern "all ones" +NOTICE: testing radix tree with pattern "alternating bits" +NOTICE: testing radix tree with pattern "clusters of ten" +NOTICE: testing radix tree with pattern "clusters of hundred" +NOTICE: testing radix tree with pattern "one-every-64k" +NOTICE: testing radix tree with pattern "sparse" +NOTICE: testing radix tree with pattern "single values, distance > 2^32" +NOTICE: testing radix tree with pattern "clusters, distance > 2^32" +NOTICE: testing radix tree with pattern "clusters, distance > 2^60" + test_radixtree +---------------- + +(1 row) + diff --git a/src/test/modules/test_radixtree/sql/test_radixtree.sql b/src/test/modules/test_radixtree/sql/test_radixtree.sql new file mode 100644 index 0000000000..41ece5e9f5 --- /dev/null +++ b/src/test/modules/test_radixtree/sql/test_radixtree.sql @@ -0,0 +1,7 @@ +CREATE EXTENSION test_radixtree; + +-- +-- All the logic is in the test_radixtree() function. It will throw +-- an error if something fails. +-- +SELECT test_radixtree(); diff --git a/src/test/modules/test_radixtree/test_radixtree--1.0.sql b/src/test/modules/test_radixtree/test_radixtree--1.0.sql new file mode 100644 index 0000000000..074a5a7ea7 --- /dev/null +++ b/src/test/modules/test_radixtree/test_radixtree--1.0.sql @@ -0,0 +1,8 @@ +/* src/test/modules/test_radixtree/test_radixtree--1.0.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION test_radixtree" to load this file. 
\quit + +CREATE FUNCTION test_radixtree() +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; diff --git a/src/test/modules/test_radixtree/test_radixtree.c b/src/test/modules/test_radixtree/test_radixtree.c new file mode 100644 index 0000000000..a4aa80a99c --- /dev/null +++ b/src/test/modules/test_radixtree/test_radixtree.c @@ -0,0 +1,504 @@ +/*-------------------------------------------------------------------------- + * + * test_radixtree.c + * Test radixtree set data structure. + * + * Copyright (c) 2022, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/test/modules/test_radixtree/test_radixtree.c + * + * ------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "common/pg_prng.h" +#include "fmgr.h" +#include "lib/radixtree.h" +#include "miscadmin.h" +#include "nodes/bitmapset.h" +#include "storage/block.h" +#include "storage/itemptr.h" +#include "utils/memutils.h" +#include "utils/timestamp.h" + +#define UINT64_HEX_FORMAT "%" INT64_MODIFIER "X" + +/* + * If you enable this, the "pattern" tests will print information about + * how long populating, probing, and iterating the test set takes, and + * how much memory the test set consumed. That can be used as + * micro-benchmark of various operations and input patterns (you might + * want to increase the number of values used in each of the test, if + * you do that, to reduce noise). + * + * The information is printed to the server's stderr, mostly because + * that's where MemoryContextStats() output goes. + */ +static const bool rt_test_stats = false; + +/* The maximum number of entries each node type can have */ +static int rt_node_max_entries[] = { + 4, /* RT_NODE_KIND_4 */ + 16, /* RT_NODE_KIND_16 */ + 32, /* RT_NODE_KIND_32 */ + 128, /* RT_NODE_KIND_128 */ + 256 /* RT_NODE_KIND_256 */ +}; + +/* + * A struct to define a pattern of integers, for use with the test_pattern() + * function. 
+ */ +typedef struct +{ + char *test_name; /* short name of the test, for humans */ + char *pattern_str; /* a bit pattern */ + uint64 spacing; /* pattern repeats at this interval */ + uint64 num_values; /* number of integers to set in total */ +} test_spec; + +/* Test patterns borrowed from test_integerset.c */ +static const test_spec test_specs[] = { + { + "all ones", "1111111111", + 10, 1000000 + }, + { + "alternating bits", "0101010101", + 10, 1000000 + }, + { + "clusters of ten", "1111111111", + 10000, 1000000 + }, + { + "clusters of hundred", + "1111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111", + 10000, 10000000 + }, + { + "one-every-64k", "1", + 65536, 1000000 + }, + { + "sparse", "100000000000000000000000000000001", + 10000000, 1000000 + }, + { + "single values, distance > 2^32", "1", + UINT64CONST(10000000000), 100000 + }, + { + "clusters, distance > 2^32", "10101010", + UINT64CONST(10000000000), 1000000 + }, + { + "clusters, distance > 2^60", "10101010", + UINT64CONST(2000000000000000000), + 23 /* can't be much higher than this, or we + * overflow uint64 */ + } +}; + +PG_MODULE_MAGIC; + +PG_FUNCTION_INFO_V1(test_radixtree); + +static void +test_empty(void) +{ + radix_tree *radixtree; + uint64 dummy; + + radixtree = rt_create(CurrentMemoryContext); + + if (rt_search(radixtree, 0, &dummy)) + elog(ERROR, "rt_search on empty tree returned true"); + + if (rt_search(radixtree, 1, &dummy)) + elog(ERROR, "rt_search on empty tree returned true"); + + if (rt_search(radixtree, PG_UINT64_MAX, &dummy)) + elog(ERROR, "rt_search on empty tree returned true"); + + if (rt_num_entries(radixtree) != 0) + elog(ERROR, "rt_num_entries on empty tree return non-zero"); + + rt_free(radixtree); +} + +/* + * Check if keys from start to end with the shift exist in the tree. 
+ */
+static void
+check_search_on_node(radix_tree *radixtree, uint8 shift, int start, int end)
+{
+	for (int i = start; i < end; i++)
+	{
+		uint64		key = ((uint64) i << shift);
+		uint64		val;
+
+		if (!rt_search(radixtree, key, &val))
+			elog(ERROR, "key 0x" UINT64_HEX_FORMAT " is not found on node-%d",
+				 key, end);
+		if (val != key)
+			elog(ERROR, "rt_search with key 0x" UINT64_HEX_FORMAT " returns 0x" UINT64_HEX_FORMAT ", expected 0x" UINT64_HEX_FORMAT,
+				 key, val, key);
+	}
+}
+
+static void
+test_node_types_insert(radix_tree *radixtree, uint8 shift)
+{
+	uint64		num_entries;
+
+	for (int i = 0; i < 256; i++)
+	{
+		uint64		key = ((uint64) i << shift);
+		bool		found;
+
+		found = rt_set(radixtree, key, key);
+
+		if (found)
+			elog(ERROR, "newly inserted key 0x" UINT64_HEX_FORMAT " found", key);
+
+		for (int j = 0; j < lengthof(rt_node_max_entries); j++)
+		{
+			/*
+			 * After filling all slots in each node type, check if the values are
+			 * stored properly.
+			 */
+			if (i == (rt_node_max_entries[j] - 1))
+			{
+				check_search_on_node(radixtree, shift,
+									 (j == 0) ? 0 : rt_node_max_entries[j - 1],
+									 rt_node_max_entries[j]);
+				break;
+			}
+		}
+	}
+
+	num_entries = rt_num_entries(radixtree);
+
+	if (num_entries != 256)
+		elog(ERROR,
+			 "rt_num_entries returned " UINT64_FORMAT ", expected " UINT64_FORMAT,
+			 num_entries, UINT64CONST(256));
+}
+
+static void
+test_node_types_delete(radix_tree *radixtree, uint8 shift)
+{
+	uint64		num_entries;
+
+	for (int i = 0; i < 256; i++)
+	{
+		uint64		key = ((uint64) i << shift);
+		bool		found;
+
+		found = rt_delete(radixtree, key);
+
+		if (!found)
+			elog(ERROR, "inserted key 0x" UINT64_HEX_FORMAT " is not found", key);
+	}
+
+	num_entries = rt_num_entries(radixtree);
+
+	/* The tree must be empty */
+	if (num_entries != 0)
+		elog(ERROR,
+			 "rt_num_entries returned " UINT64_FORMAT ", expected " UINT64_FORMAT,
+			 num_entries, UINT64CONST(0));
+}
+
+/*
+ * Test for inserting and deleting key-value pairs to each node type at the given shift
+ * level.
+ */ +static void +test_node_types(uint8 shift) +{ + radix_tree *radixtree; + + elog(NOTICE, "testing radix tree node types with shift \"%d\"", shift); + + radixtree = rt_create(CurrentMemoryContext); + + /* + * Insert and search entries for every node type at the 'shift' level, + * then delete all entries to make it empty, and insert and search + * entries again. + */ + test_node_types_insert(radixtree, shift); + test_node_types_delete(radixtree, shift); + test_node_types_insert(radixtree, shift); + + rt_free(radixtree); +} + +/* + * Test with a repeating pattern, defined by the 'spec'. + */ +static void +test_pattern(const test_spec *spec) +{ + radix_tree *radixtree; + rt_iter *iter; + MemoryContext radixtree_ctx; + TimestampTz starttime; + TimestampTz endtime; + uint64 n; + uint64 last_int; + uint64 ndeleted; + uint64 nbefore; + uint64 nafter; + int patternlen; + uint64 *pattern_values; + uint64 pattern_num_values; + + elog(NOTICE, "testing radix tree with pattern \"%s\"", spec->test_name); + if (rt_test_stats) + fprintf(stderr, "-----\ntesting radix tree with pattern \"%s\"\n", spec->test_name); + + /* Pre-process the pattern, creating an array of integers from it. */ + patternlen = strlen(spec->pattern_str); + pattern_values = palloc(patternlen * sizeof(uint64)); + pattern_num_values = 0; + for (int i = 0; i < patternlen; i++) + { + if (spec->pattern_str[i] == '1') + pattern_values[pattern_num_values++] = i; + } + + /* + * Allocate the radix tree. + * + * Allocate it in a separate memory context, so that we can print its + * memory usage easily. + */ + radixtree_ctx = AllocSetContextCreate(CurrentMemoryContext, + "radixtree test", + ALLOCSET_SMALL_SIZES); + MemoryContextSetIdentifier(radixtree_ctx, spec->test_name); + radixtree = rt_create(radixtree_ctx); + + /* + * Add values to the set. 
+ */ + starttime = GetCurrentTimestamp(); + + n = 0; + last_int = 0; + while (n < spec->num_values) + { + uint64 x = 0; + + for (int i = 0; i < pattern_num_values && n < spec->num_values; i++) + { + bool found; + + x = last_int + pattern_values[i]; + + found = rt_set(radixtree, x, x); + + if (found) + elog(ERROR, "newly inserted key 0x" UINT64_HEX_FORMAT " found", x); + + n++; + } + last_int += spec->spacing; + } + + endtime = GetCurrentTimestamp(); + + if (rt_test_stats) + fprintf(stderr, "added " UINT64_FORMAT " values in %d ms\n", + spec->num_values, (int) (endtime - starttime) / 1000); + + /* + * Print stats on the amount of memory used. + * + * We print the usage reported by rt_memory_usage(), as well as the + * stats from the memory context. They should be in the same ballpark, + * but it's hard to automate testing that, so if you're making changes to + * the implementation, just observe that manually. + */ + if (rt_test_stats) + { + uint64 mem_usage; + + /* + * Also print memory usage as reported by rt_memory_usage(). It + * should be in the same ballpark as the usage reported by + * MemoryContextStats(). + */ + mem_usage = rt_memory_usage(radixtree); + fprintf(stderr, "rt_memory_usage() reported " UINT64_FORMAT " (%0.2f bytes / integer)\n", + mem_usage, (double) mem_usage / spec->num_values); + + MemoryContextStats(radixtree_ctx); + } + + /* Check that rt_num_entries works */ + n = rt_num_entries(radixtree); + if (n != spec->num_values) + elog(ERROR, "rt_num_entries returned " UINT64_FORMAT ", expected " UINT64_FORMAT, n, spec->num_values); + + /* + * Test random-access probes with rt_search() + */ + starttime = GetCurrentTimestamp(); + + for (n = 0; n < 100000; n++) + { + bool found; + bool expected; + uint64 x; + uint64 v; + + /* + * Pick next value to probe at random. We limit the probes to the + * last integer that we added to the set, plus an arbitrary constant + * (1000). 
There's no point in probing the whole 0 - 2^64 range, if
+		 * only a small part of the integer space is used. We would very
+		 * rarely hit values that are actually in the set.
+		 */
+		x = pg_prng_uint64_range(&pg_global_prng_state, 0, last_int + 1000);
+
+		/* Do we expect this value to be present in the set? */
+		if (x >= last_int)
+			expected = false;
+		else
+		{
+			uint64		idx = x % spec->spacing;
+
+			if (idx >= patternlen)
+				expected = false;
+			else if (spec->pattern_str[idx] == '1')
+				expected = true;
+			else
+				expected = false;
+		}
+
+		/* Is it present according to rt_search() ? */
+		found = rt_search(radixtree, x, &v);
+
+		if (found != expected)
+			elog(ERROR, "mismatch at 0x" UINT64_HEX_FORMAT ": %d vs %d", x, found, expected);
+		if (found && (v != x))
+			elog(ERROR, "found 0x" UINT64_HEX_FORMAT ", expected 0x" UINT64_HEX_FORMAT,
+				 v, x);
+	}
+	endtime = GetCurrentTimestamp();
+	if (rt_test_stats)
+		fprintf(stderr, "probed " UINT64_FORMAT " values in %d ms\n",
+				n, (int) (endtime - starttime) / 1000);
+
+	/*
+	 * Test iterator
+	 */
+	starttime = GetCurrentTimestamp();
+
+	iter = rt_begin_iterate(radixtree);
+	n = 0;
+	last_int = 0;
+	while (n < spec->num_values)
+	{
+		for (int i = 0; i < pattern_num_values && n < spec->num_values; i++)
+		{
+			uint64		expected = last_int + pattern_values[i];
+			uint64		x;
+			uint64		val;
+
+			if (!rt_iterate_next(iter, &x, &val))
+				break;
+
+			if (x != expected)
+				elog(ERROR,
+					 "iterate returned wrong key; got 0x" UINT64_HEX_FORMAT ", expected 0x" UINT64_HEX_FORMAT " at %d",
+					 x, expected, i);
+			if (val != expected)
+				elog(ERROR,
+					 "iterate returned wrong value; got 0x" UINT64_HEX_FORMAT ", expected 0x" UINT64_HEX_FORMAT " at %d", val, expected, i);
+			n++;
+		}
+		last_int += spec->spacing;
+	}
+	endtime = GetCurrentTimestamp();
+	if (rt_test_stats)
+		fprintf(stderr, "iterated " UINT64_FORMAT " values in %d ms\n",
+				n, (int) (endtime - starttime) / 1000);
+
+	if (n < spec->num_values)
+		elog(ERROR, "iterator stopped short after " UINT64_FORMAT " entries, expected " UINT64_FORMAT, n, spec->num_values);
+	if (n > spec->num_values)
+		elog(ERROR, "iterator returned " UINT64_FORMAT " entries, " UINT64_FORMAT " was expected", n, spec->num_values);
+
+	/*
+	 * Test random-access probes with rt_delete()
+	 */
+	starttime = GetCurrentTimestamp();
+
+	nbefore = rt_num_entries(radixtree);
+	ndeleted = 0;
+	for (n = 0; n < 100000; n++)
+	{
+		bool		found;
+		uint64		x;
+		uint64		v;
+
+		/*
+		 * Pick next value to probe at random. We limit the probes to the
+		 * last integer that we added to the set, plus an arbitrary constant
+		 * (1000). There's no point in probing the whole 0 - 2^64 range, if
+		 * only a small part of the integer space is used. We would very
+		 * rarely hit values that are actually in the set.
+		 */
+		x = pg_prng_uint64_range(&pg_global_prng_state, 0, last_int + 1000);
+
+		/* Is it present according to rt_search() ? */
+		found = rt_search(radixtree, x, &v);
+
+		if (!found)
+			continue;
+
+		/* If the key is found, delete it and check again */
+		if (!rt_delete(radixtree, x))
+			elog(ERROR, "could not delete key 0x" UINT64_HEX_FORMAT, x);
+		if (rt_search(radixtree, x, &v))
+			elog(ERROR, "found deleted key 0x" UINT64_HEX_FORMAT, x);
+		if (rt_delete(radixtree, x))
+			elog(ERROR, "deleted already-deleted key 0x" UINT64_HEX_FORMAT, x);
+
+		ndeleted++;
+	}
+	endtime = GetCurrentTimestamp();
+	if (rt_test_stats)
+		fprintf(stderr, "deleted " UINT64_FORMAT " values in %d ms\n",
+				ndeleted, (int) (endtime - starttime) / 1000);
+
+	nafter = rt_num_entries(radixtree);
+
+	/* Check that rt_num_entries works */
+	if ((nbefore - ndeleted) != nafter)
+		elog(ERROR, "rt_num_entries returned " UINT64_FORMAT ", expected " UINT64_FORMAT " after " UINT64_FORMAT " deletion",
+			 nafter, (nbefore - ndeleted), ndeleted);
+
+	MemoryContextDelete(radixtree_ctx);
+}
+
+Datum
+test_radixtree(PG_FUNCTION_ARGS)
+{
+	test_empty();
+
+	for (int shift = 0; shift <= (64 - 8); shift += 8)
+		test_node_types(shift);
+
+	/* Test different test patterns, with
lots of entries */ + for (int i = 0; i < lengthof(test_specs); i++) + test_pattern(&test_specs[i]); + + PG_RETURN_VOID(); +} diff --git a/src/test/modules/test_radixtree/test_radixtree.control b/src/test/modules/test_radixtree/test_radixtree.control new file mode 100644 index 0000000000..e53f2a3e0c --- /dev/null +++ b/src/test/modules/test_radixtree/test_radixtree.control @@ -0,0 +1,4 @@ +comment = 'Test code for radix tree' +default_version = '1.0' +module_pathname = '$libdir/test_radixtree' +relocatable = true -- 2.31.1