From fc373e0312e0b3c30bba8bd54286283542d627a2 Mon Sep 17 00:00:00 2001 From: Masahiko Sawada Date: Thu, 16 Feb 2023 23:45:39 +0900 Subject: [PATCH v29 08/10] Review TidStore. --- src/backend/access/common/tidstore.c | 340 +++++++++--------- src/include/access/tidstore.h | 37 +- .../modules/test_tidstore/test_tidstore.c | 68 ++-- 3 files changed, 234 insertions(+), 211 deletions(-) diff --git a/src/backend/access/common/tidstore.c b/src/backend/access/common/tidstore.c index 8c05e60d92..9360520482 100644 --- a/src/backend/access/common/tidstore.c +++ b/src/backend/access/common/tidstore.c @@ -3,18 +3,19 @@ * tidstore.c * Tid (ItemPointerData) storage implementation. * - * This module provides a in-memory data structure to store Tids (ItemPointer). - * Internally, a tid is encoded as a pair of 64-bit key and 64-bit value, and - * stored in the radix tree. + * TidStore is a in-memory data structure to store tids (ItemPointerData). + * Internally, a tid is encoded as a pair of 64-bit key and 64-bit value, + * and stored in the radix tree. * - * A TidStore can be shared among parallel worker processes by passing DSA area - * to tidstore_create(). Other backends can attach to the shared TidStore by - * tidstore_attach(). + * TidStore can be shared among parallel worker processes by passing DSA area + * to TidStoreCreate(). Other backends can attach to the shared TidStore by + * TidStoreAttach(). * - * Regarding the concurrency, it basically relies on the concurrency support in - * the radix tree, but we acquires the lock on a TidStore in some cases, for - * example, when to reset the store and when to access the number tids in the - * store (num_tids). + * Regarding the concurrency support, we use a single LWLock for the TidStore. + * The TidStore is exclusively locked when inserting encoded tids to the + * radix tree or when resetting itself. 
When searching on the TidStore or + * doing the iteration, it is not locked but the underlying radix tree is + * locked in shared mode. * * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California @@ -34,16 +35,18 @@ #include "utils/memutils.h" /* - * For encoding purposes, tids are represented as a pair of 64-bit key and - * 64-bit value. First, we construct 64-bit unsigned integer by combining - * the block number and the offset number. The number of bits used for the - * offset number is specified by max_offsets in tidstore_create(). We are - * frugal with the bits, because smaller keys could help keeping the radix - * tree shallow. + * For encoding purposes, a tid is represented as a pair of 64-bit key and + * 64-bit value. * - * For example, a tid of heap with 8kB blocks uses the lowest 9 bits for - * the offset number and uses the next 32 bits for the block number. That - * is, only 41 bits are used: + * First, we construct a 64-bit unsigned integer by combining the block + * number and the offset number. The number of bits used for the offset number + * is specified by max_off in TidStoreCreate(). We are frugal with the bits, + * because smaller keys could help keep the radix tree shallow. + * + * For example, a tid of heap on an 8kB block uses the lowest 9 bits for + * the offset number and uses the next 32 bits for the block number. 9 bits + * are enough for the offset number, because MaxHeapTuplesPerPage < 2^9 + * on 8kB blocks. That is, only 41 bits are used: * * uuuuuuuY YYYYYYYY YYYYYYYY YYYYYYYY YYYYYYYX XXXXXXXX * @@ -52,30 +55,34 @@ * u = unused bit * (high on the left, low on the right) * - * 9 bits are enough for the offset number, because MaxHeapTuplesPerPage < 2^9 - * on 8kB blocks.
- * - * The 64-bit value is the bitmap representation of the lowest 6 bits - * (TIDSTORE_VALUE_NBITS) of the integer, and the rest 35 bits are used - * as the key: + * Then, the 64-bit value is the bitmap representation of the lowest 6 bits + * (LOWER_OFFSET_NBITS) of the integer, and the 64-bit key consists of the + * upper 3 bits of the offset number and the block number, 35 bits in + * total: * * uuuuuuuY YYYYYYYY YYYYYYYY YYYYYYYY YYYYYYYX XXXXXXXX * |----| value - * |---------------------------------------------| key + * |--------------------------------------| key * * The maximum height of the radix tree is 5 in this case. + * + * If the number of bits required for offset numbers fits in LOWER_OFFSET_NBITS, + * the 64-bit value is the bitmap representation of the offset number, and the + * 64-bit key is the block number. */ -#define TIDSTORE_VALUE_NBITS 6 /* log(64, 2) */ -#define TIDSTORE_OFFSET_MASK ((1 << TIDSTORE_VALUE_NBITS) - 1) +typedef uint64 tidkey; +typedef uint64 offsetbm; +#define LOWER_OFFSET_NBITS 6 /* log2(sizeof(offsetbm) * BITS_PER_BYTE) */ +#define LOWER_OFFSET_MASK ((1 << LOWER_OFFSET_NBITS) - 1) -/* A magic value used to identify our TidStores. */ +/* A magic value used to identify our TidStore.
*/ #define TIDSTORE_MAGIC 0x826f6a10 #define RT_PREFIX local_rt #define RT_SCOPE static #define RT_DECLARE #define RT_DEFINE -#define RT_VALUE_TYPE uint64 +#define RT_VALUE_TYPE tidkey #include "lib/radixtree.h" #define RT_PREFIX shared_rt @@ -83,7 +90,7 @@ #define RT_SCOPE static #define RT_DECLARE #define RT_DEFINE -#define RT_VALUE_TYPE uint64 +#define RT_VALUE_TYPE tidkey #include "lib/radixtree.h" /* The control object for a TidStore */ @@ -94,10 +101,10 @@ typedef struct TidStoreControl /* These values are never changed after creation */ size_t max_bytes; /* the maximum bytes a TidStore can use */ - int max_offset; /* the maximum offset number */ - int offset_nbits; /* the number of bits required for an offset - * number */ - int offset_key_nbits; /* the number of bits of an offset number + int max_off; /* the maximum offset number */ + int max_off_nbits; /* the number of bits required for offset + * numbers */ + int upper_off_nbits; /* the number of bits of offset numbers * used in a key */ /* The below fields are used only in shared case */ @@ -106,7 +113,7 @@ typedef struct TidStoreControl LWLock lock; /* handles for TidStore and radix tree */ - tidstore_handle handle; + TidStoreHandle handle; shared_rt_handle tree_handle; } TidStoreControl; @@ -147,24 +154,27 @@ typedef struct TidStoreIter bool finished; /* save for the next iteration */ - uint64 next_key; - uint64 next_val; + tidkey next_tidkey; + offsetbm next_off_bitmap; - /* output for the caller */ - TidStoreIterResult result; + /* + * output for the caller. Must be last because variable-size. 
+ */ + TidStoreIterResult output; } TidStoreIter; -static void tidstore_iter_extract_tids(TidStoreIter *iter, uint64 key, uint64 val); -static inline BlockNumber key_get_blkno(TidStore *ts, uint64 key); -static inline uint64 encode_key_off(TidStore *ts, BlockNumber block, uint32 offset, uint64 *off_bit); -static inline uint64 tid_to_key_off(TidStore *ts, ItemPointer tid, uint64 *off_bit); +static void iter_decode_key_off(TidStoreIter *iter, tidkey key, offsetbm off_bitmap); +static inline BlockNumber key_get_blkno(TidStore *ts, tidkey key); +static inline tidkey encode_blk_off(TidStore *ts, BlockNumber block, + OffsetNumber offset, offsetbm *off_bit); +static inline tidkey encode_tid(TidStore *ts, ItemPointer tid, offsetbm *off_bit); /* * Create a TidStore. The returned object is allocated in backend-local memory. * The radix tree for storage is allocated in DSA area is 'area' is non-NULL. */ TidStore * -tidstore_create(size_t max_bytes, int max_offset, dsa_area *area) +TidStoreCreate(size_t max_bytes, int max_off, dsa_area *area) { TidStore *ts; @@ -176,12 +186,12 @@ tidstore_create(size_t max_bytes, int max_offset, dsa_area *area) * Memory consumption depends on the number of stored tids, but also on the * distribution of them, how the radix tree stores, and the memory management * that backed the radix tree. The maximum bytes that a TidStore can - * use is specified by the max_bytes in tidstore_create(). We want the total + * use is specified by the max_bytes in TidStoreCreate(). We want the total * amount of memory consumption by a TidStore not to exceed the max_bytes. * * In local TidStore cases, the radix tree uses slab allocators for each kind * of node class. The most memory consuming case while adding Tids associated - * with one page (i.e. during tidstore_add_tids()) is that we allocate a new + * with one page (i.e. during TidStoreSetBlockOffsets()) is that we allocate a new * slab block for a new radix tree node, which is approximately 70kB. 
Therefore, * we deduct 70kB from the max_bytes. * @@ -202,7 +212,7 @@ tidstore_create(size_t max_bytes, int max_offset, dsa_area *area) dp = dsa_allocate0(area, sizeof(TidStoreControl)); ts->control = (TidStoreControl *) dsa_get_address(area, dp); - ts->control->max_bytes = (uint64) (max_bytes * ratio); + ts->control->max_bytes = (size_t) (max_bytes * ratio); ts->area = area; ts->control->magic = TIDSTORE_MAGIC; @@ -218,14 +228,14 @@ tidstore_create(size_t max_bytes, int max_offset, dsa_area *area) ts->control->max_bytes = max_bytes - (70 * 1024); } - ts->control->max_offset = max_offset; - ts->control->offset_nbits = pg_ceil_log2_32(max_offset); + ts->control->max_off = max_off; + ts->control->max_off_nbits = pg_ceil_log2_32(max_off); - if (ts->control->offset_nbits < TIDSTORE_VALUE_NBITS) - ts->control->offset_nbits = TIDSTORE_VALUE_NBITS; + if (ts->control->max_off_nbits < LOWER_OFFSET_NBITS) + ts->control->max_off_nbits = LOWER_OFFSET_NBITS; - ts->control->offset_key_nbits = - ts->control->offset_nbits - TIDSTORE_VALUE_NBITS; + ts->control->upper_off_nbits = + ts->control->max_off_nbits - LOWER_OFFSET_NBITS; return ts; } @@ -235,7 +245,7 @@ tidstore_create(size_t max_bytes, int max_offset, dsa_area *area) * allocated in backend-local memory using the CurrentMemoryContext. */ TidStore * -tidstore_attach(dsa_area *area, tidstore_handle handle) +TidStoreAttach(dsa_area *area, TidStoreHandle handle) { TidStore *ts; dsa_pointer control; @@ -266,7 +276,7 @@ tidstore_attach(dsa_area *area, tidstore_handle handle) * to the operating system. */ void -tidstore_detach(TidStore *ts) +TidStoreDetach(TidStore *ts) { Assert(TidStoreIsShared(ts) && ts->control->magic == TIDSTORE_MAGIC); @@ -279,12 +289,12 @@ tidstore_detach(TidStore *ts) * * TODO: The caller must be certain that no other backend will attempt to * access the TidStore before calling this function. 
Other backend must - * explicitly call tidstore_detach to free up backend-local memory associated - * with the TidStore. The backend that calls tidstore_destroy must not call - * tidstore_detach. + * explicitly call TidStoreDetach() to free up backend-local memory associated + * with the TidStore. The backend that calls TidStoreDestroy() must not call + * TidStoreDetach(). */ void -tidstore_destroy(TidStore *ts) +TidStoreDestroy(TidStore *ts) { if (TidStoreIsShared(ts)) { @@ -309,11 +319,11 @@ tidstore_destroy(TidStore *ts) } /* - * Forget all collected Tids. It's similar to tidstore_destroy but we don't free + * Forget all collected Tids. It's similar to TidStoreDestroy() but we don't free * entire TidStore but recreate only the radix tree storage. */ void -tidstore_reset(TidStore *ts) +TidStoreReset(TidStore *ts) { if (TidStoreIsShared(ts)) { @@ -350,30 +360,34 @@ tidstore_reset(TidStore *ts) } } -/* Add Tids on a block to TidStore */ +/* + * Set the given tids on the blkno to TidStore. + * + * NB: the offset numbers in offsets must be sorted in ascending order. 
+ */ void -tidstore_add_tids(TidStore *ts, BlockNumber blkno, OffsetNumber *offsets, - int num_offsets) +TidStoreSetBlockOffsets(TidStore *ts, BlockNumber blkno, OffsetNumber *offsets, + int num_offsets) { - uint64 *values; - uint64 key; - uint64 prev_key; - uint64 off_bitmap = 0; + offsetbm *bitmaps; + tidkey key; + tidkey prev_key; + offsetbm off_bitmap = 0; int idx; - const uint64 key_base = ((uint64) blkno) << ts->control->offset_key_nbits; - const int nkeys = UINT64CONST(1) << ts->control->offset_key_nbits; + const tidkey key_base = ((uint64) blkno) << ts->control->upper_off_nbits; + const int nkeys = UINT64CONST(1) << ts->control->upper_off_nbits; Assert(!TidStoreIsShared(ts) || ts->control->magic == TIDSTORE_MAGIC); - values = palloc(sizeof(uint64) * nkeys); + bitmaps = palloc(sizeof(offsetbm) * nkeys); key = prev_key = key_base; for (int i = 0; i < num_offsets; i++) { - uint64 off_bit; + offsetbm off_bit; /* encode the tid to a key and partial offset */ - key = encode_key_off(ts, blkno, offsets[i], &off_bit); + key = encode_blk_off(ts, blkno, offsets[i], &off_bit); /* make sure we scanned the line pointer array in order */ Assert(key >= prev_key); @@ -384,11 +398,11 @@ tidstore_add_tids(TidStore *ts, BlockNumber blkno, OffsetNumber *offsets, Assert(idx >= 0 && idx < nkeys); /* write out offset bitmap for this key */ - values[idx] = off_bitmap; + bitmaps[idx] = off_bitmap; /* zero out any gaps up to the current key */ for (int empty_idx = idx + 1; empty_idx < key - key_base; empty_idx++) - values[empty_idx] = 0; + bitmaps[empty_idx] = 0; /* reset for current key -- the current offset will be handled below */ off_bitmap = 0; @@ -401,7 +415,7 @@ tidstore_add_tids(TidStore *ts, BlockNumber blkno, OffsetNumber *offsets, /* save the final index for later */ idx = key - key_base; /* write out last offset bitmap */ - values[idx] = off_bitmap; + bitmaps[idx] = off_bitmap; if (TidStoreIsShared(ts)) LWLockAcquire(&ts->control->lock, LW_EXCLUSIVE); @@ -409,14 +423,14 
@@ tidstore_add_tids(TidStore *ts, BlockNumber blkno, OffsetNumber *offsets, /* insert the calculated key-values to the tree */ for (int i = 0; i <= idx; i++) { - if (values[i]) + if (bitmaps[i]) { key = key_base + i; if (TidStoreIsShared(ts)) - shared_rt_set(ts->tree.shared, key, &values[i]); + shared_rt_set(ts->tree.shared, key, &bitmaps[i]); else - local_rt_set(ts->tree.local, key, &values[i]); + local_rt_set(ts->tree.local, key, &bitmaps[i]); } } @@ -426,70 +440,70 @@ tidstore_add_tids(TidStore *ts, BlockNumber blkno, OffsetNumber *offsets, if (TidStoreIsShared(ts)) LWLockRelease(&ts->control->lock); - pfree(values); + pfree(bitmaps); } /* Return true if the given tid is present in the TidStore */ bool -tidstore_lookup_tid(TidStore *ts, ItemPointer tid) +TidStoreIsMember(TidStore *ts, ItemPointer tid) { - uint64 key; - uint64 val = 0; - uint64 off_bit; + tidkey key; + offsetbm off_bitmap = 0; + offsetbm off_bit; bool found; - key = tid_to_key_off(ts, tid, &off_bit); + key = encode_tid(ts, tid, &off_bit); if (TidStoreIsShared(ts)) - found = shared_rt_search(ts->tree.shared, key, &val); + found = shared_rt_search(ts->tree.shared, key, &off_bitmap); else - found = local_rt_search(ts->tree.local, key, &val); + found = local_rt_search(ts->tree.local, key, &off_bitmap); if (!found) return false; - return (val & off_bit) != 0; + return (off_bitmap & off_bit) != 0; } /* - * Prepare to iterate through a TidStore. Since the radix tree is locked during the - * iteration, so tidstore_end_iterate() needs to called when finished. + * Prepare to iterate through a TidStore. Since the radix tree is locked during + * the iteration, so TidStoreEndIterate() needs to be called when finished. + * + * The TidStoreIter struct is created in the caller's memory context. * * Concurrent updates during the iteration will be blocked when inserting a * key-value to the radix tree. 
*/ TidStoreIter * -tidstore_begin_iterate(TidStore *ts) +TidStoreBeginIterate(TidStore *ts) { TidStoreIter *iter; Assert(!TidStoreIsShared(ts) || ts->control->magic == TIDSTORE_MAGIC); - iter = palloc0(sizeof(TidStoreIter)); + iter = palloc0(sizeof(TidStoreIter) + + sizeof(OffsetNumber) * ts->control->max_off); iter->ts = ts; - iter->result.blkno = InvalidBlockNumber; - iter->result.offsets = palloc(sizeof(OffsetNumber) * ts->control->max_offset); - if (TidStoreIsShared(ts)) iter->tree_iter.shared = shared_rt_begin_iterate(ts->tree.shared); else iter->tree_iter.local = local_rt_begin_iterate(ts->tree.local); /* If the TidStore is empty, there is no business */ - if (tidstore_num_tids(ts) == 0) + if (TidStoreNumTids(ts) == 0) iter->finished = true; return iter; } static inline bool -tidstore_iter_kv(TidStoreIter *iter, uint64 *key, uint64 *val) +tidstore_iter(TidStoreIter *iter, tidkey *key, offsetbm *off_bitmap) { if (TidStoreIsShared(iter->ts)) - return shared_rt_iterate_next(iter->tree_iter.shared, key, val); + return shared_rt_iterate_next(iter->tree_iter.shared, key, off_bitmap); - return local_rt_iterate_next(iter->tree_iter.local, key, val); + return local_rt_iterate_next(iter->tree_iter.local, key, off_bitmap); } /* @@ -498,45 +512,48 @@ tidstore_iter_kv(TidStoreIter *iter, uint64 *key, uint64 *val) * numbers in each result is also sorted in ascending order. 
*/ TidStoreIterResult * -tidstore_iterate_next(TidStoreIter *iter) +TidStoreIterateNext(TidStoreIter *iter) { - uint64 key; - uint64 val; - TidStoreIterResult *result = &(iter->result); + tidkey key; + offsetbm off_bitmap = 0; + TidStoreIterResult *output = &(iter->output); if (iter->finished) return NULL; - if (BlockNumberIsValid(result->blkno)) - { - /* Process the previously collected key-value */ - result->num_offsets = 0; - tidstore_iter_extract_tids(iter, iter->next_key, iter->next_val); - } + /* Initialize the outputs */ + output->blkno = InvalidBlockNumber; + output->num_offsets = 0; - while (tidstore_iter_kv(iter, &key, &val)) - { - BlockNumber blkno; + /* + * Decode the key and offset bitmap that are collected in the previous + * time, if exists. + */ + if (iter->next_off_bitmap > 0) + iter_decode_key_off(iter, iter->next_tidkey, iter->next_off_bitmap); - blkno = key_get_blkno(iter->ts, key); + while (tidstore_iter(iter, &key, &off_bitmap)) + { + BlockNumber blkno = key_get_blkno(iter->ts, key); - if (BlockNumberIsValid(result->blkno) && result->blkno != blkno) + if (BlockNumberIsValid(output->blkno) && output->blkno != blkno) { /* - * We got a key-value pair for a different block. So return the - * collected tids, and remember the key-value for the next iteration. + * We got tids for a different block. We return the collected + * tids so far, and remember the key-value for the next + * iteration. */ - iter->next_key = key; - iter->next_val = val; - return result; + iter->next_tidkey = key; + iter->next_off_bitmap = off_bitmap; + return output; } - /* Collect tids extracted from the key-value pair */ - tidstore_iter_extract_tids(iter, key, val); + /* Collect tids decoded from the key and offset bitmap */ + iter_decode_key_off(iter, key, off_bitmap); } iter->finished = true; - return result; + return output; } /* @@ -544,22 +561,21 @@ tidstore_iterate_next(TidStoreIter *iter) * or when existing an iteration. 
*/ void -tidstore_end_iterate(TidStoreIter *iter) +TidStoreEndIterate(TidStoreIter *iter) { if (TidStoreIsShared(iter->ts)) shared_rt_end_iterate(iter->tree_iter.shared); else local_rt_end_iterate(iter->tree_iter.local); - pfree(iter->result.offsets); pfree(iter); } /* Return the number of tids we collected so far */ int64 -tidstore_num_tids(TidStore *ts) +TidStoreNumTids(TidStore *ts) { - uint64 num_tids; + int64 num_tids; Assert(!TidStoreIsShared(ts) || ts->control->magic == TIDSTORE_MAGIC); @@ -575,16 +591,16 @@ tidstore_num_tids(TidStore *ts) /* Return true if the current memory usage of TidStore exceeds the limit */ bool -tidstore_is_full(TidStore *ts) +TidStoreIsFull(TidStore *ts) { Assert(!TidStoreIsShared(ts) || ts->control->magic == TIDSTORE_MAGIC); - return (tidstore_memory_usage(ts) > ts->control->max_bytes); + return (TidStoreMemoryUsage(ts) > ts->control->max_bytes); } /* Return the maximum memory TidStore can use */ size_t -tidstore_max_memory(TidStore *ts) +TidStoreMaxMemory(TidStore *ts) { Assert(!TidStoreIsShared(ts) || ts->control->magic == TIDSTORE_MAGIC); @@ -593,7 +609,7 @@ tidstore_max_memory(TidStore *ts) /* Return the memory usage of TidStore */ size_t -tidstore_memory_usage(TidStore *ts) +TidStoreMemoryUsage(TidStore *ts) { Assert(!TidStoreIsShared(ts) || ts->control->magic == TIDSTORE_MAGIC); @@ -611,71 +627,75 @@ tidstore_memory_usage(TidStore *ts) /* * Get a handle that can be used by other processes to attach to this TidStore */ -tidstore_handle -tidstore_get_handle(TidStore *ts) +TidStoreHandle +TidStoreGetHandle(TidStore *ts) { Assert(TidStoreIsShared(ts) && ts->control->magic == TIDSTORE_MAGIC); return ts->control->handle; } -/* Extract tids from the given key-value pair */ +/* + * Decode the key and offset bitmap to tids and store them to the iteration + * result. 
+ */ static void -tidstore_iter_extract_tids(TidStoreIter *iter, uint64 key, uint64 val) +iter_decode_key_off(TidStoreIter *iter, tidkey key, offsetbm off_bitmap) { - TidStoreIterResult *result = (&iter->result); + TidStoreIterResult *output = (&iter->output); - while (val) + while (off_bitmap) { - uint64 tid_i; + uint64 compressed_tid; OffsetNumber off; - tid_i = key << TIDSTORE_VALUE_NBITS; - tid_i |= pg_rightmost_one_pos64(val); + compressed_tid = key << LOWER_OFFSET_NBITS; + compressed_tid |= pg_rightmost_one_pos64(off_bitmap); - off = tid_i & ((UINT64CONST(1) << iter->ts->control->offset_nbits) - 1); + off = compressed_tid & ((UINT64CONST(1) << iter->ts->control->max_off_nbits) - 1); - Assert(result->num_offsets < iter->ts->control->max_offset); - result->offsets[result->num_offsets++] = off; + Assert(output->num_offsets < iter->ts->control->max_off); + output->offsets[output->num_offsets++] = off; /* unset the rightmost bit */ - val &= ~pg_rightmost_one64(val); + off_bitmap &= ~pg_rightmost_one64(off_bitmap); } - result->blkno = key_get_blkno(iter->ts, key); + output->blkno = key_get_blkno(iter->ts, key); } /* Get block number from the given key */ static inline BlockNumber -key_get_blkno(TidStore *ts, uint64 key) +key_get_blkno(TidStore *ts, tidkey key) { - return (BlockNumber) (key >> ts->control->offset_key_nbits); + return (BlockNumber) (key >> ts->control->upper_off_nbits); } -/* Encode a tid to key and offset */ -static inline uint64 -tid_to_key_off(TidStore *ts, ItemPointer tid, uint64 *off_bit) +/* Encode a tid to key and partial offset */ +static inline tidkey +encode_tid(TidStore *ts, ItemPointer tid, offsetbm *off_bit) { - uint32 offset = ItemPointerGetOffsetNumber(tid); + OffsetNumber offset = ItemPointerGetOffsetNumber(tid); BlockNumber block = ItemPointerGetBlockNumber(tid); - return encode_key_off(ts, block, offset, off_bit); + return encode_blk_off(ts, block, offset, off_bit); } /* encode a block and offset to a key and partial offset */ 
-static inline uint64 -encode_key_off(TidStore *ts, BlockNumber block, uint32 offset, uint64 *off_bit) +static inline tidkey +encode_blk_off(TidStore *ts, BlockNumber block, OffsetNumber offset, + offsetbm *off_bit) { - uint64 key; - uint64 tid_i; + tidkey key; + uint64 compressed_tid; uint32 off_lower; - off_lower = offset & TIDSTORE_OFFSET_MASK; - Assert(off_lower < (sizeof(uint64) * BITS_PER_BYTE)); + off_lower = offset & LOWER_OFFSET_MASK; + Assert(off_lower < (sizeof(offsetbm) * BITS_PER_BYTE)); *off_bit = UINT64CONST(1) << off_lower; - tid_i = offset | ((uint64) block << ts->control->offset_nbits); - key = tid_i >> TIDSTORE_VALUE_NBITS; + compressed_tid = offset | ((uint64) block << ts->control->max_off_nbits); + key = compressed_tid >> LOWER_OFFSET_NBITS; return key; } diff --git a/src/include/access/tidstore.h b/src/include/access/tidstore.h index a35a52124a..66f0fdd482 100644 --- a/src/include/access/tidstore.h +++ b/src/include/access/tidstore.h @@ -17,33 +17,34 @@ #include "storage/itemptr.h" #include "utils/dsa.h" -typedef dsa_pointer tidstore_handle; +typedef dsa_pointer TidStoreHandle; typedef struct TidStore TidStore; typedef struct TidStoreIter TidStoreIter; +/* Result struct for TidStoreIterateNext */ typedef struct TidStoreIterResult { BlockNumber blkno; - OffsetNumber *offsets; int num_offsets; + OffsetNumber offsets[FLEXIBLE_ARRAY_MEMBER]; } TidStoreIterResult; -extern TidStore *tidstore_create(size_t max_bytes, int max_offset, dsa_area *dsa); -extern TidStore *tidstore_attach(dsa_area *dsa, dsa_pointer handle); -extern void tidstore_detach(TidStore *ts); -extern void tidstore_destroy(TidStore *ts); -extern void tidstore_reset(TidStore *ts); -extern void tidstore_add_tids(TidStore *ts, BlockNumber blkno, OffsetNumber *offsets, - int num_offsets); -extern bool tidstore_lookup_tid(TidStore *ts, ItemPointer tid); -extern TidStoreIter * tidstore_begin_iterate(TidStore *ts); -extern TidStoreIterResult *tidstore_iterate_next(TidStoreIter *iter); 
-extern void tidstore_end_iterate(TidStoreIter *iter); -extern int64 tidstore_num_tids(TidStore *ts); -extern bool tidstore_is_full(TidStore *ts); -extern size_t tidstore_max_memory(TidStore *ts); -extern size_t tidstore_memory_usage(TidStore *ts); -extern tidstore_handle tidstore_get_handle(TidStore *ts); +extern TidStore *TidStoreCreate(size_t max_bytes, int max_off, dsa_area *dsa); +extern TidStore *TidStoreAttach(dsa_area *dsa, dsa_pointer handle); +extern void TidStoreDetach(TidStore *ts); +extern void TidStoreDestroy(TidStore *ts); +extern void TidStoreReset(TidStore *ts); +extern void TidStoreSetBlockOffsets(TidStore *ts, BlockNumber blkno, OffsetNumber *offsets, + int num_offsets); +extern bool TidStoreIsMember(TidStore *ts, ItemPointer tid); +extern TidStoreIter * TidStoreBeginIterate(TidStore *ts); +extern TidStoreIterResult *TidStoreIterateNext(TidStoreIter *iter); +extern void TidStoreEndIterate(TidStoreIter *iter); +extern int64 TidStoreNumTids(TidStore *ts); +extern bool TidStoreIsFull(TidStore *ts); +extern size_t TidStoreMaxMemory(TidStore *ts); +extern size_t TidStoreMemoryUsage(TidStore *ts); +extern TidStoreHandle TidStoreGetHandle(TidStore *ts); #endif /* TIDSTORE_H */ diff --git a/src/test/modules/test_tidstore/test_tidstore.c b/src/test/modules/test_tidstore/test_tidstore.c index 9a1217f833..8659e6780e 100644 --- a/src/test/modules/test_tidstore/test_tidstore.c +++ b/src/test/modules/test_tidstore/test_tidstore.c @@ -37,10 +37,10 @@ check_tid(TidStore *ts, BlockNumber blkno, OffsetNumber off, bool expect) ItemPointerSet(&tid, blkno, off); - found = tidstore_lookup_tid(ts, &tid); + found = TidStoreIsMember(ts, &tid); if (found != expect) - elog(ERROR, "lookup TID (%u, %u) returned %d, expected %d", + elog(ERROR, "TidStoreIsMember for TID (%u, %u) returned %d, expected %d", blkno, off, found, expect); } @@ -69,9 +69,9 @@ test_basic(int max_offset) LWLockRegisterTranche(tranche_id, "test_tidstore"); dsa = dsa_create(tranche_id); - ts = 
tidstore_create(TEST_TIDSTORE_MAX_BYTES, max_offset, dsa); + ts = TidStoreCreate(TEST_TIDSTORE_MAX_BYTES, max_offset, dsa); #else - ts = tidstore_create(TEST_TIDSTORE_MAX_BYTES, max_offset, NULL); + ts = TidStoreCreate(TEST_TIDSTORE_MAX_BYTES, max_offset, NULL); #endif /* prepare the offset array */ @@ -83,7 +83,7 @@ test_basic(int max_offset) /* add tids */ for (int i = 0; i < TEST_TIDSTORE_NUM_BLOCKS; i++) - tidstore_add_tids(ts, blks[i], offs, TEST_TIDSTORE_NUM_OFFSETS); + TidStoreSetBlockOffsets(ts, blks[i], offs, TEST_TIDSTORE_NUM_OFFSETS); /* lookup test */ for (OffsetNumber off = FirstOffsetNumber ; off < max_offset; off++) @@ -105,30 +105,30 @@ test_basic(int max_offset) } /* test the number of tids */ - if (tidstore_num_tids(ts) != (TEST_TIDSTORE_NUM_BLOCKS * TEST_TIDSTORE_NUM_OFFSETS)) - elog(ERROR, "tidstore_num_tids returned " UINT64_FORMAT ", expected %d", - tidstore_num_tids(ts), + if (TidStoreNumTids(ts) != (TEST_TIDSTORE_NUM_BLOCKS * TEST_TIDSTORE_NUM_OFFSETS)) + elog(ERROR, "TidStoreNumTids returned " UINT64_FORMAT ", expected %d", + TidStoreNumTids(ts), TEST_TIDSTORE_NUM_BLOCKS * TEST_TIDSTORE_NUM_OFFSETS); /* iteration test */ - iter = tidstore_begin_iterate(ts); + iter = TidStoreBeginIterate(ts); blk_idx = 0; - while ((iter_result = tidstore_iterate_next(iter)) != NULL) + while ((iter_result = TidStoreIterateNext(iter)) != NULL) { /* check the returned block number */ if (blks_sorted[blk_idx] != iter_result->blkno) - elog(ERROR, "tidstore_iterate_next returned block number %u, expected %u", + elog(ERROR, "TidStoreIterateNext returned block number %u, expected %u", iter_result->blkno, blks_sorted[blk_idx]); /* check the returned offset numbers */ if (TEST_TIDSTORE_NUM_OFFSETS != iter_result->num_offsets) - elog(ERROR, "tidstore_iterate_next returned %u offsets, expected %u", + elog(ERROR, "TidStoreIterateNext %u offsets, expected %u", iter_result->num_offsets, TEST_TIDSTORE_NUM_OFFSETS); for (int i = 0; i < iter_result->num_offsets; i++) { if 
(offs[i] != iter_result->offsets[i]) - elog(ERROR, "tidstore_iterate_next returned offset number %u on block %u, expected %u", + elog(ERROR, "TidStoreIterateNext offset number %u on block %u, expected %u", iter_result->offsets[i], iter_result->blkno, offs[i]); } @@ -136,15 +136,15 @@ test_basic(int max_offset) } if (blk_idx != TEST_TIDSTORE_NUM_BLOCKS) - elog(ERROR, "tidstore_iterate_next returned %d blocks, expected %d", + elog(ERROR, "TidStoreIterateNext returned %d blocks, expected %d", blk_idx, TEST_TIDSTORE_NUM_BLOCKS); /* remove all tids */ - tidstore_reset(ts); + TidStoreReset(ts); /* test the number of tids */ - if (tidstore_num_tids(ts) != 0) - elog(ERROR, "tidstore_num_tids on empty store returned non-zero"); + if (TidStoreNumTids(ts) != 0) + elog(ERROR, "TidStoreNumTids on empty store returned non-zero"); /* lookup test for empty store */ for (OffsetNumber off = FirstOffsetNumber ; off < MaxHeapTuplesPerPage; @@ -156,7 +156,7 @@ test_basic(int max_offset) check_tid(ts, MaxBlockNumber, off, false); } - tidstore_destroy(ts); + TidStoreDestroy(ts); #ifdef TEST_SHARED_TIDSTORE dsa_detach(dsa); @@ -177,36 +177,37 @@ test_empty(void) LWLockRegisterTranche(tranche_id, "test_tidstore"); dsa = dsa_create(tranche_id); - ts = tidstore_create(TEST_TIDSTORE_MAX_BYTES, MaxHeapTuplesPerPage, dsa); + ts = TidStoreCreate(TEST_TIDSTORE_MAX_BYTES, MaxHeapTuplesPerPage, dsa); #else - ts = tidstore_create(TEST_TIDSTORE_MAX_BYTES, MaxHeapTuplesPerPage, NULL); + ts = TidStoreCreate(TEST_TIDSTORE_MAX_BYTES, MaxHeapTuplesPerPage, NULL); #endif elog(NOTICE, "testing empty tidstore"); ItemPointerSet(&tid, 0, FirstOffsetNumber); - if (tidstore_lookup_tid(ts, &tid)) - elog(ERROR, "tidstore_lookup_tid for (0,1) on empty store returned true"); + if (TidStoreIsMember(ts, &tid)) + elog(ERROR, "TidStoreIsMember for TID (%u,%u) on empty store returned true", + 0, FirstOffsetNumber); ItemPointerSet(&tid, MaxBlockNumber, MaxOffsetNumber); - if (tidstore_lookup_tid(ts, &tid)) - elog(ERROR, 
"tidstore_lookup_tid for (%u,%u) on empty store returned true", + if (TidStoreIsMember(ts, &tid)) + elog(ERROR, "TidStoreIsMember for TID (%u,%u) on empty store returned true", MaxBlockNumber, MaxOffsetNumber); - if (tidstore_num_tids(ts) != 0) - elog(ERROR, "tidstore_num_entries on empty store returned non-zero"); + if (TidStoreNumTids(ts) != 0) + elog(ERROR, "TidStoreNumTids on empty store returned non-zero"); - if (tidstore_is_full(ts)) - elog(ERROR, "tidstore_is_full on empty store returned true"); + if (TidStoreIsFull(ts)) + elog(ERROR, "TidStoreIsFull on empty store returned true"); - iter = tidstore_begin_iterate(ts); + iter = TidStoreBeginIterate(ts); - if (tidstore_iterate_next(iter) != NULL) - elog(ERROR, "tidstore_iterate_next on empty store returned TIDs"); + if (TidStoreIterateNext(iter) != NULL) + elog(ERROR, "TidStoreIterateNext on empty store returned TIDs"); - tidstore_end_iterate(iter); + TidStoreEndIterate(iter); - tidstore_destroy(ts); + TidStoreDestroy(ts); #ifdef TEST_SHARED_TIDSTORE dsa_detach(dsa); @@ -221,6 +222,7 @@ test_tidstore(PG_FUNCTION_ARGS) elog(NOTICE, "testing basic operations"); test_basic(MaxHeapTuplesPerPage); test_basic(10); + test_basic(MaxHeapTuplesPerPage * 2); PG_RETURN_VOID(); } -- 2.31.1