From f6feccd38472107f1b538a8785bba252fb7a6d27 Mon Sep 17 00:00:00 2001
From: Peter Geoghegan
Date: Wed, 10 Sep 2025 16:54:50 -0400
Subject: [PATCH v20251012 3/3] Reduce malloc/free traffic by caching batches

Instead of immediately freeing a batch, stash it in a cache (a small
fixed-size array) for reuse by the same scan.

There's still room for improvement:

- Keeping some of the batch pieces (killedItems, itemsvisibility, ...)
  instead of freeing them in index_batch_release.

- Allocating only the space we actually need (both index_batch_alloc
  calls use MaxTIDsPerBTreePage, and thus always malloc, because of
  ALLOC_CHUNK_LIMIT).
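To make the policy easier to review in isolation, here is a minimal
standalone sketch of it (not the PostgreSQL code itself): Batch,
CACHE_SIZE and the driver in main() are simplified stand-ins for
IndexScanBatchData, batchesCacheSize and the calling scan code.
Allocation prefers any cached batch with sufficient capacity; release
fills an empty slot, else evicts a smaller cached batch, else frees.

    #include <stdio.h>
    #include <stdlib.h>

    #define CACHE_SIZE 16           /* mirrors batchesCacheSize */

    typedef struct Batch
    {
        int     maxitems;           /* capacity, as in IndexScanBatchData */
    } Batch;

    static Batch *cache[CACHE_SIZE];    /* NULL means an empty slot */

    /* Allocate a batch, preferring any cached batch with enough capacity. */
    static Batch *
    batch_alloc(int maxitems)
    {
        for (int i = 0; i < CACHE_SIZE; i++)
        {
            if (cache[i] && cache[i]->maxitems >= maxitems)
            {
                Batch  *b = cache[i];

                cache[i] = NULL;    /* slot is empty again */
                return b;
            }
        }

        /* cache miss: allocate a new batch with the requested capacity */
        Batch  *b = malloc(sizeof(Batch));

        b->maxitems = maxitems;
        return b;
    }

    /* Release a batch: fill an empty slot, else evict the smallest one. */
    static void
    batch_release(Batch *batch)
    {
        int     minitems = batch->maxitems;
        int     slot = -1;

        for (int i = 0; i < CACHE_SIZE; i++)
        {
            if (cache[i] == NULL)
            {
                cache[i] = batch;   /* empty slot, done */
                return;
            }

            if (cache[i]->maxitems < minitems)
            {
                minitems = cache[i]->maxitems;
                slot = i;
            }
        }

        if (slot >= 0)
        {
            /* evict a smaller cached batch in favor of this one */
            free(cache[slot]);
            cache[slot] = batch;
            return;
        }

        /* cache is full of batches at least this large, just free it */
        free(batch);
    }

    int
    main(void)
    {
        Batch  *a = batch_alloc(100);
        Batch  *b = batch_alloc(200);

        batch_release(a);
        batch_release(b);

        /* the cached 200-item batch satisfies a request for 150 */
        Batch  *c = batch_alloc(150);

        printf("reused batch with maxitems = %d\n", c->maxitems);

        batch_release(c);
        for (int i = 0; i < CACHE_SIZE; i++)
            free(cache[i]);     /* free(NULL) is a no-op */
        return 0;
    }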
---
 src/include/access/genam.h            |   3 +-
 src/include/access/relscan.h          |   5 +
 src/backend/access/index/indexam.c    | 173 +++++++++++++++++++++++---
 src/backend/access/nbtree/nbtree.c    |  15 +--
 src/backend/access/nbtree/nbtsearch.c |   4 +-
 5 files changed, 176 insertions(+), 24 deletions(-)

diff --git a/src/include/access/genam.h b/src/include/access/genam.h
index 767503bb6..913945c4b 100644
--- a/src/include/access/genam.h
+++ b/src/include/access/genam.h
@@ -230,7 +230,8 @@ extern void index_store_float8_orderby_distances(IndexScanDesc scan,
                                                  bool recheckOrderBy);
 extern bytea *index_opclass_options(Relation indrel, AttrNumber attnum,
                                     Datum attoptions, bool validate);
-extern IndexScanBatch index_batch_alloc(int maxitems, bool want_itup);
+extern IndexScanBatch index_batch_alloc(IndexScanDesc scan, int maxitems, bool want_itup);
+extern void index_batch_release(IndexScanDesc scan, IndexScanBatch batch);
 extern void index_batch_unlock(Relation rel, bool dropPin,
                                IndexScanBatch batch);
 
diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h
index 55495b0e3..6e511f1c1 100644
--- a/src/include/access/relscan.h
+++ b/src/include/access/relscan.h
@@ -187,6 +187,7 @@ typedef struct IndexScanBatchData
      */
     char       *itemsvisibility;    /* Index-only scan visibility cache */
 
+    int         maxitems;
     IndexScanBatchPosItem items[FLEXIBLE_ARRAY_MEMBER];
 } IndexScanBatchData;
 
@@ -266,6 +267,10 @@ typedef struct IndexScanBatchState
     int         firstBatch;     /* first used batch slot */
     int         nextBatch;      /* next empty batch slot */
 
+    /* small cache of unused batches, to reduce malloc/free traffic */
+    int         batchesCacheSize;
+    IndexScanBatchData **batchesCache;
+
     IndexScanBatchData **batches;
 
     /* callback to skip prefetching in IOS etc. */
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c
index cba512869..1892eedd5 100644
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -2197,6 +2197,10 @@ index_batch_init(IndexScanDesc scan)
     scan->batchState->firstBatch = 0;   /* first batch */
     scan->batchState->nextBatch = 0;    /* first batch is empty */
 
+    /* XXX init the cache of batches, capacity 16 is arbitrary */
+    scan->batchState->batchesCacheSize = 16;
+    scan->batchState->batchesCache = NULL;
+
     scan->batchState->batches
         = palloc(sizeof(IndexScanBatchData *) * scan->batchState->maxBatches);
 
@@ -2329,34 +2333,102 @@ static void
 index_batch_end(IndexScanDesc scan)
 {
     index_batch_reset(scan, true);
+
+    if (scan->batchState)
+    {
+        if (scan->batchState->batches)
+            pfree(scan->batchState->batches);
+
+        if (scan->batchState->batchesCache)
+        {
+            for (int i = 0; i < scan->batchState->batchesCacheSize; i++)
+            {
+                if (scan->batchState->batchesCache[i] == NULL)
+                    continue;
+
+                pfree(scan->batchState->batchesCache[i]);
+            }
+
+            pfree(scan->batchState->batchesCache);
+        }
+
+        pfree(scan->batchState);
+    }
 }
 
 /*
  * XXX Both index_batch_alloc() calls in btree use MaxTIDsPerBTreePage,
  * which seems unfortunate - it increases the allocation sizes, even if
- * the index would be fine with smaller arrays. This means all batches exceed
- * ALLOC_CHUNK_LIMIT, forcing a separate malloc (expensive).
+ * the index would be fine with smaller arrays. This means all batches
+ * exceed ALLOC_CHUNK_LIMIT, forcing a separate malloc (expensive). The
+ * cache helps for longer queries, not for queries that only create a
+ * single batch, etc.
  */
 IndexScanBatch
-index_batch_alloc(int maxitems, bool want_itup)
+index_batch_alloc(IndexScanDesc scan, int maxitems, bool want_itup)
 {
-    IndexScanBatch batch = palloc(offsetof(IndexScanBatchData, items) +
-                                  sizeof(IndexScanBatchPosItem) * maxitems);
+    IndexScanBatch batch = NULL;
 
+    /*
+     * First, try to find a batch in the cache.
+     *
+     * XXX We can get here with batchState==NULL for bitmapscans. Could that
+     * mean bitmapscans have issues with malloc/free on batches too? But the
+     * cache can't help with that while it's kept in batchState.
+     */
+    if ((scan->batchState != NULL) &&
+        (scan->batchState->batchesCache != NULL))
+    {
+        /*
+         * Look for a cached batch with a high enough maxitems.
+         *
+         * XXX Maybe we should look for the batch with the lowest sufficient
+         * maxitems? That should increase the probability of cache hits in
+         * the future.
+         */
+        for (int i = 0; i < scan->batchState->batchesCacheSize; i++)
+        {
+            if ((scan->batchState->batchesCache[i] != NULL) &&
+                (scan->batchState->batchesCache[i]->maxitems >= maxitems))
+            {
+                batch = scan->batchState->batchesCache[i];
+                scan->batchState->batchesCache[i] = NULL;
+                break;
+            }
+        }
+    }
+
+    /* found a batch in the cache? */
+    if (batch)
+    {
+        /* for IOS, we expect to already have the currTuples */
+        Assert(!(want_itup && (batch->currTuples == NULL)));
+
+        /* XXX maybe we could keep these allocations too */
+        Assert(batch->pos == NULL);
+        Assert(batch->itemsvisibility == NULL);
+    }
+    else
+    {
+        batch = palloc(offsetof(IndexScanBatchData, items) +
+                       sizeof(IndexScanBatchPosItem) * maxitems);
+
+        batch->maxitems = maxitems;
+
+        /*
+         * If we are doing an index-only scan, we need a tuple storage
+         * workspace. We allocate BLCKSZ for this, which should always give
+         * the index AM enough space to fit a full page's worth of tuples.
+         */
+        batch->currTuples = NULL;
+        if (want_itup)
+            batch->currTuples = palloc(BLCKSZ);
+    }
+
+    /* shared initialization */
     batch->firstItem = -1;
     batch->lastItem = -1;
     batch->killedItems = NULL;
     batch->numKilled = 0;
 
-    /*
-     * If we are doing an index-only scan, we need a tuple storage workspace.
-     * We allocate BLCKSZ for this, which should always give the index AM
-     * enough space to fit a full page's worth of tuples.
-     */
-    batch->currTuples = NULL;
-    if (want_itup)
-        batch->currTuples = palloc(BLCKSZ);
-
     batch->buf = InvalidBuffer;
     batch->pos = NULL;
     batch->itemsvisibility = NULL;  /* per-batch IOS visibility */
@@ -2397,3 +2469,76 @@ index_batch_unlock(Relation rel, bool dropPin, IndexScanBatch batch)
         ReleaseBuffer(batch->buf);
     batch->buf = InvalidBuffer; /* defensive */
 }
+
+/* add the batch to the cache, or free it */
+void
+index_batch_release(IndexScanDesc scan, IndexScanBatch batch)
+{
+    /*
+     * First free some of the allocations.
+     *
+     * XXX We could keep/reuse some of those.
+     */
+    if (batch->killedItems != NULL)
+    {
+        pfree(batch->killedItems);
+        batch->killedItems = NULL;
+    }
+
+    if (batch->itemsvisibility != NULL)
+    {
+        pfree(batch->itemsvisibility);
+        batch->itemsvisibility = NULL;
+    }
+
+    /* XXX a bit unclear what's released by the AM vs. by indexam */
+    Assert(batch->pos == NULL);
+
+    /*
+     * Try adding it to the cache - find a slot that's either empty or holds
+     * a batch with a lower maxitems value (and replace that batch).
+     *
+     * XXX maybe we should track the number of empty slots, and the minimum
+     * value of maxitems, so that we can skip pointless searches?
+     *
+     * XXX ignores cases with batchState=NULL (can we get here with bitmap
+     * scans?)
+     */
+    if (scan->batchState != NULL)
+    {
+        /* lowest maxitems we found in the cache (to replace with batch) */
+        int         maxitems = batch->maxitems;
+        int         slot = scan->batchState->batchesCacheSize;
+
+        /* first time through, initialize the cache */
+        if (scan->batchState->batchesCache == NULL)
+            scan->batchState->batchesCache
+                = palloc0_array(IndexScanBatch,
+                                scan->batchState->batchesCacheSize);
+
+        for (int i = 0; i < scan->batchState->batchesCacheSize; i++)
+        {
+            /* found an empty slot, we're done */
+            if (scan->batchState->batchesCache[i] == NULL)
+            {
+                scan->batchState->batchesCache[i] = batch;
+                return;
+            }
+
+            /* update the lowest maxitems? */
+            if (scan->batchState->batchesCache[i]->maxitems < maxitems)
+            {
+                maxitems = scan->batchState->batchesCache[i]->maxitems;
+                slot = i;
+            }
+        }
+
+        /* found a cached batch to replace? */
+        if (maxitems < batch->maxitems)
+        {
+            pfree(scan->batchState->batchesCache[slot]);
+            scan->batchState->batchesCache[slot] = batch;
+            return;
+        }
+    }
+
+    /* could not stash it in the cache, so just free it */
+    pfree(batch);
+}
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index fba562df8..18c734c4a 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -382,21 +382,22 @@ btfreebatch(IndexScanDesc scan, IndexScanBatch batch)
     if (batch->numKilled > 0)
         _bt_killitems(scan, batch);
 
-    if (batch->itemsvisibility)
-        pfree(batch->itemsvisibility);
-
-    if (batch->currTuples)
-        pfree(batch->currTuples);
-
     if (batch->pos)
     {
         if (!scan->batchState || !scan->batchState->dropPin)
             ReleaseBuffer(batch->buf);
 
         pfree(batch->pos);
+
+        /* XXX maybe should be done in index_batch_free? */
+        batch->buf = InvalidBuffer;
+        batch->pos = NULL;
     }
 
-    pfree(batch);
+    /* XXX keep itemsvisibility, killedItems and currTuples */
+
+    /* free the batch (or cache it for reuse) */
+    index_batch_release(scan, batch);
 }
 
 /*
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 6211e9548..0e9f8bda2 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -1200,7 +1200,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
     }
 
     /* Allocate space for first batch */
-    firstbatch = index_batch_alloc(MaxTIDsPerBTreePage, scan->xs_want_itup);
+    firstbatch = index_batch_alloc(scan, MaxTIDsPerBTreePage, scan->xs_want_itup);
     firstbatch->pos = palloc(sizeof(BTScanPosData));
 
     /*
@@ -2237,7 +2237,7 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
     BTScanPos   newpos;
 
     /* Allocate space for next batch */
-    newbatch = index_batch_alloc(MaxTIDsPerBTreePage, scan->xs_want_itup);
+    newbatch = index_batch_alloc(scan, MaxTIDsPerBTreePage, scan->xs_want_itup);
     newbatch->pos = palloc(sizeof(BTScanPosData));
 
     newpos = newbatch->pos;
-- 
2.51.0