diff --git a/contrib/test_freepage/Makefile b/contrib/test_freepage/Makefile new file mode 100644 index 0000000..b482fe9 --- /dev/null +++ b/contrib/test_freepage/Makefile @@ -0,0 +1,17 @@ +# contrib/test_freepage/Makefile + +MODULES = test_freepage + +EXTENSION = test_freepage +DATA = test_freepage--1.0.sql + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = contrib/test_freepage +top_builddir = ../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/contrib/test_freepage/test_freepage--1.0.sql b/contrib/test_freepage/test_freepage--1.0.sql new file mode 100644 index 0000000..5d3191e --- /dev/null +++ b/contrib/test_freepage/test_freepage--1.0.sql @@ -0,0 +1,15 @@ +/* contrib/test_freepage/test_freepage--1.0.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION test_freepage" to load this file. \quit + +CREATE FUNCTION init(size pg_catalog.int8) RETURNS pg_catalog.void + AS 'MODULE_PATHNAME' LANGUAGE C STRICT; +CREATE FUNCTION get(pages pg_catalog.int8) RETURNS pg_catalog.int8 + AS 'MODULE_PATHNAME' LANGUAGE C STRICT; +CREATE FUNCTION inquire_largest() RETURNS pg_catalog.int8 + AS 'MODULE_PATHNAME' LANGUAGE C STRICT; +CREATE FUNCTION put(first_page pg_catalog.int8, npages pg_catalog.int8) + RETURNS pg_catalog.void AS 'MODULE_PATHNAME' LANGUAGE C STRICT; +CREATE FUNCTION dump() RETURNS pg_catalog.text + AS 'MODULE_PATHNAME' LANGUAGE C STRICT; diff --git a/contrib/test_freepage/test_freepage.c b/contrib/test_freepage/test_freepage.c new file mode 100644 index 0000000..074cf56 --- /dev/null +++ b/contrib/test_freepage/test_freepage.c @@ -0,0 +1,113 @@ +/*-------------------------------------------------------------------------- + * + * test_freepage.c + * Test harness code for free page manager. + * + * Copyright (C) 2013, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/test_freepage/test_freepage.c + * + * ------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "fmgr.h" +#include "miscadmin.h" +#include "utils/builtins.h" +#include "utils/freepage.h" + +PG_MODULE_MAGIC; +PG_FUNCTION_INFO_V1(init); +PG_FUNCTION_INFO_V1(get); +PG_FUNCTION_INFO_V1(inquire_largest); +PG_FUNCTION_INFO_V1(put); +PG_FUNCTION_INFO_V1(dump); + +Datum init(PG_FUNCTION_ARGS); +Datum get(PG_FUNCTION_ARGS); +Datum inquire_largest(PG_FUNCTION_ARGS); +Datum put(PG_FUNCTION_ARGS); +Datum dump(PG_FUNCTION_ARGS); + +char *space; +FreePageManager *fpm; + +Datum +init(PG_FUNCTION_ARGS) +{ + int64 size = PG_GETARG_INT64(0); + Size first_usable_page; + Size total_pages; + + if (size <= 0 || size % FPM_PAGE_SIZE != 0) + elog(ERROR, "bad size"); + + if (space != NULL) + { + free(space); + space = NULL; + fpm = NULL; + } + + space = malloc(size); + if (space == NULL) + elog(ERROR, "malloc failed: %m"); + + fpm = (FreePageManager *) space; + FreePageManagerInitialize(fpm, space, NULL, false); + + first_usable_page = sizeof(FreePageManager) / FPM_PAGE_SIZE + + (sizeof(FreePageManager) % FPM_PAGE_SIZE == 0 ? 
0 : 1); + total_pages = size / FPM_PAGE_SIZE; + + FreePageManagerPut(fpm, first_usable_page, + total_pages - first_usable_page); + + PG_RETURN_VOID(); +} + +Datum +get(PG_FUNCTION_ARGS) +{ + int64 npages = PG_GETARG_INT64(0); + Size first_page; + + if (fpm == NULL) + PG_RETURN_NULL(); + + if (!FreePageManagerGet(fpm, npages, &first_page)) + PG_RETURN_NULL(); + + PG_RETURN_INT64(first_page); +} + +Datum +inquire_largest(PG_FUNCTION_ARGS) +{ + if (fpm == NULL) + PG_RETURN_NULL(); + + PG_RETURN_INT64(FreePageManagerInquireLargest(fpm)); +} + +Datum +put(PG_FUNCTION_ARGS) +{ + int64 first_page = PG_GETARG_INT64(0); + int64 npages = PG_GETARG_INT64(1); + + FreePageManagerPut(fpm, first_page, npages); + + PG_RETURN_VOID(); +} + +Datum +dump(PG_FUNCTION_ARGS) +{ + if (fpm == NULL) + PG_RETURN_NULL(); + + PG_RETURN_TEXT_P(cstring_to_text(FreePageManagerDump(fpm))); +} diff --git a/contrib/test_freepage/test_freepage.control b/contrib/test_freepage/test_freepage.control new file mode 100644 index 0000000..fca4cd9 --- /dev/null +++ b/contrib/test_freepage/test_freepage.control @@ -0,0 +1,4 @@ +comment = 'Test code for shared memory message queues' +default_version = '1.0' +module_pathname = '$libdir/test_freepage' +relocatable = true diff --git a/contrib/test_sballoc/Makefile b/contrib/test_sballoc/Makefile new file mode 100644 index 0000000..880bccb --- /dev/null +++ b/contrib/test_sballoc/Makefile @@ -0,0 +1,17 @@ +# contrib/test_sballoc/Makefile + +MODULES = test_sballoc + +EXTENSION = test_sballoc +DATA = test_sballoc--1.0.sql + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = contrib/test_sballoc +top_builddir = ../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/contrib/test_sballoc/test_sballoc--1.0.sql b/contrib/test_sballoc/test_sballoc--1.0.sql new file mode 100644 index 0000000..1cf8a5a --- /dev/null +++ b/contrib/test_sballoc/test_sballoc--1.0.sql @@ -0,0 +1,20 @@ +/* contrib/test_sballoc/test_sballoc--1.0.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION test_sballoc" to load this file. \quit + +CREATE FUNCTION alloc(size pg_catalog.int8, count pg_catalog.int8) + RETURNS pg_catalog.void + AS 'MODULE_PATHNAME' LANGUAGE C STRICT; + +CREATE FUNCTION alloc_with_palloc(size pg_catalog.int8, count pg_catalog.int8) + RETURNS pg_catalog.void + AS 'MODULE_PATHNAME' LANGUAGE C STRICT; + +CREATE FUNCTION alloc_list(size pg_catalog.int8, count pg_catalog.int8) + RETURNS pg_catalog.void + AS 'MODULE_PATHNAME' LANGUAGE C STRICT; + +CREATE FUNCTION alloc_list_with_palloc(size pg_catalog.int8, count pg_catalog.int8) + RETURNS pg_catalog.void + AS 'MODULE_PATHNAME' LANGUAGE C STRICT; diff --git a/contrib/test_sballoc/test_sballoc.c b/contrib/test_sballoc/test_sballoc.c new file mode 100644 index 0000000..38c03da --- /dev/null +++ b/contrib/test_sballoc/test_sballoc.c @@ -0,0 +1,144 @@ +/*-------------------------------------------------------------------------- + * + * test_sballoc.c + * Test harness code for superblock allocator. 
+ * + * Copyright (C) 2013, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/test_sballoc/test_sballoc.c + * + * ------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "fmgr.h" +#include "utils/memutils.h" +#include "utils/sb_alloc.h" +#include "utils/sb_region.h" + +typedef struct llnode +{ + struct llnode *next; +} llnode; + +PG_MODULE_MAGIC; +PG_FUNCTION_INFO_V1(alloc); +PG_FUNCTION_INFO_V1(alloc_with_palloc); +PG_FUNCTION_INFO_V1(alloc_list); +PG_FUNCTION_INFO_V1(alloc_list_with_palloc); + +Datum +alloc(PG_FUNCTION_ARGS) +{ + int64 size = PG_GETARG_INT64(0); + int64 count = PG_GETARG_INT64(1); + int64 i; + int64 *p; + sb_allocator *a; + + a = sb_create_private_allocator(); + for (i = 0; i < count; ++i) + { + p = sb_alloc(a, size, 0); + *p = i; + } + sb_reset_allocator(a); + sb_dump_regions(); + + PG_RETURN_VOID(); +} + +Datum +alloc_with_palloc(PG_FUNCTION_ARGS) +{ + int64 size = PG_GETARG_INT64(0); + int64 count = PG_GETARG_INT64(1); + int64 i; + int64 *p; + MemoryContext context; + + context = AllocSetContextCreate(CurrentMemoryContext, + "alloc_with_palloc test", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + for (i = 0; i < count; ++i) + { + p = MemoryContextAlloc(context, size); + *p = i; + } + MemoryContextStats(context); + MemoryContextDelete(context); + + PG_RETURN_VOID(); +} + +Datum +alloc_list(PG_FUNCTION_ARGS) +{ + int64 size = PG_GETARG_INT64(0); + int64 count = PG_GETARG_INT64(1); + int64 i; + llnode *h = NULL; + llnode *p; + sb_allocator *a; + + if (size < sizeof(llnode)) + elog(ERROR, "size too small"); + + a = sb_create_private_allocator(); + for (i = 0; i < count; ++i) + { + p = sb_alloc(a, size, 0); + p->next = h; + h = p; + } + while (h != NULL) + { + p = h->next; + sb_free(h); + h = p; + } + sb_dump_regions(); + sb_reset_allocator(a); + + PG_RETURN_VOID(); +} + +Datum +alloc_list_with_palloc(PG_FUNCTION_ARGS) +{ + int64 size = PG_GETARG_INT64(0); + int64 count = PG_GETARG_INT64(1); + int64 i; + llnode *h = NULL; + llnode *p; + MemoryContext context; + + if (size < sizeof(llnode)) + elog(ERROR, "size too small"); + + context = AllocSetContextCreate(CurrentMemoryContext, + "alloc_list_with_palloc test", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + for (i = 0; i < count; ++i) + { + p = MemoryContextAlloc(context, size); + p->next = h; + h = p; + } + while (h != NULL) + { + p = h->next; + pfree(h); + h = p; + } + MemoryContextStats(context); + MemoryContextDelete(context); + + PG_RETURN_VOID(); +} diff --git a/contrib/test_sballoc/test_sballoc.control b/contrib/test_sballoc/test_sballoc.control new file mode 100644 index 0000000..58f61c0 --- /dev/null +++ b/contrib/test_sballoc/test_sballoc.control @@ -0,0 +1,4 @@ +comment = 'Test code for shared memory message queues' +default_version = '1.0' +module_pathname = '$libdir/test_sballoc' +relocatable = true diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index 9ddc275..e034593 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -72,6 +72,7 @@ #include "storage/smgr.h" #include "tcop/tcopprot.h" #include "utils/rel.h" +#include "utils/sb_alloc.h" #include "utils/tuplesort.h" @@ -768,7 +769,7 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) { _bt_buildadd(wstate, state, itup); if (should_free) - pfree(itup); + sb_free(itup); itup = 
tuplesort_getindextuple(btspool->sortstate, true, &should_free); } @@ -776,7 +777,7 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) { _bt_buildadd(wstate, state, itup2); if (should_free2) - pfree(itup2); + sb_free(itup2); itup2 = tuplesort_getindextuple(btspool2->sortstate, true, &should_free2); } @@ -795,7 +796,7 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) _bt_buildadd(wstate, state, itup); if (should_free) - pfree(itup); + sb_free(itup); } } diff --git a/src/backend/utils/mmgr/Makefile b/src/backend/utils/mmgr/Makefile index b2403e1..c318a73 100644 --- a/src/backend/utils/mmgr/Makefile +++ b/src/backend/utils/mmgr/Makefile @@ -12,6 +12,6 @@ subdir = src/backend/utils/mmgr top_builddir = ../../../.. include $(top_builddir)/src/Makefile.global -OBJS = aset.o mcxt.o portalmem.o +OBJS = aset.o freepage.o mcxt.o portalmem.o sb_alloc.o sb_map.o sb_region.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/utils/mmgr/freepage.c b/src/backend/utils/mmgr/freepage.c new file mode 100644 index 0000000..0fdd758 --- /dev/null +++ b/src/backend/utils/mmgr/freepage.c @@ -0,0 +1,1778 @@ +/*------------------------------------------------------------------------- + * + * freepage.c + * Management of free memory pages. + * + * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/utils/mmgr/freepage.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" +#include "lib/stringinfo.h" +#include "miscadmin.h" +#include "utils/sb_region.h" + +/* Magic numbers to identify various page types */ +#define FREE_PAGE_SPAN_LEADER_MAGIC 0xea4020f0 +#define FREE_PAGE_LEAF_MAGIC 0x98eae728 +#define FREE_PAGE_INTERNAL_MAGIC 0x19aa32c9 + +/* Doubly linked list of spans of free pages; stored in first page of span. */ +struct FreePageSpanLeader +{ + int magic; /* always FREE_PAGE_SPAN_LEADER_MAGIC */ + Size npages; /* number of pages in span */ + relptr(FreePageSpanLeader) prev; + relptr(FreePageSpanLeader) next; +}; + +/* Common header for btree leaf and internal pages. */ +typedef struct FreePageBtreeHeader +{ + int magic; /* FREE_PAGE_LEAF_MAGIC or FREE_PAGE_INTERNAL_MAGIC */ + Size nused; /* number of items used */ + relptr(FreePageBtree) parent; /* uplink */ +} FreePageBtreeHeader; + +/* Internal key; points to next level of btree. */ +typedef struct FreePageBtreeInternalKey +{ + Size first_page; /* low bound for keys on child page */ + relptr(FreePageBtree) child; /* downlink */ +} FreePageBtreeInternalKey; + +/* Leaf key; no payload data. */ +typedef struct FreePageBtreeLeafKey +{ + Size first_page; /* first page in span */ + Size npages; /* number of pages in span */ +} FreePageBtreeLeafKey; + +/* Work out how many keys will fit on a page. 
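(As a rough worked example: assuming 4kB FPM pages and a typical 64-bit build, the header above is 24 bytes and both key types are 16 bytes, giving about (4096 - 24) / 16 = 254 keys per btree page of either kind.)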
*/ +#define FPM_ITEMS_PER_INTERNAL_PAGE \ + ((FPM_PAGE_SIZE - sizeof(FreePageBtreeHeader)) / \ + sizeof(FreePageBtreeInternalKey)) +#define FPM_ITEMS_PER_LEAF_PAGE \ + ((FPM_PAGE_SIZE - sizeof(FreePageBtreeHeader)) / \ + sizeof(FreePageBtreeLeafKey)) + +/* A btree page of either sort */ +struct FreePageBtree +{ + FreePageBtreeHeader hdr; + union + { + FreePageBtreeInternalKey internal_key[FPM_ITEMS_PER_INTERNAL_PAGE]; + FreePageBtreeLeafKey leaf_key[FPM_ITEMS_PER_LEAF_PAGE]; + } u; +}; + +/* Results of a btree search */ +typedef struct FreePageBtreeSearchResult +{ + FreePageBtree *page; + Size index; + bool found; + unsigned split_pages; +} FreePageBtreeSearchResult; + +/* Helper functions */ +static void FreePageBtreeAdjustAncestorKeys(FreePageManager *fpm, + FreePageBtree *btp); +static Size FreePageBtreeCleanup(FreePageManager *fpm); +static FreePageBtree *FreePageBtreeFindLeftSibling(char *base, + FreePageBtree *btp); +static FreePageBtree *FreePageBtreeFindRightSibling(char *base, + FreePageBtree *btp); +static Size FreePageBtreeFirstKey(FreePageBtree *btp); +static FreePageBtree *FreePageBtreeGetRecycled(FreePageManager *fpm); +static void FreePageBtreeInsertInternal(char *base, FreePageBtree *btp, + Size index, Size first_page, FreePageBtree *child); +static void FreePageBtreeInsertLeaf(FreePageBtree *btp, Size index, + Size first_page, Size npages); +static void FreePageBtreeRecycle(FreePageManager *fpm, Size pageno); +static void FreePageBtreeRemove(FreePageManager *fpm, FreePageBtree *btp, + Size index); +static void FreePageBtreeRemovePage(FreePageManager *fpm, FreePageBtree *btp); +static void FreePageBtreeSearch(FreePageManager *fpm, Size first_page, + FreePageBtreeSearchResult *result); +static Size FreePageBtreeSearchInternal(FreePageBtree *btp, Size first_page); +static Size FreePageBtreeSearchLeaf(FreePageBtree *btp, Size first_page); +static FreePageBtree *FreePageBtreeSplitPage(FreePageManager *fpm, + FreePageBtree *btp); +static void FreePageBtreeUpdateParentPointers(char *base, FreePageBtree *btp); +static void FreePageManagerDumpBtree(FreePageManager *fpm, FreePageBtree *btp, + FreePageBtree *parent, int level, StringInfo buf); +static void FreePageManagerDumpSpans(FreePageManager *fpm, + FreePageSpanLeader *span, Size expected_pages, + StringInfo buf); +static bool FreePageManagerGetInternal(FreePageManager *fpm, Size npages, + Size *first_page); +static Size FreePageManagerPutInternal(FreePageManager *fpm, Size first_page, + Size npages, bool soft); +static void FreePagePopSpanLeader(FreePageManager *fpm, Size pageno); +static void FreePagePushSpanLeader(FreePageManager *fpm, Size first_page, + Size npages); + +/* + * Initialize a new, empty free page manager. + * + * 'fpm' should reference caller-provided memory large enough to contain a + * FreePageManager. We'll initialize it here. + * + * 'base' is the address to which all pointers are relative. When managing + * a dynamic shared memory segment, it should normally be the base of the + * segment. When managing backend-private memory, it can be either NULL or, + * if managing a single contiguous extent of memory, the start of that extent. + * + * 'lock' is the lock to be used to synchronize access to this FreePageManager. + * It can be NULL if synchronization is not required, either because we're + * managing backend-private memory or because we're managing shared memory but + * synchronization is caller-provided or not required. 
(For example, if only + * one process is allocating and freeing memory, locking isn't needed.) + * + * 'lock_address_is_fixed' should be false if the LWLock to be used for + * synchronization is stored in the same dynamic shared memory segment as + * the managed region, and true if it is stored in the main shared memory + * segment. Storing the LWLock in some other dynamic shared memory segment + * isn't supported. This is ignored when lock is NULL. + */ +void +FreePageManagerInitialize(FreePageManager *fpm, char *base, LWLock *lock, + bool lock_address_is_fixed) +{ + Size f; + + relptr_store(base, fpm->self, fpm); + relptr_store(base, fpm->lock, lock); + fpm->lock_address_is_fixed = lock_address_is_fixed; + relptr_store(base, fpm->btree_root, (FreePageBtree *) NULL); + relptr_store(base, fpm->btree_recycle, (FreePageSpanLeader *) NULL); + fpm->btree_depth = 0; + fpm->btree_recycle_count = 0; + fpm->singleton_first_page = 0; + fpm->singleton_npages = 0; + fpm->largest_reported_chunk = 0; + + for (f = 0; f < FPM_NUM_FREELISTS; f++) + relptr_store(base, fpm->freelist[f], (FreePageSpanLeader *) NULL); +} + +/* + * Allocate a run of pages of the given length from the free page manager. + * The return value indicates whether we were able to satisfy the request; + * if true, the first page of the allocation is stored in *first_page. + */ +bool +FreePageManagerGet(FreePageManager *fpm, Size npages, Size *first_page) +{ + LWLock *lock = fpm_lock(fpm); + bool result; + Size contiguous_pages; + + if (lock != NULL) + LWLockAcquire(lock, LW_EXCLUSIVE); + result = FreePageManagerGetInternal(fpm, npages, first_page); + + /* + * It's a bit counterintuitive, but allocating pages can actually create + * opportunities for cleanup that create larger ranges. We might pull + * a key out of the btree that enables the item at the head of the btree + * recycle list to be inserted; and then if there are more items behind it + * one of those might cause two currently-separated ranges to merge, + * creating a single range of contiguous pages larger than any that existed + * previously. It might be worth trying to improve the cleanup algorithm + * to avoid such corner cases, but for now we just notice the condition + * and do the appropriate reporting. + * + * Reporting is only needed for backend-private regions, so we can skip + * it when locking is in use, or if we discover that the region has an + * associated dynamic shared memory segment. + */ + contiguous_pages = FreePageBtreeCleanup(fpm); + if (lock == NULL && contiguous_pages > fpm->largest_reported_chunk) + { + sb_region *region = sb_lookup_region(fpm); + + if (region != NULL && region->seg == NULL) + { + sb_report_contiguous_freespace(region, contiguous_pages); + fpm->largest_reported_chunk = contiguous_pages; + } + else + { + /* There's no containing region, so try to avoid future work. */ + fpm->largest_reported_chunk = (Size) -1; + } + } + + if (lock != NULL) + LWLockRelease(lock); + + return result; +} + +/* + * Return the size of the largest run of pages that the user could + * succesfully get. (If this value subsequently increases, it will trigger + * a callback to sb_report_contiguous_freespace.) 
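For illustration, a minimal backend-private usage sketch in the style of the test_freepage module earlier in this patch (hypothetical code, not part of the patch; error handling omitted, and utils/freepage.h is assumed to be included):

static void
fpm_example(void)
{
	Size	npages = 64;
	char   *space = malloc(npages * FPM_PAGE_SIZE);
	FreePageManager *fpm = (FreePageManager *) space;
	Size	metadata_pages;
	Size	first_page;

	/* No lock: this extent is private to the current backend. */
	FreePageManagerInitialize(fpm, space, NULL, false);

	/* Pages holding the manager's own header cannot be given out. */
	metadata_pages = (sizeof(FreePageManager) + FPM_PAGE_SIZE - 1)
		/ FPM_PAGE_SIZE;

	/* Donate the remaining pages to the manager... */
	FreePageManagerPut(fpm, metadata_pages, npages - metadata_pages);

	/* ...check the largest available run, then grab a 4-page run. */
	elog(LOG, "largest run: %zu pages",
		 FreePageManagerInquireLargest(fpm));
	if (FreePageManagerGet(fpm, 4, &first_page))
		elog(LOG, "got pages %zu..%zu", first_page, first_page + 3);
}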
+ */ +Size +FreePageManagerInquireLargest(FreePageManager *fpm) +{ + LWLock *lock = fpm_lock(fpm); + char *base = fpm_segment_base(fpm); + Size largest = 0; + + if (lock != NULL) + LWLockAcquire(lock, LW_EXCLUSIVE); + + if (!relptr_is_null(fpm->freelist[FPM_NUM_FREELISTS - 1])) + { + FreePageSpanLeader *candidate; + + candidate = relptr_access(base, fpm->freelist[FPM_NUM_FREELISTS - 1]); + do + { + if (candidate->npages > largest) + largest = candidate->npages; + candidate = relptr_access(base, candidate->next); + } while (candidate != NULL); + } + else + { + Size f = FPM_NUM_FREELISTS - 1; + + do + { + --f; + if (!relptr_is_null(fpm->freelist[f])) + largest = f + 1; + } while (f > 0); + } + + fpm->largest_reported_chunk = largest; + + if (lock != NULL) + LWLockRelease(lock); + + return largest; +} + +/* + * Transfer a run of pages to the free page manager. (If the number of + * contiguous pages now available is larger than it was previously, then + * we attempt to report this to the sb_region module.) + */ +void +FreePageManagerPut(FreePageManager *fpm, Size first_page, Size npages) +{ + LWLock *lock = fpm_lock(fpm); + Size contiguous_pages; + Assert(npages > 0); + + /* Acquire lock (if there is one). */ + if (lock != NULL) + LWLockAcquire(lock, LW_EXCLUSIVE); + + /* Record the new pages. */ + contiguous_pages = + FreePageManagerPutInternal(fpm, first_page, npages, false); + + /* + * If the new range we inserted into the page manager was contiguous + * with an existing range, it may have opened up cleanup opportunities. + */ + if (contiguous_pages > npages) + { + Size cleanup_contiguous_pages; + + cleanup_contiguous_pages = FreePageBtreeCleanup(fpm); + if (cleanup_contiguous_pages > contiguous_pages) + contiguous_pages = cleanup_contiguous_pages; + } + + /* + * If we now have more contiguous pages available than previously + * reported, attempt to notify sb_region system. + * + * Reporting is only needed for backend-private regions, so we can skip + * it when locking is in use, or if we discover that the region has an + * associated dynamic shared memory segment. + */ + if (lock == NULL && contiguous_pages > fpm->largest_reported_chunk) + { + sb_region *region = sb_lookup_region(fpm); + + if (region != NULL && region->seg == NULL) + { + fpm->largest_reported_chunk = contiguous_pages; + sb_report_contiguous_freespace(region, contiguous_pages); + } + else + { + /* There's no containing region, so try to avoid future work. */ + fpm->largest_reported_chunk = (Size) -1; + } + } + + /* Release lock (if there is one). */ + if (lock != NULL) + LWLockRelease(lock); +} + +/* + * Produce a debugging dump of the state of a free page manager. + */ +char * +FreePageManagerDump(FreePageManager *fpm) +{ + LWLock *lock = fpm_lock(fpm); + char *base = fpm_segment_base(fpm); + StringInfoData buf; + FreePageSpanLeader *recycle; + bool dumped_any_freelist = false; + Size f; + + /* Initialize output buffer. */ + initStringInfo(&buf); + + /* Acquire lock (if there is one). */ + if (lock != NULL) + LWLockAcquire(lock, LW_SHARED); + + /* Dump general stuff. */ + appendStringInfo(&buf, "metadata: self %zu lock %zu fixed %c\n", + fpm->self.relptr_off, fpm->lock.relptr_off, + fpm->lock_address_is_fixed ? 't' : 'f'); + + /* Dump btree. 
*/ + if (fpm->btree_depth > 0) + { + FreePageBtree *root; + + appendStringInfo(&buf, "btree depth %u:\n", fpm->btree_depth); + root = relptr_access(base, fpm->btree_root); + FreePageManagerDumpBtree(fpm, root, NULL, 0, &buf); + } + else if (fpm->singleton_npages > 0) + { + appendStringInfo(&buf, "singleton: %zu(%zu)\n", + fpm->singleton_first_page, fpm->singleton_npages); + } + + /* Dump btree recycle list. */ + recycle = relptr_access(base, fpm->btree_recycle); + if (recycle != NULL) + { + appendStringInfo(&buf, "btree recycle:"); + FreePageManagerDumpSpans(fpm, recycle, 1, &buf); + } + + /* Dump free lists. */ + for (f = 0; f < FPM_NUM_FREELISTS; ++f) + { + FreePageSpanLeader *span; + + if (relptr_is_null(fpm->freelist[f])) + continue; + if (!dumped_any_freelist) + { + appendStringInfo(&buf, "freelists:\n"); + dumped_any_freelist = true; + } + appendStringInfo(&buf, " %zu:", f + 1); + span = relptr_access(base, fpm->freelist[f]); + FreePageManagerDumpSpans(fpm, span, f + 1, &buf); + } + + /* Release lock (if there is one). */ + if (lock != NULL) + LWLockRelease(lock); + + /* And return result to caller. */ + return buf.data; +} + + +/* + * The first_page value stored at index zero in any non-root page must match + * the first_page value stored in its parent at the index which points to that + * page. So when the value stored at index zero in a btree page changes, we've + * got to walk up the tree adjusting ancestor keys until we reach an ancestor + * where that key isn't index zero. This function should be called after + * updating the first key on the target page; it will propagate the change + * upward as far as needed. + * + * We assume here that the first key on the page has not changed enough to + * require changes in the ordering of keys on its ancestor pages. Thus, + * if we search the parent page for the first key greater than or equal to + * the first key on the current page, the downlink to this page will be either + * the exact index returned by the search (if the first key decreased) + * or one less (if the first key increased). + */ +static void +FreePageBtreeAdjustAncestorKeys(FreePageManager *fpm, FreePageBtree *btp) +{ + char *base = fpm_segment_base(fpm); + Size first_page; + FreePageBtree *parent; + FreePageBtree *child; + + /* This might be either a leaf or an internal page. */ + Assert(btp->hdr.nused > 0); + if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC) + { + Assert(btp->hdr.nused <= FPM_ITEMS_PER_LEAF_PAGE); + first_page = btp->u.leaf_key[0].first_page; + } + else + { + Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC); + Assert(btp->hdr.nused <= FPM_ITEMS_PER_INTERNAL_PAGE); + first_page = btp->u.internal_key[0].first_page; + } + child = btp; + + /* Loop until we find an ancestor that does not require adjustment. */ + for (;;) + { + Size s; + + parent = relptr_access(base, child->hdr.parent); + if (parent == NULL) + break; + s = FreePageBtreeSearchInternal(parent, first_page); + + /* Key is either at index s or index s-1; figure out which. */ + if (s >= parent->hdr.nused) + { + Assert(s == parent->hdr.nused); + --s; + } + else + { + FreePageBtree *check; + + check = relptr_access(base, parent->u.internal_key[s].child); + if (check != child) + { + Assert(s > 0); + --s; + } + } + +#ifdef USE_ASSERT_CHECKING + /* Debugging double-check. */ + if (assert_enabled) + { + FreePageBtree *check; + + check = relptr_access(base, parent->u.internal_key[s].child); + Assert(s < parent->hdr.nused); + Assert(child == check); + } +#endif + + /* Update the parent key. 
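(For example, if a leaf's first key just changed from page 192 to page 188 and that leaf is its parent's first child, the parent's copy of the key must likewise become 188; and if the parent is itself a first child, the same adjustment continues one more level up.)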
*/ + parent->u.internal_key[s].first_page = first_page; + + /* + * If this is the first key in the parent, go up another level; + * else done. + */ + if (s > 0) + break; + child = parent; + } +} + +/* + * Attempt to reclaim space from the free-page btree. The return value is + * the largest range of contiguous pages created by the cleanup operation. + */ +static Size +FreePageBtreeCleanup(FreePageManager *fpm) +{ + char *base = fpm_segment_base(fpm); + Size max_contiguous_pages = 0; + + /* Attempt to shrink the depth of the btree. */ + while (!relptr_is_null(fpm->btree_root)) + { + FreePageBtree *root = relptr_access(base, fpm->btree_root); + + /* If the root contains only one key, reduce depth by one. */ + if (root->hdr.nused == 1) + { + /* Shrink depth of tree by one. */ + Assert(fpm->btree_depth > 0); + --fpm->btree_depth; + if (root->hdr.magic == FREE_PAGE_LEAF_MAGIC) + { + /* If root is a leaf, convert only entry to singleton range. */ + relptr_store(base, fpm->btree_root, (FreePageBtree *) NULL); + fpm->singleton_first_page = root->u.leaf_key[0].first_page; + fpm->singleton_npages = root->u.leaf_key[0].npages; + } + else + { + FreePageBtree *newroot; + + /* If root is an internal page, make only child the root. */ + Assert(root->hdr.magic == FREE_PAGE_INTERNAL_MAGIC); + relptr_copy(fpm->btree_root, root->u.internal_key[0].child); + newroot = relptr_access(base, fpm->btree_root); + relptr_store(base, newroot->hdr.parent, (FreePageBtree *) NULL); + } + FreePageBtreeRecycle(fpm, fpm_pointer_to_page(base, root)); + } + else if (root->hdr.nused == 2 && + root->hdr.magic == FREE_PAGE_LEAF_MAGIC) + { + Size end_of_first; + Size start_of_second; + + end_of_first = root->u.leaf_key[0].first_page + + root->u.leaf_key[0].npages; + start_of_second = root->u.leaf_key[1].first_page; + + if (end_of_first + 1 == start_of_second) + { + Size root_page = fpm_pointer_to_page(base, root); + + if (end_of_first == root_page) + { + FreePagePopSpanLeader(fpm, root->u.leaf_key[0].first_page); + FreePagePopSpanLeader(fpm, root->u.leaf_key[1].first_page); + fpm->singleton_first_page = root->u.leaf_key[0].first_page; + fpm->singleton_npages = root->u.leaf_key[0].npages + + root->u.leaf_key[1].npages + 1; + fpm->btree_depth = 0; + relptr_store(base, fpm->btree_root, + (FreePageBtree *) NULL); + FreePagePushSpanLeader(fpm, fpm->singleton_first_page, + fpm->singleton_npages); + Assert(max_contiguous_pages == 0); + max_contiguous_pages = fpm->singleton_npages; + } + } + + /* Whether it worked or not, it's time to stop. */ + break; + } + else + { + /* Nothing more to do. Stop. */ + break; + } + } + + /* + * Attempt to free recycled btree pages. We skip this if releasing + * the recycled page would require a btree page split, because the page + * we're trying to recycle would be consumed by the split, which would + * be counterproductive. + * + * We also currently only ever attempt to recycle the first page on the + * list; that could be made more aggressive, but it's not clear that the + * complexity would be worthwhile. 
+ */ + while (fpm->btree_recycle_count > 0) + { + FreePageBtree *btp; + Size first_page; + Size contiguous_pages; + + btp = FreePageBtreeGetRecycled(fpm); + first_page = fpm_pointer_to_page(base, btp); + contiguous_pages = FreePageManagerPutInternal(fpm, first_page, 1, true); + if (contiguous_pages == 0) + { + FreePageBtreeRecycle(fpm, first_page); + break; + } + else + { + if (contiguous_pages > max_contiguous_pages) + max_contiguous_pages = contiguous_pages; + } + } + + return max_contiguous_pages; +} + +/* + * Consider consolidating the given page with its left or right sibling, + * if it's fairly empty. + */ +static void +FreePageBtreeConsolidate(FreePageManager *fpm, FreePageBtree *btp) +{ + char *base = fpm_segment_base(fpm); + FreePageBtree *np; + Size max; + + /* + * We only try to consolidate pages that are less than a third full. + * We could be more aggressive about this, but that might risk performing + * consolidation only to end up splitting again shortly thereafter. Since + * the btree should be very small compared to the space under management, + * our goal isn't so much to ensure that it always occupies the absolutely + * smallest possible number of pages as to reclaim pages before things get + * too egregiously out of hand. + */ + if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC) + max = FPM_ITEMS_PER_LEAF_PAGE; + else + { + Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC); + max = FPM_ITEMS_PER_INTERNAL_PAGE; + } + if (btp->hdr.nused >= max / 3) + return; + + /* + * If we can fit our right sibling's keys onto this page, consolidate. + */ + np = FreePageBtreeFindRightSibling(base, btp); + if (np != NULL && btp->hdr.nused + np->hdr.nused <= max) + { + if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC) + { + memcpy(&btp->u.leaf_key[btp->hdr.nused], &np->u.leaf_key[0], + sizeof(FreePageBtreeLeafKey) * np->hdr.nused); + btp->hdr.nused += np->hdr.nused; + } + else + { + memcpy(&btp->u.internal_key[btp->hdr.nused], &np->u.internal_key[0], + sizeof(FreePageBtreeInternalKey) * np->hdr.nused); + btp->hdr.nused += np->hdr.nused; + FreePageBtreeUpdateParentPointers(base, btp); + } + FreePageBtreeRemovePage(fpm, np); + return; + } + + /* + * If we can fit our keys onto our left sibling's page, consolidate. + * In this case, we move our keys onto the other page rather than visca + * versa, to avoid having to adjust ancestor keys. + */ + np = FreePageBtreeFindLeftSibling(base, btp); + if (np != NULL && btp->hdr.nused + np->hdr.nused <= max) + { + if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC) + { + memcpy(&np->u.leaf_key[np->hdr.nused], &btp->u.leaf_key[0], + sizeof(FreePageBtreeLeafKey) * btp->hdr.nused); + np->hdr.nused += btp->hdr.nused; + } + else + { + memcpy(&np->u.internal_key[np->hdr.nused], &btp->u.internal_key[0], + sizeof(FreePageBtreeInternalKey) * btp->hdr.nused); + np->hdr.nused += btp->hdr.nused; + FreePageBtreeUpdateParentPointers(base, np); + } + np->hdr.nused += btp->hdr.nused; + FreePageBtreeRemovePage(fpm, btp); + return; + } +} + +/* + * Find the passed page's left sibling; that is, the page at the same level + * of the tree whose keyspace immediately precedes ours. + */ +static FreePageBtree * +FreePageBtreeFindLeftSibling(char *base, FreePageBtree *btp) +{ + FreePageBtree *p = btp; + int levels = 0; + + /* Move up until we can move left. 
*/ + for (;;) + { + Size first_page; + Size index; + + first_page = FreePageBtreeFirstKey(p); + p = relptr_access(base, p->hdr.parent); + + if (p == NULL) + return NULL; /* we were passed the rightmost page */ + + index = FreePageBtreeSearchInternal(p, first_page); + if (index > 0) + { + Assert(p->u.internal_key[index].first_page == first_page); + p = relptr_access(base, p->u.internal_key[index - 1].child); + break; + } + Assert(index == 0); + ++levels; + } + + /* Descend left. */ + while (levels > 0) + { + Assert(p->hdr.magic == FREE_PAGE_INTERNAL_MAGIC); + p = relptr_access(base, p->u.internal_key[p->hdr.nused - 1].child); + --levels; + } + Assert(p->hdr.magic == btp->hdr.magic); + + return p; +} + +/* + * Find the passed page's right sibling; that is, the page at the same level + * of the tree whose keyspace immediately follows ours. + */ +static FreePageBtree * +FreePageBtreeFindRightSibling(char *base, FreePageBtree *btp) +{ + FreePageBtree *p = btp; + int levels = 0; + + /* Move up until we can move right. */ + for (;;) + { + Size first_page; + Size index; + + first_page = FreePageBtreeFirstKey(p); + p = relptr_access(base, p->hdr.parent); + + if (p == NULL) + return NULL; /* we were passed the rightmost page */ + + index = FreePageBtreeSearchInternal(p, first_page); + if (index < p->hdr.nused - 1) + { + Assert(p->u.internal_key[index].first_page == first_page); + p = relptr_access(base, p->u.internal_key[index + 1].child); + break; + } + Assert(index == p->hdr.nused - 1); + ++levels; + } + + /* Descend left. */ + while (levels > 0) + { + Assert(p->hdr.magic == FREE_PAGE_INTERNAL_MAGIC); + p = relptr_access(base, p->u.internal_key[0].child); + --levels; + } + Assert(p->hdr.magic == btp->hdr.magic); + + return p; +} + +/* + * Get the first key on a btree page. + */ +static Size +FreePageBtreeFirstKey(FreePageBtree *btp) +{ + Assert(btp->hdr.nused > 0); + + if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC) + return btp->u.leaf_key[0].first_page; + else + { + Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC); + return btp->u.internal_key[0].first_page; + } +} + +/* + * Get a page from the btree recycle list for use as a btree page. + */ +static FreePageBtree * +FreePageBtreeGetRecycled(FreePageManager *fpm) +{ + char *base = fpm_segment_base(fpm); + FreePageSpanLeader *victim = relptr_access(base, fpm->btree_recycle); + FreePageSpanLeader *newhead; + + Assert(victim != NULL); + newhead = relptr_access(base, victim->next); + if (newhead != NULL) + relptr_copy(newhead->prev, victim->prev); + relptr_store(base, fpm->btree_recycle, newhead); + Assert(fpm_pointer_is_page_aligned(base, victim)); + fpm->btree_recycle_count--; + return (FreePageBtree *) victim; +} + +/* + * Insert an item into an internal page. + */ +static void +FreePageBtreeInsertInternal(char *base, FreePageBtree *btp, Size index, + Size first_page, FreePageBtree *child) +{ + Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC); + Assert(btp->hdr.nused <= FPM_ITEMS_PER_INTERNAL_PAGE); + Assert(index <= btp->hdr.nused); + memmove(&btp->u.internal_key[index + 1], &btp->u.internal_key[index], + sizeof(FreePageBtreeInternalKey) * (btp->hdr.nused - index)); + btp->u.internal_key[index].first_page = first_page; + relptr_store(base, btp->u.internal_key[index].child, child); + ++btp->hdr.nused; +} + +/* + * Insert an item into a leaf page. 
+ */ +static void +FreePageBtreeInsertLeaf(FreePageBtree *btp, Size index, Size first_page, + Size npages) +{ + Assert(btp->hdr.magic == FREE_PAGE_LEAF_MAGIC); + Assert(btp->hdr.nused <= FPM_ITEMS_PER_LEAF_PAGE); + Assert(index <= btp->hdr.nused); + memmove(&btp->u.leaf_key[index + 1], &btp->u.leaf_key[index], + sizeof(FreePageBtreeLeafKey) * (btp->hdr.nused - index)); + btp->u.leaf_key[index].first_page = first_page; + btp->u.leaf_key[index].npages = npages; + ++btp->hdr.nused; +} + +/* + * Put a page on the btree recycle list. + */ +static void +FreePageBtreeRecycle(FreePageManager *fpm, Size pageno) +{ + char *base = fpm_segment_base(fpm); + FreePageSpanLeader *head = relptr_access(base, fpm->btree_recycle); + FreePageSpanLeader *span; + + span = (FreePageSpanLeader *) fpm_page_to_pointer(base, pageno); + span->magic = FREE_PAGE_SPAN_LEADER_MAGIC; + span->npages = 1; + relptr_store(base, span->next, head); + relptr_store(base, span->prev, (FreePageSpanLeader *) NULL); + if (head != NULL) + relptr_store(base, head->prev, span); + relptr_store(base, fpm->btree_recycle, span); + fpm->btree_recycle_count++; +} + +/* + * Remove an item from the btree at the given position on the given page. + */ +static void +FreePageBtreeRemove(FreePageManager *fpm, FreePageBtree *btp, Size index) +{ + Assert(btp->hdr.magic == FREE_PAGE_LEAF_MAGIC); + Assert(index < btp->hdr.nused); + + /* When last item is removed, extirpate entire page from btree. */ + if (btp->hdr.nused == 1) + { + FreePageBtreeRemovePage(fpm, btp); + return; + } + + /* Physically remove the key from the page. */ + --btp->hdr.nused; + if (index < btp->hdr.nused) + memmove(&btp->u.leaf_key[index], &btp->u.leaf_key[index + 1], + sizeof(FreePageBtreeLeafKey) * (btp->hdr.nused - index)); + + /* If we just removed the first key, adjust ancestor keys. */ + if (index == 0) + FreePageBtreeAdjustAncestorKeys(fpm, btp); + + /* Consider whether to consolidate this page with a sibling. */ + FreePageBtreeConsolidate(fpm, btp); +} + +/* + * Remove a page from the btree. Caller is responsible for having relocated + * any keys from this page that are still wanted. The page is placed on the + * recycled list. + */ +static void +FreePageBtreeRemovePage(FreePageManager *fpm, FreePageBtree *btp) +{ + char *base = fpm_segment_base(fpm); + FreePageBtree *parent; + Size index; + Size first_page; + + for (;;) + { + /* Find parent page. */ + parent = relptr_access(base, btp->hdr.parent); + if (parent == NULL) + { + /* We are removing the root page. */ + relptr_store(base, fpm->btree_root, (FreePageBtree *) NULL); + fpm->btree_depth = 0; + Assert(fpm->singleton_first_page == 0); + Assert(fpm->singleton_npages == 0); + return; + } + + /* + * If the parent contains only one item, we need to remove it as + * well. + */ + if (parent->hdr.nused > 1) + break; + FreePageBtreeRecycle(fpm, fpm_pointer_to_page(base, btp)); + btp = parent; + } + + /* Find and remove the downlink. 
*/ + first_page = FreePageBtreeFirstKey(btp); + if (parent->hdr.magic == FREE_PAGE_LEAF_MAGIC) + { + index = FreePageBtreeSearchLeaf(parent, first_page); + Assert(index < parent->hdr.nused); + if (index < parent->hdr.nused - 1) + memmove(&parent->u.leaf_key[index], + &parent->u.leaf_key[index + 1], + sizeof(FreePageBtreeLeafKey) + * (parent->hdr.nused - index - 1)); + } + else + { + index = FreePageBtreeSearchInternal(parent, first_page); + Assert(index < parent->hdr.nused); + if (index < parent->hdr.nused - 1) + memmove(&parent->u.internal_key[index], + &parent->u.internal_key[index + 1], + sizeof(FreePageBtreeInternalKey) + * (parent->hdr.nused - index - 1)); + } + parent->hdr.nused--; + Assert(parent->hdr.nused > 0); + + /* Recycle the page. */ + FreePageBtreeRecycle(fpm, fpm_pointer_to_page(base, btp)); + + /* Adjust ancestor keys if needed. */ + if (index == 0) + FreePageBtreeAdjustAncestorKeys(fpm, parent); + + /* Consider whether to consolidate the parent with a sibling. */ + FreePageBtreeConsolidate(fpm, parent); +} + +/* + * Search the btree for an entry for the given first page and initialize + * *result with the results of the search. result->page and result->index + * indicate either the position of an exact match or the position at which + * the new key should be inserted. result->found is true for an exact match, + * otherwise false. result->split_pages will contain the number of additional + * btree pages that will be needed when performing a split to insert a key. + * Except as described above, the contents of fields in the result object are + * undefined on return. + */ +static void +FreePageBtreeSearch(FreePageManager *fpm, Size first_page, + FreePageBtreeSearchResult *result) +{ + char *base = fpm_segment_base(fpm); + FreePageBtree *btp = relptr_access(base, fpm->btree_root); + Size index; + + result->split_pages = 1; + + /* If the btree is empty, there's nothing to find. */ + if (btp == NULL) + { + result->page = NULL; + result->found = false; + return; + } + + /* Descend until we hit a leaf. */ + while (btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC) + { + FreePageBtree *child; + + index = FreePageBtreeSearchInternal(btp, first_page); + + /* + * If the index is 0, we're not going to find it, but we keep + * descending anyway so that we can find the insertion point. + */ + if (index > 0) + --index; + + /* Track required split depth for leaf insert. */ + if (btp->hdr.nused >= FPM_ITEMS_PER_INTERNAL_PAGE) + { + Assert(btp->hdr.nused == FPM_ITEMS_PER_INTERNAL_PAGE); + result->split_pages++; + } + else + result->split_pages = 0; + + /* Descend to appropriate child page. */ + Assert(index < btp->hdr.nused); + child = relptr_access(base, btp->u.internal_key[index].child); + Assert(relptr_access(base, child->hdr.parent) == btp); + btp = child; + } + + /* Track required split depth for leaf insert. */ + if (btp->hdr.nused >= FPM_ITEMS_PER_LEAF_PAGE) + { + Assert(btp->hdr.nused == FPM_ITEMS_PER_INTERNAL_PAGE); + result->split_pages++; + } + else + result->split_pages = 0; + + /* Search leaf page. */ + index = FreePageBtreeSearchLeaf(btp, first_page); + + /* Assemble results. */ + result->page = btp; + result->index = index; + result->found = index < btp->hdr.nused && + first_page == btp->u.leaf_key[index].first_page; +} + +/* + * Search an internal page for the first key greater than or equal to a given + * page number. Returns the index of that key, or one greater than the number + * of keys on the page if none. 
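For instance, if a page holds keys 16, 64, and 256, then searching for 64 returns 1, searching for 100 returns 2, and searching for 300 returns 3 (one past the last key).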
+ */ +static Size +FreePageBtreeSearchInternal(FreePageBtree *btp, Size first_page) +{ + Size low = 0; + Size high = btp->hdr.nused; + + Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC); + Assert(high > 0 && high <= FPM_ITEMS_PER_INTERNAL_PAGE); + + while (low < high) + { + Size mid = (low + high) / 2; + Size val = btp->u.internal_key[mid].first_page; + + if (first_page == val) + return mid; + else if (first_page < val) + high = mid; + else + low = mid + 1; + } + + return low; +} + +/* + * Search a leaf page for the first key greater than or equal to a given + * page number. Returns the index of that key, or one greater than the number + * of keys on the page if none. + */ +static Size +FreePageBtreeSearchLeaf(FreePageBtree *btp, Size first_page) +{ + Size low = 0; + Size high = btp->hdr.nused; + + Assert(btp->hdr.magic == FREE_PAGE_LEAF_MAGIC); + Assert(high > 0 && high <= FPM_ITEMS_PER_LEAF_PAGE); + + while (low < high) + { + Size mid = (low + high) / 2; + Size val = btp->u.leaf_key[mid].first_page; + + if (first_page == val) + return mid; + else if (first_page < val) + high = mid; + else + low = mid + 1; + } + + return low; +} + +/* + * Allocate a new btree page and move half the keys from the provided page + * to the new page. Caller is responsible for making sure that there's a + * page available from fpm->btree_recycle. Returns a pointer to the new page, + * to which caller must add a downlink. + */ +static FreePageBtree * +FreePageBtreeSplitPage(FreePageManager *fpm, FreePageBtree *btp) +{ + FreePageBtree *newsibling; + + newsibling = FreePageBtreeGetRecycled(fpm); + newsibling->hdr.magic = btp->hdr.magic; + newsibling->hdr.nused = btp->hdr.nused / 2; + relptr_copy(newsibling->hdr.parent, btp->hdr.parent); + btp->hdr.nused -= newsibling->hdr.nused; + + if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC) + memcpy(&newsibling->u.leaf_key, + &btp->u.leaf_key[btp->hdr.nused], + sizeof(FreePageBtreeLeafKey) * newsibling->hdr.nused); + else + { + Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC); + memcpy(&newsibling->u.internal_key, + &btp->u.internal_key[btp->hdr.nused], + sizeof(FreePageBtreeInternalKey) * newsibling->hdr.nused); + FreePageBtreeUpdateParentPointers(fpm_segment_base(fpm), newsibling); + } + + return newsibling; +} + +/* + * When internal pages are split or merged, the parent pointers of their + * children must be updated. + */ +static void +FreePageBtreeUpdateParentPointers(char *base, FreePageBtree *btp) +{ + Size i; + + Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC); + for (i = 0; i < btp->hdr.nused; ++i) + { + FreePageBtree *child; + + child = relptr_access(base, btp->u.internal_key[i].child); + relptr_store(base, child->hdr.parent, btp); + } +} + +/* + * Debugging dump of btree data. + */ +static void +FreePageManagerDumpBtree(FreePageManager *fpm, FreePageBtree *btp, + FreePageBtree *parent, int level, StringInfo buf) +{ + char *base = fpm_segment_base(fpm); + Size pageno = fpm_pointer_to_page(base, btp); + Size index; + FreePageBtree *check_parent; + + check_stack_depth(); + check_parent = relptr_access(base, btp->hdr.parent); + appendStringInfo(buf, " %zu@%d %c", pageno, level, + btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC ? 
'i' : 'l'); + if (parent != check_parent) + appendStringInfo(buf, " [actual parent %zu, expected %zu]", + fpm_pointer_to_page(base, check_parent), + fpm_pointer_to_page(base, parent)); + appendStringInfoChar(buf, ':'); + for (index = 0; index < btp->hdr.nused; ++index) + { + if (btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC) + appendStringInfo(buf, " %zu->%zu", + btp->u.internal_key[index].first_page, + btp->u.internal_key[index].child.relptr_off / FPM_PAGE_SIZE); + else + appendStringInfo(buf, " %zu(%zu)", + btp->u.leaf_key[index].first_page, + btp->u.leaf_key[index].npages); + } + appendStringInfo(buf, "\n"); + + if (btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC) + { + for (index = 0; index < btp->hdr.nused; ++index) + { + FreePageBtree *child; + + child = relptr_access(base, btp->u.internal_key[index].child); + FreePageManagerDumpBtree(fpm, child, btp, level + 1, buf); + } + } +} + +/* + * Debugging dump of free-span data. + */ +static void +FreePageManagerDumpSpans(FreePageManager *fpm, FreePageSpanLeader *span, + Size expected_pages, StringInfo buf) +{ + char *base = fpm_segment_base(fpm); + + while (span != NULL) + { + if (span->npages != expected_pages) + appendStringInfo(buf, " %zu(%zu)", fpm_pointer_to_page(base, span), + span->npages); + else + appendStringInfo(buf, " %zu", fpm_pointer_to_page(base, span)); + span = relptr_access(base, span->next); + } + + appendStringInfo(buf, "\n"); +} + +/* + * Like FreePageManagerGet, this function allocates a run of pages of the + * given length from the free page manager, but without taking and releasing + * the lock. The caller is responsible for making sure the lock is already + * held. + */ +static bool +FreePageManagerGetInternal(FreePageManager *fpm, Size npages, Size *first_page) +{ + char *base = fpm_segment_base(fpm); + FreePageSpanLeader *victim = NULL; + FreePageSpanLeader *prev; + FreePageSpanLeader *next; + FreePageBtreeSearchResult result; + Size victim_page = 0; /* placate compiler */ + Size f; + + /* + * Search for a free span. + * + * Right now, we use a simple best-fit policy here, but it's possible for + * this to result in memory fragmentation if we're repeatedly asked to + * allocate chunks just a little smaller than what we have available. + * Hopefully, this is unlikely, because we expect most requests to be + * single pages or superblock-sized chunks -- but no policy can be optimal + * under all circumstances unless it has knowledge of future allocation + * patterns. + */ + for (f = Min(npages, FPM_NUM_FREELISTS) - 1; f < FPM_NUM_FREELISTS; ++f) + { + /* Skip empty freelists. */ + if (relptr_is_null(fpm->freelist[f])) + continue; + + /* + * All of the freelists except the last one contain only items of a + * single size, so we just take the first one. But the final free + * list contains everything too big for any of the other lists, so + * we need to search the list. + */ + if (f < FPM_NUM_FREELISTS - 1) + victim = relptr_access(base, fpm->freelist[f]); + else + { + FreePageSpanLeader *candidate; + + candidate = relptr_access(base, fpm->freelist[f]); + do + { + if (candidate->npages >= npages && (victim == NULL || + victim->npages > candidate->npages)) + { + victim = candidate; + if (victim->npages == npages) + break; + } + candidate = relptr_access(base, candidate->next); + } while (candidate != NULL); + } + break; + } + + /* If we didn't find an allocatable span, return failure. */ + if (victim == NULL) + return false; + + /* Remove span from free list. 
*/ + Assert(victim->magic == FREE_PAGE_SPAN_LEADER_MAGIC); + prev = relptr_access(base, victim->prev); + next = relptr_access(base, victim->next); + if (prev != NULL) + relptr_copy(prev->next, victim->next); + else + relptr_copy(fpm->freelist[f], victim->next); + if (next != NULL) + relptr_copy(next->prev, victim->prev); + victim_page = fpm_pointer_to_page(base, victim); + + /* + * If we haven't initialized the btree yet, the victim must be the single + * span stored within the FreePageManager itself. Otherwise, we need + * to update the btree. + */ + if (relptr_is_null(fpm->btree_root)) + { + Assert(victim_page == fpm->singleton_first_page); + Assert(victim->npages = fpm->singleton_npages); + Assert(victim->npages >= npages); + fpm->singleton_first_page += npages; + fpm->singleton_npages -= npages; + if (fpm->singleton_npages > 0) + FreePagePushSpanLeader(fpm, fpm->singleton_first_page, + fpm->singleton_npages); + } + else + { + /* + * If the span we found is exactly the right size, remove it from the + * btree completely. Otherwise, adjust the btree entry to reflect the + * still-unallocated portion of the span, and put that portion on the + * appropriate free list. + */ + FreePageBtreeSearch(fpm, victim_page, &result); + Assert(result.found); + if (victim->npages == npages) + FreePageBtreeRemove(fpm, result.page, result.index); + else + { + FreePageBtreeLeafKey *key; + + /* Adjust btree to reflect remaining pages. */ + Assert(victim->npages > npages); + key = &result.page->u.leaf_key[result.index]; + Assert(key->npages == victim->npages); + key->first_page += npages; + key->npages -= npages; + if (result.index == 0) + FreePageBtreeAdjustAncestorKeys(fpm, result.page); + + /* Put the unallocated pages back on the appropriate free list. */ + FreePagePushSpanLeader(fpm, victim_page + npages, + victim->npages - npages); + } + } + + /* Return results to caller. */ + *first_page = fpm_pointer_to_page(base, victim); + return true; +} + +/* + * Put a range of pages into the btree and freelists, consolidating it with + * existing free spans just before and/or after it. If 'soft' is true, + * only perform the insertion if it can be done without allocating new btree + * pages; if false, do it always. Returns 0 if the soft flag caused the + * insertion to be skipped, or otherwise the size of the contiguous span + * created by the insertion. This may be larger than npages if we're able + * to consolidate with an adjacent range. + */ +static Size +FreePageManagerPutInternal(FreePageManager *fpm, Size first_page, Size npages, + bool soft) +{ + char *base = fpm_segment_base(fpm); + FreePageBtreeSearchResult result; + FreePageBtreeLeafKey *prevkey = NULL; + FreePageBtreeLeafKey *nextkey = NULL; + FreePageBtree *np; + Size nindex; + Assert(npages > 0); + + /* We can store a single free span without initializing the btree. */ + if (fpm->btree_depth == 0) + { + if (fpm->singleton_npages == 0) + { + /* Don't have a span yet; store this one. */ + fpm->singleton_first_page = first_page; + fpm->singleton_npages = npages; + FreePagePushSpanLeader(fpm, first_page, npages); + return fpm->singleton_npages; + } + else if (fpm->singleton_first_page + fpm->singleton_npages == + first_page) + { + /* New span immediately follows sole existing span. 
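(For instance, if the sole span currently covers pages 10..19 and pages 20..24 are being returned, the singleton simply grows to cover pages 10..24.)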
*/ + fpm->singleton_npages += npages; + FreePagePopSpanLeader(fpm, fpm->singleton_first_page); + FreePagePushSpanLeader(fpm, fpm->singleton_first_page, + fpm->singleton_npages); + return fpm->singleton_npages; + } + else if (first_page + npages == fpm->singleton_first_page) + { + /* New span immediately precedes sole existing span. */ + FreePagePopSpanLeader(fpm, fpm->singleton_first_page); + fpm->singleton_first_page = first_page; + fpm->singleton_npages += npages; + FreePagePushSpanLeader(fpm, fpm->singleton_first_page, + fpm->singleton_npages); + return fpm->singleton_npages; + } + else + { + /* Not contiguous; we need to initialize the btree. */ + Size root_page; + FreePageBtree *root; + + if (!relptr_is_null(fpm->btree_recycle)) + root = FreePageBtreeGetRecycled(fpm); + else if (FreePageManagerGetInternal(fpm, 1, &root_page)) + root = (FreePageBtree *) fpm_page_to_pointer(base, root_page); + else + { + /* We'd better be able to get a page from the existing range. */ + elog(FATAL, "free page manager btree is corrupt"); + } + + /* Create the btree and move the preexisting range into it. */ + root->hdr.magic = FREE_PAGE_LEAF_MAGIC; + root->hdr.nused = 1; + relptr_store(base, root->hdr.parent, (FreePageBtree *) NULL); + root->u.leaf_key[0].first_page = fpm->singleton_first_page; + root->u.leaf_key[0].npages = fpm->singleton_npages; + relptr_store(base, fpm->btree_root, root); + fpm->singleton_first_page = 0; + fpm->singleton_npages = 0; + fpm->btree_depth = 1; + + /* + * Corner case: it may be that the btree root took the very last + * free page. In that case, the sole btree entry covers a zero + * page run, which is invalid. Overwrite it with the entry we're + * trying to insert and get out. + */ + if (root->u.leaf_key[0].npages == 0) + { + root->u.leaf_key[0].first_page = first_page; + root->u.leaf_key[0].npages = npages; + return npages; + } + + /* Fall through to insert the new key. */ + } + } + + /* Search the btree. */ + FreePageBtreeSearch(fpm, first_page, &result); + Assert(!result.found); + if (result.index > 0) + prevkey = &result.page->u.leaf_key[result.index - 1]; + if (result.index < result.page->hdr.nused) + { + np = result.page; + nindex = result.index; + nextkey = &result.page->u.leaf_key[result.index]; + } + else + { + np = FreePageBtreeFindRightSibling(base, result.page); + nindex = 0; + if (np != NULL) + nextkey = &np->u.leaf_key[0]; + } + + /* Consolidate with the previous entry if possible. */ + if (prevkey != NULL && prevkey->first_page + prevkey->npages >= first_page) + { + bool remove_next = false; + Size result; + + Assert(prevkey->first_page + prevkey->npages == first_page); + prevkey->npages = (first_page - prevkey->first_page) + npages; + + /* Check whether we can *also* consolidate with the following entry. */ + if (nextkey != NULL && + prevkey->first_page + prevkey->npages >= nextkey->first_page) + { + Assert(prevkey->first_page + prevkey->npages == + nextkey->first_page); + prevkey->npages = (nextkey->first_page - prevkey->first_page) + + nextkey->npages; + FreePagePopSpanLeader(fpm, nextkey->first_page); + remove_next = true; + } + + /* Put the span on the correct freelist and save size. */ + FreePagePopSpanLeader(fpm, prevkey->first_page); + FreePagePushSpanLeader(fpm, prevkey->first_page, prevkey->npages); + result = prevkey->npages; + + /* + * If we consolidated with both the preceding and following entries, + * we must remove the following entry. 
We do this last, because + * removing an element from the btree may invalidate pointers we hold + * into the current data structure. + * + * NB: The btree is technically in an invalid state a this point + * because we've already updated prevkey to cover the same key space + * as nextkey. FreePageBtreeRemove() shouldn't notice that, though. + */ + if (remove_next) + FreePageBtreeRemove(fpm, np, nindex); + + return result; + } + + /* Consolidate with the next entry if possible. */ + if (nextkey != NULL && first_page + npages >= nextkey->first_page) + { + Size newpages; + + /* Compute new size for span. */ + Assert(first_page + npages == nextkey->first_page); + newpages = (nextkey->first_page - first_page) + nextkey->npages; + + /* Put span on correct free list. */ + FreePagePopSpanLeader(fpm, nextkey->first_page); + FreePagePushSpanLeader(fpm, first_page, newpages); + + /* Update key in place. */ + nextkey->first_page = first_page; + nextkey->npages = newpages; + + /* If reducing first key on page, ancestors might need adjustment. */ + if (nindex == 0) + FreePageBtreeAdjustAncestorKeys(fpm, np); + + return nextkey->npages; + } + + /* Split leaf page and as many of its ancestors as necessary. */ + if (result.split_pages > 0) + { + /* + * NB: We could consider various coping strategies here to avoid a + * split; most obviously, if np != result.page, we could target that + * page instead. More complicated shuffling strategies could be + * possible as well; basically, unless every single leaf page is 100% + * full, we can jam this key in there if we try hard enough. It's + * unlikely that trying that hard is worthwhile, but it's possible + * we might need to make more than no effort. For now, we just do + * the easy thing, which is nothing. + */ + + /* If this is a soft insert, it's time to give up. */ + if (soft) + return 0; + + /* Check whether we need to allocate more btree pages to split. */ + if (result.split_pages > fpm->btree_recycle_count) + { + Size pages_needed; + Size recycle_page; + Size i; + + /* + * Allocate the required number of pages and split each one in + * turn. This should never fail, because if we've got enough spans + * of free pages kicking around that we need additional storage + * space just to remember them all, then we should certainly have + * enough to expand the btree, which should only ever use a tiny + * number of pages compared to the number under management. If + * it does, something's badly screwed up. + */ + pages_needed = result.split_pages - fpm->btree_recycle_count; + for (i = 0; i < pages_needed; ++i) + { + if (!FreePageManagerGetInternal(fpm, 1, &recycle_page)) + elog(FATAL, "free page manager btree is corrupt"); + FreePageBtreeRecycle(fpm, recycle_page); + } + + /* + * The act of allocating pages to recycle may have invalidated + * the results of our previous btree reserch, so repeat it. + * (We could recheck whether any of our split-avoidance strategies + * that were not viable before now are, but it hardly seems + * worthwhile, so we don't bother. Consolidation can't be possible + * now if it wasn't previously.) + */ + FreePageBtreeSearch(fpm, first_page, &result); + + /* + * The act of allocating pages for use in constructing our btree + * should never cause any page to become more full, so the new + * split depth should be no greater than the old one, and perhaps + * less if we fortutiously allocated a chunk that freed up a + * slot on the page we need to update. 
+ */ + Assert(result.split_pages <= fpm->btree_recycle_count); + } + + /* If we still need to perform a split, do it. */ + if (result.split_pages > 0) + { + FreePageBtree *split_target = result.page; + FreePageBtree *child = NULL; + Size key = first_page; + + for (;;) + { + FreePageBtree *newsibling; + FreePageBtree *parent; + + /* Identify parent page, which must receive downlink. */ + parent = relptr_access(base, split_target->hdr.parent); + + /* Split the page - downlink not added yet. */ + newsibling = FreePageBtreeSplitPage(fpm, split_target); + + /* + * At this point in the loop, we're always carrying a pending + * insertion. On the first pass, it's the actual key we're + * trying to insert; on subsequent passes, it's the downlink + * that needs to be added as a result of the split performed + * during the previous loop iteration. Since we've just split + * the page, there's definitely room on one of the two + * resulting pages. + */ + if (child == NULL) + { + Size index; + FreePageBtree *insert_into; + + insert_into = key < newsibling->u.leaf_key[0].first_page ? + split_target : newsibling; + index = FreePageBtreeSearchLeaf(insert_into, key); + FreePageBtreeInsertLeaf(insert_into, index, key, npages); + if (index == 0 && insert_into == split_target) + FreePageBtreeAdjustAncestorKeys(fpm, split_target); + } + else + { + Size index; + FreePageBtree *insert_into; + + insert_into = + key < newsibling->u.internal_key[0].first_page ? + split_target : newsibling; + index = FreePageBtreeSearchInternal(insert_into, key); + FreePageBtreeInsertInternal(base, insert_into, index, + key, child); + relptr_store(base, child->hdr.parent, insert_into); + if (index == 0 && insert_into == split_target) + FreePageBtreeAdjustAncestorKeys(fpm, split_target); + } + + /* If the page we just split has no parent, split the root. */ + if (parent == NULL) + { + FreePageBtree *newroot; + + newroot = FreePageBtreeGetRecycled(fpm); + newroot->hdr.magic = FREE_PAGE_INTERNAL_MAGIC; + newroot->hdr.nused = 2; + relptr_store(base, newroot->hdr.parent, + (FreePageBtree *) NULL); + newroot->u.internal_key[0].first_page = + FreePageBtreeFirstKey(split_target); + relptr_store(base, newroot->u.internal_key[0].child, + split_target); + relptr_store(base, split_target->hdr.parent, newroot); + newroot->u.internal_key[1].first_page = + FreePageBtreeFirstKey(newsibling); + relptr_store(base, newroot->u.internal_key[1].child, + newsibling); + relptr_store(base, newsibling->hdr.parent, newroot); + relptr_store(base, fpm->btree_root, newroot); + fpm->btree_depth++; + + break; + } + + /* If the parent page isn't full, insert the downlink. */ + key = newsibling->u.internal_key[0].first_page; + if (parent->hdr.nused < FPM_ITEMS_PER_INTERNAL_PAGE) + { + Size index; + + index = FreePageBtreeSearchInternal(parent, key); + FreePageBtreeInsertInternal(base, parent, index, + key, newsibling); + relptr_store(base, newsibling->hdr.parent, parent); + if (index == 0) + FreePageBtreeAdjustAncestorKeys(fpm, parent); + break; + } + + /* The parent also needs to be split, so loop around. */ + child = newsibling; + split_target = parent; + } + + /* + * The loop above did the insert, so just need to update the + * free list, and we're done. + */ + FreePagePushSpanLeader(fpm, first_page, npages); + + return npages; + } + } + + /* Physically add the key to the page. 
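An illustrative sketch of the consolidation rules implemented above, using the public FreePageManagerPut entry point (this assumes a hypothetical, already-initialized manager whose only free span currently covers pages 100..109):

    /* fpm: an initialized FreePageManager with pages 100..109 free (assumed) */
    FreePageManagerPut(fpm, 110, 10);   /* adjacent: the singleton span simply
                                         * grows to cover pages 100..119 */
    FreePageManagerPut(fpm, 200, 4);    /* not adjacent: the btree gets built
                                         * and now tracks 100..119 and 200..203 */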
*/ + Assert(result.page->hdr.nused < FPM_ITEMS_PER_LEAF_PAGE); + FreePageBtreeInsertLeaf(result.page, result.index, first_page, npages); + + /* If new first key on page, ancestors might need adjustment. */ + if (result.index == 0) + FreePageBtreeAdjustAncestorKeys(fpm, result.page); + + /* Put it on the free list. */ + FreePagePushSpanLeader(fpm, first_page, npages); + + return npages; +} + +/* + * Remove a FreePageSpanLeader from the linked-list that contains it, either + * because we're changing the size of the span, or because we're allocating it. + */ +static void +FreePagePopSpanLeader(FreePageManager *fpm, Size pageno) +{ + char *base = fpm_segment_base(fpm); + FreePageSpanLeader *span; + FreePageSpanLeader *next; + FreePageSpanLeader *prev; + + span = (FreePageSpanLeader *) fpm_page_to_pointer(base, pageno); + + next = relptr_access(base, span->next); + prev = relptr_access(base, span->prev); + if (next != NULL) + relptr_copy(next->prev, span->prev); + if (prev != NULL) + relptr_copy(prev->next, span->next); + else + { + Size f = Min(span->npages, FPM_NUM_FREELISTS) - 1; + + Assert(fpm->freelist[f].relptr_off == pageno * FPM_PAGE_SIZE); + relptr_copy(fpm->freelist[f], span->next); + } +} + +/* + * Initialize a new FreePageSpanLeader and put it on the appropriate free list. + */ +static void +FreePagePushSpanLeader(FreePageManager *fpm, Size first_page, Size npages) +{ + char *base = fpm_segment_base(fpm); + Size f = Min(npages, FPM_NUM_FREELISTS) - 1; + FreePageSpanLeader *head = relptr_access(base, fpm->freelist[f]); + FreePageSpanLeader *span; + + span = (FreePageSpanLeader *) fpm_page_to_pointer(base, first_page); + span->magic = FREE_PAGE_SPAN_LEADER_MAGIC; + span->npages = npages; + relptr_store(base, span->next, head); + relptr_store(base, span->prev, (FreePageSpanLeader *) NULL); + if (head != NULL) + relptr_store(base, head->prev, span); + relptr_store(base, fpm->freelist[f], span); +} diff --git a/src/backend/utils/mmgr/sb_alloc.c b/src/backend/utils/mmgr/sb_alloc.c new file mode 100644 index 0000000..ace9e56 --- /dev/null +++ b/src/backend/utils/mmgr/sb_alloc.c @@ -0,0 +1,861 @@ +/*------------------------------------------------------------------------- + * + * sb_alloc.c + * Superblock-based memory allocator. + * + * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/backend/utils/mmgr/sb_alloc.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "miscadmin.h" +#include "utils/sb_region.h" + +/* + * Metadata for an ordinary superblock, a large memory allocation, or a "span + * of spans". + * + * For ordinary superblocks and large memory allocations, span objects are + * stored out-of-line; that is, the span object is not stored within the + * span itself. Ordinary superblocks are all of size SB_SUPERBLOCK_SIZE, + * and size_class indicates the size of object they contain. Large memory + * spans contain just enough pages to store the object, and size_class + * is SB_SCLASS_SPAN_LARGE; ninitialized, nused, and firstfree are all unused, + * as the whole span consists of a single object. + * + * For a "span of spans", the span object is stored "inline". The allocation + * is always exactly one page, and the sb_span object is located at the + * beginning of that page. 
This makes it easy to free a span: just find the + * start of the containing page, and there's the sb_span to which it needs to + * be returned. The size class will be SB_SPAN_OF_SPANS, and the remaining + * fields are used just as they would be in an ordinary superblock. We can't + * allocate spans out of ordinary superblocks because creating an ordinary + * superblock requires us to be able to allocate a span *first*. Doing it + * this way avoids that circularity. + */ +struct sb_span +{ + relptr(sb_heap) parent; /* Containing heap. */ + relptr(sb_span) prevspan; /* Previous span. */ + relptr(sb_span) nextspan; /* Next span. */ + relptr(char) start; /* Starting address. */ + Size npages; /* Length of span in pages. */ + uint16 size_class; /* Size class. */ + uint16 ninitialized; /* Maximum number of objects ever allocated. */ + uint16 nallocatable; /* Number of objects currently allocatable. */ + uint16 firstfree; /* First object on free list. */ + uint16 nmax; /* Maximum number of objects ever possible. */ + uint16 fclass; /* Current fullness class. */ +}; + +#define SB_SPAN_NOTHING_FREE ((uint16) -1) +#define SB_SUPERBLOCK_SIZE (SB_PAGES_PER_SUPERBLOCK * FPM_PAGE_SIZE) + +/* + * Small allocations are handled by dividing a relatively large chunk of + * memory called a superblock into many small objects of equal size. The + * chunk sizes are defined by the following array. Larger size classes are + * spaced more widely than smaller size classes. We fudge the spacing for + * size classes >1k to avoid space wastage: based on the knowledge that we + * plan to allocate 64k superblocks, we bump the maximum object size up + * to the largest multiple of 8 bytes that still lets us fit the same + * number of objects into one superblock. + * + * NB: Because of this fudging, if the size of a superblock is ever changed, + * these size classes should be reworked to be optimal for the new size. + * + * NB: The optimal spacing for size classes, as well as the size of the + * superblocks themselves, is not a question that has one right answer. + * Some allocators (such as tcmalloc) use more closely-spaced size classes + * than we do here, while others (like aset.c) use more widely-spaced classes. + * Spacing the classes more closely avoids wasting memory within individual + * chunks, but also means a larger number of potentially-unfilled superblocks. + * This system is really only suitable for allocating relatively large amounts + * of memory, where the unfilled superblocks will be a small percentage of + * the total allocations. + */ +static const uint16 sb_size_classes[] = { + sizeof(sb_span), 0, /* special size classes */ + 8, 16, 24, 32, 40, 48, 56, 64, /* 8 classes separated by 8 bytes */ + 80, 96, 112, 128, /* 4 classes separated by 16 bytes */ + 160, 192, 224, 256, /* 4 classes separated by 32 bytes */ + 320, 384, 448, 512, /* 4 classes separated by 64 bytes */ + 640, 768, 896, 1024, /* 4 classes separated by 128 bytes */ + 1280, 1560, 1816, 2048, /* 4 classes separated by ~256 bytes */ + 2616, 3120, 3640, 4096, /* 4 classes separated by ~512 bytes */ + 5456, 6552, 7280, 8192 /* 4 classes separated by ~1024 bytes */ +}; + +/* + * The following lookup table is used to map the size of small objects + * (less than 1kB) onto the corresponding size class. To use this table, + * round the size of the object up to the next multiple of 8 bytes, and then + * index into this array. 
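A worked example of that lookup, as performed in sb_alloc() below (a sketch; the intermediate variables are only for illustration):

    Size size = 100;                                  /* request size */
    int  mapidx = ((size + SB_SIZE_CLASS_MAP_QUANTUM - 1) /
                   SB_SIZE_CLASS_MAP_QUANTUM) - 1;    /* rounds 100 up to 104,
                                                       * giving index 12 */
    int  size_class = sb_size_class_map[mapidx];      /* 12 */
    Size chunk = sb_size_classes[size_class];         /* 112-byte chunks */

    /* Likewise, an 8-byte request maps to index 0, whose entry is 2,
     * i.e. the 8-byte size class. */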
+ */ +static char sb_size_class_map[] = { + 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 11, 11, 12, 12, 13, 13, + 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 17, 17, 17, 17, + 18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, + 20, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25 +}; +#define SB_SIZE_CLASS_MAP_QUANTUM 8 + +/* Special size classes. */ +#define SB_SCLASS_SPAN_OF_SPANS 0 +#define SB_SCLASS_SPAN_LARGE 1 +#define SB_NUM_SIZE_CLASSES lengthof(sb_size_classes) + +/* Helper functions. */ +static char *sb_alloc_guts(char *base, sb_region *region, + sb_allocator *a, int size_class); +static bool sb_ensure_active_superblock(char *base, sb_region *region, + sb_allocator *a, sb_heap *heap, + int size_class); +static void sb_init_span(char *base, sb_span *span, sb_heap *heap, + char *ptr, Size npages, uint16 size_class); +static void sb_out_of_memory_error(sb_allocator *a); +static bool sb_transfer_first_span(char *base, sb_heap *heap, + int fromclass, int toclass); +static void sb_unlink_span(char *base, sb_heap *heap, sb_span *span); + +/* + * Create a backend-private allocator. + */ +sb_allocator * +sb_create_private_allocator(void) +{ + Size allocator_size; + int heapno; + int fclass; + sb_allocator *a; + char *base = NULL; + + allocator_size = offsetof(sb_allocator, heaps); + allocator_size += sizeof(sb_heap) * SB_NUM_SIZE_CLASSES; + a = malloc(allocator_size); + if (a == NULL) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + + a->private = true; + for (heapno = 0; heapno < SB_NUM_SIZE_CLASSES; ++heapno) + { + sb_heap *heap = &a->heaps[heapno]; + + relptr_store(base, heap->lock, (LWLock *) NULL); + for (fclass = 0; fclass < SB_FULLNESS_CLASSES; ++fclass) + relptr_store(base, heap->spans[fclass], (sb_span *) NULL); + } + + return a; +} + +/* + * Allocate memory. + */ +void * +sb_alloc(sb_allocator *a, Size size, int flags) +{ + sb_region *region = NULL; + char *base = NULL; + uint16 size_class; + char *result; + + Assert(size > 0); + + /* + * For shared memory allocation, pointers are relative to the start of the + * region, so finding out that information is essential. For + * backend-private memory allocation, allocators aren't uniquely tied to + * a region; we'll only need to grab a region if we can't allocate out of + * an existing superblock. + */ + if (!a->private) + { + region = sb_lookup_region(a); + if (region == NULL) + elog(ERROR, "sb_region not found"); + base = region->region_start; + } + + /* If it's too big for a superblock, just grab a raw run of pages. */ + if (size > sb_size_classes[lengthof(sb_size_classes) - 1]) + { + Size npages = fpm_size_to_pages(size); + Size first_page; + sb_span *span; + sb_heap *heap = &a->heaps[SB_SCLASS_SPAN_LARGE]; + LWLock *lock = relptr_access(base, heap->lock); + void *ptr; + + /* Obtain a span object. */ + span = (sb_span *) sb_alloc_guts(base, region, a, + SB_SCLASS_SPAN_OF_SPANS); + if (span == NULL) + { + if ((flags & SB_ALLOC_SOFT_FAIL) == 0) + sb_out_of_memory_error(a); + return NULL; + } + + /* Find a region from which to allocate. */ + if (region == NULL) + region = sb_private_region_for_allocator(npages); + + /* Here's where we try to perform the actual allocation. 
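Stepping back from the internals for a moment, a minimal usage sketch for a backend-private allocator (it mirrors the tuplesort.c changes at the end of this patch; the request sizes are arbitrary):

    sb_allocator *a = sb_create_private_allocator();

    void *p = sb_alloc(a, 100, 0);                    /* ereports on OOM */
    void *q = sb_alloc(a, 100, SB_ALLOC_SOFT_FAIL);   /* returns NULL on OOM */

    sb_free(p);
    if (q != NULL)
        sb_free(q);

    /* Return every superblock to its free page manager in one pass; only
     * safe once nothing allocated from 'a' is still in use. */
    sb_reset_allocator(a);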
*/ + if (region == NULL || + !FreePageManagerGet(region->fpm, npages, &first_page)) + { + /* XXX. Free the span. */ + if ((flags & SB_ALLOC_SOFT_FAIL) == 0) + sb_out_of_memory_error(a); + return NULL; + } + ptr = fpm_page_to_pointer(fpm_segment_base(region->fpm), first_page); + + /* Initialize span and pagemap. */ + if (lock != NULL) + LWLockAcquire(lock, LW_EXCLUSIVE); + sb_init_span(base, span, heap, ptr, npages, SB_SCLASS_SPAN_LARGE); + if (lock != NULL) + LWLockRelease(lock); + sb_map_set(region->pagemap, first_page, span); + + return ptr; + } + + /* Map allocation to a size class. */ + if (size < lengthof(sb_size_class_map) * SB_SIZE_CLASS_MAP_QUANTUM) + { + int mapidx; + + mapidx = ((size + SB_SIZE_CLASS_MAP_QUANTUM - 1) / + SB_SIZE_CLASS_MAP_QUANTUM) - 1; + size_class = sb_size_class_map[mapidx]; + } + else + { + uint16 min = sb_size_class_map[lengthof(sb_size_class_map) - 1]; + uint16 max = lengthof(sb_size_classes) - 1; + + while (min < max) + { + uint16 mid = (min + max) / 2; + uint16 class_size = sb_size_classes[mid]; + + if (class_size < size) + min = mid + 1; + else + max = mid; + } + + size_class = min; + } + Assert(size <= sb_size_classes[size_class]); + Assert(size_class == 0 || size > sb_size_classes[size_class - 1]); + + /* Attempt the actual allocation. */ + result = sb_alloc_guts(base, region, a, size_class); + if (result == NULL && (flags & SB_ALLOC_SOFT_FAIL) == 0) + sb_out_of_memory_error(a); + return result; +} + +/* + * Free memory allocated via sb_alloc. + */ +void +sb_free(void *ptr) +{ + sb_region *region; + char *fpm_base; + char *base = NULL; + sb_span *span; + LWLock *lock = NULL; + char *superblock; + Size pageno; + Size obsize; + uint16 size_class; + + /* Locate the containing superblock. */ + region = sb_lookup_region(ptr); + fpm_base = fpm_segment_base(region->fpm); + pageno = fpm_pointer_to_page(fpm_base, ptr); + span = sb_map_get(region->pagemap, pageno); + + /* + * If this is a shared-memory region, we might need locking. If so, + * lock the heap. + */ + if (region->seg != NULL) + { + sb_heap *heap = relptr_access(fpm_base, span->parent); + base = fpm_base; + lock = relptr_access(fpm_base, heap->lock); + if (lock != NULL) + LWLockAcquire(lock, LW_EXCLUSIVE); + } + + /* Compute the object size. */ + size_class = span->size_class; + obsize = sb_size_classes[size_class]; + + /* If it's a large object, free the entire span. */ + if (size_class == SB_SCLASS_SPAN_LARGE) + { + sb_heap *heap = relptr_access(base, span->parent); + Size first_page; + + sb_unlink_span(base, heap, span); + first_page = fpm_pointer_to_page(fpm_base, + relptr_access(base, span->start)); + FreePageManagerPut(region->fpm, first_page, span->npages); + sb_free(span); + + /* We're done, but must release any lock first. */ + if (lock != NULL) + LWLockRelease(lock); + } + + /* Put the object on the superblock's freelist. */ + superblock = relptr_access(base, span->start); + Assert(((char *) ptr) >= superblock); + Assert(((char *) ptr) < superblock + SB_SUPERBLOCK_SIZE); + Assert((((char *) ptr) - superblock) % obsize == 0); + * (Size *) ptr = span->firstfree; + span->firstfree = (((char *) ptr) - superblock) / obsize; + span->nallocatable++; + + if (span->nallocatable == 1 && span->fclass == SB_FULLNESS_CLASSES - 1) + { + sb_heap *heap = relptr_access(base, span->parent); + sb_span *new_nextspan; + + /* + * The superblock is completely full and is located in the + * highest-numbered fullness class, which is never scanned for free + * chunks. 
We must move it to the next-lower fullness class. + */ + + sb_unlink_span(base, heap, span); + span->fclass = SB_FULLNESS_CLASSES - 2; + relptr_copy(span->nextspan, heap->spans[SB_FULLNESS_CLASSES - 2]); + relptr_store(base, span->prevspan, (sb_span *) NULL); + new_nextspan = relptr_access(base, + heap->spans[SB_FULLNESS_CLASSES - 2]); + if (new_nextspan != NULL) + relptr_store(base, new_nextspan->prevspan, span); + relptr_store(base, heap->spans[SB_FULLNESS_CLASSES - 2], span); + } + else if (span->nallocatable == span->nmax && (span->fclass != 1 || + !relptr_is_null(span->prevspan))) + { + sb_heap *heap = relptr_access(base, span->parent); + Size first_page; + + /* + * This entire superblock is free, and it's not the active superblock + * for this size class. Return the memory to the free page manager. + * We don't do this for the active superblock to prevent hysteresis: + * if we repeatedly allocate and free the only chunk in the active + * superblock, it will be very inefficient if we deallocate and + * reallocate the superblock every time. + */ + sb_unlink_span(base, heap, span); + first_page = fpm_pointer_to_page(fpm_base, + relptr_access(base, span->start)); + FreePageManagerPut(region->fpm, first_page, span->npages); + + /* + * Span-of-spans superblocks store the span which describes them + * within the superblock itself, so freeing the storage implicitly + * frees the descriptor also. If this is a superblock of any other + * type, we need to separately free the span object also. + */ + if (size_class != SB_SCLASS_SPAN_OF_SPANS) + sb_free(span); + } + + /* If we locked the heap, release the lock. */ + if (lock != NULL) + LWLockRelease(lock); +} + +/* + * Return the size of the chunk that will be used to satisfy a given + * allocation. + */ +Size +sb_alloc_space(Size size) +{ + uint16 size_class; + + /* Large objects allocate full pages. */ + if (size > sb_size_classes[lengthof(sb_size_classes) - 1]) + return FPM_PAGE_SIZE * fpm_size_to_pages(size); + + /* Map request size to a size class. */ + if (size < lengthof(sb_size_class_map) * SB_SIZE_CLASS_MAP_QUANTUM) + { + int mapidx; + + mapidx = ((size + SB_SIZE_CLASS_MAP_QUANTUM - 1) / + SB_SIZE_CLASS_MAP_QUANTUM) - 1; + size_class = sb_size_class_map[mapidx]; + } + else + { + uint16 min = sb_size_class_map[lengthof(sb_size_class_map) - 1]; + uint16 max = lengthof(sb_size_classes) - 1; + while (min < max) + { + uint16 mid = (min + max) / 2; + uint16 class_size = sb_size_classes[mid]; + + if (class_size < size) + min = mid + 1; + else + max = mid; + } + size_class = min; + } + + return sb_size_classes[size_class]; +} + +/* + * Return the size of the chunk used to satisfy a given allocation. + * + * This is roughly an analogue of GetMemoryChunkSpace, but it's hard to make + * a precisely fair comparison. Unlike MemoryContextAlloc/AllocSetAlloc, + * there's no bookkeeping overhead associated with any single allocation; + * the only thing we can really reflect here is the fact that allocations + * will be rounded up to the next larger size class (or, for large allocations, + * to a full FPM page). The storage overhead of the sb_span, sb_map, + * sb_region, and FreePageManager structures is typically spread across + * enough small allocations to make reflecting those costs here difficult. + * + * On the other hand, we also hope that the overhead in question is small + * enough not to matter. The system malloc is not without bookkeeping + * overhead of its own. 
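A concrete illustration of the rounding described above (a sketch; 'a' stands for any allocator created as in the earlier example, and the 112-byte figure assumes the size-class table given earlier):

    /* a: an sb_allocator created earlier (assumed) */
    Size request = 100;
    Size planned = sb_alloc_space(request);   /* 112: the size class the
                                               * request will land in */
    void *obj = sb_alloc(a, request, 0);
    Size actual = sb_chunk_space(obj);        /* also 112 */
    sb_free(obj);

This is the pair the tuplesort changes below rely on: USEMEM() is charged sb_alloc_space(tuplen) at allocation time, and FREEMEM() is credited sb_chunk_space(tuple) when the tuple is freed.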
+ */ +Size +sb_chunk_space(void *ptr) +{ + sb_region *region; + char *fpm_base; + sb_span *span; + Size pageno; + uint16 size_class; + + /* Locate the containing superblock. */ + region = sb_lookup_region(ptr); + fpm_base = fpm_segment_base(region->fpm); + pageno = fpm_pointer_to_page(fpm_base, ptr); + span = sb_map_get(region->pagemap, pageno); + + /* Work out the size of the allocation. */ + size_class = span->size_class; + if (span->size_class == SB_SCLASS_SPAN_LARGE) + return FPM_PAGE_SIZE * span->npages; + else + return sb_size_classes[size_class]; +} + +/* + * Free all memory used by an allocator. + * + * NB: It's not safe to do this while the allocator is in use! + */ +void +sb_reset_allocator(sb_allocator *a) +{ + char *base = NULL; + int heapno; + + /* + * For shared memory allocation, pointers are relative to the start of the + * region. + */ + if (!a->private) + { + sb_region *region = sb_lookup_region(a); + if (region == NULL) + elog(ERROR, "sb_region not found"); + base = region->region_start; + } + + /* + * Iterate through heaps back to front. We do it this way so that + * spans-of-spans are freed last. + */ + for (heapno = SB_NUM_SIZE_CLASSES - 1; heapno >= 0; --heapno) + { + sb_heap *heap = &a->heaps[heapno]; + int fclass; + + for (fclass = 0; fclass < SB_FULLNESS_CLASSES; ++fclass) + { + sb_region *region; + char *superblock; + sb_span *span; + + span = relptr_access(base, heap->spans[fclass]); + while (span != NULL) + { + Size offset; + sb_span *nextspan; + + superblock = relptr_access(base, span->start); + nextspan = relptr_access(base, span->nextspan); + region = sb_lookup_region(superblock); + Assert(region != NULL); + offset = superblock - fpm_segment_base(region->fpm); + Assert(offset % FPM_PAGE_SIZE == 0); + FreePageManagerPut(region->fpm, offset / FPM_PAGE_SIZE, + span->npages); + span = nextspan; + } + } + } +} + +/* + * Allocate an object of the requested size class from the given allocator. + * If necessary, steal or create another superblock. + */ +static char * +sb_alloc_guts(char *base, sb_region *region, sb_allocator *a, int size_class) +{ + sb_heap *heap = &a->heaps[size_class]; + LWLock *lock = relptr_access(base, heap->lock); + sb_span *active_sb; + char *superblock; + char *result; + Size obsize; + + /* If locking is in use, acquire the lock. */ + if (lock != NULL) + LWLockAcquire(lock, LW_EXCLUSIVE); + + /* + * If there's no active superblock, we must successfully obtain one or + * fail the request. + */ + if (relptr_is_null(heap->spans[1]) + && !sb_ensure_active_superblock(base, region, a, heap, size_class)) + { + if (lock != NULL) + LWLockRelease(lock); + return NULL; + } + Assert(!relptr_is_null(heap->spans[1])); + + /* + * There should be a superblock in fullness class 1 at this point, and + * it should never be completely full. Thus we can either pop the + * free list or, failing that, initialize a new object. + */ + active_sb = relptr_access(base, heap->spans[1]); + Assert(active_sb != NULL && active_sb->nallocatable > 0); + superblock = relptr_access(base, active_sb->start); + Assert(size_class < SB_NUM_SIZE_CLASSES); + obsize = sb_size_classes[size_class]; + if (active_sb->firstfree != SB_SPAN_NOTHING_FREE) + { + result = superblock + active_sb->firstfree * obsize; + active_sb->firstfree = * (Size *) result; + } + else + { + result = superblock + active_sb->ninitialized * obsize; + ++active_sb->ninitialized; + } + --active_sb->nallocatable; + + /* If it's now full, move it to the highest-numbered fullness class. 
*/ + if (active_sb->nallocatable == 0) + sb_transfer_first_span(base, heap, 1, SB_FULLNESS_CLASSES - 1); + + /* We're all done. Release the lock. */ + if (lock != NULL) + LWLockRelease(lock); + + return result; +} + +/* + * Ensure an active (i.e. fullness class 1) superblock, unless all existing + * superblocks are completely full and no more can be allocated. + * + * Fullness classes K of 0..N is loosely intended to represent superblocks + * whose utilization percentage is at least K/N, but we only enforce this + * rigorously for the highest-numbered fullness class, which always contains + * exactly those blocks that are completely full. It's otherwise acceptable + * for a superblock to be in a higher-numbered fullness class than the one + * to which it logically belongs. In addition, the active superblock, which + * is always the first block in fullness class 1, is permitted to have a + * higher allocation percentage than would normally be allowable for that + * fullness class; we don't move it until it's completely full, and then + * it goes to the highest-numbered fullness class. + * + * It might seem odd that the active superblock is the head of fullness class + * 1 rather than fullness class 0, but experience with other allocators has + * shown that it's usually better to allocate from a superblock that's + * moderately full rather than one that's nearly empty. Insofar as is + * reasonably possible, we want to avoid performing new allocations in a + * superblock that would otherwise become empty soon. + */ +static bool +sb_ensure_active_superblock(char *base, sb_region *region, sb_allocator *a, + sb_heap *heap, int size_class) +{ + Size obsize = sb_size_classes[size_class]; + Size nmax; + int fclass; + sb_span *span = NULL; + Size npages = 1; + Size first_page; + Size i; + void *ptr; + + /* + * Compute the number of objects that will fit in a superblock of this + * size class. Span-of-spans superblocks are just a single page, and the + * first object isn't available for use because it describes the + * span-of-spans itself. + */ + if (size_class == SB_SCLASS_SPAN_OF_SPANS) + nmax = FPM_PAGE_SIZE / obsize - 1; + else + nmax = SB_SUPERBLOCK_SIZE / obsize; + + /* + * If fullness class 1 is empty, try to find something to put in it by + * scanning higher-numbered fullness classes (excluding the last one, + * whose blocks are certain to all be completely full). + */ + for (fclass = 2; fclass < SB_FULLNESS_CLASSES - 1; ++fclass) + { + sb_span *span; + + span = relptr_access(base, heap->spans[fclass]); + while (span != NULL) + { + int tfclass; + sb_span *nextspan; + sb_span *prevspan; + + /* Figure out what fullness class should contain this. */ + tfclass = (nmax - span->nallocatable) + * (SB_FULLNESS_CLASSES - 1) / nmax; + + /* Look up next span. */ + nextspan = relptr_access(base, span->nextspan); + + /* + * If utilization has dropped enough that this now belongs in + * some other fullness class, move it there. + */ + if (tfclass < fclass) + { + prevspan = relptr_access(base, span->prevspan); + + relptr_copy(span->nextspan, heap->spans[tfclass]); + relptr_store(base, span->prevspan, (sb_span *) NULL); + if (nextspan != NULL) + relptr_copy(nextspan->prevspan, span->prevspan); + if (prevspan != NULL) + relptr_copy(prevspan->nextspan, span->nextspan); + else + relptr_copy(heap->spans[fclass], span->nextspan); + span->fclass = tfclass; + } + + /* Advance to next span on list. */ + span = nextspan; + } + + /* Stop now if we found a suitable superblock. 
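To put numbers on the rebucketing computation above (a sketch; SB_FULLNESS_CLASSES is defined elsewhere in the patch, so the value 4 used here is only an assumption):

    /* A 128-byte size class in a 64kB superblock: */
    Size nmax = SB_SUPERBLOCK_SIZE / 128;      /* 512 objects per superblock */
    Size nallocatable = 128;                   /* i.e. the block is 75% full */
    int  tfclass = (nmax - nallocatable) * (SB_FULLNESS_CLASSES - 1) / nmax;
                                               /* 384 * 3 / 512 = 2, assuming
                                                * there are 4 fullness classes */

    /* A span-of-spans page is the one exception: its nmax is
     * FPM_PAGE_SIZE / sizeof(sb_span) - 1, because the first slot holds the
     * page's own descriptor. */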
*/ + if (!relptr_is_null(heap->spans[1])) + return true; + } + + /* + * If there are no superblocks that properly belong in fullness class 1, + * pick one from some other fullness class and move it there anyway, so + * that we have an allocation target. Our last choice is to transfer a + * superblock that's almost empty (and might become completely empty soon + * if left alone), but even that is better than failing, which is what we + * must do if there are no superblocks at all with freespace. + */ + Assert(relptr_is_null(heap->spans[1])); + for (fclass = 2; fclass < SB_FULLNESS_CLASSES - 1; ++fclass) + if (sb_transfer_first_span(base, heap, fclass, 1)) + return true; + if (relptr_is_null(heap->spans[1]) && + sb_transfer_first_span(base, heap, 0, 1)) + return true; + + /* + * Get an sb_span object to describe the new superblock... unless + * this allocation is for an sb_span object, in which case that's + * surely not going to work. We handle that case by storing the + * sb_span describing an sb_span superblock inline. + */ + if (size_class != SB_SCLASS_SPAN_OF_SPANS) + { + sb_region *span_region = a->private ? NULL : region; + + span = (sb_span *) sb_alloc_guts(base, span_region, a, + SB_SCLASS_SPAN_OF_SPANS); + if (span == NULL) + return false; + npages = SB_PAGES_PER_SUPERBLOCK; + } + + /* Find a region from which to allocate the superblock. */ + if (region == NULL) + { + Assert(a->private); + region = sb_private_region_for_allocator(npages); + } + + /* Try to allocate the actual superblock. */ + if (region == NULL || + !FreePageManagerGet(region->fpm, npages, &first_page)) + { + /* XXX. Free the span, if any. */ + return false; + } + ptr = fpm_page_to_pointer(fpm_segment_base(region->fpm), first_page); + + /* + * If this is a span-of-spans, carve the descriptor right out of + * the allocated space. + */ + if (size_class == SB_SCLASS_SPAN_OF_SPANS) + span = (sb_span *) ptr; + + /* Initialize span and pagemap. */ + sb_init_span(base, span, heap, ptr, npages, size_class); + for (i = 0; i < npages; ++i) + sb_map_set(region->pagemap, first_page + i, span); + + return true; +} + +/* + * Add a new span to fullness class 1 of the indicated heap. + */ +static void +sb_init_span(char *base, sb_span *span, sb_heap *heap, char *ptr, + Size npages, uint16 size_class) +{ + sb_span *head = relptr_access(base, heap->spans[1]); + Size obsize = sb_size_classes[size_class]; + + if (head != NULL) + relptr_store(base, head->prevspan, span); + relptr_store(base, span->parent, heap); + relptr_store(base, span->nextspan, head); + relptr_store(base, span->prevspan, (sb_span *) NULL); + relptr_store(base, heap->spans[1], span); + relptr_store(base, span->start, ptr); + span->npages = npages; + span->size_class = size_class; + span->ninitialized = 0; + if (size_class == SB_SCLASS_SPAN_OF_SPANS) + { + /* + * A span-of-spans contains its own descriptor, so mark one object + * as initialized and reduce the count of allocatable objects by one. + * Doing this here has the side effect of also reducing nmax by one, + * which is important to make sure we free this object at the correct + * time. + */ + span->ninitialized = 1; + span->nallocatable = FPM_PAGE_SIZE / obsize - 1; + } + else if (size_class != SB_SCLASS_SPAN_LARGE) + span->nallocatable = SB_SUPERBLOCK_SIZE / obsize; + span->firstfree = SB_SPAN_NOTHING_FREE; + span->nmax = span->nallocatable; + span->fclass = 1; +} + +/* + * Report an out-of-memory condition. 
+ */ +static void +sb_out_of_memory_error(sb_allocator *a) +{ + if (a->private) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + else + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of shared memory"))); +} + +/* + * Transfer the first span in one fullness class to the head of another + * fullness class. + */ +static bool +sb_transfer_first_span(char *base, sb_heap *heap, int fromclass, int toclass) +{ + sb_span *span; + sb_span *nextspan; + + /* Can't do it if source list is empty. */ + span = relptr_access(base, heap->spans[fromclass]); + if (span == NULL) + return false; + + /* Remove span from source list. */ + nextspan = relptr_access(base, span->nextspan); + relptr_store(base, heap->spans[fromclass], nextspan); + if (nextspan != NULL) + relptr_store(base, nextspan->prevspan, (sb_span *) NULL); + + /* Add span to target list. */ + relptr_copy(span->nextspan, heap->spans[toclass]); + relptr_store(base, heap->spans[toclass], span); + nextspan = relptr_access(base, span->nextspan); + if (nextspan != NULL) + relptr_store(base, nextspan->prevspan, span); + span->fclass = toclass; + + return true; +} + +/* + * Remove span from current list. + */ +static void +sb_unlink_span(char *base, sb_heap *heap, sb_span *span) +{ + sb_span *nextspan = relptr_access(base, span->nextspan); + sb_span *prevspan = relptr_access(base, span->prevspan); + + relptr_store(base, span->prevspan, (sb_span *) NULL); + if (nextspan != NULL) + relptr_copy(nextspan->prevspan, span->prevspan); + if (prevspan != NULL) + relptr_copy(prevspan->nextspan, span->nextspan); + else + relptr_copy(heap->spans[span->fclass], span->nextspan); +} diff --git a/src/backend/utils/mmgr/sb_map.c b/src/backend/utils/mmgr/sb_map.c new file mode 100644 index 0000000..7c629df --- /dev/null +++ b/src/backend/utils/mmgr/sb_map.c @@ -0,0 +1,137 @@ +/*------------------------------------------------------------------------- + * + * sb_map.c + * Superblock allocator page-mapping infrastructure. + * + * The superblock allocator does not store metadata with each chunk, and + * therefore needs a way to find the metadata given only the pointer + * address. The first step is to translate the pointer address to a + * an offset relative to some base address, from which a page number + * can be calculated. Then, this module is reponsible for mapping the + * page number to an offset with the chunk where the associated span + * object is stored. We do this in the simplest possible way: one big + * array. + * + * Span metadata is stored within the same chunk of memory as the span + * itself. Therefore, we can assume that the offset is less than 4GB + * whenever we're managing less than 4GB of pages, and use 4 byte + * offsets. When we're managing more than 4GB of pages, we use 8 byte + * offsets. (This could probably be optimized; for example, we could use + * 6 byte offsets for allocation sizes up to 256TB; also, if we assumed + * that the span object must itself be 2, 4, or 8 byte aligned, we could + * extend the cutoff point for offsets of any given length by a similar + * multiple. It's not clear that the extra math would be worthwhile.) 
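To give a feel for the cost on a 64-bit build (a sketch using the sizing function below; seg_base is a hypothetical base address of a shared segment smaller than 4GB):

    /* seg_base: base address of some shared segment under 4GB (assumed) */
    Size npages = (64 * 1024 * 1024) / FPM_PAGE_SIZE;     /* 16384 pages, 64MB */
    Size shared_map  = sb_map_size(seg_base, npages);     /* 4-byte entries,
                                                            * roughly 64kB */
    Size private_map = sb_map_size(NULL, npages);         /* 8-byte entries,
                                                            * roughly 128kB */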
+ * + * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/backend/utils/mmgr/sb_map.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "storage/shmem.h" +#include "utils/freepage.h" +#include "utils/sb_map.h" + +const uint64 maxpages_4b = UINT64CONST(0x100000000) / FPM_PAGE_SIZE; + +struct sb_map +{ + relptr(sb_map) self; + Size offset; + Size npages; + bool use64; +}; + +/* Map layout for segments less than 4GB. */ +typedef struct sb_map32 +{ + sb_map hdr; + uint32 map[FLEXIBLE_ARRAY_MEMBER]; +} sb_map32; + +/* Map layout for segments less than 8GB. */ +typedef struct sb_map64 +{ + sb_map hdr; + uint64 map[FLEXIBLE_ARRAY_MEMBER]; +} sb_map64; + +#define sb_map_base(m) \ + (((char *) m) - m->self.relptr_off) + +/* + * Compute the amount of space required for an sb_map covering a given + * number of pages. Note that for shared memory (i.e. when base != NULL), + * we assume that the pointers will always point to addresses within that + * same segment, but for backend-private memory that might not be the case. + */ +Size +sb_map_size(char *base, Size npages) +{ + Size map_bytes; + + if (sizeof(Size) <= 4 || (base != NULL && npages < maxpages_4b)) + map_bytes = add_size(offsetof(sb_map32, map), + mul_size(npages, sizeof(uint32))); + else + map_bytes = add_size(offsetof(sb_map64, map), + mul_size(npages, sizeof(uint64))); + + return map_bytes; +} + +/* + * Initialize an sb_map. Storage is provided by the caller. Note that we + * don't zero the array; the caller shouldn't try to get a value that hasn't + * been set. + */ +void +sb_map_initialize(sb_map *m, char *base, Size offset, Size npages) +{ + relptr_store(base, m->self, m); + m->offset = offset; + m->npages = npages; + if (sizeof(Size) <= 4 || (base != NULL && npages < maxpages_4b)) + m->use64 = false; + else + m->use64 = true; +} + +/* + * Store a value into an sb_map. + */ +void +sb_map_set(sb_map *m, Size pageno, void *ptr) +{ + char *base = sb_map_base(m); + Assert(pageno >= m->offset); + pageno -= m->offset; + Assert(pageno < m->npages); + + if (m->use64) + ((sb_map64 *) m)->map[pageno] = (uint64) (((char *) ptr) - base); + else + ((sb_map32 *) m)->map[pageno] = (uint32) (((char *) ptr) - base); +} + +/* + * Get a value from an sb_map. Getting a value not previously stored will + * produce an undefined result, so don't do that. + */ +void * +sb_map_get(sb_map *m, Size pageno) +{ + char *base = sb_map_base(m); + Assert(pageno >= m->offset); + pageno -= m->offset; + Assert(pageno < m->npages); + + if (m->use64) + return base + ((sb_map64 *) m)->map[pageno]; + else + return base + ((sb_map32 *) m)->map[pageno]; +} diff --git a/src/backend/utils/mmgr/sb_region.c b/src/backend/utils/mmgr/sb_region.c new file mode 100644 index 0000000..1c51563 --- /dev/null +++ b/src/backend/utils/mmgr/sb_region.c @@ -0,0 +1,744 @@ +/*------------------------------------------------------------------------- + * + * sb_region.c + * Superblock allocator memory region manager. + * + * The superblock allocator operates on ranges of pages managed by a + * FreePageManager and reverse-mapped by an sb_map. When it's asked to + * free an object, it just gets a pointer address; our job is to figure + * out which page range contains that object and locate the + * FreePageManager, sb_map, and other metadata that the superblock + * allocator will need to do its thing. 
Moreover, when allocating an + * object, the caller is only required to provide the superblock allocator + * with a pointer to the sb_allocator object, which could be in either + * shared or backend-private memory; our job again is to know which it + * is and provide pointers to the appropriate supporting data structures. + * To do all this, we have to keep track of where all dynamic shared memory + * segments configured for memory allocation are located; and we also have + * to keep track of where all chunks of memory obtained from the operating + * system for backend-private allocations are located. + * + * On a 32-bit system, the number of chunks can never get very big, so + * we just store them all in a single array and use binary search for + * lookups. On a 64-bit system, this might get dicey, so we maintain + * one such array for every 4GB of address space; chunks that span a 4GB + * boundary require multiple entries. + * + * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/backend/utils/mmgr/sb_region.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "utils/sb_region.h" + +/* + * On 64-bit systems, we use a two-level radix tree to find the data for + * the relevant 4GB range. The radix tree is deliberately unbalanced, with + * more entries at the first level than at the second level. We expect this + * to save memory, because the first level has a cache, and the full array + * is only instantiated if the cache overflows. Since each L2 entry + * covers 2^44 bytes of address space (16TB), we expect overflows of the + * four-entry cache to happen essentially never. + */ +#define SB_LOOKUP_ROOT_BITS 20 +#define SB_LOOKUP_ROOT_ENTRIES (1 << SB_LOOKUP_ROOT_BITS) +#define SB_LOOKUP_ROOT_CACHE_SIZE 4 +#define SB_LOOKUP_L2_BITS 12 +#define SB_LOOKUP_L2_ENTRIES (1 << SB_LOOKUP_L2_BITS) + +/* Lookup data for a 4GB range of address space. */ +typedef struct +{ + int nused; + int nallocated; + sb_region **region; +} sb_lookup_leaf; + +/* Lookup data for a 16TB range of address space, direct mapped. */ +typedef struct +{ + sb_lookup_leaf *leaf[SB_LOOKUP_L2_ENTRIES]; +} sb_lookup_l2; + +/* Lookup data for an entire 64-bit address space. */ +typedef struct +{ + uint32 cache_key[SB_LOOKUP_ROOT_CACHE_SIZE]; + sb_lookup_l2 *cache_value[SB_LOOKUP_ROOT_CACHE_SIZE]; + sb_lookup_l2 **l2; +} sb_lookup_root; + +/* Toplevel address lookup structure. */ +#if SIZEOF_SIZE_T > 4 +static sb_lookup_root lookup_root; +#else +static sb_lookup_leaf lookup_root_leaf; +#endif + +/* + * Backend-private chunks binned by maximum contiguous freespace. Lists are + * doubly-linked using fl_node. List 0 contains regions with no internal + * no free pages at all. List I, for I>0, contains regions where the number + * of contiguous free pages is no larger than 2^(I-1), except for the last + * list which contains everything with too many pages for any other list. + * A region may be on a higher-numbered list than where it actually belongs, + * but it cannot be any lower. Thus it's safe to assume that searching + * lower-numbered lists is always pointless, but higher-numbered lists may + * contain regions that can't actually satisfy a requested allocation. + */ +#define NUM_PRIVATE_FREELISTS 16 +static dlist_head private_freelist[NUM_PRIVATE_FREELISTS]; + +/* + * Constants to set the size of backend-private regions. 
Superblocks are + * 16 pages each (64kB), and we want a number of superblocks to fit inside + * each region, so these need to be pretty good-sized. The actual + * allocations will be a bit larger than the values indicated here, because + * we add a bit of space for bookkeeping. These values are in units of + * FPM_PAGE_SIZE. + */ +#define SB_REGION_INITSIZE (16 * SB_PAGES_PER_SUPERBLOCK) +#define SB_REGION_MAXSIZE ((64 * 1024 * 1024) / FPM_PAGE_SIZE) + +static Size sb_private_pages_allocated = 0; +static Size sb_private_bytes_allocated = 0; +static Size sb_peak_private_bytes_allocated = 0; + +/* Static functions. */ +static bool sb_adjust_lookup(sb_region *region, bool insert); +static bool sb_adjust_lookup_leaf(sb_lookup_leaf *leaf, sb_region *region, + bool insert); +static void sb_dump_regions_leaf(sb_region *last_region, sb_lookup_leaf *leaf); +#if SIZEOF_SIZE_T > 4 +static sb_lookup_leaf *sb_find_leaf(Size highbits, bool insert); +#endif +static void *system_calloc(Size count, Size s); +static void system_free(void *p, Size s); +static void *system_malloc(Size s); + +/* + * Dump debugging information for sb_region objects. + */ +void +sb_dump_regions(void) +{ +#if SIZEOF_SIZE_T > 4 + sb_region *last_region = NULL; + + if (lookup_root.l2 != NULL) + { + int i; + int j; + + for (i = 0; i < SB_LOOKUP_ROOT_ENTRIES; ++i) + { + sb_lookup_l2 *l2 = lookup_root.l2[i]; + + if (l2 == NULL) + continue; + for (j = 0; j < SB_LOOKUP_L2_ENTRIES; ++j) + { + sb_lookup_leaf *leaf = l2->leaf[j]; + + if (leaf != NULL) + { + sb_dump_regions_leaf(last_region, leaf); + last_region = leaf->region[leaf->nused - 1]; + } + } + } + } + else + { + bool first = true; + Size highbits = 0; + + for (;;) + { + int i; + int j; + int n = -1; + sb_lookup_l2 *l2; + + /* Find next L2 entry to visit. */ + for (i = 0; i < SB_LOOKUP_ROOT_CACHE_SIZE; ++i) + { + if (lookup_root.cache_value[i] != NULL && + (first || lookup_root.cache_key[i] > highbits)) + n = i; + } + if (n == -1) + break; + first = false; + highbits = lookup_root.cache_key[n]; + + /* Dump this L2 entry. */ + l2 = lookup_root.cache_value[n]; + for (j = 0; j < SB_LOOKUP_L2_ENTRIES; ++j) + { + sb_lookup_leaf *leaf = l2->leaf[j]; + + if (leaf != NULL) + { + sb_dump_regions_leaf(last_region, leaf); + last_region = leaf->region[leaf->nused - 1]; + } + } + } + } +#else + sb_dump_regions_leaf(NULL, lookup_root_leaf); +#endif + + fprintf(stderr, "== overall statistics ==\n"); + fprintf(stderr, "private bytes now: %zu, peak %zu\n", + sb_private_bytes_allocated, + Max(sb_private_bytes_allocated, sb_peak_private_bytes_allocated)); +} + +/* + * Find the region to which a pointer belongs. + */ +sb_region * +sb_lookup_region(void *ptr) +{ + Size p = (Size) ptr; + sb_lookup_leaf *leaf = NULL; + int high, low; + + /* + * If this is a 64-bit system, locate the lookup table that pertains + * to the upper 32 bits of ptr. On a 32-bit system, there's only one + * lookup table. + */ +#if SIZEOF_SIZE_T > 4 + { + Size highbits = p >> 32; + static Size last_highbits = 0; + static sb_lookup_leaf *last_leaf = NULL; + + /* Quick test to see if we're in same range as before. */ + if (last_highbits == highbits && last_leaf != NULL) + leaf = last_leaf; + else + { + leaf = sb_find_leaf(highbits, false); + + /* No lookup table for this 4GB range? OK, no matching region. */ + if (leaf == NULL) + return NULL; + + /* Remember results of this lookup for next time. 
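For concreteness, here is how a 64-bit address is carved up by this lookup (a sketch; the root and leaf index arithmetic is the same computation sb_find_leaf performs further down, and the example address is arbitrary):

    Size p = (Size) ptr;                      /* e.g. 0x00007f3a12345678 */
    Size highbits = p >> 32;                  /* 0x00007f3a: selects the leaf,
                                               * which covers 4GB of space */
    Size rootbits = (highbits >> SB_LOOKUP_L2_BITS) &
                    (SB_LOOKUP_ROOT_ENTRIES - 1);           /* 0x7: L2 table,
                                                             * covering 16TB */
    Size leafidx  = highbits & (SB_LOOKUP_L2_ENTRIES - 1);  /* 0xf3a */
    /* The low 32 bits are then resolved by binary search within the leaf. */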
*/ + last_highbits = highbits; + last_leaf = leaf; + } + } +#else + leaf = &lookup_root_leaf; +#endif + + /* Now we use binary search on the sb_lookup_leaf. */ + high = leaf->nused; + low = 0; + while (low < high) + { + int mid; + sb_region *region; + + mid = (high + low) / 2; + region = leaf->region[mid]; + if (region->region_start > (char *) ptr) + high = mid; + else if (region->region_start + region->region_size < (char *) ptr) + low = mid + 1; + else + return region; + } + + return NULL; +} + +/* + * When a backend-private sb_allocator needs more memory, it calls this + * function. We search the existing backend-private regions for one capable + * of satisfying the request; if none found, we must create a new region. + */ +sb_region * +sb_private_region_for_allocator(Size npages) +{ + int freelist = Min(fls(npages), NUM_PRIVATE_FREELISTS); + Size new_region_net_pages; + Size metadata_bytes; + char *region_start; + Size region_size; + sb_region *region; + + Assert(npages > 0); + + while (freelist < NUM_PRIVATE_FREELISTS) + { + dlist_mutable_iter iter; + Size threshold = 1 << (freelist - 1); + + dlist_foreach_modify(iter, &private_freelist[freelist]) + { + sb_region *region; + Size largest; + + region = dlist_container(sb_region, fl_node, iter.cur); + + /* + * Quickly skip regions which appear to have enough space to + * belong on this freelist but which don't have enough space to + * satisfy the request, to avoid probing every region on the list + * for its exact free space on every trip through. + */ + if (region->contiguous_pages >= threshold && + region->contiguous_pages < npages) + continue; + + /* + * We're going to either use this region or move it to a + * lower-numbered freelist or both, so determine the precise size + * of the largest remaining run of pages. + */ + largest = FreePageManagerInquireLargest(region->fpm); + region->contiguous_pages = largest; + + /* + * The region we're examining not only doesn't have enough + * contiguous freespace to satisfy this allocation, but it + * doesn't even belong in this bucket. Move it to the right place. + */ + if (largest < threshold) + { + int new_freelist = Min(fls(largest), NUM_PRIVATE_FREELISTS); + + dlist_delete(iter.cur); + dlist_push_head(&private_freelist[new_freelist], + ®ion->fl_node); + } + + /* + * If the region is big enough, use it. For larger allocations + * this might be suboptimal, because we might carve space out of a + * chunk that's bigger than we really need rather than locating + * the best fit across all chunks. It shouldn't be too far off, + * though, because chunks with way more contiguous space available + * will be on a higher-numbered freelist. + * + * NB: For really large backend-private allocations, it's probably + * better to malloc() directly than go through this machinery. + */ + if (largest >= npages) + return region; + } + + /* Try next freelist. */ + ++freelist; + } + + /* + * There is no existing backend-private region with enough freespace + * to satisfy the request, so we'll need to create a new one. First + * step is to figure out how many pages we should try to obtain. + */ + for (new_region_net_pages = SB_REGION_INITSIZE; + new_region_net_pages < sb_private_pages_allocated && + new_region_net_pages < SB_REGION_MAXSIZE; new_region_net_pages *= 2) + ; + if (new_region_net_pages < npages) + new_region_net_pages = npages; + + /* Try to allocate space from the operating system. */ + for (;;) + { + /* + * Compute space required for metadata and determine raw allocation + * size. 
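Worked numbers for the search and sizing logic above (a sketch; the 16-page superblock size is taken from the region-size comment above, and fls() is the usual find-last-set):

    /* A request for 5 pages starts scanning at freelist Min(fls(5), 16) = 3;
     * regions on lower-numbered lists cannot hold a large enough run. */

    /* Region sizes, given 16-page (64kB) superblocks: */
    Size init_pages = 16 * SB_PAGES_PER_SUPERBLOCK;        /* 256 pages, 1MB   */
    Size max_pages  = (64 * 1024 * 1024) / FPM_PAGE_SIZE;  /* 16384 pages, 64MB */
    /* New regions double from 1MB toward the 64MB cap as the total number of
     * private pages grows, and are never smaller than the request itself. */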
+ */ + metadata_bytes = MAXALIGN(sizeof(sb_region)); + metadata_bytes += MAXALIGN(sizeof(FreePageManager)); + metadata_bytes += MAXALIGN(sb_map_size(NULL, new_region_net_pages)); + if (metadata_bytes % FPM_PAGE_SIZE != 0) + metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE); + region_size = new_region_net_pages * FPM_PAGE_SIZE + metadata_bytes; + + /* Try to allocate memory. */ + region_start = system_malloc(region_size); + if (region_start != NULL) + break; + + /* Too big; if possible, loop and try a smaller allocation. */ + if (new_region_net_pages == npages) + return NULL; + new_region_net_pages = Max(new_region_net_pages / 2, npages); + } + + /* + * Initialize region object. + * + * NB: We temporarily set region->contiguous_pages to a value one more + * than the actual number. This is because calling FreePageManagerPut + * will provoke a callback to sb_report_contiguous_freespace, which we + * want to exit quickly and, in particular, without deallocating the + * region. + */ + region = (sb_region *) region_start; + region->region_start = region_start; + region->region_size = region_size; + region->usable_pages = new_region_net_pages; + sb_private_pages_allocated += region->usable_pages; + region->seg = NULL; + region->allocator = NULL; + region->fpm = (FreePageManager *) + (region_start + MAXALIGN(sizeof(sb_region))); + region->pagemap = (sb_map *) + (((char *) region->fpm) + MAXALIGN(sizeof(FreePageManager))); + region->contiguous_pages = new_region_net_pages + 1; + + /* Initialize supporting data structures. */ + FreePageManagerInitialize(region->fpm, region->region_start, NULL, false); + FreePageManagerPut(region->fpm, metadata_bytes / FPM_PAGE_SIZE, + new_region_net_pages); + sb_map_initialize(region->pagemap, NULL, metadata_bytes / FPM_PAGE_SIZE, + new_region_net_pages); + region->contiguous_pages = new_region_net_pages; /* Now fix the value. */ + freelist = Min(fls(new_region_net_pages), NUM_PRIVATE_FREELISTS); + dlist_push_head(&private_freelist[freelist], ®ion->fl_node); + sb_adjust_lookup(region, true); + + /* Time to rock and roll. */ + return region; +} + +/* + * When a free page manager detects that the maximum contiguous freespace in + * a backend-private region has increased, it calls this function. Our job + * is to free the region completely if there are no remaining allocatons, + * and otherwise to + */ +void +sb_report_contiguous_freespace(sb_region *region, Size npages) +{ + int old_freelist; + int new_freelist; + + /* This should only be called for private regions. */ + Assert(region->seg == NULL); + Assert(region->allocator == NULL); + + /* + * If there have been allocations from the region since the last report, + * it's possible that the number of pages reported is less than what we + * already know about. In that case, exit quickly; else update our + * cached value. + */ + if (npages < region->contiguous_pages) + return; + + /* + * If the entire region is free, deallocate it. The sb_region, + * FreePageManager, and sb_map for the region is stored within it, so + * they all go away when we free the managed space. + */ + if (npages == region->usable_pages) + { + char *region_start = region->region_start; + Size region_size = region->region_size; + + /* Pull the region out of the lookup table. */ + sb_adjust_lookup(region, false); + + /* Remove the region object from the private freelist. */ + dlist_delete(®ion->fl_node); + + /* Decrement count of private pages allocated. 
*/ + Assert(sb_private_pages_allocated >= region->usable_pages); + sb_private_pages_allocated -= region->usable_pages; + + /* Return the managed space to the operating system. */ + system_free(region_start, region_size); + return; + } + + /* If necessary, move the region to a higher-numbered freelist. */ + old_freelist = Min(fls(region->contiguous_pages), NUM_PRIVATE_FREELISTS); + new_freelist = Min(fls(npages), NUM_PRIVATE_FREELISTS); + if (new_freelist > old_freelist) + { + dlist_delete(®ion->fl_node); + dlist_push_head(&private_freelist[new_freelist], ®ion->fl_node); + } + + /* Record the reported value for future calls to this function. */ + region->contiguous_pages = npages; +} + +/* + * Insert a region into, or delete a region from, the address-based lookup + * tables. Returns true on success and false if we fail due to memory + * exhaustion; delete always succeeds. + */ +static bool +sb_adjust_lookup(sb_region *region, bool insert) +{ + bool ok = true; + + /* + * If this is a 64-bit system, we need to loop over all of the relevant + * tables and update each one. On a 32-bit system, there's only one table + * and we simply update that. + */ +#if SIZEOF_SIZE_T > 4 + Size tabstart; + Size tabstop; + Size i; + + tabstart = ((Size) region->region_start) >> 32; + tabstop = ((Size) region->region_start + region->region_size - 1) >> 32; + + for (i = tabstart; i <= tabstop; ++i) + { + sb_lookup_leaf *leaf = sb_find_leaf(i, insert); + + /* + * Finding the leaf might fail if we're inserting and can't allocate + * memory for a new lookup table. Even if we get the leaf, inserting + * the new region pointer into it might also fail for lack of memory. + */ + Assert(insert || leaf != NULL); + if (leaf == NULL) + ok = false; + else + ok = sb_adjust_lookup_leaf(leaf, region, insert); + + if (!ok) + { + /* We ran out of memory; back out changes already made. */ + ok = false; + tabstop = i - 1; + for (i = tabstart; i <= tabstop; ++i) + sb_adjust_lookup_leaf(sb_find_leaf(i, false), region, false); + break; + } + } +#else + ok = sb_adjust_lookup_leaf(&lookup_root_leaf, region, insert); +#endif + + return ok; +} + +/* + * Insert a region into, or remove a region from, a particular sb_lookup_leaf. + * Returns true on success and false if we fail due to memory exhaustion; + * delete always succeeds. + */ +static bool +sb_adjust_lookup_leaf(sb_lookup_leaf *leaf, sb_region *region, bool insert) +{ + int high, low; + + /* If we're inserting, we might need to allocate more space. */ + if (insert && leaf->nused >= leaf->nallocated) + { + Size newsize; + sb_region **newtab; + + newsize = leaf->nallocated == 0 ? 16 : leaf->nallocated * 2; + newtab = system_malloc(sizeof(sb_region *) * newsize); + if (newtab == NULL) + return false; + if (leaf->nused > 0) + memcpy(newtab, leaf->region, sizeof(sb_region *) * leaf->nused); + if (leaf->region != NULL) + system_free(leaf->region, sizeof(sb_region *) * leaf->nallocated); + leaf->nallocated = newsize; + leaf->region = newtab; + } + + /* Use binary search on the sb_lookup_leaf. */ + high = leaf->nused; + low = 0; + while (low < high) + { + int mid; + sb_region *candidate; + + mid = (high + low) / 2; + candidate = leaf->region[mid]; + if (candidate->region_start > region->region_start) + high = mid; + else if (candidate->region_start < region->region_start) + low = mid + 1; + else + low = high = mid; + } + + /* Really do it. 
*/ + if (insert) + { + Assert(low == leaf->nused || + leaf->region[low]->region_start > region->region_start); + if (low < leaf->nused) + memmove(&leaf->region[low + 1], &leaf->region[low], + sizeof(sb_region *) * (leaf->nused - low)); + leaf->region[low] = region; + ++leaf->nused; + } + else + { + Assert(leaf->region[low] == region); + if (low < leaf->nused - 1) + memmove(&leaf->region[low], &leaf->region[low + 1], + sizeof(sb_region *) * (leaf->nused - low - 1)); + --leaf->nused; + } + + return true; +} + +/* + * Dump debugging information for the regions covered by a single + * sb_lookup_leaf. Skip the first one if it's the same as last_region. + */ +static void +sb_dump_regions_leaf(sb_region *last_region, sb_lookup_leaf *leaf) +{ + int i; + + for (i = 0; i < leaf->nused; ++i) + { + sb_region *region = leaf->region[i]; + + if (i == 0 && region == last_region) + continue; + fprintf(stderr, "== region at %p [%zu bytes, %zu usable pages] ==\n", + region->region_start, region->region_size, + region->usable_pages); + fprintf(stderr, "%s\n\n", FreePageManagerDump(region->fpm)); + } +} + +#if SIZEOF_SIZE_T > 4 +static sb_lookup_leaf * +sb_find_leaf(Size highbits, bool insert) +{ + Size rootbits; + sb_lookup_l2 *l2 = NULL; + sb_lookup_leaf **leafptr; + int i; + int unused = -1; + + rootbits = (highbits >> SB_LOOKUP_L2_BITS) & (SB_LOOKUP_ROOT_ENTRIES - 1); + + /* Check for L2 entry in toplevel cache. */ + for (i = 0; i < SB_LOOKUP_ROOT_CACHE_SIZE; ++i) + { + if (lookup_root.cache_value[i] == NULL) + unused = i; + else if (lookup_root.cache_key[i] == rootbits) + l2 = lookup_root.cache_value[i]; + } + + /* If no hit, check the full L2 loookup table, if it's been initialized. */ + if (l2 == NULL && lookup_root.l2 != NULL) + { + rootbits &= SB_LOOKUP_ROOT_ENTRIES - 1; + l2 = lookup_root.l2[rootbits]; + + /* Pull entry into cache. */ + if (l2 != NULL) + { + /* + * No need to be smart about replacement policy; we expect to + * arrive here virtually never. + */ + i = highbits % SB_LOOKUP_ROOT_CACHE_SIZE; + lookup_root.cache_key[i] = highbits; + lookup_root.cache_value[i] = l2; + } + } + + /* If no L2 entry found, create one if inserting else give up. */ + if (l2 == NULL) + { + if (!insert) + return NULL; + l2 = system_calloc(1, sizeof(sb_lookup_l2)); + if (l2 == NULL) + return NULL; + if (unused != -1) + { + lookup_root.cache_key[unused] = rootbits; + lookup_root.cache_value[unused] = l2; + } + else if (lookup_root.l2 != NULL) + lookup_root.l2[rootbits] = l2; + else + { + lookup_root.l2 = system_calloc(SB_LOOKUP_ROOT_ENTRIES, + sizeof(sb_lookup_l2 *)); + if (lookup_root.l2 == NULL) + { + system_free(l2, sizeof(sb_lookup_l2)); + return NULL; + } + for (i = 0; i < SB_LOOKUP_ROOT_CACHE_SIZE; ++i) + lookup_root.l2[lookup_root.cache_key[i]] = + lookup_root.cache_value[i]; + } + } + + /* Find slot for entry, and try to initialize it if needed. */ + leafptr = &l2->leaf[highbits & (SB_LOOKUP_L2_ENTRIES - 1)]; + if (insert && *leafptr == NULL) + *leafptr = system_calloc(1, sizeof(sb_lookup_leaf)); + + return *leafptr; +} +#endif + +/* + * calloc() wrapper, to track bytes allocated. + */ +static void * +system_calloc(Size count, Size s) +{ + void *p = calloc(count, s); + + if (p != NULL) + sb_private_bytes_allocated += count * s; + return p; +} + +/* + * free() wrapper, to track bytes allocated. 
+ */
+static void
+system_free(void *p, Size s)
+{
+	free(p);
+	if (sb_private_bytes_allocated > sb_peak_private_bytes_allocated)
+		sb_peak_private_bytes_allocated = sb_private_bytes_allocated;
+	sb_private_bytes_allocated -= s;
+}
+
+/*
+ * malloc() wrapper, to track bytes allocated.
+ */
+static void *
+system_malloc(Size s)
+{
+	void	   *p = malloc(s);
+
+	if (p != NULL)
+		sb_private_bytes_allocated += s;
+	return p;
+}
diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c
index 8b520c1..120dee7 100644
--- a/src/backend/utils/sort/tuplesort.c
+++ b/src/backend/utils/sort/tuplesort.c
@@ -113,6 +113,7 @@
 #include "utils/memutils.h"
 #include "utils/pg_rusage.h"
 #include "utils/rel.h"
+#include "utils/sb_alloc.h"
 #include "utils/sortsupport.h"
 #include "utils/tuplesort.h"
 
@@ -216,6 +217,7 @@ struct Tuplesortstate
 	int			maxTapes;		/* number of tapes (Knuth's T) */
 	int			tapeRange;		/* maxTapes-1 (Knuth's P) */
 	MemoryContext sortcontext;	/* memory context holding all sort data */
+	sb_allocator *sortallocator;	/* superblock allocator holding sort data */
 	LogicalTapeSet *tapeset;	/* logtape.c object for tapes in a temp file */
 
 	/*
@@ -728,6 +730,7 @@ tuplesort_begin_index_btree(Relation heapRel,
 {
 	Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess);
 	MemoryContext oldcontext;
+	state->sortallocator = sb_create_private_allocator();
 
 	oldcontext = MemoryContextSwitchTo(state->sortcontext);
 
@@ -1845,8 +1848,11 @@ inittapes(Tuplesortstate *state)
 
 #ifdef TRACE_SORT
 	if (trace_sort)
-		elog(LOG, "switching to external sort with %d tapes: %s",
-			 maxTapes, pg_rusage_show(&state->ru_start));
+	{
+		long		spaceUsed = (state->allowedMem - state->availMem + 1023) / 1024;
+		elog(LOG, "switching to external sort with %d tapes (%ld KB used): %s",
+			 maxTapes, spaceUsed, pg_rusage_show(&state->ru_start));
+	}
 #endif
 
 	/*
@@ -3340,9 +3346,9 @@ copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup)
 	IndexTuple	newtuple;
 
 	/* copy the tuple into sort storage */
-	newtuple = (IndexTuple) palloc(tuplen);
+	newtuple = (IndexTuple) sb_alloc(state->sortallocator, tuplen, 0);
 	memcpy(newtuple, tuple, tuplen);
-	USEMEM(state, GetMemoryChunkSpace(newtuple));
+	USEMEM(state, sb_alloc_space(tuplen));
 	stup->tuple = (void *) newtuple;
 	/* set up first-column key value */
 	stup->datum1 = index_getattr(newtuple,
@@ -3366,8 +3372,8 @@ writetup_index(Tuplesortstate *state, int tapenum, SortTuple *stup)
 		LogicalTapeWrite(state->tapeset, tapenum,
 						 (void *) &tuplen, sizeof(tuplen));
 
-	FREEMEM(state, GetMemoryChunkSpace(tuple));
-	pfree(tuple);
+	FREEMEM(state, sb_chunk_space(tuple));
+	sb_free(tuple);
 }
 
 static void
@@ -3375,9 +3381,9 @@ readtup_index(Tuplesortstate *state, SortTuple *stup,
 			  int tapenum, unsigned int len)
 {
 	unsigned int tuplen = len - sizeof(unsigned int);
-	IndexTuple	tuple = (IndexTuple) palloc(tuplen);
+	IndexTuple	tuple = (IndexTuple) sb_alloc(state->sortallocator, tuplen, 0);
 
-	USEMEM(state, GetMemoryChunkSpace(tuple));
+	USEMEM(state, sb_chunk_space(tuple));
 	LogicalTapeReadExact(state->tapeset, tapenum,
 						 tuple, tuplen);
 	if (state->randomAccess)	/* need trailing length word? */
diff --git a/src/include/utils/freepage.h b/src/include/utils/freepage.h
new file mode 100644
index 0000000..dd905d7
--- /dev/null
+++ b/src/include/utils/freepage.h
@@ -0,0 +1,101 @@
+/*-------------------------------------------------------------------------
+ *
+ * freepage.h
+ *	  Management of page-organized free memory.
+ *
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/utils/freepage.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef FREEPAGE_H
+#define FREEPAGE_H
+
+#include "storage/lwlock.h"
+#include "utils/relptr.h"
+
+/* Forward declarations. */
+typedef struct FreePageSpanLeader FreePageSpanLeader;
+typedef struct FreePageBtree FreePageBtree;
+typedef struct FreePageManager FreePageManager;
+
+/*
+ * PostgreSQL normally uses 8kB pages for most things, but many common
+ * architecture/operating system pairings use a 4kB page size for memory
+ * allocation, so we do that here also.  We assume that a large allocation
+ * is likely to begin on a page boundary; if not, we'll discard bytes from
+ * the beginning and end of the object and use only the middle portion that
+ * is properly aligned.  This works, but is not ideal, so it's best to keep
+ * this conservatively small.  There don't seem to be any common
+ * architectures where the page size is less than 4kB, so this should be
+ * good enough; also, making it smaller would increase the space consumed
+ * by the address space map, which also uses this page size.
+ */
+#define FPM_PAGE_SIZE			4096
+
+/*
+ * Each freelist except for the last contains only spans of one particular
+ * size.  Everything larger goes on the last one.  In some sense this seems
+ * like a waste since most allocations are in a few common sizes, but it
+ * means that small allocations can simply pop the head of the relevant list
+ * without needing to worry about whether the object we find there is of
+ * precisely the correct size (because we know it must be).
+ */
+#define FPM_NUM_FREELISTS		129
+
+/* Everything we need in order to manage free pages (see freepage.c) */
+struct FreePageManager
+{
+	relptr(FreePageManager) self;
+	relptr(LWLock) lock;
+	bool		lock_address_is_fixed;
+	relptr(FreePageBtree) btree_root;
+	relptr(FreePageSpanLeader) btree_recycle;
+	unsigned	btree_depth;
+	unsigned	btree_recycle_count;
+	Size		singleton_first_page;
+	Size		singleton_npages;
+	Size		largest_reported_chunk;
+	relptr(FreePageSpanLeader) freelist[FPM_NUM_FREELISTS];
+};
+
+/* Macros to convert between page numbers (expressed as Size) and pointers. */
+#define fpm_page_to_pointer(base, page) \
+	(AssertVariableIsOfTypeMacro(page, Size), \
+	 (base) + FPM_PAGE_SIZE * (page))
+#define fpm_pointer_to_page(base, ptr) \
+	(((Size) (((char *) (ptr)) - (base))) / FPM_PAGE_SIZE)
+
+/* Macro to convert an allocation size to a number of pages. */
+#define fpm_size_to_pages(sz) \
+	(((sz) + FPM_PAGE_SIZE - 1) / FPM_PAGE_SIZE)
+
+/* Macros to check alignment of absolute and relative pointers. */
+#define fpm_pointer_is_page_aligned(base, ptr) \
+	(((Size) (((char *) (ptr)) - (base))) % FPM_PAGE_SIZE == 0)
+#define fpm_relptr_is_page_aligned(base, relptr) \
+	((relptr).relptr_off % FPM_PAGE_SIZE == 0)
+
+/* Macro to find base address of the segment containing a FreePageManager. */
+#define fpm_segment_base(fpm) \
+	(((char *) (fpm)) - (fpm)->self.relptr_off)
+
+/* Macro to find the lwlock for the FreePageManager. */
+#define fpm_lock(fpm) \
+	(relptr_access((fpm)->lock_address_is_fixed ? NULL : \
+				   fpm_segment_base(fpm), (fpm)->lock))
+
+/* Functions to manipulate the free page map. */
+extern void FreePageManagerInitialize(FreePageManager *fpm, char *base,
+						  LWLock *lock, bool lock_address_is_fixed);
+extern bool FreePageManagerGet(FreePageManager *fpm, Size npages,
+				   Size *first_page);
+extern void FreePageManagerPut(FreePageManager *fpm, Size first_page,
+				   Size npages);
+extern Size FreePageManagerInquireLargest(FreePageManager *fpm);
+extern char *FreePageManagerDump(FreePageManager *fpm);
+
+#endif   /* FREEPAGE_H */
diff --git a/src/include/utils/relptr.h b/src/include/utils/relptr.h
new file mode 100644
index 0000000..46281cf
--- /dev/null
+++ b/src/include/utils/relptr.h
@@ -0,0 +1,43 @@
+/*-------------------------------------------------------------------------
+ *
+ * relptr.h
+ *	  This file contains basic declarations for relative pointers.
+ *
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/utils/relptr.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef RELPTR_H
+#define RELPTR_H
+
+/*
+ * Relative pointers are intended to be used when storing an address that may
+ * be relative either to the base of the process's address space or to some
+ * dynamic shared memory segment mapped therein.
+ *
+ * The idea here is that you declare a relative pointer as relptr(type)
+ * and then use relptr_access to dereference it and relptr_store to change
+ * it.  The use of a union here is a hack, because what's stored in the
+ * relptr is always a Size, never an actual pointer.  But including a pointer
+ * in the union allows us to use stupid macro tricks to provide some measure
+ * of type-safety.
+ */
+#define relptr(type)	union { type *relptr_type; Size relptr_off; }
+#define relptr_access(base, rp) \
+	(AssertVariableIsOfTypeMacro(base, char *), \
+	 (__typeof__((rp).relptr_type)) ((rp).relptr_off == 0 ? NULL : \
+		(base + (rp).relptr_off)))
+#define relptr_is_null(rp) \
+	((rp).relptr_off == 0)
+#define relptr_store(base, rp, val) \
+	(AssertVariableIsOfTypeMacro(base, char *), \
+	 AssertVariableIsOfTypeMacro(val, __typeof__((rp).relptr_type)), \
+	 (rp).relptr_off = ((val) == NULL ? 0 : ((char *) (val)) - (base)))
+#define relptr_copy(rp1, rp2) \
+	((rp1).relptr_off = (rp2).relptr_off)
+
+#endif   /* RELPTR_H */
diff --git a/src/include/utils/sb_alloc.h b/src/include/utils/sb_alloc.h
new file mode 100644
index 0000000..50ef5c3
--- /dev/null
+++ b/src/include/utils/sb_alloc.h
@@ -0,0 +1,79 @@
+/*-------------------------------------------------------------------------
+ *
+ * sb_alloc.h
+ *	  Superblock-based memory allocator.
+ *
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/utils/sb_alloc.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef SB_ALLOC_H
+#define SB_ALLOC_H
+
+#include "storage/lwlock.h"
+#include "utils/relptr.h"
+
+typedef struct sb_span sb_span;
+
+/*
+ * Superblocks are binned by how full they are.  Generally, each fullness
+ * class corresponds to one quartile, but the superblock being used for
+ * allocations is always at the head of the list for fullness class 1,
+ * regardless of how full it really is.
+ *
+ * For large objects, we just stick all of the allocations in fullness class 0.
+ * Since we can just return the space directly to the free page manager, we
+ * don't really need them on a list at all, except that if someone wants to
+ * bulk release everything allocated using this sb_allocator, we have no
+ * other way of finding them.
+ */
+#define SB_FULLNESS_CLASSES		4
+
+/*
+ * An sb_heap represents a set of allocations of a given size class.
+ * There can be multiple heaps for the same size class for contention
+ * avoidance.
+ */
+typedef struct sb_heap
+{
+	relptr(LWLock) lock;
+	relptr(sb_span) spans[SB_FULLNESS_CLASSES];
+} sb_heap;
+
+/*
+ * An sb_allocator is basically just a group of heaps, one per size class.
+ * If locking is required, then we've also got an array of LWLocks, one per
+ * heap.
+ */
+typedef struct sb_allocator
+{
+	bool		private;
+	relptr(LWLock) locks;
+	sb_heap		heaps[FLEXIBLE_ARRAY_MEMBER];
+} sb_allocator;
+
+/* Pages per superblock (in units of FPM_PAGE_SIZE). */
+#define SB_PAGES_PER_SUPERBLOCK		16
+
+/* Allocation options. */
+#define SB_ALLOC_HUGE			0x0001		/* allow >=1GB */
+#define SB_ALLOC_SOFT_FAIL		0x0002		/* return NULL if no mem */
+
+/* Functions to manipulate allocators. */
+extern sb_allocator *sb_create_private_allocator(void);
+extern void sb_reset_allocator(sb_allocator *a);
+extern void sb_destroy_private_allocator(sb_allocator *);
+
+/* Functions to allocate and free memory. */
+extern void *sb_alloc(sb_allocator *, Size, int flags);
+extern void sb_free(void *ptr);
+
+/* Reporting functions. */
+extern Size sb_alloc_space(Size size);
+extern Size sb_chunk_space(void *ptr);
+
+#endif   /* SB_ALLOC_H */
diff --git a/src/include/utils/sb_map.h b/src/include/utils/sb_map.h
new file mode 100644
index 0000000..519bf52
--- /dev/null
+++ b/src/include/utils/sb_map.h
@@ -0,0 +1,24 @@
+/*-------------------------------------------------------------------------
+ *
+ * sb_map.h
+ *	  Superblock allocator page-mapping infrastructure.
+ *
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/utils/sb_map.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef SB_MAP_H
+#define SB_MAP_H
+
+typedef struct sb_map sb_map;
+
+extern Size sb_map_size(char *base, Size npages);
+extern void sb_map_initialize(sb_map *, char *base, Size offset, Size npages);
+extern void sb_map_set(sb_map *, Size pageno, void *ptr);
+extern void *sb_map_get(sb_map *, Size pageno);
+
+#endif   /* SB_MAP_H */
diff --git a/src/include/utils/sb_region.h b/src/include/utils/sb_region.h
new file mode 100644
index 0000000..5bb01f3
--- /dev/null
+++ b/src/include/utils/sb_region.h
@@ -0,0 +1,68 @@
+/*-------------------------------------------------------------------------
+ *
+ * sb_region.h
+ *	  Superblock allocator memory region manager.
+ *
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/utils/sb_region.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef SB_REGION_H
+#define SB_REGION_H
+
+#include "lib/ilist.h"
+#include "storage/dsm.h"
+#include "storage/shm_toc.h"
+#include "utils/freepage.h"
+#include "utils/sb_alloc.h"
+#include "utils/sb_map.h"
+
+/*
+ * An sb_region is a backend-private object used to track allocatable regions
+ * of memory, either backend-private or shared.
+ */
+typedef struct sb_region
+{
+	char	   *region_start;	/* Address of region. */
+	Size		region_size;	/* Number of bytes in region. */
+	Size		usable_pages;	/* Number of usable pages in region. */
+	dsm_segment *seg;			/* If not backend-private, DSM handle. */
+	sb_allocator *allocator;	/* If not backend-private, shared allocator. */
+	FreePageManager *fpm;		/* Free page manager for region (if any). */
+	sb_map	   *pagemap;		/* Page map for region (if any). */
+	Size		contiguous_pages;	/* Last reported contiguous free pages. */
+	dlist_node	fl_node;		/* Freelist links. */
+} sb_region;
+
+/*
+ * An sb_shared_region is a shared-memory object containing the information
+ * necessary to set up an sb_region object for an individual backend.
+ */
+typedef struct sb_shared_region
+{
+	relptr(FreePageManager) fpm;
+	relptr(sb_map) pagemap;
+	relptr(sb_allocator) allocator;
+	int			lwlock_tranche_id;
+	char		lwlock_tranche_name[FLEXIBLE_ARRAY_MEMBER];
+} sb_shared_region;
+
+/* Public API. */
+extern sb_shared_region *sb_create_shared_region(dsm_segment *seg,
+						shm_toc *toc, Size size,
+						int lwlock_tranche_id,
+						char *lwlock_tranche_name);
+extern sb_allocator *sb_attach_shared_region(dsm_segment *,
+						 sb_shared_region *);
+extern void sb_dump_regions(void);
+
+/* For internal use by cooperating modules. */
+extern sb_region *sb_lookup_region(void *);
+extern sb_region *sb_private_region_for_allocator(Size npages);
+extern void sb_report_contiguous_freespace(sb_region *, Size npages);
+
+#endif   /* SB_REGION_H */
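
A note on relptr.h above: the macros are easiest to read next to a tiny usage
sketch.  The following is illustrative only and is not part of the patch;
demo_node, demo_link, and demo_next are hypothetical names, and it assumes
"base" is the address at which the containing segment is mapped in the
current process.

/* Illustrative sketch only -- not part of the patch. */
#include "postgres.h"
#include "utils/relptr.h"

/* A hypothetical node type linked through relative pointers. */
typedef struct demo_node
{
	int		value;
	relptr(struct demo_node) next;	/* stored as an offset; 0 means NULL */
} demo_node;

/*
 * Link "child" after "parent"; both are assumed to live in the segment
 * mapped at "base" in this process, so only the offset is stored.
 */
static void
demo_link(char *base, demo_node *parent, demo_node *child)
{
	relptr_store(base, parent->next, child);
}

/* Follow the link, converting the stored offset back into a pointer. */
static demo_node *
demo_next(char *base, demo_node *parent)
{
	return relptr_access(base, parent->next);
}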
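
Similarly, here is a hedged sketch of the sb_alloc API lifecycle as the
tuplesort.c hunk above uses it: a backend-private allocator is created once,
objects are carved out of it with sb_alloc() and released with sb_free(),
and sb_alloc_space()/sb_chunk_space() supply the numbers used for memory
accounting.  The names and flags come from sb_alloc.h above; the assumption
that sb_alloc() without SB_ALLOC_SOFT_FAIL errors out rather than returning
NULL is mine, not something the patch states.

/* Illustrative sketch only -- not part of the patch. */
#include "postgres.h"
#include "utils/sb_alloc.h"

static void
sb_alloc_demo(void)
{
	sb_allocator *a = sb_create_private_allocator();
	char	   *small;
	char	   *big;

	/* Ordinary allocation; accounting mirrors tuplesort's USEMEM calls. */
	small = sb_alloc(a, 64, 0);
	elog(DEBUG1, "64-byte request occupies %lu bytes",
		 (unsigned long) sb_alloc_space(64));
	elog(DEBUG1, "chunk reports %lu bytes",
		 (unsigned long) sb_chunk_space(small));

	/* Soft-fail allocation: assumed to return NULL instead of erroring. */
	big = sb_alloc(a, 16 * 1024 * 1024, SB_ALLOC_SOFT_FAIL);
	if (big != NULL)
		sb_free(big);

	sb_free(small);
	sb_destroy_private_allocator(a);
}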