Split _bt_insertonpg to two functions - Mailing list pgsql-patches
From:       Heikki Linnakangas
Subject:    Split _bt_insertonpg to two functions
Msg-id:     45E2B2C0.8020103@enterprisedb.com
Responses:  Re: Split _bt_insertonpg to two functions
            Re: Split _bt_insertonpg to two functions
List:       pgsql-patches

Here's a patch that moves the logic for finding a page with enough room
for the new tuple out of _bt_insertonpg and into a new function,
_bt_findinsertloc. This makes the code more readable, and simplifies the
forthcoming Grouped Index Tuples patch.

Also, the insert location within the page used to be calculated twice
for unique indexes: once in _bt_check_unique and a second time in
_bt_insertonpg. That's a waste of cycles, and this patch fixes it. I
couldn't measure a difference with pgbench, but this micro-benchmark
shows it:

> psql postgres -c "CREATE TABLE inserttest (i int PRIMARY KEY);"
> psql postgres -c "TRUNCATE inserttest; checkpoint;"; sync
> time ~/pgsql.cvshead/bin/psql postgres -c "TRUNCATE inserttest; INSERT INTO inserttest SELECT a FROM generate_series(1,1000000) a;"

Without patch: real 0m7.260s
With patch:    real 0m6.963s

Run on my laptop with fsync=off, full_page_writes=off and
checkpoint_segments = 10, to remove any other variables. It's not a
huge difference, but it's worth having, and performance wasn't the main
motivation of the patch anyway.

-- 
Heikki Linnakangas
EnterpriseDB   http://www.enterprisedb.com

Index: src/backend/access/nbtree/nbtinsert.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/access/nbtree/nbtinsert.c,v
retrieving revision 1.152
diff -c -r1.152 nbtinsert.c
*** src/backend/access/nbtree/nbtinsert.c    21 Feb 2007 20:02:17 -0000    1.152
--- src/backend/access/nbtree/nbtinsert.c    26 Feb 2007 09:37:16 -0000
***************
*** 46,58 ****
  static Buffer _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf);
  
  static TransactionId _bt_check_unique(Relation rel, IndexTuple itup,
!                  Relation heapRel, Buffer buf,
                   ScanKey itup_scankey);
  static void _bt_insertonpg(Relation rel, Buffer buf,
                 BTStack stack,
-                int keysz, ScanKey scankey,
                 IndexTuple itup,
!                OffsetNumber afteritem,
                 bool split_only_page);
  static Buffer _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
              OffsetNumber newitemoff, Size newitemsz,
--- 46,63 ----
  static Buffer _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf);
  
  static TransactionId _bt_check_unique(Relation rel, IndexTuple itup,
!                  Relation heapRel, Buffer buf, OffsetNumber ioffset,
                   ScanKey itup_scankey);
+ static void _bt_findinsertloc(Relation rel,
+                   Buffer *bufptr,
+                   OffsetNumber *offsetptr,
+                   int keysz,
+                   ScanKey scankey,
+                   IndexTuple newtup);
  static void _bt_insertonpg(Relation rel, Buffer buf,
                 BTStack stack,
                 IndexTuple itup,
!                OffsetNumber newitemoff,
                 bool split_only_page);
  static Buffer _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
              OffsetNumber newitemoff, Size newitemsz,
***************
*** 86,91 ****
--- 91,97 ----
      ScanKey        itup_scankey;
      BTStack        stack;
      Buffer        buf;
+     OffsetNumber offset;
  
      /* we need an insertion scan key to do our search, so build one */
      itup_scankey = _bt_mkscankey(rel, itup);
***************
*** 94,99 ****
--- 100,107 ----
      /* find the first page containing this key */
      stack = _bt_search(rel, natts, itup_scankey, false, &buf, BT_WRITE);
  
+     offset = InvalidOffsetNumber;
+ 
      /* trade in our read lock for a write lock */
      LockBuffer(buf, BUFFER_LOCK_UNLOCK);
      LockBuffer(buf, BT_WRITE);
***************
*** 128,134 ****
      {
          TransactionId xwait;
  
!         xwait = _bt_check_unique(rel, itup, heapRel, buf, itup_scankey);
  
          if (TransactionIdIsValid(xwait))
          {
--- 136,143 ----
      {
          TransactionId xwait;
  
!         offset = _bt_binsrch(rel, buf, natts, itup_scankey, false);
!         xwait = _bt_check_unique(rel, itup, heapRel, buf, offset, itup_scankey);
  
          if (TransactionIdIsValid(xwait))
          {
***************
*** 142,148 ****
      }
  
      /* do the insertion */
!     _bt_insertonpg(rel, buf, stack, natts, itup_scankey, itup, 0, false);
  
      /* be tidy */
      _bt_freestack(stack);
--- 151,158 ----
      }
  
      /* do the insertion */
!     _bt_findinsertloc(rel, &buf, &offset, natts, itup_scankey, itup);
!     _bt_insertonpg(rel, buf, stack, itup, offset, false);
  
      /* be tidy */
      _bt_freestack(stack);
***************
*** 152,169 ****
  /*
   *  _bt_check_unique() -- Check for violation of unique index constraint
   *
   * Returns InvalidTransactionId if there is no conflict, else an xact ID
   * we must wait for to see if it commits a conflicting tuple.  If an actual
   * conflict is detected, no return --- just ereport().
   */
  static TransactionId
  _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
!                  Buffer buf, ScanKey itup_scankey)
  {
      TupleDesc    itupdesc = RelationGetDescr(rel);
      int            natts = rel->rd_rel->relnatts;
!     OffsetNumber offset,
!                 maxoff;
      Page        page;
      BTPageOpaque opaque;
      Buffer        nbuf = InvalidBuffer;
--- 162,182 ----
  /*
   *  _bt_check_unique() -- Check for violation of unique index constraint
   *
+  * offset points to the first possible item that could conflict. It can
+  * also point to end-of-page, which means that the first tuple to check
+  * is the first tuple on the next page.
+  *
   * Returns InvalidTransactionId if there is no conflict, else an xact ID
   * we must wait for to see if it commits a conflicting tuple.  If an actual
   * conflict is detected, no return --- just ereport().
   */
  static TransactionId
  _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
!                  Buffer buf, OffsetNumber offset, ScanKey itup_scankey)
  {
      TupleDesc    itupdesc = RelationGetDescr(rel);
      int            natts = rel->rd_rel->relnatts;
!     OffsetNumber maxoff;
      Page        page;
      BTPageOpaque opaque;
      Buffer        nbuf = InvalidBuffer;
***************
*** 173,184 ****
      maxoff = PageGetMaxOffsetNumber(page);
  
      /*
-      * Find first item >= proposed new item.  Note we could also get a pointer
-      * to end-of-page here.
-      */
-     offset = _bt_binsrch(rel, buf, natts, itup_scankey, false);
- 
-     /*
       * Scan over all equal tuples, looking for live conflicts.
       */
      for (;;)
--- 186,191 ----
***************
*** 342,374 ****
      return InvalidTransactionId;
  }
  
! /*----------
!  *  _bt_insertonpg() -- Insert a tuple on a particular page in the index.
!  *
!  *      This recursive procedure does the following things:
!  *
!  *          +  finds the right place to insert the tuple.
!  *          +  if necessary, splits the target page (making sure that the
!  *             split is equitable as far as post-insert free space goes).
!  *          +  inserts the tuple.
!  *          +  if the page was split, pops the parent stack, and finds the
!  *             right place to insert the new child pointer (by walking
!  *             right using information stored in the parent stack).
!  *          +  invokes itself with the appropriate tuple for the right
!  *             child page on the parent.
!  *          +  updates the metapage if a true root or fast root is split.
!  *
!  *      On entry, we must have the right buffer in which to do the
!  *      insertion, and the buffer must be pinned and write-locked.  On return,
!  *      we will have dropped both the pin and the lock on the buffer.
!  *
!  *      If 'afteritem' is >0 then the new tuple must be inserted after the
!  *      existing item of that number, noplace else.  If 'afteritem' is 0
!  *      then the procedure finds the exact spot to insert it by searching.
!  *      (keysz and scankey parameters are used ONLY if afteritem == 0.
!  *      The scankey must be an insertion-type scankey.)
   *
!  *      NOTE: if the new key is equal to one or more existing keys, we can
   *      legitimately place it anywhere in the series of equal keys --- in fact,
   *      if the new key is equal to the page's "high key" we can place it on
   *      the next page.  If it is equal to the high key, and there's not room
--- 349,359 ----
      return InvalidTransactionId;
  }
  
! 
! /*
!  *  _bt_findinsertloc() -- Finds an insert location for a tuple
   *
!  *      If the new key is equal to one or more existing keys, we can
   *      legitimately place it anywhere in the series of equal keys --- in fact,
   *      if the new key is equal to the page's "high key" we can place it on
   *      the next page.  If it is equal to the high key, and there's not room
***************
*** 379,414 ****
   *      Once we have chosen the page to put the key on, we'll insert it before
   *      any existing equal keys because of the way _bt_binsrch() works.
   *
!  *      The locking interactions in this code are critical.  You should
!  *      grok Lehman and Yao's paper before making any changes.  In addition,
!  *      you need to understand how we disambiguate duplicate keys in this
!  *      implementation, in order to be able to find our location using
!  *      L&Y "move right" operations.  Since we may insert duplicate user
!  *      keys, and since these dups may propagate up the tree, we use the
!  *      'afteritem' parameter to position ourselves correctly for the
!  *      insertion on internal pages.
!  *----------
   */
  static void
! _bt_insertonpg(Relation rel,
!                Buffer buf,
!                BTStack stack,
!                int keysz,
!                ScanKey scankey,
!                IndexTuple itup,
!                OffsetNumber afteritem,
!                bool split_only_page)
  {
!     Page        page;
      BTPageOpaque lpageop;
      OffsetNumber newitemoff;
!     OffsetNumber firstright = InvalidOffsetNumber;
!     Size        itemsz;
  
-     page = BufferGetPage(buf);
      lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
  
!     itemsz = IndexTupleDSize(*itup);
      itemsz = MAXALIGN(itemsz);    /* be safe, PageAddItem will do this but we
                                   * need to be consistent */
--- 364,403 ----
   *      Once we have chosen the page to put the key on, we'll insert it before
   *      any existing equal keys because of the way _bt_binsrch() works.
   *
!  *      If there's not enough room in the space, we try to make room by
!  *      removing any LP_DELETEd tuples.
!  *
!  *      On entry, *buf and *offsetptr point to the first legal position
!  *      where the new tuple could be inserted. The caller should hold an
!  *      exclusive lock on *buf. *offsetptr can also be set to
!  *      InvalidOffsetNumber, in which case the function will search the right
!  *      location within the page if needed. On exit, they point to the chosen
!  *      insert location. If findinsertloc decided to move right, the lock and
!  *      pin on the original page will be released and the new page returned to
!  *      the caller is exclusively locked instead.
!  *
!  *      newtup is the new tuple we're inserting, and scankey is an insertion
!  *      type scan key for it.
   */
  static void
! _bt_findinsertloc(Relation rel,
!                   Buffer *bufptr,
!                   OffsetNumber *offsetptr,
!                   int keysz,
!                   ScanKey scankey,
!                   IndexTuple newtup)
  {
!     Buffer        buf = *bufptr;
!     Page        page = BufferGetPage(buf);
!     Size        itemsz;
      BTPageOpaque lpageop;
+     bool        movedright, vacuumed;
      OffsetNumber newitemoff;
!     OffsetNumber firstlegaloff = *offsetptr;
  
      lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
  
!     itemsz = IndexTupleDSize(*newtup);
      itemsz = MAXALIGN(itemsz);    /* be safe, PageAddItem will do this but we
                                   * need to be consistent */
***************
*** 429,525 ****
                             "Consider a function index of an MD5 hash of the value, "
                             "or use full text indexing.")));
  
-     /*
-      * Determine exactly where new item will go.
-      */
-     if (afteritem > 0)
-         newitemoff = afteritem + 1;
-     else
-     {
-         /*----------
-          * If we will need to split the page to put the item here,
-          * check whether we can put the tuple somewhere to the right,
-          * instead.  Keep scanning right until we
-          *      (a) find a page with enough free space,
-          *      (b) reach the last page where the tuple can legally go, or
-          *      (c) get tired of searching.
-          * (c) is not flippant; it is important because if there are many
-          * pages' worth of equal keys, it's better to split one of the early
-          * pages than to scan all the way to the end of the run of equal keys
-          * on every insert.  We implement "get tired" as a random choice,
-          * since stopping after scanning a fixed number of pages wouldn't work
-          * well (we'd never reach the right-hand side of previously split
-          * pages).  Currently the probability of moving right is set at 0.99,
-          * which may seem too high to change the behavior much, but it does an
-          * excellent job of preventing O(N^2) behavior with many equal keys.
-          *----------
-          */
-         bool        movedright = false;
- 
-         while (PageGetFreeSpace(page) < itemsz)
-         {
-             Buffer        rbuf;
- 
-             /*
-              * before considering moving right, see if we can obtain enough
-              * space by erasing LP_DELETE items
-              */
-             if (P_ISLEAF(lpageop) && P_HAS_GARBAGE(lpageop))
-             {
-                 _bt_vacuum_one_page(rel, buf);
-                 if (PageGetFreeSpace(page) >= itemsz)
-                     break;      /* OK, now we have enough space */
-             }
  
!             /*
!              * nope, so check conditions (b) and (c) enumerated above
!              */
!             if (P_RIGHTMOST(lpageop) ||
!                 _bt_compare(rel, keysz, scankey, page, P_HIKEY) != 0 ||
!                 random() <= (MAX_RANDOM_VALUE / 100))
!                 break;
! 
!             /*
!              * step right to next non-dead page
!              *
!              * must write-lock that page before releasing write lock on
!              * current page; else someone else's _bt_check_unique scan could
!              * fail to see our insertion.  write locks on intermediate dead
!              * pages won't do because we don't know when they will get
!              * de-linked from the tree.
!              */
!             rbuf = InvalidBuffer;
! 
!             for (;;)
!             {
!                 BlockNumber rblkno = lpageop->btpo_next;
! 
!                 rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE);
!                 page = BufferGetPage(rbuf);
!                 lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
!                 if (!P_IGNORE(lpageop))
!                     break;
!                 if (P_RIGHTMOST(lpageop))
!                     elog(ERROR, "fell off the end of \"%s\"",
!                          RelationGetRelationName(rel));
!             }
!             _bt_relbuf(rel, buf);
!             buf = rbuf;
!             movedright = true;
          }
  
          /*
!          * Now we are on the right page, so find the insert position. If we
!          * moved right at all, we know we should insert at the start of the
!          * page, else must find the position by searching.
          */
!         if (movedright)
!             newitemoff = P_FIRSTDATAKEY(lpageop);
!         else
!             newitemoff = _bt_binsrch(rel, buf, keysz, scankey, false);
      }
  
      /*
       * Do we need to split the page to fit the item on it?
       *
       * Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result,
--- 418,573 ----
                             "Consider a function index of an MD5 hash of the value, "
                             "or use full text indexing.")));
  
!     /*----------
!      * If we will need to split the page to put the item on this page,
!      * check whether we can put the tuple somewhere to the right,
!      * instead.  Keep scanning right until we
!      *      (a) find a page with enough free space,
!      *      (b) reach the last page where the tuple can legally go, or
!      *      (c) get tired of searching.
!      * (c) is not flippant; it is important because if there are many
!      * pages' worth of equal keys, it's better to split one of the early
!      * pages than to scan all the way to the end of the run of equal keys
!      * on every insert.  We implement "get tired" as a random choice,
!      * since stopping after scanning a fixed number of pages wouldn't work
!      * well (we'd never reach the right-hand side of previously split
!      * pages).  Currently the probability of moving right is set at 0.99,
!      * which may seem too high to change the behavior much, but it does an
!      * excellent job of preventing O(N^2) behavior with many equal keys.
!      *----------
!      */
!     movedright = false;
!     vacuumed = false;
!     while (PageGetFreeSpace(page) < itemsz)
!     {
!         Buffer        rbuf;
! 
!         /*
!          * before considering moving right, see if we can obtain enough
!          * space by erasing LP_DELETE items
!          */
!         if (P_ISLEAF(lpageop) && P_HAS_GARBAGE(lpageop))
!         {
!             _bt_vacuum_one_page(rel, buf);
! 
!             /* remember that we vacuumed this page, because that makes
!              * the hint supplied by the caller invalid */
!             vacuumed = true;
! 
!             if (PageGetFreeSpace(page) >= itemsz)
!                 break;      /* OK, now we have enough space */
          }
  
          /*
!          * nope, so check conditions (b) and (c) enumerated above
          */
!         if (P_RIGHTMOST(lpageop) ||
!             _bt_compare(rel, keysz, scankey, page, P_HIKEY) != 0 ||
!             random() <= (MAX_RANDOM_VALUE / 100))
!             break;
! 
!         /*
!          * step right to next non-dead page
!          *
!          * must write-lock that page before releasing write lock on
!          * current page; else someone else's _bt_check_unique scan could
!          * fail to see our insertion.  write locks on intermediate dead
!          * pages won't do because we don't know when they will get
!          * de-linked from the tree.
!          */
!         rbuf = InvalidBuffer;
! 
!         for (;;)
!         {
!             BlockNumber rblkno = lpageop->btpo_next;
! 
!             rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE);
!             page = BufferGetPage(rbuf);
!             lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
!             if (!P_IGNORE(lpageop))
!                 break;
!             if (P_RIGHTMOST(lpageop))
!                 elog(ERROR, "fell off the end of \"%s\"",
!                      RelationGetRelationName(rel));
!         }
!         _bt_relbuf(rel, buf);
!         buf = rbuf;
!         movedright = true;
!         vacuumed = false;
      }
  
      /*
+      * Now we are on the right page, so find the insert position. If we
+      * moved right at all, we know we should insert at the start of the
+      * page. If we didn't move right, we can use the firstlegaloff hint
+      * if the caller supplied one, unless we vacuumed the page which
+      * might have moved tuples around making the hint invalid. If we
+      * didn't move right or can't use the hint, find the position
+      * by searching.
+      */
+     if (movedright)
+         newitemoff = P_FIRSTDATAKEY(lpageop);
+     else if(firstlegaloff != InvalidOffsetNumber && !vacuumed)
+         newitemoff = firstlegaloff;
+     else
+         newitemoff = _bt_binsrch(rel, buf, keysz, scankey, false);
+ 
+     *bufptr = buf;
+     *offsetptr = newitemoff;
+ }
+ 
+ /*----------
+  *  _bt_insertonpg() -- Insert a tuple on a particular page in the index.
+  *
+  *      This recursive procedure does the following things:
+  *
+  *          +  if necessary, splits the target page (making sure that the
+  *             split is equitable as far as post-insert free space goes).
+  *          +  inserts the tuple.
+  *          +  if the page was split, pops the parent stack, and finds the
+  *             right place to insert the new child pointer (by walking
+  *             right using information stored in the parent stack).
+  *          +  invokes itself with the appropriate tuple for the right
+  *             child page on the parent.
+  *          +  updates the metapage if a true root or fast root is split.
+  *
+  *      On entry, we must have the right buffer in which to do the
+  *      insertion, and the buffer must be pinned and write-locked.  On return,
+  *      we will have dropped both the pin and the lock on the buffer.
+  *
+  *      The locking interactions in this code are critical.  You should
+  *      grok Lehman and Yao's paper before making any changes.  In addition,
+  *      you need to understand how we disambiguate duplicate keys in this
+  *      implementation, in order to be able to find our location using
+  *      L&Y "move right" operations.  Since we may insert duplicate user
+  *      keys, and since these dups may propagate up the tree, we use the
+  *      'afteritem' parameter to position ourselves correctly for the
+  *      insertion on internal pages.
+  *----------
+  */
+ static void
+ _bt_insertonpg(Relation rel,
+                Buffer buf,
+                BTStack stack,
+                IndexTuple itup,
+                OffsetNumber newitemoff,
+                bool split_only_page)
+ {
+     Page        page;
+     BTPageOpaque lpageop;
+     OffsetNumber firstright = InvalidOffsetNumber;
+     Size        itemsz;
+ 
+     page = BufferGetPage(buf);
+     lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
+ 
+     itemsz = IndexTupleDSize(*itup);
+     itemsz = MAXALIGN(itemsz);    /* be safe, PageAddItem will do this but we
+                                  * need to be consistent */
+ 
+     /*
       * Do we need to split the page to fit the item on it?
       *
       * Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result,
***************
*** 1427,1433 ****
  
      /* Recursively update the parent */
      _bt_insertonpg(rel, pbuf, stack->bts_parent,
!                    0, NULL, new_item, stack->bts_offset,
                     is_only);
  
      /* be tidy */
--- 1475,1481 ----
  
      /* Recursively update the parent */
      _bt_insertonpg(rel, pbuf, stack->bts_parent,
!                    new_item, stack->bts_offset + 1,
                     is_only);
  
      /* be tidy */
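
[Editor's note: to make the control-flow change easier to see, here is a
condensed sketch of what _bt_doinsert does after the patch. This is an
illustration only, not code from the patch: locking, the xwait retry loop
and error handling are elided, and the index_is_unique flag is assumed to
be _bt_doinsert's existing argument of that name in the 8.2-era sources.]

    /* Condensed sketch of the patched _bt_doinsert flow (illustration
     * only; see the patch above for the real code). */
    offset = InvalidOffsetNumber;

    if (index_is_unique)
    {
        /* the binary search now happens once, and its result is kept... */
        offset = _bt_binsrch(rel, buf, natts, itup_scankey, false);
        xwait = _bt_check_unique(rel, itup, heapRel, buf, offset,
                                 itup_scankey);
    }

    /* ...and passed on as a hint, so _bt_findinsertloc can skip the
     * second search unless it moved right or vacuumed the page */
    _bt_findinsertloc(rel, &buf, &offset, natts, itup_scankey, itup);
    _bt_insertonpg(rel, buf, stack, itup, offset, false);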