diff --git a/src/backend/access/hash/hashinsert.c b/src/backend/access/hash/hashinsert.c index bd39333..4291fde 100644 --- a/src/backend/access/hash/hashinsert.c +++ b/src/backend/access/hash/hashinsert.c @@ -92,9 +92,9 @@ _hash_doinsert(Relation rel, IndexTuple itup) _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK); /* - * If the previous iteration of this loop locked what is still the - * correct target bucket, we are done. Otherwise, drop any old lock - * and lock what now appears to be the correct bucket. + * If the previous iteration of this loop locked the primary page of + * what is still the correct target bucket, we are done. Otherwise, + * drop any old lock before acquiring the new one. */ if (retry) { @@ -103,7 +103,7 @@ _hash_doinsert(Relation rel, IndexTuple itup) _hash_relbuf(rel, buf); } - /* Fetch the primary bucket page for the bucket */ + /* Fetch and lock the primary bucket page for the target bucket */ buf = _hash_getbuf(rel, blkno, HASH_WRITE, LH_BUCKET_PAGE); /* @@ -132,15 +132,13 @@ _hash_doinsert(Relation rel, IndexTuple itup) Assert(pageopaque->hasho_bucket == bucket); /* - * If there is any pending split, try to finish it before proceeding for - * the insertion. We try to finish the split for the insertion in old - * bucket, as that will allow us to remove the tuples from old bucket and - * reuse the space. There is no such apparent benefit from finishing the - * split during insertion in new bucket. - * - * In future, if we want to finish the splits during insertion in new - * bucket, we must ensure the locking order such that old bucket is locked - * before new bucket. + * If this bucket is in the process of being split, try to finish the + * split before inserting, because that might create room for the + * insertion to proceed without allocating an additional overflow page. + * It's only interesting to finish the split if we're trying to insert + * into the bucket from which we're removing tuples (the "old" bucket), + * not if we're trying to insert into the bucket into which tuples are + * being moved (the "new" bucket). */ if (H_OLD_INCOMPLETE_SPLIT(pageopaque) && IsBufferCleanupOK(buf)) { @@ -176,8 +174,10 @@ _hash_doinsert(Relation rel, IndexTuple itup) { /* * ovfl page exists; go get it. if it doesn't have room, we'll - * find out next pass through the loop test above. Retain the - * pin, if it is a primary bucket page. + * find out next pass through the loop test above. we always + * release both the lock and pin if this is an overflow page, but + * only the lock if this is the primary bucket page, since the pin + * on the primary bucket must be retained throughout the scan. */ if (pageopaque->hasho_flag & LH_BUCKET_PAGE) _hash_chgbufaccess(rel, buf, HASH_READ, HASH_NOLOCK); @@ -217,8 +217,9 @@ _hash_doinsert(Relation rel, IndexTuple itup) (void) _hash_pgaddtup(rel, buf, itemsz, itup); /* - * write and release the modified page and ensure to release the pin on - * primary page. + * write and release the modified page. if the page we modified was an + * overflow page, we also need to separately drop the pin we retained on + * the primary bucket page. */ _hash_wrtbuf(rel, buf); if (buf != bucket_buf) diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c index 36cacc8..106ffca 100644 --- a/src/backend/access/hash/hashpage.c +++ b/src/backend/access/hash/hashpage.c @@ -96,10 +96,10 @@ _hash_getbuf(Relation rel, BlockNumber blkno, int access, int flags) } /* - * _hash_getbuf_with_condlock_cleanup() -- as above, but get the buffer for write. + * _hash_getbuf_with_condlock_cleanup() -- Try to get a buffer for cleanup. * - * We try to take the conditional cleanup lock and if we get it then - * return the buffer, else return InvalidBuffer. + * We read the page and try to acquire a cleanup lock. If we get it, + * we return the buffer; otherwise, we return InvalidBuffer. */ Buffer _hash_getbuf_with_condlock_cleanup(Relation rel, BlockNumber blkno, int flags)