From c3bd06eb05fa600d70223acbd6a319cf53b990f0 Mon Sep 17 00:00:00 2001
From: ashu
Date: Thu, 21 Sep 2017 12:24:04 +0530
Subject: [PATCH] Improve locking strategy during VACUUM in Hash Index for
 regular tables.

Patch by Ashutosh Sharma.
---
 src/backend/access/hash/README     | 33 +++++++++++-----------
 src/backend/access/hash/hash.c     | 58 +++++++++++++++++++++++++-------------
 src/backend/access/hash/hashovfl.c | 13 ++++++---
 3 files changed, 65 insertions(+), 39 deletions(-)

diff --git a/src/backend/access/hash/README b/src/backend/access/hash/README
index 3b1f719..a77e45d 100644
--- a/src/backend/access/hash/README
+++ b/src/backend/access/hash/README
@@ -396,8 +396,8 @@ The fourth operation is garbage collection (bulk deletion):
 		mark the target page dirty
 		write WAL for deleting tuples from target page
 		if this is the last bucket page, break out of loop
-		pin and x-lock next page
 		release prior lock and pin (except keep pin on primary bucket page)
+		pin and x-lock next page
 	if the page we have locked is not the primary bucket page:
 		release lock and take exclusive lock on primary bucket page
 	if there are no other pins on the primary bucket page:
@@ -415,21 +415,22 @@ The fourth operation is garbage collection (bulk deletion):
 Note that this is designed to allow concurrent splits and scans. If a split
 occurs, tuples relocated into the new bucket will be visited twice by the
 scan, but that does no harm. As we release the lock on bucket page during
-cleanup scan of a bucket, it will allow concurrent scan to start on a bucket
-and ensures that scan will always be behind cleanup. It is must to keep scans
-behind cleanup, else vacuum could decrease the TIDs that are required to
-complete the scan. Now, as the scan that returns multiple tuples from the
-same bucket page always expect next valid TID to be greater than or equal to
-the current TID, it might miss the tuples. This holds true for backward scans
-as well (backward scans first traverse each bucket starting from first bucket
-to last overflow page in the chain). We must be careful about the statistics
-reported by the VACUUM operation. What we can do is count the number of
-tuples scanned, and believe this in preference to the stored tuple count if
-the stored tuple count and number of buckets did *not* change at any time
-during the scan. This provides a way of correcting the stored tuple count if
-it gets out of sync for some reason. But if a split or insertion does occur
-concurrently, the scan count is untrustworthy; instead, subtract the number of
-tuples deleted from the stored tuple count and use that.
+cleanup scan of a bucket, it will allow a concurrent scan to start on that
+bucket.  Scans on a regular table may get ahead of vacuum, and vacuum may then
+remove items from the page currently being scanned, but that does no harm: we
+always copy all the matching items from a page at once into a backend-local
+array, and we check the page's LSN before marking any tuple on the page as
+dead.  This does not hold for unlogged or temporary tables, for which the LSN
+is not meaningful; there, a scan that overtakes vacuum might mark a valid
+tuple as dead.  Hence, for unlogged and temporary tables we always ensure that
+the scan stays behind VACUUM.  We must be careful about the statistics
+reported by the VACUUM operation.  What we can do is count the number of
+tuples scanned, and believe this in preference to the stored tuple count if
+the stored tuple count and number of buckets did *not* change at any time
+during the scan.  This provides a way of correcting the stored tuple count if
+it gets out of sync for some reason.  But if a split or insertion does occur
+concurrently, the scan count is untrustworthy; instead, subtract the number of
+tuples deleted from the stored tuple count and use that.
 
 
 Free Space Management
diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c
index 8550218..76474f3 100644
--- a/src/backend/access/hash/hash.c
+++ b/src/backend/access/hash/hash.c
@@ -655,16 +655,18 @@ hashvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
  * primary bucket page. The lock won't necessarily be held continuously,
  * though, because we'll release it when visiting overflow pages.
  *
- * It would be very bad if this function cleaned a page while some other
- * backend was in the midst of scanning it, because hashgettuple assumes
- * that the next valid TID will be greater than or equal to the current
- * valid TID. There can't be any concurrent scans in progress when we first
- * enter this function because of the cleanup lock we hold on the primary
- * bucket page, but as soon as we release that lock, there might be. We
- * handle that by conspiring to prevent those scans from passing our cleanup
- * scan. To do that, we lock the next page in the bucket chain before
- * releasing the lock on the previous page. (This type of lock chaining is
- * not ideal, so we might want to look for a better solution at some point.)
+ * It is possible for this function to clean a page while some other backend
+ * is in the midst of scanning it, but that does not harm the concurrent
+ * scan: scans work in page-at-a-time mode, so the hash page being scanned
+ * is not locked and unlocked at the tuple level, and hashgettuple need not
+ * find the TID of the next valid tuple in the index page on the assumption
+ * that a concurrent insert might have added a new tuple to the page.  We
+ * do, however, perform such validation in _hash_kill_items to ensure that
+ * we mark the correct index tuples as dead.  There can't be any concurrent
+ * scans in progress when we first enter this function, because of the
+ * cleanup lock we hold on the primary bucket page; as soon as we release
+ * that lock, there might be, but since hash index scans work a page at a
+ * time, we need not worry about them.
  *
  * We need to retain a pin on the primary bucket to ensure that no concurrent
  * split can start.
@@ -833,18 +835,36 @@ hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf,
 		if (!BlockNumberIsValid(blkno))
 			break;
 
-		next_buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE,
-											  LH_OVERFLOW_PAGE,
-											  bstrategy);
-
 		/*
-		 * release the lock on previous page after acquiring the lock on next
-		 * page
+		 * Because hash index scans work in page-at-a-time mode, vacuum can
+		 * release the lock on the previous page before acquiring the lock on
+		 * the next page for regular tables.  For unlogged and temporary
+		 * tables, however, we avoid this, as we must not let a scan cross
+		 * vacuum while both are running on the same bucket page; that keeps
+		 * the dead-marking of index tuples in _hash_kill_items() safe.
 		 */
-		if (retain_pin)
-			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+		if (RelationNeedsWAL(rel))
+		{
+			if (retain_pin)
+				LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+			else
+				_hash_relbuf(rel, buf);
+
+			next_buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE,
+												  LH_OVERFLOW_PAGE,
+												  bstrategy);
+		}
 		else
-			_hash_relbuf(rel, buf);
+		{
+			next_buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE,
+												  LH_OVERFLOW_PAGE,
+												  bstrategy);
+
+			if (retain_pin)
+				LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+			else
+				_hash_relbuf(rel, buf);
+		}
 
 		buf = next_buf;
 	}
diff --git a/src/backend/access/hash/hashovfl.c b/src/backend/access/hash/hashovfl.c
index c206e70..b41afbb 100644
--- a/src/backend/access/hash/hashovfl.c
+++ b/src/backend/access/hash/hashovfl.c
@@ -524,7 +524,7 @@ _hash_freeovflpage(Relation rel, Buffer bucketbuf, Buffer ovflbuf,
 	 * Fix up the bucket chain. this is a doubly-linked list, so we must fix
 	 * up the bucket chain members behind and ahead of the overflow page being
 	 * deleted. Concurrency issues are avoided by using lock chaining as
-	 * described atop hashbucketcleanup.
+	 * described atop _hash_squeezebucket.
 	 */
 	if (BlockNumberIsValid(prevblkno))
 	{
@@ -790,9 +790,14 @@ _hash_initbitmapbuffer(Buffer buf, uint16 bmsize, bool initpage)
  * Caller must acquire cleanup lock on the primary page of the target
  * bucket to exclude any scans that are in progress, which could easily
  * be confused into returning the same tuple more than once or some tuples
- * not at all by the rearrangement we are performing here.  To prevent
- * any concurrent scan to cross the squeeze scan we use lock chaining
- * similar to hasbucketcleanup.  Refer comments atop hashbucketcleanup.
+ * not at all by the rearrangement we are performing here.  This means there
+ * can't be any concurrent scans in progress when we first enter this
+ * function, because of the cleanup lock we hold on the primary bucket page;
+ * but as soon as we release that lock, there might be.  To prevent any
+ * concurrent scan from crossing the squeeze scan, we use lock chaining:
+ * we lock the next page in the bucket chain before releasing the lock on
+ * the previous page.  (This type of lock chaining is not ideal, so we might
+ * want to look for a better solution at some point.)
 *
 * We need to retain a pin on the primary bucket to ensure that no concurrent
 * split can start.
-- 
1.8.3.1
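
Note on why scans may safely lag behind or overtake vacuum on WAL-logged
tables: the README hunk above relies on the scan remembering the page LSN
when it copies matching items into its backend-local array, and re-checking
that LSN before marking any of those items dead.  Below is a minimal,
self-contained sketch of that re-check.  Page, ScanPos, copy_items and
kill_items_safely are simplified stand-ins invented for illustration; they
are not the real PostgreSQL buffer or _hash_kill_items API.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t XLogRecPtr;	/* stand-in for a WAL location (LSN) */

typedef struct Page
{
	XLogRecPtr	lsn;			/* advanced whenever WAL is written for the page */
} Page;

typedef struct ScanPos
{
	XLogRecPtr	lsn;			/* page LSN remembered at copy time */
	int			nitems;			/* number of matching items copied locally */
} ScanPos;

/* Page-at-a-time mode: copy all matching items from the page at once. */
static void
copy_items(const Page *page, ScanPos *pos)
{
	pos->lsn = page->lsn;		/* remember which version of the page we saw */
	pos->nitems = 2;			/* pretend two tuples matched */
}

/*
 * Before marking the copied items dead on the page, verify that the page
 * has not changed (e.g. by vacuum) since we copied them.  If the LSN moved,
 * the remembered offsets may point at the wrong tuples, so do nothing.
 * For unlogged/temporary tables the LSN never moves, which is why the patch
 * instead forbids scans from overtaking vacuum there.
 */
static bool
kill_items_safely(const Page *page, const ScanPos *pos)
{
	if (page->lsn != pos->lsn)
		return false;			/* page changed under us; skip the kill step */
	/* ...safe to mark the remembered offsets dead here... */
	return true;
}

int
main(void)
{
	Page		page = {.lsn = 100};
	ScanPos		pos;

	copy_items(&page, &pos);
	page.lsn = 101;				/* concurrent vacuum wrote WAL for this page */

	printf("kill attempted: %s\n",
		   kill_items_safely(&page, &pos) ? "done" : "skipped (LSN changed)");
	return 0;
}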
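
The hashbucketcleanup() change itself boils down to the order in which the
lock on the previous page is released relative to taking the lock on the
next page.  Here is a sketch of the two orders, using hypothetical
lock_page()/unlock_page() helpers that merely trace the operations; in the
real code these are buffer-manager calls (LockBuffer, _hash_relbuf,
_hash_getbuf_with_strategy).

#include <stdio.h>

/* Hypothetical stand-ins that just trace the order of lock operations. */
static void lock_page(int blkno)   { printf("  lock page %d\n", blkno); }
static void unlock_page(int blkno) { printf("  unlock page %d\n", blkno); }

/*
 * WAL-logged table: release the previous page before locking the next one.
 * A concurrent scan may slip past vacuum here, which is safe because of
 * the page-LSN re-check sketched above.
 */
static void
advance_logged(int cur, int next)
{
	unlock_page(cur);
	lock_page(next);
}

/*
 * Unlogged or temporary table: chain the locks, taking the next page's
 * lock before releasing the previous one, so a scan can never overtake
 * vacuum within the bucket chain.
 */
static void
advance_unlogged(int cur, int next)
{
	lock_page(next);
	unlock_page(cur);
}

int
main(void)
{
	printf("regular (WAL-logged) order:\n");
	advance_logged(1, 2);
	printf("unlogged/temporary order:\n");
	advance_unlogged(1, 2);
	return 0;
}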