From 755b6383aa8abaeff2e040841e80a2a1cf893e12 Mon Sep 17 00:00:00 2001 From: Jinbao Chen Date: Fri, 22 Nov 2019 16:55:09 +0800 Subject: [PATCH] Added the selection rate of the inner table non-empty bucket The planner will use big table as inner table in hash join if small table have fewer unique values. But this plan is much slower than using small table as inner table. In general, the cost of creating a hash table is higher than the cost of querying a hash table. So we tend to use small tables as internal tables. But if the average chain length of the bucket is large, the situation is just the opposite. If virtualbuckets is much larger than innerndistinct, and outerndistinct is much larger than innerndistinct. Then most tuples of the outer table will match the empty bucket. So when we calculate the cost of traversing the bucket, we need to ignore the tuple matching empty bucket. So we add the selection rate of the inner table non-empty bucket. The formula is: (1 - ((outerndistinct - innerndistinct)/outerndistinct)* ((virtualbuckets - innerndistinct)/virtualbuckets)) --- src/backend/optimizer/path/costsize.c | 73 ++++++++++++++++++++++++++++++++- src/test/regress/expected/join_hash.out | 24 +++++++++++ src/test/regress/sql/join_hash.sql | 19 +++++++++ 3 files changed, 115 insertions(+), 1 deletion(-) diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index c5f6593485..2633b020ed 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -3382,6 +3382,9 @@ final_cost_hashjoin(PlannerInfo *root, HashPath *path, double virtualbuckets; Selectivity innerbucketsize; Selectivity innermcvfreq; + double outerndistinct; + double innerndistinct; + Selectivity outer_match_nonempty_frac; ListCell *hcl; /* Mark the path with the correct row estimate */ @@ -3426,20 +3429,30 @@ final_cost_hashjoin(PlannerInfo *root, HashPath *path, * because we avoid contaminating the cache with a value that's wrong for 
* non-unique-ified paths. */ + outerndistinct = 1.0; + if (IsA(inner_path, UniquePath)) { innerbucketsize = 1.0 / virtualbuckets; innermcvfreq = 0.0; + innerndistinct = inner_path_rows; } else { innerbucketsize = 1.0; innermcvfreq = 1.0; + innerndistinct = 1.0; + foreach(hcl, hashclauses) { RestrictInfo *restrictinfo = lfirst_node(RestrictInfo, hcl); Selectivity thisbucketsize; Selectivity thismcvfreq; + double thisinnerndistinct; + double thisouterndistinct; + VariableStatData vardatainner; + VariableStatData vardataouter; + bool isdefault; /* * First we have to figure out which side of the hashjoin clause @@ -3465,6 +3478,25 @@ final_cost_hashjoin(PlannerInfo *root, HashPath *path, thisbucketsize = restrictinfo->right_bucketsize; } thismcvfreq = restrictinfo->right_mcvfreq; + + examine_variable(root, get_rightop(restrictinfo->clause), 0, &vardatainner); + thisinnerndistinct = get_variable_numdistinct(&vardatainner, &isdefault); + if (vardatainner.rel && vardatainner.rel->tuples > 0) + { + thisinnerndistinct *= vardatainner.rel->rows / vardatainner.rel->tuples; + thisinnerndistinct = clamp_row_est(thisinnerndistinct); + } + ReleaseVariableStats(vardatainner); + + /* lefthand side is outer */ + examine_variable(root, get_leftop(restrictinfo->clause), 0, &vardataouter); + thisouterndistinct = get_variable_numdistinct(&vardataouter, &isdefault); + if (vardataouter.rel && vardataouter.rel->tuples > 0) + { + thisouterndistinct *= vardataouter.rel->rows / vardataouter.rel->tuples; + thisouterndistinct = clamp_row_est(thisouterndistinct); + } + ReleaseVariableStats(vardataouter); } else { @@ -3483,12 +3515,35 @@ final_cost_hashjoin(PlannerInfo *root, HashPath *path, thisbucketsize = restrictinfo->left_bucketsize; } thismcvfreq = restrictinfo->left_mcvfreq; + + examine_variable(root, get_leftop(restrictinfo->clause), 0, &vardatainner); + thisinnerndistinct = get_variable_numdistinct(&vardatainner, &isdefault); + if (vardatainner.rel && vardatainner.rel->tuples > 0) + { + 
thisinnerndistinct *= vardatainner.rel->rows / vardatainner.rel->tuples; + thisinnerndistinct = clamp_row_est(thisinnerndistinct); + } + ReleaseVariableStats(vardatainner); + + /* righthand side is outer */ + examine_variable(root, get_rightop(restrictinfo->clause), 0, &vardataouter); + thisouterndistinct = get_variable_numdistinct(&vardataouter, &isdefault); + if (vardataouter.rel && vardataouter.rel->tuples > 0) + { + thisouterndistinct *= vardataouter.rel->rows / vardataouter.rel->tuples; + thisouterndistinct = clamp_row_est(thisouterndistinct); + } + ReleaseVariableStats(vardataouter); } if (innerbucketsize > thisbucketsize) innerbucketsize = thisbucketsize; if (innermcvfreq > thismcvfreq) innermcvfreq = thismcvfreq; + if (outerndistinct < thisouterndistinct) + outerndistinct = thisouterndistinct; + if (innerndistinct < thisinnerndistinct) + innerndistinct = thisinnerndistinct; } } @@ -3516,6 +3571,21 @@ final_cost_hashjoin(PlannerInfo *root, HashPath *path, /* CPU costs */ + /* + * If virtualbuckets is much larger than innerndistinct, and + * outerndistinct is much larger than innerndistinct. Then most + * tuples of the outer table will match the empty bucket. So when + * we calculate the cost of traversing the bucket, we need to ignore + * the tuple matching empty bucket. 
+ */ + outer_match_nonempty_frac = 1.0; + if (virtualbuckets > innerndistinct * 2 && outerndistinct > innerndistinct * 2) + { + outer_match_nonempty_frac = (1 - + ((outerndistinct - innerndistinct)/outerndistinct)* + ((virtualbuckets - innerndistinct)/virtualbuckets)); + } + if (path->jpath.jointype == JOIN_SEMI || path->jpath.jointype == JOIN_ANTI || extra->inner_unique) @@ -3539,7 +3609,7 @@ final_cost_hashjoin(PlannerInfo *root, HashPath *path, inner_scan_frac = 2.0 / (extra->semifactors.match_count + 1.0); startup_cost += hash_qual_cost.startup; - run_cost += hash_qual_cost.per_tuple * outer_matched_rows * + run_cost += hash_qual_cost.per_tuple * outer_matched_rows * outer_match_nonempty_frac * clamp_row_est(inner_path_rows * innerbucketsize * inner_scan_frac) * 0.5; /* @@ -3579,6 +3649,7 @@ final_cost_hashjoin(PlannerInfo *root, HashPath *path, */ startup_cost += hash_qual_cost.startup; run_cost += hash_qual_cost.per_tuple * outer_path_rows * + outer_match_nonempty_frac * clamp_row_est(inner_path_rows * innerbucketsize) * 0.5; /* diff --git a/src/test/regress/expected/join_hash.out b/src/test/regress/expected/join_hash.out index 3a91c144a2..548f9724ad 100644 --- a/src/test/regress/expected/join_hash.out +++ b/src/test/regress/expected/join_hash.out @@ -1012,4 +1012,28 @@ WHERE text | t | hjtest_1 | hjtest_2 (1 row) +-- If virtualbuckets is much larger than innerndistinct, and +-- outerndistinct is much larger than innerndistinct. Then most +-- tuples of the outer table will match the empty bucket. So when +-- we calculate the cost of traversing the bucket, we need to ignore +-- the tuple matching empty bucket. 
+savepoint settings; +set max_parallel_workers_per_gather = 0; +create table join_hash_t_small(a int); +create table join_hash_t_big(b int); +insert into join_hash_t_small select i%100 from generate_series(0, 3000)i; +insert into join_hash_t_big select i%100000 from generate_series(1, 100000)i ; +analyze join_hash_t_small; +analyze join_hash_t_big; +explain (costs off) select * from join_hash_t_small, join_hash_t_big where a = b; + QUERY PLAN +-------------------------------------------------------- + Hash Join + Hash Cond: (join_hash_t_big.b = join_hash_t_small.a) + -> Seq Scan on join_hash_t_big + -> Hash + -> Seq Scan on join_hash_t_small +(5 rows) + +rollback to settings; ROLLBACK; diff --git a/src/test/regress/sql/join_hash.sql b/src/test/regress/sql/join_hash.sql index 68c1a8c7b6..154e9e0085 100644 --- a/src/test/regress/sql/join_hash.sql +++ b/src/test/regress/sql/join_hash.sql @@ -537,4 +537,23 @@ WHERE AND (SELECT hjtest_2.c * 5) < 55 AND hjtest_1.a <> hjtest_2.b; +-- If virtualbuckets is much larger than innerndistinct, and +-- outerndistinct is much larger than innerndistinct. Then most +-- tuples of the outer table will match the empty bucket. So when +-- we calculate the cost of traversing the bucket, we need to ignore +-- the tuple matching empty bucket. +savepoint settings; +set max_parallel_workers_per_gather = 0; +create table join_hash_t_small(a int); +create table join_hash_t_big(b int); + +insert into join_hash_t_small select i%100 from generate_series(0, 3000)i; +insert into join_hash_t_big select i%100000 from generate_series(1, 100000)i ; + +analyze join_hash_t_small; +analyze join_hash_t_big; + +explain (costs off) select * from join_hash_t_small, join_hash_t_big where a = b; +rollback to settings; + ROLLBACK; -- 2.14.2