From 0056ced7353926c419fa9235d973fecfa6538382 Mon Sep 17 00:00:00 2001
From: Tomas Vondra
Date: Sat, 22 Oct 2022 01:39:39 +0200
Subject: [PATCH 08/11] wip: adaptive watermark step

Another option is to adjust the watermark step based on past tuplesort
executions, and either increase or decrease the step based on whether
the sort was in-memory or on-disk, etc. To do this, set the GUC to -1:

    SET brinsort_watermark_step = -1;
---
 src/backend/access/brin/brin_minmax.c   |   7 +-
 src/backend/executor/nodeBrinSort.c     | 189 +++++++++++++++++++++++-
 src/backend/optimizer/plan/createplan.c |  21 +--
 src/backend/utils/misc/guc_tables.c     |   2 +-
 src/include/nodes/execnodes.h           |   2 +-
 src/include/nodes/plannodes.h           |   3 +-
 6 files changed, 206 insertions(+), 18 deletions(-)
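Illustration (below the cut, not part of the patch): the executor-side
adjustment added in nodeBrinSort.c is easiest to read as a feedback loop.
The standalone sketch below models that loop; adjust_step() and its
parameters are made-up names, and it mirrors only the halving on spill,
the 2/3-of-work_mem target, the 2x growth cap and the 1..8192 clamp - it
ignores the LIMIT/remaining-rows handling and the AM-stats blending.

#include <stdio.h>

/* clamp bounds, same as in brinsort_adjust_watermark_step() */
#define MIN_STEP	1
#define MAX_STEP	8192

static int
adjust_step(int step, int spilled_to_disk,
			long space_used_kb, long ntuples, int work_mem_kb)
{
	if (spilled_to_disk)
	{
		/* on-disk sort: hard to say by how much, so just halve the step */
		step = step / 2;
	}
	else if (ntuples > 0)
	{
		/* average row width observed in the last in-memory sort */
		long		avgwidth = (space_used_kb * 1024L) / ntuples;

		/* rows that would fill 2/3 of work_mem */
		long		nrows_wmem = (2 * 1024L * work_mem_kb / 3) / avgwidth;

		/* scale the step accordingly, but grow at most 2x at a time */
		int			scaled = (int) (step * ((double) (nrows_wmem * avgwidth) /
											(space_used_kb * 1024L)));

		step = (scaled < step * 2) ? scaled : step * 2;
	}

	if (step < MIN_STEP)
		step = MIN_STEP;
	if (step > MAX_STEP)
		step = MAX_STEP;
	return step;
}

int
main(void)
{
	/* in-memory sort, 1MB used for 10000 rows, work_mem = 4MB: grows to 8 */
	printf("next step: %d\n", adjust_step(4, 0, 1024, 10000, 4096));

	/* the same sort spilled to disk: halved to 2 */
	printf("next step: %d\n", adjust_step(4, 1, 1024, 10000, 4096));

	return 0;
}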
diff --git a/src/backend/access/brin/brin_minmax.c b/src/backend/access/brin/brin_minmax.c
index 6b08d8b288b..85f8992f878 100644
--- a/src/backend/access/brin/brin_minmax.c
+++ b/src/backend/access/brin/brin_minmax.c
@@ -47,9 +47,6 @@ static FmgrInfo *minmax_get_strategy_procinfo(BrinDesc *bdesc, uint16 attno,
 											   Oid subtype, uint16 strategynum);
 
-/* print info about ranges */
-#define BRINSORT_DEBUG
-
 Datum
 brin_minmax_opcinfo(PG_FUNCTION_ARGS)
 {
@@ -1995,7 +1992,7 @@ brin_minmax_scan_add_tuple(BrinRangeScanDesc *scan, TupleTableSlot *slot,
 	scan->nranges++;
 }
 
-#ifdef BRINSORT_DEBUG
+#ifdef BRIN_SORT_DEBUG
 /*
  * brin_minmax_scan_next
  *		Return the next BRIN range information from the tuplestore.
@@ -2204,7 +2201,7 @@ brin_minmax_ranges(PG_FUNCTION_ARGS)
 	/* do the sort and any necessary post-processing */
 	brin_minmax_scan_finalize(brscan);
 
-#ifdef BRINSORT_DEBUG
+#ifdef BRIN_SORT_DEBUG
 	brin_minmax_scan_dump(brscan);
 #endif
 
diff --git a/src/backend/executor/nodeBrinSort.c b/src/backend/executor/nodeBrinSort.c
index f8356202b77..08507f2b5d9 100644
--- a/src/backend/executor/nodeBrinSort.c
+++ b/src/backend/executor/nodeBrinSort.c
@@ -218,6 +218,8 @@
  *		ExecBrinSortReInitializeDSM		reinitialize DSM for fresh scan
  *		ExecBrinSortInitializeWorker	attach to DSM info in parallel worker
  */
+#include <math.h>
+
 #include "postgres.h"
 
 #include "access/brin.h"
@@ -248,6 +250,14 @@ static void ExecInitBrinSortRanges(BrinSort *node, BrinSortState *planstate);
 bool		debug_brin_sort = false;
 #endif
 
+/*
+ * How many distinct minval values to look forward for the next watermark?
+ *
+ * The smallest step we can do is 1, which means the immediately following
+ * (but distinct) minval.
+ */
+int			brinsort_watermark_step = 0;
+
 /* do various consistency checks */
 static void
 AssertCheckRanges(BrinSortState *node)
@@ -859,6 +869,175 @@ brinsort_rescan(BrinSortState *node)
 		tuplesort_rescan(node->bs_scan->ranges);
 }
 
+/*
+ * Look at the tuplesort statistics, and maybe increase or decrease the
+ * watermark step. If the last sort spilled to disk, we cut the step in
+ * half. If the sort stayed in memory, we scale the step so the next sort
+ * uses roughly 2/3 of work_mem.
+ *
+ * XXX This should probably behave differently for LIMIT queries, so that
+ * we don't load too many rows unnecessarily. We already consider that in
+ * create_brinsort_plan, but maybe we should limit increments to the step
+ * value here too - say, by tracking how many rows we are supposed to
+ * produce, and limiting the watermark so that we don't process too many
+ * rows in future steps.
+ *
+ * XXX We might also track the number of rows in the sort and the space
+ * used, to calculate a more accurate estimate of the row width, and then
+ * use that to calculate the number of rows that fit into work_mem. But
+ * the number of rows added to the tuplesort (per range) would still
+ * remain fairly inaccurate, so it's not clear how good this would be.
+ */
+static void
+brinsort_adjust_watermark_step(BrinSortState *node, TuplesortInstrumentation *stats)
+{
+	BrinSort   *plan = (BrinSort *) node->ss.ps.plan;
+
+	if (brinsort_watermark_step != -1)
+		return;
+
+	if (stats->spaceType == SORT_SPACE_TYPE_DISK)
+	{
+		/*
+		 * We don't know how much to decrease the step (it's hard to
+		 * estimate, because the space needed for in-memory and on-disk
+		 * sorts is not easily comparable), so we just cut the step in
+		 * half. For an in-memory sort, we can then make a better
+		 * estimate and increase the step more accurately.
+		 */
+		plan->watermark_step = Max(1, plan->watermark_step / 2);
+	}
+	else
+	{
+		/*
+		 * Adjust the step based on the last sort - we shoot for 2/3 of
+		 * work_mem, to keep some slack (and not switch to on-disk sort
+		 * due to minor differences). We calculate the average row width
+		 * from the space used and the number of rows in the tuplesort,
+		 * then the number of rows we could fit into work_mem, and how
+		 * many steps that would mean (assuming the number of rows is
+		 * proportional to the number of steps).
+		 *
+		 * We need to be careful about the number of rows we're supposed
+		 * to produce (and how many we already produced). Consider for
+		 * example a query with LIMIT 1000, where we produce 999 rows in
+		 * the first sort, so that we need only 1 more row. It would be
+		 * silly to pick the step with the goal of filling work_mem
+		 * instead of just enough to produce the one row.
+		 *
+		 * XXX In principle, we don't know how many rows will need to be
+		 * read from the table - there may be interesting rows already in
+		 * the tuplestore (in which case we could do a smaller step). But
+		 * we don't know how many such rows are there - maybe we should
+		 * have multiple smaller tuplestores, which would also reduce the
+		 * amount of "respill" we need to do.
+		 */
+		int			nrows_remaining;
+		int			step = plan->watermark_step;
+		int			step_max = plan->watermark_step * 2;
+
+		/* number of remaining rows we're expected to produce */
+		nrows_remaining = Max(1.0, plan->step_maxrows - node->bs_stats.ntuples_tuplesort_all);
+
+		/*
+		 * If we sorted any rows, calculate how many similar rows we can fit
+		 * into work_mem. We restrict ourselves to 2/3 of work_mem, to leave
+		 * a bit of slack space.
+		 *
+		 * XXX Hopefully the average width is somewhat accurate, but maybe
+		 * we should remember the width we originally expected, and combine
+		 * the two somehow. Maybe we should not use just the last tuplesort,
+		 * but instead accumulate the average from all preceding sorts and
+		 * combine them somehow (say, using a weighted average with older
+		 * values having less influence).
+		 */
+		if (node->bs_stats.ntuples_tuplesort > 0)
+		{
+			int			nrows_wmem;
+			int			avgwidth;
+
+			/* average tuple width, calculated from the last sort */
+			avgwidth = (stats->spaceUsed * 1024L / node->bs_stats.ntuples_tuplesort);
+
+			/*
+			 * Calculate the number of rows to fit into 2/3 of work_mem,
+			 * but cap it to the number of rows we're expected to produce.
+			 */
+			nrows_wmem = Min(nrows_remaining, (2 * 1024L * work_mem / 3) / avgwidth);
+
+			/* scale the number of steps to produce the number of rows */
+			step = step * ((double) (nrows_wmem * avgwidth) / (stats->spaceUsed * 1024L));
+
+			/* remember this as the max, so that we don't overflow work_mem */
+			step_max = Min(step, step_max);
+
+			/* however, make sure we don't grow too fast - cap to 2x */
+			step = Min(step, step_max);
+		}
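+
+		/*
+		 * For example (made-up numbers): with work_mem = 4MB, if the last
+		 * sort used 1MB for 10000 rows, then avgwidth is 104 bytes and
+		 * 2/3 of work_mem fits ~26886 such rows, so the scaling above
+		 * asks for ~2.7x the current step - which the step_max cap then
+		 * limits to 2x.
+		 */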
+
+		/*
+		 * Now calculate the average step size using data from all the
+		 * sorts we did up to now, and from that the number of steps we
+		 * expect to be necessary.
+		 *
+		 * If we calculated the average number of rows per step from AM
+		 * stats, consider that too. It's possible the last batch had just
+		 * one row, which might result in a very high estimate of steps -
+		 * it'd be silly to jump e.g. from 1 to 1000 based on such an
+		 * unreliable statistic. To prevent that, we combine the two
+		 * rows_per_step sources as a weighted sum, using the observed vs.
+		 * target number of rows as the weight. The closer we are to the
+		 * target, the more reliable the value from past executions is.
+		 *
+		 * But we don't want to overflow work_mem, so cap by step_max.
+		 */
+		if (node->bs_stats.ntuples_tuplesort_all > 0)
+		{
+			double		rows_per_step;
+
+			/* average number of rows we produced per step so far */
+			rows_per_step = (double) node->bs_stats.ntuples_tuplesort_all / node->bs_stats.watermark_updates_steps;
+
+			/*
+			 * If we have AM stats with the average number of rows per
+			 * step, consider that too - interpolate depending on what
+			 * fraction of rows we already produced (with a higher
+			 * fraction of rows produced we prefer the local average,
+			 * as opposed to the global average from the index AM stats).
+			 */
+			if (plan->rows_per_step > 0)
+			{
+				/* number of rows we already produced (as a fraction) */
+				double		weight = (double) node->bs_stats.ntuples_tuplesort_all / plan->step_maxrows;
+
+				/* paranoia */
+				weight = Min(1.0, weight);
+
+				/*
+				 * Interpolate between the index AM and "local" averages
+				 * calculated from past executions. The closer we get to
+				 * the target rows, the more we ignore the index AM stats.
+				 */
+				rows_per_step = weight * rows_per_step + (1 - weight) * plan->rows_per_step;
+			}
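+
+			/*
+			 * For example (made-up numbers): asked for 10000 rows, with
+			 * 7500 produced so far at 500 rows/step locally, and AM stats
+			 * predicting 2000 rows/step: weight = 0.75, so the blended
+			 * estimate is 0.75 * 500 + 0.25 * 2000 = 875 rows/step.
+			 */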
+
+			/* estimate the number of steps to produce the remaining rows */
+			step = Max(step, ceil((double) nrows_remaining / rows_per_step));
+
+			/*
+			 * But don't overflow the current max (which is set either to
+			 * 2x the starting value, or derived from work_mem).
+			 */
+			step = Min(step, step_max);
+		}
+
+		plan->watermark_step = step;
+	}
+
+	plan->watermark_step = Max(1, plan->watermark_step);
+	plan->watermark_step = Min(8192, plan->watermark_step);
+}
+
 /* ----------------------------------------------------------------
  *		IndexNext
  *
@@ -997,13 +1176,21 @@ IndexNext(BrinSortState *node)
 
 					/*
 					 * Reset tuplesort statistics between runs, otherwise
-					 * we'll keep re-using stats from the largest run.
+					 * we'll keep re-using stats from the largest run, which
+					 * would then confuse the adaptive adjustment of the
+					 * watermark step.
 					 */
 					tuplesort_reset_stats(node->bs_tuplesortstate);
 
 					tuplesort_performsort(node->bs_tuplesortstate);
 					node->bs_stats.sort_count++;
 
+					memset(&stats, 0, sizeof(TuplesortInstrumentation));
+					tuplesort_get_stats(node->bs_tuplesortstate, &stats);
+
+					brinsort_adjust_watermark_step(node, &stats);
+
 					node->bs_stats.ntuples_tuplesort = 0;
 
 					tuplesort_get_stats(node->bs_tuplesortstate, &stats);
 
diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c
index c88337bd310..c9c47f24e3d 100644
--- a/src/backend/optimizer/plan/createplan.c
+++ b/src/backend/optimizer/plan/createplan.c
@@ -324,13 +324,8 @@ static GatherMerge *create_gather_merge_plan(PlannerInfo *root,
 											 GatherMergePath *best_path);
 
-/*
- * How many distinct minval values to look forward for the next watermark?
- *
- * The smallest step we can do is 1, which means the immediately following
- * (while distinct) minval.
- */
-int brinsort_watermark_step = 0;
+/* defined in nodeBrinSort.c */
+extern int brinsort_watermark_step;
 
 /*
  * create_plan
@@ -3449,8 +3444,14 @@ create_brinsort_plan(PlannerInfo *root,
 	 * are there, and if there are only few then try increasing the step?
 	 */
 	brinsort_plan->watermark_step = brinsort_watermark_step;
+	brinsort_plan->rows_per_step = -1;
 
-	if (brinsort_plan->watermark_step == 0)
+	if (root->limit_tuples > 0)
+		brinsort_plan->step_maxrows = root->limit_tuples;
+	else
+		brinsort_plan->step_maxrows = brinsort_plan->scan.plan.plan_rows;
+
+	if (brinsort_plan->watermark_step <= 0)
 	{
 		BrinMinmaxStats *amstats;
 
@@ -3478,7 +3479,9 @@ create_brinsort_plan(PlannerInfo *root,
 										 amstats->maxval_increment_avg);
 		double		rows_per_step = Max(1.0, pct_per_step * rows);
 
-		brinsort_plan->watermark_step = (int) (maxrows / rows_per_step);
+		brinsort_plan->rows_per_step = rows_per_step;
+
+		brinsort_plan->watermark_step = (int) ceil(maxrows / rows_per_step);
 	}
 
 	/* some rough safety estimates */
diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c
index 63b164edbeb..b1d7879f028 100644
--- a/src/backend/utils/misc/guc_tables.c
+++ b/src/backend/utils/misc/guc_tables.c
@@ -3542,7 +3542,7 @@ struct config_int ConfigureNamesInt[] =
 			GUC_NOT_IN_SAMPLE
 		},
 		&brinsort_watermark_step,
-		0, 0, INT_MAX,
+		0, -1, INT_MAX,
 		NULL, NULL, NULL
 	},
 
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 06dc6416d99..a3059314054 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -1713,7 +1713,7 @@ typedef struct BrinSortState
 	 * We need two tuplesort instances - one for current range, one for
 	 * spill-over tuples from the overlapping ranges
 	 */
-	void	   *bs_tuplesortstate;
+	Tuplesortstate *bs_tuplesortstate;
 	Tuplestorestate *bs_tuplestore;
 } BrinSortState;
 
diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h
index 659a6d110ee..b0cff1d02d2 100644
--- a/src/include/nodes/plannodes.h
+++ b/src/include/nodes/plannodes.h
@@ -533,7 +533,8 @@ typedef struct BrinSort
 
 	/* number of watermark steps to make */
 	int			watermark_step;
-
+	int			step_maxrows;	/* rows the node is expected to produce */
+	double		rows_per_step;	/* rows per step, from index AM stats */
 } BrinSort;
 
 /* ----------------
-- 
2.39.2
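Illustration (not part of the patch): the planner-side seed set in
create_brinsort_plan() boils down to the sketch below. The names
initial_watermark_step, pct_per_step and reltuples are stand-ins, since
parts of that hunk are elided, and the "rough safety estimates" applied
afterwards are omitted.

#include <math.h>

static int
initial_watermark_step(double pct_per_step, double reltuples, double maxrows)
{
	/* expected number of rows emitted per watermark step, at least 1 */
	double		rows_per_step = pct_per_step * reltuples;

	if (rows_per_step < 1.0)
		rows_per_step = 1.0;

	/* round up, so we never plan fewer steps than needed */
	return (int) ceil(maxrows / rows_per_step);
}

For instance, if each step is expected to cover about 0.1% of a 1M-row
table (1000 rows/step) and the query needs at most 10000 rows, this seeds
the step at ceil(10000 / 1000) = 10, which the executor then adapts at
run time when the GUC is set to -1.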