diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c
new file mode 100644
index 5f21fcb..af92835
--- a/src/backend/commands/analyze.c
+++ b/src/backend/commands/analyze.c
@@ -2557,25 +2557,21 @@ compute_scalar_stats(VacAttrStatsP stats
 		 * Decide how many values are worth storing as most-common values. If
 		 * we are able to generate a complete MCV list (all the values in the
 		 * sample will fit, and we think these are all the ones in the table),
-		 * then do so.  Otherwise, store only those values that are
-		 * significantly more common than the (estimated) average. We set the
-		 * threshold rather arbitrarily at 25% more than average, with at
-		 * least 2 instances in the sample.  Also, we won't suppress values
-		 * that have a frequency of at least 1/K where K is the intended
-		 * number of histogram bins; such values might otherwise cause us to
-		 * emit duplicate histogram bin boundaries.  (We might end up with
-		 * duplicate histogram entries anyway, if the distribution is skewed;
-		 * but we prefer to treat such values as MCVs if at all possible.)
+		 * then do so.  Otherwise, keep only those values that appear
+		 * sufficiently often in the sample that it is reasonable to
+		 * extrapolate their sample frequencies to the entire table. We do
+		 * this by placing an upper bound on the relative standard error of
+		 * the sample frequency, so that any estimates the planner generates
+		 * from these statistics can be expected to be reasonably accurate.
 		 *
 		 * Note: the first of these cases is meant to address columns with
 		 * small, fixed sets of possible values, such as boolean or enum
 		 * columns.  If we can *completely* represent the column population by
 		 * an MCV list that will fit into the stats target, then we should do
 		 * so and thus provide the planner with complete information.  But if
-		 * the MCV list is not complete, it's generally worth being more
-		 * selective, and not just filling it all the way up to the stats
-		 * target.  So for an incomplete list, we try to take only MCVs that
-		 * are significantly more common than average.
+		 * the MCV list is not complete, then we need to be more selective, to
+		 * avoid including values that aren't common enough in the sample to
+		 * generate accurate statistics for the population.
 		 */
 		if (track_cnt == ndistinct && toowide_cnt == 0 &&
 			stats->stadistinct > 0 &&
@@ -2586,24 +2582,39 @@ compute_scalar_stats(VacAttrStatsP stats
 		}
 		else
 		{
-			double		ndistinct_table = stats->stadistinct;
-			double		avgcount,
-						mincount,
-						maxmincount;
+			/*----------
+			 * Discard values whose relative standard error is too high. A
+			 * common rule of thumb when estimating errors in this situation
+			 * is to require at least 10 instances in the sample. This is
+			 * sufficient to allow the distribution of the sample frequency (a
+			 * hypergeometric distribution, since we are doing sampling
+			 * without replacement) to be approximated by a normal
+			 * distribution, and standard error analysis techniques can be
+			 * applied. Then, if the sample size is n, the population size is
+			 * N, and the sample frequency is p=cnt/n (cnt being the number of
+			 * instances seen), the standard error on p is given by
+			 *		SE = sqrt(p*(1-p)/n) * sqrt((N-n)/(N-1))
+			 * where the second term is the finite population correction. We
+			 * impose an (arbitrarily chosen) upper bound on the relative
+			 * standard error of 10% -- i.e., SE/p < 0.1. This gives a lower
+			 * bound on the number of instances of the value seen:
+			 *		cnt > n*(N-n) / (N-n+0.01*n*(N-1))
+			 * This bound is at most 100, and approaches 0 as n approaches 0
+			 * or N. The case where n approaches 0 isn't actually possible,
+			 * since the sample size is at least 300. The case where n
+			 * approaches N corresponds to sampling most of the table, in
+			 * which case it is reasonable to keep the whole MCV list, as we
+			 * do above. Thus it is reasonable to apply this bound for all
+			 * inputs (even though the formula is technically only valid when
+			 * the right-hand side is at least around 10), giving a smooth
+			 * transition from this code branch to the all-values-seen branch
+			 * above.
+			 *----------
+			 */
+			double		n = samplerows;
+			double		N = totalrows;
+			double		mincount = n*(N-n) / (N-n+0.01*n*(N-1));
 
-			/* Re-extract estimate of # distinct nonnull values in table */
-			if (ndistinct_table < 0)
-				ndistinct_table = -ndistinct_table * totalrows;
-			/* estimate # occurrences in sample of a typical nonnull value */
-			avgcount = (double) nonnull_cnt / ndistinct_table;
-			/* set minimum threshold count to store a value */
-			mincount = avgcount * 1.25;
-			if (mincount < 2)
-				mincount = 2;
-			/* don't let threshold exceed 1/K, however */
-			maxmincount = (double) values_cnt / (double) num_bins;
-			if (mincount > maxmincount)
-				mincount = maxmincount;
 			if (num_mcv > track_cnt)
 				num_mcv = track_cnt;
 			for (i = 0; i < num_mcv; i++)