From f17117fd0aa3a362294ba51475abc576668ab95d Mon Sep 17 00:00:00 2001 From: Oleksandr Shulgin Date: Wed, 2 Mar 2016 18:54:09 +0100 Subject: [PATCH 2/2] Try to account for skewed distributions in ANALYZE This aims to produce better (more unique) compressed histograms, where appropriate, by allowing slightly more values in the MCV lists than before. Another observed effect is that, in the face of a highly skewed distribution of value frequencies in the sample, the MCV list is more predictable between runs and less dependent on pure luck. --- src/backend/commands/analyze.c | 90 +++++++++++++++++++++++++++++------------- 1 file changed, 62 insertions(+), 28 deletions(-) diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index f05b496..4661f7f 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -2132,26 +2132,35 @@ compute_distinct_stats(VacAttrStatsP stats, { int sample_cnt = nonnull_cnt - toowide_cnt; double ndistinct = stats->stadistinct; - double avgcount, - mincount; if (ndistinct < 0) ndistinct = -ndistinct * sample_cnt; - /* estimate # of occurrences in sample of a typical value */ - avgcount = (double) sample_cnt / ndistinct; - /* set minimum threshold count to store a value */ - mincount = avgcount * 1.25; - if (mincount < 2) - mincount = 2; + + /* sanity check in case of a very wrong estimate above */ + if (ndistinct < track_cnt) + ndistinct = track_cnt; + if (num_mcv > track_cnt) num_mcv = track_cnt; for (i = 0; i < num_mcv; i++) { + double avgcount, + mincount; + + /* estimate # of occurrences in sample of a typical value */ + avgcount = (double) sample_cnt / ndistinct; + + /* set minimum threshold count to store a value */ + mincount = avgcount * 1.25; + if (track[i].count < mincount) { num_mcv = i; break; } + + sample_cnt -= track[i].count; + ndistinct -= 1.0; } } @@ -2479,6 +2488,8 @@ compute_scalar_stats(VacAttrStatsP stats, * emit duplicate histogram bin boundaries. 
(We might end up with * duplicate histogram entries anyway, if the distribution is skewed; * but we prefer to treat such values as MCVs if at all possible.) + * We also decrease ndistinct in the process such that going forward + * it refers to the number of distinct values left for the histogram. */ if (track_cnt == ndistinct && toowide_cnt == 0 && stats->stadistinct > 0 && @@ -2486,32 +2497,58 @@ compute_scalar_stats(VacAttrStatsP stats, { /* Track list includes all values seen, and all will fit */ num_mcv = track_cnt; + + /* Nothing left for the histogram */ + num_hist = 0; + ndistinct = 0; } else { - double avgcount, - mincount, - maxmincount; - - /* estimate # of occurrences in sample of a typical value */ - avgcount = (double) values_cnt / (double) ndistinct; - /* set minimum threshold count to store a value */ - mincount = avgcount * 1.25; - if (mincount < 2) - mincount = 2; - /* don't let threshold exceed 1/K, however */ - maxmincount = (double) values_cnt / (double) num_bins; - if (mincount > maxmincount) - mincount = maxmincount; + /* + * Starting number of values left for the histogram: samplerows + * sans nulls and too wide ones. 
+ */ + int sample_cnt = values_cnt; + + num_hist = ndistinct; + if (num_hist > num_bins) + num_hist = num_bins + 1; + if (num_mcv > track_cnt) num_mcv = track_cnt; for (i = 0; i < num_mcv; i++) { - if (track[i].count < mincount) + if (num_hist >= 2) { - num_mcv = i; - break; + double avgcount, + mincount, + maxmincount; + + /* estimate # of occurrences in sample of a typical value */ + avgcount = (double) sample_cnt / (double) ndistinct; + + /* set minimum threshold count to store a value */ + mincount = 1.25 * avgcount; + + /* don't let threshold exceed 1/K, however */ + maxmincount = (sample_cnt - 1) / (double) (num_hist - 1); + if (mincount > maxmincount) + mincount = maxmincount; + if (track[i].count < mincount) + { + num_mcv = i; + break; + } } + + /* Narrow our view of samples left for the histogram */ + sample_cnt -= track[i].count; + ndistinct--; + + /* Recalculate histogram size due to lower ndistinct */ + num_hist = ndistinct; + if (num_hist > num_bins) + num_hist = num_bins + 1; } } @@ -2554,9 +2591,6 @@ compute_scalar_stats(VacAttrStatsP stats, * values not accounted for in the MCV list. (This ensures the * histogram won't collapse to empty or a singleton.) */ - num_hist = ndistinct - num_mcv; - if (num_hist > num_bins) - num_hist = num_bins + 1; if (num_hist >= 2) { MemoryContext old_context; -- 2.5.0