From e287fa474fea487ace0ee7d476f84b6f787cc2a7 Mon Sep 17 00:00:00 2001 From: Justin Pryzby Date: Tue, 24 Dec 2019 22:02:22 -0600 Subject: [PATCH v2 2/2] Pass ndistinct and minmax to allow good estimates even with timestamps of granularity other than 1sec --- src/backend/optimizer/util/plancat.c | 8 ++- src/backend/utils/adt/selfuncs.c | 36 ++++++++--- src/backend/utils/adt/timestamp.c | 113 ++++++++++++++++++++++------------- src/include/nodes/supportnodes.h | 2 + src/include/optimizer/plancat.h | 2 +- 5 files changed, 110 insertions(+), 51 deletions(-) diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index 2469ca6..aab794c 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -2011,9 +2011,13 @@ get_function_rows(PlannerInfo *root, Oid funcid, Node *node) /* * Return a multiplier [0..1] to help estimate effect on rowcount of GROUP BY * f(x), relative to input x. + * + * minmax is an array of (min,max) values for the variable, which might be + * useful to determine its granularity (like timestamps per second, minute or + * hour). */ double -get_function_groupby(PlannerInfo *root, Oid funcid, Node *node, Node *var) +get_function_groupby(PlannerInfo *root, Oid funcid, Node *node, Node *var, double ndistinct, Datum *minmax) { HeapTuple proctup; Form_pg_proc procform; @@ -2033,6 +2037,8 @@ get_function_groupby(PlannerInfo *root, Oid funcid, Node *node, Node *var) req.funcid = funcid; req.node = node; req.var = var; + req.ndistinct = ndistinct; + req.minmax = minmax; req.factor = 1; /* just for sanity */ sresult = (SupportRequestGroupBy *) diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index eb0b86f..a7f396d 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -116,6 +116,7 @@ #include "miscadmin.h" #include "nodes/makefuncs.h" #include "nodes/nodeFuncs.h" +#include "nodes/supportnodes.h" #include "optimizer/clauses.h" #include "optimizer/cost.h" #include "optimizer/optimizer.h" @@ -3168,16 +3169,37 @@ estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows, Form_pg_statistic stats = (Form_pg_statistic)GETSTRUCT(vardata.statsTuple); FuncExpr *expr = (FuncExpr *) groupexpr; - Var *v = (Var*) var; + Datum minmax[2]; + TypeCacheEntry *tce; + Var *v = (Var*) var; RangeTblEntry *rte = root->simple_rte_array[v->varno]; - char *reln = get_rel_name(rte->relid); - char *coln = get_attname(rte->relid, v->varattno, true); - ret = get_function_groupby(root, expr->funcid, groupexpr, var); - - fprintf(stderr, "HERE %s %d %s.%s ndistinct=%f ret=%f\n", __FUNCTION__, __LINE__, + char *reln = get_rel_name(rte->relid); + char *coln = get_attname(rte->relid, v->varattno, true); + + /* Seems like maybe this should be defined here and pass a single argument to groupby helper? */ + SupportRequestGroupBy req = { + .type = T_SupportRequestGroupBy, + .root = root, + .funcid = expr->funcid, + .node = groupexpr, + .var = var, + .ndistinct = stats->stadistinct >= 0 ? stats->stadistinct : + -stats->stadistinct * vardata.rel->tuples, + .minmax = minmax, + .factor = 1, /* just for sanity */ + }; + + fprintf(stderr, "HERE %s %d %s.%s tuples=%f, stadistinct=%f ndistinct=%f ret=%f\n", __FUNCTION__, __LINE__, reln?reln:"null", coln?coln:"null", - stats->stadistinct, ret); + vardata.rel->tuples, + stats->stadistinct, req.ndistinct, ret); + + tce = lookup_type_cache(v->vartype, TYPECACHE_LT_OPR); + if (get_variable_range(root, &vardata, tce->lt_opr, minmax, minmax+1)) + ret = get_function_groupby(root, expr->funcid, groupexpr, var, req.ndistinct, minmax); + else + ret = get_function_groupby(root, expr->funcid, groupexpr, var, req.ndistinct, NULL); Assert(ret>=0); Assert(ret<=1); diff --git a/src/backend/utils/adt/timestamp.c b/src/backend/utils/adt/timestamp.c index 359ad32..0314ef9 100644 --- a/src/backend/utils/adt/timestamp.c +++ b/src/backend/utils/adt/timestamp.c @@ -5563,13 +5563,44 @@ generate_series_timestamptz(PG_FUNCTION_ARGS) Datum date_trunc_support(PG_FUNCTION_ARGS) { - Node *rawreq; + Node *rawreq; SupportRequestGroupBy *req; - List *args; - Node *arg1, *arg2; - Node *var; + List *args; + Node *arg1, *arg2; + Node *var; char *start; - int typmod; + int typmod, i; + long unsigned int diff; + + const struct { + const char *name; /* Granularity name */ + const int factor; /* Multiplier */ + const double minsecsec; /* Minimum number of distinct second values per second in range of the histogram */ + } grans[] = { + /* XXX: these factors not applied unless...typmod>1 ? */ + { "microseconds", 1 }, + { "milliseconds", 1 }, + + /* + * These factors not applied unless minmax indicates the incoming + * timestamp is at fine enough granularity that truncating to courser + * granularity would affect result. + */ + { "second", 1, 1.0 }, + { "minute", 60, 1.0/60 }, + { "hour", 60, 1.0/60/60 }, + { "day", 24, 1.0/60/60/24 }, // XXX: should not handle for DATEOID + + /* These factors applied up to the specified granularity */ + { "week", 7 }, + { "month", 30 }, + { "quarter", 3 }, + { "year", 4 }, + { "decade", 10 }, + { "century", 10 }, + { "millennium", 10 }, + + }; rawreq = (Node *) PG_GETARG_POINTER(0); if (!IsA(rawreq, SupportRequestGroupBy)) @@ -5584,13 +5615,45 @@ date_trunc_support(PG_FUNCTION_ARGS) arg2 = lsecond(args); /* arg3 may be the timezone */ - var = req->var; + // XXX: handle if these are null ? + diff = req->minmax ? timestamptz_to_time_t(DatumGetTimestamp(req->minmax[1])) - + timestamptz_to_time_t(DatumGetTimestamp(req->minmax[0])) + : 0; - /* XXX Assumes the input has 1-second granularity */ + fprintf(stderr, "got distinct %f diff=%ld\n", req->ndistinct, diff); // XXX: only work on const? start = TextDatumGetCString(((Const*)arg1)->constvalue); + for (i=0; ; ++i) { + if (i >= sizeof(grans)/sizeof(*grans)) + /* Unhandled truncation granularity */ + PG_RETURN_POINTER(NULL); + + fprintf(stderr, "applying factor %s %d: cur=%f %f -gt %f\n", + grans[i].name, grans[i].factor, req->factor, + req->ndistinct/diff, grans[i].minsecsec ); + + if (req->ndistinct / diff >= grans[i].minsecsec) { + if (req->ndistinct / diff > grans[i-1].minsecsec) + /* Apply the factor in full strength */ + req->factor /= grans[i].factor; + else { + /* interpolate: if at (say) 15 minute granularity, then apply a 4x hourly correction, not 60x */ + // req->factor /= grans[i].factor / (req->ndistinct / diff / grans[i].minsecsec); + req->factor /= req->ndistinct / diff / grans[i].minsecsec; // XXX: is this right + fprintf(stderr, "applying partial factor %f\n", req->ndistinct / diff / grans[i].minsecsec); + } + } + + if (strcmp(start, grans[i].name) == 0) + break; + } + + PG_RETURN_POINTER(req); + +#if 0 + // var = req->var; // XXX: not working due to promotion ? // exprType(var) is still not right, since date_trunc(a, b::date) uses b's type and not date.. // ...but date_trunc('', x::date) is weird @@ -5604,50 +5667,16 @@ date_trunc_support(PG_FUNCTION_ARGS) /* If the input has decimal digits, the grouping effect is stronger */ if (typmod != -1) { req->factor /= 2<factor /= 60; - } else if (strcmp(start, "hour")==0) { - req->factor /= 60*60; - } } // else { elog(ERROR, "unknown type %u", exprType(var)); } - if (strcmp(start, "day")==0) { - req->factor /= 60*60*24; - } else if (strcmp(start, "week")==0) { - req->factor /= 60*60*24*7; - } else if (strcmp(start, "month")==0) { - /* 30 days */ - req->factor /= 60*60*24*30; - } else if (strcmp(start, "quarter")==0) { - req->factor /= 60*60*24*30*3; - } else if (strcmp(start, "year")==0) { - req->factor /= 60*60*24*365.24; - } else if (strcmp(start, "decade")==0) { - req->factor /= 60*60*24*365.25*10; - } else if (strcmp(start, "century")==0) { - req->factor /= 60*60*24*365.25*100; - } else if (strcmp(start, "millennium")==0) { - req->factor /= 60*60*24*365.25*1000; - } else if (req->factor > 1) { /* Maybe a DATE with finer graularity trunc */ req->factor = 1; } - // else { elog(ERROR, "", ); } - /* Fudge Factor, since the input may be already "grouped", say at multiples of 15min, */ - /* or otherwise have course granularity to begin with */ - // factor/=4; +#endif - PG_RETURN_POINTER(req); } diff --git a/src/include/nodes/supportnodes.h b/src/include/nodes/supportnodes.h index cb4ea44..f5f33bf 100644 --- a/src/include/nodes/supportnodes.h +++ b/src/include/nodes/supportnodes.h @@ -178,6 +178,8 @@ typedef struct SupportRequestGroupBy Oid funcid; /* function we are inquiring about */ Node *var; /* original (2nd) argument */ Node *node; /* parse node invoking function */ + double ndistinct; /* Initial estimate of ndistinct of the variable */ + Datum *minmax; /* Array of (min,max) values for variable */ /* Output fields: */ double factor; /* [0..1] fraction of rows in GROUP BY f(x) diff --git a/src/include/optimizer/plancat.h b/src/include/optimizer/plancat.h index cbe5386..c13d40d 100644 --- a/src/include/optimizer/plancat.h +++ b/src/include/optimizer/plancat.h @@ -70,7 +70,7 @@ extern void add_function_cost(PlannerInfo *root, Oid funcid, Node *node, extern double get_function_rows(PlannerInfo *root, Oid funcid, Node *node); -extern double get_function_groupby(PlannerInfo *root, Oid funcid, Node *node, Node *var); +extern double get_function_groupby(PlannerInfo *root, Oid funcid, Node *node, Node *var, double ndistinct, Datum *minmax); extern bool has_row_triggers(PlannerInfo *root, Index rti, CmdType event); -- 2.7.4