From 47bb4944fec83c4b47a3dfcac9851cf0450e179f Mon Sep 17 00:00:00 2001 From: mahendra Date: Thu, 7 Apr 2022 15:05:57 +0300 Subject: [PATCH 6/6] Add jsonb statistics --- src/backend/catalog/system_functions.sql | 36 + src/backend/catalog/system_views.sql | 56 + src/backend/utils/adt/Makefile | 2 + src/backend/utils/adt/jsonb_selfuncs.c | 1582 ++++++++++++++++++++++++++++ src/backend/utils/adt/jsonb_typanalyze.c | 1627 +++++++++++++++++++++++++++++ src/backend/utils/adt/jsonpath_exec.c | 2 +- src/include/catalog/pg_operator.dat | 17 +- src/include/catalog/pg_proc.dat | 11 + src/include/catalog/pg_statistic.h | 2 + src/include/catalog/pg_type.dat | 2 +- src/include/utils/json_selfuncs.h | 113 ++ src/test/regress/expected/jsonb_stats.out | 713 +++++++++++++ src/test/regress/expected/rules.out | 32 + src/test/regress/parallel_schedule | 2 +- src/test/regress/sql/jsonb_stats.sql | 249 +++++ 15 files changed, 4435 insertions(+), 11 deletions(-) create mode 100644 src/backend/utils/adt/jsonb_selfuncs.c create mode 100644 src/backend/utils/adt/jsonb_typanalyze.c create mode 100644 src/include/utils/json_selfuncs.h create mode 100644 src/test/regress/expected/jsonb_stats.out create mode 100644 src/test/regress/sql/jsonb_stats.sql diff --git a/src/backend/catalog/system_functions.sql b/src/backend/catalog/system_functions.sql index 81bac6f..0b9f68e 100644 --- a/src/backend/catalog/system_functions.sql +++ b/src/backend/catalog/system_functions.sql @@ -594,6 +594,42 @@ LANGUAGE internal STRICT IMMUTABLE PARALLEL SAFE AS 'unicode_is_normalized'; +-- XXX is this function immutable / parallel safe? +-- XXX do we actually need to cast to text and then to jsonb? 
+CREATE FUNCTION pg_json_path_stats(tab regclass, path_index integer) RETURNS text +AS $$ + SELECT jsonb_pretty(( + CASE + WHEN stakind1 = 8 THEN stavalues1 + WHEN stakind2 = 8 THEN stavalues2 + WHEN stakind3 = 8 THEN stavalues3 + WHEN stakind4 = 8 THEN stavalues4 + WHEN stakind5 = 8 THEN stavalues5 + END::text::jsonb[])[$2]) + FROM pg_statistic + WHERE starelid = $1 +$$ LANGUAGE 'sql'; + +-- XXX is this function immutable / parallel safe? +-- XXX do we actually need to cast to text and then to jsonb? +CREATE FUNCTION pg_json_path_stats(tab regclass, path text) RETURNS text +AS $$ + SELECT jsonb_pretty(pathstats) + FROM ( + SELECT unnest( + CASE + WHEN stakind1 = 8 THEN stavalues1 + WHEN stakind2 = 8 THEN stavalues2 + WHEN stakind3 = 8 THEN stavalues3 + WHEN stakind4 = 8 THEN stavalues4 + WHEN stakind5 = 8 THEN stavalues5 + END::text::jsonb[]) pathstats + FROM pg_statistic + WHERE starelid = $1 + ) paths + WHERE pathstats->>'path' = $2 +$$ LANGUAGE 'sql'; + -- -- The default permissions for functions mean that anyone can execute them. -- A number of functions shouldn't be executable by just anyone, but rather diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index 9eaa51d..ee35239 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -364,6 +364,62 @@ CREATE VIEW pg_stats_ext_exprs WITH (security_barrier) AS -- unprivileged users may read pg_statistic_ext but not pg_statistic_ext_data REVOKE ALL ON pg_statistic_ext_data FROM public; +-- XXX This probably needs to do the same checks as pg_stats, i.e. 
+-- WHERE NOT attisdropped +-- AND has_column_privilege(c.oid, a.attnum, 'select') +-- AND (c.relrowsecurity = false OR NOT row_security_active(c.oid)); +CREATE VIEW pg_stats_json AS + SELECT + nspname AS schemaname, + relname AS tablename, + attname AS attname, + + path->>'path' AS json_path, + + stainherit AS inherited, + + (path->'json'->>'nullfrac')::float4 AS null_frac, + (path->'json'->>'width')::float4 AS avg_width, + (path->'json'->>'distinct')::float4 AS n_distinct, + + ARRAY(SELECT val FROM jsonb_array_elements( + path->'json'->'mcv'->'values') val)::anyarray + AS most_common_vals, + + ARRAY(SELECT num::text::float4 FROM jsonb_array_elements( + path->'json'->'mcv'->'numbers') num) + AS most_common_freqs, + + ARRAY(SELECT val FROM jsonb_array_elements( + path->'json'->'histogram'->'values') val) + AS histogram_bounds, + + ARRAY(SELECT val::text::int FROM jsonb_array_elements( + path->'array_length'->'mcv'->'values') val) + AS most_common_array_lengths, + + ARRAY(SELECT num::text::float4 FROM jsonb_array_elements( + path->'array_length'->'mcv'->'numbers') num) + AS most_common_array_length_freqs, + + (path->'json'->>'correlation')::float4 AS correlation + + FROM + pg_statistic s JOIN pg_class c ON (c.oid = s.starelid) + JOIN pg_attribute a ON (c.oid = attrelid AND attnum = s.staattnum) + LEFT JOIN pg_namespace n ON (n.oid = c.relnamespace), + LATERAL ( + SELECT unnest((CASE + WHEN stakind1 = 8 THEN stavalues1 + WHEN stakind2 = 8 THEN stavalues2 + WHEN stakind3 = 8 THEN stavalues3 + WHEN stakind4 = 8 THEN stavalues4 + WHEN stakind5 = 8 THEN stavalues5 + END ::text::jsonb[])[2:]) AS path + ) paths; + +-- no need to revoke any privileges, we've already revoked accss to pg_statistic + CREATE VIEW pg_publication_tables AS SELECT P.pubname AS pubname, diff --git a/src/backend/utils/adt/Makefile b/src/backend/utils/adt/Makefile index 7c722ea..072a529 100644 --- a/src/backend/utils/adt/Makefile +++ b/src/backend/utils/adt/Makefile @@ -51,6 +51,8 @@ OBJS = \ jsonb.o 
\ jsonb_gin.o \ jsonb_op.o \ + jsonb_selfuncs.o \ + jsonb_typanalyze.o \ jsonb_util.o \ jsonfuncs.o \ jsonbsubs.o \ diff --git a/src/backend/utils/adt/jsonb_selfuncs.c b/src/backend/utils/adt/jsonb_selfuncs.c new file mode 100644 index 0000000..f5520f8 --- /dev/null +++ b/src/backend/utils/adt/jsonb_selfuncs.c @@ -0,0 +1,1582 @@ +/*------------------------------------------------------------------------- + * + * jsonb_selfuncs.c + * Functions for selectivity estimation of jsonb operators + * + * Copyright (c) 2016-2022, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/utils/adt/jsonb_selfuncs.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include + +#include "fmgr.h" +#include "access/heapam.h" +#include "access/htup_details.h" +#include "catalog/pg_operator.h" +#include "catalog/pg_statistic.h" +#include "catalog/pg_type.h" +#include "common/string.h" +#include "nodes/primnodes.h" +#include "utils/builtins.h" +#include "utils/json.h" +#include "utils/jsonb.h" +#include "utils/json_selfuncs.h" +#include "utils/lsyscache.h" +#include "utils/rel.h" +#include "utils/selfuncs.h" + +#define DEFAULT_JSON_CONTAINS_SEL 0.001 + +/* + * jsonGetField + * Given a JSONB document and a key, extract the JSONB value for the key. + */ +static inline Datum +jsonGetField(Datum obj, const char *field) +{ + Jsonb *jb = DatumGetJsonbP(obj); + JsonbValue *jbv = findJsonbValueFromContainerLen(&jb->root, JB_FOBJECT, + field, strlen(field)); + return jbv ? JsonbPGetDatum(JsonbValueToJsonb(jbv)) : PointerGetDatum(NULL); +} + +/* + * jsonGetFloat4 + * Given a JSONB value, interpret it as a float4 value. + * + * This expects the JSONB value to be a numeric, because that's how we store + * floats in JSONB, and we cast it to float4. 
+ */ +static inline float4 +jsonGetFloat4(Datum jsonb, float4 default_val) +{ + Jsonb *jb; + JsonbValue jv; + + if (!DatumGetPointer(jsonb)) + return default_val; + + jb = DatumGetJsonbP(jsonb); + + if (!JsonbExtractScalar(&jb->root, &jv) || jv.type != jbvNumeric) + return default_val; + + return DatumGetFloat4(DirectFunctionCall1(numeric_float4, + NumericGetDatum(jv.val.numeric))); +} + +/* + * jsonStatsInit + * Given a pg_statistic tuple, expand STATISTIC_KIND_JSON into JsonStats. + */ +bool +jsonStatsInit(JsonStats data, const VariableStatData *vardata) +{ + Jsonb *jb; + JsonbValue prefix; + + if (!vardata->statsTuple) + return false; + + data->statsTuple = vardata->statsTuple; + memset(&data->attslot, 0, sizeof(data->attslot)); + + /* Were there just NULL values in the column? No JSON stats, but still useful. */ + if (((Form_pg_statistic) GETSTRUCT(data->statsTuple))->stanullfrac >= 1.0) + { + data->nullfrac = 1.0; + return true; + } + + /* Do we have the JSON stats built in the pg_statistic? */ + if (!get_attstatsslot(&data->attslot, data->statsTuple, + STATISTIC_KIND_JSON, InvalidOid, + ATTSTATSSLOT_NUMBERS | ATTSTATSSLOT_VALUES)) + return false; + + /* + * Valid JSON stats should have at least 2 elements in values: + * 0th - root path prefix + * 1st - root path stats + */ + if (data->attslot.nvalues < 2) + { + free_attstatsslot(&data->attslot); + return false; + } + + /* XXX If the ACL check was not OK, would we even get here? */ + data->acl_ok = vardata->acl_ok; + data->rel = vardata->rel; + data->nullfrac = + data->attslot.nnumbers > 0 ? 
data->attslot.numbers[0] : 0.0; + data->pathdatums = data->attslot.values + 1; + data->npaths = data->attslot.nvalues - 1; + + /* Extract root path prefix */ + jb = DatumGetJsonbP(data->attslot.values[0]); + if (!JsonbExtractScalar(&jb->root, &prefix) || prefix.type != jbvString) + { + free_attstatsslot(&data->attslot); + return false; + } + + data->prefix = prefix.val.string.val; + data->prefixlen = prefix.val.string.len; + + /* Create path cache, initialze only two fields that acting as flags */ + data->paths = palloc(sizeof(*data->paths) * data->npaths); + + for (int i = 0; i < data->npaths; i++) + { + data->paths[i].data = NULL; + data->paths[i].path = NULL; + } + + return true; +} + +/* + * jsonStatsRelease + * Release resources (statistics slot) associated with the JsonStats value. + */ +void +jsonStatsRelease(JsonStats data) +{ + free_attstatsslot(&data->attslot); +} + +/* + * jsonPathStatsAllocSpecialStats + * Allocate a copy of JsonPathStats for accessing special (length etc.) + * stats for a given JSON path. + */ +static JsonPathStats +jsonPathStatsAllocSpecialStats(JsonPathStats pstats, JsonPathStatsType type) +{ + JsonPathStats stats; + + if (!pstats) + return NULL; + + /* copy and replace stats type */ + stats = palloc(sizeof(*stats)); + *stats = *pstats; + stats->type = type; + + return stats; +} + +/* + * jsonPathStatsGetArrayLengthStats + * Extract statistics of array lengths for the path. + */ +JsonPathStats +jsonPathStatsGetArrayLengthStats(JsonPathStats pstats) +{ + /* + * The array length statistics is relevant only for values that are arrays. + * So if we observed no such values, we know there can't be such + * statistics and so we simply return NULL. + */ + if (jsonPathStatsGetTypeFreq(pstats, jbvArray, 0.0) <= 0.0) + return NULL; + + return jsonPathStatsAllocSpecialStats(pstats, JsonPathStatsArrayLength); +} + +/* + * jsonPathStatsGetObjectLengthStats + * Extract statistics of object length for the path. 
+ */ +JsonPathStats +jsonPathStatsGetObjectLengthStats(JsonPathStats pstats) +{ + /* + * The object length statistics is relevant only for values that are arrays. + * So if we observed no such values, we know there can't be such + * statistics and so we simply return NULL. + */ + if (jsonPathStatsGetTypeFreq(pstats, jbvObject, 0.0) <= 0.0) + return NULL; + + return jsonPathStatsAllocSpecialStats(pstats, JsonPathStatsObjectLength); +} + +/* + * jsonPathStatsGetPath + * Try to use cached path name or extract it from per-path stats datum. + * + * Returns true on succces, false on error. + */ +static inline bool +jsonPathStatsGetPath(JsonPathStats stats, Datum pathdatum, + const char **path, int *pathlen) +{ + *path = stats->path; + + if (*path) + /* use cached path name */ + *pathlen = stats->pathlen; + else + { + Jsonb *jsonb = DatumGetJsonbP(pathdatum); + JsonbValue pathkey; + JsonbValue *pathval; + + /* extract path from the statistics represented as jsonb document */ + JsonValueInitStringWithLen(&pathkey, "path", 4); + pathval = findJsonbValueFromContainer(&jsonb->root, JB_FOBJECT, &pathkey); + + if (!pathval || pathval->type != jbvString) + return false; /* XXX invalid stats data, maybe throw error */ + + /* cache extracted path name */ + *path = stats->path = pathval->val.string.val; + *pathlen = stats->pathlen = pathval->val.string.len; + } + + return true; +} + +/* Context for bsearch()ing paths */ +typedef struct JsonPathStatsSearchContext +{ + JsonStats stats; + const char *path; + int pathlen; +} JsonPathStatsSearchContext; + +/* + * jsonPathStatsCompare + * Compare two JsonPathStats structs, so that we can sort them. + * + * We do this so that we can search for stats for a given path simply by + * bsearch(). + * + * XXX We never build two structs for the same path, so we know the paths + * are different - one may be a prefix of the other, but then we sort the + * strings by length. 
+ */ +static int +jsonPathStatsCompare(const void *pv1, const void *pv2) +{ + JsonPathStatsSearchContext const *cxt = pv1; + Datum const *pathdatum = (Datum const *) pv2; + int index = pathdatum - cxt->stats->pathdatums; + JsonPathStats stats = &cxt->stats->paths[index]; + const char *path; + int pathlen; + int res; + + if (!jsonPathStatsGetPath(stats, *pathdatum, &path, &pathlen)) + return 1; /* XXX invalid stats data */ + + /* compare the shared part first, then compare by length */ + res = strncmp(cxt->path, path, Min(cxt->pathlen, pathlen)); + + return res ? res : cxt->pathlen - pathlen; +} + +/* + * jsonStatsFindPath + * Find stats for a given path. + * + * The stats are sorted by path, so we can simply do bsearch(). + * This is low-level function and jsdata->prefix is not considered, the caller + * should handle it by itself. + */ +static JsonPathStats +jsonStatsFindPath(JsonStats jsdata, const char *path, int pathlen) +{ + JsonPathStatsSearchContext cxt; + JsonPathStats stats; + Datum *pdatum; + int index; + + cxt.stats = jsdata; + cxt.path = path; + cxt.pathlen = pathlen; + + pdatum = bsearch(&cxt, jsdata->pathdatums, jsdata->npaths, + sizeof(*jsdata->pathdatums), jsonPathStatsCompare); + + if (!pdatum) + return NULL; + + index = pdatum - jsdata->pathdatums; + stats = &jsdata->paths[index]; + + Assert(stats->path); + Assert(stats->pathlen == pathlen); + + /* Init all fields if needed (stats->data == NULL means uninitialized) */ + if (!stats->data) + { + stats->data = jsdata; + stats->datum = pdatum; + stats->type = JsonPathStatsValues; + } + + return stats; +} + +/* + * jsonStatsGetPathByStr + * Find stats for a given path string considering jsdata->prefix. 
+ */ +JsonPathStats +jsonStatsGetPathByStr(JsonStats jsdata, const char *subpath, int subpathlen) +{ + JsonPathStats stats; + char *path; + int pathlen; + + if (jsdata->nullfrac >= 1.0) + return NULL; + + pathlen = jsdata->prefixlen + subpathlen - 1; + path = palloc(pathlen); + + memcpy(path, jsdata->prefix, jsdata->prefixlen); + memcpy(&path[jsdata->prefixlen], &subpath[1], subpathlen - 1); + + stats = jsonStatsFindPath(jsdata, path, pathlen); + + if (!stats) + pfree(path); + + return stats; +} + +/* + * jsonStatsGetRootPath + * Find JSON stats for root prefix path. + */ +static JsonPathStats +jsonStatsGetRootPath(JsonStats jsdata) +{ + if (jsdata->nullfrac >= 1.0) + return NULL; + + return jsonStatsFindPath(jsdata, jsdata->prefix, jsdata->prefixlen); +} + +#define jsonStatsGetRootArrayPath(jsdata) \ + jsonStatsGetPathByStr(jsdata, JSON_PATH_ROOT_ARRAY, JSON_PATH_ROOT_ARRAY_LEN) + +/* + * jsonPathAppendEntry + * Append entry (represented as simple string) to a path. + * + * NULL entry is treated as wildcard array accessor "[*]". + */ +void +jsonPathAppendEntry(StringInfo path, const char *entry) +{ + if (entry) + { + appendStringInfoCharMacro(path, '.'); + escape_json(path, entry); + } + else + appendStringInfoString(path, "[*]"); +} + +/* + * jsonPathAppendEntryWithLen + * Append string (represented as string + length) to a path. + */ +static void +jsonPathAppendEntryWithLen(StringInfo path, const char *entry, int len) +{ + char *tmpentry = pnstrdup(entry, len); + jsonPathAppendEntry(path, tmpentry); + pfree(tmpentry); +} + +/* + * jsonPathStatsGetSubpath + * Find JSON path stats for object key or array elements (if 'key' = NULL). 
+ */ +JsonPathStats +jsonPathStatsGetSubpath(JsonPathStats pstats, const char *key) +{ + JsonPathStats spstats; + StringInfoData str; + + initStringInfo(&str); + appendBinaryStringInfo(&str, pstats->path, pstats->pathlen); + jsonPathAppendEntry(&str, key); + + spstats = jsonStatsFindPath(pstats->data, str.data, str.len); + if (!spstats) + pfree(str.data); + + return spstats; +} + +/* + * jsonPathStatsGetArrayIndexSelectivity + * Given stats for a path, determine selectivity for an array index. + */ +Selectivity +jsonPathStatsGetArrayIndexSelectivity(JsonPathStats pstats, int index) +{ + JsonPathStats lenstats = jsonPathStatsGetArrayLengthStats(pstats); + JsonbValue tmpjbv; + Jsonb *jb; + + /* + * If we have no array length stats, assume all documents match. + * + * XXX Shouldn't this use a default smaller than 1.0? What do the selfuncs + * for regular arrays use? + */ + if (!lenstats) + return 1.0; + + jb = JsonbValueToJsonb(JsonValueInitInteger(&tmpjbv, index)); + + /* calculate fraction of elements smaller than the index */ + return jsonSelectivity(lenstats, JsonbPGetDatum(jb), JsonbGtOperator); +} + +/* + * jsonStatsGetPath + * Find JSON statistics for a given path. + * + * 'path' is an array of text datums of length 'pathlen' (can be zero). 
+ */ +static JsonPathStats +jsonStatsGetPath(JsonStats jsdata, Datum *path, int pathlen, + bool try_arrays_indexes, float4 *nullfrac) +{ + JsonPathStats pstats = jsonStatsGetRootPath(jsdata); + Selectivity sel = 1.0; + + for (int i = 0; pstats && i < pathlen; i++) + { + char *key = TextDatumGetCString(path[i]); + char *tail; + int index; + + if (!try_arrays_indexes) + { + /* Find object key stats */ + pstats = jsonPathStatsGetSubpath(pstats, key); + pfree(key); + continue; + } + + /* Try to interpret path entry as integer array index */ + errno = 0; + index = strtoint(key, &tail, 10); + + if (tail == key || *tail != '\0' || errno != 0) + { + /* Find object key stats */ + pstats = jsonPathStatsGetSubpath(pstats, key); + } + else + { + /* Find array index stats */ + /* FIXME consider object key "index" also */ + JsonPathStats arrstats = jsonPathStatsGetSubpath(pstats, NULL); + + if (arrstats) + { + float4 arrfreq = jsonPathStatsGetFreq(pstats, 0.0); + + sel *= jsonPathStatsGetArrayIndexSelectivity(pstats, index); + + if (arrfreq > 0.0) + sel /= arrfreq; + } + + pstats = arrstats; + } + + pfree(key); + } + + *nullfrac = 1.0 - sel; + + return pstats; +} + +/* + * jsonPathStatsGetNextSubpathStats + * Iterate all collected subpaths of a given path. + * + * This function can be useful for estimation of selectivity of jsonpath + * '.*' and '.**' operators. + * + * The next found subpath is written into *pkeystats, which should be set to + * NULL before the first call. + * + * If keysOnly is true, emit only top-level object-key subpaths. + * + * Returns false on the end of iteration and true otherwise. + */ +bool +jsonPathStatsGetNextSubpathStats(JsonPathStats stats, JsonPathStats *pkeystats, + bool keysOnly) +{ + JsonPathStats keystats = *pkeystats; + /* compute next index */ + int index = + (keystats ? 
keystats->datum : stats->datum) - stats->data->pathdatums + 1; + + if (stats->type != JsonPathStatsValues) + return false; /* length stats doe not have subpaths */ + + for (; index < stats->data->npaths; index++) + { + Datum *pathdatum = &stats->data->pathdatums[index]; + const char *path; + int pathlen; + + keystats = &stats->data->paths[index]; + + if (!jsonPathStatsGetPath(keystats, *pathdatum, &path, &pathlen)) + break; /* invalid path stats */ + + /* Break, if subpath does not start from a desired prefix */ + if (pathlen <= stats->pathlen || + memcmp(path, stats->path, stats->pathlen)) + break; + + if (keysOnly) + { + const char *c = &path[stats->pathlen]; + + if (*c == '[') + { + Assert(c[1] == '*' && c[2] == ']'); + +#if 0 /* TODO add separate flag for requesting top-level array accessors */ + /* skip if it is not last key in the path */ + if (pathlen > stats->pathlen + 3) +#endif + continue; /* skip array accessors */ + } + else if (*c == '.') + { + /* find end of '."key"' */ + const char *pathend = path + pathlen - 1; + + if (++c >= pathend || *c != '"') + break; /* invalid path */ + + while (++c <= pathend && *c != '"') + if (*c == '\\') /* handle escaped chars */ + c++; + + if (c > pathend) + break; /* invalid path */ + + /* skip if it is not last key in the path */ + if (c < pathend) + continue; + } + else + continue; /* invalid path */ + } + + /* Init path stats if needed */ + if (!keystats->data) + { + keystats->data = stats->data; + keystats->datum = pathdatum; + keystats->type = JsonPathStatsValues; + } + + *pkeystats = keystats; + + return true; + } + + return false; +} + +/* + * jsonStatsConvertArray + * Convert a JSONB array into an array of some regular data type. + * + * The "type" identifies what elements are in the input JSONB array, while + * typid determines the target type. 
+ */ +static Datum +jsonStatsConvertArray(Datum jsonbValueArray, JsonStatType type, Oid typid, + float4 multiplier) +{ + Datum *values; + Jsonb *jbvals; + JsonbValue jbv; + JsonbIterator *it; + JsonbIteratorToken r; + int nvalues; + int i; + int16 typlen; + bool typbyval; + char typalign; + + if (!DatumGetPointer(jsonbValueArray)) + return PointerGetDatum(NULL); + + jbvals = DatumGetJsonbP(jsonbValueArray); + + nvalues = JsonContainerSize(&jbvals->root); + + values = palloc(sizeof(Datum) * nvalues); + + for (i = 0, it = JsonbIteratorInit(&jbvals->root); + (r = JsonbIteratorNext(&it, &jbv, true)) != WJB_DONE;) + { + if (r == WJB_ELEM) + { + Datum value; + + switch (type) + { + case JsonStatJsonb: + case JsonStatJsonbWithoutSubpaths: + value = JsonbPGetDatum(JsonbValueToJsonb(&jbv)); + break; + + case JsonStatText: + case JsonStatString: + Assert(jbv.type == jbvString); + value = PointerGetDatum( + cstring_to_text_with_len(jbv.val.string.val, + jbv.val.string.len)); + break; + + case JsonStatNumeric: + Assert(jbv.type == jbvNumeric); + value = NumericGetDatum(jbv.val.numeric); + break; + + case JsonStatFloat4: + Assert(jbv.type == jbvNumeric); + value = DirectFunctionCall1(numeric_float4, + NumericGetDatum(jbv.val.numeric)); + value = Float4GetDatum(DatumGetFloat4(value) * multiplier); + break; + + default: + elog(ERROR, "invalid json stat type %d", type); + value = (Datum) 0; + break; + } + + Assert(i < nvalues); + values[i++] = value; + } + } + + Assert(i == nvalues); + + get_typlenbyvalalign(typid, &typlen, &typbyval, &typalign); + + return PointerGetDatum( + construct_array(values, nvalues, typid, typlen, typbyval, typalign)); +} + +/* + * jsonPathStatsExtractData + * Extract pg_statistics values from statistics for a single path. + * + * Extract ordinary MCV, Histogram, Correlation slots for a requested stats + * type. If requested stats for JSONB, include also transformed JSON slot for + * a path and possibly for its subpaths. 
+ */ +static bool +jsonPathStatsExtractData(JsonPathStats pstats, JsonStatType stattype, + float4 nullfrac, StatsData *statdata) +{ + Datum data; + Datum nullf; + Datum dist; + Datum width; + Datum mcv; + Datum hst; + Datum corr; + Oid type; + Oid eqop; + Oid ltop; + const char *key; + StatsSlot *slot = statdata->slots; + + nullfrac = 1.0 - (1.0 - pstats->data->nullfrac) * (1.0 - nullfrac); + + /* + * Depending on requested statistics type, select: + * - stavalues data type + * - corresponding eq/lt operators + * - JSONB field, containing stats slots for this statistics type + */ + switch (stattype) + { + case JsonStatJsonb: + case JsonStatJsonbWithoutSubpaths: + key = pstats->type == JsonPathStatsArrayLength ? "array_length" : + pstats->type == JsonPathStatsObjectLength ? "object_length" : + "json"; + type = JSONBOID; + eqop = JsonbEqOperator; + ltop = JsonbLtOperator; + break; + case JsonStatText: + key = "text"; + type = TEXTOID; + eqop = TextEqualOperator; + ltop = TextLessOperator; + break; + case JsonStatString: + key = "string"; + type = TEXTOID; + eqop = TextEqualOperator; + ltop = TextLessOperator; + break; + case JsonStatNumeric: + key = "numeric"; + type = NUMERICOID; + eqop = NumericEqOperator; + ltop = NumericLtOperator; + break; + case JsonStatFloat4: /* special internal stats type */ + default: + elog(ERROR, "invalid json statistic type %d", stattype); + break; + } + + /* Extract object containing slots */ + data = jsonGetField(*pstats->datum, key); + + if (!DatumGetPointer(data)) + return false; + + nullf = jsonGetField(data, "nullfrac"); + dist = jsonGetField(data, "distinct"); + width = jsonGetField(data, "width"); + mcv = jsonGetField(data, "mcv"); + hst = jsonGetField(data, "histogram"); + corr = jsonGetField(data, "correlation"); + + statdata->nullfrac = jsonGetFloat4(nullf, 0); + statdata->distinct = jsonGetFloat4(dist, 0); + statdata->width = (int32) jsonGetFloat4(width, 0); + + statdata->nullfrac += (1.0 - statdata->nullfrac) * nullfrac; + + 
/* Include MCV slot if exists */ + if (DatumGetPointer(mcv)) + { + slot->kind = STATISTIC_KIND_MCV; + slot->opid = eqop; + slot->numbers = jsonStatsConvertArray(jsonGetField(mcv, "numbers"), + JsonStatFloat4, FLOAT4OID, + 1.0 - nullfrac); + slot->values = jsonStatsConvertArray(jsonGetField(mcv, "values"), + stattype, type, 0); + slot++; + } + + /* Include Histogram slot if exists */ + if (DatumGetPointer(hst)) + { + slot->kind = STATISTIC_KIND_HISTOGRAM; + slot->opid = ltop; + slot->numbers = jsonStatsConvertArray(jsonGetField(hst, "numbers"), + JsonStatFloat4, FLOAT4OID, 1.0); + slot->values = jsonStatsConvertArray(jsonGetField(hst, "values"), + stattype, type, 0); + slot++; + } + + /* Include Correlation slot if exists */ + if (DatumGetPointer(corr)) + { + Datum correlation = Float4GetDatum(jsonGetFloat4(corr, 0)); + + slot->kind = STATISTIC_KIND_CORRELATION; + slot->opid = ltop; + slot->numbers = PointerGetDatum(construct_array(&correlation, 1, + FLOAT4OID, 4, true, + 'i')); + slot++; + } + + /* Include JSON statistics for a given path and possibly for its subpaths */ + if ((stattype == JsonStatJsonb || + stattype == JsonStatJsonbWithoutSubpaths) && + jsonAnalyzeBuildSubPathsData(pstats->data->pathdatums, + pstats->data->npaths, + pstats->datum - pstats->data->pathdatums, + pstats->path, + pstats->pathlen, + stattype == JsonStatJsonb, + nullfrac, + &slot->values, + &slot->numbers)) + { + slot->kind = STATISTIC_KIND_JSON; + slot++; + } + + return true; +} + +static float4 +jsonPathStatsGetFloat(JsonPathStats pstats, const char *key, float4 defaultval) +{ + if (!pstats) + return defaultval; + + return jsonGetFloat4(jsonGetField(*pstats->datum, key), defaultval); +} + +float4 +jsonPathStatsGetFreq(JsonPathStats pstats, float4 defaultfreq) +{ + return jsonPathStatsGetFloat(pstats, "freq", defaultfreq); +} + +float4 +jsonPathStatsGetAvgArraySize(JsonPathStats pstats) +{ + return jsonPathStatsGetFloat(pstats, "avg_array_length", 1.0); +} + +/* + * 
jsonPathStatsGetTypeFreq + * Get frequency of different JSON object types for a given path. + * + * JSON documents don't have any particular schema, and the same path may point + * to values with different types in multiple documents. Consider for example + * two documents {"a" : "b"} and {"a" : 100} which have both a string and int + * for the same path. So we track the frequency of different JSON types for + * each path, so that we can consider this later. + */ +float4 +jsonPathStatsGetTypeFreq(JsonPathStats pstats, JsonbValueType type, + float4 defaultfreq) +{ + const char *key; + + if (!pstats) + return defaultfreq; + + /* + * When dealing with (object/array) length stats, we only really care about + * objects and arrays. + * + * Lengths are always numeric, so simply return 0 if requested frequency + * of non-numeric values. + */ + if (pstats->type == JsonPathStatsArrayLength) + { + if (type != jbvNumeric) + return 0.0; + + return jsonPathStatsGetFloat(pstats, "freq_array", defaultfreq); + } + + if (pstats->type == JsonPathStatsObjectLength) + { + if (type != jbvNumeric) + return 0.0; + + return jsonPathStatsGetFloat(pstats, "freq_object", defaultfreq); + } + + /* Which JSON type are we interested in? Pick the right freq_type key. */ + switch (type) + { + case jbvNull: + key = "freq_null"; + break; + case jbvString: + key = "freq_string"; + break; + case jbvNumeric: + key = "freq_numeric"; + break; + case jbvBool: + key = "freq_boolean"; + break; + case jbvObject: + key = "freq_object"; + break; + case jbvArray: + key = "freq_array"; + break; + default: + elog(ERROR, "Invalid jsonb value type: %d", type); + break; + } + + return jsonPathStatsGetFloat(pstats, key, defaultfreq); +} + +/* + * jsonPathStatsFormTuple + * For a pg_statistic tuple representing JSON statistics. + * + * XXX Maybe it's a bit expensive to first build StatsData and then transform it + * again while building the tuple. Could it be done in a single step? Would it be + * more efficient? 
Not sure how expensive it actually is, though. + */ +static HeapTuple +jsonPathStatsFormTuple(JsonPathStats pstats, JsonStatType type, float4 nullfrac) +{ + StatsData statdata; + + if (!pstats || !pstats->datum) + return NULL; + + /* + * If it is the ordinary root path stats, there is no need to transform + * the tuple, it can be simply copied. + */ + if (pstats->datum == &pstats->data->pathdatums[0] && + pstats->type == JsonPathStatsValues) + return heap_copytuple(pstats->data->statsTuple); + + MemSet(&statdata, 0, sizeof(statdata)); + + if (!jsonPathStatsExtractData(pstats, type, nullfrac, &statdata)) + return NULL; + + return stats_form_tuple(&statdata); +} + +/* + * jsonStatsGetPathTuple + * Extract JSON statistics for a text[] path and form pg_statistics tuple. + */ +static HeapTuple +jsonStatsGetPathTuple(JsonStats jsdata, JsonStatType type, + Datum *path, int pathlen, bool try_arrays_indexes) +{ + float4 nullfrac; + JsonPathStats pstats = jsonStatsGetPath(jsdata, path, pathlen, + try_arrays_indexes, &nullfrac); + + return jsonPathStatsFormTuple(pstats, type, nullfrac); +} + +/* + * jsonStatsGetArrayIndexStatsTuple + * Extract JSON statistics for a array index and form pg_statistics tuple. 
+ */ +static HeapTuple +jsonStatsGetArrayIndexStatsTuple(JsonStats jsdata, JsonStatType type, int32 index) +{ + /* Extract statistics for root array elements */ + JsonPathStats arrstats = jsonStatsGetRootArrayPath(jsdata); + JsonPathStats rootstats; + Selectivity index_sel; + + if (!arrstats) + return NULL; + + /* Compute relative selectivity of 'EXISTS($[index])' */ + rootstats = jsonStatsGetRootPath(jsdata); + index_sel = jsonPathStatsGetArrayIndexSelectivity(rootstats, index); + index_sel /= jsonPathStatsGetFreq(arrstats, 0.0); + + /* Form pg_statistics tuple, taking into account array index selectivity */ + return jsonPathStatsFormTuple(arrstats, type, 1.0 - index_sel); +} + +/* + * jsonStatsGetPathFreq + * Return frequency of a path (fraction of documents containing it). + */ +static float4 +jsonStatsGetPathFreq(JsonStats jsdata, Datum *path, int pathlen, + bool try_array_indexes) +{ + float4 nullfrac; + JsonPathStats pstats = jsonStatsGetPath(jsdata, path, pathlen, + try_array_indexes, &nullfrac); + float4 freq = (1.0 - nullfrac) * jsonPathStatsGetFreq(pstats, 0.0); + + CLAMP_PROBABILITY(freq); + return freq; +} + +/* + * jsonbStatsVarOpConst + * Prepare optimizer statistics for a given operator, from JSON stats. + * + * This handles only OpExpr expressions, with variable and a constant. We get + * the constant as is, and the variable is represented by statistics fetched + * by get_restriction_variable(). + * + * opid - OID of the operator (input parameter) + * resdata - pointer to calculated statistics for result of operator + * vardata - statistics for the restriction variable + * cnst - constant from the operator expression + * + * Returns true when useful optimizer statistics have been calculated. 
+ */ +static bool +jsonbStatsVarOpConst(Oid opid, VariableStatData *resdata, + const VariableStatData *vardata, Const *cnst) +{ + JsonStatData jsdata; + JsonStatType statype = JsonStatJsonb; + + if (!jsonStatsInit(&jsdata, vardata)) + return false; + + switch (opid) + { + case JsonbObjectFieldTextOperator: + statype = JsonStatText; + /* FALLTHROUGH */ + case JsonbObjectFieldOperator: + { + if (cnst->consttype != TEXTOID) + { + jsonStatsRelease(&jsdata); + return false; + } + + resdata->statsTuple = jsonStatsGetPathTuple(&jsdata, statype, + &cnst->constvalue, 1, + false); + break; + } + + case JsonbArrayElementTextOperator: + statype = JsonStatText; + /* FALLTHROUGH */ + case JsonbArrayElementOperator: + { + if (cnst->consttype != INT4OID) + { + jsonStatsRelease(&jsdata); + return false; + } + + resdata->statsTuple = + jsonStatsGetArrayIndexStatsTuple(&jsdata, statype, + DatumGetInt32(cnst->constvalue)); + break; + } + + case JsonbExtractPathTextOperator: + statype = JsonStatText; + /* FALLTHROUGH */ + case JsonbExtractPathOperator: + { + Datum *path; + bool *nulls; + int pathlen; + bool have_nulls = false; + + if (cnst->consttype != TEXTARRAYOID) + { + jsonStatsRelease(&jsdata); + return false; + } + + deconstruct_array(DatumGetArrayTypeP(cnst->constvalue), TEXTOID, + -1, false, 'i', &path, &nulls, &pathlen); + + for (int i = 0; i < pathlen; i++) + { + if (nulls[i]) + { + have_nulls = true; + break; + } + } + + if (!have_nulls) + resdata->statsTuple = jsonStatsGetPathTuple(&jsdata, statype, + path, pathlen, + true); + + pfree(path); + pfree(nulls); + break; + } + + default: + jsonStatsRelease(&jsdata); + return false; + } + + if (!resdata->statsTuple) + resdata->statsTuple = stats_form_tuple(NULL); /* form all-NULL tuple */ + + resdata->acl_ok = vardata->acl_ok; + resdata->freefunc = heap_freetuple; + Assert(resdata->rel == vardata->rel); + Assert(resdata->atttype == + (statype == JsonStatJsonb ? JSONBOID : + statype == JsonStatText ? 
TEXTOID : + /* statype == JsonStatFreq */ BOOLOID)); + + jsonStatsRelease(&jsdata); + return true; +} + +/* + * jsonb_stats + * Statistics estimation procedure for JSONB data type. + * + * This only supports OpExpr expressions, with (Var op Const) shape. + * + * Var really can be a chain of OpExprs with derived statistics + * (jsonb_column -> 'key1' -> key2'), because get_restriction_variable() + * already handles this case. + */ +Datum +jsonb_stats(PG_FUNCTION_ARGS) +{ + PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0); + OpExpr *opexpr = (OpExpr *) PG_GETARG_POINTER(1); + int varRelid = PG_GETARG_INT32(2); + VariableStatData *resdata = (VariableStatData *) PG_GETARG_POINTER(3); + VariableStatData vardata; + Node *constexpr; + bool varonleft; + + /* should only be called for OpExpr expressions */ + Assert(IsA(opexpr, OpExpr)); + + /* Is the expression simple enough? (Var op Const) or similar? */ + if (!get_restriction_variable(root, opexpr->args, varRelid, + &vardata, &constexpr, &varonleft)) + PG_RETURN_VOID(); + + /* XXX Could we also get varonleft=false in useful cases? */ + if (IsA(constexpr, Const) && varonleft) + jsonbStatsVarOpConst(opexpr->opno, resdata, &vardata, + (Const *) constexpr); + + ReleaseVariableStats(vardata); + + PG_RETURN_VOID(); +} + +/* + * jsonSelectivity + * Use JSON statistics to estimate selectivity for (in)equalities. + * + * The statistics is represented as (arrays of) JSON values etc. so we + * need to pass the right operators to the functions. 
+ */ +Selectivity +jsonSelectivity(JsonPathStats stats, Datum scalar, Oid operator) +{ + VariableStatData vardata; + Selectivity sel; + + if (!stats) + return 0.0; + + vardata.atttype = JSONBOID; + vardata.atttypmod = -1; + vardata.isunique = false; + vardata.rel = stats->data->rel; + vardata.var = NULL; + vardata.vartype = JSONBOID; + vardata.acl_ok = stats->data->acl_ok; + vardata.statsTuple = jsonPathStatsFormTuple(stats, + JsonStatJsonbWithoutSubpaths, 0.0); + + if (operator == JsonbEqOperator) + sel = var_eq_const(&vardata, operator, InvalidOid, scalar, false, true, false); + else + sel = scalarineqsel(NULL, operator, + /* is it greater or greater-or-equal? */ + operator == JsonbGtOperator || + operator == JsonbGeOperator, + /* is it equality? */ + operator == JsonbLeOperator || + operator == JsonbGeOperator, + InvalidOid, + &vardata, scalar, JSONBOID); + + if (vardata.statsTuple) + heap_freetuple(vardata.statsTuple); + + return sel; +} + +/* + * jsonAccumulateSubPathSelectivity + * Transform absolute subpath selectivity into relative and accumulate it + * into parent path simply by multiplication of relative selectivities. + */ +static void +jsonAccumulateSubPathSelectivity(Selectivity subpath_abs_sel, + Selectivity path_freq, + Selectivity *path_relative_sel, + JsonPathStats array_path_stats) +{ + Selectivity sel = subpath_abs_sel / path_freq; /* relative selectivity */ + + /* XXX Try to take into account array length */ + if (array_path_stats) + sel = 1.0 - pow(1.0 - sel, + jsonPathStatsGetAvgArraySize(array_path_stats)); + + /* Accumulate selectivity of subpath into parent path */ + *path_relative_sel *= sel; +} + +/* + * jsonSelectivityContains + * Estimate selectivity for containment operator on JSON. + * + * Iterate through query jsonb elements, build paths to its leaf elements, + * calculate selectivies of 'path == scalar' in leaves, multiply relative + * selectivities of subpaths at each path level, propagate computed + * selectivities to the root. 
+ */ +static Selectivity +jsonSelectivityContains(JsonStats stats, Jsonb *jb) +{ + JsonbValue v; + JsonbIterator *it; + JsonbIteratorToken r; + StringInfoData pathstr; /* path string */ + struct Path /* path stack entry */ + { + struct Path *parent; /* parent entry */ + int len; /* associated length of pathstr */ + Selectivity freq; /* absolute frequence of path */ + Selectivity sel; /* relative selectivity of subpaths */ + JsonPathStats stats; /* statistics for the path */ + bool is_array_accesor; /* is it '[*]' ? */ + } root, /* root path entry */ + *path = &root; /* path entry stack */ + Selectivity sel; /* resulting selectivity */ + Selectivity scalarSel; /* selectivity of 'jsonb == scalar' */ + + /* Initialize root path string */ + initStringInfo(&pathstr); + appendBinaryStringInfo(&pathstr, stats->prefix, stats->prefixlen); + + /* Initialize root path entry */ + root.parent = NULL; + root.len = pathstr.len; + root.stats = jsonStatsFindPath(stats, pathstr.data, pathstr.len); + root.freq = jsonPathStatsGetFreq(root.stats, 0.0); + root.sel = 1.0; + root.is_array_accesor = pathstr.data[pathstr.len - 1] == ']'; + + /* Return 0, if NULL fraction is 1. */ + if (root.freq <= 0.0) + return 0.0; + + /* + * Selectivity of query 'jsonb @> scalar' consists of selectivities of + * 'jsonb == scalar' and 'jsonb[*] == scalar'. Selectivity of + * 'jsonb[*] == scalar' will be computed in root.sel, but for + * 'jsonb == scalar' we need additional computation. + */ + if (JsonContainerIsScalar(&jb->root)) + scalarSel = jsonSelectivity(root.stats, JsonbPGetDatum(jb), + JsonbEqOperator); + else + scalarSel = 0.0; + + it = JsonbIteratorInit(&jb->root); + + while ((r = JsonbIteratorNext(&it, &v, false)) != WJB_DONE) + { + switch (r) + { + case WJB_BEGIN_OBJECT: + { + struct Path *p; + Selectivity freq = + jsonPathStatsGetTypeFreq(path->stats, jbvObject, 0.0); + + /* If there are no objects, selectivity is 0. 
*/ + if (freq <= 0.0) + return 0.0; + + /* + * Push path entry for object keys, actual key names are + * appended later in WJB_KEY case. + */ + p = palloc(sizeof(*p)); + p->len = pathstr.len; + p->parent = path; + p->stats = NULL; + p->freq = freq; + p->sel = 1.0; + p->is_array_accesor = false; + path = p; + break; + } + + case WJB_BEGIN_ARRAY: + { + struct Path *p; + JsonPathStats pstats; + Selectivity freq; + + /* + * First, find stats for the parent path if needed, it will be + * used in jsonAccumulateSubPathSelectivity(). + */ + if (!path->stats) + path->stats = jsonStatsFindPath(stats, pathstr.data, + pathstr.len); + + /* Appeend path string entry for array elements, get stats. */ + jsonPathAppendEntry(&pathstr, NULL); + pstats = jsonStatsFindPath(stats, pathstr.data, pathstr.len); + freq = jsonPathStatsGetFreq(pstats, 0.0); + + /* If there are no arrays, return 0 or scalar selectivity */ + if (freq <= 0.0) + return scalarSel; + + /* Push path entry for array elements. */ + p = palloc(sizeof(*p)); + p->len = pathstr.len; + p->parent = path; + p->stats = pstats; + p->freq = freq; + p->sel = 1.0; + p->is_array_accesor = true; + path = p; + break; + } + + case WJB_END_OBJECT: + case WJB_END_ARRAY: + { + struct Path *p = path; + /* Absoulte selectivity of the path with its all subpaths */ + Selectivity abs_sel = p->sel * p->freq; + + /* Pop last path entry */ + path = path->parent; + pfree(p); + pathstr.len = path->len; + pathstr.data[pathstr.len] = '\0'; + + /* Accumulate selectivity into parent path */ + jsonAccumulateSubPathSelectivity(abs_sel, path->freq, + &path->sel, + path->is_array_accesor ? 
+ path->parent->stats : NULL); + break; + } + + case WJB_KEY: + { + /* Remove previous key in the path string */ + pathstr.len = path->parent->len; + pathstr.data[pathstr.len] = '\0'; + + /* Append current key to path string */ + jsonPathAppendEntryWithLen(&pathstr, v.val.string.val, + v.val.string.len); + path->len = pathstr.len; + break; + } + + case WJB_VALUE: + case WJB_ELEM: + { + /* + * Extract statistics for a path. Array elements share the + * same statistics that was extracted in WJB_BEGIN_ARRAY. + */ + JsonPathStats pstats = r == WJB_ELEM ? path->stats : + jsonStatsFindPath(stats, pathstr.data, pathstr.len); + Selectivity abs_sel; /* Absolute selectivity of 'path == scalar' */ + + if (pstats) + { + /* Make scalar jsonb datum and compute selectivity */ + Datum scalar = JsonbPGetDatum(JsonbValueToJsonb(&v)); + + abs_sel = jsonSelectivity(pstats, scalar, JsonbEqOperator); + } + else + abs_sel = 0.0; + + /* Accumulate selectivity into parent path */ + jsonAccumulateSubPathSelectivity(abs_sel, path->freq, + &path->sel, + path->is_array_accesor ? + path->parent->stats : NULL); + break; + } + + default: + break; + } + } + + /* Compute absolute selectivity for root, including raw scalar case. */ + sel = root.sel * root.freq + scalarSel; + CLAMP_PROBABILITY(sel); + return sel; +} + +/* + * jsonSelectivityExists + * Estimate selectivity for JSON "exists" operator. 
+ */ +static Selectivity +jsonSelectivityExists(JsonStats stats, Datum key) +{ + JsonPathStats rootstats; + JsonPathStats arrstats; + JsonbValue jbvkey; + Datum jbkey; + Selectivity keysel; + Selectivity scalarsel; + Selectivity arraysel; + Selectivity sel; + + JsonValueInitStringWithLen(&jbvkey, + VARDATA_ANY(key), VARSIZE_ANY_EXHDR(key)); + + jbkey = JsonbPGetDatum(JsonbValueToJsonb(&jbvkey)); + + keysel = jsonStatsGetPathFreq(stats, &key, 1, false); + + rootstats = jsonStatsGetRootPath(stats); + scalarsel = jsonSelectivity(rootstats, jbkey, JsonbEqOperator); + + arrstats = jsonStatsGetRootArrayPath(stats); + arraysel = jsonSelectivity(arrstats, jbkey, JsonbEqOperator); + arraysel = 1.0 - pow(1.0 - arraysel, + jsonPathStatsGetAvgArraySize(rootstats)); + + sel = keysel + scalarsel + arraysel; + CLAMP_PROBABILITY(sel); + return sel; +} + +static Selectivity +jsonb_sel_internal(JsonStats stats, Oid operator, Const *cnst, bool varonleft) +{ + switch (operator) + { + case JsonbExistsOperator: + if (!varonleft || cnst->consttype != TEXTOID) + break; + + return jsonSelectivityExists(stats, cnst->constvalue); + + case JsonbExistsAnyOperator: + case JsonbExistsAllOperator: + { + Datum *keys; + bool *nulls; + Selectivity freq = 1.0; + int nkeys; + int i; + bool all = operator == JsonbExistsAllOperator; + + if (!varonleft || cnst->consttype != TEXTARRAYOID) + break; + + deconstruct_array(DatumGetArrayTypeP(cnst->constvalue), TEXTOID, + -1, false, 'i', &keys, &nulls, &nkeys); + + for (i = 0; i < nkeys; i++) + if (!nulls[i]) + { + Selectivity pathfreq = jsonSelectivityExists(stats, + keys[i]); + freq *= all ? 
pathfreq : (1.0 - pathfreq); + } + + pfree(keys); + pfree(nulls); + + if (!all) + freq = 1.0 - freq; + + return freq; + } + + case JsonbContainedOperator: + if (varonleft || cnst->consttype != JSONBOID) + break; + + return jsonSelectivityContains(stats, + DatumGetJsonbP(cnst->constvalue)); + + case JsonbContainsOperator: + if (!varonleft || cnst->consttype != JSONBOID) + break; + + return jsonSelectivityContains(stats, + DatumGetJsonbP(cnst->constvalue)); + + default: + break; + } + + return DEFAULT_JSON_CONTAINS_SEL; +} + +/* + * jsonb_sel + * The main procedure estimating selectivity for all JSONB operators. + */ +Datum +jsonb_sel(PG_FUNCTION_ARGS) +{ + PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0); + Oid operator = PG_GETARG_OID(1); + List *args = (List *) PG_GETARG_POINTER(2); + int varRelid = PG_GETARG_INT32(3); + double sel = DEFAULT_JSON_CONTAINS_SEL; + Node *other; + bool varonleft; + VariableStatData vardata; + + if (get_restriction_variable(root, args, varRelid, + &vardata, &other, &varonleft)) + { + if (IsA(other, Const)) + { + Const *cnst = (Const *) other; + + if (cnst->constisnull) + sel = 0.0; + else + { + JsonStatData stats; + + if (jsonStatsInit(&stats, &vardata)) + { + sel = jsonb_sel_internal(&stats, operator, cnst, varonleft); + jsonStatsRelease(&stats); + } + } + } + + ReleaseVariableStats(vardata); + } + + PG_RETURN_FLOAT8((float8) sel); +} diff --git a/src/backend/utils/adt/jsonb_typanalyze.c b/src/backend/utils/adt/jsonb_typanalyze.c new file mode 100644 index 0000000..7882db2 --- /dev/null +++ b/src/backend/utils/adt/jsonb_typanalyze.c @@ -0,0 +1,1627 @@ +/*------------------------------------------------------------------------- + * + * jsonb_typanalyze.c + * Functions for gathering statistics from jsonb columns + * + * Copyright (c) 2016-2022, PostgreSQL Global Development Group + * + * Functions in this module are used to analyze contents of JSONB columns + * and build optimizer statistics. 
In principle we extract paths from all + * sampled documents and calculate the usual statistics (MCV, histogram) + * for each path - in principle each path is treated as a column. + * + * Because we're not enforcing any JSON schema, the documents may differ + * a lot - the documents may contain large number of different keys, the + * types of values may be entirely different, etc. This makes it more + * challenging than building stats for regular columns. For example not + * only do we need to decide which values to keep in the MCV, but also + * which paths to keep (in case the documents are so variable we can't + * keep all paths). + * + * The statistics is stored in pg_statistic, in a slot with a new stakind + * value (STATISTIC_KIND_JSON). The statistics is serialized as an array + * of JSONB values, eash element storing statistics for one path. + * + * For each path, we store the following keys: + * + * - path - path this stats is for, serialized as jsonpath + * - freq - frequency of documents containing this path + * - json - the regular per-column stats (MCV, histogram, ...) + * - freq_null - frequency of JSON null values + * - freq_array - frequency of JSON array values + * - freq_object - frequency of JSON object values + * - freq_string - frequency of JSON string values + * - freq_numeric - frequency of JSON numeric values + * + * This is stored in the stavalues array. + * + * The first element of stavalues is a path prefix. It is used for avoiding + * path transformations when the derived statistics for the chains of -> + * operators is computed. + * + * The per-column stats (stored in the "json" key) have additional internal + * structure, to allow storing multiple stakind types (histogram, mcv). See + * jsonAnalyzeMakeScalarStats for details. + * + * + * XXX It's a bit weird the "regular" stats are stored in the "json" key, + * while the JSON stats (frequencies of different JSON types) are right + * at the top level. 
+ * + * + * IDENTIFICATION + * src/backend/utils/adt/jsonb_typanalyze.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "fmgr.h" +#include "access/hash.h" +#include "access/detoast.h" +#include "catalog/pg_attribute.h" +#include "catalog/pg_operator.h" +#include "catalog/pg_type.h" +#include "commands/vacuum.h" +#include "utils/builtins.h" +#include "utils/guc.h" +#include "utils/hsearch.h" +#include "utils/json.h" +#include "utils/jsonb.h" +#include "utils/json_selfuncs.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" + +typedef struct JsonPathEntry JsonPathEntry; + +/* + * Element of a path in the JSON document (i.e. not jsonpath). Elements + * are linked together to build longer paths. + * + * 'entry' can be not zero-terminated when it is pointing to JSONB keys, so + * 'len' is necessary. 'len' is also used for faster entry comparison, to + * distinguish array entries ('len' == -1). + */ +typedef struct JsonPathEntry +{ + JsonPathEntry *parent; + const char *entry; /* element of the path as a string */ + int len; /* length of entry string (may be 0 or -1) */ + uint32 hash; /* hash of the whole path (with parent) */ + char *pathstr; /* full path string */ + int depth; /* nesting level, i.e. path length */ +} JsonPathEntry; + +#define JsonPathEntryIsArray(entry) ((entry)->len == -1) + +/* + * An array containing a dynamic number of values extracted from JSON documents. + * All values should have the same data type: + * jsonb - ordinary path stats, values of different JSON types + * int32 - array/object length stats + * text - separate stats fro strings + * numeric - separate stats fro numbers + */ +typedef struct JsonValues +{ + Datum *buf; + int count; + int allocated; +} JsonValues; + +/* + * Scalar statistics built for an array of values, extracted from a JSON + * document (for one particular path). 
+ */ +typedef struct JsonScalarStats +{ + JsonValues values; + VacAttrStats stats; +} JsonScalarStats; + +/* + * Statistics calculated for a set of values. + * + * + * XXX This seems rather complicated and needs simplification. We're not + * really using all the various JsonScalarStats bits, there's a lot of + * duplication (e.g. each JsonScalarStats contains it's own array, which + * has a copy of data from the one in "jsons"). + */ +typedef struct JsonValueStats +{ + JsonScalarStats jsons; /* stats for all JSON types together */ + +#ifdef JSON_ANALYZE_SCALARS /* XXX */ + JsonScalarStats strings; /* stats for JSON strings */ + JsonScalarStats numerics; /* stats for JSON numerics */ +#endif + + JsonScalarStats arrlens; /* stats of array lengths */ + JsonScalarStats objlens; /* stats of object lengths */ + + int nnulls; /* number of JSON null values */ + int ntrue; /* number of JSON true values */ + int nfalse; /* number of JSON false values */ + int nobjects; /* number of JSON objects */ + int narrays; /* number of JSON arrays */ + int nstrings; /* number of JSON strings */ + int nnumerics; /* number of JSON numerics */ + + int64 narrelems; /* total number of array elements + * (for avg. 
array length) */ +} JsonValueStats; + +typedef struct JsonPathDocBitmap +{ + bool is_list; + int size; + int allocated; + union + { + int32 *list; + uint8 *bitmap; + } data; +} JsonPathDocBitmap; + +/* JSON path and list of documents containing it */ +typedef struct JsonPathAnlDocs +{ + JsonPathEntry path; + JsonPathDocBitmap bitmap; +} JsonPathAnlDocs; + +/* Main structure for analyzed JSON path */ +typedef struct JsonPathAnlStats +{ + JsonPathEntry path; + double freq; /* frequence of the path */ + JsonValueStats vstats; /* collected values and raw computed stats */ + Jsonb *stats; /* stats converted into jsonb form */ +} JsonPathAnlStats; + +/* Some parent path stats counters that used for frequency calculations */ +typedef struct JsonPathParentStats +{ + double freq; + int count; + int narrays; +} JsonPathParentStats; + +/* various bits needed while analyzing JSON */ +typedef struct JsonAnalyzeContext +{ + VacAttrStats *stats; + MemoryContext mcxt; + AnalyzeAttrFetchFunc fetchfunc; + HTAB *pathshash; + JsonPathAnlStats *root; + double totalrows; + double total_width; + int samplerows; + int current_rownum; + int target; + int null_cnt; + int analyzed_cnt; + int maxdepth; + bool scalarsOnly; + bool single_pass; +} JsonAnalyzeContext; + +/* + * JsonPathMatch + * Determine when two JSON paths (list of JsonPathEntry) match. + * + * Returned int instead of bool, because it is an implementation of + * HashCompareFunc. + */ +static int +JsonPathEntryMatch(const void *key1, const void *key2, Size keysize) +{ + const JsonPathEntry *path1 = key1; + const JsonPathEntry *path2 = key2; + + return path1->parent != path2->parent || + path1->len != path2->len || + (path1->len > 0 && + strncmp(path1->entry, path2->entry, path1->len)); +} + +/* + * JsonPathHash + * Calculate hash of the path entry. + * + * Parent hash should be already calculated. 
+ */ +static uint32 +JsonPathEntryHash(const void *key, Size keysize) +{ + const JsonPathEntry *path = key; + uint32 hash = path->parent ? path->parent->hash : 0; + + hash = (hash << 1) | (hash >> 31); + hash ^= path->len < 0 ? 0 : + DatumGetUInt32(hash_any((const unsigned char *) path->entry, path->len)); + + return hash; +} + +static void +jsonStatsBitmapInit(JsonPathDocBitmap *bitmap) +{ + memset(bitmap, 0, sizeof(*bitmap)); + bitmap->is_list = true; +} + +static void +jsonStatsBitmapAdd(JsonAnalyzeContext *cxt, JsonPathDocBitmap *bitmap, int doc) +{ + /* Use more compact list representation if not too many bits set */ + if (bitmap->is_list) + { + int *list = bitmap->data.list; + +#if 1 /* Enable list representation */ + if (bitmap->size > 0 && list[bitmap->size - 1] == doc) + return; + + if (bitmap->size < cxt->samplerows / sizeof(list[0]) / 8) + { + if (bitmap->size >= bitmap->allocated) + { + MemoryContext oldcxt = MemoryContextSwitchTo(cxt->mcxt); + + if (bitmap->allocated) + { + bitmap->allocated *= 2; + list = repalloc(list, sizeof(list[0]) * bitmap->allocated); + } + else + { + bitmap->allocated = 8; + list = palloc(sizeof(list[0]) * bitmap->allocated); + } + + bitmap->data.list = list; + + MemoryContextSwitchTo(oldcxt); + } + + list[bitmap->size++] = doc; + return; + } +#endif + /* convert list to bitmap */ + bitmap->allocated = (cxt->samplerows + 7) / 8; + bitmap->data.bitmap = MemoryContextAllocZero(cxt->mcxt, bitmap->allocated); + bitmap->is_list = false; + + if (list) + { + for (int i = 0; i < bitmap->size; i++) + { + int d = list[i]; + + bitmap->data.bitmap[d / 8] |= (1 << (d % 8)); + } + + pfree(list); + } + } + + /* set bit in bitmap */ + if (doc < cxt->samplerows && + !(bitmap->data.bitmap[doc / 8] & (1 << (doc % 8)))) + { + bitmap->data.bitmap[doc / 8] |= (1 << (doc % 8)); + bitmap->size++; + } +} + +static bool +jsonStatsBitmapNext(JsonPathDocBitmap *bitmap, int *pbit) +{ + uint8 *bmp = bitmap->data.bitmap; + uint8 *pb; + uint8 *pb_end = 
&bmp[bitmap->allocated]; + int bit = *pbit; + + Assert(!bitmap->is_list); + + if (bit < 0) + { + pb = bmp; + bit = 0; + } + else + { + ++bit; + pb = &bmp[bit / 8]; + bit %= 8; + } + + for (; pb < pb_end; pb++, bit = 0) + { + uint8 b; + + /* Skip zero bytes */ + if (!bit) + { + while (!*pb) + { + if (++pb >= pb_end) + return false; + } + } + + b = *pb; + + /* Skip zero bits */ + while (bit < 8 && !(b & (1 << bit))) + bit++; + + if (bit >= 8) + continue; /* Non-zero bit not found, go to next byte */ + + /* Output next non-zero bit */ + *pbit = (pb - bmp) * 8 + bit; + return true; + } + + return false; +} + +static void +jsonStatsAnlInit(JsonPathAnlStats *stats) +{ + /* initialize the stats counter for this path entry */ + memset(&stats->vstats, 0, sizeof(JsonValueStats)); + stats->stats = NULL; + stats->freq = 0.0; +} + +/* + * jsonAnalyzeAddPath + * Add an entry for a JSON path to the working list of statistics. + * + * Returns a pointer to JsonPathAnlStats (which might have already existed + * if the path was in earlier document), which can then be populated or + * updated. + */ +static inline JsonPathEntry * +jsonAnalyzeAddPath(JsonAnalyzeContext *ctx, JsonPathEntry *parent, + const char *entry, int len) +{ + JsonPathEntry path; + JsonPathEntry *stats; + bool found; + + /* Init path entry */ + path.parent = parent; + path.entry = entry; + path.len = len; + path.hash = JsonPathEntryHash(&path, 0); + + /* See if we already saw this path earlier. */ + stats = hash_search_with_hash_value(ctx->pathshash, &path, path.hash, + HASH_ENTER, &found); + + /* + * Nope, it's the first time we see this path, so initialize all the + * fields (path string, counters, ...). 
+ */ + if (!found) + { + JsonPathEntry *parent = stats->parent; + const char *ppath = parent->pathstr; + StringInfoData si; + MemoryContext oldcxt; + + oldcxt = MemoryContextSwitchTo(ctx->mcxt); + + /* NULL entries are treated as wildcard array accessors "[*]" */ + if (stats->entry) + /* Copy path entry name into the right MemoryContext */ + stats->entry = pnstrdup(stats->entry, stats->len); + + MemoryContextSwitchTo(oldcxt); + + /* Initialze full path string */ + initStringInfo(&si); + appendStringInfoString(&si, ppath); + jsonPathAppendEntry(&si, stats->entry); + + MemoryContextSwitchTo(ctx->mcxt); + stats->pathstr = pstrdup(si.data); + MemoryContextSwitchTo(oldcxt); + + pfree(si.data); + + if (ctx->single_pass) + jsonStatsAnlInit((JsonPathAnlStats *) stats); + else + jsonStatsBitmapInit(&((JsonPathAnlDocs *) stats)->bitmap); + + stats->depth = parent->depth + 1; + + /* update maximal depth */ + if (ctx->maxdepth < stats->depth) + ctx->maxdepth = stats->depth; + } + + return stats; +} + +/* + * JsonValuesAppend + * Add a JSON value to the dynamic array (enlarge it if needed). + * + * XXX This is likely one of the problems - the documents may be pretty + * large, with a lot of different values for each path. At that point + * it's problematic to keep all of that in memory at once. So maybe we + * need to introduce some sort of compaction (e.g. we could try + * deduplicating the values), limit on size of the array or something. 
+ */ +static inline void +JsonValuesAppend(JsonValues *values, Datum value, int initialSize) +{ + if (values->count >= values->allocated) + { + if (values->allocated) + { + values->allocated = values->allocated * 2; + values->buf = repalloc(values->buf, + sizeof(values->buf[0]) * values->allocated); + } + else + { + values->allocated = initialSize; + values->buf = palloc(sizeof(values->buf[0]) * values->allocated); + } + } + + values->buf[values->count++] = value; +} + +/* + * jsonAnalyzeJsonValue + * Process a value extracted from the document (for a given path). + */ +static inline void +jsonAnalyzeJsonValue(JsonAnalyzeContext *ctx, JsonValueStats *vstats, + JsonbValue *jv) +{ + JsonbValue *jbv; + JsonbValue jbvtmp; + Jsonb *jb; + Datum value; + MemoryContext oldcxt = NULL; + + /* XXX if analyzing only scalar values, make containers empty */ + if (ctx->scalarsOnly && jv->type == jbvBinary) + { + if (JsonContainerIsObject(jv->val.binary.data)) + jbv = JsonValueInitObject(&jbvtmp, 0, 0); + else + { + Assert(JsonContainerIsArray(jv->val.binary.data)); + jbv = JsonValueInitArray(&jbvtmp, 0, 0, false); + } + } + else + jbv = jv; + + jb = JsonbValueToJsonb(jbv); + + if (ctx->single_pass) + { + oldcxt = MemoryContextSwitchTo(ctx->stats->anl_context); + jb = memcpy(palloc(VARSIZE(jb)), jb, VARSIZE(jb)); + } + + /* always add it to the "global" JSON stats, shared by all types */ + JsonValuesAppend(&vstats->jsons.values, + JsonbPGetDatum(jb), + ctx->target); + + /* also update the type-specific counters */ + switch (jv->type) + { + case jbvNull: + vstats->nnulls++; + break; + + case jbvBool: + if (jv->val.boolean) + vstats->ntrue++; + else + vstats->nfalse++; + break; + + case jbvString: + vstats->nstrings++; +#ifdef JSON_ANALYZE_SCALARS + value = PointerGetDatum( + cstring_to_text_with_len(jv->val.string.val, + jv->val.string.len)); + JsonValuesAppend(&vstats->strings.values, value, ctx->target); +#endif + break; + + case jbvNumeric: + vstats->nnumerics++; +#ifdef 
JSON_ANALYZE_SCALARS + value = PointerGetDatum(jv->val.numeric); + JsonValuesAppend(&vstats->numerics.values, value, ctx->target); +#endif + break; + + case jbvBinary: + if (JsonContainerIsObject(jv->val.binary.data)) + { + uint32 size = JsonContainerSize(jv->val.binary.data); + + value = DatumGetInt32(size); + vstats->nobjects++; + JsonValuesAppend(&vstats->objlens.values, value, ctx->target); + } + else if (JsonContainerIsArray(jv->val.binary.data)) + { + uint32 size = JsonContainerSize(jv->val.binary.data); + + value = DatumGetInt32(size); + vstats->narrays++; + JsonValuesAppend(&vstats->arrlens.values, value, ctx->target); + vstats->narrelems += size; + } + break; + + default: + elog(ERROR, "invalid scalar json value type %d", jv->type); + break; + } + + if (ctx->single_pass) + MemoryContextSwitchTo(oldcxt); +} + +/* + * jsonAnalyzeCollectPaths + * Parse the JSON document and collect all paths and their values. + */ +static void +jsonAnalyzeCollectPaths(JsonAnalyzeContext *ctx, Jsonb *jb, void *param) +{ + JsonbValue jv; + JsonbIterator *it; + JsonbIteratorToken tok; + JsonPathEntry *stats = &ctx->root->path; + int doc = ctx->current_rownum; + bool collect_values = (bool)(intptr_t) param; + bool scalar = false; + + if (collect_values && !JB_ROOT_IS_SCALAR(jb)) + jsonAnalyzeJsonValue(ctx, &((JsonPathAnlStats *) stats)->vstats, + JsonValueInitBinary(&jv, jb)); + + it = JsonbIteratorInit(&jb->root); + + while ((tok = JsonbIteratorNext(&it, &jv, true)) != WJB_DONE) + { + switch (tok) + { + case WJB_BEGIN_OBJECT: + /* + * Read next token to see if the object is empty or not. + * If not, make stats for the first key. Subsequent WJB_KEYs + * and WJB_END_OBJECT will expect that stats will be pointing + * to the key of current object. + */ + tok = JsonbIteratorNext(&it, &jv, true); + + if (tok == WJB_END_OBJECT) + /* Empty object, simply skip stats initialization. 
*/ + break; + + if (tok != WJB_KEY) + elog(ERROR, "unexpected jsonb iterator token: %d", tok); + + stats = jsonAnalyzeAddPath(ctx, stats, + jv.val.string.val, + jv.val.string.len); + break; + + case WJB_BEGIN_ARRAY: + /* Make stats for non-scalar array and use it for all elements */ + if (!(scalar = jv.val.array.rawScalar)) + stats = jsonAnalyzeAddPath(ctx, stats, NULL, -1); + break; + + case WJB_END_ARRAY: + if (scalar) + break; + /* FALLTHROUGH */ + case WJB_END_OBJECT: + /* Reset to parent stats */ + stats = stats->parent; + break; + + case WJB_KEY: + /* + * Stats should point to the previous key of current object, + * use its parent path as a base path. + */ + stats = jsonAnalyzeAddPath(ctx, stats->parent, + jv.val.string.val, + jv.val.string.len); + break; + + case WJB_VALUE: + case WJB_ELEM: + if (collect_values) + jsonAnalyzeJsonValue(ctx, + &((JsonPathAnlStats *) stats)->vstats, + &jv); + else if (stats != &ctx->root->path) + jsonStatsBitmapAdd(ctx, + &((JsonPathAnlDocs *) stats)->bitmap, + doc); + + /* + * Manually recurse into container by creating child iterator. + * We use skipNested=true to give jsonAnalyzeJsonValue() + * ability to access jbvBinary containers. + */ + if (jv.type == jbvBinary) + { + JsonbIterator *it2 = JsonbIteratorInit(jv.val.binary.data); + + it2->parent = it; + it = it2; + } + break; + + default: + break; + } + } +} + +/* + * jsonAnalyzeCollectSubpath + * Recursively extract trailing part of a path and collect its values. 
+ */ +static void +jsonAnalyzeCollectSubpath(JsonAnalyzeContext *ctx, JsonPathAnlStats *pstats, + JsonbValue *jbv, JsonPathEntry **entries, + int start_entry) +{ + JsonbValue scalar; + int i; + + for (i = start_entry; i < pstats->path.depth; i++) + { + JsonPathEntry *entry = entries[i]; + JsonbContainer *jbc = jbv->val.binary.data; + JsonbValueType type = jbv->type; + + if (i > start_entry) + pfree(jbv); + + if (type != jbvBinary) + return; + + if (JsonPathEntryIsArray(entry)) + { + JsonbIterator *it; + JsonbIteratorToken r; + JsonbValue elem; + + if (!JsonContainerIsArray(jbc) || JsonContainerIsScalar(jbc)) + return; + + it = JsonbIteratorInit(jbc); + + while ((r = JsonbIteratorNext(&it, &elem, true)) != WJB_DONE) + { + if (r == WJB_ELEM) + jsonAnalyzeCollectSubpath(ctx, pstats, &elem, entries, i + 1); + } + + return; + } + else + { + if (!JsonContainerIsObject(jbc)) + return; + + jbv = findJsonbValueFromContainerLen(jbc, JB_FOBJECT, + entry->entry, entry->len); + + if (!jbv) + return; + } + } + + if (i == start_entry && + jbv->type == jbvBinary && + JsonbExtractScalar(jbv->val.binary.data, &scalar)) + jbv = &scalar; + + jsonAnalyzeJsonValue(ctx, &pstats->vstats, jbv); + + if (i > start_entry) + pfree(jbv); +} + +/* + * jsonAnalyzeCollectPath + * Extract a single path from JSON documents and collect its values. 
+ */ +static void +jsonAnalyzeCollectPath(JsonAnalyzeContext *ctx, Jsonb *jb, void *param) +{ + JsonPathAnlStats *pstats = (JsonPathAnlStats *) param; + JsonbValue jbvtmp; + JsonbValue *jbv = JsonValueInitBinary(&jbvtmp, jb); + JsonPathEntry *path; + JsonPathEntry **entries; + int i; + + entries = palloc(sizeof(*entries) * pstats->path.depth); + + /* Build entry array in direct order */ + for (path = &pstats->path, i = pstats->path.depth - 1; + path->parent && i >= 0; + path = path->parent, i--) + entries[i] = path; + + jsonAnalyzeCollectSubpath(ctx, pstats, jbv, entries, 0); + + pfree(entries); +} + +static Datum +jsonAnalyzePathFetch(VacAttrStatsP stats, int rownum, bool *isnull) +{ + *isnull = false; + return stats->exprvals[rownum]; +} + +/* + * jsonAnalyzePathValues + * Calculate per-column statistics for values for a single path. + * + * We have already accumulated all the values for the path, so we simply + * call the typanalyze function for the proper data type, and then + * compute_stats (which points to compute_scalar_stats or so). + */ +static void +jsonAnalyzePathValues(JsonAnalyzeContext *ctx, JsonScalarStats *sstats, + Oid typid, double freq, bool use_anl_context) +{ + JsonValues *values = &sstats->values; + VacAttrStats *stats = &sstats->stats; + FormData_pg_attribute attr; + FormData_pg_type type; + int i; + + if (!sstats->values.count) + return; + + get_typlenbyvalalign(typid, &type.typlen, &type.typbyval, &type.typalign); + + attr.attstattarget = ctx->target; + + stats->attr = &attr; + stats->attrtypid = typid; + stats->attrtypmod = -1; + stats->attrtype = &type; + stats->anl_context = use_anl_context ? ctx->stats->anl_context : CurrentMemoryContext; + + stats->exprvals = values->buf; + + /* + * The fields describing the stats->stavalues[n] element types default to + * the type of the data being analyzed, but the type-specific typanalyze + * function can change them if it wants to store something else. 
+ */
+	for (i = 0; i < STATISTIC_NUM_SLOTS; i++)
+	{
+		stats->statypid[i] = stats->attrtypid;
+		stats->statyplen[i] = stats->attrtype->typlen;
+		stats->statypbyval[i] = stats->attrtype->typbyval;
+		stats->statypalign[i] = stats->attrtype->typalign;
+	}
+
+	std_typanalyze(stats);
+
+	stats->compute_stats(stats, jsonAnalyzePathFetch,
+						 values->count,
+						 ctx->totalrows / ctx->samplerows * values->count);
+
+	/*
+	 * We've only kept the non-null values, so compute_stats will always
+	 * leave this as 1.0. But we have enough info to calculate the correct
+	 * value.
+	 */
+	stats->stanullfrac = (float4)(1.0 - freq);
+
+	/*
+	 * Similarly, we need to correct the MCV frequencies, because those are
+	 * also calculated only from the non-null values. All we need to do is
+	 * simply multiply that with the non-NULL frequency.
+	 */
+	for (i = 0; i < STATISTIC_NUM_SLOTS; i++)
+	{
+		if (stats->stakind[i] == STATISTIC_KIND_MCV)
+		{
+			int j;
+			for (j = 0; j < stats->numnumbers[i]; j++)
+				stats->stanumbers[i][j] *= freq;
+		}
+	}
+}
+
+/*
+ * jsonAnalyzeMakeScalarStats
+ *		Serialize scalar stats into a JSON representation.
+ *
+ * We simply produce a JSON document with a list of predefined keys:
+ *
+ * - nullfrac
+ * - distinct
+ * - width
+ * - correlation
+ * - mcv or histogram
+ *
+ * For the mcv / histogram, we store a nested values / numbers.
+ */ +static JsonbValue * +jsonAnalyzeMakeScalarStats(JsonbParseState **ps, const char *name, + const VacAttrStats *stats) +{ + JsonbValue val; + int i; + int j; + + pushJsonbKey(ps, &val, name); + + pushJsonbValue(ps, WJB_BEGIN_OBJECT, NULL); + + pushJsonbKeyValueFloat(ps, &val, "nullfrac", stats->stanullfrac); + pushJsonbKeyValueFloat(ps, &val, "distinct", stats->stadistinct); + pushJsonbKeyValueInteger(ps, &val, "width", stats->stawidth); + + for (i = 0; i < STATISTIC_NUM_SLOTS; i++) + { + if (!stats->stakind[i]) + break; + + switch (stats->stakind[i]) + { + case STATISTIC_KIND_MCV: + pushJsonbKey(ps, &val, "mcv"); + break; + + case STATISTIC_KIND_HISTOGRAM: + pushJsonbKey(ps, &val, "histogram"); + break; + + case STATISTIC_KIND_CORRELATION: + pushJsonbKeyValueFloat(ps, &val, "correlation", + stats->stanumbers[i][0]); + continue; + + default: + elog(ERROR, "unexpected stakind %d", stats->stakind[i]); + break; + } + + pushJsonbValue(ps, WJB_BEGIN_OBJECT, NULL); + + if (stats->numvalues[i] > 0) + { + pushJsonbKey(ps, &val, "values"); + pushJsonbValue(ps, WJB_BEGIN_ARRAY, NULL); + for (j = 0; j < stats->numvalues[i]; j++) + { + Datum v = stats->stavalues[i][j]; + if (stats->attrtypid == JSONBOID) + pushJsonbElemBinary(ps, &val, DatumGetJsonbP(v)); + else if (stats->attrtypid == TEXTOID) + pushJsonbElemText(ps, &val, DatumGetTextP(v)); + else if (stats->attrtypid == NUMERICOID) + pushJsonbElemNumeric(ps, &val, DatumGetNumeric(v)); + else if (stats->attrtypid == INT4OID) + pushJsonbElemInteger(ps, &val, DatumGetInt32(v)); + else + elog(ERROR, "unexpected stat value type %d", + stats->attrtypid); + } + pushJsonbValue(ps, WJB_END_ARRAY, NULL); + } + + if (stats->numnumbers[i] > 0) + { + pushJsonbKey(ps, &val, "numbers"); + pushJsonbValue(ps, WJB_BEGIN_ARRAY, NULL); + for (j = 0; j < stats->numnumbers[i]; j++) + pushJsonbElemFloat(ps, &val, stats->stanumbers[i][j]); + pushJsonbValue(ps, WJB_END_ARRAY, NULL); + } + + pushJsonbValue(ps, WJB_END_OBJECT, NULL); + } + + 
return pushJsonbValue(ps, WJB_END_OBJECT, NULL);
+}
+
+static void
+pushJsonbKeyValueFloatNonZero(JsonbParseState **ps, JsonbValue *jbv,
+							  const char *field, double val)
+{
+	if (val != 0.0)
+		pushJsonbKeyValueFloat(ps, jbv, field, val);
+}
+
+/*
+ * jsonAnalyzeBuildPathStats
+ *		Serialize statistics for a particular json path.
+ *
+ * This includes both the per-column stats (stored in "json" key) and the
+ * JSON specific stats (like frequencies of different object types).
+ */
+static Jsonb *
+jsonAnalyzeBuildPathStats(JsonPathAnlStats *pstats)
+{
+	const JsonValueStats *vstats = &pstats->vstats;
+	float4		freq = pstats->freq;
+	bool		fullstats = true;	/* pstats->path.parent != NULL */
+	JsonbValue	val;
+	JsonbValue *jbv;
+	JsonbParseState *ps = NULL;
+
+	pushJsonbValue(&ps, WJB_BEGIN_OBJECT, NULL);
+
+	pushJsonbKeyValueString(&ps, &val, "path", pstats->path.pathstr);
+
+	pushJsonbKeyValueFloat(&ps, &val, "freq", freq);
+
+	pushJsonbKeyValueFloatNonZero(&ps, &val, "freq_null",
+								  freq * vstats->nnulls /
+								  vstats->jsons.values.count);
+
+	pushJsonbKeyValueFloatNonZero(&ps, &val, "freq_boolean",
+								  freq * (vstats->nfalse + vstats->ntrue) /
+								  vstats->jsons.values.count);
+
+	pushJsonbKeyValueFloatNonZero(&ps, &val, "freq_string",
+								  freq * vstats->nstrings /
+								  vstats->jsons.values.count);
+
+	pushJsonbKeyValueFloatNonZero(&ps, &val, "freq_numeric",
+								  freq * vstats->nnumerics /
+								  vstats->jsons.values.count);
+
+	pushJsonbKeyValueFloatNonZero(&ps, &val, "freq_array",
+								  freq * vstats->narrays /
+								  vstats->jsons.values.count);
+
+	pushJsonbKeyValueFloatNonZero(&ps, &val, "freq_object",
+								  freq * vstats->nobjects /
+								  vstats->jsons.values.count);
+
+	/*
+	 * We keep array length stats here for queries like jsonpath '$.size() > 5'.
+	 * Object lengths stats can be useful for other query languages.
+ */
+	if (vstats->arrlens.values.count)
+		jsonAnalyzeMakeScalarStats(&ps, "array_length", &vstats->arrlens.stats);
+
+	if (vstats->objlens.values.count)
+		jsonAnalyzeMakeScalarStats(&ps, "object_length", &vstats->objlens.stats);
+
+	if (vstats->narrays)
+		pushJsonbKeyValueFloat(&ps, &val, "avg_array_length",
+							   (float4) vstats->narrelems / vstats->narrays);
+
+	if (fullstats)
+	{
+#ifdef JSON_ANALYZE_SCALARS
+		jsonAnalyzeMakeScalarStats(&ps, "string", &vstats->strings.stats);
+		jsonAnalyzeMakeScalarStats(&ps, "numeric", &vstats->numerics.stats);
+#endif
+		jsonAnalyzeMakeScalarStats(&ps, "json", &vstats->jsons.stats);
+	}
+
+	jbv = pushJsonbValue(&ps, WJB_END_OBJECT, NULL);
+
+	return JsonbValueToJsonb(jbv);
+}
+
+/*
+ * jsonAnalyzeCalcPathFreq
+ *		Calculate path frequency, i.e. how many documents contain this path.
+ */
+static void
+jsonAnalyzeCalcPathFreq(JsonAnalyzeContext *ctx, JsonPathAnlStats *pstats,
+						JsonPathParentStats *parent)
+{
+	if (pstats->path.parent)
+	{
+		int			count = JsonPathEntryIsArray(&pstats->path) ?
+			parent->narrays : pstats->vstats.jsons.values.count;
+
+		pstats->freq = parent->freq * count / parent->count;
+
+		CLAMP_PROBABILITY(pstats->freq);
+	}
+	else
+		pstats->freq = (double) ctx->analyzed_cnt / ctx->samplerows;
+}
+
+/*
+ * jsonAnalyzePath
+ *		Build statistics for values accumulated for this path.
+ *
+ * We're done with accumulating values for this path, so calculate the
+ * statistics for the various arrays.
+ *
+ * XXX I wonder if we could introduce some simple heuristics on which
+ * paths to keep, similarly to what we do for MCV lists. For example a
+ * path that occurred just once is not very interesting, so we could
+ * decide to ignore it and not build the stats. Although that won't
+ * save much, because there'll be very few values accumulated.
+ */ +static Jsonb * +jsonAnalyzePath(JsonAnalyzeContext *ctx, JsonPathAnlStats *pstats, + JsonPathParentStats *parent_stats) +{ + JsonValueStats *vstats = &pstats->vstats; + Jsonb *stats; + + jsonAnalyzeCalcPathFreq(ctx, pstats, parent_stats); + + /* values combining all object types */ + jsonAnalyzePathValues(ctx, &vstats->jsons, JSONBOID, pstats->freq, + /* store root stats in analyze context */ + !parent_stats); + + /* + * Lengths and array lengths. We divide counts by the total number of json + * values to compute correct nullfrac (i.e. not all jsons have lengths). + */ + jsonAnalyzePathValues(ctx, &vstats->arrlens, INT4OID, + pstats->freq * vstats->arrlens.values.count / + vstats->jsons.values.count, false); + jsonAnalyzePathValues(ctx, &vstats->objlens, INT4OID, + pstats->freq * vstats->objlens.values.count / + vstats->jsons.values.count, false); + +#ifdef JSON_ANALYZE_SCALARS + /* stats for values of string/numeric types only */ + jsonAnalyzePathValues(ctx, &vstats->strings, TEXTOID, pstats->freq, false); + jsonAnalyzePathValues(ctx, &vstats->numerics, NUMERICOID, pstats->freq, false); +#endif + + /* Build jsonb with path stats */ + stats = jsonAnalyzeBuildPathStats(pstats); + + /* Copy stats to non-temporary context */ + return memcpy(MemoryContextAlloc(ctx->stats->anl_context, VARSIZE(stats)), + stats, VARSIZE(stats)); +} + +/* + * JsonPathStatsCompare + * Compare two path stats (by path string). + * + * We store the stats sorted by path string, and this is the comparator. + */ +static int +JsonPathStatsCompare(const void *pv1, const void *pv2) +{ + return strcmp((*((const JsonPathEntry **) pv1))->pathstr, + (*((const JsonPathEntry **) pv2))->pathstr); +} + +/* + * jsonAnalyzeSortPaths + * Reads all stats stored in the hash table and sorts them. 
+ */ +static JsonPathEntry ** +jsonAnalyzeSortPaths(JsonAnalyzeContext *ctx, int *p_npaths) +{ + HASH_SEQ_STATUS hseq; + JsonPathEntry *path; + JsonPathEntry **paths; + int npaths; + + npaths = hash_get_num_entries(ctx->pathshash) + 1; + paths = MemoryContextAlloc(ctx->mcxt, sizeof(*paths) * npaths); + + paths[0] = &ctx->root->path; + + hash_seq_init(&hseq, ctx->pathshash); + + for (int i = 1; (path = hash_seq_search(&hseq)) != NULL; i++) + paths[i] = path; + + pg_qsort(paths, npaths, sizeof(*paths), JsonPathStatsCompare); + + *p_npaths = npaths; + return paths; +} + +/* + * jsonAnalyzeBuildPathStatsArray + * Build jsonb datum array for path stats, that will be used as stavalues. + * + * The first element is a path prefix. + */ +static Datum * +jsonAnalyzeBuildPathStatsArray(Jsonb **pstats, int npaths, int *nvals, + const char *prefix, int prefixlen) +{ + Datum *values = palloc(sizeof(Datum) * (npaths + 1)); + JsonbValue *jbvprefix = palloc(sizeof(JsonbValue)); + int i; + + JsonValueInitStringWithLen(jbvprefix, + memcpy(palloc(prefixlen), prefix, prefixlen), + prefixlen); + + values[0] = JsonbPGetDatum(JsonbValueToJsonb(jbvprefix)); + + for (i = 0; i < npaths; i++) + values[i + 1] = JsonbPGetDatum(pstats[i]); + + *nvals = npaths + 1; + + return values; +} + +/* + * jsonAnalyzeMakeStats + * Build stavalues jsonb array for the root path prefix. + */ +static Datum * +jsonAnalyzeMakeStats(JsonAnalyzeContext *ctx, Jsonb **paths, + int npaths, int *numvalues) +{ + Datum *values; + MemoryContext oldcxt = MemoryContextSwitchTo(ctx->stats->anl_context); + + values = jsonAnalyzeBuildPathStatsArray(paths, npaths, numvalues, + JSON_PATH_ROOT, JSON_PATH_ROOT_LEN); + + MemoryContextSwitchTo(oldcxt); + + return values; +} + +/* + * jsonAnalyzeBuildSubPathsData + * Build statvalues and stanumbers arrays for the subset of paths starting + * from a given prefix. + * + * pathsDatums[index] should point to the desired path. 
+ */ +bool +jsonAnalyzeBuildSubPathsData(Datum *pathsDatums, int npaths, int index, + const char *path, int pathlen, + bool includeSubpaths, float4 nullfrac, + Datum *pvals, Datum *pnums) +{ + Jsonb **pvalues = palloc(sizeof(*pvalues) * npaths); + Datum *values; + Datum numbers[1]; + JsonbValue pathkey; + int nsubpaths = 0; + int nvalues; + int i; + + JsonValueInitStringWithLen(&pathkey, "path", 4); + + for (i = index; i < npaths; i++) + { + /* Extract path name */ + Jsonb *jb = DatumGetJsonbP(pathsDatums[i]); + JsonbValue *jbv = findJsonbValueFromContainer(&jb->root, JB_FOBJECT, + &pathkey); + + /* Check if path name starts with a given prefix */ + if (!jbv || jbv->type != jbvString || + jbv->val.string.len < pathlen || + memcmp(jbv->val.string.val, path, pathlen)) + break; + + pfree(jbv); + + /* Collect matching path */ + pvalues[nsubpaths] = jb; + + nsubpaths++; + + /* + * The path should go before its subpaths, so if subpaths are not + * needed the loop is broken after the first matching path. + */ + if (!includeSubpaths) + break; + } + + if (!nsubpaths) + { + pfree(pvalues); + return false; + } + + /* Construct new array from the selected paths */ + values = jsonAnalyzeBuildPathStatsArray(pvalues, nsubpaths, &nvalues, + path, pathlen); + *pvals = PointerGetDatum(construct_array(values, nvalues, JSONBOID, -1, + false, 'i')); + + pfree(pvalues); + pfree(values); + + numbers[0] = Float4GetDatum(nullfrac); + *pnums = PointerGetDatum(construct_array(numbers, 1, FLOAT4OID, 4, + true /*FLOAT4PASSBYVAL*/, 'i')); + + return true; +} + +/* + * jsonAnalyzeInit + * Initialize the analyze context so that we can start adding paths. 
+ */ +static void +jsonAnalyzeInit(JsonAnalyzeContext *ctx, VacAttrStats *stats, + AnalyzeAttrFetchFunc fetchfunc, + int samplerows, double totalrows, bool single_pass) +{ + HASHCTL hash_ctl; + + memset(ctx, 0, sizeof(*ctx)); + + ctx->stats = stats; + ctx->fetchfunc = fetchfunc; + ctx->mcxt = CurrentMemoryContext; + ctx->samplerows = samplerows; + ctx->totalrows = totalrows; + ctx->target = stats->attr->attstattarget; + ctx->scalarsOnly = false; + ctx->single_pass = single_pass; + + MemSet(&hash_ctl, 0, sizeof(hash_ctl)); + hash_ctl.keysize = sizeof(JsonPathEntry); + hash_ctl.entrysize = ctx->single_pass ? sizeof(JsonPathAnlStats) : sizeof(JsonPathAnlDocs); + hash_ctl.hash = JsonPathEntryHash; + hash_ctl.match = JsonPathEntryMatch; + hash_ctl.hcxt = ctx->mcxt; + + ctx->pathshash = hash_create("JSON analyze path table", 100, &hash_ctl, + HASH_ELEM | HASH_FUNCTION | HASH_COMPARE | HASH_CONTEXT); + + ctx->root = MemoryContextAllocZero(ctx->mcxt, sizeof(JsonPathAnlStats)); + ctx->root->path.pathstr = JSON_PATH_ROOT; +} + +/* + * jsonAnalyzePass + * One analysis pass over the JSON column. + * + * Performs one analysis pass on the JSON documents, and passes them to the + * custom analyzefunc. + */ +static void +jsonAnalyzePass(JsonAnalyzeContext *ctx, + void (*analyzefunc)(JsonAnalyzeContext *, Jsonb *, void *), + void *analyzearg, + JsonPathDocBitmap *bitmap) +{ + MemoryContext tmpcxt = AllocSetContextCreate(CurrentMemoryContext, + "Json Analyze Pass Context", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + MemoryContext oldcxt = MemoryContextSwitchTo(tmpcxt); + int row_num = -1; + + ctx->null_cnt = 0; + ctx->analyzed_cnt = 0; + ctx->total_width = 0; + + /* Loop over the jsonbs. */ + for (int i = 0; i < (bitmap ? 
bitmap->size : ctx->samplerows); i++)
+	{
+		Datum		value;
+		Jsonb	   *jb;
+		Size		width;
+		bool		isnull;
+
+		vacuum_delay_point();
+
+		if (bitmap)
+		{
+			if (bitmap->is_list)
+				row_num = bitmap->data.list[i];
+			else if (!jsonStatsBitmapNext(bitmap, &row_num))
+				break;
+		}
+		else
+			row_num = i;
+
+		value = ctx->fetchfunc(ctx->stats, row_num, &isnull);
+
+		if (isnull)
+		{
+			/* json is null, just count that */
+			ctx->null_cnt++;
+			continue;
+		}
+
+		width = toast_raw_datum_size(value);
+
+		ctx->total_width += VARSIZE_ANY(DatumGetPointer(value)); /* FIXME raw width? */
+
+		/* Skip too-large values. */
+#define JSON_WIDTH_THRESHOLD (100 * 1024)
+
+		if (width > JSON_WIDTH_THRESHOLD)
+			continue;
+
+		ctx->analyzed_cnt++;
+
+		jb = DatumGetJsonbP(value);
+
+		if (!ctx->single_pass)
+			MemoryContextSwitchTo(oldcxt);
+
+		ctx->current_rownum = row_num;
+		analyzefunc(ctx, jb, analyzearg);
+
+		if (!ctx->single_pass)
+			oldcxt = MemoryContextSwitchTo(tmpcxt);
+
+		MemoryContextReset(tmpcxt);
+	}
+
+	MemoryContextSwitchTo(oldcxt);
+}
+
+/*
+ * compute_json_stats() -- compute statistics for a json column
+ */
+static void
+compute_json_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc,
+				   int samplerows, double totalrows)
+{
+	JsonAnalyzeContext ctx;
+	JsonPathEntry **paths;
+	Jsonb	  **pstats;
+	int			npaths;
+	int			root_analyzed_cnt;
+	int			root_null_cnt;
+	double		root_total_width;
+
+	jsonAnalyzeInit(&ctx, stats, fetchfunc, samplerows, totalrows,
+					false /* FIXME make GUC or simply remove */);
+
+	/*
+	 * Collect and analyze JSON path values in single or multiple passes.
+	 * Single-pass collection is faster but consumes much more memory than
+	 * collecting and analyzing one path per pass.
+ */ + if (ctx.single_pass) + { + /* Collect all values of all paths */ + jsonAnalyzePass(&ctx, jsonAnalyzeCollectPaths, (void *)(intptr_t) true, NULL); + + root_analyzed_cnt = ctx.analyzed_cnt; + root_null_cnt = ctx.null_cnt; + root_total_width = ctx.total_width; + + /* + * Now that we're done with processing the documents, we sort the paths + * we extracted and calculate stats for each of them. + * + * XXX I wonder if we could do this in two phases, to maybe not collect + * (or even accumulate) values for paths that are not interesting. + */ + paths = jsonAnalyzeSortPaths(&ctx, &npaths); + pstats = palloc(sizeof(*pstats) * npaths); + + for (int i = 0; i < npaths; i++) + { + JsonPathAnlStats *astats = (JsonPathAnlStats *) paths[i]; + JsonPathAnlStats *parent = (JsonPathAnlStats *) paths[i]->parent; + JsonPathParentStats parent_stats; + + if (parent) + { + parent_stats.freq = parent->freq; + parent_stats.count = parent->vstats.jsons.values.count; + parent_stats.narrays = parent->vstats.narrays; + } + + pstats[i] = jsonAnalyzePath(&ctx, astats, + parent ? &parent_stats : NULL); + } + } + else + { + MemoryContext oldcxt; + MemoryContext tmpcxt = AllocSetContextCreate(CurrentMemoryContext, + "Json Analyze Tmp Context", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + JsonPathParentStats *stack; + + elog(DEBUG1, "analyzing %s attribute \"%s\"", + stats->attrtypid == JSONBOID ? 
"jsonb" : "json", + NameStr(stats->attr->attname)); + + elog(DEBUG1, "collecting json paths"); + + oldcxt = MemoryContextSwitchTo(tmpcxt); + + /* Collect all paths first without accumulating any Values, sort them */ + jsonAnalyzePass(&ctx, jsonAnalyzeCollectPaths, (void *)(intptr_t) false, NULL); + paths = jsonAnalyzeSortPaths(&ctx, &npaths); + pstats = MemoryContextAlloc(oldcxt, sizeof(*pstats) * npaths); + stack = MemoryContextAlloc(oldcxt, sizeof(*stack) * (ctx.maxdepth + 1)); + + root_analyzed_cnt = ctx.analyzed_cnt; + root_null_cnt = ctx.null_cnt; + root_total_width = ctx.total_width; + + /* + * Next, process each path independently to save memory (we don't want + * to accumulate all values for all paths, with a lot of duplicities). + */ + MemoryContextReset(tmpcxt); + + for (int i = 0; i < npaths; i++) + { + JsonPathEntry *path = paths[i]; + JsonPathAnlStats astats_tmp; + JsonPathAnlStats *astats; + + if (!i) + astats = ctx.root; + else + { + astats = &astats_tmp; + jsonStatsAnlInit(astats); + astats->path = *path; + } + + elog(DEBUG1, "analyzing json path (%d/%d) %s", + i + 1, npaths, path->pathstr); + + jsonAnalyzePass(&ctx, jsonAnalyzeCollectPath, astats, + /* root has no bitmap */ + i > 0 ? &((JsonPathAnlDocs *) path)->bitmap : NULL); + + pstats[i] = jsonAnalyzePath(&ctx, astats, + path->depth ? &stack[path->depth - 1] : NULL); + + /* Save parent stats in the stack */ + stack[path->depth].freq = astats->freq; + stack[path->depth].count = astats->vstats.jsons.values.count; + stack[path->depth].narrays = astats->vstats.narrays; + + MemoryContextReset(tmpcxt); + } + + MemoryContextSwitchTo(oldcxt); + + MemoryContextDelete(tmpcxt); + } + + /* We can only compute real stats if we found some non-null values. 
*/
+	if (root_null_cnt >= samplerows)
+	{
+		/* We found only nulls; assume the column is entirely null */
+		stats->stats_valid = true;
+		stats->stanullfrac = 1.0;
+		stats->stawidth = 0;	/* "unknown" */
+		stats->stadistinct = 0.0;	/* "unknown" */
+	}
+	else if (!root_analyzed_cnt)
+	{
+		int			nonnull_cnt = samplerows - root_null_cnt;
+
+		/* We found some non-null values, but they were all too wide */
+		stats->stats_valid = true;
+		/* Do the simple null-frac and width stats */
+		stats->stanullfrac = (double) root_null_cnt / (double) samplerows;
+		stats->stawidth = root_total_width / (double) nonnull_cnt;
+		/* Assume all too-wide values are distinct, so it's a unique column */
+		stats->stadistinct = -1.0 * (1.0 - stats->stanullfrac);
+	}
+	else
+	{
+		VacAttrStats *jsstats = &ctx.root->vstats.jsons.stats;
+		int			i;
+		int			empty_slot = -1;
+
+		stats->stats_valid = true;
+
+		stats->stanullfrac = jsstats->stanullfrac;
+		stats->stawidth = jsstats->stawidth;
+		stats->stadistinct = jsstats->stadistinct;
+
+		/*
+		 * We need to store the statistics in the statistics slots. We simply
+		 * store the regular stats in the first slots, and then we put the
+		 * JSON stats into the first empty slot.
+ */ + for (i = 0; i < STATISTIC_NUM_SLOTS; i++) + { + /* once we hit an empty slot, we're done */ + if (!jsstats->staop[i]) + { + empty_slot = i; /* remember the empty slot */ + break; + } + + stats->stakind[i] = jsstats->stakind[i]; + stats->staop[i] = jsstats->staop[i]; + stats->stanumbers[i] = jsstats->stanumbers[i]; + stats->stavalues[i] = jsstats->stavalues[i]; + stats->statypid[i] = jsstats->statypid[i]; + stats->statyplen[i] = jsstats->statyplen[i]; + stats->statypbyval[i] = jsstats->statypbyval[i]; + stats->statypalign[i] = jsstats->statypalign[i]; + stats->numnumbers[i] = jsstats->numnumbers[i]; + stats->numvalues[i] = jsstats->numvalues[i]; + } + + Assert((empty_slot >= 0) && (empty_slot < STATISTIC_NUM_SLOTS)); + + stats->stakind[empty_slot] = STATISTIC_KIND_JSON; + stats->staop[empty_slot] = InvalidOid; + stats->numnumbers[empty_slot] = 1; + stats->stanumbers[empty_slot] = MemoryContextAlloc(stats->anl_context, + sizeof(float4)); + stats->stanumbers[empty_slot][0] = 0.0; /* nullfrac */ + stats->stavalues[empty_slot] = + jsonAnalyzeMakeStats(&ctx, pstats, npaths, + &stats->numvalues[empty_slot]); + + /* We are storing jsonb values */ + stats->statypid[empty_slot] = JSONBOID; + get_typlenbyvalalign(stats->statypid[empty_slot], + &stats->statyplen[empty_slot], + &stats->statypbyval[empty_slot], + &stats->statypalign[empty_slot]); + } +} + +/* + * json_typanalyze -- typanalyze function for jsonb + */ +Datum +jsonb_typanalyze(PG_FUNCTION_ARGS) +{ + VacAttrStats *stats = (VacAttrStats *) PG_GETARG_POINTER(0); + Form_pg_attribute attr = stats->attr; + + /* If the attstattarget column is negative, use the default value */ + /* NB: it is okay to scribble on stats->attr since it's a copy */ + if (attr->attstattarget < 0) + attr->attstattarget = default_statistics_target; + + stats->compute_stats = compute_json_stats; + /* see comment about the choice of minrows in commands/analyze.c */ + stats->minrows = 300 * attr->attstattarget; + + PG_RETURN_BOOL(true); +} 
diff --git a/src/backend/utils/adt/jsonpath_exec.c b/src/backend/utils/adt/jsonpath_exec.c index c55b3aa..613d16b 100644 --- a/src/backend/utils/adt/jsonpath_exec.c +++ b/src/backend/utils/adt/jsonpath_exec.c @@ -1793,7 +1793,7 @@ executeLikeRegex(JsonPathItem *jsp, JsonbValue *str, JsonbValue *rarg, cxt->cflags = jspConvertRegexFlags(jsp->content.like_regex.flags); } - if (RE_compile_and_execute(cxt->regex, str->val.string.val, + if (RE_compile_and_execute(cxt->regex, unconstify(char *, str->val.string.val), str->val.string.len, cxt->cflags, DEFAULT_COLLATION_OID, 0, NULL)) return jpbTrue; diff --git a/src/include/catalog/pg_operator.dat b/src/include/catalog/pg_operator.dat index 8e0e65a..9805bb1 100644 --- a/src/include/catalog/pg_operator.dat +++ b/src/include/catalog/pg_operator.dat @@ -3175,7 +3175,7 @@ { oid => '3211', oid_symbol => 'JsonbObjectFieldOperator', descr => 'get jsonb object field', oprname => '->', oprleft => 'jsonb', oprright => 'text', oprresult => 'jsonb', - oprcode => 'jsonb_object_field' }, + oprcode => 'jsonb_object_field', oprstat => 'jsonb_stats' }, { oid => '3477', oid_symbol => 'JsonbObjectFieldTextOperator', descr => 'get jsonb object field as text', oprname => '->>', oprleft => 'jsonb', oprright => 'text', oprresult => 'text', @@ -3183,7 +3183,7 @@ { oid => '3212', oid_symbol => 'JsonbArrayElementOperator', descr => 'get jsonb array element', oprname => '->', oprleft => 'jsonb', oprright => 'int4', oprresult => 'jsonb', - oprcode => 'jsonb_array_element' }, + oprcode => 'jsonb_array_element', oprstat => 'jsonb_stats' }, { oid => '3481', oid_symbol => 'JsonbArrayElementTextOperator', descr => 'get jsonb array element as text', oprname => '->>', oprleft => 'jsonb', oprright => 'int4', oprresult => 'text', @@ -3191,7 +3191,8 @@ { oid => '3213', oid_symbol => 'JsonbExtractPathOperator', descr => 'get value from jsonb with path elements', oprname => '#>', oprleft => 'jsonb', oprright => '_text', - oprresult => 'jsonb', oprcode => 
'jsonb_extract_path' }, + oprresult => 'jsonb', oprcode => 'jsonb_extract_path', + oprstat => 'jsonb_stats' }, { oid => '3206', oid_symbol => 'JsonbExtractPathTextOperator', descr => 'get value from jsonb as text with path elements', oprname => '#>>', oprleft => 'jsonb', oprright => '_text', @@ -3229,23 +3230,23 @@ { oid => '3246', oid_symbol => 'JsonbContainsOperator', descr => 'contains', oprname => '@>', oprleft => 'jsonb', oprright => 'jsonb', oprresult => 'bool', oprcom => '<@(jsonb,jsonb)', oprcode => 'jsonb_contains', - oprrest => 'matchingsel', oprjoin => 'matchingjoinsel' }, + oprrest => 'jsonb_sel', oprjoin => 'matchingjoinsel' }, { oid => '3247', oid_symbol => 'JsonbExistsOperator', descr => 'key exists', oprname => '?', oprleft => 'jsonb', oprright => 'text', oprresult => 'bool', - oprcode => 'jsonb_exists', oprrest => 'matchingsel', + oprcode => 'jsonb_exists', oprrest => 'jsonb_sel', oprjoin => 'matchingjoinsel' }, { oid => '3248', oid_symbol => 'JsonbExistsAnyOperator', descr => 'any key exists', oprname => '?|', oprleft => 'jsonb', oprright => '_text', oprresult => 'bool', - oprcode => 'jsonb_exists_any', oprrest => 'matchingsel', + oprcode => 'jsonb_exists_any', oprrest => 'jsonb_sel', oprjoin => 'matchingjoinsel' }, { oid => '3249', oid_symbol => 'JsonbExistsAllOperator', descr => 'all keys exist', oprname => '?&', oprleft => 'jsonb', oprright => '_text', oprresult => 'bool', - oprcode => 'jsonb_exists_all', oprrest => 'matchingsel', + oprcode => 'jsonb_exists_all', oprrest => 'jsonb_sel', oprjoin => 'matchingjoinsel' }, { oid => '3250', oid_symbol => 'JsonbContainedOperator', descr => 'is contained by', oprname => '<@', oprleft => 'jsonb', oprright => 'jsonb', oprresult => 'bool', oprcom => '@>(jsonb,jsonb)', oprcode => 'jsonb_contained', - oprrest => 'matchingsel', oprjoin => 'matchingjoinsel' }, + oprrest => 'jsonb_sel', oprjoin => 'matchingjoinsel' }, { oid => '3284', descr => 'concatenate', oprname => '||', oprleft => 'jsonb', oprright => 
'jsonb', oprresult => 'jsonb', oprcode => 'jsonb_concat' },
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index 2530443..30ece2c 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -11839,4 +11839,15 @@
   prorettype => 'bytea', proargtypes => 'pg_brin_minmax_multi_summary',
   prosrc => 'brin_minmax_multi_summary_send' },
 
+# jsonb statistics
+{ oid => '8526', descr => 'jsonb typanalyze',
+  proname => 'jsonb_typanalyze', provolatile => 's', prorettype => 'bool',
+  proargtypes => 'internal', prosrc => 'jsonb_typanalyze' },
+{ oid => '8527', descr => 'jsonb selectivity estimation',
+  proname => 'jsonb_sel', provolatile => 's', prorettype => 'float8',
+  proargtypes => 'internal oid internal int4', prosrc => 'jsonb_sel' },
+{ oid => '8528', descr => 'jsonb statistics estimation',
+  proname => 'jsonb_stats', provolatile => 's', prorettype => 'void',
+  proargtypes => 'internal internal int4 internal', prosrc => 'jsonb_stats' },
+
 ]
diff --git a/src/include/catalog/pg_statistic.h b/src/include/catalog/pg_statistic.h
index cdf7448..c4f53eb 100644
--- a/src/include/catalog/pg_statistic.h
+++ b/src/include/catalog/pg_statistic.h
@@ -277,6 +277,8 @@ DECLARE_FOREIGN_KEY((starelid, staattnum), pg_attribute, (attrelid, attnum));
  */
 #define STATISTIC_KIND_BOUNDS_HISTOGRAM	7
 
+#define STATISTIC_KIND_JSON	8
+
 #endif							/* EXPOSE_TO_CLIENT_CODE */
 
 #endif							/* PG_STATISTIC_H */
diff --git a/src/include/catalog/pg_type.dat b/src/include/catalog/pg_type.dat
index df45879..b867db4 100644
--- a/src/include/catalog/pg_type.dat
+++ b/src/include/catalog/pg_type.dat
@@ -445,7 +445,7 @@
   typname => 'jsonb', typlen => '-1', typbyval => 'f', typcategory => 'U',
   typsubscript => 'jsonb_subscript_handler', typinput => 'jsonb_in',
   typoutput => 'jsonb_out', typreceive => 'jsonb_recv', typsend => 'jsonb_send',
-  typalign => 'i', typstorage => 'x' },
+  typanalyze => 'jsonb_typanalyze', typalign => 'i', typstorage => 'x' },
 { oid => '4072', 
array_type_oid => '4073', descr => 'JSON path', typname => 'jsonpath', typlen => '-1', typbyval => 'f', typcategory => 'U', typinput => 'jsonpath_in', typoutput => 'jsonpath_out', diff --git a/src/include/utils/json_selfuncs.h b/src/include/utils/json_selfuncs.h new file mode 100644 index 0000000..9a36567 --- /dev/null +++ b/src/include/utils/json_selfuncs.h @@ -0,0 +1,113 @@ +/*------------------------------------------------------------------------- + * + * json_selfuncs.h + * JSON cost estimation functions. + * + * + * Portions Copyright (c) 2016-2022, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/include/utils/json_selfuncs.h + * + *------------------------------------------------------------------------- + */ + +#ifndef JSON_SELFUNCS_H_ +#define JSON_SELFUNCS_H 1 + +#include "postgres.h" +#include "access/htup.h" +#include "utils/jsonb.h" +#include "utils/lsyscache.h" +#include "utils/selfuncs.h" + +#define JSON_PATH_ROOT "$" +#define JSON_PATH_ROOT_LEN 1 + +#define JSON_PATH_ROOT_ARRAY "$[*]" +#define JSON_PATH_ROOT_ARRAY_LEN 4 + +typedef enum +{ + JsonPathStatsValues, + JsonPathStatsArrayLength, + JsonPathStatsObjectLength +} JsonPathStatsType; + +typedef struct JsonStatData JsonStatData, *JsonStats; + +/* Per-path JSON stats */ +typedef struct JsonPathStatsData +{ + JsonStats data; /* pointer to per-column control structure */ + Datum *datum; /* pointer to JSONB datum with stats data */ + const char *path; /* path string, points directly to JSONB data */ + int pathlen; /* path length */ + JsonPathStatsType type; /* type of stats (values, lengths etc.) 
*/ +} JsonPathStatsData, *JsonPathStats; + +/* Per-column JSON stats */ +struct JsonStatData +{ + HeapTuple statsTuple; /* original pg_statistic tuple */ + AttStatsSlot attslot; /* data extracted from STATISTIC_KIND_JSON + * slot of statsTuple */ + RelOptInfo *rel; /* Relation, or NULL if not identifiable */ + Datum *pathdatums; /* path JSONB datums */ + JsonPathStatsData *paths; /* cached paths */ + int npaths; /* number of paths */ + float4 nullfrac; /* NULL fraction */ + const char *prefix; /* global path prefix which needs to be used + * for searching in pathdatums */ + int prefixlen; /* path prefix length */ + bool acl_ok; /* ACL check is Ok */ +}; + +typedef enum JsonStatType +{ + JsonStatJsonb, + JsonStatJsonbWithoutSubpaths, + JsonStatText, + JsonStatFloat4, + JsonStatString, + JsonStatNumeric, + JsonStatFreq, +} JsonStatType; + +extern bool jsonStatsInit(JsonStats stats, const VariableStatData *vardata); +extern void jsonStatsRelease(JsonStats data); + +extern JsonPathStats jsonStatsGetPathByStr(JsonStats stats, + const char *path, int pathlen); + +extern JsonPathStats jsonPathStatsGetSubpath(JsonPathStats stats, + const char *subpath); + +extern bool jsonPathStatsGetNextSubpathStats(JsonPathStats stats, + JsonPathStats *keystats, + bool keysOnly); + +extern JsonPathStats jsonPathStatsGetArrayLengthStats(JsonPathStats pstats); +extern JsonPathStats jsonPathStatsGetObjectLengthStats(JsonPathStats pstats); + +extern float4 jsonPathStatsGetFreq(JsonPathStats pstats, float4 defaultfreq); + +extern float4 jsonPathStatsGetTypeFreq(JsonPathStats pstats, + JsonbValueType type, float4 defaultfreq); + +extern float4 jsonPathStatsGetAvgArraySize(JsonPathStats pstats); + +extern Selectivity jsonPathStatsGetArrayIndexSelectivity(JsonPathStats pstats, + int index); + +extern Selectivity jsonSelectivity(JsonPathStats stats, Datum scalar, Oid oper); + +extern void jsonPathAppendEntry(StringInfo path, const char *entry); + +extern bool jsonAnalyzeBuildSubPathsData(Datum 
*pathsDatums, + int npaths, int index, + const char *path, int pathlen, + bool includeSubpaths, float4 nullfrac, + Datum *pvals, Datum *pnums); + +#endif /* JSON_SELFUNCS_H */ diff --git a/src/test/regress/expected/jsonb_stats.out b/src/test/regress/expected/jsonb_stats.out new file mode 100644 index 0000000..c7b1e64 --- /dev/null +++ b/src/test/regress/expected/jsonb_stats.out @@ -0,0 +1,713 @@ +CREATE OR REPLACE FUNCTION explain_jsonb(sql_query text) +RETURNS TABLE(explain_line json) AS +$$ +BEGIN + RETURN QUERY EXECUTE 'EXPLAIN (ANALYZE, FORMAT json) ' || sql_query; +END; +$$ LANGUAGE plpgsql; +CREATE OR REPLACE FUNCTION get_plan_and_actual_rows(sql_query text) +RETURNS TABLE(plan integer, actual integer) AS +$$ + SELECT + (plan->>'Plan Rows')::integer plan, + (plan->>'Actual Rows')::integer actual + FROM ( + SELECT explain_jsonb(sql_query) #> '{0,Plan,Plans,0}' + ) p(plan) +$$ LANGUAGE sql; +CREATE OR REPLACE FUNCTION check_estimate(sql_query text, accuracy real) +RETURNS boolean AS +$$ + SELECT plan BETWEEN actual / (1 + accuracy) AND (actual + 1) * (1 + accuracy) + FROM (SELECT * FROM get_plan_and_actual_rows(sql_query)) x +$$ LANGUAGE sql; +CREATE OR REPLACE FUNCTION check_estimate2(sql_query text, accuracy real) +RETURNS TABLE(min integer, max integer) AS +$$ + SELECT (actual * (1 - accuracy))::integer, ((actual + 1) * (1 + accuracy))::integer + FROM (SELECT * FROM get_plan_and_actual_rows(sql_query)) x +$$ LANGUAGE sql; +CREATE TABLE jsonb_stats_test(js jsonb); +INSERT INTO jsonb_stats_test SELECT NULL FROM generate_series(1, 1000); +INSERT INTO jsonb_stats_test SELECT 'null' FROM generate_series(1, 200); +INSERT INTO jsonb_stats_test SELECT 'true' FROM generate_series(1, 300); +INSERT INTO jsonb_stats_test SELECT 'false' FROM generate_series(1, 500); +INSERT INTO jsonb_stats_test SELECT '12345' FROM generate_series(1, 100); +INSERT INTO jsonb_stats_test SELECT (1000 * (i % 10))::text::jsonb FROM generate_series(1, 400) i; +INSERT INTO jsonb_stats_test 
SELECT i::text::jsonb FROM generate_series(1, 500) i; +INSERT INTO jsonb_stats_test SELECT '"foo"' FROM generate_series(1, 100); +INSERT INTO jsonb_stats_test SELECT format('"bar%s"', i % 10)::jsonb FROM generate_series(1, 400) i; +INSERT INTO jsonb_stats_test SELECT format('"baz%s"', i)::jsonb FROM generate_series(1, 500) i; +INSERT INTO jsonb_stats_test SELECT '{}' FROM generate_series(1, 100); +INSERT INTO jsonb_stats_test SELECT jsonb_build_object('foo', 'bar') FROM generate_series(1, 100); +INSERT INTO jsonb_stats_test SELECT jsonb_build_object('foo', 'baz' || (i % 10)) FROM generate_series(1, 300) i; +INSERT INTO jsonb_stats_test SELECT jsonb_build_object('foo', i % 10) FROM generate_series(1, 200) i; +INSERT INTO jsonb_stats_test SELECT jsonb_build_object('"foo \"bar"', i % 10) FROM generate_series(1, 200) i; +INSERT INTO jsonb_stats_test SELECT '[]' FROM generate_series(1, 100); +INSERT INTO jsonb_stats_test SELECT '["foo"]' FROM generate_series(1, 200); +INSERT INTO jsonb_stats_test SELECT '[12345]' FROM generate_series(1, 300); +INSERT INTO jsonb_stats_test SELECT '[["foo"]]' FROM generate_series(1, 200); +INSERT INTO jsonb_stats_test SELECT '[{"key": "foo"}]' FROM generate_series(1, 200); +INSERT INTO jsonb_stats_test SELECT '[null, "foo"]' FROM generate_series(1, 200); +INSERT INTO jsonb_stats_test SELECT '[null, 12345]' FROM generate_series(1, 300); +INSERT INTO jsonb_stats_test SELECT '[null, ["foo"]]' FROM generate_series(1, 200); +INSERT INTO jsonb_stats_test SELECT '[null, {"key": "foo"}]' FROM generate_series(1, 200); +-- Build random variable-length integer arrays +SELECT setseed(0.0); + setseed +--------- + +(1 row) + +INSERT INTO jsonb_stats_test +SELECT jsonb_build_object('array', + jsonb_build_array()) +FROM generate_series(1, 1000); +INSERT INTO jsonb_stats_test +SELECT jsonb_build_object('array', + jsonb_build_array( + floor(random() * 10)::int)) +FROM generate_series(1, 4000); +INSERT INTO jsonb_stats_test +SELECT 
jsonb_build_object('array', + jsonb_build_array( + floor(random() * 10)::int, + floor(random() * 10)::int)) +FROM generate_series(1, 3000); +INSERT INTO jsonb_stats_test +SELECT jsonb_build_object('array', + jsonb_build_array( + floor(random() * 10)::int, + floor(random() * 10)::int, + floor(random() * 10)::int)) +FROM generate_series(1, 2000); +ANALYZE jsonb_stats_test; +CREATE OR REPLACE FUNCTION check_jsonb_stats_test_estimate(sql_condition text, accuracy real) +RETURNS boolean AS +$$ + SELECT check_estimate('SELECT count(*) FROM jsonb_stats_test WHERE ' || sql_condition, accuracy) +$$ LANGUAGE sql; +DROP FUNCTION IF EXISTS check_jsonb_stats_test_estimate2(text, real); +NOTICE: function check_jsonb_stats_test_estimate2(text,pg_catalog.float4) does not exist, skipping +CREATE OR REPLACE FUNCTION check_jsonb_stats_test_estimate2(sql_condition text, accuracy real) +RETURNS TABLE(plan integer, actual integer) AS +$$ + SELECT get_plan_and_actual_rows('SELECT count(*) FROM jsonb_stats_test WHERE ' || sql_condition) +$$ LANGUAGE sql; +-- Check NULL estimate +SELECT check_jsonb_stats_test_estimate($$js IS NULL$$, 0.03); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js -> 'bad_key' IS NULL$$, 0.01); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js #> '{bad_key}' IS NULL$$, 0.01); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js -> 1000000 IS NULL$$, 0.01); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js #> '{1000000}' IS NULL$$, 0.01); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js -> 'bad_key1' -> 'bad_key2' IS NULL$$, 0.01); + check_jsonb_stats_test_estimate 
+--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js #> '{bad_key1,bad_key2}' IS NULL$$, 0.01); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js -> 'bad_key1' -> 1 IS NULL$$, 0.01); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js #> '{bad_key1,1}' IS NULL$$, 0.01); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js -> 1000000 -> 'foo' IS NULL$$, 0.01); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js #> '{1000000,foo}' IS NULL$$, 0.01); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js -> 'bad_key' = '123'$$, 0.01); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js -> 1000000 = '123'$$, 0.01); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +-- Check null eq estimate +SELECT check_jsonb_stats_test_estimate($$js = 'null'$$, 0.1); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js @> 'null'$$, 0.1); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +-- Check boolean eq estimate +SELECT check_jsonb_stats_test_estimate($$js = 'true'$$, 0.1); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js @> 'true'$$, 0.1); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js = 'false'$$, 0.1); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + 
+SELECT check_jsonb_stats_test_estimate($$js @> 'false'$$, 0.1); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +-- Check numeric eq estimate +SELECT check_jsonb_stats_test_estimate($$js = '12345'$$, 0.1); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js#>'{}' = '12345'$$, 0.1); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js = '3000'$$, 0.3); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js = '1234'$$, 1); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js @> '6000'$$, 0.2); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +-- Check numeric range estimate +SELECT check_jsonb_stats_test_estimate($$js < '0'$$, 0.1); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js < '100'$$, 0.1); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js < '1000'$$, 0.1); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js < '3456'$$, 0.1); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js < '10000'$$, 0.1); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js < '100000'$$, 0.1); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js > '100' AND js < '600'$$, 0.5); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + 
+SELECT check_jsonb_stats_test_estimate($$js > '6800' AND js < '12000'$$, 0.1); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +-- Check string eq estimate +SELECT check_jsonb_stats_test_estimate($$js = '"foo"'$$, 0.1); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js = '"bar7"'$$, 0.2); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js = '"baz1234"'$$, 10); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js @> '"bar4"'$$, 0.3); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +-- Check string range estimate +SELECT check_jsonb_stats_test_estimate($$js > '"foo"'$$, 0.01); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js > '"bar"'$$, 0.01); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js > '"baz"'$$, 0.01); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +-- Check object eq estimate +SELECT check_jsonb_stats_test_estimate($$js = '{}'$$, 0.1); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js > '{}'$$, 0.1); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +-- Check object key eq estimate +SELECT check_jsonb_stats_test_estimate($$js -> 'foo' = '"bar"'$$, 0.2); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js -> 'foo' = '"baz"'$$, 0.1); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js -> 'foo' = 
'"baz5"'$$, 0.3); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js #> '{foo}' = '"bar"'$$, 0.2); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +-- Check object key range estimate +SELECT check_jsonb_stats_test_estimate($$js -> 'foo' >= '"baz2"'$$, 0.1); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js -> 'foo' < '"baz9"'$$, 0.1); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js -> 'foo' >= '"baz2"' AND js -> 'foo' < '"baz9"'$$, 0.1); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +-- Check array eq estimate +SELECT check_jsonb_stats_test_estimate($$js = '[]'$$, 0.1); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js >= '[]' AND js < '{}'$$, 0.1); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +-- Check variable-length array element eq estimate +SELECT check_jsonb_stats_test_estimate($$js -> 'array' -> 0 = '1'$$, 0.2); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js -> 'array' -> 1 = '6'$$, 0.2); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js -> 'array' -> 2 = '8'$$, 0.2); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js -> 'array' -> 3 = '1'$$, 0.2); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +-- Check variable-length array element range estimate +SELECT check_jsonb_stats_test_estimate($$js -> 'array' -> 0 < '7'$$, 0.1); + check_jsonb_stats_test_estimate 
+--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js -> 'array' -> 1 < '7'$$, 0.1); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js -> 'array' -> 2 < '7'$$, 0.1); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js -> 'array' -> 3 < '7'$$, 0.1); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +-- Check variable-length array containment estimate +SELECT check_jsonb_stats_test_estimate($$js -> 'array' @> '[]'$$, 0.2); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js -> 'array' @> '[1]'$$, 0.2); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js -> 'array' @> '[100]'$$, 0.2); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js -> 'array' @> '[1, 2]'$$, 1); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js -> 'array' @> '[1, 100]'$$, 1); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js -> 'array' @> '[1, 2, 100]'$$, 1); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js -> 'array' @> '[1, 2, 3]'$$, 5); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js -> 'array' @> '1'$$, 0.3); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js -> 'array' @> '100'$$, 10); + check_jsonb_stats_test_estimate 
+--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js -> 'array' -> 0 @> '1'$$, 0.3); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js -> 'array' -> 1 @> '1'$$, 0.3); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js -> 'array' -> 2 @> '1'$$, 0.3); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js -> 'array' -> 3 @> '1'$$, 0.3); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js -> 'array' -> 0 @> '[1]'$$, 0.3); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js @> '{"array": []}'$$, 0.1); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js @> '{"array": [1]}'$$, 0.3); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js @> '{"array": [100]}'$$, 0.3); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js @> '{"array": [1, 2]}'$$, 1); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js @> '{"array": [1, 100]}'$$, 1); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js @> '{"array": [1, 2, 100]}'$$, 1); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js @> '{"array": [1, 2, 3]}'$$, 3); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +-- check misc 
containment +SELECT check_jsonb_stats_test_estimate($$js @> '"foo"'$$, 0.1); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js @> '12345'$$, 0.1); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js @> '[]'$$, 0.1); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js @> '[12345]'$$, 0.1); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js @> '["foo"]'$$, 0.1); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js @> '[["foo", "bar"]]'$$, 0.1); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js @> '[["foo"]]'$$, 0.2); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js @> '[{"key": "foo"}]'$$, 0.1); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js @> '[null]'$$, 0.3); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +-- Check object key null estimate +SELECT check_jsonb_stats_test_estimate($$js -> 'foo' IS NULL$$, 0.1); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js -> 'foo' IS NOT NULL$$, 0.1); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js -> '"foo \"bar"' IS NOT NULL$$, 0.1); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js -> 'bad_key' IS NULL$$, 0.01); + 
check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js -> 'bad_key' IS NOT NULL$$, 0.01); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +-- Check object key existence +SELECT check_jsonb_stats_test_estimate($$js ? 'bad_key'$$, 10); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js ? 'foo'$$, 0.1); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js ? 'array'$$, 0.1); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js ?| '{foo,bad_key}'$$, 0.1); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js ?| '{foo,array}'$$, 0.1); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js ?& '{foo,bad_key}'$$, 0.1); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + +SELECT check_jsonb_stats_test_estimate($$js ?& '{foo,bar}'$$, 0.1); + check_jsonb_stats_test_estimate +--------------------------------- + t +(1 row) + diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index 423b9b9..05e5a4b 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -2523,6 +2523,38 @@ pg_stats_ext_exprs| SELECT cn.nspname AS schemaname, LEFT JOIN pg_namespace sn ON ((sn.oid = s.stxnamespace))) JOIN LATERAL ( SELECT unnest(pg_get_statisticsobjdef_expressions(s.oid)) AS expr, unnest(sd.stxdexpr) AS a) stat ON ((stat.expr IS NOT NULL))); +pg_stats_json| SELECT n.nspname AS schemaname, + c.relname AS tablename, + a.attname, + (paths.path ->> 'path'::text) AS json_path, + s.stainherit AS inherited, + (((paths.path 
-> 'json'::text) ->> 'nullfrac'::text))::real AS null_frac, + (((paths.path -> 'json'::text) ->> 'width'::text))::real AS avg_width, + (((paths.path -> 'json'::text) ->> 'distinct'::text))::real AS n_distinct, + ARRAY( SELECT val.value AS val + FROM jsonb_array_elements((((paths.path -> 'json'::text) -> 'mcv'::text) -> 'values'::text)) val(value)) AS most_common_vals, + ARRAY( SELECT ((num.value)::text)::real AS num + FROM jsonb_array_elements((((paths.path -> 'json'::text) -> 'mcv'::text) -> 'numbers'::text)) num(value)) AS most_common_freqs, + ARRAY( SELECT val.value AS val + FROM jsonb_array_elements((((paths.path -> 'json'::text) -> 'histogram'::text) -> 'values'::text)) val(value)) AS histogram_bounds, + ARRAY( SELECT ((val.value)::text)::integer AS val + FROM jsonb_array_elements((((paths.path -> 'array_length'::text) -> 'mcv'::text) -> 'values'::text)) val(value)) AS most_common_array_lengths, + ARRAY( SELECT ((num.value)::text)::real AS num + FROM jsonb_array_elements((((paths.path -> 'array_length'::text) -> 'mcv'::text) -> 'numbers'::text)) num(value)) AS most_common_array_length_freqs, + (((paths.path -> 'json'::text) ->> 'correlation'::text))::real AS correlation + FROM (((pg_statistic s + JOIN pg_class c ON ((c.oid = s.starelid))) + JOIN pg_attribute a ON (((c.oid = a.attrelid) AND (a.attnum = s.staattnum)))) + LEFT JOIN pg_namespace n ON ((n.oid = c.relnamespace))), + LATERAL ( SELECT unnest(((( + CASE + WHEN (s.stakind1 = 8) THEN s.stavalues1 + WHEN (s.stakind2 = 8) THEN s.stavalues2 + WHEN (s.stakind3 = 8) THEN s.stavalues3 + WHEN (s.stakind4 = 8) THEN s.stavalues4 + WHEN (s.stakind5 = 8) THEN s.stavalues5 + ELSE NULL::anyarray + END)::text)::jsonb[])[2:]) AS path) paths; pg_tables| SELECT n.nspname AS schemaname, c.relname AS tablename, pg_get_userbyid(c.relowner) AS tableowner, diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index 5030d19..2c8fc11 100644 --- a/src/test/regress/parallel_schedule +++ 
b/src/test/regress/parallel_schedule @@ -111,7 +111,7 @@ test: select_views portals_p2 foreign_key cluster dependency guc bitmapops combo # ---------- # Another group of parallel tests (JSON related) # ---------- -test: json jsonb json_encoding jsonpath jsonpath_encoding jsonb_jsonpath sqljson json_sqljson jsonb_sqljson +test: json jsonb json_encoding jsonpath jsonpath_encoding jsonb_jsonpath sqljson json_sqljson jsonb_sqljson jsonb_stats # ---------- # Another group of parallel tests diff --git a/src/test/regress/sql/jsonb_stats.sql b/src/test/regress/sql/jsonb_stats.sql new file mode 100644 index 0000000..fac71d0 --- /dev/null +++ b/src/test/regress/sql/jsonb_stats.sql @@ -0,0 +1,249 @@ +CREATE OR REPLACE FUNCTION explain_jsonb(sql_query text) +RETURNS TABLE(explain_line json) AS +$$ +BEGIN + RETURN QUERY EXECUTE 'EXPLAIN (ANALYZE, FORMAT json) ' || sql_query; +END; +$$ LANGUAGE plpgsql; + +CREATE OR REPLACE FUNCTION get_plan_and_actual_rows(sql_query text) +RETURNS TABLE(plan integer, actual integer) AS +$$ + SELECT + (plan->>'Plan Rows')::integer plan, + (plan->>'Actual Rows')::integer actual + FROM ( + SELECT explain_jsonb(sql_query) #> '{0,Plan,Plans,0}' + ) p(plan) +$$ LANGUAGE sql; + +CREATE OR REPLACE FUNCTION check_estimate(sql_query text, accuracy real) +RETURNS boolean AS +$$ + SELECT plan BETWEEN actual / (1 + accuracy) AND (actual + 1) * (1 + accuracy) + FROM (SELECT * FROM get_plan_and_actual_rows(sql_query)) x +$$ LANGUAGE sql; + +CREATE OR REPLACE FUNCTION check_estimate2(sql_query text, accuracy real) +RETURNS TABLE(min integer, max integer) AS +$$ + SELECT (actual * (1 - accuracy))::integer, ((actual + 1) * (1 + accuracy))::integer + FROM (SELECT * FROM get_plan_and_actual_rows(sql_query)) x +$$ LANGUAGE sql; + +CREATE TABLE jsonb_stats_test(js jsonb); + +INSERT INTO jsonb_stats_test SELECT NULL FROM generate_series(1, 1000); + +INSERT INTO jsonb_stats_test SELECT 'null' FROM generate_series(1, 200); +INSERT INTO jsonb_stats_test SELECT 'true' 
FROM generate_series(1, 300); +INSERT INTO jsonb_stats_test SELECT 'false' FROM generate_series(1, 500); + +INSERT INTO jsonb_stats_test SELECT '12345' FROM generate_series(1, 100); +INSERT INTO jsonb_stats_test SELECT (1000 * (i % 10))::text::jsonb FROM generate_series(1, 400) i; +INSERT INTO jsonb_stats_test SELECT i::text::jsonb FROM generate_series(1, 500) i; + +INSERT INTO jsonb_stats_test SELECT '"foo"' FROM generate_series(1, 100); +INSERT INTO jsonb_stats_test SELECT format('"bar%s"', i % 10)::jsonb FROM generate_series(1, 400) i; +INSERT INTO jsonb_stats_test SELECT format('"baz%s"', i)::jsonb FROM generate_series(1, 500) i; + +INSERT INTO jsonb_stats_test SELECT '{}' FROM generate_series(1, 100); +INSERT INTO jsonb_stats_test SELECT jsonb_build_object('foo', 'bar') FROM generate_series(1, 100); +INSERT INTO jsonb_stats_test SELECT jsonb_build_object('foo', 'baz' || (i % 10)) FROM generate_series(1, 300) i; +INSERT INTO jsonb_stats_test SELECT jsonb_build_object('foo', i % 10) FROM generate_series(1, 200) i; +INSERT INTO jsonb_stats_test SELECT jsonb_build_object('"foo \"bar"', i % 10) FROM generate_series(1, 200) i; + +INSERT INTO jsonb_stats_test SELECT '[]' FROM generate_series(1, 100); +INSERT INTO jsonb_stats_test SELECT '["foo"]' FROM generate_series(1, 200); +INSERT INTO jsonb_stats_test SELECT '[12345]' FROM generate_series(1, 300); +INSERT INTO jsonb_stats_test SELECT '[["foo"]]' FROM generate_series(1, 200); +INSERT INTO jsonb_stats_test SELECT '[{"key": "foo"}]' FROM generate_series(1, 200); +INSERT INTO jsonb_stats_test SELECT '[null, "foo"]' FROM generate_series(1, 200); +INSERT INTO jsonb_stats_test SELECT '[null, 12345]' FROM generate_series(1, 300); +INSERT INTO jsonb_stats_test SELECT '[null, ["foo"]]' FROM generate_series(1, 200); +INSERT INTO jsonb_stats_test SELECT '[null, {"key": "foo"}]' FROM generate_series(1, 200); + +-- Build random variable-length integer arrays +SELECT setseed(0.0); + +INSERT INTO jsonb_stats_test +SELECT 
jsonb_build_object('array', + jsonb_build_array()) +FROM generate_series(1, 1000); + +INSERT INTO jsonb_stats_test +SELECT jsonb_build_object('array', + jsonb_build_array( + floor(random() * 10)::int)) +FROM generate_series(1, 4000); + +INSERT INTO jsonb_stats_test +SELECT jsonb_build_object('array', + jsonb_build_array( + floor(random() * 10)::int, + floor(random() * 10)::int)) +FROM generate_series(1, 3000); + +INSERT INTO jsonb_stats_test +SELECT jsonb_build_object('array', + jsonb_build_array( + floor(random() * 10)::int, + floor(random() * 10)::int, + floor(random() * 10)::int)) +FROM generate_series(1, 2000); + + +ANALYZE jsonb_stats_test; + +CREATE OR REPLACE FUNCTION check_jsonb_stats_test_estimate(sql_condition text, accuracy real) +RETURNS boolean AS +$$ + SELECT check_estimate('SELECT count(*) FROM jsonb_stats_test WHERE ' || sql_condition, accuracy) +$$ LANGUAGE sql; + +DROP FUNCTION IF EXISTS check_jsonb_stats_test_estimate2(text, real); + +CREATE OR REPLACE FUNCTION check_jsonb_stats_test_estimate2(sql_condition text, accuracy real) +RETURNS TABLE(plan integer, actual integer) AS +$$ + SELECT get_plan_and_actual_rows('SELECT count(*) FROM jsonb_stats_test WHERE ' || sql_condition) +$$ LANGUAGE sql; + +-- Check NULL estimate +SELECT check_jsonb_stats_test_estimate($$js IS NULL$$, 0.03); +SELECT check_jsonb_stats_test_estimate($$js -> 'bad_key' IS NULL$$, 0.01); +SELECT check_jsonb_stats_test_estimate($$js #> '{bad_key}' IS NULL$$, 0.01); +SELECT check_jsonb_stats_test_estimate($$js -> 1000000 IS NULL$$, 0.01); +SELECT check_jsonb_stats_test_estimate($$js #> '{1000000}' IS NULL$$, 0.01); +SELECT check_jsonb_stats_test_estimate($$js -> 'bad_key1' -> 'bad_key2' IS NULL$$, 0.01); +SELECT check_jsonb_stats_test_estimate($$js #> '{bad_key1,bad_key2}' IS NULL$$, 0.01); +SELECT check_jsonb_stats_test_estimate($$js -> 'bad_key1' -> 1 IS NULL$$, 0.01); +SELECT check_jsonb_stats_test_estimate($$js #> '{bad_key1,1}' IS NULL$$, 0.01); +SELECT 
check_jsonb_stats_test_estimate($$js -> 1000000 -> 'foo' IS NULL$$, 0.01); +SELECT check_jsonb_stats_test_estimate($$js #> '{1000000,foo}' IS NULL$$, 0.01); + +SELECT check_jsonb_stats_test_estimate($$js -> 'bad_key' = '123'$$, 0.01); +SELECT check_jsonb_stats_test_estimate($$js -> 1000000 = '123'$$, 0.01); + +-- Check null eq estimate +SELECT check_jsonb_stats_test_estimate($$js = 'null'$$, 0.1); +SELECT check_jsonb_stats_test_estimate($$js @> 'null'$$, 0.1); + +-- Check boolean eq estimate +SELECT check_jsonb_stats_test_estimate($$js = 'true'$$, 0.1); +SELECT check_jsonb_stats_test_estimate($$js @> 'true'$$, 0.1); +SELECT check_jsonb_stats_test_estimate($$js = 'false'$$, 0.1); +SELECT check_jsonb_stats_test_estimate($$js @> 'false'$$, 0.1); + +-- Check numeric eq estimate +SELECT check_jsonb_stats_test_estimate($$js = '12345'$$, 0.1); +SELECT check_jsonb_stats_test_estimate($$js#>'{}' = '12345'$$, 0.1); +SELECT check_jsonb_stats_test_estimate($$js = '3000'$$, 0.3); +SELECT check_jsonb_stats_test_estimate($$js = '1234'$$, 1); +SELECT check_jsonb_stats_test_estimate($$js @> '6000'$$, 0.2); + +-- Check numeric range estimate +SELECT check_jsonb_stats_test_estimate($$js < '0'$$, 0.1); +SELECT check_jsonb_stats_test_estimate($$js < '100'$$, 0.1); +SELECT check_jsonb_stats_test_estimate($$js < '1000'$$, 0.1); +SELECT check_jsonb_stats_test_estimate($$js < '3456'$$, 0.1); +SELECT check_jsonb_stats_test_estimate($$js < '10000'$$, 0.1); +SELECT check_jsonb_stats_test_estimate($$js < '100000'$$, 0.1); +SELECT check_jsonb_stats_test_estimate($$js > '100' AND js < '600'$$, 0.5); +SELECT check_jsonb_stats_test_estimate($$js > '6800' AND js < '12000'$$, 0.1); + +-- Check string eq estimate +SELECT check_jsonb_stats_test_estimate($$js = '"foo"'$$, 0.1); +SELECT check_jsonb_stats_test_estimate($$js = '"bar7"'$$, 0.2); +SELECT check_jsonb_stats_test_estimate($$js = '"baz1234"'$$, 10); +SELECT check_jsonb_stats_test_estimate($$js @> '"bar4"'$$, 0.3); + +-- Check string range 
estimate +SELECT check_jsonb_stats_test_estimate($$js > '"foo"'$$, 0.01); +SELECT check_jsonb_stats_test_estimate($$js > '"bar"'$$, 0.01); +SELECT check_jsonb_stats_test_estimate($$js > '"baz"'$$, 0.01); + +-- Check object eq estimate +SELECT check_jsonb_stats_test_estimate($$js = '{}'$$, 0.1); +SELECT check_jsonb_stats_test_estimate($$js > '{}'$$, 0.1); + +-- Check object key eq estimate +SELECT check_jsonb_stats_test_estimate($$js -> 'foo' = '"bar"'$$, 0.2); +SELECT check_jsonb_stats_test_estimate($$js -> 'foo' = '"baz"'$$, 0.1); +SELECT check_jsonb_stats_test_estimate($$js -> 'foo' = '"baz5"'$$, 0.3); +SELECT check_jsonb_stats_test_estimate($$js #> '{foo}' = '"bar"'$$, 0.2); + +-- Check object key range estimate +SELECT check_jsonb_stats_test_estimate($$js -> 'foo' >= '"baz2"'$$, 0.1); +SELECT check_jsonb_stats_test_estimate($$js -> 'foo' < '"baz9"'$$, 0.1); +SELECT check_jsonb_stats_test_estimate($$js -> 'foo' >= '"baz2"' AND js -> 'foo' < '"baz9"'$$, 0.1); + +-- Check array eq estimate +SELECT check_jsonb_stats_test_estimate($$js = '[]'$$, 0.1); +SELECT check_jsonb_stats_test_estimate($$js >= '[]' AND js < '{}'$$, 0.1); + +-- Check variable-length array element eq estimate +SELECT check_jsonb_stats_test_estimate($$js -> 'array' -> 0 = '1'$$, 0.2); +SELECT check_jsonb_stats_test_estimate($$js -> 'array' -> 1 = '6'$$, 0.2); +SELECT check_jsonb_stats_test_estimate($$js -> 'array' -> 2 = '8'$$, 0.2); +SELECT check_jsonb_stats_test_estimate($$js -> 'array' -> 3 = '1'$$, 0.2); + +-- Check variable-length array element range estimate +SELECT check_jsonb_stats_test_estimate($$js -> 'array' -> 0 < '7'$$, 0.1); +SELECT check_jsonb_stats_test_estimate($$js -> 'array' -> 1 < '7'$$, 0.1); +SELECT check_jsonb_stats_test_estimate($$js -> 'array' -> 2 < '7'$$, 0.1); +SELECT check_jsonb_stats_test_estimate($$js -> 'array' -> 3 < '7'$$, 0.1); + +-- Check variable-length array containment estimate +SELECT check_jsonb_stats_test_estimate($$js -> 'array' @> '[]'$$, 0.2); +SELECT 
check_jsonb_stats_test_estimate($$js -> 'array' @> '[1]'$$, 0.2); +SELECT check_jsonb_stats_test_estimate($$js -> 'array' @> '[100]'$$, 0.2); +SELECT check_jsonb_stats_test_estimate($$js -> 'array' @> '[1, 2]'$$, 1); +SELECT check_jsonb_stats_test_estimate($$js -> 'array' @> '[1, 100]'$$, 1); +SELECT check_jsonb_stats_test_estimate($$js -> 'array' @> '[1, 2, 100]'$$, 1); +SELECT check_jsonb_stats_test_estimate($$js -> 'array' @> '[1, 2, 3]'$$, 5); + +SELECT check_jsonb_stats_test_estimate($$js -> 'array' @> '1'$$, 0.3); +SELECT check_jsonb_stats_test_estimate($$js -> 'array' @> '100'$$, 10); + +SELECT check_jsonb_stats_test_estimate($$js -> 'array' -> 0 @> '1'$$, 0.3); +SELECT check_jsonb_stats_test_estimate($$js -> 'array' -> 1 @> '1'$$, 0.3); +SELECT check_jsonb_stats_test_estimate($$js -> 'array' -> 2 @> '1'$$, 0.3); +SELECT check_jsonb_stats_test_estimate($$js -> 'array' -> 3 @> '1'$$, 0.3); +SELECT check_jsonb_stats_test_estimate($$js -> 'array' -> 0 @> '[1]'$$, 0.3); + +SELECT check_jsonb_stats_test_estimate($$js @> '{"array": []}'$$, 0.1); +SELECT check_jsonb_stats_test_estimate($$js @> '{"array": [1]}'$$, 0.3); +SELECT check_jsonb_stats_test_estimate($$js @> '{"array": [100]}'$$, 0.3); +SELECT check_jsonb_stats_test_estimate($$js @> '{"array": [1, 2]}'$$, 1); +SELECT check_jsonb_stats_test_estimate($$js @> '{"array": [1, 100]}'$$, 1); +SELECT check_jsonb_stats_test_estimate($$js @> '{"array": [1, 2, 100]}'$$, 1); +SELECT check_jsonb_stats_test_estimate($$js @> '{"array": [1, 2, 3]}'$$, 3); + +-- check misc containment +SELECT check_jsonb_stats_test_estimate($$js @> '"foo"'$$, 0.1); +SELECT check_jsonb_stats_test_estimate($$js @> '12345'$$, 0.1); +SELECT check_jsonb_stats_test_estimate($$js @> '[]'$$, 0.1); +SELECT check_jsonb_stats_test_estimate($$js @> '[12345]'$$, 0.1); +SELECT check_jsonb_stats_test_estimate($$js @> '["foo"]'$$, 0.1); +SELECT check_jsonb_stats_test_estimate($$js @> '[["foo", "bar"]]'$$, 0.1); +SELECT check_jsonb_stats_test_estimate($$js 
@> '[["foo"]]'$$, 0.2); +SELECT check_jsonb_stats_test_estimate($$js @> '[{"key": "foo"}]'$$, 0.1); +SELECT check_jsonb_stats_test_estimate($$js @> '[null]'$$, 0.3); + +-- Check object key null estimate +SELECT check_jsonb_stats_test_estimate($$js -> 'foo' IS NULL$$, 0.1); +SELECT check_jsonb_stats_test_estimate($$js -> 'foo' IS NOT NULL$$, 0.1); +SELECT check_jsonb_stats_test_estimate($$js -> '"foo \"bar"' IS NOT NULL$$, 0.1); +SELECT check_jsonb_stats_test_estimate($$js -> 'bad_key' IS NULL$$, 0.01); +SELECT check_jsonb_stats_test_estimate($$js -> 'bad_key' IS NOT NULL$$, 0.01); + +-- Check object key existence +SELECT check_jsonb_stats_test_estimate($$js ? 'bad_key'$$, 10); +SELECT check_jsonb_stats_test_estimate($$js ? 'foo'$$, 0.1); +SELECT check_jsonb_stats_test_estimate($$js ? 'array'$$, 0.1); + +SELECT check_jsonb_stats_test_estimate($$js ?| '{foo,bad_key}'$$, 0.1); +SELECT check_jsonb_stats_test_estimate($$js ?| '{foo,array}'$$, 0.1); +SELECT check_jsonb_stats_test_estimate($$js ?& '{foo,bad_key}'$$, 0.1); +SELECT check_jsonb_stats_test_estimate($$js ?& '{foo,bar}'$$, 0.1); -- 1.8.3.1