Re: pgsql: Extended statistics on expressions - Mailing list pgsql-committers
From | David Rowley |
---|---|
Subject | Re: pgsql: Extended statistics on expressions |
Date | |
Msg-id | CAApHDvpYT10-nkSp8xXe-nbO3jmoaRyRFHbzh-RWMfAJynqgpQ@mail.gmail.com Whole thread Raw |
In response to | pgsql: Extended statistics on expressions (Tomas Vondra <tomas.vondra@postgresql.org>) |
Responses |
Re: pgsql: Extended statistics on expressions
|
List | pgsql-committers |
Hi Tomas, I'm debugging a crash after running sqllancer on current master. The first bad commit seems to be this one. The crash stack trace is: Program received signal SIGSEGV, Segmentation fault. pg_detoast_datum_packed (datum=0x5) at fmgr.c:1759 1759 if (VARATT_IS_COMPRESSED(datum) || VARATT_IS_EXTERNAL(datum)) (gdb) bt #0 pg_detoast_datum_packed (datum=0x5) at fmgr.c:1759 #1 0x000055e460c99da2 in textlike (fcinfo=0x7fff1bbc39e0) at like.c:287 #2 0x000055e460d44dde in FunctionCall2Coll (flinfo=flinfo@entry=0x7fff1bbc3ae0, collation=<optimized out>, arg1=<optimized out>, arg2=<optimized out>) at fmgr.c:1163 #3 0x000055e460bd6dad in mcv_get_match_bitmap (clauses=<optimized out>, keys=0x55e4614627e8, exprs=0x0, mcvlist=<optimized out>, is_or=<optimized out>, root=<optimized out>) at mcv.c:1706 #4 0x000055e460bd965d in mcv_clauselist_selectivity (root=root@entry=0x55e46145fca0, stat=stat@entry=0x55e461462790, clauses=clauses@entry=0x55e46144cf08, varRelid=varRelid@entry=0, jointype=jointype@entry=JOIN_INNER, sjinfo=sjinfo@entry=0x0, rel=0x55e461461f18, basesel=0x7fff1bbc3c88, totalsel=0x7fff1bbc3c90) at mcv.c:2043 #5 0x000055e460bd6364 in statext_mcv_clauselist_selectivity (is_or=<optimized out>, estimatedclauses=<optimized out>, rel=<optimized out>, sjinfo=<optimized out>, jointype=<optimized out>, varRelid=<optimized out>, clauses=<optimized out>, root=<optimized out>) at extended_stats.c:1916 #6 statext_clauselist_selectivity (root=root@entry=0x55e46145fca0, clauses=clauses@entry=0x55e461462ad0, varRelid=varRelid@entry=0, jointype=jointype@entry=JOIN_INNER, sjinfo=sjinfo@entry=0x0, rel=0x55e461461f18, estimatedclauses=0x7fff1bbc3d28, is_or=false) at extended_stats.c:1948 #7 0x000055e460b0e973 in clauselist_selectivity_ext (root=root@entry=0x55e46145fca0, clauses=0x55e461462ad0, varRelid=varRelid@entry=0, jointype=jointype@entry=JOIN_INNER, sjinfo=0x0, use_extended_stats=true) at clausesel.c:155 #8 0x000055e460b0e346 in clause_selectivity_ext (root=root@entry=0x55e46145fca0, clause=0x55e46144c8f8, varRelid=varRelid@entry=0, jointype=jointype@entry=JOIN_INNER, sjinfo=sjinfo@entry=0x0, use_extended_stats=use_extended_stats@entry=true) at clausesel.c:838 #9 0x000055e460b0e1a4 in clauselist_selectivity_or (use_extended_stats=<optimized out>, sjinfo=<optimized out>, jointype=JOIN_INNER, varRelid=0, clauses=0x55e461462cc0, root=0x55e46145fca0) at clausesel.c:414 #10 clause_selectivity_ext (root=0x55e46145fca0, clause=<optimized out>, varRelid=0, jointype=JOIN_INNER, sjinfo=<optimized out>, use_extended_stats=use_extended_stats@entry=true) at clausesel.c:851 #11 0x000055e460b0e863 in clauselist_selectivity_ext (root=root@entry=0x55e46145fca0, clauses=0x55e46144cd28, varRelid=varRelid@entry=0, jointype=jointype@entry=JOIN_INNER, sjinfo=sjinfo@entry=0x0, use_extended_stats=use_extended_stats@entry=true) at ../../../../src/include/nodes/pg_list.h:260 #12 0x000055e460b0eacf in clauselist_selectivity (root=root@entry=0x55e46145fca0, clauses=<optimized out>, varRelid=varRelid@entry=0, jointype=jointype@entry=JOIN_INNER, sjinfo=sjinfo@entry=0x0) at clausesel.c:108 #13 0x000055e460b16992 in set_baserel_size_estimates (root=root@entry=0x55e46145fca0, rel=rel@entry=0x55e461461f18) at costsize.c:4753 #14 0x000055e460b0b971 in set_plain_rel_size (rte=<optimized out>, rel=0x55e461461f18, root=0x55e46145fca0) at allpaths.c:583 #15 set_rel_size (root=0x55e46145fca0, rel=0x55e461461f18, rti=1, rte=0x55e4614220a8) at allpaths.c:412 #16 0x000055e460b0d8c0 in set_base_rel_sizes (root=<optimized out>) at allpaths.c:323 I'm struggling a bit to reproduce this consistently. If you do: psql -c "CREATE ROLE sqlancer;" postgres created test; psql -f database0.sql test then run the following set of commands: insert into t1 values(1234,false),(5678,true); analyze; delete from t1; SELECT t1.c0 FROM ONLY t1 WHERE ((((((((upper('%|1j]<#^j\???u,???D'))LIKE((((('h???x')||(178227136)))||((('(-2073106655,58629343]'::int4range)*('(-1903439243,-1774128827)'::int4range)))))))AND((((t1.c0) IN (-91026522, 0.6000845835151706))OR(t1.c1)))))OR(t1.c1)))OR(CAST(t1.c1 AS BOOLEAN))); It seems to be a corrupt Datum in the 2nd argument to textlike(). (gdb) frame 3 #3 0x000055e460bd6dad in mcv_get_match_bitmap (clauses=<optimized out>, keys=0x55e461480148, exprs=0x0, mcvlist=<optimized out>, is_or=<optimized out>, root=<optimized out>) at mcv.c:1706 1706 match = DatumGetBool(FunctionCall2Coll(&opproc, (gdb) p i $7 = 0 (gdb) p idx $8 = 2 I've not studied the code in great detail, but per above, idx == 2 and you're doing item->values[idx]. Isn't the index of 2 out of bounds of the most_common_vals column below? test1=# select most_common_vals from pg_stats_ext; most_common_vals --------------------- {{1234,f},{5678,t}} (1 row) The code that sets idx to 2 (mcv_match_expression()) looks a bit weird. These stats don't have any exprs and the "expr" past to mcv_match_expression is an OpExpr. The function just goes and sets idx = bms_num_members(keys); then does 0 loops due to the empty "exprs" and returns 2. The Assert() does not help catch not finding anything since it checks idx >= bms_num_members(keys), which is 2. I'm not quite sure how this is all meant to work. Are you able to look further? David On Sat, 27 Mar 2021 at 13:14, Tomas Vondra <tomas.vondra@postgresql.org> wrote: > > Extended statistics on expressions > > Allow defining extended statistics on expressions, not just just on > simple column references. With this commit, expressions are supported > by all existing extended statistics kinds, improving the same types of > estimates. A simple example may look like this: > > CREATE TABLE t (a int); > CREATE STATISTICS s ON mod(a,10), mod(a,20) FROM t; > ANALYZE t; > > The collected statistics are useful e.g. to estimate queries with those > expressions in WHERE or GROUP BY clauses: > > SELECT * FROM t WHERE mod(a,10) = 0 AND mod(a,20) = 0; > > SELECT 1 FROM t GROUP BY mod(a,10), mod(a,20); > > This introduces new internal statistics kind 'e' (expressions) which is > built automatically when the statistics object definition includes any > expressions. This represents single-expression statistics, as if there > was an expression index (but without the index maintenance overhead). > The statistics is stored in pg_statistics_ext_data as an array of > composite types, which is possible thanks to 79f6a942bd. > > CREATE STATISTICS allows building statistics on a single expression, in > which case in which case it's not possible to specify statistics kinds. > > A new system view pg_stats_ext_exprs can be used to display expression > statistics, similarly to pg_stats and pg_stats_ext views. > > ALTER TABLE ... ALTER COLUMN ... TYPE now treats indexes the same way it > treats indexes, i.e. it drops and recreates the statistics. This means > all statistics are reset, and we no longer try to preserve at least the > functional dependencies. This should not be a major issue in practice, > as the functional dependencies actually rely on per-column statistics, > which were always reset anyway. > > Author: Tomas Vondra > Reviewed-by: Justin Pryzby, Dean Rasheed, Zhihong Yu > Discussion: https://postgr.es/m/ad7891d2-e90c-b446-9fe2-7419143847d7%40enterprisedb.com > > Branch > ------ > master > > Details > ------- > https://git.postgresql.org/pg/commitdiff/a4d75c86bf15220df22de0a92c819ecef9db3849 > > Modified Files > -------------- > doc/src/sgml/catalogs.sgml | 295 +++- > doc/src/sgml/ref/create_statistics.sgml | 116 +- > src/backend/catalog/Makefile | 8 +- > src/backend/catalog/system_views.sql | 69 + > src/backend/commands/statscmds.c | 437 +++--- > src/backend/commands/tablecmds.c | 104 +- > src/backend/nodes/copyfuncs.c | 14 + > src/backend/nodes/equalfuncs.c | 13 + > src/backend/nodes/outfuncs.c | 12 + > src/backend/optimizer/util/plancat.c | 62 + > src/backend/parser/gram.y | 38 +- > src/backend/parser/parse_agg.c | 10 + > src/backend/parser/parse_expr.c | 6 + > src/backend/parser/parse_func.c | 3 + > src/backend/parser/parse_utilcmd.c | 125 +- > src/backend/statistics/dependencies.c | 616 +++++++- > src/backend/statistics/extended_stats.c | 1253 ++++++++++++++-- > src/backend/statistics/mcv.c | 369 +++-- > src/backend/statistics/mvdistinct.c | 96 +- > src/backend/tcop/utility.c | 29 +- > src/backend/utils/adt/ruleutils.c | 281 +++- > src/backend/utils/adt/selfuncs.c | 413 +++++- > src/bin/pg_dump/t/002_pg_dump.pl | 12 + > src/bin/psql/describe.c | 130 +- > src/include/catalog/catversion.h | 2 +- > src/include/catalog/pg_proc.dat | 8 + > src/include/catalog/pg_statistic_ext.h | 4 + > src/include/catalog/pg_statistic_ext_data.h | 1 + > src/include/commands/defrem.h | 4 +- > src/include/nodes/nodes.h | 1 + > src/include/nodes/parsenodes.h | 19 +- > src/include/nodes/pathnodes.h | 1 + > src/include/parser/parse_node.h | 1 + > src/include/parser/parse_utilcmd.h | 2 + > src/include/statistics/extended_stats_internal.h | 32 +- > src/include/statistics/statistics.h | 5 +- > src/include/utils/ruleutils.h | 2 + > src/test/regress/expected/create_table_like.out | 20 +- > src/test/regress/expected/oidjoins.out | 10 +- > src/test/regress/expected/rules.out | 73 + > src/test/regress/expected/stats_ext.out | 1658 +++++++++++++++++++--- > src/test/regress/sql/create_table_like.sql | 2 + > src/test/regress/sql/stats_ext.sql | 589 +++++++- > 43 files changed, 5982 insertions(+), 963 deletions(-) >
Attachment
pgsql-committers by date: