Enhanced containment selectivity function - Mailing list pgsql-hackers
From | Matteo Beccati |
---|---|
Subject | Enhanced containment selectivity function |
Date | |
Msg-id | 42F22806.20503@beccati.com Whole thread Raw |
Responses |
Re: Enhanced containment selectivity function
Re: Enhanced containment selectivity function |
List | pgsql-hackers |
Hi, I've recently had problems with slow queries caused by the selectivity of the <@ ltree operator, as you may see in my post here: http://archives.postgresql.org/pgsql-performance/2005-07/msg00473.php Someone on IRC (AndrewSN if I'm not wrong) pointed out that the restriction selectivity function for <@ is contsel, which returns a constant value of 0.001. So I started digging in the source code trying to understand how the default behaviour could be enhanced, and ended up writing a little patch which adds an alternative containment selectivity function (called "contstatsel") which is able to deliver better results. This first version is based on the eqsel function and uses only histogram values to calculate the selectivity and uses the 0.001 constant as a fallback. This also made me think: is there a reason why geometric selectivity functions return constant values rather than checking statistics for a better result? Attached you will find a patch suitable for current CVS HEAD. My C skills are a bit rusty and my knowledge of pg internals are very poor, so I'm sure it could be improved and modified to better fit the pg coding standards. Here are the results on a slow query: test=# EXPLAIN ANALYZE SELECT * FROM gw_users JOIN gw_batches USING (u_id) WHERE tree <@ '1041' AND t_stamp > '2005-07-01'; QUERY PLAN ------------------------------------------------------------------------------------------------------------------------------------------------------ Nested Loop (cost=0.00..553.02 rows=8 width=364) (actual time=2.423..19787.259 rows=6785 loops=1) -> Index Scan using gw_users_gisttree_key on gw_users (cost=0.00..21.63 rows=5 width=156) (actual time=0.882..107.434 rows=4696 loops=1) Index Cond: (tree <@ '1041'::ltree) -> Index Scan using gw_batches_t_stamp_u_id_key on gw_batches (cost=0.00..106.09 rows=15 width=212) (actual time=3.898..4.171 rows=1 loops=4696) Index Cond: ((gw_batches.t_stamp > '2005-07-01 00:00:00+02'::timestamp with time zone) AND ("outer".u_id = gw_batches.u_id)) Total runtime: 19805.447 ms (6 rows) test=# EXPLAIN ANALYZE SELECT * FROM gw_users JOIN gw_batches USING (u_id) WHERE tree <<@ '1041' AND t_stamp > '2005-07-01'; QUERY PLAN ------------------------------------------------------------------------------------------------------------------------------------------------- Hash Join (cost=245.26..1151.80 rows=7671 width=364) (actual time=69.562..176.966 rows=6785 loops=1) Hash Cond: ("outer".u_id = "inner".u_id) -> Bitmap Heap Scan on gw_batches (cost=57.74..764.39 rows=8212 width=212) (actual time=8.330..39.542 rows=7819 loops=1) Recheck Cond: (t_stamp > '2005-07-01 00:00:00+02'::timestamp with time zone) -> Bitmap Index Scan on gw_batches_t_stamp_u_id_key (cost=0.00..57.74 rows=8212 width=0) (actual time=8.120..8.120 rows=7819 loops=1) Index Cond: (t_stamp > '2005-07-01 00:00:00+02'::timestamp with time zone) -> Hash (cost=175.79..175.79 rows=4692 width=156) (actual time=61.046..61.046 rows=4696 loops=1) -> Seq Scan on gw_users (cost=0.00..175.79 rows=4692 width=156) (actual time=0.083..34.200 rows=4696 loops=1) Filter: (tree <<@ '1041'::ltree) Total runtime: 194.621 ms (10 rows) The second query uses a custom <<@ operator I added to test the alternative selectivity function: CREATE FUNCTION contstatsel(internal, oid, internal, integer) RETURNS double precision AS 'contstatsel' LANGUAGE internal; CREATE OPERATOR <<@ ( LEFTARG = ltree, LEFTARG = ltree, PROCEDURE = ltree_risparent, COMMUTATOR = '@>', RESTRICT = contstatsel, JOIN = contjoinsel ); Of course any comments/feedback are welcome. Best regards -- Matteo Beccati http://phpadsnew.com/ http://phppgads.com/ Index: src/backend/utils/adt/selfuncs.c =================================================================== RCS file: /projects/cvsroot/pgsql/src/backend/utils/adt/selfuncs.c,v retrieving revision 1.187 diff -r1.187 selfuncs.c 1309a1310,1433 > * contstatsel - Selectivity of containment for any data types. > */ > Datum > contstatsel(PG_FUNCTION_ARGS) > { > PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0); > Oid operator = PG_GETARG_OID(1); > List *args = (List *) PG_GETARG_POINTER(2); > int varRelid = PG_GETARG_INT32(3); > VariableStatData vardata; > Node *other; > bool varonleft; > Datum *values; > int nvalues; > double selec = 0.0; > > /* > * If expression is not variable = something or something = variable, > * then punt and return a default estimate. > */ > if (!get_restriction_variable(root, args, varRelid, > &vardata, &other, &varonleft)) > PG_RETURN_FLOAT8(0.001); > > /* > * If the something is a NULL constant, assume operator is strict and > * return zero, ie, operator will never return TRUE. > */ > if (IsA(other, Const) && > ((Const *) other)->constisnull) > { > ReleaseVariableStats(vardata); > PG_RETURN_FLOAT8(0.0); > } > > if (HeapTupleIsValid(vardata.statsTuple)) > { > Form_pg_statistic stats; > > stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple); > > if (IsA(other, Const)) > { > /* Variable is being compared to a known non-null constant */ > Datum constval = ((Const *) other)->constvalue; > bool match = false; > int i; > > elog(INFO, "Checking histogram"); > > /* > * Is the constant "=" to any of the column's most common > * values? (Although the given operator may not really be > * "=", we will assume that seeing whether it returns TRUE is > * an appropriate test. If you don't like this, maybe you > * shouldn't be using eqsel for your operator...) > */ > if (get_attstatsslot(vardata.statsTuple, > vardata.atttype, vardata.atttypmod, > STATISTIC_KIND_HISTOGRAM, InvalidOid, > &values, &nvalues, > NULL, NULL)) > { > FmgrInfo contproc; > > fmgr_info(get_opcode(operator), &contproc); > > elog(INFO, "Found %d values", nvalues); > > for (i = 0; i < nvalues; i++) > { > /* be careful to apply operator right way 'round */ > if (varonleft) > match = DatumGetBool(FunctionCall2(&contproc, > values[i], > constval)); > else > match = DatumGetBool(FunctionCall2(&contproc, > constval, > values[i])); > if (match) > selec++; > } > > if (selec > 0.0 && nvalues > 0) > { > selec /= nvalues; > } > > elog(INFO, "Computed selectivity: %04.3f", selec); > > } > else > { > elog(INFO, "No histogram info"); > > /* no most-common-value info available */ > values = NULL; > i = nvalues = 0; > } > > if (!selec) > selec = 0.001; > > free_attstatsslot(vardata.atttype, values, nvalues, > NULL, 0); > } > else > selec = 0.001; > } > else > selec = 0.001; > > ReleaseVariableStats(vardata); > > /* result should be in range, but make sure... */ > CLAMP_PROBABILITY(selec); > > elog(INFO, "Returned selectivity: %04.3f", selec); > > PG_RETURN_FLOAT8((float8) selec); > } > > /* Index: src/include/catalog/pg_proc.h =================================================================== RCS file: /projects/cvsroot/pgsql/src/include/catalog/pg_proc.h,v retrieving revision 1.380 diff -r1.380 pg_proc.h 3752a3753,3755 > DATA(insert OID = 2600 ( contstatsel PGNSP PGUID 12 f f t f s 4 701 "2281 26 2281 23" _null_ _null_ _null_ contstatsel - _null_ )); > DESCR("enhanced restriction selectivity for containment comparison operators"); > Index: src/include/utils/selfuncs.h =================================================================== RCS file: /projects/cvsroot/pgsql/src/include/utils/selfuncs.h,v retrieving revision 1.23 diff -r1.23 selfuncs.h 97a98,99 > extern Datum contstatsel(PG_FUNCTION_ARGS); >
pgsql-hackers by date: