diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index 53cf96f..5f10d7f 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -29,6 +29,7 @@ * index_can_return - does index support index-only scans? * index_getprocid - get a support procedure OID * index_getprocinfo - get a support procedure's lookup info + * index_skip - advance to the next key value in a scan * * NOTES * This file contains the index_ routines which used @@ -742,6 +743,37 @@ index_can_return(Relation indexRelation) PointerGetDatum(indexRelation))); } +bool +index_can_skip(Relation indexRelation) +{ + FmgrInfo *procedure; + + RELATION_CHECKS; + + /* amcanskip is optional; assume FALSE if not provided by AM */ + if (!RegProcedureIsValid(indexRelation->rd_am->amcanskip)) + return false; + + GET_REL_PROCEDURE(amcanskip); + + return DatumGetBool(FunctionCall1(procedure, + PointerGetDatum(indexRelation))); +} + +bool +index_skip(IndexScanDesc scan, ScanDirection direction, int prefix) +{ + FmgrInfo *procedure; + SCAN_CHECKS; + GET_SCAN_PROCEDURE(amskip); + Datum result = + DatumGetPointer(FunctionCall3(procedure, + PointerGetDatum(scan), + Int32GetDatum(direction), + Int32GetDatum(prefix))); + return DatumGetBool(result); +} + /* ---------------- * index_getprocid * diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 117b18e..bc09ea1 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -426,6 +426,8 @@ btbeginscan(PG_FUNCTION_ARGS) so->currPos.nextTupleOffset = 0; so->markPos.nextTupleOffset = 0; + so->skipScanKey = NULL; + scan->xs_itupdesc = RelationGetDescr(rel); scan->opaque = so; @@ -501,6 +503,19 @@ btrescan(PG_FUNCTION_ARGS) } /* + * btskip() -- skip to the beginning of the next key prefix + */ +Datum +btskip(PG_FUNCTION_ARGS) +{ + IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0); + ScanDirection dir = (ScanDirection) PG_GETARG_INT32(1); + int prefix = PG_GETARG_INT32(2); + bool result = _bt_skip(scan, dir, prefix); + PG_RETURN_BOOL(result); +} + +/* * btendscan() -- close down a scan */ Datum @@ -1110,3 +1125,12 @@ btcanreturn(PG_FUNCTION_ARGS) { PG_RETURN_BOOL(true); } + +/* + * btcanskip() -- Check whether btree indexes support skipping. + */ +Datum +btcanskip(PG_FUNCTION_ARGS) +{ + PG_RETURN_BOOL(true); +} diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index 203b969..8aaeb69 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -1082,6 +1082,90 @@ _bt_next(IndexScanDesc scan, ScanDirection dir) } /* + * _bt_skip() -- Skip items that have the same prefix as the most recently + * fetched index tuple. The current position is set so that a subsequent call + * to _bt_next will fetch the first tuple that differs in the leading 'prefix' + * keys. + * + * TODO:TM The 'step back one tuple' behaviour is only necessary because I + * wanted the nodeIndexonlyscan code to be able to use its existing + * tuple-fetch-visibility-check loop without a special case for the first + * iteration. An alernative woudl be for skip to actually fetch the desired + * tuple immediately (ie without a subsequent call to _bt_next). + */ +bool +_bt_skip(IndexScanDesc scan, ScanDirection dir, int prefix) +{ + /* TODO:TM For now, we use _bt_search to search from the root; in theory + * we should be able to do a local traversal ie from the current page, but + * I don't know if it would actually be better in general */ + + BTScanOpaque so = (BTScanOpaque) scan->opaque; + BTStack stack; + Buffer buf; + OffsetNumber offnum; + BTScanPosItem *currItem; + + if (!scan->xs_want_itup) + elog(ERROR, "_bt_skip cannot skip if not returning tuples"); + if (!scan->xs_itup) + elog(ERROR, "_bt_skip cannot skip if a tuple has not been fetched yet"); + /*elog(NOTICE, "_bt_skip");*/ + + if (BTScanPosIsValid(so->currPos)) + { + ReleaseBuffer(so->currPos.buf); + so->currPos.buf = InvalidBuffer; + } + + /* + * TODO:TM lazily call _bt_mkscankey the first time and then just update + * the values in so->skipScanKey each time after that, instead of the repeated + * free/realloc + */ + if (so->skipScanKey != NULL) + _bt_freeskey(so->skipScanKey); + so->skipScanKey = _bt_mkscankey(scan->indexRelation, scan->xs_itup); + + /* TODO:TM share code with _bt_search? Much of this copied from there + * without good understanding of what is going on! */ + + stack =_bt_search(scan->indexRelation, prefix, so->skipScanKey, + ScanDirectionIsForward(dir), &buf, BT_READ); + _bt_freestack(stack); + so->currPos.buf = buf; + offnum = _bt_binsrch(scan->indexRelation, buf, prefix, so->skipScanKey, + ScanDirectionIsForward(dir)); + PredicateLockPage(scan->indexRelation, BufferGetBlockNumber(buf), + scan->xs_snapshot); + if (ScanDirectionIsForward(dir)) + { + so->currPos.moreLeft = false; + so->currPos.moreRight = true; + } + else + { + so->currPos.moreLeft = true; + so->currPos.moreRight = false; + } + if (ScanDirectionIsForward(dir)) + offnum = OffsetNumberPrev(offnum); + if (!_bt_readpage(scan, dir, offnum)) + { + if (!_bt_steppage(scan, dir)) + { + return false; + } + } + LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK); + currItem = &so->currPos.items[so->currPos.itemIndex]; + scan->xs_ctup.t_self = currItem->heapTid; + if (scan->xs_want_itup) + scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset); + return true; +} + +/* * _bt_readpage() -- Load data from current index page into so->currPos * * Caller must have pinned and read-locked so->currPos.buf; the buffer's state diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index 781a736..d94146b 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -1065,7 +1065,20 @@ ExplainNode(PlanState *planstate, List *ancestors, case T_IndexOnlyScan: { IndexOnlyScan *indexonlyscan = (IndexOnlyScan *) plan; - + if (indexonlyscan->distinctPrefix > 0) + { + /* + * TODO:TM show "for distinct prefix (a, b)" instead of + * just the column count? + */ + if (es->format == EXPLAIN_FORMAT_TEXT) + appendStringInfo(es->str, " for distinct %d column(s)", + indexonlyscan->distinctPrefix); + else + ExplainPropertyInteger("Distinct Prefix", + indexonlyscan->distinctPrefix, + es); + } ExplainIndexScanDetails(indexonlyscan->indexid, indexonlyscan->indexorderdir, es); diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c index afcd1ff..d46de44 100644 --- a/src/backend/executor/nodeIndexonlyscan.c +++ b/src/backend/executor/nodeIndexonlyscan.c @@ -74,6 +74,21 @@ IndexOnlyNext(IndexOnlyScanState *node) slot = node->ss.ss_ScanTupleSlot; /* + * Check if we need to skip to the next key prefix, because we've been + * asked to implement DISTINCT. + */ + if (node->ioss_NumDistinctKeys > 0 && node->ioss_FirstTupleEmitted) + { + /*elog(NOTICE, "I WOULD LIKE TO SKIP");*/ + if (!index_skip(scandesc, direction, node->ioss_NumDistinctKeys)) + { + /* Reached end of index. */ + /* TODO:TM is this appropriate cleanup? */ + return ExecClearTuple(slot); + } + } + + /* * OK, now that we have what we need, fetch the next tuple. */ while ((tid = index_getnext_tid(scandesc, direction)) != NULL) @@ -176,6 +191,17 @@ IndexOnlyNext(IndexOnlyScanState *node) PredicateLockPage(scandesc->heapRelation, ItemPointerGetBlockNumber(tid), estate->es_snapshot); + /* + * TODO:TM Do distinct scans break SSI? We don't visit every page + * anymore! But maybe that's OK, because we only "read" the fact that + * there is *at least one* row that has this value, so a conflicting + * write would be one that removes ALL rows having the same value, and + * therefore would touch this page that happens to hold the arbitrary + * row we looked at, in the case of a disinct skip scan. Does this + * make any sense? + */ + + node->ioss_FirstTupleEmitted = true; return slot; } @@ -391,6 +417,8 @@ ExecInitIndexOnlyScan(IndexOnlyScan *node, EState *estate, int eflags) indexstate->ss.ps.plan = (Plan *) node; indexstate->ss.ps.state = estate; indexstate->ioss_HeapFetches = 0; + indexstate->ioss_NumDistinctKeys = node->distinctPrefix; + indexstate->ioss_FirstTupleEmitted = false; /* * Miscellaneous initialization diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index aa053a0..fe351fd5 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -398,6 +398,7 @@ _copyIndexOnlyScan(const IndexOnlyScan *from) COPY_NODE_FIELD(indexorderby); COPY_NODE_FIELD(indextlist); COPY_SCALAR_FIELD(indexorderdir); + COPY_SCALAR_FIELD(distinctPrefix); return newnode; } diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c index 42dcb11..3908899 100644 --- a/src/backend/optimizer/path/indxpath.c +++ b/src/backend/optimizer/path/indxpath.c @@ -982,6 +982,10 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel, index_only_scan = (scantype != ST_BITMAPSCAN && check_index_only(rel, index)); + /* TODO:TM -- if index_only_scan is true, AND we want DISTINCT, then + * ... perhaps this is the right place to check if we can use an index + * only scan in distinct mode? */ + /* * 4. Generate an indexscan path if there are relevant restriction clauses * in the current clauses, OR the index ordering is potentially useful for diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index e1480cd..46105ca 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -1934,10 +1934,30 @@ grouping_planner(PlannerInfo *root, double tuple_fraction) result_plan, current_pathkeys, -1.0); + result_plan = (Plan *) make_unique(result_plan, + parse->distinctClause); + } + else if IsA(result_plan, IndexOnlyScan) + { + /* + * The plan is appropriately sorted, and we can ask it to + * produce unique values by skipping. + * + * TODO:TM This is a quick and dirty hack to get something + * working by modifying the plan, but really we probably need + * help planning this much earlier, possibly in + * build_index_paths. + */ + IndexOnlyScan *ios = (IndexOnlyScan *) result_plan; + /*elog(NOTICE, "using skip to implement DISTINCT");*/ + ios->distinctPrefix = list_length(root->distinct_pathkeys); + } + else + { + /* The plan is appropriately sorted, but contains duplicates. */ + result_plan = (Plan *) make_unique(result_plan, + parse->distinctClause); } - - result_plan = (Plan *) make_unique(result_plan, - parse->distinctClause); result_plan->plan_rows = dNumDistinctRows; /* The Unique node won't change sort ordering */ } diff --git a/src/include/access/genam.h b/src/include/access/genam.h index d99158f..349097d 100644 --- a/src/include/access/genam.h +++ b/src/include/access/genam.h @@ -157,6 +157,8 @@ extern IndexBulkDeleteResult *index_bulk_delete(IndexVacuumInfo *info, extern IndexBulkDeleteResult *index_vacuum_cleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats); extern bool index_can_return(Relation indexRelation); +extern bool index_can_skip(Relation indexRelation); +extern bool index_skip(IndexScanDesc scan, ScanDirection direction, int prefix); extern RegProcedure index_getprocid(Relation irel, AttrNumber attnum, uint16 procnum); extern FmgrInfo *index_getprocinfo(Relation irel, AttrNumber attnum, diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 9fa943f..0a2a693 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -606,6 +606,9 @@ typedef struct BTScanOpaqueData /* keep these last in struct for efficiency */ BTScanPosData currPos; /* current position data */ BTScanPosData markPos; /* marked position, if any */ + + /* workspace for _bt_skip */ + ScanKey skipScanKey; /* used to control skipping */ } BTScanOpaqueData; typedef BTScanOpaqueData *BTScanOpaque; @@ -638,6 +641,8 @@ extern Datum btbulkdelete(PG_FUNCTION_ARGS); extern Datum btvacuumcleanup(PG_FUNCTION_ARGS); extern Datum btcanreturn(PG_FUNCTION_ARGS); extern Datum btoptions(PG_FUNCTION_ARGS); +extern Datum btcanskip(PG_FUNCTION_ARGS); +extern Datum btskip(PG_FUNCTION_ARGS); /* * prototypes for functions in nbtinsert.c @@ -684,6 +689,8 @@ extern int32 _bt_compare(Relation rel, int keysz, ScanKey scankey, extern bool _bt_first(IndexScanDesc scan, ScanDirection dir); extern bool _bt_next(IndexScanDesc scan, ScanDirection dir); extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost); +extern bool _bt_skip(IndexScanDesc scan, ScanDirection dir, int prefix); + /* * prototypes for functions in nbtutils.c diff --git a/src/include/catalog/pg_am.h b/src/include/catalog/pg_am.h index 759ea70..b3d835e 100644 --- a/src/include/catalog/pg_am.h +++ b/src/include/catalog/pg_am.h @@ -67,6 +67,8 @@ CATALOG(pg_am,2601) regproc amcanreturn; /* can indexscan return IndexTuples? */ regproc amcostestimate; /* estimate cost of an indexscan */ regproc amoptions; /* parse AM-specific parameters */ + regproc amcanskip; /* can indexscan skip to next prefix value? */ + regproc amskip; /* skip to next prefix value function */ } FormData_pg_am; /* ---------------- @@ -80,7 +82,7 @@ typedef FormData_pg_am *Form_pg_am; * compiler constants for pg_am * ---------------- */ -#define Natts_pg_am 30 +#define Natts_pg_am 32 #define Anum_pg_am_amname 1 #define Anum_pg_am_amstrategies 2 #define Anum_pg_am_amsupport 3 @@ -111,25 +113,27 @@ typedef FormData_pg_am *Form_pg_am; #define Anum_pg_am_amcanreturn 28 #define Anum_pg_am_amcostestimate 29 #define Anum_pg_am_amoptions 30 +#define Anum_pg_am_amcanskip 31 +#define Anum_pg_am_amskip 32 /* ---------------- * initial contents of pg_am * ---------------- */ -DATA(insert OID = 403 ( btree 5 2 t f t t t t t t f t t 0 btinsert btbeginscan btgettuple btgetbitmap btrescan btendscan btmarkpos btrestrpos btbuild btbuildempty btbulkdelete btvacuumcleanup btcanreturn btcostestimate btoptions )); +DATA(insert OID = 403 ( btree 5 2 t f t t t t t t f t t 0 btinsert btbeginscan btgettuple btgetbitmap btrescan btendscan btmarkpos btrestrpos btbuild btbuildempty btbulkdelete btvacuumcleanup btcanreturn btcostestimate btoptions btcanskip btskip )); DESCR("b-tree index access method"); #define BTREE_AM_OID 403 -DATA(insert OID = 405 ( hash 1 1 f f t f f f f f f f f 23 hashinsert hashbeginscan hashgettuple hashgetbitmap hashrescan hashendscan hashmarkpos hashrestrpos hashbuild hashbuildempty hashbulkdelete hashvacuumcleanup - hashcostestimate hashoptions )); +DATA(insert OID = 405 ( hash 1 1 f f t f f f f f f f f 23 hashinsert hashbeginscan hashgettuple hashgetbitmap hashrescan hashendscan hashmarkpos hashrestrpos hashbuild hashbuildempty hashbulkdelete hashvacuumcleanup - hashcostestimate hashoptions - - )); DESCR("hash index access method"); #define HASH_AM_OID 405 -DATA(insert OID = 783 ( gist 0 8 f t f f t t f t t t f 0 gistinsert gistbeginscan gistgettuple gistgetbitmap gistrescan gistendscan gistmarkpos gistrestrpos gistbuild gistbuildempty gistbulkdelete gistvacuumcleanup - gistcostestimate gistoptions )); +DATA(insert OID = 783 ( gist 0 8 f t f f t t f t t t f 0 gistinsert gistbeginscan gistgettuple gistgetbitmap gistrescan gistendscan gistmarkpos gistrestrpos gistbuild gistbuildempty gistbulkdelete gistvacuumcleanup - gistcostestimate gistoptions - - )); DESCR("GiST index access method"); #define GIST_AM_OID 783 -DATA(insert OID = 2742 ( gin 0 6 f f f f t t f f t f f 0 gininsert ginbeginscan - gingetbitmap ginrescan ginendscan ginmarkpos ginrestrpos ginbuild ginbuildempty ginbulkdelete ginvacuumcleanup - gincostestimate ginoptions )); +DATA(insert OID = 2742 ( gin 0 6 f f f f t t f f t f f 0 gininsert ginbeginscan - gingetbitmap ginrescan ginendscan ginmarkpos ginrestrpos ginbuild ginbuildempty ginbulkdelete ginvacuumcleanup - gincostestimate ginoptions - - )); DESCR("GIN index access method"); #define GIN_AM_OID 2742 -DATA(insert OID = 4000 ( spgist 0 5 f f f f f t f t f f f 0 spginsert spgbeginscan spggettuple spggetbitmap spgrescan spgendscan spgmarkpos spgrestrpos spgbuild spgbuildempty spgbulkdelete spgvacuumcleanup spgcanreturn spgcostestimate spgoptions )); +DATA(insert OID = 4000 ( spgist 0 5 f f f f f t f t f f f 0 spginsert spgbeginscan spggettuple spggetbitmap spgrescan spgendscan spgmarkpos spgrestrpos spgbuild spgbuildempty spgbulkdelete spgvacuumcleanup spgcanreturn spgcostestimate spgoptions - - )); DESCR("SP-GiST index access method"); #define SPGIST_AM_OID 4000 diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h index a56a635..0d8e0d9 100644 --- a/src/include/catalog/pg_proc.h +++ b/src/include/catalog/pg_proc.h @@ -564,6 +564,10 @@ DATA(insert OID = 1268 ( btcostestimate PGNSP PGUID 12 1 0 0 0 f f f f t f v DESCR("btree(internal)"); DATA(insert OID = 2785 ( btoptions PGNSP PGUID 12 1 0 0 0 f f f f t f s 2 0 17 "1009 16" _null_ _null_ _null_ _null_ btoptions _null_ _null_ _null_ )); DESCR("btree(internal)"); +DATA(insert OID = 4051 ( btcanskip PGNSP PGUID 12 1 0 0 0 f f f f t f s 1 0 16 "2281" _null_ _null_ _null_ _null_ btcanskip _null_ _null_ _null_ )); +DESCR("btree(internal)"); +DATA(insert OID = 4052 ( btskip PGNSP PGUID 12 1 0 0 0 f f f f t f v 3 0 16 "2281 2281 2281" _null_ _null_ _null_ _null_ btskip _null_ _null_ _null_ )); +DESCR("btree(internal)"); DATA(insert OID = 339 ( poly_same PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 0 16 "604 604" _null_ _null_ _null_ _null_ poly_same _null_ _null_ _null_ )); DATA(insert OID = 340 ( poly_contain PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 0 16 "604 604" _null_ _null_ _null_ _null_ poly_contain _null_ _null_ _null_ )); diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index b271f21..9c64c81 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -1280,6 +1280,7 @@ typedef struct IndexScanState * ScanDesc index scan descriptor * VMBuffer buffer in use for visibility map testing, if any * HeapFetches number of tuples we were forced to fetch from heap + * NumDistinctKeys number of keys for skip-based DISTINCT * ---------------- */ typedef struct IndexOnlyScanState @@ -1298,6 +1299,8 @@ typedef struct IndexOnlyScanState IndexScanDesc ioss_ScanDesc; Buffer ioss_VMBuffer; long ioss_HeapFetches; + int ioss_NumDistinctKeys; + bool ioss_FirstTupleEmitted; } IndexOnlyScanState; /* ---------------- diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h index 3b9c683..6ae992a 100644 --- a/src/include/nodes/plannodes.h +++ b/src/include/nodes/plannodes.h @@ -340,6 +340,7 @@ typedef struct IndexOnlyScan List *indexorderby; /* list of index ORDER BY exprs */ List *indextlist; /* TargetEntry list describing index's cols */ ScanDirection indexorderdir; /* forward or backward or don't care */ + int distinctPrefix; /* the size of the prefix for distinct scans */ } IndexOnlyScan; /* ---------------- diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h index 37b6cbb..ce811f1 100644 --- a/src/include/utils/rel.h +++ b/src/include/utils/rel.h @@ -61,6 +61,8 @@ typedef struct RelationAmInfo FmgrInfo ammarkpos; FmgrInfo amrestrpos; FmgrInfo amcanreturn; + FmgrInfo amcanskip; + FmgrInfo amskip; } RelationAmInfo;