*** ../postgresql/doc/src/sgml/gist.sgml Thu Jan 29 23:50:18 2009
--- gist.sgml Fri Jun 5 23:29:55 2009
***************
*** 92,98 ****
There are seven methods that an index operator class for
! GiST must provide:
--- 92,112 ----
There are seven methods that an index operator class for
! GiST must provide. Correctness of the index is ensured
! by proper implementation of the same>, consistent> and
! union> methods, while efficiency (speed) of the index will depend
! on the penalty> and picksplit> methods.
!
!
!
! The last two are compress> and decompress>, which allow the
! internal tree data to be of a different type than the data being indexed.
! The leaves are to be of the indexed data type, while the other tree nodes
! can be of any C struct (you still have to follow
! PostgreSQL> rules here; see about varlena> for
! variable-sized data). If the tree's internal data type exists at the
! SQL level, the STORAGE> option of the CREATE
! OPERATOR CLASS> command can be used.
***************
*** 108,113 ****
--- 122,195 ----
the predicate implies the query (recheck> = false) or
not (recheck> = true).
+
+
+ The SQL> declaration of the function must look like this:
+
+
+ CREATE OR REPLACE FUNCTION my_consistent(
+     internal,     -- index entry (read as a GISTENTRY pointer in C)
+     data_type,    -- query value
+     smallint,     -- strategy number
+     oid,          -- subtype
+     internal      -- recheck flag (read as a bool pointer in C)
+ )
+ RETURNS bool
+ AS 'MODULE_PATHNAME'
+ LANGUAGE 'C';
+
+
+ And the matching code in the C module could then follow such a skeleton:
+
+
+ Datum my_consistent(PG_FUNCTION_ARGS);
+ PG_FUNCTION_INFO_V1(my_consistent);
+
+ /*
+  * Skeleton of the consistent support function.
+  *
+  * Arguments: the index entry (0), the query value (1), the strategy
+  * number (2), the subtype (3, unused here) and, when called with five
+  * arguments, a pointer to the recheck flag (4).
+  */
+ Datum
+ my_consistent(PG_FUNCTION_ARGS)
+ {
+     GISTENTRY      *entry = (GISTENTRY *) PG_GETARG_POINTER(0);
+     data_type      *query = PG_GETARG_DATA_TYPE_P(1);
+     StrategyNumber  strategy = (StrategyNumber) PG_GETARG_UINT16(2);
+     /* Oid subtype = PG_GETARG_OID(3); */
+     data_type      *key = DatumGetDataType(entry->key);
+     bool           *recheck;
+     bool            retval;
+
+     Assert(PG_NARGS() == 4 || PG_NARGS() == 5);
+
+     /* The recheck flag is only available in the five-argument form. */
+     if (PG_NARGS() == 5)
+     {
+         recheck = (bool *) PG_GETARG_POINTER(4);
+         *recheck = true;
+     }
+
+     /*
+      * Determine retval as a function of strategy, key and query.
+      *
+      * Use GIST_LEAF(entry) to know where you're called in the index tree,
+      * which comes in handy when supporting the = operator for example (you
+      * could check for a non-empty union() in non-leaf nodes and equality
+      * in leaf nodes).
+      */
+
+     PG_RETURN_BOOL(retval);
+ }
+
+
+ Here, key> is an element in the index and query> is the
+ value being looked up in the index (the lookup can come from a SELECT> or
+ from a DML> statement). The StrategyNumber> you get will be set to one
+ of the ones you declare in the corresponding CREATE OPERATOR
+ CLASS> command.
+
+
+
+ Of course, the term DATA_TYPE> in the C code has to be
+ replaced so that the code refers to existing macros, such as
+ PG_GETARG_TEXT_P> and DatumGetTextP>.
+
+
+
+ The RECHECK> information used to exist in the operator
+ class definition but is now supported in the consistent
+ support function. The CREATE OPERATOR CLASS> command does
+ not check the specific signature of the consistent>
+ function, which gets called through the fmgr> interface, so
+ using PG_NARGS()> is a good way to support older versions of
+ PostgreSQL>.
+
***************
*** 119,124 ****
--- 201,269 ----
entries, this function generates a new predicate that is true for all
the entries.
+
+
+ The SQL> declaration of the function must look like this:
+
+
+ -- The first argument is the entry vector; the skeleton does not read the second.
+ -- The result is declared internal rather than tied to one SQL type such as text.
+ CREATE OR REPLACE FUNCTION my_union(internal, internal)
+ RETURNS internal
+ AS 'MODULE_PATHNAME'
+ LANGUAGE 'C';
+
+
+ And the matching code in the C module could then follow such a skeleton:
+
+
+ Datum my_union(PG_FUNCTION_ARGS);
+ PG_FUNCTION_INFO_V1(my_union);
+
+ /*
+  * Skeleton of the union support function.
+  *
+  * Folds every key of the entry vector passed as argument 0 into a single
+  * value by repeatedly calling my_union_implementation(), and returns it.
+  */
+ Datum
+ my_union(PG_FUNCTION_ARGS)
+ {
+     GistEntryVector *entryvec = (GistEntryVector *) PG_GETARG_POINTER(0);
+     GISTENTRY  *ent = entryvec->vector;
+     data_type  *out, *tmp;
+     int         numranges, i;
+
+     numranges = entryvec->n;
+     tmp = DatumGetDataType(ent[0].key);
+     out = tmp;
+
+     if (numranges == 1)
+     {
+         /* A single key: return a copy instead of the input pointer. */
+         out = data_type_deep_copy(tmp);
+
+         PG_RETURN_DATA_TYPE_P(out);
+     }
+
+     /* Accumulate the union of the remaining keys, left to right. */
+     for (i = 1; i < numranges; i++)
+     {
+         tmp = DatumGetDataType(ent[i].key);
+         out = my_union_implementation(out, tmp);
+     }
+
+     PG_RETURN_DATA_TYPE_P(out);
+ }
+
+
+
+ As you can see, in this skeleton we're dealing with a data type
+ where union(X, Y, Z) = union(union(X, Y), Z)>. It's easy
+ enough to support data types where this is not the case, by
+ implementing the proper union algorithm in this
+ GiST> support method.
+
+
+
+ All your union> implementation functions should return
+ pointers to newly palloc()>ed memory. You can't just
+ return whatever the input is, because the input is provided in a
+ MemoryContext> from which
+ PostgreSQL would not be able to store it
+ in the index, should your union> function be called during
+ CREATE INDEX>.
+
***************
*** 129,134 ****
--- 274,335 ----
Converts the data item into a format suitable for physical storage in
an index page.
+
+
+
+ The SQL> declaration of the function must look like this:
+
+
+ CREATE OR REPLACE FUNCTION my_compress(
+     internal      -- index entry to compress (read as a GISTENTRY pointer in C)
+ )
+ RETURNS internal
+ AS 'MODULE_PATHNAME'
+ LANGUAGE 'C';
+
+
+ And the matching code in the C module could then follow such a skeleton:
+
+
+ Datum my_compress(PG_FUNCTION_ARGS);
+ PG_FUNCTION_INFO_V1(my_compress);
+
+ /*
+  * Skeleton of the compress support function.
+  *
+  * For a leaf key, builds a new GISTENTRY holding the compressed
+  * representation of the key (or a zero Datum when the key is NULL).
+  * Any other entry is returned unchanged.
+  */
+ Datum
+ my_compress(PG_FUNCTION_ARGS)
+ {
+     GISTENTRY  *entry = (GISTENTRY *) PG_GETARG_POINTER(0);
+     GISTENTRY  *retval;
+
+     if (entry->leafkey)
+     {
+         retval = palloc(sizeof(GISTENTRY));
+         if (DatumGetPointer(entry->key) != NULL)
+         {
+             /*
+              * Prepare the compressed representation of the data in
+              * freshly allocated memory; fill *compressed_data from
+              * entry->key here.
+              */
+             compressed_data_type *compressed_data = palloc(sizeof(compressed_data_type));
+
+             gistentryinit(*retval, PointerGetDatum(compressed_data),
+                           entry->rel, entry->page, entry->offset, FALSE);
+         }
+         else
+         {
+             /* NULL key: still initialize the entry so no field is left undefined. */
+             gistentryinit(*retval, (Datum) 0,
+                           entry->rel, entry->page, entry->offset, FALSE);
+         }
+     }
+     else
+         retval = entry;
+
+     PG_RETURN_POINTER(retval);
+ }
+
+
+
+ You have to adapt compressed_data_type> to the specific type
+ you're converting to in order to compress your leaf nodes, of
+ course.
+
+
+
+ Depending on your needs, you may also have to take care of
+ compressing NULL> values in there, storing for example
+ (Datum) 0> as gist_circle_compress>
+ does.
+
***************
*** 140,145 ****
--- 341,369 ----
index representation of the data item into a format that can be
manipulated by the database.
+
+
+ The SQL> declaration of the function must look like this:
+
+
+ CREATE OR REPLACE FUNCTION my_decompress(
+     internal      -- pointer handed back unchanged by the C skeleton
+ )
+ RETURNS internal
+ AS 'MODULE_PATHNAME'
+ LANGUAGE 'C';
+
+
+ And the matching code in the C module could then follow such a skeleton:
+
+
+ Datum my_decompress(PG_FUNCTION_ARGS);
+ PG_FUNCTION_INFO_V1(my_decompress);
+
+ /*
+  * Skeleton of the decompress support function: an identity operation
+  * that returns the pointer received as argument 0.
+  */
+ Datum
+ my_decompress(PG_FUNCTION_ARGS)
+ {
+     void       *entry = PG_GETARG_POINTER(0);
+
+     PG_RETURN_POINTER(entry);
+ }
+
+
***************
*** 151,156 ****
--- 375,419 ----
entry into a particular branch of the tree. items will be inserted
down the path of least penalty in the tree.
+
+
+ The SQL> declaration of the function must look like this:
+
+
+ CREATE OR REPLACE FUNCTION my_penalty(
+     internal,     -- original index entry
+     internal,     -- new entry
+     internal      -- where the penalty is stored (read as a float pointer in C)
+ )
+ RETURNS internal
+ AS 'MODULE_PATHNAME'
+ LANGUAGE 'C' STRICT;
+
+
+ And the matching code in the C module could then follow such a skeleton:
+
+
+ Datum my_penalty(PG_FUNCTION_ARGS);
+ PG_FUNCTION_INFO_V1(my_penalty);
+
+ /*
+  * Skeleton of the penalty support function.
+  *
+  * Extracts the keys of the original entry (argument 0) and of the new
+  * entry (argument 1), delegates the computation to
+  * my_penalty_implementation(), stores the result through the float
+  * pointer passed as argument 2 and returns that pointer.
+  */
+ Datum
+ my_penalty(PG_FUNCTION_ARGS)
+ {
+     GISTENTRY  *origentry = (GISTENTRY *) PG_GETARG_POINTER(0);
+     GISTENTRY  *newentry = (GISTENTRY *) PG_GETARG_POINTER(1);
+     float      *penalty = (float *) PG_GETARG_POINTER(2);
+     data_type  *orig_key = DatumGetDataType(origentry->key);
+     data_type  *new_key = DatumGetDataType(newentry->key);
+
+     *penalty = my_penalty_implementation(orig_key, new_key);
+
+     PG_RETURN_POINTER(penalty);
+ }
+
+
+
+
+ The penalty> function is crucial to good performance of the
+ index, both when building and when using it. It is used at insertion
+ time to determine which branch to follow when choosing where to add the
+ new entry in the tree. At query time, the more balanced the index, the
+ quicker the lookup.
+
***************
*** 162,167 ****
--- 425,522 ----
the page are to stay on the old page, and which are to move to the new
page.
+
+
+ The SQL> declaration of the function must look like this:
+
+
+ -- Arguments: the entry vector to split, and the split vector to fill in.
+ CREATE OR REPLACE FUNCTION my_picksplit(internal, internal)
+ RETURNS internal
+ AS 'MODULE_PATHNAME'
+ LANGUAGE 'C' STRICT;
+
+
+ And the matching code in the C module could then follow such a skeleton:
+
+
+ Datum my_picksplit(PG_FUNCTION_ARGS);
+ PG_FUNCTION_INFO_V1(my_picksplit);
+
+ /*
+  * Skeleton of the picksplit support function.
+  *
+  * Distributes the entries of the entry vector (argument 0) between the
+  * left and right sides of the split vector (argument 1), maintaining the
+  * union of the keys placed on each side, and returns the split vector.
+  * my_choice_is_left() stands for the placement policy of your data type.
+  */
+ Datum
+ my_picksplit(PG_FUNCTION_ARGS)
+ {
+     GistEntryVector *entryvec = (GistEntryVector *) PG_GETARG_POINTER(0);
+     GIST_SPLITVEC   *v = (GIST_SPLITVEC *) PG_GETARG_POINTER(1);
+     OffsetNumber     maxoff = entryvec->n - 1;
+     GISTENTRY       *ent = entryvec->vector;
+
+     int              i, nbytes;
+     OffsetNumber    *left, *right;
+     data_type       *tmp_union;
+     data_type       *unionL = NULL;
+     data_type       *unionR = NULL;
+
+     GISTENTRY      **raw_entryvec;
+
+     /* Each side may end up holding every entry, so size both for all of them. */
+     nbytes = (maxoff + 1) * sizeof(OffsetNumber);
+
+     v->spl_left = (OffsetNumber *) palloc(nbytes);
+     left = v->spl_left;
+     v->spl_nleft = 0;
+
+     v->spl_right = (OffsetNumber *) palloc(nbytes);
+     right = v->spl_right;
+     v->spl_nright = 0;
+
+     /*
+      * Initialize the raw entry vector. Use palloc(), not malloc(), so the
+      * memory belongs to the current memory context and is not leaked.
+      * Keeping it as a separate array lets an implementation reorder the
+      * entries before distributing them.
+      */
+     raw_entryvec = (GISTENTRY **) palloc(entryvec->n * sizeof(GISTENTRY *));
+     for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
+         raw_entryvec[i] = &ent[i];
+
+     for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
+     {
+         /* Recover the entry's offset in the original vector. */
+         int         real_index = raw_entryvec[i] - ent;
+
+         tmp_union = DatumGetDataType(ent[real_index].key);
+         Assert(tmp_union != NULL);
+
+         /*
+          * Choose where to put the index entry and update unionL or unionR
+          * accordingly. Append the entry to either v->spl_left or
+          * v->spl_right, and take care of the counters.
+          */
+         if (my_choice_is_left(unionL, unionR, tmp_union))
+         {
+             if (unionL == NULL)
+                 unionL = tmp_union;
+             else
+                 unionL = my_union_implementation(unionL, tmp_union);
+
+             *left = real_index;
+             ++left;
+             ++(v->spl_nleft);
+         }
+         else
+         {
+             /* Same on the right. */
+             if (unionR == NULL)
+                 unionR = tmp_union;
+             else
+                 unionR = my_union_implementation(unionR, tmp_union);
+
+             *right = real_index;
+             ++right;
+             ++(v->spl_nright);
+         }
+     }
+
+     v->spl_ldatum = DataTypeGetDatum(unionL);
+     v->spl_rdatum = DataTypeGetDatum(unionR);
+     PG_RETURN_POINTER(v);
+ }
+
+
+
+ The picksplit> implementation is crucial for optimized index
+ builds. Its implementation, combined with a proper penalty>
+ one, is where the challenge of implementing a performant
+ GIST> index lies.
+
***************
*** 171,176 ****
--- 526,563 ----
Returns true if two entries are identical, false otherwise.
+
+
+ The SQL> declaration of the function must look like this:
+
+
+ CREATE OR REPLACE FUNCTION my_same(
+     data_type,    -- first value to compare
+     data_type,    -- second value to compare
+     internal      -- where the result is stored (read as a bool pointer in C)
+ )
+ RETURNS internal
+ AS 'MODULE_PATHNAME'
+ LANGUAGE 'C';
+
+
+ And the matching code in the C module could then follow such a skeleton:
+
+
+ Datum my_same(PG_FUNCTION_ARGS);
+ PG_FUNCTION_INFO_V1(my_same);
+
+ /*
+  * Skeleton of the same support function.
+  *
+  * Compares the two values passed as arguments 0 and 1 with my_eq(),
+  * stores the outcome through the bool pointer passed as argument 2 and
+  * returns that pointer.
+  */
+ Datum
+ my_same(PG_FUNCTION_ARGS)
+ {
+     data_type  *v1 = PG_GETARG_DATA_TYPE_P(0);
+     data_type  *v2 = PG_GETARG_DATA_TYPE_P(1);
+     bool       *result = (bool *) PG_GETARG_POINTER(2);
+
+     *result = my_eq(v1, v2);
+     PG_RETURN_POINTER(result);
+ }
+
+
+
+ This is straightforward: even the memory in which to store the
+ boolean result is passed in preallocated.
+