*** ../postgresql/doc/src/sgml/gist.sgml Thu Jan 29 23:50:18 2009
--- gist.sgml Wed May 20 17:38:00 2009
***************
*** 92,98 ****
There are seven methods that an index operator class for
! GiST must provide:
--- 92,112 ----
There are seven methods that an index operator class for
! GiST must provide. Correctness of the index is ensured
! by proper implementation of the same>, consistent> and
! union> methods, while efficiency (speed) of the index will depend
! on the penalty> and picksplit> methods.
!
!
!
! The last two are compress> and decompress>, which allow an index to
! have internal tree data of a different type than the data indexed. The
! leaves are to be of the indexed data type, while the other tree nodes
! can be of any C struct (you still have to follow
! PostgreSQL> rules here, see about varlena> for
! variable sized data). If the tree's internal data type exists at the
! SQL level, the STORAGE> option of the CREATE
! OPERATOR CLASS> command can be used.
***************
*** 108,113 ****
--- 122,172 ----
the predicate implies the query (recheck> = false) or
not (recheck> = true).
+
+
+ The SQL> declaration of the function must look like this:
+
+
+ -- SQL-level declaration of the consistent support function; data_type is a
+ -- placeholder for your own type.
+ CREATE OR REPLACE FUNCTION my_consistent(internal, data_type, smallint, oid, internal)
+ RETURNS bool
+ AS 'MODULE_PATHNAME'
+ LANGUAGE C;
+
+
+ And the matching code in the C module could then follow such a skeleton:
+
+
+ Datum my_consistent(PG_FUNCTION_ARGS);
+
+ PG_FUNCTION_INFO_V1(my_consistent);
+
+ /*
+  * my_consistent -- skeleton of the GiST consistent support method.
+  *
+  * Arguments, in the order of the SQL declaration: the index entry
+  * (GISTENTRY *), the query value, the strategy number, an oid (not used
+  * by this skeleton) and a pointer to the recheck flag.  Returns a boolean
+  * computed from strategy, key and query.
+  */
+ Datum
+ my_consistent(PG_FUNCTION_ARGS)
+ {
+     GISTENTRY  *entry = (GISTENTRY *) PG_GETARG_POINTER(0);
+     data_type  *query = PG_GETARG_DATA_TYPE_P(1);
+     StrategyNumber strategy = (StrategyNumber) PG_GETARG_UINT16(2);
+     /* argument 3 (oid) is not needed by this skeleton */
+     bool       *recheck = (bool *) PG_GETARG_POINTER(4);
+     data_type  *key = DatumGetDataType(entry->key);
+     bool        retval = false;
+
+     /*
+      * determine retval value as a function of strategy, key and query.
+      */
+
+     /*
+      * Report whether the answer is exact: set false when the predicate
+      * implies the query, true when it does not.
+      */
+     *recheck = true;
+
+     PG_RETURN_BOOL(retval);
+ }
+
+
+ Here, key> is an element in the index and query> the
+ value being looked up in the index (which can come from a SELECT> or
+ a DML> statement). The StrategyNumber> you get will be set to one
+ of the ones you declare in the corresponding CREATE OPERATOR
+ CLASS> command.
+
+
+
+ Of course the term data_type> in the C code would have to
+ be replaced so that it refers to existing macros, such as
+ PG_GETARG_TEXT_P> and DatumGetTextP>.
+
***************
*** 119,124 ****
--- 178,246 ----
entries, this function generates a new predicate that is true for all
the entries.
+
+
+ The SQL> declaration of the function must look like this:
+
+
+ -- SQL-level declaration of the union support function; data_type is a
+ -- placeholder for your own type, matching what the C function returns.
+ CREATE OR REPLACE FUNCTION my_union(internal, internal)
+ RETURNS data_type
+ AS 'MODULE_PATHNAME'
+ LANGUAGE C;
+
+
+ And the matching code in the C module could then follow such a skeleton:
+
+
+ Datum my_union(PG_FUNCTION_ARGS);
+ PG_FUNCTION_INFO_V1(my_union);
+
+ /*
+  * my_union -- skeleton of the GiST union support method.
+  *
+  * Folds my_union_implementation() over the keys of all the entries of the
+  * vector received as argument 0.  When there is a single entry, a deep
+  * copy of its key is returned rather than the input pointer itself.
+  */
+ Datum
+ my_union(PG_FUNCTION_ARGS)
+ {
+     GistEntryVector *entryvec = (GistEntryVector *) PG_GETARG_POINTER(0);
+     GISTENTRY  *ent = entryvec->vector;
+     int         numranges = entryvec->n;
+     data_type  *out = DatumGetDataType(ent[0].key);
+     int         i;
+
+     if (numranges == 1)
+     {
+         /* do not hand back the input itself, return a fresh copy */
+         PG_RETURN_DATA_TYPE_P(data_type_deep_copy(out));
+     }
+
+     for (i = 1; i < numranges; i++)
+         out = my_union_implementation(out, DatumGetDataType(ent[i].key));
+
+     PG_RETURN_DATA_TYPE_P(out);
+ }
+
+
+
+ As you can see, in this skeleton we're dealing with a data type
+ where union(X, Y, Z) = union(union(X, Y), Z)>. It's easy
+ enough to support data types where this is not the case, by
+ implementing the proper union algorithm and calling it from the
+ GiST> support method.
+
+
+
+ All your union> implementation functions should return
+ pointers to newly palloc()>ed memory. You can't just
+ return the input as it is: it is provided in a
+ MemoryContext> from which
+ PostgreSQL would not be able to store it
+ in the index, should your union> function be called during a
+ CREATE INDEX>.
+
***************
*** 129,134 ****
--- 251,284 ----
Converts the data item into a format suitable for physical storage in
an index page.
+
+
+
+ The SQL> declaration of the function must look like this:
+
+
+ -- SQL-level declaration of the compress support function.
+ CREATE OR REPLACE FUNCTION my_compress(internal)
+ RETURNS internal
+ AS 'MODULE_PATHNAME'
+ LANGUAGE C;
+
+
+ And the matching code in the C module could then follow such a skeleton:
+
+
+ Datum my_compress(PG_FUNCTION_ARGS);
+ PG_FUNCTION_INFO_V1(my_compress);
+
+ /*
+  * my_compress -- hands its first argument back untouched.
+  */
+ Datum
+ my_compress(PG_FUNCTION_ARGS)
+ {
+     void       *entry = PG_GETARG_POINTER(0);
+
+     PG_RETURN_POINTER(entry);
+ }
+
+
+
+ This skeleton is suitable only when you're storing the same data
+ type as the one you're indexing.
+
***************
*** 140,145 ****
--- 290,318 ----
index representation of the data item into a format that can be
manipulated by the database.
+
+
+ The SQL> declaration of the function must look like this:
+
+
+ -- SQL-level declaration of the decompress support function.
+ CREATE OR REPLACE FUNCTION my_decompress(internal)
+ RETURNS internal
+ AS 'MODULE_PATHNAME'
+ LANGUAGE C;
+
+
+ And the matching code in the C module could then follow such a skeleton:
+
+
+ Datum my_decompress(PG_FUNCTION_ARGS);
+ PG_FUNCTION_INFO_V1(my_decompress);
+
+ /*
+  * my_decompress -- hands its first argument back untouched.
+  */
+ Datum
+ my_decompress(PG_FUNCTION_ARGS)
+ {
+     void       *entry = PG_GETARG_POINTER(0);
+
+     PG_RETURN_POINTER(entry);
+ }
+
+
***************
*** 151,156 ****
--- 324,368 ----
entry into a particular branch of the tree. items will be inserted
down the path of least penalty in the tree.
+
+
+ The SQL> declaration of the function must look like this:
+
+
+ -- SQL-level declaration of the penalty support function.
+ CREATE OR REPLACE FUNCTION my_penalty(internal, internal, internal)
+ RETURNS internal
+ AS 'MODULE_PATHNAME'
+ LANGUAGE C STRICT;
+
+
+ And the matching code in the C module could then follow such a skeleton:
+
+
+ Datum my_penalty(PG_FUNCTION_ARGS);
+ PG_FUNCTION_INFO_V1(my_penalty);
+
+ /*
+  * my_penalty -- skeleton of the GiST penalty support method.
+  *
+  * Stores the value computed by my_penalty_implementation() for the keys
+  * of the two entries into the float pointed to by argument 2, and returns
+  * that same pointer.
+  */
+ Datum
+ my_penalty(PG_FUNCTION_ARGS)
+ {
+     GISTENTRY  *existing = (GISTENTRY *) PG_GETARG_POINTER(0);
+     GISTENTRY  *incoming = (GISTENTRY *) PG_GETARG_POINTER(1);
+     float      *result = (float *) PG_GETARG_POINTER(2);
+     data_type  *existing_key = DatumGetDataType(existing->key);
+     data_type  *incoming_key = DatumGetDataType(incoming->key);
+
+     *result = my_penalty_implementation(existing_key, incoming_key);
+
+     PG_RETURN_POINTER(result);
+ }
+
+
+
+
+ The penalty> function is crucial to good performance of the
+ index, both when building and when using it. It'll get used at insertion
+ time to determine which branch to follow when choosing where to add the
+ new entry in the tree. At query time, the more balanced the index, the
+ quicker the lookup.
+
***************
*** 162,167 ****
--- 374,471 ----
the page are to stay on the old page, and which are to move to the new
page.
+
+
+ The SQL> declaration of the function must look like this:
+
+
+ -- SQL-level declaration of the picksplit support function; the two
+ -- arguments are the entry vector and the split vector read by the C code.
+ CREATE OR REPLACE FUNCTION my_picksplit(internal, internal)
+ RETURNS internal
+ AS 'MODULE_PATHNAME'
+ LANGUAGE C STRICT;
+
+
+ And the matching code in the C module could then follow such a skeleton:
+
+
+ Datum my_picksplit(PG_FUNCTION_ARGS);
+ PG_FUNCTION_INFO_V1(my_picksplit);
+
+ /*
+  * my_picksplit -- skeleton of the GiST picksplit support method.
+  *
+  * Distributes the entries of the vector (argument 0) between the left and
+  * right sides of the GIST_SPLITVEC (argument 1), maintaining the union of
+  * each side as it goes, and returns the GIST_SPLITVEC pointer.
+  *
+  * my_choice_is_left() stands for your own placement policy: given both
+  * current unions and the key under consideration, it tells whether that
+  * key goes to the left side.
+  */
+ Datum
+ my_picksplit(PG_FUNCTION_ARGS)
+ {
+     GistEntryVector *entryvec = (GistEntryVector *) PG_GETARG_POINTER(0);
+     GIST_SPLITVEC *v = (GIST_SPLITVEC *) PG_GETARG_POINTER(1);
+     OffsetNumber maxoff = entryvec->n - 1;
+     int         nbytes = (maxoff + 1) * sizeof(OffsetNumber);
+     OffsetNumber *left, *right;
+     data_type  *tmp_union;
+     data_type  *unionL = NULL;
+     data_type  *unionR = NULL;
+     int         i;
+
+     /* each side gets room for every entry, as any of them may land there */
+     v->spl_left = (OffsetNumber *) palloc(nbytes);
+     left = v->spl_left;
+     v->spl_nleft = 0;
+
+     v->spl_right = (OffsetNumber *) palloc(nbytes);
+     right = v->spl_right;
+     v->spl_nright = 0;
+
+     for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
+     {
+         tmp_union = DatumGetDataType(entryvec->vector[i].key);
+         Assert(tmp_union != NULL);
+
+         /*
+          * Choose where to put the index entries and update unionL and
+          * unionR accordingly. Append the entries to either v->spl_left or
+          * v->spl_right, and care about the counters.
+          */
+         if (my_choice_is_left(unionL, unionR, tmp_union))
+         {
+             if (unionL == NULL)
+                 unionL = tmp_union;
+             else
+                 unionL = my_union_implementation(unionL, tmp_union);
+
+             *left = i;
+             ++left;
+             ++(v->spl_nleft);
+         }
+         else
+         {
+             /* same on the right */
+             if (unionR == NULL)
+                 unionR = tmp_union;
+             else
+                 unionR = my_union_implementation(unionR, tmp_union);
+
+             *right = i;
+             ++right;
+             ++(v->spl_nright);
+         }
+     }
+
+     v->spl_ldatum = DataTypeGetDatum(unionL);
+     v->spl_rdatum = DataTypeGetDatum(unionR);
+     PG_RETURN_POINTER(v);
+ }
+
+
+
+ The picksplit> implementation is crucial for optimized index
+ builds. Its implementation, combined with a proper penalty>
+ one, is where the challenge of implementing a performant
+ GiST> index lies.
+
***************
*** 171,176 ****
--- 475,512 ----
Returns true if two entries are identical, false otherwise.
+
+
+ The SQL> declaration of the function must look like this:
+
+
+ -- SQL-level declaration of the same support function; data_type is a
+ -- placeholder for your own type.
+ CREATE OR REPLACE FUNCTION my_same(data_type, data_type, internal)
+ RETURNS internal
+ AS 'MODULE_PATHNAME'
+ LANGUAGE C;
+
+
+ And the matching code in the C module could then follow such a skeleton:
+
+
+ Datum my_same(PG_FUNCTION_ARGS);
+ PG_FUNCTION_INFO_V1(my_same);
+
+ /*
+  * my_same -- skeleton of the GiST same support method.
+  *
+  * Compares the two values received as arguments 0 and 1 with my_eq(),
+  * stores the outcome into the bool pointed to by argument 2, and returns
+  * that same pointer.
+  */
+ Datum
+ my_same(PG_FUNCTION_ARGS)
+ {
+     data_type  *v1 = PG_GETARG_DATA_TYPE_P(0);
+     data_type  *v2 = PG_GETARG_DATA_TYPE_P(1);
+     bool       *result = (bool *) PG_GETARG_POINTER(2);
+
+     *result = my_eq(v1, v2);
+     PG_RETURN_POINTER(result);
+ }
+
+
+
+ This is straightforward: even the memory in which to store the
+ boolean return value is preallocated.
+