Home > mailing lists
Re: rtree improvements - Mailing list pgsql-patches

From	Bruce Momjian
Subject	Re: rtree improvements
Date	September 26, 2001 12:30:23
Msg-id	200109261627.f8QGRDD26488@candle.pha.pa.us Whole thread Raw
In response to	rtree improvements (Kenneth Been <kennethb@telocity.com>)
List	pgsql-patches
Tree view
Your patch has been added to the PostgreSQL unapplied patches list at:

    http://candle.pha.pa.us/cgi-bin/pgpatches

I will try to apply it within the next 48 hours.


> I have made three changes to the rtree code: one bug fix and
> two performance improvements.  I put an explanation of the
> changes at
>
> http://cs1.cs.nyu.edu/been/postgres-rtree.html
>
> The performance improvements are quite significant.
>
> All the changes are in the file src/backend/access/rtree/rtree.c
>
> I was working with the 7.1.3 code.
>
> I'm including the diff output as an attachment.
>
> Ken
>

> diff -Naur postgresql-7.1.3.old/src/backend/access/rtree/rtree.c postgresql-7.1.3/src/backend/access/rtree/rtree.c
> --- postgresql-7.1.3.old/src/backend/access/rtree/rtree.c    Wed Mar 21 22:59:16 2001
> +++ postgresql-7.1.3/src/backend/access/rtree/rtree.c    Tue Sep 25 13:56:31 2001
> @@ -55,6 +55,14 @@
>      Datum        spl_rdatum;
>  } SPLITVEC;
>
> +/* for sorting tuples by cost, for picking split */
> +typedef struct SPLITCOST
> +{
> +    OffsetNumber    offset_number;
> +    float            cost_differential;
> +    bool            choose_left;
> +} SPLITCOST;
> +
>  typedef struct RTSTATE
>  {
>      FmgrInfo    unionFn;        /* union function */
> @@ -79,6 +87,7 @@
>         RTSTATE *rtstate);
>  static int    nospace(Page p, IndexTuple it);
>  static void initRtstate(RTSTATE *rtstate, Relation index);
> +static int qsort_comp_splitcost(const void *a, const void *b);
>
>
>  Datum
> @@ -427,7 +436,12 @@
>      FunctionCall2(&rtstate->sizeFn, datum,
>                    PointerGetDatum(&newd_size));
>
> -    if (newd_size != old_size)
> +    /*
> +     * If newd_size == 0 we have degenerate rectangles, so we
> +     * don't know if there was any change, so we have to
> +     * assume there was.
> +     */
> +    if ((newd_size == 0) || (newd_size != old_size))
>      {
>          TupleDesc    td = RelationGetDescr(r);
>
> @@ -503,6 +517,8 @@
>      OffsetNumber *spl_left,
>                 *spl_right;
>      TupleDesc    tupDesc;
> +    int            n;
> +    OffsetNumber newitemoff;
>
>      p = (Page) BufferGetPage(buffer);
>      opaque = (RTreePageOpaque) PageGetSpecialPointer(p);
> @@ -539,56 +555,64 @@
>      spl_right = v.spl_right;
>      leftoff = rightoff = FirstOffsetNumber;
>      maxoff = PageGetMaxOffsetNumber(p);
> -    for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
> -    {
> -        itemid = PageGetItemId(p, i);
> -        item = (IndexTuple) PageGetItem(p, itemid);
> -
> -        if (i == *spl_left)
> -        {
> -            if (PageAddItem(left, (Item) item, IndexTupleSize(item),
> -                            leftoff, LP_USED) == InvalidOffsetNumber)
> -                elog(ERROR, "rtdosplit: failed to copy index item in %s",
> -                     RelationGetRelationName(r));
> -            leftoff = OffsetNumberNext(leftoff);
> -            spl_left++;            /* advance in left split vector */
> -        }
> -        else
> -        {
> -            Assert(i == *spl_right);
> -            if (PageAddItem(right, (Item) item, IndexTupleSize(item),
> -                            rightoff, LP_USED) == InvalidOffsetNumber)
> -                elog(ERROR, "rtdosplit: failed to copy index item in %s",
> -                     RelationGetRelationName(r));
> -            rightoff = OffsetNumberNext(rightoff);
> -            spl_right++;        /* advance in right split vector */
> -        }
> -    }
> +    newitemoff = OffsetNumberNext(maxoff);
>
>      /* build an InsertIndexResult for this insertion */
>      res = (InsertIndexResult) palloc(sizeof(InsertIndexResultData));
>
> -    /* now insert the new index tuple */
> -    if (*spl_left == maxoff + 1)
> +    /*
> +     * spl_left contains a list of the offset numbers of the
> +     * tuples that will go to the left page.  For each offset
> +     * number, get the tuple item, then add the item to the
> +     * left page.  Similarly for the right side.
> +     */
> +
> +    /* fill left node */
> +    for (n = 0; n < v.spl_nleft; n++)
>      {
> -        if (PageAddItem(left, (Item) itup, IndexTupleSize(itup),
> +        i = *spl_left;
> +        if (i == newitemoff)
> +            item = itup;
> +        else
> +        {
> +            itemid = PageGetItemId(p, i);
> +            item = (IndexTuple) PageGetItem(p, itemid);
> +        }
> +
> +        if (PageAddItem(left, (Item) item, IndexTupleSize(item),
>                          leftoff, LP_USED) == InvalidOffsetNumber)
>              elog(ERROR, "rtdosplit: failed to add index item to %s",
>                   RelationGetRelationName(r));
>          leftoff = OffsetNumberNext(leftoff);
> -        ItemPointerSet(&(res->pointerData), lbknum, leftoff);
> -        spl_left++;
> +
> +        if (i == newitemoff)
> +            ItemPointerSet(&(res->pointerData), lbknum, leftoff);
> +
> +        spl_left++;        /* advance in left split vector */
>      }
> -    else
> +
> +    /* fill right node */
> +    for (n = 0; n < v.spl_nright; n++)
>      {
> -        Assert(*spl_right == maxoff + 1);
> -        if (PageAddItem(right, (Item) itup, IndexTupleSize(itup),
> +        i = *spl_right;
> +        if (i == newitemoff)
> +            item = itup;
> +        else
> +        {
> +            itemid = PageGetItemId(p, i);
> +            item = (IndexTuple) PageGetItem(p, itemid);
> +        }
> +
> +        if (PageAddItem(right, (Item) item, IndexTupleSize(item),
>                          rightoff, LP_USED) == InvalidOffsetNumber)
>              elog(ERROR, "rtdosplit: failed to add index item to %s",
>                   RelationGetRelationName(r));
>          rightoff = OffsetNumberNext(rightoff);
> -        ItemPointerSet(&(res->pointerData), rbknum, rightoff);
> -        spl_right++;
> +
> +        if (i == newitemoff)
> +            ItemPointerSet(&(res->pointerData), rbknum, rightoff);
> +
> +        spl_right++;        /* advance in right split vector */
>      }
>
>      /* Make sure we consumed all of the split vectors, and release 'em */
> @@ -741,8 +765,10 @@
>   * In addition, the item to be added (itup) is listed in the appropriate
>   * vector.    It is represented by item number N+1 (N = # of items on page).
>   *
> - * Both vectors appear in sequence order with a terminating sentinel value
> - * of InvalidOffsetNumber.
> + * Both vectors have a terminating sentinel value of InvalidOffsetNumber,
> + * but the sentinal value is no longer used, because the SPLITVEC
> + * vector also contains the length of each vector, and that information
> + * is now used to iterate over them in rtdosplit(). --kbb, 21 Sept 2001
>   *
>   * The bounding-box datums for the two new pages are also returned in *v.
>   *
> @@ -797,6 +823,12 @@
>                  item_2_sz,
>                  left_avail_space,
>                  right_avail_space;
> +    int            total_num_tuples,
> +                num_tuples_without_seeds,
> +                max_after_split; /* in Guttman's lingo, (M - m) */
> +    float        diff; /* diff between cost of putting tuple left or right */
> +    SPLITCOST   *cost_vector;
> +    int            n;
>
>      /*
>       * First, make sure the new item is not so large that we can't
> @@ -812,6 +844,9 @@
>      maxoff = PageGetMaxOffsetNumber(page);
>      newitemoff = OffsetNumberNext(maxoff);        /* phony index for new
>                                                   * item */
> +    total_num_tuples = newitemoff;
> +    num_tuples_without_seeds = total_num_tuples - 2;
> +    max_after_split = total_num_tuples / 2;        /* works for m = M/2 */
>
>      /* Make arrays big enough for worst case, including sentinel */
>      nbytes = (maxoff + 2) * sizeof(OffsetNumber);
> @@ -909,47 +944,111 @@
>      right_avail_space = RTPageAvailSpace - IndexTupleTotalSize(item_2);
>
>      /*
> -     * Now split up the regions between the two seeds.    An important
> -     * property of this split algorithm is that the split vector v has the
> -     * indices of items to be split in order in its left and right
> -     * vectors.  We exploit this property by doing a merge in the code
> -     * that actually splits the page.
> +     * Now split up the regions between the two seeds.
> +     *
> +     * The cost_vector array will contain hints for determining where
> +     * each tuple should go.  Each record in the array will contain
> +     * a boolean, choose_left, that indicates which node the tuple
> +     * prefers to be on, and the absolute difference in cost between
> +     * putting the tuple in its favored node and in the other node.
>       *
> -     * For efficiency, we also place the new index tuple in this loop. This
> -     * is handled at the very end, when we have placed all the existing
> -     * tuples and i == maxoff + 1.
> +     * Later, we will sort the cost_vector in descending order by cost
> +     * difference, and consider the tuples in that order for
> +     * placement.  That way, the tuples that *really* want to be in
> +     * one node or the other get to choose first, and the tuples that
> +     * don't really care choose last.
> +     *
> +     * First, build the cost_vector array.  The new index tuple will
> +     * also be handled in this loop, and represented in the array,
> +     * with i==newitemoff.
> +     *
> +     * In the case of variable size tuples it is possible that we only
> +     * have the two seeds and no other tuples, in which case we don't
> +     * do any of this cost_vector stuff.
>       */
> +
> +    /* to keep compiler quiet */
> +    cost_vector = (SPLITCOST *) NULL;
> +
> +    if (num_tuples_without_seeds > 0)
> +    {
> +        cost_vector =
> +            (SPLITCOST *) palloc(num_tuples_without_seeds * sizeof(SPLITCOST));
> +        n = 0;
> +        for (i = FirstOffsetNumber; i <= newitemoff; i = OffsetNumberNext(i))
> +        {
> +            /* Compute new union datums and sizes for both choices */
> +
> +            if ((i == seed_1) || (i == seed_2))
> +                continue;
> +            else if (i == newitemoff)
> +                item_1 = itup;
> +            else
> +                item_1 = (IndexTuple) PageGetItem(page, PageGetItemId(page, i));
> +
> +            datum_alpha = IndexTupleGetDatum(item_1);
> +            union_dl = FunctionCall2(&rtstate->unionFn, datum_l, datum_alpha);
> +            union_dr = FunctionCall2(&rtstate->unionFn, datum_r, datum_alpha);
> +            FunctionCall2(&rtstate->sizeFn, union_dl,
> +                          PointerGetDatum(&size_alpha));
> +            FunctionCall2(&rtstate->sizeFn, union_dr,
> +                          PointerGetDatum(&size_beta));
> +
> +            diff = (size_alpha - size_l) - (size_beta - size_r);
> +
> +            cost_vector[n].offset_number = i;
> +            cost_vector[n].cost_differential = fabs(diff);
> +            cost_vector[n].choose_left = (diff < 0);
> +
> +            n++;
> +        }
> +
> +        /*
> +         * Sort the array.  The function qsort_comp_splitcost is
> +         * set up "backwards", to provided descending order.
> +         */
> +        qsort(cost_vector, num_tuples_without_seeds, sizeof(SPLITCOST),
> +              &qsort_comp_splitcost);
> +    }
> +
> +    /*
> +     * Now make the final decisions about where each tuple will go,
> +     * and build the vectors to return in the SPLITVEC record.
> +     *
> +     * The cost_vector array contains (descriptions of) all the
> +     * tuples, in the order that we want to consider them, so we
> +     * we just iterate through it and place each tuple in left
> +     * or right nodes, according to the criteria described below.
> +     */
> +
>      left = v->spl_left;
>      v->spl_nleft = 0;
>      right = v->spl_right;
>      v->spl_nright = 0;
>
> -    for (i = FirstOffsetNumber; i <= newitemoff; i = OffsetNumberNext(i))
> +    /* Place the seeds first.
> +     * left avail space, left union, right avail space, and right
> +     * union have already been adjusted for the seeds.
> +     */
> +
> +    *left++ = seed_1;
> +    v->spl_nleft++;
> +
> +    *right++ = seed_2;
> +    v->spl_nright++;
> +
> +    for (n = 0; n < num_tuples_without_seeds; n++)
>      {
>          bool        left_feasible,
>                      right_feasible,
>                      choose_left;
>
>          /*
> -         * If we've already decided where to place this item, just put it
> -         * on the correct list.  Otherwise, we need to figure out which
> -         * page needs the least enlargement in order to store the item.
> +         * We need to figure out which page needs the least
> +         * enlargement in order to store the item.
>           */
>
> -        if (i == seed_1)
> -        {
> -            *left++ = i;
> -            v->spl_nleft++;
> -            /* left avail_space & union already includes this one */
> -            continue;
> -        }
> -        if (i == seed_2)
> -        {
> -            *right++ = i;
> -            v->spl_nright++;
> -            /* right avail_space & union already includes this one */
> -            continue;
> -        }
> +        i = cost_vector[n].offset_number;
>
>          /* Compute new union datums and sizes for both possible additions */
>          if (i == newitemoff)
> @@ -979,6 +1078,24 @@
>           * (We know that all the old items together can fit on one page, so
>           * we need not worry about any other problem than failing to fit
>           * the new item.)
> +         *
> +         * Guttman's algorithm actually has two factors to consider (in
> +         * order):  1. if one node has so many tuples already assigned to
> +         * it that the other needs all the rest in order to satisfy the
> +         * condition that neither node has fewer than m tuples, then
> +         * that is decisive; 2. otherwise, choose the page that shows
> +         * the smaller enlargement of its union area.
> +         *
> +         * I have chosen m = M/2, where M is the maximum number of
> +         * tuples on a page.  (Actually, this is only strictly
> +         * true for fixed size tuples.  For variable size tuples,
> +         * there still might have to be only one tuple on a page,
> +         * if it is really big.  But even with variable size
> +         * tuples we still try to get m as close as possible to M/2.)
> +         *
> +         * The question of which page shows the smaller enlargement of
> +         * its union area has already been answered, and the answer
> +         * stored in the choose_left field of the SPLITCOST record.
>           */
>          left_feasible = (left_avail_space >= item_1_sz &&
>                           ((left_avail_space - item_1_sz) >= newitemsz ||
> @@ -988,8 +1105,18 @@
>                             left_avail_space >= newitemsz));
>          if (left_feasible && right_feasible)
>          {
> -            /* Both feasible, use Guttman's algorithm */
> -            choose_left = (size_alpha - size_l < size_beta - size_r);
> +            /*
> +             * Both feasible, use Guttman's algorithm.
> +             * First check the m condition described above, and if
> +             * that doesn't apply, choose the page with the smaller
> +             * enlargement of its union area.
> +             */
> +            if (v->spl_nleft > max_after_split)
> +                choose_left = false;
> +            else if (v->spl_nright > max_after_split)
> +                choose_left = true;
> +            else
> +                choose_left = cost_vector[n].choose_left;
>          }
>          else if (left_feasible)
>              choose_left = true;
> @@ -1023,6 +1150,11 @@
>          }
>      }
>
> +    if (num_tuples_without_seeds > 0)
> +    {
> +        pfree(cost_vector);
> +    }
> +
>      *left = *right = InvalidOffsetNumber;        /* add ending sentinels */
>
>      v->spl_ldatum = datum_l;
> @@ -1152,6 +1284,21 @@
>      fmgr_info(size_proc, &rtstate->sizeFn);
>      fmgr_info(inter_proc, &rtstate->interFn);
>      return;
> +}
> +
> +/* for sorting SPLITCOST records in descending order */
> +static int
> +qsort_comp_splitcost(const void *a, const void *b)
> +{
> +    float diff =
> +        ((SPLITCOST *)a)->cost_differential -
> +        ((SPLITCOST *)b)->cost_differential;
> +    if (diff < 0)
> +        return 1;
> +    else if (diff > 0)
> +        return -1;
> +    else
> +        return 0;
>  }
>
>  #ifdef RTDEBUG

>
> ---------------------------(end of broadcast)---------------------------
> TIP 6: Have you searched our list archives?
>
> http://archives.postgresql.org

--
  Bruce Momjian                        |  http://candle.pha.pa.us
  pgman@candle.pha.pa.us               |  (610) 853-3000
  +  If your life is a hard drive,     |  830 Blythe Avenue
  +  Christ can be your backup.        |  Drexel Hill, Pennsylvania 19026
pgsql-patches by date:
From: Marko Kreen
Date: 26 September 2001, 05:46:58
Subject: Re: pgcrypto Makefile update
From: Bruce Momjian
Date: 26 September 2001, 12:37:13
Subject: Re: pgcrypto Makefile update
Re: rtree improvements - Mailing list pgsql-patches

Previous

Next