*** a/configure.in
--- b/configure.in
*************** AC_MSG_WARN([*** skipping thread test on
*** 1913,1918 ****
--- 1913,1944 ----
  fi
  fi
  
+ # strxfrm stuff
+ if test "$PORTNAME" != "win32"
+ then
+ AC_MSG_CHECKING([strxfrm non-redundancy])
+ 
+ _CFLAGS="$CFLAGS"
+ _LIBS="$LIBS"
+ CFLAGS="$CFLAGS -DIN_CONFIGURE"
+ LIBS="$LIBS"
+ AC_TRY_RUN([#include "$srcdir/src/test/locale/test-strxfrm-redundant.c"],
+   [AC_MSG_RESULT(yes)],
+   [AC_MSG_RESULT(no)
+    AC_DEFINE(PG_BAD_STRXFRM, 1, [Define to 1 if you have strxfrm() with redundant header bytes.])
+   AC_MSG_WARN([strxfrm test program failed
+ This platform has a strxfrm implementation that produces header bytes. Optimization disabled. ])],
+   [AC_MSG_RESULT(maybe)
+   AC_MSG_WARN([
+ *** Skipping strxfrm test program because of cross-compile build.
+ *** Run the program src/test/locale/test-strxfrm-redundant.c on the target machine.
+ ])])
+ CFLAGS="$_CFLAGS"
+ LIBS="$_LIBS"
+ else
+ AC_MSG_WARN([*** skipping strxfrm test on Win32])
+ fi
+ 
  # If compiler will take -Wl,--as-needed (or various platform-specific
  # spellings thereof) then add that to LDFLAGS.  This is much easier than
  # trying to filter LIBS to the minimum for each executable.
*** a/src/backend/commands/analyze.c
--- b/src/backend/commands/analyze.c
*************** compute_scalar_stats(VacAttrStatsP stats
*** 2292,2297 ****
--- 2292,2302 ----
  	/* We always use the default collation for statistics */
  	ssup.ssup_collation = DEFAULT_COLLATION_OID;
  	ssup.ssup_nulls_first = false;
+ 	/*
+ 	 * It isn't feasible to perform poor man's conversion, so opt out of that
+ 	 * additional optimization entirely
+ 	 */
+ 	ssup.type = sortKeyOther;
  
  	PrepareSortSupportFromOrderingOp(mystats->ltopr, &ssup);
  
*** a/src/backend/executor/nodeAgg.c
--- b/src/backend/executor/nodeAgg.c
*************** initialize_aggregates(AggState *aggstate
*** 377,383 ****
  									 peraggstate->sortOperators,
  									 peraggstate->sortCollations,
  									 peraggstate->sortNullsFirst,
! 									 work_mem, false);
  		}
  
  		/*
--- 377,383 ----
  									 peraggstate->sortOperators,
  									 peraggstate->sortCollations,
  									 peraggstate->sortNullsFirst,
! 									 work_mem, -1, false);
  		}
  
  		/*
*** a/src/backend/executor/nodeMergeAppend.c
--- b/src/backend/executor/nodeMergeAppend.c
*************** ExecInitMergeAppend(MergeAppend *node, E
*** 137,142 ****
--- 137,148 ----
  		sortKey->ssup_nulls_first = node->nullsFirst[i];
  		sortKey->ssup_attno = node->sortColIdx[i];
  
+ 		/*
+ 		 * It isn't feasible to perform poor man's conversion, so opt out of
+ 		 * that additional optimization entirely
+ 		 */
+ 		sortKey->type = sortKeyOther;
+ 
  		PrepareSortSupportFromOrderingOp(node->sortOperators[i], sortKey);
  	}
  
*** a/src/backend/executor/nodeMergejoin.c
--- b/src/backend/executor/nodeMergejoin.c
*************** MJExamineQuals(List *mergeclauses,
*** 234,239 ****
--- 234,253 ----
  									 op_lefttype,
  									 op_righttype,
  									 BTSORTSUPPORT_PROC);
+ 
+ 		/*
+ 		 * sortsupport routine must know if poor man's optimization is
+ 		 * applicable in principle.  Tell routine to never use optimization,
+ 		 * since it isn't likely to be useful here.  In practice MJCompare()
+ 		 * probably yields equality more often than not.
+ 		 *
+ 		 * XXX: It might be worth setting this to sortKeyTrueLeading, to make
+ 		 * the comparison func more optimistic about the chances of equality
+ 		 * being indicated.  To do so would be an abuse of the interface,
+ 		 * though.
+ 		 */
+ 		clause->ssup.type = sortKeyOther;
+ 
  		if (OidIsValid(sortfunc))
  		{
  			/* The sort support function should provide a comparator */
*** a/src/backend/executor/nodeSort.c
--- b/src/backend/executor/nodeSort.c
*************** ExecSort(SortState *node)
*** 89,94 ****
--- 89,95 ----
  											  plannode->collations,
  											  plannode->nullsFirst,
  											  work_mem,
+ 											  plannode->plan.plan_rows,
  											  node->randomAccess);
  		if (node->bounded)
  			tuplesort_set_bound(tuplesortstate, node->bound);
*** a/src/backend/lib/Makefile
--- b/src/backend/lib/Makefile
*************** subdir = src/backend/lib
*** 12,17 ****
  top_builddir = ../../..
  include $(top_builddir)/src/Makefile.global
  
! OBJS = ilist.o binaryheap.o stringinfo.o
  
  include $(top_srcdir)/src/backend/common.mk
--- 12,17 ----
  top_builddir = ../../..
  include $(top_builddir)/src/Makefile.global
  
! OBJS = ilist.o binaryheap.o hyperloglog.o stringinfo.o
  
  include $(top_srcdir)/src/backend/common.mk
*** a/src/backend/lib/hyperloglog.c
--- b/src/backend/lib/hyperloglog.c
***************
*** 0 ****
--- 1,202 ----
+ /*-------------------------------------------------------------------------
+  *
+  * hyperloglog.c
+  *	  A simple HyperLogLog cardinality estimator implementation
+  *
+  * Portions Copyright (c) 2014, PostgreSQL Global Development Group
+  *
+  * Based on Hideaki Ohno's C++ implementation.  This is probably not ideally
+  * suited to estimating the cardinality of very large sets;  in particular, we
+  * have not attempted to further optimize the implementation as described in
+  * the Heule, Nunkesser and Hall paper "HyperLogLog in Practice: Algorithmic
+  * Engineering of a State of The Art Cardinality Estimation Algorithm".
+  *
+  * A sparse representation of HyperLogLog state is used, with fixed space
+  * overhead.
+  *
+  * IDENTIFICATION
+  *	  src/backend/lib/hyperloglog.c
+  *
+  *-------------------------------------------------------------------------
+  */
+ 
+ #include "postgres.h"
+ 
+ #include <math.h>
+ 
+ #include "lib/hyperloglog.h"
+ 
+ #define POW_2_32			(4294967296.0)
+ #define NEG_POW_2_32		(-4294967296.0)
+ 
+ static inline uint8 rho(uint32 x, uint8 b);
+ 
+ /*
+  * Set up a HyperLogLog estimator in *cState.
+  *
+  * bwidth is the number of hash bits used to select a register, so the state
+  * holds 2 ^ bwidth registers.  It must lie in the range 4..16 inclusive; any
+  * other value is reported with elog(ERROR).  The register array is allocated
+  * here with palloc0().
+  */
+ void
+ initHyperLogLog(hyperLogLogState *cState, uint8 bwidth)
+ {
+ 	double		biasCorrection;
+ 
+ 	/* Validate the requested width before touching the state */
+ 	if (!(bwidth >= 4 && bwidth <= 16))
+ 		elog(ERROR, "bit width must be between 4 and 16 inclusive");
+ 
+ 	/* Record the geometry of the register array */
+ 	cState->registerWidth = bwidth;
+ 	cState->nRegisters = 1 << bwidth;
+ 	cState->arrSize = sizeof(uint8) * cState->nRegisters + 1;
+ 
+ 	/*
+ 	 * Registers start out as zero rather than negative infinity; see the
+ 	 * discussion of the coupon collector problem in the HyperLogLog paper.
+ 	 */
+ 	cState->hashesArr = palloc0(cState->arrSize);
+ 
+ 	/*
+ 	 * Pick "alpha", the constant that corrects the systematic multiplicative
+ 	 * bias of m ^ 2 Z for a given number of registers (m), where Z is "the
+ 	 * indicator function" from which the estimated cardinality E is derived.
+ 	 * The three smallest register counts have tabulated values; every other
+ 	 * count uses the closed-form approximation.
+ 	 */
+ 	if (cState->nRegisters == 16)
+ 		biasCorrection = 0.673;
+ 	else if (cState->nRegisters == 32)
+ 		biasCorrection = 0.697;
+ 	else if (cState->nRegisters == 64)
+ 		biasCorrection = 0.709;
+ 	else
+ 		biasCorrection = 0.7213 / (1.0 + 1.079 / cState->nRegisters);
+ 
+ 	/*
+ 	 * Cache alpha * m ^ 2 so that estimateHyperLogLog() need not recompute it
+ 	 * each time it forms the "raw" estimate E.
+ 	 */
+ 	cState->alphaMM =
+ 		biasCorrection * cState->nRegisters * cState->nRegisters;
+ }
+ 
+ /*
+  * Feed one element into the estimator, identified by a caller-supplied hash.
+  *
+  * The value must be a genuine hash (typically from hash_any()): the algorithm
+  * relies on bit patterns of well-mixed values plus stochastic averaging.
+  */
+ void
+ addHyperLogLog(hyperLogLogState *cState, uint32 hash)
+ {
+ 	uint32		reg;
+ 	uint8		rank;
+ 
+ 	/* The leading registerWidth bits of the hash select the register */
+ 	reg = hash >> (BITS_PER_BYTE * sizeof(uint32) - cState->registerWidth);
+ 
+ 	/* Rank the remaining bits; a register keeps the largest rank seen */
+ 	rank = rho(hash << cState->registerWidth,
+ 			   BITS_PER_BYTE * sizeof(uint32) - cState->registerWidth);
+ 
+ 	if (rank > cState->hashesArr[reg])
+ 		cState->hashesArr[reg] = rank;
+ }
+ 
+ /*
+  * Report the estimated number of distinct elements added so far.  The state
+  * is only read, never modified.
+  */
+ double
+ estimateHyperLogLog(hyperLogLogState *cState)
+ {
+ 	double		estimate;
+ 	double		harmonic = 0.0;
+ 	int			reg;
+ 
+ 	/* Accumulate 2 ^ -register over every register */
+ 	for (reg = 0; reg < cState->nRegisters; reg++)
+ 		harmonic += 1.0 / pow(2.0, cState->hashesArr[reg]);
+ 
+ 	/* The "raw" HyperLogLog estimate (E in the HyperLogLog paper) */
+ 	estimate = cState->alphaMM / harmonic;
+ 
+ 	if (estimate <= (5.0 / 2.0) * cState->nRegisters)
+ 	{
+ 		/*
+ 		 * Small range correction: switch to an estimate derived from the
+ 		 * number of registers that are still zero, provided there are any.
+ 		 */
+ 		int			empties = 0;
+ 
+ 		for (reg = 0; reg < cState->nRegisters; reg++)
+ 			empties += (cState->hashesArr[reg] == 0) ? 1 : 0;
+ 
+ 		if (empties != 0)
+ 			estimate = cState->nRegisters * log((double) cState->nRegisters /
+ 												empties);
+ 	}
+ 	else if (estimate > (1.0 / 30.0) * POW_2_32)
+ 	{
+ 		/* Large range correction, applied above 2 ^ 32 / 30 */
+ 		estimate = NEG_POW_2_32 * log(1.0 - (estimate / POW_2_32));
+ 	}
+ 
+ 	return estimate;
+ }
+ 
+ /*
+  * Folds the registers of oState into cState, so that cState subsequently
+  * reflects the union of both inputs.  Nothing is returned.
+  *
+  * The number of registers in each must match.
+  */
+ void
+ mergeHyperLogLog(hyperLogLogState *cState, const hyperLogLogState *oState)
+ {
+ 	int		i;
+ 
+ 	if (oState->nRegisters != cState->nRegisters)
+ 		elog(ERROR, "number of registers mismatch: %zu != %zu",
+ 			 cState->nRegisters, oState->nRegisters);
+ 
+ 	/* Keep the larger of the two values in every register */
+ 	for (i = 0; i < cState->nRegisters; i++)
+ 		if (oState->hashesArr[i] > cState->hashesArr[i])
+ 			cState->hashesArr[i] = oState->hashesArr[i];
+ }
+ 
+ 
+ /*
+  * Worker for addHyperLogLog().
+  *
+  * Returns the 1-based position of the first set bit of x, scanning from the
+  * most significant bit towards the least significant and examining at most b
+  * bits.  When none of the examined bits is set, b + 1 is returned.  The result
+  * is the rank that addHyperLogLog() may store in a register.
+  *
+  * Example (when considering first 10 bits of x):
+  *
+  * rho(x = 0b1000000000)   returns 1
+  * rho(x = 0b0010000000)   returns 3
+  * rho(x = 0b0000000000)   returns b + 1
+  *
+  * "The binary address determined by the first b bits of x"
+  */
+ static inline uint8
+ rho(uint32 x, uint8 b)
+ {
+ 	uint8	pos;
+ 
+ 	for (pos = 1; pos <= b; pos++)
+ 	{
+ 		if ((x & 0x80000000) != 0)
+ 			break;
+ 		x <<= 1;
+ 	}
+ 
+ 	return pos;
+ }
*** a/src/backend/utils/adt/orderedsetaggs.c
--- b/src/backend/utils/adt/orderedsetaggs.c
*************** ordered_set_startup(FunctionCallInfo fci
*** 280,286 ****
  												   qstate->sortOperators,
  												   qstate->sortCollations,
  												   qstate->sortNullsFirsts,
! 												   work_mem, false);
  	else
  		osastate->sortstate = tuplesort_begin_datum(qstate->sortColType,
  													qstate->sortOperator,
--- 280,286 ----
  												   qstate->sortOperators,
  												   qstate->sortCollations,
  												   qstate->sortNullsFirsts,
! 												   work_mem, -1, false);
  	else
  		osastate->sortstate = tuplesort_begin_datum(qstate->sortColType,
  													qstate->sortOperator,
*** a/src/backend/utils/adt/varlena.c
--- b/src/backend/utils/adt/varlena.c
***************
*** 17,25 ****
--- 17,27 ----
  #include <ctype.h>
  #include <limits.h>
  
+ #include "access/hash.h"
  #include "access/tuptoaster.h"
  #include "catalog/pg_collation.h"
  #include "catalog/pg_type.h"
+ #include "lib/hyperloglog.h"
  #include "libpq/md5.h"
  #include "libpq/pqformat.h"
  #include "miscadmin.h"
***************
*** 29,34 ****
--- 31,37 ----
  #include "utils/bytea.h"
  #include "utils/lsyscache.h"
  #include "utils/pg_locale.h"
+ #include "utils/sortsupport.h"
  
  
  /* GUC variable */
*************** typedef struct
*** 50,61 ****
--- 53,89 ----
  	int			skiptable[256]; /* skip distance for given mismatched char */
  } TextPositionState;
  
+ typedef struct
+ {
+ 	char	   *buf1;			/* 1st string, or poorman original string buf */
+ 	char	   *buf2;			/* 2nd string, or leading key/poor man blob */
+ 	int			buflen1;		/* allocated size of buf1, in bytes */
+ 	int			buflen2;		/* allocated size of buf2, in bytes */
+ 	hyperLogLogState hlstate;	/* cardinality estimator fed by bttext_convert() */
+ 	double		length;			/* Aggregate string length copied so far */
+ #ifdef HAVE_LOCALE_T
+ 	pg_locale_t locale;			/* collation's locale; 0 for default collation */
+ #endif
+ } TextSortSupport;
+ 
+ /*
+  * This should be large enough that most strings will fit, but small enough
+  * that we feel comfortable putting it on the stack
+  */
+ #define TEXTBUFLEN		1024
+ 
  #define DatumGetUnknownP(X)			((unknown *) PG_DETOAST_DATUM(X))
  #define DatumGetUnknownPCopy(X)		((unknown *) PG_DETOAST_DATUM_COPY(X))
  #define PG_GETARG_UNKNOWN_P(n)		DatumGetUnknownP(PG_GETARG_DATUM(n))
  #define PG_GETARG_UNKNOWN_P_COPY(n) DatumGetUnknownPCopy(PG_GETARG_DATUM(n))
  #define PG_RETURN_UNKNOWN_P(x)		PG_RETURN_POINTER(x)
  
+ static void btpoorman_worker(SortSupport ssup, Oid collid);
+ static int bttextfastcmp_c(Datum x, Datum y, SortSupport ssup);
+ static int bttextfastcmp_locale(Datum x, Datum y, SortSupport ssup);
+ static int bttextcmp_poorman(Datum x, Datum y, SortSupport ssup);
+ static Datum bttext_convert(Datum original, SortSupport ssup);
+ static bool bttext_abort(int memtupcount, double rowhint, SortSupport ssup);
  static int32 text_length(Datum str);
  static text *text_catenate(text *t1, text *t2);
  static text *text_substring(Datum str,
*************** varstr_cmp(char *arg1, int len1, char *a
*** 1356,1365 ****
  	}
  	else
  	{
! #define STACKBUFLEN		1024
! 
! 		char		a1buf[STACKBUFLEN];
! 		char		a2buf[STACKBUFLEN];
  		char	   *a1p,
  				   *a2p;
  
--- 1384,1391 ----
  	}
  	else
  	{
! 		char		a1buf[TEXTBUFLEN];
! 		char		a2buf[TEXTBUFLEN];
  		char	   *a1p,
  				   *a2p;
  
*************** varstr_cmp(char *arg1, int len1, char *a
*** 1393,1416 ****
  			int			a2len;
  			int			r;
  
! 			if (len1 >= STACKBUFLEN / 2)
  			{
  				a1len = len1 * 2 + 2;
  				a1p = palloc(a1len);
  			}
  			else
  			{
! 				a1len = STACKBUFLEN;
  				a1p = a1buf;
  			}
! 			if (len2 >= STACKBUFLEN / 2)
  			{
  				a2len = len2 * 2 + 2;
  				a2p = palloc(a2len);
  			}
  			else
  			{
! 				a2len = STACKBUFLEN;
  				a2p = a2buf;
  			}
  
--- 1419,1442 ----
  			int			a2len;
  			int			r;
  
! 			if (len1 >= TEXTBUFLEN / 2)
  			{
  				a1len = len1 * 2 + 2;
  				a1p = palloc(a1len);
  			}
  			else
  			{
! 				a1len = TEXTBUFLEN;
  				a1p = a1buf;
  			}
! 			if (len2 >= TEXTBUFLEN / 2)
  			{
  				a2len = len2 * 2 + 2;
  				a2p = palloc(a2len);
  			}
  			else
  			{
! 				a2len = TEXTBUFLEN;
  				a2p = a2buf;
  			}
  
*************** varstr_cmp(char *arg1, int len1, char *a
*** 1475,1485 ****
  		}
  #endif   /* WIN32 */
  
! 		if (len1 >= STACKBUFLEN)
  			a1p = (char *) palloc(len1 + 1);
  		else
  			a1p = a1buf;
! 		if (len2 >= STACKBUFLEN)
  			a2p = (char *) palloc(len2 + 1);
  		else
  			a2p = a2buf;
--- 1501,1511 ----
  		}
  #endif   /* WIN32 */
  
! 		if (len1 >= TEXTBUFLEN)
  			a1p = (char *) palloc(len1 + 1);
  		else
  			a1p = a1buf;
! 		if (len2 >= TEXTBUFLEN)
  			a2p = (char *) palloc(len2 + 1);
  		else
  			a2p = a2buf;
*************** bttextcmp(PG_FUNCTION_ARGS)
*** 1683,1688 ****
--- 1709,2204 ----
  	PG_RETURN_INT32(result);
  }
  
+ /* Sort support entry point for text; state is built in ssup->ssup_cxt */
+ Datum
+ bttextsortsupport(PG_FUNCTION_ARGS)
+ {
+ 	SortSupport		ssup = (SortSupport) PG_GETARG_POINTER(0);
+ 	MemoryContext	savedcxt = MemoryContextSwitchTo(ssup->ssup_cxt);
+ 
+ 	/* The worker installs the comparator and any conversion callbacks */
+ 	btpoorman_worker(ssup, ssup->ssup_collation);
+ 
+ 	/* Restore the memory context that was current on entry */
+ 	MemoryContextSwitchTo(savedcxt);
+ 
+ 	PG_RETURN_VOID();
+ }
+ 
+ /*
+  * Worker for bttextsortsupport(): fills in ssup's comparator and callbacks
+  */
+ static void
+ btpoorman_worker(SortSupport ssup, Oid collid)
+ {
+ 	TextSortSupport	   *tss;
+ 
+ 	/*
+ 	 * WIN32 requires complex hacks when the database encoding is UTF-8 (except
+ 	 * when using the "C" collation).  For now, we don't optimize that case.
+ 	 */
+ #if defined(WIN32)
+ 	/*
+ 	 * FIXME:  There is a clear obligation to provide a comparator.  This
+ 	 * early return is therefore unacceptable.  It seems pretty ugly to
+ 	 * separately prepare a shim routine here, especially since there is no
+ 	 * convenient way to do a reverse lookup to get an ordering operator from
+ 	 * sortsupport state.  Something must be done, though.
+ 	 */
+ 	if (GetDatabaseEncoding() == PG_UTF8)
+ 		return;
+ #endif
+ 
+ 	/*
+ 	 * We are conservative about applying the poor man's normalized key
+ 	 * optimization in cases where it might be less effective.  In order to
+ 	 * apply that optimization, we require:
+ 	 *
+ 	 * 	* That the platform's strxfrm() meet a certain standard for
+ 	 * 	representing as much information as possible in leading bytes.
+ 	 *
+ 	 *	* That there are a full 8 bytes of storage per Datum on the platform,
+ 	 *	since we pack bytes into that representation.  Having only 4 bytes
+ 	 *	could make worst case performance drastically more likely.
+ 	 *
+ 	 * Still, there is no reason to not perform fmgr elision on these
+ 	 * platforms.
+ 	 */
+ #if defined(PG_BAD_STRXFRM) || SIZEOF_DATUM != 8
+ 	ssup->type = sortKeyOther;
+ #endif
+ 
+ 	/*
+ 	 * We may need a collation-sensitive comparison.  To make things faster,
+ 	 * we'll figure out the collation based on the locale id and cache the
+ 	 * result.  Also, since strxfrm()/strcoll() require NULL-terminated inputs,
+ 	 * prepare one or two palloc'd buffers to use as temporary workspace.  In
+ 	 * the ad-hoc comparison case we only use palloc'd buffers when we need
+ 	 * more space than we're comfortable allocating on the stack, but here we
+ 	 * can keep the buffers around for the whole sort, so it makes sense to
+ 	 * allocate them once and use them unconditionally (although we won't need
+ 	 * them when sorting proper begins and strxfrm() conversion has already
+ 	 * occurred, when sorting a poor man's key).
+ 	 */
+ 	tss = palloc(sizeof(TextSortSupport));
+ #ifdef HAVE_LOCALE_T
+ 	tss->locale = 0;
+ #endif
+ 
+ 	if (collid != DEFAULT_COLLATION_OID)
+ 	{
+ 		if (!OidIsValid(collid))
+ 		{
+ 			/*
+ 			 * This typically means that the parser could not resolve a
+ 			 * conflict of implicit collations, so report it that way.
+ 			 */
+ 			ereport(ERROR,
+ 					(errcode(ERRCODE_INDETERMINATE_COLLATION),
+ 					 errmsg("could not determine which collation to use for string comparison"),
+ 					 errhint("Use the COLLATE clause to set the collation explicitly.")));
+ 		}
+ #ifdef HAVE_LOCALE_T
+ 		tss->locale = pg_newlocale_from_collation(collid);
+ #endif
+ 	}
+ 
+ 	tss->buf1 = palloc(TEXTBUFLEN);
+ 	tss->buflen1 = TEXTBUFLEN;
+ 	tss->buf2 = palloc(TEXTBUFLEN);
+ 	tss->buflen2 = TEXTBUFLEN;
+ 	tss->length = 0;
+ 
+ 	ssup->ssup_extra = tss;
+ 	if (ssup->type != sortKeyPoorman)
+ 	{
+ 		/*
+ 		 * If LC_COLLATE = C, we can make things quite a bit faster by using
+ 		 * memcmp() rather than strcoll().  To minimize the per-comparison
+ 		 * overhead, we make this decision just once for the whole sort.
+ 		 */
+ 		if (lc_collate_is_c(collid))
+ 			ssup->comparator = bttextfastcmp_c;
+ 		else
+ 			ssup->comparator = bttextfastcmp_locale;
+ 
+ 		ssup->converter = NULL;
+ 		ssup->abort_conversion = NULL;
+ 		ssup->proper = NULL;
+ 		return;
+ 	}
+ 
+ 	initHyperLogLog(&tss->hlstate, 10);
+ 
+ 	ssup->comparator = bttextcmp_poorman;
+ 	ssup->converter = bttext_convert;
+ 	ssup->abort_conversion = bttext_abort;
+ 
+ 	ssup->proper = palloc0(sizeof(SortSupportData));
+ 	ssup->proper->ssup_cxt = ssup->ssup_cxt;
+ 	ssup->proper->ssup_collation = ssup->ssup_collation;
+ 	ssup->proper->ssup_reverse = ssup->ssup_reverse;
+ 	ssup->proper->ssup_nulls_first = ssup->ssup_nulls_first;
+ 	ssup->proper->ssup_attno = ssup->ssup_attno;
+ 
+ 	/*
+ 	 * Initialize the "proper" sortsupport state with a reliable
+ 	 * strcoll()-based comparison func for tie-breaking.
+ 	 */
+ 	ssup->proper->type = sortKeyTrueLeading;
+ 	btpoorman_worker(ssup->proper, collid);
+ }
+ 
+ /*
+  * sortsupport comparator for text under the C locale: a bytewise memcmp() of
+  * the common prefix, with the shorter string sorting first on a tie.
+  */
+ static int
+ bttextfastcmp_c(Datum x, Datum y, SortSupport ssup)
+ {
+ 	text	   *lhs = DatumGetTextPP(x);
+ 	text	   *rhs = DatumGetTextPP(y);
+ 	int			lhslen = VARSIZE_ANY_EXHDR(lhs);
+ 	int			rhslen = VARSIZE_ANY_EXHDR(rhs);
+ 	int			cmp;
+ 
+ 	cmp = memcmp(VARDATA_ANY(lhs), VARDATA_ANY(rhs), Min(lhslen, rhslen));
+ 
+ 	/* Equal over the common prefix: order by length instead */
+ 	if (cmp == 0)
+ 	{
+ 		if (lhslen < rhslen)
+ 			cmp = -1;
+ 		else if (lhslen > rhslen)
+ 			cmp = 1;
+ 	}
+ 
+ 	/* Free whatever DatumGetTextPP() handed back that isn't the input */
+ 	if (x != PointerGetDatum(lhs))
+ 		pfree(lhs);
+ 	if (y != PointerGetDatum(rhs))
+ 		pfree(rhs);
+ 
+ 	return cmp;
+ }
+ 
+ /*
+  * sortsupport comparison func (for locale case)
+  */
+ static int
+ bttextfastcmp_locale(Datum x, Datum y, SortSupport ssup)
+ {
+ 	text			   *arg1 = DatumGetTextPP(x);
+ 	text			   *arg2 = DatumGetTextPP(y);
+ 	TextSortSupport	   *tss = (TextSortSupport *) ssup->ssup_extra;
+ 
+ 	/* working state */
+ 	char			   *a1p,
+ 					   *a2p;
+ 	int					len1,
+ 						len2,
+ 						result;
+ 
+ 	a1p = VARDATA_ANY(arg1);
+ 	a2p = VARDATA_ANY(arg2);
+ 
+ 	len1 = VARSIZE_ANY_EXHDR(arg1);
+ 	len2 = VARSIZE_ANY_EXHDR(arg2);
+ 
+ 	/*
+ 	 * "True" leading key: we are the fully reliable tie-breaker for a poor
+ 	 * man's normalized key comparison, so there is a good chance the keys are
+ 	 * bytewise equal.  Try a cheap memcmp() before paying for strcoll().  On
+ 	 * a match, leave through "done" so detoasted copies are still freed.
+ 	 */
+ 	if (ssup->type == sortKeyTrueLeading && len1 == len2 &&
+ 		memcmp(a1p, a2p, len1) == 0)
+ 	{
+ 		result = 0;
+ 		goto done;
+ 	}
+ 
+ 	/*
+ 	 * Grow the workspace until the string and its terminator fit.  Doubling
+ 	 * just once is not enough when a string is more than twice as long as the
+ 	 * current buffer, so take the larger of the doubled size and len + 1.
+ 	 */
+ 	if (len1 >= tss->buflen1)
+ 	{
+ 		pfree(tss->buf1);
+ 		tss->buflen1 = Max(len1 + 1, tss->buflen1 * 2);
+ 		tss->buf1 = MemoryContextAlloc(ssup->ssup_cxt, tss->buflen1);
+ 	}
+ 	if (len2 >= tss->buflen2)
+ 	{
+ 		pfree(tss->buf2);
+ 		tss->buflen2 = Max(len2 + 1, tss->buflen2 * 2);
+ 		tss->buf2 = MemoryContextAlloc(ssup->ssup_cxt, tss->buflen2);
+ 	}
+ 
+ 	memcpy(tss->buf1, a1p, len1);
+ 	tss->buf1[len1] = '\0';
+ 	memcpy(tss->buf2, a2p, len2);
+ 	tss->buf2[len2] = '\0';
+ 
+ #ifdef HAVE_LOCALE_T
+ 	if (tss->locale)
+ 		result = strcoll_l(tss->buf1, tss->buf2, tss->locale);
+ 	else
+ #endif
+ 		result = strcoll(tss->buf1, tss->buf2);
+ 
+ 	/*
+ 	 * In some locales strcoll() can claim that nonidentical strings are equal.
+ 	 * Believing that would be bad news for a number of reasons, so we follow
+ 	 * Perl's lead and sort "equal" strings according to strcmp().
+ 	 */
+ 	if (result == 0)
+ 		result = strcmp(tss->buf1, tss->buf2);
+ 
+ done:
+ 	/* We can't afford to leak memory here. */
+ 	if (PointerGetDatum(arg1) != x)
+ 		pfree(arg1);
+ 	if (PointerGetDatum(arg2) != y)
+ 		pfree(arg2);
+ 
+ 	return result;
+ }
+ 
+ /*
+  * Poor man's normalized key comparison func.  Both arguments are pass-by-value
+  * keys, as produced by bttext_convert(), and are compared bytewise over the
+  * full width of a Datum.
+  */
+ static int
+ bttextcmp_poorman(Datum x, Datum y, SortSupport ssup)
+ {
+ 	char   *xbytes = (char *) &x;
+ 	char   *ybytes = (char *) &y;
+ 
+ 	/*
+ 	 * A result of 0 is inconclusive: the core system will then call
+ 	 * bttextfastcmp_locale() or bttextfastcmp_c().  Even a strcmp() on two
+ 	 * non-truncated strxfrm() blobs cannot indicate *equality* reliably, for
+ 	 * the same reason that there is a strcoll() strcmp() tie-breaker
+ 	 * elsewhere (there'd still need to be a strcmp() tie-breaker on the
+ 	 * *original* string).
+ 	 *
+ 	 * XXX:  In principle it ought to be possible to tell the core system that
+ 	 * we really do know that the two strings are fully equal iff the C
+ 	 * collation is used, presuming that the core system could also somehow
+ 	 * differentiate between this case and the case where we have a truncated C
+ 	 * collated string that we have no firm conclusion on (perhaps a fourth,
+ 	 * magical return value could be used while restricting all other return
+ 	 * values to (-1, 0, 1)).  It doesn't seem worth the trouble of surfacing
+ 	 * that distinction generally, though.
+ 	 */
+ 	return memcmp(xbytes, ybytes, sizeof(Datum));
+ }
+ 
+ /*
+  * Conversion routine for sortsupport.  Converts text to a poor man's
+  * normalized key by packing the leading bytes of its strxfrm() blob into a
+  * pass-by-value Datum, and updates the statistics that bttext_abort() reads.
+  */
+ static Datum
+ bttext_convert(Datum original, SortSupport ssup)
+ {
+ 	TextSortSupport	   *tss = (TextSortSupport *) ssup->ssup_extra;
+ 	text			   *full = DatumGetTextPP(original);
+ 
+ 	/* working state */
+ 	Datum				res;
+ 	char			   *pres;
+ 	int					len;
+ 	Size				bsize;
+ 	uint32				lohalf,
+ 						hihalf,
+ 						hash;
+ 
+ 	/*
+ 	 * Convert text into a "poor man's normalized key".  This is a
+ 	 * pass-by-value Datum that is treated as a char array by the specialized
+ 	 * comparator bttextcmp_poorman().
+ 	 */
+ 	pres = (char *) &res;
+ 	/* memset() so non-copied bytes are always NUL */
+ 	memset(pres, 0, sizeof(Datum));
+ 	len = VARSIZE_ANY_EXHDR(full);
+ 
+ 	/* Buffer 1 holds the NUL-terminated text; it must fit len + 1 bytes */
+ 	if (len >= tss->buflen1)
+ 	{
+ 		pfree(tss->buf1);
+ 		tss->buflen1 = Max(len + 1, tss->buflen1 * 2);
+ 		tss->buf1 = MemoryContextAlloc(ssup->ssup_cxt, tss->buflen1);
+ 	}
+ 
+ 	/* Just like strcoll(), strxfrm() expects a NUL-terminated string */
+ 	memcpy(tss->buf1, VARDATA_ANY(full), len);
+ 	tss->buf1[len] = '\0';
+ 
+ retry:
+ 
+ 	/*
+ 	 * Note the lack of any special handling of the C locale here.  strxfrm()
+ 	 * is used indifferently.
+ 	 */
+ #ifdef HAVE_LOCALE_T
+ 	if (tss->locale)
+ 		bsize = strxfrm_l(tss->buf2, tss->buf1, tss->buflen2, tss->locale);
+ 	else
+ #endif
+ 		bsize = strxfrm(tss->buf2, tss->buf1, tss->buflen2);
+ 
+ 	if (bsize >= tss->buflen2)
+ 	{
+ 		/*
+ 		 * The C standard states that the contents of the buffer is now
+ 		 * unspecified.  Grow buffer in the sort's context, and retry.
+ 		 */
+ 		pfree(tss->buf2);
+ 		tss->buflen2 = Max(bsize + 1, tss->buflen2 * 2);
+ 		tss->buf2 = MemoryContextAlloc(ssup->ssup_cxt, tss->buflen2);
+ 		goto retry;
+ 	}
+ 
+ 	memcpy(pres, tss->buf2, Min(sizeof(Datum), bsize));
+ 
+ 	/*
+ 	 * Maintain approximate cardinality of poor man's keys using HyperLogLog.
+ 	 * Form 32-bit hash from packed 64-bit Datum representation.  Used as cheap
+ 	 * insurance against the worst case, where we do many string
+ 	 * transformations for savings in full strcoll()-based comparisons.
+ 	 */
+ 	lohalf = (uint32) res;
+ 	hihalf = (uint32) (res >> 32);
+ 	hash = hash_uint32(lohalf ^ hihalf);
+ 
+ 	addHyperLogLog(&tss->hlstate, hash);
+ 
+ 	/* Maintain total length of all strings, again for worst case prevention */
+ 	tss->length += len;
+ 
+ 	/* Don't leak a detoasted copy of the original datum */
+ 	if (PointerGetDatum(full) != original)
+ 		pfree(full);
+ 
+ 	/*
+ 	 * Bytes of the Datum beyond the end of a short blob remain zero (see
+ 	 * memset() above), so keys always compare over the full Datum width.
+ 	 */
+ 	return res;
+ }
+ 
+ /*
+  * Callback for assessing projected effectiveness of poor man's normalized key
+  * optimization, using heuristic rules.  Returns true if the optimization is
+  * estimated not to be worth it and should be aborted, false to carry on.
+  */
+ static bool
+ bttext_abort(int memtupcount, double rowhint, SortSupport ssup)
+ {
+ 	TextSortSupport	   *tss = (TextSortSupport *) ssup->ssup_extra;
+ 	double				est_distinct,
+ 						normalized_cardinality,
+ 						avg_text_len;
+ 
+ 	Assert(ssup->type == sortKeyPoorman);
+ 
+ 	avg_text_len = tss->length / (double) memtupcount;
+ 
+ 	if (rowhint > 5 && avg_text_len < 64)
+ 	{
+ 		double		normalized_rows_to_process;
+ 
+ 		normalized_rows_to_process = (rowhint - memtupcount) / rowhint;
+ 
+ 		if (normalized_rows_to_process > 0.90)
+ 		{
+ 			/*
+ 			 * Be patient -- don't consider aborting until we've processed an
+ 			 * estimated 10% of all rows to be sorted.
+ 			 */
+ #ifdef DEBUG_POORMAN_KEYS
+ 			elog(DEBUG1, "normalization patiently waited after %d tuples of %f",
+ 				 memtupcount, rowhint);
+ #endif
+ 			return false;
+ 		}
+ 
+ 		/*
+ 		 * Because the core code calls here at geometrically spaced intervals,
+ 		 * there is little point in ensuring that we don't abort the
+ 		 * normalization process too late, when the costs are mostly sunk and
+ 		 * it's probably worth proceeding with a marginal case.
+ 		 */
+ 	}
+ 
+ 	est_distinct = estimateHyperLogLog(&tss->hlstate);
+ 	normalized_cardinality = est_distinct / (double) memtupcount;
+ 
+ 	/*
+ 	 * We're concerned about weighing the costs of the poor man's optimization
+ 	 * against its probable benefit.
+ 	 *
+ 	 * The dominant cost is strxfrm() transformation for large strings, and not
+ 	 * extra bttextcmp_poorman() calls.  However, provided poor man's keys have
+ 	 * a high cardinality, it doesn't matter how expensive it is, because each
+ 	 * early transformation is very likely beneficial.  For smaller strings the
+ 	 * cost of inefficient use of CPU cache will dominate, and so a much less
+ 	 * stringent standard for cardinality is applied.
+ 	 */
+ #ifdef DEBUG_POORMAN_KEYS
+ 	elog(DEBUG1, "est_distinct after %d: %f (normalized_cardinality: %f, avg_len: %f)",
+ 		 memtupcount, est_distinct, normalized_cardinality, avg_text_len);
+ #endif
+ 
+ 	if (avg_text_len < 7.5)
+ 	{
+ 		/*
+ 		 * Very unlikely to lose with many short strings.  The key cardinality
+ 		 * doesn't much matter, because our tie-breaker saves a second
+ 		 * transformation (i.e.  strcoll() call) by performing a memcmp().
+ 		 */
+ 		;
+ 	}
+ 	else if (avg_text_len < 12)
+ 	{
+ 		if (normalized_cardinality < 0.00001)
+ 			goto abort;
+ 	}
+ 	else if (avg_text_len < 16)
+ 	{
+ 		if (normalized_cardinality < 0.0001)
+ 			goto abort;
+ 	}
+ 	else if (avg_text_len < 32)
+ 	{
+ 		if (normalized_cardinality < 0.001)
+ 			goto abort;
+ 	}
+ 	else if (avg_text_len < 64)
+ 	{
+ 		if (normalized_cardinality < 0.3)
+ 			goto abort;
+ 	}
+ 	else
+ 	{
+ 		if (normalized_cardinality < 0.65)
+ 			goto abort;
+ 	}
+ 
+ 	return false;
+ 
+ abort:
+ #ifdef DEBUG_POORMAN_KEYS
+ 	elog(DEBUG1, "aborted poorman normalization due to worst-case at %d",
+ 		 memtupcount);
+ #endif
+ 	return true;
+ }
  
  Datum
  text_larger(PG_FUNCTION_ARGS)
*** a/src/backend/utils/sort/sortsupport.c
--- b/src/backend/utils/sort/sortsupport.c
*************** PrepareSortSupportComparisonShim(Oid cmp
*** 82,87 ****
--- 82,90 ----
  
  	ssup->ssup_extra = extra;
  	ssup->comparator = comparison_shim;
+ 	ssup->proper = NULL;
+ 	ssup->converter = NULL;
+ 	ssup->abort_conversion = NULL;
  }
  
  /*
*************** PrepareSortSupportFromOrderingOp(Oid ord
*** 104,109 ****
--- 107,115 ----
  		elog(ERROR, "operator %u is not a valid ordering operator",
  			 orderingOp);
  
+ 	/* For now, make sure converter is NULL - opclass routine may set it */
+ 	ssup->converter = NULL;
+ 
  	if (issupport)
  	{
  		/* The sort support function should provide a comparator */
*** a/src/backend/utils/sort/tuplesort.c
--- b/src/backend/utils/sort/tuplesort.c
*************** bool		optimize_bounded_sort = true;
*** 150,156 ****
   * When sorting single Datums, the data value is represented directly by
   * datum1/isnull1.  If the datatype is pass-by-reference and isnull1 is false,
   * then datum1 points to a separately palloc'd data value that is also pointed
!  * to by the "tuple" pointer; otherwise "tuple" is NULL.
   *
   * While building initial runs, tupindex holds the tuple's run number.  During
   * merge passes, we re-use it to hold the input tape number that each tuple in
--- 150,159 ----
   * When sorting single Datums, the data value is represented directly by
   * datum1/isnull1.  If the datatype is pass-by-reference and isnull1 is false,
   * then datum1 points to a separately palloc'd data value that is also pointed
!  * to by the "tuple" pointer; otherwise "tuple" is NULL.  Note that there are
!  * some exceptions, as when the sort support infrastructure provides a "poor
!  * man's normalized key" representation.   When that occurs, extra precautions
!  * are taken when a comparison involving a pair of datum1s returns 0.
   *
   * While building initial runs, tupindex holds the tuple's run number.  During
   * merge passes, we re-use it to hold the input tape number that each tuple in
*************** struct Tuplesortstate
*** 353,358 ****
--- 356,373 ----
  	SortSupport onlyKey;
  
  	/*
+ 	 * Additional state for managing "poor man's normalized key" sortsupport
+ 	 * routines.  Feedback as to how effective the optimization is likely to be
+ 	 * is received from all opclasses that support this additional capability.
+ 	 * This gives us some reassurance that in the worst case (when all
+ 	 * normalized keys are the same), the process can be aborted before wasting
+ 	 * too many cycles on the normalization process.
+ 	 */
+ 	int			nextpoorcheck;	/* Tuple # at which to check applicability */
+ 	bool		aborted;		/* Normalization process aborted */
+ 	double		rowsHint;		/* Hint of total rows to be sorted */
+ 
+ 	/*
  	 * These variables are specific to the CLUSTER case; they are set by
  	 * tuplesort_begin_cluster.  Note CLUSTER also uses tupDesc and
  	 * indexScanKey.
*************** tuplesort_begin_heap(TupleDesc tupDesc,
*** 600,606 ****
  					 int nkeys, AttrNumber *attNums,
  					 Oid *sortOperators, Oid *sortCollations,
  					 bool *nullsFirstFlags,
! 					 int workMem, bool randomAccess)
  {
  	Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess);
  	MemoryContext oldcontext;
--- 615,622 ----
  					 int nkeys, AttrNumber *attNums,
  					 Oid *sortOperators, Oid *sortCollations,
  					 bool *nullsFirstFlags,
! 					 int workMem, double projectedTups,
! 					 bool randomAccess)
  {
  	Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess);
  	MemoryContext oldcontext;
*************** tuplesort_begin_heap(TupleDesc tupDesc,
*** 632,637 ****
--- 648,655 ----
  	state->reversedirection = reversedirection_heap;
  
  	state->tupDesc = tupDesc;	/* assume we need not copy tupDesc */
+ 	state->nextpoorcheck = 5; 	/* Next check of poor man's applicability */
+ 	state->rowsHint = projectedTups; /* Hint to poor man's applicability test */
  
  	/* Prepare SortSupport data for each column */
  	state->sortKeys = (SortSupport) palloc0(nkeys * sizeof(SortSupportData));
*************** tuplesort_begin_heap(TupleDesc tupDesc,
*** 648,657 ****
  		sortKey->ssup_nulls_first = nullsFirstFlags[i];
  		sortKey->ssup_attno = attNums[i];
  
  		PrepareSortSupportFromOrderingOp(sortOperators[i], sortKey);
  	}
  
! 	if (nkeys == 1)
  		state->onlyKey = state->sortKeys;
  
  	MemoryContextSwitchTo(oldcontext);
--- 666,691 ----
  		sortKey->ssup_nulls_first = nullsFirstFlags[i];
  		sortKey->ssup_attno = attNums[i];
  
+ 		/*
+ 		 * Must convey to sortsupport routine if poor man's optimization is
+ 		 * applicable in principle
+ 		 */
+ 		if (i == 0)
+ 			sortKey->type = sortKeyPoorman;
+ 		else
+ 			sortKey->type = sortKeyOther;
+ 
  		PrepareSortSupportFromOrderingOp(sortOperators[i], sortKey);
  	}
  
! 	/*
! 	 * The "onlyKey" optimization cannot be used when a tie-breaker for an
! 	 * unreliable poor man's normalized key comparison is required.  Typically,
! 	 * the optimization is only of significant value to pass-by-value types
! 	 * anyway, whereas poor man's normalized keys are typically used by
! 	 * pass-by-reference types.
! 	 */
! 	if (nkeys == 1 && !state->sortKeys->converter)
  		state->onlyKey = state->sortKeys;
  
  	MemoryContextSwitchTo(oldcontext);
*************** tuplesort_begin_datum(Oid datumType, Oid
*** 838,843 ****
--- 872,890 ----
  	/* Prepare SortSupport data */
  	state->onlyKey = (SortSupport) palloc0(sizeof(SortSupportData));
  
+ 	/*
+ 	 * "Other" key, because conversion to poor man's representation is
+ 	 * infeasible in the Datum case.  This is not a "leading key", because
+ 	 * those are only set by sortsupport routines.  If we set this to
+ 	 * sortKeyTrueLeading, we'd be making a misrepresentation to the
+ 	 * sortsupport routine (that there was a prior, unreliable comparison that
+ 	 * now needs a tie-breaker).
+ 	 *
+ 	 * XXX: It may be worth having our callers arrange to do a poor man's
+ 	 * normalization pass themselves, and represent to us that they'll do so
+ 	 * here, so that the datum case can avail of the optimization too.
+ 	 */
+ 	state->onlyKey->type = sortKeyOther;
  	state->onlyKey->ssup_cxt = CurrentMemoryContext;
  	state->onlyKey->ssup_collation = sortCollation;
  	state->onlyKey->ssup_nulls_first = nullsFirstFlag;
*************** comparetup_heap(const SortTuple *a, cons
*** 2858,2869 ****
  	int			nkey;
  	int32		compare;
  
! 	/* Compare the leading sort key */
! 	compare = ApplySortComparator(a->datum1, a->isnull1,
! 								  b->datum1, b->isnull1,
! 								  sortKey);
! 	if (compare != 0)
! 		return compare;
  
  	/* Compare additional sort keys */
  	ltup.t_len = ((MinimalTuple) a->tuple)->t_len + MINIMAL_TUPLE_OFFSET;
--- 2905,2919 ----
  	int			nkey;
  	int32		compare;
  
! 	if (!state->aborted)
! 	{
! 		/* Compare the leading sort key */
! 		compare = ApplySortComparator(a->datum1, a->isnull1,
! 									  b->datum1, b->isnull1,
! 									  sortKey);
! 		if (compare != 0)
! 			return compare;
! 	}
  
  	/* Compare additional sort keys */
  	ltup.t_len = ((MinimalTuple) a->tuple)->t_len + MINIMAL_TUPLE_OFFSET;
*************** comparetup_heap(const SortTuple *a, cons
*** 2871,2876 ****
--- 2921,2953 ----
  	rtup.t_len = ((MinimalTuple) b->tuple)->t_len + MINIMAL_TUPLE_OFFSET;
  	rtup.t_data = (HeapTupleHeader) ((char *) b->tuple - MINIMAL_TUPLE_OFFSET);
  	tupDesc = state->tupDesc;
+ 
+ 	/*
+ 	 * If a leading poor man's comparison returned 0 or normalization strategy
+ 	 * was abandoned, call "true leading" key's comparator
+ 	 */
+ 	if (state->sortKeys->converter)
+ 	{
+ 		AttrNumber	attno = sortKey->ssup_attno;
+ 		Datum		datum1,
+ 					datum2;
+ 		bool		isnull1,
+ 					isnull2;
+ 
+ 		Assert(attno == sortKey->proper->ssup_attno);
+ 		Assert(sortKey->type == sortKeyPoorman);
+ 		Assert(sortKey->proper->type == sortKeyTrueLeading);
+ 
+ 		datum1 = heap_getattr(&ltup, attno, tupDesc, &isnull1);
+ 		datum2 = heap_getattr(&rtup, attno, tupDesc, &isnull2);
+ 
+ 		compare = ApplySortComparator(datum1, isnull1,
+ 									  datum2, isnull2,
+ 									  sortKey->proper);
+ 		if (compare != 0)
+ 			return compare;
+ 	}
+ 
  	sortKey++;
  	for (nkey = 1; nkey < state->nKeys; nkey++, sortKey++)
  	{
*************** copytup_heap(Tuplesortstate *state, Sort
*** 2911,2920 ****
  	/* set up first-column key value */
  	htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET;
  	htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET);
! 	stup->datum1 = heap_getattr(&htup,
! 								state->sortKeys[0].ssup_attno,
! 								state->tupDesc,
! 								&stup->isnull1);
  }
  
  static void
--- 2988,3036 ----
  	/* set up first-column key value */
  	htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET;
  	htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET);
! 
! 	/* Once aborted, we give up on storing anything in datum1 entirely */
! 	if (state->aborted)
! 		return;
! 
! 	if (!state->sortKeys->converter)
! 	{
! 		/* Store ordinary Datum representation */
! 		stup->datum1 = heap_getattr(&htup,
! 									state->sortKeys[0].ssup_attno,
! 									state->tupDesc,
! 									&stup->isnull1);
! 	}
! 	else
! 	{
! 		Datum		original;
! 
! 		/*
! 		 * Store "poor man's normalized key", which cannot indicate equality in
! 		 * a trustworthy manner, and may require a tie-breaker
! 		 */
! 		original = heap_getattr(&htup, state->sortKeys[0].ssup_attno,
! 								state->tupDesc, &stup->isnull1);
! 
! 		if (stup->isnull1)
! 			stup->datum1 = original;
! 		else
! 			stup->datum1 = state->sortKeys->converter(original,
! 													  state->sortKeys);
! 
! 		/* Check effectiveness of optimization */
! 		if (state->memtupcount >= state->nextpoorcheck)
! 		{
! 			state->nextpoorcheck *= 2;
! 			if (state->sortKeys->abort_conversion(state->memtupcount,
! 												  state->rowsHint,
! 												  state->sortKeys))
! 			{
! 				/* Additional optimization did not work out -- give up */
! 				state->aborted = true;
! 			}
! 		}
! 	}
  }
  
  static void
*************** reversedirection_heap(Tuplesortstate *st
*** 2980,2985 ****
--- 3096,3112 ----
  		sortKey->ssup_reverse = !sortKey->ssup_reverse;
  		sortKey->ssup_nulls_first = !sortKey->ssup_nulls_first;
  	}
+ 
+ 	/* If poor man's optimization is used, update "key proper" */
+ 	if (state->sortKeys->proper)
+ 	{
+ 		sortKey = state->sortKeys->proper;
+ 
+ 		Assert(sortKey->type == sortKeyTrueLeading);
+ 		sortKey->ssup_reverse = !sortKey->ssup_reverse;
+ 		sortKey->ssup_nulls_first = !sortKey->ssup_nulls_first;
+ 	}
+ 
  }
  
  
*** a/src/include/catalog/pg_amproc.h
--- b/src/include/catalog/pg_amproc.h
*************** DATA(insert (	1989   26 26 1 356 ));
*** 122,127 ****
--- 122,128 ----
  DATA(insert (	1989   26 26 2 3134 ));
  DATA(insert (	1991   30 30 1 404 ));
  DATA(insert (	1994   25 25 1 360 ));
+ DATA(insert (	1994   25 25 2 3251 ));
  DATA(insert (	1996   1083 1083 1 1107 ));
  DATA(insert (	2000   1266 1266 1 1358 ));
  DATA(insert (	2002   1562 1562 1 1672 ));
*** a/src/include/catalog/pg_proc.h
--- b/src/include/catalog/pg_proc.h
*************** DATA(insert OID = 3135 ( btnamesortsuppo
*** 614,619 ****
--- 614,621 ----
  DESCR("sort support");
  DATA(insert OID = 360 (  bttextcmp		   PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 0 23 "25 25" _null_ _null_ _null_ _null_ bttextcmp _null_ _null_ _null_ ));
  DESCR("less-equal-greater");
+ DATA(insert OID = 3251 ( bttextsortsupport PGNSP PGUID 12 1 0 0 0 f f f f t f i 1 0 2278 "2281" _null_ _null_ _null_ _null_ bttextsortsupport _null_ _null_ _null_ ));
+ DESCR("sort support");
  DATA(insert OID = 377 (  cash_cmp		   PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 0 23 "790 790" _null_ _null_ _null_ _null_ cash_cmp _null_ _null_ _null_ ));
  DESCR("less-equal-greater");
  DATA(insert OID = 380 (  btreltimecmp	   PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 0 23 "703 703" _null_ _null_ _null_ _null_ btreltimecmp _null_ _null_ _null_ ));
*** a/src/include/lib/hyperloglog.h
--- b/src/include/lib/hyperloglog.h
***************
*** 0 ****
--- 1,42 ----
+ /*
+  * hyperloglog.h
+  *
+  * A simple HyperLogLog cardinality estimator implementation
+  *
+  * Portions Copyright (c) 2014, PostgreSQL Global Development Group
+  *
+  * src/include/lib/hyperloglog.h
+  */
+ 
+ #ifndef HYPERLOGLOG_H
+ #define HYPERLOGLOG_H
+ 
+ /*
+  * HyperLogLog is an approximate technique for computing the number of distinct
+  * entries in a set.  Importantly, it does this by using a fixed amount of
+  * memory.  See the 2007 paper "HyperLogLog: the analysis of a near-optimal
+  * cardinality estimation algorithm" for more.
+  *
+  * hyperLogLogState
+  *
+  *		registerWidth		register width, in bits ("k")
+  *		nRegisters			number of registers
+  *		alphaMM				alpha * m ^ 2 (see initHyperLogLog())
+  *		hashesArr			array of hashes
+  *		arrSize				size of hashesArr
+  */
+ typedef struct hyperLogLogState
+ {
+ 	uint8		registerWidth;
+ 	Size		nRegisters;
+ 	double		alphaMM;
+ 	uint8	   *hashesArr;
+ 	Size		arrSize;
+ } hyperLogLogState;
+ 
+ extern void initHyperLogLog(hyperLogLogState *cState, uint8 bwidth);
+ extern void	addHyperLogLog(hyperLogLogState *cState, uint32 hash);
+ extern double estimateHyperLogLog(hyperLogLogState *cState);
+ extern void mergeHyperLogLog(hyperLogLogState *cState, const hyperLogLogState *oState);
+ 
+ #endif   /* HYPERLOGLOG_H */
*** a/src/include/utils/builtins.h
--- b/src/include/utils/builtins.h
*************** extern Datum bttintervalcmp(PG_FUNCTION_
*** 316,321 ****
--- 316,322 ----
  extern Datum btcharcmp(PG_FUNCTION_ARGS);
  extern Datum btnamecmp(PG_FUNCTION_ARGS);
  extern Datum bttextcmp(PG_FUNCTION_ARGS);
+ extern Datum bttextsortsupport(PG_FUNCTION_ARGS);
  
  /*
   *		Per-opclass sort support functions for new btrees.  Like the
*** a/src/include/utils/sortsupport.h
--- b/src/include/utils/sortsupport.h
***************
*** 49,54 ****
--- 49,61 ----
  
  #include "access/attnum.h"
  
+ typedef enum
+ {
+ 	sortKeyPoorman,		/* Leading (poor-man-applicable) key? */
+ 	sortKeyTrueLeading,	/* "True" (non-poorman's) leading key? */
+ 	sortKeyOther		/* Second or subsequent key */
+ } SortKeyType;
+ 
  typedef struct SortSupportData *SortSupport;
  
  typedef struct SortSupportData
*************** typedef struct SortSupportData
*** 92,103 ****
  	 * than, equal to, or greater than y.  Note that x and y are guaranteed
  	 * not null, and there is no way to return null either.  Do not return
  	 * INT_MIN, as callers are allowed to negate the result before using it.
  	 */
  	int			(*comparator) (Datum x, Datum y, SortSupport ssup);
  
  	/*
! 	 * Additional sort-acceleration functions might be added here later.
  	 */
  } SortSupportData;
  
  
--- 99,188 ----
  	 * than, equal to, or greater than y.  Note that x and y are guaranteed
  	 * not null, and there is no way to return null either.  Do not return
  	 * INT_MIN, as callers are allowed to negate the result before using it.
+ 	 *
+ 	 * This comparator may be "semi-trustworthy" for opclasses with additional
+ 	 * special support for dealing with a poor man's normalized key
+ 	 * representation.
  	 */
  	int			(*comparator) (Datum x, Datum y, SortSupport ssup);
  
  	/*
! 	 * "Poor man's normalized key" infrastructure follows.  All callbacks must
! 	 * be set by sortsupport opclasses that make use of this optional
! 	 * additional infrastructure.
! 	 *
! 	 * This allows opclass authors to supply a conversion routine, used to
! 	 * create an alternative representation of the underlying type (a "poor
! 	 * man's normalized key").  Typically, this representation is an ad-hoc,
! 	 * pass-by-value Datum format that only the opclass has knowledge of.  An
! 	 * alternative comparator, used only with this alternative representation,
! 	 * must also be provided.  This representation is a simple approximation of
! 	 * the original Datum.  It must be possible to compare datums of this
! 	 * representation with each other using the supplied alternative
! 	 * comparator, and have any non-zero return value be a reliable proxy for
! 	 * what a proper comparison would indicate.  Returning zero from the
! 	 * alternative comparator does not indicate equality, as with a
! 	 * conventional support routine 1, though -- it indicates that it wasn't
! 	 * possible to determine how the two poor man's values compared.  A proper
! 	 * comparison is therefore required.  In many cases this results in most or
! 	 * all comparisons only using the cheap alternative comparison func, which
! 	 * is typically implemented as code that compiles to just a few CPU
! 	 * instructions.  The technique is particularly useful for in-memory
! 	 * quicksorts, which can much more effectively work with CPU caches when
! 	 * sorting pass-by-value types.  One goal is to extend all these advantages
! 	 * to pass-by-reference types.
! 	 *
! 	 * Opclass authors must consider the final cardinality of normalized keys
! 	 * when devising an encoding scheme.  It's possible for one strategy to
! 	 * work better than another with a certain usage pattern, while the inverse
! 	 * may be the case for some other usage pattern.
  	 */
+ 
+ 	/*
+ 	 * Sort key "type" mostly just relates to whether or not a poor man's
+ 	 * optimization is applicable in principle (i.e. the sortsupport routine
+ 	 * needs to know if it's dealing with a leading key).  Even with a leading
+ 	 * key, internal sortsupport clients like tuplesort may represent it as
+ 	 * sortKeyOther because it isn't feasible to inject our conversion routine.
+ 	 * However, the sortKeyTrueLeading type means that it's a "proper"
+ 	 * sortsupport state, originally generated by the sortsupport routine
+ 	 * itself - the core system will never set a "true leading" key type.
+ 	 * There is very little distinction between a "true leading" and "other"
+ 	 * key type, though - the distinction only exists to allow sortsupport
+ 	 * routines to squeeze a bit more performance from the knowledge that a
+ 	 * fully reliable tie-breaker comparison is required because a prior
+ 	 * alternative comparison didn't work out (as opposed to being called
+ 	 * without there ever being such an alternative comparison).
+ 	 */
+ 	SortKeyType	type;			/* Position of key */
+ 
+ 	/*
+ 	 * Converter to poor man's format, from original representation.  Core code
+ 	 * uses this callback to convert to a pass-by-value untrustworthy
+ 	 * Datum/poor man's normalized key.  Note that original is guaranteed not
+ 	 * null.
+ 	 */
+ 	Datum		(*converter) (Datum original, SortSupport ssup);
+ 
+ 	/*
+ 	 * This callback allows clients to verify that the current strategy is
+ 	 * working out.  If there are a lot of duplicate poor man's keys in
+ 	 * practice, it's useful to be able to abandon the strategy before paying
+ 	 * too high a cost in conversion.
+ 	 */
+ 	bool		(*abort_conversion) (int memtupcount, double rowhint,
+ 									 SortSupport ssup);
+ 
+ 	/*
+ 	 * Alternative "true leading" SortSupport state for leading (poor man's)
+ 	 * key, used only when alternative comparator returned 0, and the core
+ 	 * system must use this separate state to perform a fully trustworthy
+ 	 * comparison.  This relates to the same attribute as our ssup_attno, but
+ 	 * core code like tuplesort is required to call it directly (i.e. it is
+ 	 * initialized by a poor man's SortSupport routine, and not any internal
+ 	 * code).
+ 	 */
+ 	struct SortSupportData *proper;
  } SortSupportData;
  
  
*** a/src/include/utils/tuplesort.h
--- b/src/include/utils/tuplesort.h
*************** extern Tuplesortstate *tuplesort_begin_h
*** 62,68 ****
  					 int nkeys, AttrNumber *attNums,
  					 Oid *sortOperators, Oid *sortCollations,
  					 bool *nullsFirstFlags,
! 					 int workMem, bool randomAccess);
  extern Tuplesortstate *tuplesort_begin_cluster(TupleDesc tupDesc,
  						Relation indexRel,
  						int workMem, bool randomAccess);
--- 62,68 ----
  					 int nkeys, AttrNumber *attNums,
  					 Oid *sortOperators, Oid *sortCollations,
  					 bool *nullsFirstFlags,
! 					 int workMem, double projectedTups, bool randomAccess);
  extern Tuplesortstate *tuplesort_begin_cluster(TupleDesc tupDesc,
  						Relation indexRel,
  						int workMem, bool randomAccess);
*** a/src/test/locale/test-strxfrm-redundant.c
--- b/src/test/locale/test-strxfrm-redundant.c
***************
*** 0 ****
--- 1,81 ----
+ /*-------------------------------------------------------------------------
+  *
+  * test-strxfrm-redundant.c
+  *		libc strxfrm redundancy test program
+  *
+  * Copyright (c) 2014, PostgreSQL Global Development Group
+  *
+  *	src/test/locale/test-strxfrm-redundant.c
+  *
+  *	This program tests to see if the system's C standard library strxfrm()
+  *	function has notable redundancy, or "header bytes".  Certain
+  *	implementations are known to have this problem, including the Mac OSX
+  *	system libc.  This is problematic because any header bytes the
+  *	implementation includes are wasted for the purposes of "poor man's
+  *	normalized key" optimization.
+  *
+  *	The standard that we apply is that if there are any header bytes
+  *	whatsoever, then the optimization cannot be used.  Also, any trailing bytes
+  *	should not contain information essential to the original string.  With
+  *	strings of 8 bytes or less comprised only of ASCII code points, we expect
+  *	the full benefit of 8 bytes of packed Datum storage (varlena.c checks that
+  *	we have 8 byte datums too).
+  *
+  *-------------------------------------------------------------------------
+  */
+ #include <locale.h>
+ #include <string.h>
+ 
+ #define MAX_BLOB_SIZE 2048
+ 
+ int main(void)
+ {
+ 	char  res1[MAX_BLOB_SIZE];
+ 	char  res2[MAX_BLOB_SIZE];
+ 	size_t i, s, j;
+ 
+ 	/* Use default locale */
+ 	setlocale(LC_ALL, "");
+ 
+ 	s = strxfrm(res1, "abcdefgh", MAX_BLOB_SIZE);
+ 	j = strxfrm(res2, "ijklmnop", MAX_BLOB_SIZE);
+ 
+ 	/* Size mismatch or truncated (indeterminate) output means failure */
+ 	if (s != j || s >= MAX_BLOB_SIZE)
+ 		return 1;
+ 
+ 	/*
+ 	 * Ensure that first 8 bytes don't match (i.e. that there are no "header
+ 	 * bytes")
+ 	 */
+ 	for (i = 0; i < 8; i++)
+ 	{
+ 		if (res1[i] == res2[i])
+ 			return 1;
+ 	}
+ 
+ 	/*
+ 	 * Ensure that the remaining bytes are identical, and that we therefore are
+ 	 * guaranteed to have a 1:1 correspondence between blob bytes and original
+ 	 * string bytes, at least for the simple case where only ASCII code points
+ 	 * are transformed.
+ 	 *
+ 	 * On Glibc 2.19, with the "en_US.UTF8" collation, the strings under
+ 	 * consideration look like this once transformed (shown with a splice at 8
+ 	 * bytes to aid visualization):
+ 	 *
+ 	 * abcdefgh:  \x0c0d0e0f10111213  010909090909090909010909090909090909
+ 	 * ijklmnop:  \x1415161718191a1b  010909090909090909010909090909090909
+ 	 *
+ 	 * Before here must differ:    ^
+ 	 *
+ 	 * After here should be equal:    ^
+ 	 */
+ 	for (; i < s; i++)
+ 	{
+ 		if (res1[i] != res2[i])
+ 			return 1;
+ 	}
+ 
+ 	return 0;
+ }