Re: Re: Missing rows with index scan when collation is not "C" (PostgreSQL 9.5) - Mailing list pgsql-bugs
| From | Tom Lane |
|---|---|
| Subject | Re: Re: Missing rows with index scan when collation is not "C" (PostgreSQL 9.5) |
| Date | |
| Msg-id | 31913.1458747836@sss.pgh.pa.us Whole thread Raw |
| In response to | Re: Missing rows with index scan when collation is not "C" (PostgreSQL 9.5) (Robert Haas <robertmhaas@gmail.com>) |
| Responses |
Re: Re: Missing rows with index scan when collation is not "C" (PostgreSQL 9.5)
|
| List | pgsql-bugs |
Robert Haas <robertmhaas@gmail.com> writes:
> On Tue, Mar 22, 2016 at 10:44 PM, Noah Misch <noah@leadboat.com> wrote:
>> I, too, found MAXXFRMLEN insufficient; I raised it fourfold. Cygwin
>> 2.2.1(0.289/5/3) caught fire; 10% of locales passed. (varstr_sortsupport()
>> already blacklists the UTF8/native Windows case.) The test passed on Solaris
>> 10, Solaris 11, HP-UX B.11.31, OpenBSD 5.0, NetBSD 5.1.2, and FreeBSD 9.0.
>> See attached tryalllocales.sh outputs. I did not test AIX, because the AIX
>> machines I use have no UTF8 locales installed.
> Wow, thanks for the extensive testing. This suggests that, apart from
> Cygwin which apparently doesn't matter right now, the only thing that
> is busted is glibc. I believe we have yet to see a single locale that
> fails anywhere else (apart from Cygwin). Good thing so few of our
> users run glibc!
I extended my test program to be able to check locales using ISO-8859-x
encodings. RHEL6 shows me failures in a set of locales that is remarkably
unlike the set it fails on for UTF8 (though good ol de_DE manages to fail
in both encodings, as do a few others). I'm not sure what that implies
for the underlying bug(s).
> So, options:
> 1. We could make it the user's problem to figure out whether they've
> got a buggy glibc and add a GUC to shut this off, as previously
> suggested.
> 2. We could add a blacklist (either hardcoded or a GUC) shutting this
> off for locales known to be buggy anywhere.
> 3. We could write some test code that runs at startup time which
> reliably detects all of the broken locales we've so far uncovered and
> disables this if so.
> 4. We could shut this off for all Linux users in all locales and tell
> everybody to REINDEX. That would be pretty sad, though.
TBH, I think #1 is right out, unless maybe the GUC defaults to off.
We aren't that cavalier with data consistency in other departments.
#2 and #3 presume a level of knowledge of the bug details that we
have not got, and probably can't get by Monday.
As far as #4 goes, we're going to have to tell people to REINDEX
no matter what the other aspects of the fix look like. On-disk
indexes are broken right now, if you're using one of the affected
locales.
regards, tom lane
#include <errno.h>
#include <langinfo.h>
#include <limits.h>
#include <locale.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
/*
 * Test: generate NSTRINGS random strings (two-byte-capable UTF8, or
 * single-byte text when the locale's codeset is ISO-8859-x), sort them by
 * strcoll, sanity-check the sort result, sort them by strxfrm, sanity-check
 * that result, and compare the two sort orders.
 */
#define NSTRINGS 1000           /* strings generated per test case */
#define MAXSTRLEN 20            /* size of a source string buffer, NUL included */

/*
 * Size of the buffer that receives each strxfrm() result.  The amount by
 * which strxfrm() expands its input depends on the platform and locale, and
 * the test aborts if a result does not fit, so allow the default to be
 * overridden at build time (e.g. cc -DMAXXFRMLEN=800 ...).
 */
#ifndef MAXXFRMLEN
#define MAXXFRMLEN (MAXSTRLEN * 10)
#endif

/* One test string together with its transformed form and sort ranks */
typedef struct
{
    char strval[MAXSTRLEN];     /* NUL-terminated random source string */
    char xfrmval[MAXXFRMLEN];   /* strxfrm() output for strval */
    int strsortpos;             /* rank after sorting by strcoll(); equal
                                 * strings share a rank */
    int xfrmsortpos;            /* rank after sorting by strcmp() of xfrmval;
                                 * equal values share a rank */
} OneString;
/* qsort comparators */
/*
 * qsort comparator: order two OneString entries by applying strcoll() to
 * their source strings.  Returns strcoll()'s result unchanged.
 */
static int
strcoll_compare(const void *pa, const void *pb)
{
    const char *lhs = ((const OneString *) pa)->strval;
    const char *rhs = ((const OneString *) pb)->strval;

    return strcoll(lhs, rhs);
}
/*
 * qsort comparator: order two OneString entries by a plain strcmp() of
 * their stored strxfrm() results.  Returns strcmp()'s result unchanged.
 */
static int
strxfrm_compare(const void *pa, const void *pb)
{
    const char *lhs = ((const OneString *) pa)->xfrmval;
    const char *rhs = ((const OneString *) pb)->xfrmval;

    return strcmp(lhs, rhs);
}
/* returns 1 if OK, 0 if inconsistency detected */
static int
run_test_case(int is_utf8)
{
int ok = 1;
OneString data[NSTRINGS];
int i,
j;
/* Generate random strings of length less than MAXSTRLEN bytes */
for (i = 0; i < NSTRINGS; i++)
{
char *p = data[i].strval;
int len;
len = 1 + (random() % (MAXSTRLEN - 1));
while (len > 0)
{
int c;
/* Generate random printable char in ISO8859-1 range */
/* Bias towards producing a lot of spaces */
if ((random() % 16) < 3)
c = ' ';
else
{
do
{
c = random() & 0xFF;
} while (!((c >= ' ' && c <= 127) || (c >= 0xA0 && c <= 0xFF)));
}
if (c <= 127 || !is_utf8)
{
*p++ = c;
len--;
}
else
{
if (len < 2)
break;
/* Poor man's utf8-ification */
*p++ = 0xC0 + (c >> 6);
len--;
*p++ = 0x80 + (c & 0x3F);
len--;
}
}
*p = '\0';
/* strxfrm each string as we produce it */
if (strxfrm(data[i].xfrmval, data[i].strval, MAXXFRMLEN) >= MAXXFRMLEN)
{
fprintf(stderr, "strxfrm() result for %d-length string exceeded %d bytes\n",
(int) strlen(data[i].strval), MAXXFRMLEN);
exit(1);
}
#if 0
printf("%d %s\n", i, data[i].strval);
#endif
}
/* Sort per strcoll(), and label, being careful in case some are equal */
qsort(data, NSTRINGS, sizeof(OneString), strcoll_compare);
j = 0;
for (i = 0; i < NSTRINGS; i++)
{
if (i > 0 && strcoll(data[i].strval, data[i-1].strval) != 0)
j++;
data[i].strsortpos = j;
}
/* Sanity-check: is each string <= those after it? */
for (i = 0; i < NSTRINGS; i++)
{
for (j = i + 1; j < NSTRINGS; j++)
{
if (strcoll(data[i].strval, data[j].strval) > 0)
{
fprintf(stdout, "strcoll sort inconsistency between positions %d and %d\n",
i, j);
ok = 0;
}
}
}
/* Sort per strxfrm(), and label, being careful in case some are equal */
qsort(data, NSTRINGS, sizeof(OneString), strxfrm_compare);
j = 0;
for (i = 0; i < NSTRINGS; i++)
{
if (i > 0 && strcmp(data[i].xfrmval, data[i-1].xfrmval) != 0)
j++;
data[i].xfrmsortpos = j;
}
/* Sanity-check: is each string <= those after it? */
for (i = 0; i < NSTRINGS; i++)
{
for (j = i + 1; j < NSTRINGS; j++)
{
if (strcmp(data[i].xfrmval, data[j].xfrmval) > 0)
{
fprintf(stdout, "strxfrm sort inconsistency between positions %d and %d\n",
i, j);
ok = 0;
}
}
}
/* Compare */
for (i = 0; i < NSTRINGS; i++)
{
if (data[i].strsortpos != data[i].xfrmsortpos)
{
fprintf(stdout, "inconsistency between strcoll (%d) and strxfrm (%d) orders\n",
data[i].strsortpos, data[i].xfrmsortpos);
ok = 0;
}
}
return ok;
}
/*
 * Entry point.
 *
 * Adopts the locale from the environment, reports the LC_COLLATE and
 * LC_CTYPE settings, decides from the codeset name whether to generate UTF8
 * or single-byte ISO-8859-x test data, and then runs the requested number
 * of test cases.
 *
 * argv[1], if given, is the number of test cases to run; it must be a
 * positive decimal integer that fits in an int.  The default is 1.
 *
 * Exit status is 0 if every test case was consistent, and 1 if any
 * inconsistency was detected or if setup failed.
 */
int
main(int argc, char **argv)
{
    const char *lc;
    const char *cset;
    int         is_utf8;
    int         ntries = 1;
    int         result = 0;

    /*
     * Absorb locale from environment, and report what we're using.
     * setlocale() and nl_langinfo() are not specified to set errno, so
     * failures are reported with plain messages rather than perror().
     */
    if (setlocale(LC_ALL, "") == NULL)
    {
        fprintf(stderr, "setlocale(LC_ALL) failed\n");
        exit(1);
    }

    lc = setlocale(LC_COLLATE, NULL);
    if (lc == NULL)
    {
        fprintf(stderr, "setlocale(LC_COLLATE) failed\n");
        exit(1);
    }
    printf("Using LC_COLLATE = \"%s\"\n", lc);

    lc = setlocale(LC_CTYPE, NULL);
    if (lc == NULL)
    {
        fprintf(stderr, "setlocale(LC_CTYPE) failed\n");
        exit(1);
    }
    printf("Using LC_CTYPE = \"%s\"\n", lc);

    /* Identify encoding from the codeset name */
    cset = nl_langinfo(CODESET);
    if (cset == NULL)
    {
        fprintf(stderr, "nl_langinfo(CODESET) failed\n");
        exit(1);
    }
    if (strstr(cset, "utf") || strstr(cset, "UTF"))
        is_utf8 = 1;
    else if (strstr(cset, "iso-8859") || strstr(cset, "ISO-8859") ||
             strstr(cset, "iso8859") || strstr(cset, "ISO8859"))
        is_utf8 = 0;
    else
    {
        fprintf(stderr, "unrecognized codeset name \"%s\"\n", cset);
        exit(1);
    }

    /* Ensure new random() values on every run */
    srandom((unsigned int) time(NULL));

    /*
     * argv[1] can be the max number of tries to run.  Parse it strictly: a
     * mistyped argument must not silently turn into "run zero tests and
     * report success".
     */
    if (argc > 1)
    {
        char       *end;
        long        val;

        errno = 0;
        val = strtol(argv[1], &end, 10);
        if (end == argv[1] || *end != '\0' || errno == ERANGE ||
            val < 1 || val > INT_MAX)
        {
            fprintf(stderr, "invalid number of tries \"%s\"\n", argv[1]);
            exit(1);
        }
        ntries = (int) val;
    }

    /* Run one test instance per loop; remember any failure */
    while (ntries-- > 0)
    {
        if (!run_test_case(is_utf8))
            result = 1;
    }

    return result;
}
az_AZ.utf8 BAD
ca_AD.utf8 BAD
ca_ES.utf8 BAD
ca_FR.utf8 BAD
ca_IT.utf8 BAD
crh_UA.utf8 BAD
csb_PL.utf8 BAD
cv_RU.utf8 BAD
da_DK.utf8 BAD
de_DE.utf8 BAD
en_CA.utf8 BAD
es_EC.utf8 BAD
es_US.utf8 BAD
fi_FI.utf8 BAD
fil_PH.utf8 BAD
fo_FO.utf8 BAD
fr_CA.utf8 BAD
fur_IT.utf8 BAD
hu_HU.utf8 BAD
ig_NG.utf8 BAD
ik_CA.utf8 BAD
iu_CA.utf8 BAD
kl_GL.utf8 BAD
ku_TR.utf8 BAD
nb_NO.utf8 BAD
nn_NO.utf8 BAD
no_NO.utf8 BAD
ro_RO.utf8 BAD
sc_IT.utf8 BAD
se_NO.utf8 BAD
shs_CA.utf8 BAD
sq_AL.utf8 BAD
sq_MK.utf8 BAD
sv_FI.utf8 BAD
sv_SE.utf8 BAD
tk_TM.utf8 BAD
tt_RU.utf8 BAD
tt_RU.utf8@iqtelif BAD
ug_CN.utf8 BAD
vi_VN.utf8 BAD
yo_NG.utf8 BAD
ar_AE.iso88596 BAD
ar_BH.iso88596 BAD
ar_DZ.iso88596 BAD
ar_EG.iso88596 BAD
ar_IQ.iso88596 BAD
ar_JO.iso88596 BAD
ar_KW.iso88596 BAD
ar_LB.iso88596 BAD
ar_LY.iso88596 BAD
ar_MA.iso88596 BAD
ar_OM.iso88596 BAD
ar_QA.iso88596 BAD
ar_SD.iso88596 BAD
ar_SY.iso88596 BAD
ar_TN.iso88596 BAD
ar_YE.iso88596 BAD
bs_BA.iso88592 BAD
ca_AD.iso885915 BAD
ca_ES.iso88591 BAD
ca_ES.iso885915@euro BAD
ca_FR.iso885915 BAD
ca_IT.iso885915 BAD
da_DK.iso88591 BAD
da_DK.iso885915 BAD
de_DE.iso88591 BAD
es_EC.iso88591 BAD
es_US.iso88591 BAD
fi_FI.iso88591 BAD
fi_FI.iso885915@euro BAD
fo_FO.iso88591 BAD
fr_CA.iso88591 BAD
he_IL.iso88598 BAD
hu_HU.iso88592 BAD
iw_IL.iso88598 BAD
kl_GL.iso88591 BAD
ku_TR.iso88599 BAD
mk_MK.iso88595 BAD
mt_MT.iso88593 BAD
nb_NO.iso88591 BAD
nn_NO.iso88591 BAD
no_NO.iso88591 BAD
ro_RO.iso88592 BAD
ru_RU.iso88595 BAD
sq_AL.iso88591 BAD
sv_FI.iso88591 BAD
sv_FI.iso885915@euro BAD
sv_SE.iso88591 BAD
sv_SE.iso885915 BAD
pgsql-bugs by date: