Re: Bunch of tsearch fixes and cleanup - Mailing list pgsql-patches
From | Heikki Linnakangas |
---|---|
Subject | Re: Bunch of tsearch fixes and cleanup |
Date | |
Msg-id | 46CEC3C2.50100@enterprisedb.com Whole thread Raw |
In response to | Re: Bunch of tsearch fixes and cleanup ("Heikki Linnakangas" <heikki@enterprisedb.com>) |
Responses |
Re: Bunch of tsearch fixes and cleanup
|
List | pgsql-patches |
And here's the attachment I forgot. Heikki Linnakangas wrote: > Heikki Linnakangas wrote: >> Tom Lane wrote: >>> Something that was annoying me yesterday was that it was not clear >>> whether we had fixed every single place that uses a tsearch config file >>> to assume that the file is in UTF8 and should be converted to database >>> encoding. So I was thinking of hardwiring the "recode" part into >>> readstopwords, and using wordop just for the "lowercase" part, which >>> seemed to me like a saner division of labor. That is, UTF8 is a policy >>> that we want to enforce globally, but lowercasing maybe not, and this >>> still leaves the door open for more processing besides lowercasing. >> I think we also want to always run input files through pg_verify_mbstr. >> We do it for stopwords, and synonym files (though incorrectly), but not >> for thesaurus files or ispell files. It's probably best to do that >> within the recode-function as well. > > Ok, here's an updated version of the patch. > > - ispell initialization crashed on empty dictionary file > - ispell initialization crashed on affix file with prefixes but no suffixes > - stop words file was run through pg_verifymbstr, with database > encoding, but it's later interpreted as being UTF-8. Now verifies that > it's UTF-8, regardless of database encoding. > > > - introduces new t_readline function that reads a line from a file, > verifies that it's valid UTF-8, and converts it to database encoding. > Modified all places that read tsearch config files to use this function > instead of fgets directly. > > - readstoplist now sorts the stop words after loading them. Removed the > separate sortstoplist function. > > - moved the wordop-input parameter from StopList struct to a direct > argument to readstoplist. Seems cleaner to me that way, the struct is > now purely an output of readstoplist, not mixed input/output. > readstoplist now recodes the input implicitly using t_readline. 
> > - bunch of comments added, typos fixed, and other cleanup > > PS. It's bank holiday here in the UK on Monday, so I won't be around > until Tuesday if something comes up. > -- Heikki Linnakangas EnterpriseDB http://www.enterprisedb.com Index: src/backend/snowball/dict_snowball.c =================================================================== RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/snowball/dict_snowball.c,v retrieving revision 1.2 diff -c -r1.2 dict_snowball.c *** src/backend/snowball/dict_snowball.c 22 Aug 2007 01:39:44 -0000 1.2 --- src/backend/snowball/dict_snowball.c 24 Aug 2007 09:37:50 -0000 *************** *** 192,198 **** ListCell *l; d = (DictSnowball *) palloc0(sizeof(DictSnowball)); - d->stoplist.wordop = recode_and_lowerstr; foreach(l, dictoptions) { --- 192,197 ---- *************** *** 204,211 **** ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("multiple StopWords parameters"))); ! readstoplist(defGetString(defel), &d->stoplist); ! sortstoplist(&d->stoplist); stoploaded = true; } else if (pg_strcasecmp("Language", defel->defname) == 0) --- 203,209 ---- ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("multiple StopWords parameters"))); ! 
readstoplist(defGetString(defel), &d->stoplist, lowerstr); stoploaded = true; } else if (pg_strcasecmp("Language", defel->defname) == 0) Index: src/backend/tsearch/dict_ispell.c =================================================================== RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/tsearch/dict_ispell.c,v retrieving revision 1.2 diff -c -r1.2 dict_ispell.c *** src/backend/tsearch/dict_ispell.c 22 Aug 2007 01:39:44 -0000 1.2 --- src/backend/tsearch/dict_ispell.c 23 Aug 2007 21:12:33 -0000 *************** *** 39,45 **** ListCell *l; d = (DictISpell *) palloc0(sizeof(DictISpell)); - d->stoplist.wordop = recode_and_lowerstr; foreach(l, dictoptions) { --- 39,44 ---- *************** *** 73,80 **** ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("multiple StopWords parameters"))); ! readstoplist(defGetString(defel), &(d->stoplist)); ! sortstoplist(&(d->stoplist)); stoploaded = true; } else --- 72,78 ---- ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("multiple StopWords parameters"))); ! readstoplist(defGetString(defel), &(d->stoplist), lowerstr); stoploaded = true; } else Index: src/backend/tsearch/dict_simple.c =================================================================== RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/tsearch/dict_simple.c,v retrieving revision 1.2 diff -c -r1.2 dict_simple.c *** src/backend/tsearch/dict_simple.c 22 Aug 2007 01:39:44 -0000 1.2 --- src/backend/tsearch/dict_simple.c 23 Aug 2007 21:12:24 -0000 *************** *** 23,41 **** typedef struct { StopList stoplist; ! } DictExample; Datum dsimple_init(PG_FUNCTION_ARGS) { List *dictoptions = (List *) PG_GETARG_POINTER(0); ! DictExample *d = (DictExample *) palloc0(sizeof(DictExample)); bool stoploaded = false; ListCell *l; - d->stoplist.wordop = recode_and_lowerstr; - foreach(l, dictoptions) { DefElem *defel = (DefElem *) lfirst(l); --- 23,39 ---- typedef struct { StopList stoplist; ! 
} DictSimple; Datum dsimple_init(PG_FUNCTION_ARGS) { List *dictoptions = (List *) PG_GETARG_POINTER(0); ! DictSimple *d = (DictSimple *) palloc0(sizeof(DictSimple)); bool stoploaded = false; ListCell *l; foreach(l, dictoptions) { DefElem *defel = (DefElem *) lfirst(l); *************** *** 46,53 **** ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("multiple StopWords parameters"))); ! readstoplist(defGetString(defel), &d->stoplist); ! sortstoplist(&d->stoplist); stoploaded = true; } else --- 44,50 ---- ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("multiple StopWords parameters"))); ! readstoplist(defGetString(defel), &d->stoplist, lowerstr); stoploaded = true; } else *************** *** 65,80 **** Datum dsimple_lexize(PG_FUNCTION_ARGS) { ! DictExample *d = (DictExample *) PG_GETARG_POINTER(0); char *in = (char *) PG_GETARG_POINTER(1); int32 len = PG_GETARG_INT32(2); ! char *txt = lowerstr_with_len(in, len); TSLexeme *res = palloc0(sizeof(TSLexeme) * 2); if (*txt == '\0' || searchstoplist(&(d->stoplist), txt)) - { pfree(txt); - } else res[0].lexeme = txt; --- 62,77 ---- Datum dsimple_lexize(PG_FUNCTION_ARGS) { ! DictSimple *d = (DictSimple *) PG_GETARG_POINTER(0); char *in = (char *) PG_GETARG_POINTER(1); int32 len = PG_GETARG_INT32(2); ! 
char *txt; TSLexeme *res = palloc0(sizeof(TSLexeme) * 2); + txt = lowerstr_with_len(in, len); + if (*txt == '\0' || searchstoplist(&(d->stoplist), txt)) pfree(txt); else res[0].lexeme = txt; Index: src/backend/tsearch/dict_synonym.c =================================================================== RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/tsearch/dict_synonym.c,v retrieving revision 1.2 diff -c -r1.2 dict_synonym.c *** src/backend/tsearch/dict_synonym.c 22 Aug 2007 04:13:15 -0000 1.2 --- src/backend/tsearch/dict_synonym.c 24 Aug 2007 10:00:05 -0000 *************** *** 20,28 **** #include "tsearch/ts_utils.h" #include "utils/builtins.h" - - #define SYNBUFLEN 4096 - typedef struct { char *in; --- 20,25 ---- *************** *** 31,53 **** typedef struct { ! int len; Syn *syn; } DictSyn; static char * findwrd(char *in, char **end) { char *start; ! *end = NULL; while (*in && t_isspace(in)) in += pg_mblen(in); if (*in == '\0') return NULL; start = in; while (*in && !t_isspace(in)) in += pg_mblen(in); --- 28,61 ---- typedef struct { ! int len; /* length of syn array */ Syn *syn; } DictSyn; + /* + * Finds the next whitespace-delimited word within the 'in' string. + * Returns a pointer to the first character of the word, and a pointer + * to the next byte after the last character in the word (in *end). + */ static char * findwrd(char *in, char **end) { char *start; ! /* Skip leading spaces */ while (*in && t_isspace(in)) in += pg_mblen(in); + /* Return NULL on empty lines */ if (*in == '\0') + { + *end = NULL; return NULL; + } + start = in; + /* Find end of word */ while (*in && !t_isspace(in)) in += pg_mblen(in); *************** *** 70,81 **** ListCell *l; char *filename = NULL; FILE *fin; - char buf[SYNBUFLEN]; char *starti, *starto, *end = NULL; int cur = 0; ! int slen; foreach(l, dictoptions) { --- 78,88 ---- ListCell *l; char *filename = NULL; FILE *fin; char *starti, *starto, *end = NULL; int cur = 0; ! 
char *line = NULL; foreach(l, dictoptions) { *************** *** 105,114 **** d = (DictSyn *) palloc0(sizeof(DictSyn)); ! while (fgets(buf, SYNBUFLEN, fin)) { ! slen = strlen(buf); ! pg_verifymbstr(buf, slen, false); if (cur == d->len) { if (d->len == 0) --- 112,144 ---- d = (DictSyn *) palloc0(sizeof(DictSyn)); ! while ((line = t_readline(fin)) != NULL) { ! starti = findwrd(line, &end); ! if (!starti) ! { ! /* Empty line */ ! goto skipline; ! } ! *end = '\0'; ! if (end >= line + strlen(line)) ! { ! /* A line with only one word. Ignore silently. */ ! goto skipline; ! } ! ! starto = findwrd(end + 1, &end); ! if (!starto) ! { ! /* A line with only one word. Ignore silently. */ ! goto skipline; ! } ! *end = '\0'; ! ! /* starti now points to the first word, and starto to the second ! * word on the line, with a \0 terminator at the end of both words. ! */ ! if (cur == d->len) { if (d->len == 0) *************** *** 123,158 **** } } ! starti = findwrd(buf, &end); ! if (!starti) ! continue; ! *end = '\0'; ! if (end >= buf + slen) ! continue; ! ! starto = findwrd(end + 1, &end); ! if (!starto) ! continue; ! *end = '\0'; ! ! d->syn[cur].in = recode_and_lowerstr(starti); ! d->syn[cur].out = recode_and_lowerstr(starto); ! if (!(d->syn[cur].in && d->syn[cur].out)) ! { ! FreeFile(fin); ! ereport(ERROR, ! (errcode(ERRCODE_OUT_OF_MEMORY), ! errmsg("out of memory"))); ! } cur++; } FreeFile(fin); d->len = cur; ! if (cur > 1) ! qsort(d->syn, d->len, sizeof(Syn), compareSyn); PG_RETURN_POINTER(d); } --- 153,171 ---- } } ! d->syn[cur].in = lowerstr(starti); ! d->syn[cur].out = lowerstr(starto); cur++; + + skipline: + pfree(line); } FreeFile(fin); d->len = cur; ! qsort(d->syn, d->len, sizeof(Syn), compareSyn); PG_RETURN_POINTER(d); } *************** *** 179,186 **** if (!found) PG_RETURN_POINTER(NULL); ! res = palloc(sizeof(TSLexeme) * 2); ! 
memset(res, 0, sizeof(TSLexeme) * 2); res[0].lexeme = pstrdup(found->out); PG_RETURN_POINTER(res); --- 192,198 ---- if (!found) PG_RETURN_POINTER(NULL); ! res = palloc0(sizeof(TSLexeme) * 2); res[0].lexeme = pstrdup(found->out); PG_RETURN_POINTER(res); Index: src/backend/tsearch/dict_thesaurus.c =================================================================== RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/tsearch/dict_thesaurus.c,v retrieving revision 1.2 diff -c -r1.2 dict_thesaurus.c *** src/backend/tsearch/dict_thesaurus.c 22 Aug 2007 01:39:44 -0000 1.2 --- src/backend/tsearch/dict_thesaurus.c 24 Aug 2007 10:02:16 -0000 *************** *** 170,179 **** thesaurusRead(char *filename, DictThesaurus * d) { FILE *fh; - char str[BUFSIZ]; int lineno = 0; uint16 idsubst = 0; bool useasis = false; filename = get_tsearch_config_filename(filename, "ths"); fh = AllocateFile(filename, "r"); --- 170,179 ---- thesaurusRead(char *filename, DictThesaurus * d) { FILE *fh; int lineno = 0; uint16 idsubst = 0; bool useasis = false; + char *line; filename = get_tsearch_config_filename(filename, "ths"); fh = AllocateFile(filename, "r"); *************** *** 183,209 **** errmsg("could not open thesaurus file \"%s\": %m", filename))); ! while (fgets(str, sizeof(str), fh)) { ! char *ptr, ! *recoded; int state = TR_WAITLEX; char *beginwrd = NULL; uint16 posinsubst = 0; uint16 nwrd = 0; - ptr = recoded = (char *) pg_do_encoding_conversion((unsigned char *) str, strlen(str), - GetDatabaseEncoding(), PG_UTF8); - if (recoded == NULL) - elog(ERROR, "encoding conversion failed"); - lineno++; ! /* is it comment ? */ ! while (t_isspace(ptr)) ptr += pg_mblen(ptr); ! if (t_iseq(recoded, '#') || *recoded == '\0' || t_iseq(recoded, '\n') || t_iseq(recoded, '\r')) continue; while (*ptr) { --- 183,210 ---- errmsg("could not open thesaurus file \"%s\": %m", filename))); ! while ((line = t_readline(fh)) != NULL) { ! 
char *ptr; int state = TR_WAITLEX; char *beginwrd = NULL; uint16 posinsubst = 0; uint16 nwrd = 0; lineno++; ! ptr = line; ! ! /* is it a comment? */ ! while (*ptr && t_isspace(ptr)) ptr += pg_mblen(ptr); ! ! if (t_iseq(ptr, '#') || *ptr == '\0' || ! t_iseq(ptr, '\n') || t_iseq(ptr, '\r')) ! { ! pfree(line); continue; + } while (*ptr) { *************** *** 301,308 **** lineno, filename))); } ! if (recoded != str) ! pfree(recoded); } d->nsubst = idsubst; --- 302,308 ---- lineno, filename))); } ! pfree(line); } d->nsubst = idsubst; Index: src/backend/tsearch/spell.c =================================================================== RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/tsearch/spell.c,v retrieving revision 1.1 diff -c -r1.1 spell.c *** src/backend/tsearch/spell.c 21 Aug 2007 01:11:18 -0000 1.1 --- src/backend/tsearch/spell.c 24 Aug 2007 10:41:12 -0000 *************** *** 21,28 **** /* ! * during initialization dictionary requires a lot ! * of memory, so it will use temporary context */ static MemoryContext tmpCtx = NULL; --- 21,31 ---- /* ! * Initialization requires a lot of memory that's not needed ! * after the initialization is done. In init function, ! * CurrentMemoryContext is a long lived memory context associated ! * with the dictionary cache entry, so we use a temporary context ! * for the short-lived stuff. */ static MemoryContext tmpCtx = NULL; *************** *** 32,37 **** --- 35,43 ---- static void checkTmpCtx(void) { + /* XXX: This assumes that CurrentMemoryContext doesn't have + * any children other than the one we create here. + */ if (CurrentMemoryContext->firstchild == NULL) { tmpCtx = AllocSetContextCreate(CurrentMemoryContext, *************** *** 74,90 **** static int cmpspellaffix(const void *s1, const void *s2) { ! return (strcmp((*(const SPELL **) s1)->p.flag, (*(const SPELL **) s2)->p.flag)); ! } ! ! static char * ! strnduplicate(char *s, int len) ! { ! char *d = (char *) palloc(len + 1); ! ! memcpy(d, s, len); ! 
d[len] = '\0'; ! return d; } static char * --- 80,86 ---- static int cmpspellaffix(const void *s1, const void *s2) { ! return (strncmp((*(const SPELL **) s1)->p.flag, (*(const SPELL **) s2)->p.flag, MAXFLAGLEN)); } static char * *************** *** 185,191 **** } Conf->Spell[Conf->nspell] = (SPELL *) tmpalloc(SPELLHDRSZ + strlen(word) + 1); strcpy(Conf->Spell[Conf->nspell]->word, word); ! strncpy(Conf->Spell[Conf->nspell]->p.flag, flag, 16); Conf->nspell++; } --- 181,187 ---- } Conf->Spell[Conf->nspell] = (SPELL *) tmpalloc(SPELLHDRSZ + strlen(word) + 1); strcpy(Conf->Spell[Conf->nspell]->word, word); ! strncpy(Conf->Spell[Conf->nspell]->p.flag, flag, MAXFLAGLEN); Conf->nspell++; } *************** *** 197,205 **** void NIImportDictionary(IspellDict * Conf, const char *filename) { - char str[BUFSIZ], - *pstr; FILE *dict; checkTmpCtx(); --- 193,200 ---- void NIImportDictionary(IspellDict * Conf, const char *filename) { FILE *dict; + char *line; checkTmpCtx(); *************** *** 209,227 **** errmsg("could not open dictionary file \"%s\": %m", filename))); ! while (fgets(str, sizeof(str), dict)) { ! char *s, ! *recoded; const char *flag; ! recoded = (char *) pg_do_encoding_conversion((unsigned char *) str, strlen(str), ! PG_UTF8, GetDatabaseEncoding()); ! if (recoded == NULL) ! elog(ERROR, "encoding conversion failed"); ! flag = NULL; ! if ((s = findchar(recoded, '/'))) { *s++ = '\0'; flag = s; --- 204,217 ---- errmsg("could not open dictionary file \"%s\": %m", filename))); ! while ((line = t_readline(dict)) != NULL) { ! char *s, *pstr; const char *flag; ! /* Extract flag from the line */ flag = NULL; ! if ((s = findchar(line, '/'))) { *s++ = '\0'; flag = s; *************** *** 240,247 **** else flag = ""; ! ! s = recoded; while (*s) { if (t_isspace(s)) --- 230,237 ---- else flag = ""; ! /* Remove trailing spaces */ ! s = line; while (*s) { if (t_isspace(s)) *************** *** 251,263 **** } s += pg_mblen(s); } ! 
pstr = lowerstr_ctx(recoded); NIAddSpell(Conf, pstr, flag); pfree(pstr); ! if (recoded != str) ! pfree(recoded); } FreeFile(dict); } --- 241,252 ---- } s += pg_mblen(s); } ! pstr = lowerstr_ctx(line); NIAddSpell(Conf, pstr, flag); pfree(pstr); ! pfree(line); } FreeFile(dict); } *************** *** 402,408 **** static bool parse_affentry(char *str, char *mask, char *find, char *repl, ! const char *filename, int line) { int state = PAE_WAIT_MASK; char *pmask = mask, --- 391,397 ---- static bool parse_affentry(char *str, char *mask, char *find, char *repl, ! const char *filename, int lineno) { int state = PAE_WAIT_MASK; char *pmask = mask, *************** *** 453,459 **** ereport(ERROR, (errcode(ERRCODE_CONFIG_FILE_ERROR), errmsg("syntax error at line %d of affix file \"%s\"", ! line, filename))); } else if (state == PAE_INFIND) { --- 442,448 ---- ereport(ERROR, (errcode(ERRCODE_CONFIG_FILE_ERROR), errmsg("syntax error at line %d of affix file \"%s\"", ! lineno, filename))); } else if (state == PAE_INFIND) { *************** *** 471,477 **** ereport(ERROR, (errcode(ERRCODE_CONFIG_FILE_ERROR), errmsg("syntax error at line %d of affix file \"%s\"", ! line, filename))); } else if (state == PAE_WAIT_REPL) { --- 460,466 ---- ereport(ERROR, (errcode(ERRCODE_CONFIG_FILE_ERROR), errmsg("syntax error at line %d of affix file \"%s\"", ! lineno, filename))); } else if (state == PAE_WAIT_REPL) { *************** *** 489,495 **** ereport(ERROR, (errcode(ERRCODE_CONFIG_FILE_ERROR), errmsg("syntax error at line %d of affix file \"%s\"", ! line, filename))); } else if (state == PAE_INREPL) { --- 478,484 ---- ereport(ERROR, (errcode(ERRCODE_CONFIG_FILE_ERROR), errmsg("syntax error at line %d of affix file \"%s\"", ! lineno, filename))); } else if (state == PAE_INREPL) { *************** *** 507,513 **** ereport(ERROR, (errcode(ERRCODE_CONFIG_FILE_ERROR), errmsg("syntax error at line %d of affix file \"%s\"", ! 
line, filename))); } else elog(ERROR, "unknown state in parse_affentry: %d", state); --- 496,502 ---- ereport(ERROR, (errcode(ERRCODE_CONFIG_FILE_ERROR), errmsg("syntax error at line %d of affix file \"%s\"", ! lineno, filename))); } else elog(ERROR, "unknown state in parse_affentry: %d", state); *************** *** 522,528 **** static void addFlagValue(IspellDict * Conf, char *s, uint32 val, ! const char *filename, int line) { while (*s && t_isspace(s)) s++; --- 511,517 ---- static void addFlagValue(IspellDict * Conf, char *s, uint32 val, ! const char *filename, int lineno) { while (*s && t_isspace(s)) s++; *************** *** 531,543 **** ereport(ERROR, (errcode(ERRCODE_CONFIG_FILE_ERROR), errmsg("syntax error at line %d of affix file \"%s\"", ! line, filename))); if (pg_mblen(s) != 1) ereport(ERROR, (errcode(ERRCODE_CONFIG_FILE_ERROR), errmsg("multibyte flag character is not allowed at line %d of affix file \"%s\"", ! line, filename))); Conf->flagval[(unsigned int) *s] = (unsigned char) val; Conf->usecompound = true; --- 520,532 ---- ereport(ERROR, (errcode(ERRCODE_CONFIG_FILE_ERROR), errmsg("syntax error at line %d of affix file \"%s\"", ! lineno, filename))); if (pg_mblen(s) != 1) ereport(ERROR, (errcode(ERRCODE_CONFIG_FILE_ERROR), errmsg("multibyte flag character is not allowed at line %d of affix file \"%s\"", ! lineno, filename))); Conf->flagval[(unsigned int) *s] = (unsigned char) val; Conf->usecompound = true; *************** *** 546,552 **** static void NIImportOOAffixes(IspellDict * Conf, const char *filename) { - char str[BUFSIZ]; char type[BUFSIZ], *ptype = NULL; char sflag[BUFSIZ]; --- 535,540 ---- *************** *** 560,568 **** int flag = 0; char flagflags = 0; FILE *affix; ! int line = 0; int scanread = 0; char scanbuf[BUFSIZ]; checkTmpCtx(); --- 548,557 ---- int flag = 0; char flagflags = 0; FILE *affix; ! 
int lineno = 0; int scanread = 0; char scanbuf[BUFSIZ]; + char *recoded; checkTmpCtx(); *************** *** 576,620 **** errmsg("could not open affix file \"%s\": %m", filename))); ! while (fgets(str, sizeof(str), affix)) { ! char *recoded; ! ! recoded = (char *) pg_do_encoding_conversion((unsigned char *) str, strlen(str), ! PG_UTF8, GetDatabaseEncoding()); ! if (recoded == NULL) ! elog(ERROR, "encoding conversion failed"); ! ! line++; if (*recoded == '\0' || t_isspace(recoded) || t_iseq(recoded, '#')) continue; if (STRNCMP(recoded, "COMPOUNDFLAG") == 0) addFlagValue(Conf, recoded + strlen("COMPOUNDFLAG"), ! FF_COMPOUNDFLAG, filename, line); else if (STRNCMP(recoded, "COMPOUNDBEGIN") == 0) addFlagValue(Conf, recoded + strlen("COMPOUNDBEGIN"), ! FF_COMPOUNDBEGIN, filename, line); else if (STRNCMP(recoded, "COMPOUNDLAST") == 0) addFlagValue(Conf, recoded + strlen("COMPOUNDLAST"), ! FF_COMPOUNDLAST, filename, line); /* COMPOUNDLAST and COMPOUNDEND are synonyms */ else if (STRNCMP(recoded, "COMPOUNDEND") == 0) addFlagValue(Conf, recoded + strlen("COMPOUNDEND"), ! FF_COMPOUNDLAST, filename, line); else if (STRNCMP(recoded, "COMPOUNDMIDDLE") == 0) addFlagValue(Conf, recoded + strlen("COMPOUNDMIDDLE"), ! FF_COMPOUNDMIDDLE, filename, line); else if (STRNCMP(recoded, "ONLYINCOMPOUND") == 0) addFlagValue(Conf, recoded + strlen("ONLYINCOMPOUND"), ! FF_COMPOUNDONLY, filename, line); else if (STRNCMP(recoded, "COMPOUNDPERMITFLAG") == 0) addFlagValue(Conf, recoded + strlen("COMPOUNDPERMITFLAG"), ! FF_COMPOUNDPERMITFLAG, filename, line); else if (STRNCMP(recoded, "COMPOUNDFORBIDFLAG") == 0) addFlagValue(Conf, recoded + strlen("COMPOUNDFORBIDFLAG"), ! FF_COMPOUNDFORBIDFLAG, filename, line); else if (STRNCMP(recoded, "FLAG") == 0) { char *s = recoded + strlen("FLAG"); --- 565,605 ---- errmsg("could not open affix file \"%s\": %m", filename))); ! while ((recoded = t_readline(affix)) != NULL) { ! 
lineno++; if (*recoded == '\0' || t_isspace(recoded) || t_iseq(recoded, '#')) + { + pfree(recoded); continue; + } if (STRNCMP(recoded, "COMPOUNDFLAG") == 0) addFlagValue(Conf, recoded + strlen("COMPOUNDFLAG"), ! FF_COMPOUNDFLAG, filename, lineno); else if (STRNCMP(recoded, "COMPOUNDBEGIN") == 0) addFlagValue(Conf, recoded + strlen("COMPOUNDBEGIN"), ! FF_COMPOUNDBEGIN, filename, lineno); else if (STRNCMP(recoded, "COMPOUNDLAST") == 0) addFlagValue(Conf, recoded + strlen("COMPOUNDLAST"), ! FF_COMPOUNDLAST, filename, lineno); /* COMPOUNDLAST and COMPOUNDEND are synonyms */ else if (STRNCMP(recoded, "COMPOUNDEND") == 0) addFlagValue(Conf, recoded + strlen("COMPOUNDEND"), ! FF_COMPOUNDLAST, filename, lineno); else if (STRNCMP(recoded, "COMPOUNDMIDDLE") == 0) addFlagValue(Conf, recoded + strlen("COMPOUNDMIDDLE"), ! FF_COMPOUNDMIDDLE, filename, lineno); else if (STRNCMP(recoded, "ONLYINCOMPOUND") == 0) addFlagValue(Conf, recoded + strlen("ONLYINCOMPOUND"), ! FF_COMPOUNDONLY, filename, lineno); else if (STRNCMP(recoded, "COMPOUNDPERMITFLAG") == 0) addFlagValue(Conf, recoded + strlen("COMPOUNDPERMITFLAG"), ! FF_COMPOUNDPERMITFLAG, filename, lineno); else if (STRNCMP(recoded, "COMPOUNDFORBIDFLAG") == 0) addFlagValue(Conf, recoded + strlen("COMPOUNDFORBIDFLAG"), ! FF_COMPOUNDFORBIDFLAG, filename, lineno); else if (STRNCMP(recoded, "FLAG") == 0) { char *s = recoded + strlen("FLAG"); *************** *** 626,639 **** ereport(ERROR, (errcode(ERRCODE_CONFIG_FILE_ERROR), errmsg("Ispell dictionary supports only default flag value at line %d of affix file \"%s\"", ! line, filename))); } ! if (recoded != str) ! pfree(recoded); } FreeFile(affix); ! line = 0; sprintf(scanbuf, "%%6s %%%ds %%%ds %%%ds %%%ds", BUFSIZ / 5, BUFSIZ / 5, BUFSIZ / 5, BUFSIZ / 5); --- 611,623 ---- ereport(ERROR, (errcode(ERRCODE_CONFIG_FILE_ERROR), errmsg("Ispell dictionary supports only default flag value at line %d of affix file \"%s\"", ! lineno, filename))); } ! pfree(recoded); } FreeFile(affix); ! 
lineno = 0; sprintf(scanbuf, "%%6s %%%ds %%%ds %%%ds %%%ds", BUFSIZ / 5, BUFSIZ / 5, BUFSIZ / 5, BUFSIZ / 5); *************** *** 643,660 **** errmsg("could not open affix file \"%s\": %m", filename))); ! while (fgets(str, sizeof(str), affix)) { ! char *recoded; ! ! recoded = (char *) pg_do_encoding_conversion((unsigned char *) str, strlen(str), ! PG_UTF8, GetDatabaseEncoding()); ! if (recoded == NULL) ! elog(ERROR, "encoding conversion failed"); ! ! line++; if (*recoded == '\0' || t_isspace(recoded) || t_iseq(recoded, '#')) ! continue; scanread = sscanf(recoded, scanbuf, type, sflag, find, repl, mask); --- 627,637 ---- errmsg("could not open affix file \"%s\": %m", filename))); ! while ((recoded = t_readline(affix)) != NULL) { ! lineno++; if (*recoded == '\0' || t_isspace(recoded) || t_iseq(recoded, '#')) ! goto nextline; scanread = sscanf(recoded, scanbuf, type, sflag, find, repl, mask); *************** *** 662,673 **** pfree(ptype); ptype = lowerstr_ctx(type); if (scanread < 4 || (STRNCMP(ptype, "sfx") && STRNCMP(ptype, "pfx"))) ! continue; if (scanread == 4) { if (strlen(sflag) != 1) ! continue; flag = *sflag; isSuffix = (STRNCMP(ptype, "sfx") == 0) ? true : false; pfind = lowerstr_ctx(find); --- 639,650 ---- pfree(ptype); ptype = lowerstr_ctx(type); if (scanread < 4 || (STRNCMP(ptype, "sfx") && STRNCMP(ptype, "pfx"))) ! goto nextline; if (scanread == 4) { if (strlen(sflag) != 1) ! goto nextline; flag = *sflag; isSuffix = (STRNCMP(ptype, "sfx") == 0) ? true : false; pfind = lowerstr_ctx(find); *************** *** 683,689 **** int aflg = 0; if (strlen(sflag) != 1 || flag != *sflag || flag == 0) ! continue; prepl = lowerstr_ctx(repl); /* affix flag */ if ((ptr = strchr(prepl, '/')) != NULL) --- 660,666 ---- int aflg = 0; if (strlen(sflag) != 1 || flag != *sflag || flag == 0) ! goto nextline; prepl = lowerstr_ctx(repl); /* affix flag */ if ((ptr = strchr(prepl, '/')) != NULL) *************** *** 710,717 **** pfree(pmask); } ! if (recoded != str) ! 
pfree(recoded); } if (ptype) --- 687,694 ---- pfree(pmask); } ! nextline: ! pfree(recoded); } if (ptype) *************** *** 733,745 **** char find[BUFSIZ]; char repl[BUFSIZ]; char *s; ! int suffixes = 0; ! int prefixes = 0; int flag = 0; char flagflags = 0; FILE *affix; ! int line = 0; ! int oldformat = 0; checkTmpCtx(); --- 710,723 ---- char find[BUFSIZ]; char repl[BUFSIZ]; char *s; ! bool suffixes = false; ! bool prefixes = false; int flag = 0; char flagflags = 0; FILE *affix; ! int lineno = 0; ! bool oldformat = false; ! char *recoded = NULL; checkTmpCtx(); *************** *** 752,767 **** memset(Conf->flagval, 0, sizeof(Conf->flagval)); Conf->usecompound = false; ! while (fgets(str, sizeof(str), affix)) { ! if (pstr) ! pfree(pstr); ! pstr = recode_and_lowerstr(str); ! line++; if (*pstr == '#' || *pstr == '\n') ! continue; if (STRNCMP(pstr, "compoundwords") == 0) { --- 730,745 ---- memset(Conf->flagval, 0, sizeof(Conf->flagval)); Conf->usecompound = false; ! while ((recoded = t_readline(affix)) != NULL) { ! pstr = lowerstr(recoded); ! pfree(recoded); ! lineno++; ! /* Skip comments and empty lines */ if (*pstr == '#' || *pstr == '\n') ! goto nextline; if (STRNCMP(pstr, "compoundwords") == 0) { *************** *** 777,799 **** Conf->flagval[(unsigned int) *s] = FF_COMPOUNDFLAG; Conf->usecompound = true; } ! oldformat++; ! continue; } } if (STRNCMP(pstr, "suffixes") == 0) { ! suffixes = 1; ! prefixes = 0; ! oldformat++; ! continue; } if (STRNCMP(pstr, "prefixes") == 0) { ! suffixes = 0; ! prefixes = 1; ! oldformat++; ! continue; } if (STRNCMP(pstr, "flag") == 0) { --- 755,777 ---- Conf->flagval[(unsigned int) *s] = FF_COMPOUNDFLAG; Conf->usecompound = true; } ! oldformat = true; ! goto nextline; } } if (STRNCMP(pstr, "suffixes") == 0) { ! suffixes = true; ! prefixes = false; ! oldformat = true; ! goto nextline; } if (STRNCMP(pstr, "prefixes") == 0) { ! suffixes = false; ! prefixes = true; ! oldformat = true; ! 
goto nextline; } if (STRNCMP(pstr, "flag") == 0) { *************** *** 802,815 **** while (*s && t_isspace(s)) s++; ! oldformat++; /* allow only single-encoded flags */ if (pg_mblen(s) != 1) ereport(ERROR, (errcode(ERRCODE_CONFIG_FILE_ERROR), errmsg("multibyte flag character is not allowed at line %d of affix file \"%s\"", ! line, filename))); if (*s == '*') { --- 780,793 ---- while (*s && t_isspace(s)) s++; ! oldformat = true; /* allow only single-encoded flags */ if (pg_mblen(s) != 1) ereport(ERROR, (errcode(ERRCODE_CONFIG_FILE_ERROR), errmsg("multibyte flag character is not allowed at line %d of affix file \"%s\"", ! lineno, filename))); if (*s == '*') { *************** *** 830,839 **** ereport(ERROR, (errcode(ERRCODE_CONFIG_FILE_ERROR), errmsg("multibyte flag character is not allowed at line %d of affix file \"%s\"", ! line, filename))); flag = (unsigned char) *s; ! continue; } if (STRNCMP(str, "COMPOUNDFLAG") == 0 || STRNCMP(str, "COMPOUNDMIN") == 0 || STRNCMP(str, "PFX") == 0 || STRNCMP(str, "SFX") == 0) --- 808,817 ---- ereport(ERROR, (errcode(ERRCODE_CONFIG_FILE_ERROR), errmsg("multibyte flag character is not allowed at line %d of affix file \"%s\"", ! lineno, filename))); flag = (unsigned char) *s; ! goto nextline; } if (STRNCMP(str, "COMPOUNDFLAG") == 0 || STRNCMP(str, "COMPOUNDMIN") == 0 || STRNCMP(str, "PFX") == 0 || STRNCMP(str, "SFX") == 0) *************** *** 842,864 **** ereport(ERROR, (errcode(ERRCODE_CONFIG_FILE_ERROR), errmsg("wrong affix file format for flag at line %d of affix file \"%s\"", ! line, filename))); FreeFile(affix); NIImportOOAffixes(Conf, filename); return; } if ((!suffixes) && (!prefixes)) ! continue; ! if (!parse_affentry(pstr, mask, find, repl, filename, line)) ! continue; NIAddAffix(Conf, flag, flagflags, mask, find, repl, suffixes ? FF_SUFFIX : FF_PREFIX); - } - FreeFile(affix); ! 
if (pstr) pfree(pstr); } static int --- 820,842 ---- ereport(ERROR, (errcode(ERRCODE_CONFIG_FILE_ERROR), errmsg("wrong affix file format for flag at line %d of affix file \"%s\"", ! lineno, filename))); FreeFile(affix); NIImportOOAffixes(Conf, filename); return; } if ((!suffixes) && (!prefixes)) ! goto nextline; ! if (!parse_affentry(pstr, mask, find, repl, filename, lineno)) ! goto nextline; NIAddAffix(Conf, flag, flagflags, mask, find, repl, suffixes ? FF_SUFFIX : FF_PREFIX); ! nextline: pfree(pstr); + } + FreeFile(affix); } static int *************** *** 975,1012 **** return rs; } void NISortDictionary(IspellDict * Conf) { ! size_t i; ! int naffix = 3; checkTmpCtx(); /* compress affixes */ qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL *), cmpspellaffix); ! for (i = 1; i < Conf->nspell; i++) ! if (strcmp(Conf->Spell[i]->p.flag, Conf->Spell[i - 1]->p.flag)) naffix++; Conf->AffixData = (char **) palloc0(naffix * sizeof(char *)); ! naffix = 1; ! Conf->AffixData[0] = pstrdup(""); ! Conf->AffixData[1] = pstrdup(Conf->Spell[0]->p.flag); ! Conf->Spell[0]->p.d.affix = 1; ! Conf->Spell[0]->p.d.len = strlen(Conf->Spell[0]->word); ! for (i = 1; i < Conf->nspell; i++) { ! if (strcmp(Conf->Spell[i]->p.flag, Conf->AffixData[naffix])) { ! naffix++; ! Conf->AffixData[naffix] = pstrdup(Conf->Spell[i]->p.flag); } ! Conf->Spell[i]->p.d.affix = naffix; Conf->Spell[i]->p.d.len = strlen(Conf->Spell[i]->word); } Conf->lenAffixData = Conf->nAffixData = naffix; qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL *), cmpspell); Conf->Dictionary = mkSPNode(Conf, 0, Conf->nspell, 0); --- 953,1007 ---- return rs; } + /* + * Builds the Conf->Dictionary tree and AffixData from the imported dictionary + * and affixes. + */ void NISortDictionary(IspellDict * Conf) { ! int i; ! int naffix = 0; ! 
int curaffix; checkTmpCtx(); /* compress affixes */ + + /* Count the number of different flags used in the dictionary */ + qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL *), cmpspellaffix); ! ! naffix = 0; ! for (i = 0; i < Conf->nspell; i++) ! { ! if (i == 0 || strncmp(Conf->Spell[i]->p.flag, Conf->Spell[i - 1]->p.flag, MAXFLAGLEN)) naffix++; + } + /* + * Fill in Conf->AffixData with the affixes that were used + * in the dictionary. Replace textual flag-field of Conf->Spell + * entries with indexes into Conf->AffixData array. + */ Conf->AffixData = (char **) palloc0(naffix * sizeof(char *)); ! ! curaffix = -1; ! for (i = 0; i < Conf->nspell; i++) { ! if (i == 0 || strncmp(Conf->Spell[i]->p.flag, Conf->AffixData[curaffix], MAXFLAGLEN)) { ! curaffix++; ! Assert(curaffix < naffix); ! Conf->AffixData[curaffix] = pstrdup(Conf->Spell[i]->p.flag); } ! ! Conf->Spell[i]->p.d.affix = curaffix; Conf->Spell[i]->p.d.len = strlen(Conf->Spell[i]->word); } Conf->lenAffixData = Conf->nAffixData = naffix; + qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL *), cmpspell); Conf->Dictionary = mkSPNode(Conf, 0, Conf->nspell, 0); *************** *** 1085,1091 **** } static void ! mkVoidAffix(IspellDict * Conf, int issuffix, int startsuffix) { int i, cnt = 0; --- 1080,1086 ---- } static void ! mkVoidAffix(IspellDict * Conf, bool issuffix, int startsuffix) { int i, cnt = 0; *************** *** 1145,1151 **** AFFIX *Affix; size_t i; CMPDAffix *ptr; ! int firstsuffix = -1; checkTmpCtx(); --- 1140,1146 ---- AFFIX *Affix; size_t i; CMPDAffix *ptr; ! int firstsuffix = Conf->naffixes; checkTmpCtx(); *************** *** 1160,1166 **** for (i = 0; i < Conf->naffixes; i++) { Affix = &(((AFFIX *) Conf->Affix)[i]); ! if (Affix->type == FF_SUFFIX && firstsuffix < 0) firstsuffix = i; if ((Affix->flagflags & FF_COMPOUNDFLAG) && Affix->replen > 0 && --- 1155,1161 ---- for (i = 0; i < Conf->naffixes; i++) { Affix = &(((AFFIX *) Conf->Affix)[i]); ! 
if (Affix->type == FF_SUFFIX && i < firstsuffix) firstsuffix = i; if ((Affix->flagflags & FF_COMPOUNDFLAG) && Affix->replen > 0 && *************** *** 1185,1196 **** Conf->Prefix = mkANode(Conf, 0, firstsuffix, 0, FF_PREFIX); Conf->Suffix = mkANode(Conf, firstsuffix, Conf->naffixes, 0, FF_SUFFIX); ! mkVoidAffix(Conf, 1, firstsuffix); ! mkVoidAffix(Conf, 0, firstsuffix); } static AffixNodeData * ! FinfAffixes(AffixNode * node, const char *word, int wrdlen, int *level, int type) { AffixNodeData *StopLow, *StopHigh, --- 1180,1191 ---- Conf->Prefix = mkANode(Conf, 0, firstsuffix, 0, FF_PREFIX); Conf->Suffix = mkANode(Conf, firstsuffix, Conf->naffixes, 0, FF_SUFFIX); ! mkVoidAffix(Conf, true, firstsuffix); ! mkVoidAffix(Conf, false, firstsuffix); } static AffixNodeData * ! FindAffixes(AffixNode * node, const char *word, int wrdlen, int *level, int type) { AffixNodeData *StopLow, *StopHigh, *************** *** 1374,1380 **** plevel = 0; while (pnode) { ! prefix = FinfAffixes(pnode, word, wrdlen, &plevel, FF_PREFIX); if (!prefix) break; for (j = 0; j < prefix->naff; j++) --- 1369,1375 ---- plevel = 0; while (pnode) { ! prefix = FindAffixes(pnode, word, wrdlen, &plevel, FF_PREFIX); if (!prefix) break; for (j = 0; j < prefix->naff; j++) *************** *** 1398,1404 **** int baselen = 0; /* find possible suffix */ ! suffix = FinfAffixes(snode, word, wrdlen, &slevel, FF_SUFFIX); if (!suffix) break; /* foreach suffix check affix */ --- 1393,1399 ---- int baselen = 0; /* find possible suffix */ ! suffix = FindAffixes(snode, word, wrdlen, &slevel, FF_SUFFIX); if (!suffix) break; /* foreach suffix check affix */ *************** *** 1416,1422 **** swrdlen = strlen(newword); while (pnode) { ! prefix = FinfAffixes(pnode, newword, swrdlen, &plevel, FF_PREFIX); if (!prefix) break; for (j = 0; j < prefix->naff; j++) --- 1411,1417 ---- swrdlen = strlen(newword); while (pnode) { ! 
prefix = FindAffixes(pnode, newword, swrdlen, &plevel, FF_PREFIX); if (!prefix) break; for (j = 0; j < prefix->naff; j++) *************** *** 1626,1632 **** if (wordlen == level + 1) { /* well, it was last word */ ! var->stem[var->nstem] = strnduplicate(word + startpos, wordlen - startpos); var->nstem++; pfree(notprobed); return var; --- 1621,1627 ---- if (wordlen == level + 1) { /* well, it was last word */ ! var->stem[var->nstem] = pnstrdup(word + startpos, wordlen - startpos); var->nstem++; pfree(notprobed); return var; *************** *** 1641,1647 **** ptr->next = SplitToVariants(Conf, node, var, word, wordlen, startpos, level); /* we can find next word */ level++; ! var->stem[var->nstem] = strnduplicate(word + startpos, level - startpos); var->nstem++; node = Conf->Dictionary; startpos = level; --- 1636,1642 ---- ptr->next = SplitToVariants(Conf, node, var, word, wordlen, startpos, level); /* we can find next word */ level++; ! var->stem[var->nstem] = pnstrdup(word + startpos, level - startpos); var->nstem++; node = Conf->Dictionary; startpos = level; *************** *** 1656,1662 **** level++; } ! var->stem[var->nstem] = strnduplicate(word + startpos, wordlen - startpos); var->nstem++; pfree(notprobed); return var; --- 1651,1657 ---- level++; } ! var->stem[var->nstem] = pnstrdup(word + startpos, wordlen - startpos); var->nstem++; pfree(notprobed); return var; Index: src/backend/tsearch/ts_locale.c =================================================================== RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/tsearch/ts_locale.c,v retrieving revision 1.1 diff -c -r1.1 ts_locale.c *** src/backend/tsearch/ts_locale.c 21 Aug 2007 01:11:18 -0000 1.1 --- src/backend/tsearch/ts_locale.c 24 Aug 2007 09:47:44 -0000 *************** *** 125,152 **** } #endif /* TS_USE_WIDE */ /* ! * Convert C-string from UTF8 to server encoding and ! * lower it */ char * ! recode_and_lowerstr(char *str) { ! char *recoded; ! char *ret; ! ! 
recoded = (char *) pg_do_encoding_conversion((unsigned char *) str, strlen(str), ! PG_UTF8, GetDatabaseEncoding()); if (recoded == NULL) elog(ERROR, "encoding conversion failed"); ! ret = lowerstr(recoded); ! ! if (recoded != str) ! pfree(recoded); ! return ret; } char * --- 125,169 ---- } #endif /* TS_USE_WIDE */ + /* ! * Utility function to read a line from a tsearch data file, ! * and recode it to database encoding. The returned string ! * is palloc'd. */ char * ! t_readline(FILE *fp) { ! int len; ! static char *recoded = NULL; ! static char buf[4096]; ! ! if(fgets(buf, sizeof(buf), fp) == NULL) ! return NULL; ! ! len = strnlen(buf, sizeof(buf)); ! ! /* Make sure the input is valid UTF-8 */ ! (void) pg_verify_mbstr(PG_UTF8, buf, len, false); ! ! recoded = (char *) pg_do_encoding_conversion( ! (unsigned char *) buf, ! len, ! PG_UTF8, ! GetDatabaseEncoding()); if (recoded == NULL) elog(ERROR, "encoding conversion failed"); ! if (recoded == buf) ! { ! /* we can use the length of the original string, because ! * no conversion was done ! */ ! recoded = pnstrdup(recoded, len); ! } ! return recoded; } char * *************** *** 155,160 **** --- 172,180 ---- return lowerstr_with_len(str, strlen(str)); } + /* + * Returned string is palloc'd + */ char * lowerstr_with_len(char *str, int len) { Index: src/backend/tsearch/ts_parse.c =================================================================== RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/tsearch/ts_parse.c,v retrieving revision 1.1 diff -c -r1.1 ts_parse.c *** src/backend/tsearch/ts_parse.c 21 Aug 2007 01:11:18 -0000 1.1 --- src/backend/tsearch/ts_parse.c 23 Aug 2007 12:29:51 -0000 *************** *** 308,314 **** { /* * Dictionary normalizes lexemes, so we remove from stack all ! * used lexemes , return to basic mode and redo end of stack * (if it exists) */ if (res) --- 308,314 ---- { /* * Dictionary normalizes lexemes, so we remove from stack all ! 
* used lexemes, return to basic mode and redo end of stack * (if it exists) */ if (res) *************** *** 571,577 **** } text * ! generatHeadline(HeadlineText * prs) { text *out; int len = 128; --- 571,577 ---- } text * ! generateHeadline(HeadlineText * prs) { text *out; int len = 128; Index: src/backend/tsearch/ts_utils.c =================================================================== RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/tsearch/ts_utils.c,v retrieving revision 1.2 diff -c -r1.2 ts_utils.c *** src/backend/tsearch/ts_utils.c 22 Aug 2007 01:39:44 -0000 1.2 --- src/backend/tsearch/ts_utils.c 24 Aug 2007 10:57:58 -0000 *************** *** 63,83 **** return result; } ! #define STOPBUFLEN 4096 void ! readstoplist(char *in, StopList * s) { char **stop = NULL; s->len = 0; if (in && *in) { char *filename = get_tsearch_config_filename(in, "stop"); FILE *hin; - char buf[STOPBUFLEN]; int reallen = 0; - int line = 0; if ((hin = AllocateFile(filename, "r")) == NULL) ereport(ERROR, --- 63,90 ---- return result; } ! static int ! comparestr(const void *a, const void *b) ! { ! return strcmp(*(char **) a, *(char **) b); ! } + /* + * Reads a stopword file. Each word is ran through 'wordop' + * function, if given. + */ void ! readstoplist(char *in, StopList * s, char *(*wordop) (char *)) { char **stop = NULL; + char *line; s->len = 0; if (in && *in) { char *filename = get_tsearch_config_filename(in, "stop"); FILE *hin; int reallen = 0; if ((hin = AllocateFile(filename, "r")) == NULL) ereport(ERROR, *************** *** 85,109 **** errmsg("could not open stopword file \"%s\": %m", filename))); ! while (fgets(buf, STOPBUFLEN, hin)) { ! char *pbuf = buf; ! line++; ! while (*pbuf && !isspace(*pbuf)) pbuf++; *pbuf = '\0'; ! if (*buf == '\0') ! continue; ! ! if (!pg_verifymbstr(buf, strlen(buf), true)) { ! FreeFile(hin); ! ereport(ERROR, ! (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), ! errmsg("invalid multibyte encoding at line %d in file \"%s\"", ! 
line, filename))); } if (s->len >= reallen) --- 92,111 ---- errmsg("could not open stopword file \"%s\": %m", filename))); ! while ((line = t_readline(hin)) != NULL) { ! char *pbuf = line; ! /* Trim trailing space */ ! while (*pbuf && !t_isspace(pbuf)) pbuf++; *pbuf = '\0'; ! /* Skip empty lines */ ! if (*line == '\0') { ! pfree(line); ! continue; } if (s->len >= reallen) *************** *** 120,130 **** } } ! ! if (s->wordop) ! stop[s->len] = s->wordop(buf); else ! stop[s->len] = pstrdup(buf); (s->len)++; } --- 122,135 ---- } } ! if (wordop) ! { ! stop[s->len] = wordop(line); ! if (stop[s->len] != line) ! pfree(line); ! } else ! stop[s->len] = line; (s->len)++; } *************** *** 133,149 **** } s->stop = stop; - } - - static int - comparestr(const void *a, const void *b) - { - return strcmp(*(char **) a, *(char **) b); - } ! void ! sortstoplist(StopList * s) ! { if (s->stop && s->len > 0) qsort(s->stop, s->len, sizeof(char *), comparestr); } --- 138,145 ---- } s->stop = stop; ! /* Sort to allow binary searching */ if (s->stop && s->len > 0) qsort(s->stop, s->len, sizeof(char *), comparestr); } Index: src/backend/tsearch/wparser.c =================================================================== RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/tsearch/wparser.c,v retrieving revision 1.2 diff -c -r1.2 wparser.c *** src/backend/tsearch/wparser.c 22 Aug 2007 01:39:45 -0000 1.2 --- src/backend/tsearch/wparser.c 23 Aug 2007 12:29:59 -0000 *************** *** 325,331 **** PointerGetDatum(prsoptions), PointerGetDatum(query)); ! out = generatHeadline(&prs); PG_FREE_IF_COPY(in, 1); PG_FREE_IF_COPY(query, 2); --- 325,331 ---- PointerGetDatum(prsoptions), PointerGetDatum(query)); ! 
out = generateHeadline(&prs); PG_FREE_IF_COPY(in, 1); PG_FREE_IF_COPY(query, 2); Index: src/include/tsearch/ts_locale.h =================================================================== RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/include/tsearch/ts_locale.h,v retrieving revision 1.1 diff -c -r1.1 ts_locale.h *** src/include/tsearch/ts_locale.h 21 Aug 2007 01:11:29 -0000 1.1 --- src/include/tsearch/ts_locale.h 24 Aug 2007 09:48:14 -0000 *************** *** 83,88 **** char *lowerstr(char *str); char *lowerstr_with_len(char *str, int len); ! char *recode_and_lowerstr(char *str); #endif /* __TSLOCALE_H__ */ --- 83,88 ---- char *lowerstr(char *str); char *lowerstr_with_len(char *str, int len); ! char *t_readline(FILE *fp); #endif /* __TSLOCALE_H__ */ Index: src/include/tsearch/ts_public.h =================================================================== RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/include/tsearch/ts_public.h,v retrieving revision 1.2 diff -c -r1.2 ts_public.h *** src/include/tsearch/ts_public.h 22 Aug 2007 01:39:46 -0000 1.2 --- src/include/tsearch/ts_public.h 23 Aug 2007 19:55:25 -0000 *************** *** 71,81 **** { int len; char **stop; - char *(*wordop) (char *); } StopList; ! extern void sortstoplist(StopList * s); ! extern void readstoplist(char *in, StopList * s); extern bool searchstoplist(StopList * s, char *key); /* --- 71,79 ---- { int len; char **stop; } StopList; ! 
extern void readstoplist(char *in, StopList * s, char *(*wordop) (char *)); extern bool searchstoplist(StopList * s, char *key); /* Index: src/include/tsearch/ts_utils.h =================================================================== RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/include/tsearch/ts_utils.h,v retrieving revision 1.1 diff -c -r1.1 ts_utils.h *** src/include/tsearch/ts_utils.h 21 Aug 2007 01:11:29 -0000 1.1 --- src/include/tsearch/ts_utils.h 23 Aug 2007 12:30:32 -0000 *************** *** 102,108 **** * headline framework, flow in common to generate: * 1 parse text with hlparsetext * 2 parser-specific function to find part ! * 3 generatHeadline to generate result text */ typedef struct --- 102,108 ---- * headline framework, flow in common to generate: * 1 parse text with hlparsetext * 2 parser-specific function to find part ! * 3 generateHeadline to generate result text */ typedef struct *************** *** 131,137 **** extern void hlparsetext(Oid cfgId, HeadlineText * prs, TSQuery query, char *buf, int4 buflen); ! extern text *generatHeadline(HeadlineText * prs); /* * token/node types for parsing --- 131,137 ---- extern void hlparsetext(Oid cfgId, HeadlineText * prs, TSQuery query, char *buf, int4 buflen); ! extern text *generateHeadline(HeadlineText * prs); /* * token/node types for parsing Index: src/include/tsearch/dicts/spell.h =================================================================== RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/include/tsearch/dicts/spell.h,v retrieving revision 1.1 diff -c -r1.1 spell.h *** src/include/tsearch/dicts/spell.h 21 Aug 2007 01:11:29 -0000 1.1 --- src/include/tsearch/dicts/spell.h 24 Aug 2007 10:59:49 -0000 *************** *** 18,23 **** --- 18,29 ---- #include "tsearch/dicts/regis.h" #include "tsearch/ts_public.h" + /* + * Max length of a flag name. Names longer than this will be truncated + * to the maximum. 
+ */ + #define MAXFLAGLEN 16 + struct SPNode; typedef struct *************** *** 54,67 **** { union { ! char flag[16]; struct { int affix; int len; } d; } p; ! char word[1]; } SPELL; #define SPELLHDRSZ (offsetof(SPELL, word)) --- 60,76 ---- { union { ! /* flag is filled in by NIImportDictionary. After NISortDictionary, ! * d is valid and flag is invalid. ! */ ! char flag[MAXFLAGLEN]; struct { int affix; int len; } d; } p; ! char word[1]; /* variable length, null-terminated */ } SPELL; #define SPELLHDRSZ (offsetof(SPELL, word)) *************** *** 90,95 **** --- 99,110 ---- #define FF_COMPOUNDPERMITFLAG 0x10 #define FF_COMPOUNDFORBIDFLAG 0x20 #define FF_CROSSPRODUCT 0x40 + + /* + * Don't change the order of these. Initialization + * sorts by these, and expects prefixes to + * come first after sorting. + */ #define FF_SUFFIX 1 #define FF_PREFIX 0 *************** *** 126,134 **** int naffixes; AFFIX *Affix; ! int nspell; ! int mspell; SPELL **Spell; AffixNode *Suffix; AffixNode *Prefix; --- 141,151 ---- int naffixes; AFFIX *Affix; ! /* Temporary array of all words in the dict file. Only used during ! * initialization */ SPELL **Spell; + int nspell; /* number of entries in Spell-array */ + int mspell; /* allocated length of Spell-array */ AffixNode *Suffix; AffixNode *Prefix;
pgsql-patches by date: