Re: tsearch refactorings - Mailing list pgsql-patches
From | Heikki Linnakangas |
---|---|
Subject | Re: tsearch refactorings |
Date | |
Msg-id | 46E11F8C.5020903@enterprisedb.com Whole thread Raw |
In response to | Re: tsearch refactorings (Tom Lane <tgl@sss.pgh.pa.us>) |
Responses |
XML binary I/O (was Re: tsearch refactorings)
Re: tsearch refactorings |
List | pgsql-patches |
Tom Lane wrote:
> Any portion of a binary value that is considered textual should be
> converted to and from client encoding --- cf textsend/textrecv.
> This should be pretty trivial to fix, just call a different support
> routine.

You do need to adjust the length and position fields in the structs as well. I fixed (rewrote, almost) the send/recv functions, and added a comment above them describing the on-wire format. The CRC is now recalculated in tsquery as well, per previous discussion.

Patch attached. This is on top of the previous patches I sent. It includes some additional changes that I had already started with. Most notably:

- change the alignment requirement of lexemes in TSVector slightly. Lexeme strings were always padded to a 2-byte-aligned length to make sure that if there's a position array (uint16[]), it has the right alignment. The patch changes that so that the padding is not done when there are no positions. That makes the storage of tsvectors without positions slightly more compact.

- added some #include "miscadmin.h" lines I missed in the earlier patch when I added calls to check_stack_depth().

BTW, the encoding of the XML datatype looks pretty funky. xml_recv first reads the xml string with pq_getmsgtext, which applies a client->server conversion. Then the xml declaration is parsed, extracting the encoding attribute. Then the string is converted again from that encoding (or UTF-8 if none was specified) to server encoding. I don't understand how it's supposed to work, but ISTM there's one conversion too many.

> BTW, Teodor, are you intending to review/apply Heikki's tsearch fixes,
> or do you want someone else to do it?

I am getting confused with the patches and versions I have lying around here... I think I'll have to wait for review of the patches I've posted thus far before I continue hacking.
-- Heikki Linnakangas EnterpriseDB http://www.enterprisedb.com *** ../pgsql.tsearch-2/src/backend/utils/adt/tsginidx.c 2007-09-06 11:19:57.000000000 +0100 --- ./src/backend/utils/adt/tsginidx.c 2007-09-07 09:20:27.000000000 +0100 *************** *** 22,28 **** gin_extract_tsvector(PG_FUNCTION_ARGS) { TSVector vector = PG_GETARG_TSVECTOR(0); ! uint32 *nentries = (uint32 *) PG_GETARG_POINTER(1); Datum *entries = NULL; *nentries = vector->size; --- 22,28 ---- gin_extract_tsvector(PG_FUNCTION_ARGS) { TSVector vector = PG_GETARG_TSVECTOR(0); ! int32 *nentries = (int32 *) PG_GETARG_POINTER(1); Datum *entries = NULL; *nentries = vector->size; *************** *** 54,60 **** gin_extract_query(PG_FUNCTION_ARGS) { TSQuery query = PG_GETARG_TSQUERY(0); ! uint32 *nentries = (uint32 *) PG_GETARG_POINTER(1); StrategyNumber strategy = PG_GETARG_UINT16(2); Datum *entries = NULL; --- 54,60 ---- gin_extract_query(PG_FUNCTION_ARGS) { TSQuery query = PG_GETARG_TSQUERY(0); ! int32 *nentries = (int32 *) PG_GETARG_POINTER(1); StrategyNumber strategy = PG_GETARG_UINT16(2); Datum *entries = NULL; *** ../pgsql.tsearch-2/src/backend/utils/adt/tsquery.c 2007-09-05 11:59:09.000000000 +0100 --- ./src/backend/utils/adt/tsquery.c 2007-09-07 09:35:18.000000000 +0100 *************** *** 21,27 **** #include "tsearch/ts_utils.h" #include "utils/memutils.h" #include "utils/pg_crc.h" - #include "nodes/bitmapset.h" struct TSQueryParserStateData --- 21,26 ---- *************** *** 384,399 **** } } - /* - * Fills in the left-fields previously left unfilled. The input - * QueryItems must be in polish (prefix) notation. - */ static void ! findoprnd(QueryItem *ptr, uint32 *pos) { /* since this function recurses, it could be driven to stack overflow. */ check_stack_depth(); if (ptr[*pos].type == QI_VAL || ptr[*pos].type == QI_VALSTOP) /* need to handle VALSTOP here, * they haven't been cleansed --- 383,397 ---- } } static void ! 
findoprnd_recurse(QueryItem *ptr, uint32 *pos, int nnodes) { /* since this function recurses, it could be driven to stack overflow. */ check_stack_depth(); + if (*pos >= nnodes) + elog(ERROR, "malformed tsquery; operand not found"); + if (ptr[*pos].type == QI_VAL || ptr[*pos].type == QI_VALSTOP) /* need to handle VALSTOP here, * they haven't been cleansed *************** *** 410,416 **** { ptr[*pos].operator.left = 1; (*pos)++; ! findoprnd(ptr, pos); } else { --- 408,414 ---- { ptr[*pos].operator.left = 1; (*pos)++; ! findoprnd_recurse(ptr, pos, nnodes); } else { *************** *** 420,432 **** Assert(curitem->oper == OP_AND || curitem->oper == OP_OR); (*pos)++; ! findoprnd(ptr, pos); curitem->left = *pos - tmp; ! findoprnd(ptr, pos); } } } /* * Each value (operand) in the query is be passed to pushval. pushval can * transform the simple value to an arbitrarily complex expression using --- 418,448 ---- Assert(curitem->oper == OP_AND || curitem->oper == OP_OR); (*pos)++; ! findoprnd_recurse(ptr, pos, nnodes); curitem->left = *pos - tmp; ! findoprnd_recurse(ptr, pos, nnodes); } } } + + /* + * Fills in the left-fields previously left unfilled. The input + * QueryItems must be in polish (prefix) notation. + */ + static void + findoprnd(QueryItem *ptr, int size) + { + uint32 pos; + + pos = 0; + findoprnd_recurse(ptr, &pos, size); + + if (pos != size) + elog(ERROR, "malformed tsquery; extra nodes"); + } + + /* * Each value (operand) in the query is be passed to pushval. pushval can * transform the simple value to an arbitrarily complex expression using *************** *** 452,458 **** TSQuery query; int commonlen; QueryItem *ptr; - uint32 pos = 0; ListCell *cell; /* init state */ --- 468,473 ---- *************** *** 522,529 **** pfree(state.op); /* Set left operand pointers for every operator. */ ! pos = 0; ! findoprnd(ptr, &pos); return query; } --- 537,543 ---- pfree(state.op); /* Set left operand pointers for every operator. */ ! 
findoprnd(ptr, query->size); return query; } *************** *** 734,739 **** --- 748,769 ---- PG_RETURN_CSTRING(nrm.buf); } + /* + * Binary Input / Output functions. The binary format is as follows: + * + * uint32 number of operators/operands in the query + * + * Followed by the operators and operands, in prefix notation. For each + * operand: + * + * uint8 type, QI_VAL + * uint8 weight + * operand text in client encoding, null-terminated + * + * For each operator: + * uint8 type, QI_OPR + * uint8 operator, one of OP_AND, OP_OR, OP_NOT. + */ Datum tsquerysend(PG_FUNCTION_ARGS) { *************** *** 744,750 **** pq_begintypsend(&buf); ! pq_sendint(&buf, query->size, sizeof(int32)); for (i = 0; i < query->size; i++) { pq_sendint(&buf, item->type, sizeof(item->type)); --- 774,780 ---- pq_begintypsend(&buf); ! pq_sendint(&buf, query->size, sizeof(uint32)); for (i = 0; i < query->size; i++) { pq_sendint(&buf, item->type, sizeof(item->type)); *************** *** 752,767 **** switch(item->type) { case QI_VAL: ! pq_sendint(&buf, item->operand.weight, sizeof(item->operand.weight)); ! pq_sendint(&buf, item->operand.valcrc, sizeof(item->operand.valcrc)); ! pq_sendint(&buf, item->operand.length, sizeof(int16)); /* istrue flag is just for temporary use in tsrank.c/Cover, * so we don't need to transfer that */ break; case QI_OPR: pq_sendint(&buf, item->operator.oper, sizeof(item->operator.oper)); - if (item->operator.oper != OP_NOT) - pq_sendint(&buf, item->operator.left, sizeof(item->operator.left)); break; default: elog(ERROR, "unknown tsquery node type %d", item->type); --- 782,794 ---- switch(item->type) { case QI_VAL: ! pq_sendint(&buf, item->operand.weight, sizeof(uint8)); ! 
pq_sendstring(&buf, GETOPERAND(query) + item->operand.distance); /* istrue flag is just for temporary use in tsrank.c/Cover, * so we don't need to transfer that */ break; case QI_OPR: pq_sendint(&buf, item->operator.oper, sizeof(item->operator.oper)); break; default: elog(ERROR, "unknown tsquery node type %d", item->type); *************** *** 769,782 **** item++; } - item = GETQUERY(query); - for (i = 0; i < query->size; i++) - { - if (item->type == QI_VAL) - pq_sendbytes(&buf, GETOPERAND(query) + item->operand.distance, item->operand.length); - item++; - } - PG_FREE_IF_COPY(query, 0); PG_RETURN_BYTEA_P(pq_endtypsend(&buf)); --- 796,801 ---- *************** *** 788,924 **** StringInfo buf = (StringInfo) PG_GETARG_POINTER(0); TSQuery query; int i, - size, len; QueryItem *item; ! int datalen = 0; char *ptr; ! Bitmapset *parentset = NULL; size = pq_getmsgint(buf, sizeof(uint32)); ! if (size < 0 || size > (MaxAllocSize / sizeof(QueryItem))) elog(ERROR, "invalid size of tsquery"); ! len = HDRSIZETQ + sizeof(QueryItem) * size; ! query = (TSQuery) palloc(len); query->size = size; item = GETQUERY(query); for (i = 0; i < size; i++) { item->type = (int8) pq_getmsgint(buf, sizeof(int8)); ! switch(item->type) { ! case QI_VAL: ! item->operand.weight = (int8) pq_getmsgint(buf, sizeof(int8)); ! item->operand.valcrc = (int32) pq_getmsgint(buf, sizeof(int32)); ! item->operand.length = pq_getmsgint(buf, sizeof(int16)); ! ! /* Check that the weight bitmap is valid */ ! if (item->operand.weight < 0 || item->operand.weight > 0xF) ! elog(ERROR, "invalid weight bitmap"); ! ! /* XXX: We don't check that the CRC is valid. Actually, if we ! * bothered to calculate it to verify, there would be no need ! * to transfer it. ! */ ! ! /* ! * Check that datalen doesn't grow too large. Without the ! * check, a malicious client could induce a buffer overflow ! * by sending a tsquery whose size exceeds 2GB. datalen ! * would overflow, we would allocate a too small buffer below, ! 
* and overflow the buffer. Because operand.length is a 20-bit ! * field, adding one such value to datalen must exceed ! * MaxAllocSize before wrapping over the 32-bit datalen field, ! * so this check will protect from it. ! */ ! if (datalen > MAXSTRLEN) ! elog(ERROR, "invalid tsquery; total operand length exceeded"); ! ! /* We can calculate distance from datalen, no need to send it ! * across the wire. If we did, we would have to check that ! * it's valid anyway. ! */ ! item->operand.distance = datalen; ! datalen += item->operand.length + 1; /* \0 */ ! ! break; ! case QI_OPR: ! item->operator.oper = (int8) pq_getmsgint(buf, sizeof(int8)); ! if (item->operator.oper != OP_NOT && ! item->operator.oper != OP_OR && ! item->operator.oper != OP_AND) ! elog(ERROR, "unknown operator type %d", (int) item->operator.oper); ! ! /* ! * Check that no previous operator node points to the right ! * operand. That would mean that the operand node ! * has two parents. ! */ ! if (bms_is_member(i + 1, parentset)) ! elog(ERROR, "malformed query tree"); ! ! parentset = bms_add_member(parentset, i + 1); ! ! if(item->operator.oper != OP_NOT) ! { ! uint32 left = (uint32) pq_getmsgint(buf, sizeof(uint32)); ! ! /* ! * Right operand is implicitly at "this+1". Don't allow ! * left to point to the right operand, or to self. ! */ ! if (left <= 1 || i + left >= size) ! elog(ERROR, "invalid pointer to left operand"); ! ! /* ! * Check that no previous operator node points to the left ! * operand. ! */ ! if (bms_is_member(i + left, parentset)) ! elog(ERROR, "malformed query tree"); ! ! parentset = bms_add_member(parentset, i + left); ! ! item->operator.left = left; ! } ! ! if (i == size - 1) ! elog(ERROR, "invalid pointer to right operand"); ! break; ! default: ! elog(ERROR, "unknown tsquery node type %d", item->type); } item++; } ! /* Now check that each node, except the root, has a parent. We ! * already checked above that no node has more than one parent. */ ! 
if (bms_num_members(parentset) != size - 1 && size != 0) ! elog(ERROR, "malformed query tree"); ! query = (TSQuery) repalloc(query, len + datalen); - item = GETQUERY(query); ptr = GETOPERAND(query); for (i = 0; i < size; i++) { if (item->type == QI_VAL) { ! memcpy(ptr, ! pq_getmsgbytes(buf, item->operand.length), ! item->operand.length); ! ptr += item->operand.length; ! *ptr++ = '\0'; } item++; } Assert(ptr - GETOPERAND(query) == datalen); SET_VARSIZE(query, len + datalen); --- 807,919 ---- StringInfo buf = (StringInfo) PG_GETARG_POINTER(0); TSQuery query; int i, len; QueryItem *item; ! int datalen; char *ptr; ! uint32 size; ! const char **operands; size = pq_getmsgint(buf, sizeof(uint32)); ! if (size > (MaxAllocSize / sizeof(QueryItem))) elog(ERROR, "invalid size of tsquery"); ! /* Allocate space to temporarily hold operand strings */ ! operands = palloc(size * sizeof(char *)); ! /* Allocate space for all the QueryItems. */ ! len = HDRSIZETQ + sizeof(QueryItem) * size; ! query = (TSQuery) palloc0(len); query->size = size; item = GETQUERY(query); + datalen = 0; for (i = 0; i < size; i++) { item->type = (int8) pq_getmsgint(buf, sizeof(int8)); ! if (item->type == QI_VAL) { ! size_t val_len; /* length after recoding to server encoding */ ! uint8 weight; ! const char *val; ! pg_crc32 valcrc; ! ! weight = (uint8) pq_getmsgint(buf, sizeof(uint8)); ! val = pq_getmsgstring(buf); ! val_len = strlen(val); ! ! /* Sanity checks */ ! ! if (weight > 0xF) ! elog(ERROR, "invalid tsquery; invalid weight bitmap"); ! ! if (val_len > MAXSTRLEN) ! elog(ERROR, "invalid tsquery; operand too long"); ! ! if (datalen > MAXSTRPOS) ! elog(ERROR, "invalid tsquery; total operand length exceeded"); ! ! /* Looks valid. */ ! ! INIT_CRC32(valcrc); ! COMP_CRC32(valcrc, val, val_len); ! FIN_CRC32(valcrc); ! ! item->operand.weight = weight; ! item->operand.valcrc = (int32) valcrc; ! item->operand.length = val_len; ! item->operand.distance = datalen; ! ! /* ! 
* Operand strings are copied to the final struct after this loop; ! * here we just collect them to an array ! */ ! operands[i] = val; ! ! datalen += val_len + 1; /* + 1 for the '\0' terminator */ ! } ! else if (item->type == QI_OPR) ! { ! int8 oper; ! oper = (int8) pq_getmsgint(buf, sizeof(int8)); ! if (oper != OP_NOT && oper != OP_OR && oper != OP_AND) ! elog(ERROR, "invalid tsquery; unknown operator type %d", (int) oper); ! if (i == size - 1) ! elog(ERROR, "invalid pointer to right operand"); ! item->operator.oper = oper; } + else + elog(ERROR, "unknown tsquery node type %d", item->type); item++; } ! /* Enlarge buffer to make room for the operand values. */ query = (TSQuery) repalloc(query, len + datalen); item = GETQUERY(query); ptr = GETOPERAND(query); + + /* + * Fill in the left-pointers. Checks that the tree is well-formed + * as a side-effect. + */ + findoprnd(item, size); + + /* Copy operands to output struct */ for (i = 0; i < size; i++) { if (item->type == QI_VAL) { ! memcpy(ptr, operands[i], item->operand.length + 1); ! 
ptr += item->operand.length + 1; } item++; } + pfree(operands); + Assert(ptr - GETOPERAND(query) == datalen); SET_VARSIZE(query, len + datalen); *** ../pgsql.tsearch-2/src/backend/utils/adt/tsquery_cleanup.c 2007-09-05 12:14:43.000000000 +0100 --- ./src/backend/utils/adt/tsquery_cleanup.c 2007-09-07 09:35:48.000000000 +0100 *************** *** 17,22 **** --- 17,23 ---- #include "tsearch/ts_type.h" #include "tsearch/ts_utils.h" + #include "miscadmin.h" typedef struct NODE { diff -r -c -x '*.o' -x '*.Po' -x config.log -x '*.so' -x CVS -x gram.c ../pgsql.tsearch-2/src/backend/utils/adt/tsquery_rewrite.c./src/backend/utils/adt/tsquery_rewrite.c *** ../pgsql.tsearch-2/src/backend/utils/adt/tsquery_rewrite.c 2007-09-05 12:18:46.000000000 +0100 --- ./src/backend/utils/adt/tsquery_rewrite.c 2007-09-06 23:25:00.000000000 +0100 *************** *** 17,22 **** --- 17,23 ---- #include "executor/spi.h" #include "tsearch/ts_type.h" #include "tsearch/ts_utils.h" + #include "miscadmin.h" static int *** ../pgsql.tsearch-2/src/backend/utils/adt/tsquery_util.c 2007-09-05 12:21:29.000000000 +0100 --- ./src/backend/utils/adt/tsquery_util.c 2007-09-06 23:25:14.000000000 +0100 *************** *** 16,21 **** --- 16,22 ---- #include "tsearch/ts_type.h" #include "tsearch/ts_utils.h" + #include "miscadmin.h" QTNode * QT2QTN(QueryItem * in, char *operand) *** ../pgsql.tsearch-2/src/backend/utils/adt/tsrank.c 2007-09-05 12:24:27.000000000 +0100 --- ./src/backend/utils/adt/tsrank.c 2007-09-06 23:56:29.000000000 +0100 *************** *** 18,23 **** --- 18,24 ---- #include "tsearch/ts_type.h" #include "tsearch/ts_utils.h" #include "utils/array.h" + #include "miscadmin.h" static float weights[] = {0.1, 0.2, 0.4, 1.0}; *************** *** 176,183 **** return res; } static WordEntryPos POSNULL[] = { ! 0, 0 }; --- 177,185 ---- return res; } + /* A dummy WordEntryPos array to use when haspos is false */ static WordEntryPos POSNULL[] = { ! 
1, /* Number of elements that follow */ 0 }; *************** *** 207,213 **** } pos = (uint16 **) palloc(sizeof(uint16 *) * q->size); memset(pos, 0, sizeof(uint16 *) * q->size); - *(uint16 *) POSNULL = lengthof(POSNULL) - 1; WEP_SETPOS(POSNULL[1], MAXENTRYPOS - 1); for (i = 0; i < size; i++) --- 209,214 ---- *************** *** 265,271 **** QueryOperand **item; int size = q->size; - *(uint16 *) POSNULL = lengthof(POSNULL) - 1; item = SortAndUniqItems(q, &size); for (i = 0; i < size; i++) --- 266,271 ---- *************** *** 593,599 **** DocRepresentation *doc; char *operand; - *(uint16 *) POSNULL = lengthof(POSNULL) - 1; doc = (DocRepresentation *) palloc(sizeof(DocRepresentation) * len); operand = GETOPERAND(query); reset_istrue_flag(query); --- 593,598 ---- *** ../pgsql.tsearch-2/src/backend/utils/adt/tsvector.c 2007-09-03 11:05:31.000000000 +0100 --- ./src/backend/utils/adt/tsvector.c 2007-09-07 09:47:46.000000000 +0100 *************** *** 75,92 **** } static int ! compareentry(const void *a, const void *b, void *arg) { char *BufferStr = (char *) arg; ! if (((WordEntryIN *) a)->entry.len == ((WordEntryIN *) b)->entry.len) { ! return strncmp(&BufferStr[((WordEntryIN *) a)->entry.pos], ! &BufferStr[((WordEntryIN *) b)->entry.pos], ! ((WordEntryIN *) a)->entry.len); } ! return (((WordEntryIN *) a)->entry.len > ((WordEntryIN *) b)->entry.len) ? 1 : -1; } static int --- 75,94 ---- } static int ! compareentry(const void *va, const void *vb, void *arg) { char *BufferStr = (char *) arg; + WordEntryIN *a = (WordEntryIN *) va; + WordEntryIN *b = (WordEntryIN *) vb; ! if (a->entry.len == b->entry.len) { ! return strncmp(&BufferStr[a->entry.pos], ! &BufferStr[b->entry.pos], ! a->entry.len); } ! return (a->entry.len > b->entry.len) ? 
1 : -1; } static int *************** *** 104,109 **** --- 106,114 ---- a->poslen = uniquePos(a->pos, a->poslen); *outbuflen = SHORTALIGN(a->entry.len) + (a->poslen + 1) * sizeof(WordEntryPos); } + else + *outbuflen = a->entry.len; + return l; } res = a; *************** *** 118,127 **** { if (res->entry.haspos) { res->poslen = uniquePos(res->pos, res->poslen); *outbuflen += res->poslen * sizeof(WordEntryPos); } ! *outbuflen += SHORTALIGN(res->entry.len); res++; memcpy(res, ptr, sizeof(WordEntryIN)); } --- 123,134 ---- { if (res->entry.haspos) { + *outbuflen += SHORTALIGN(res->entry.len); res->poslen = uniquePos(res->pos, res->poslen); *outbuflen += res->poslen * sizeof(WordEntryPos); } ! else ! *outbuflen += res->entry.len; res++; memcpy(res, ptr, sizeof(WordEntryIN)); } *************** *** 147,158 **** } ptr++; } if (res->entry.haspos) { res->poslen = uniquePos(res->pos, res->poslen); *outbuflen += res->poslen * sizeof(WordEntryPos); } ! *outbuflen += SHORTALIGN(res->entry.len); return res + 1 - a; } --- 154,171 ---- } ptr++; } + + /* add last item */ + if (res->entry.haspos) { + *outbuflen += SHORTALIGN(res->entry.len); + res->poslen = uniquePos(res->pos, res->poslen); *outbuflen += res->poslen * sizeof(WordEntryPos); } ! else ! *outbuflen += res->entry.len; return res + 1 - a; } *************** *** 367,372 **** --- 380,397 ---- PG_RETURN_CSTRING(outbuf); } + /* + * Binary Input / Output functions. The binary format is as follows: + * + * uint32 number of lexemes + * + * for each lexeme: + * lexeme text in client encoding, null-terminated + * uint16 number of positions + * for each position: + * uint16 WordEntryPos + */ + Datum tsvectorsend(PG_FUNCTION_ARGS) { *************** *** 381,398 **** pq_sendint(&buf, vec->size, sizeof(int32)); for (i = 0; i < vec->size; i++) { ! /* ! * We are sure that sizeof(WordEntry) == sizeof(int32) */ ! pq_sendint(&buf, *(int32 *) weptr, sizeof(int32)); ! pq_sendbytes(&buf, STRPTR(vec) + weptr->pos, weptr->len); ! 
if (weptr->haspos) { WordEntryPos *wepptr = POSDATAPTR(vec, weptr); ! pq_sendint(&buf, POSDATALEN(vec, weptr), sizeof(WordEntryPos)); ! for (j = 0; j < POSDATALEN(vec, weptr); j++) pq_sendint(&buf, wepptr[j], sizeof(WordEntryPos)); } weptr++; --- 406,427 ---- pq_sendint(&buf, vec->size, sizeof(int32)); for (i = 0; i < vec->size; i++) { ! uint16 npos; ! ! /* the strings in the TSVector array are not null-terminated, so ! * we have to send the null-terminator separately */ ! pq_sendtext(&buf, STRPTR(vec) + weptr->pos, weptr->len); ! pq_sendbyte(&buf, '\0'); ! ! npos = POSDATALEN(vec, weptr); ! pq_sendint(&buf, npos, sizeof(uint16)); ! if(npos > 0) { WordEntryPos *wepptr = POSDATAPTR(vec, weptr); ! for (j = 0; j < npos; j++) pq_sendint(&buf, wepptr[j], sizeof(WordEntryPos)); } weptr++; *************** *** 407,477 **** StringInfo buf = (StringInfo) PG_GETARG_POINTER(0); TSVector vec; int i; ! uint32 size; ! WordEntry *weptr; ! int datalen = 0; ! Size len; ! size = pq_getmsgint(buf, sizeof(uint32)); ! if (size < 0 || size > (MaxAllocSize / sizeof(WordEntry))) elog(ERROR, "invalid size of tsvector"); ! len = DATAHDRSIZE + sizeof(WordEntry) * size; ! len = len * 2; /* times two to make room for lexemes */ vec = (TSVector) palloc0(len); ! vec->size = size; ! weptr = ARRPTR(vec); ! for (i = 0; i < size; i++) { ! int32 tmp; ! weptr = ARRPTR(vec) + i; /* ! * We are sure that sizeof(WordEntry) == sizeof(int32) */ ! tmp = pq_getmsgint(buf, sizeof(int32)); ! *weptr = *(WordEntry *) & tmp; ! ! while (CALCDATASIZE(size, datalen + SHORTALIGN(weptr->len)) >= len) { len *= 2; vec = (TSVector) repalloc(vec, len); - weptr = ARRPTR(vec) + i; } ! memcpy(STRPTR(vec) + weptr->pos, ! pq_getmsgbytes(buf, weptr->len), ! weptr->len); ! datalen += SHORTALIGN(weptr->len); ! if (i > 0 && WordEntryCMP(weptr, weptr - 1, STRPTR(vec)) <= 0) elog(ERROR, "lexemes are unordered"); ! if (weptr->haspos) { ! uint16 j, ! npos; WordEntryPos *wepptr; ! npos = (uint16) pq_getmsgint(buf, sizeof(uint16)); ! 
if (npos > MAXNUMPOS) ! elog(ERROR, "unexpected number of positions"); ! ! while (CALCDATASIZE(size, datalen + (npos + 1) * sizeof(WordEntryPos)) >= len) { ! len *= 2; ! vec = (TSVector) repalloc(vec, len); ! weptr = ARRPTR(vec) + i; } ! memcpy(_POSDATAPTR(vec, weptr), &npos, sizeof(int16)); ! wepptr = POSDATAPTR(vec, weptr); for (j = 0; j < npos; j++) { ! wepptr[j] = (WordEntryPos) pq_getmsgint(buf, sizeof(int16)); if (j > 0 && WEP_GETPOS(wepptr[j]) <= WEP_GETPOS(wepptr[j - 1])) elog(ERROR, "position information is unordered"); } --- 436,527 ---- StringInfo buf = (StringInfo) PG_GETARG_POINTER(0); TSVector vec; int i; ! int32 nentries; ! int datalen; /* number of bytes used in the variable size area ! * after fixed size TSVector header and WordEntries ! */ ! Size hdrlen; ! Size len; /* allocated size of vec */ ! nentries = pq_getmsgint(buf, sizeof(int32)); ! if (nentries < 0 || nentries > (MaxAllocSize / sizeof(WordEntry))) elog(ERROR, "invalid size of tsvector"); ! hdrlen = DATAHDRSIZE + sizeof(WordEntry) * nentries; ! len = hdrlen * 2; /* times two to make room for lexemes */ vec = (TSVector) palloc0(len); ! vec->size = nentries; ! datalen = 0; ! for (i = 0; i < nentries; i++) { ! const char *lexeme; ! uint16 npos; ! size_t lex_len; ! ! lexeme = pq_getmsgstring(buf); ! npos = (uint16) pq_getmsgint(buf, sizeof(uint16)); ! ! /* sanity checks */ ! lex_len = strlen(lexeme); ! if (lex_len < 0 || lex_len > MAXSTRLEN) ! elog(ERROR, "invalid tsvector; lexeme too long"); ! ! if (datalen > MAXSTRPOS) ! elog(ERROR, "invalid tsvector; maximum total lexeme length exceeded"); ! ! if (npos > MAXNUMPOS) ! elog(ERROR, "unexpected number of positions"); /* ! * Looks valid. Fill the WordEntry struct, and copy lexeme. ! * ! * But make sure the buffer is large enough first. */ ! while (hdrlen + SHORTALIGN(datalen + lex_len) + ! (npos + 1) * sizeof(WordEntryPos) >= len) { len *= 2; vec = (TSVector) repalloc(vec, len); } ! vec->entries[i].haspos = (npos > 0) ? 1 : 0; ! 
vec->entries[i].len = lex_len; ! vec->entries[i].pos = datalen; ! memcpy(STRPTR(vec) + datalen, lexeme, lex_len); ! ! datalen += lex_len; ! ! if (i > 0 && WordEntryCMP(&vec->entries[i], &vec->entries[i - 1], STRPTR(vec)) <= 0) elog(ERROR, "lexemes are unordered"); ! /* Receive positions */ ! ! if (npos > 0) { ! uint16 j; WordEntryPos *wepptr; ! /* ! * Pad to 2-byte alignment if necessary. Though we used palloc0 ! * for the initial allocation, subsequent repalloc'd memory ! * areas are not initialized to zero. ! */ ! if (datalen != SHORTALIGN(datalen)) { ! *(STRPTR(vec) + datalen) = '\0'; ! datalen = SHORTALIGN(datalen); } ! memcpy(STRPTR(vec) + datalen, &npos, sizeof(uint16)); ! ! wepptr = POSDATAPTR(vec, &vec->entries[i]); for (j = 0; j < npos; j++) { ! wepptr[j] = (WordEntryPos) pq_getmsgint(buf, sizeof(WordEntryPos)); if (j > 0 && WEP_GETPOS(wepptr[j]) <= WEP_GETPOS(wepptr[j - 1])) elog(ERROR, "position information is unordered"); } *************** *** 480,486 **** } } ! SET_VARSIZE(vec, CALCDATASIZE(vec->size, datalen)); PG_RETURN_TSVECTOR(vec); } --- 530,536 ---- } } ! SET_VARSIZE(vec, hdrlen + datalen); PG_RETURN_TSVECTOR(vec); } *** ../pgsql.tsearch-2/src/backend/utils/adt/tsvector_op.c 2007-08-31 21:20:00.000000000 +0100 --- ./src/backend/utils/adt/tsvector_op.c 2007-09-06 23:47:32.000000000 +0100 *************** *** 165,171 **** char *cur; for (i = 0; i < in->size; i++) ! len += SHORTALIGN(arrin[i].len); len = CALCDATASIZE(in->size, len); out = (TSVector) palloc0(len); --- 165,171 ---- char *cur; for (i = 0; i < in->size; i++) ! len += arrin[i].len; len = CALCDATASIZE(in->size, len); out = (TSVector) palloc0(len); *************** *** 179,185 **** arrout[i].haspos = 0; arrout[i].len = arrin[i].len; arrout[i].pos = cur - STRPTR(out); ! cur += SHORTALIGN(arrout[i].len); } PG_FREE_IF_COPY(in, 0); --- 179,185 ---- arrout[i].haspos = 0; arrout[i].len = arrin[i].len; arrout[i].pos = cur - STRPTR(out); ! 
cur += arrout[i].len; } PG_FREE_IF_COPY(in, 0); *************** *** 351,362 **** ptr->len = ptr1->len; memcpy(cur, data1 + ptr1->pos, ptr1->len); ptr->pos = cur - data; - cur += SHORTALIGN(ptr1->len); if (ptr->haspos) { memcpy(cur, _POSDATAPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16)); cur += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16); } ptr++; ptr1++; i1--; --- 351,365 ---- ptr->len = ptr1->len; memcpy(cur, data1 + ptr1->pos, ptr1->len); ptr->pos = cur - data; if (ptr->haspos) { + cur += SHORTALIGN(ptr1->len); memcpy(cur, _POSDATAPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16)); cur += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16); } + else + cur += ptr1->len; + ptr++; ptr1++; i1--; *************** *** 367,382 **** ptr->len = ptr2->len; memcpy(cur, data2 + ptr2->pos, ptr2->len); ptr->pos = cur - data; - cur += SHORTALIGN(ptr2->len); if (ptr->haspos) { int addlen = add_pos(in2, ptr2, out, ptr, maxpos); if (addlen == 0) ptr->haspos = 0; else cur += addlen * sizeof(WordEntryPos) + sizeof(uint16); } ptr++; ptr2++; i2--; --- 370,389 ---- ptr->len = ptr2->len; memcpy(cur, data2 + ptr2->pos, ptr2->len); ptr->pos = cur - data; if (ptr->haspos) { int addlen = add_pos(in2, ptr2, out, ptr, maxpos); + cur += SHORTALIGN(ptr2->len); + if (addlen == 0) ptr->haspos = 0; else cur += addlen * sizeof(WordEntryPos) + sizeof(uint16); } + else + cur += ptr2->len; + ptr++; ptr2++; i2--; *************** *** 387,395 **** ptr->len = ptr1->len; memcpy(cur, data1 + ptr1->pos, ptr1->len); ptr->pos = cur - data; - cur += SHORTALIGN(ptr1->len); if (ptr->haspos) { if (ptr1->haspos) { memcpy(cur, _POSDATAPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16)); --- 394,402 ---- ptr->len = ptr1->len; memcpy(cur, data1 + ptr1->pos, ptr1->len); ptr->pos = cur - data; if (ptr->haspos) { + cur += SHORTALIGN(ptr1->len); if (ptr1->haspos) { memcpy(cur, _POSDATAPTR(in1, ptr1), 
POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16)); *************** *** 407,412 **** --- 414,422 ---- cur += addlen * sizeof(WordEntryPos) + sizeof(uint16); } } + else + cur += ptr1->len; + ptr++; ptr1++; ptr2++; *************** *** 421,432 **** ptr->len = ptr1->len; memcpy(cur, data1 + ptr1->pos, ptr1->len); ptr->pos = cur - data; - cur += SHORTALIGN(ptr1->len); if (ptr->haspos) { memcpy(cur, _POSDATAPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16)); cur += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16); } ptr++; ptr1++; i1--; --- 431,445 ---- ptr->len = ptr1->len; memcpy(cur, data1 + ptr1->pos, ptr1->len); ptr->pos = cur - data; if (ptr->haspos) { + cur += SHORTALIGN(ptr1->len); memcpy(cur, _POSDATAPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16)); cur += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16); } + else + cur += ptr1->len; + ptr++; ptr1++; i1--; *************** *** 438,453 **** ptr->len = ptr2->len; memcpy(cur, data2 + ptr2->pos, ptr2->len); ptr->pos = cur - data; - cur += SHORTALIGN(ptr2->len); if (ptr->haspos) { int addlen = add_pos(in2, ptr2, out, ptr, maxpos); if (addlen == 0) ptr->haspos = 0; else cur += addlen * sizeof(WordEntryPos) + sizeof(uint16); } ptr++; ptr2++; i2--; --- 451,470 ---- ptr->len = ptr2->len; memcpy(cur, data2 + ptr2->pos, ptr2->len); ptr->pos = cur - data; if (ptr->haspos) { int addlen = add_pos(in2, ptr2, out, ptr, maxpos); + cur += SHORTALIGN(ptr2->len); + if (addlen == 0) ptr->haspos = 0; else cur += addlen * sizeof(WordEntryPos) + sizeof(uint16); } + else + cur += ptr2->len; + ptr++; ptr2++; i2--; *************** *** 484,491 **** static bool checkclass_str(CHKVAL * chkval, WordEntry * val, QueryOperand * item) { ! WordEntryPos *ptr = (WordEntryPos *) (chkval->values + val->pos + SHORTALIGN(val->len) + sizeof(uint16)); ! 
uint16 len = *((uint16 *) (chkval->values + val->pos + SHORTALIGN(val->len))); while (len--) { --- 501,508 ---- static bool checkclass_str(CHKVAL * chkval, WordEntry * val, QueryOperand * item) { ! WordEntryPos *ptr = (WordEntryPos *) (chkval->values + SHORTALIGN(val->pos + val->len) + sizeof(uint16)); ! uint16 len = *((uint16 *) (chkval->values + SHORTALIGN(val->pos + val->len))); while (len--) { *** ../pgsql.tsearch-2/src/include/tsearch/ts_type.h 2007-09-05 12:17:02.000000000 +0100 --- ./src/include/tsearch/ts_type.h 2007-09-07 09:20:43.000000000 +0100 *************** *** 62,87 **** * bytes from end of WordEntry array to start of * corresponding lexeme. * 4) Lexeme's storage: ! * SHORTALIGNED(lexeme) and position information if it exists ! * Position information: first int2 - is a number of positions and it ! * follows array of WordEntryPos */ typedef struct { int32 vl_len_; /* varlena header (do not touch directly!) */ ! uint32 size; ! char data[1]; } TSVectorData; typedef TSVectorData *TSVector; ! #define DATAHDRSIZE (VARHDRSZ + sizeof(int4)) ! #define CALCDATASIZE(x, lenstr) ( (x) * sizeof(WordEntry) + DATAHDRSIZE + (lenstr) ) ! #define ARRPTR(x) ( (WordEntry*) ( (char*)(x) + DATAHDRSIZE ) ) ! #define STRPTR(x) ( (char*)(x) + DATAHDRSIZE + ( sizeof(WordEntry) * ((TSVector)(x))->size ) ) ! #define STRSIZE(x) ( ((TSVector)(x))->len - DATAHDRSIZE - ( sizeof(WordEntry) * ((TSVector)(x))->size ) ) ! #define _POSDATAPTR(x,e) (STRPTR(x)+((WordEntry*)(e))->pos+SHORTALIGN(((WordEntry*)(e))->len)) #define POSDATALEN(x,e) ( ( ((WordEntry*)(e))->haspos ) ? (*(uint16*)_POSDATAPTR(x,e)) : 0 ) #define POSDATAPTR(x,e) ( (WordEntryPos*)( _POSDATAPTR(x,e)+sizeof(uint16) ) ) --- 62,94 ---- * bytes from end of WordEntry array to start of * corresponding lexeme. * 4) Lexeme's storage: ! * lexeme (without null-terminator) ! * if haspos is true: ! * padding byte if necessary to make the number of positions 2-byte aligned ! * uint16 number of positions that follow. ! 
* uint16[] positions ! * ! * The positions must be sorted. */ typedef struct { int32 vl_len_; /* varlena header (do not touch directly!) */ ! int32 size; ! WordEntry entries[1]; /* var size */ ! /* lexemes follow */ } TSVectorData; typedef TSVectorData *TSVector; ! #define DATAHDRSIZE (offsetof(TSVectorData, entries)) ! #define CALCDATASIZE(x, lenstr) (DATAHDRSIZE + (x) * sizeof(WordEntry) + (lenstr) ) ! #define ARRPTR(x) ( (x)->entries ) ! ! /* returns a pointer to the beginning of lexemes */ ! #define STRPTR(x) ( (char *) &(x)->entries[x->size] ) ! ! #define _POSDATAPTR(x,e) (STRPTR(x) + SHORTALIGN((e)->pos + (e)->len)) #define POSDATALEN(x,e) ( ( ((WordEntry*)(e))->haspos ) ? (*(uint16*)_POSDATAPTR(x,e)) : 0 ) #define POSDATAPTR(x,e) ( (WordEntryPos*)( _POSDATAPTR(x,e)+sizeof(uint16) ) ) *************** *** 166,172 **** * C: 1<<1 * D: 1<<0 */ ! int8 weight; int32 valcrc; /* XXX: pg_crc32 would be a more appropriate data type, but we use comparisons to signedintegers in the code. They would need to be changed as well. */ /* pointer to text value of operand, must correlate with WordEntry */ --- 173,179 ---- * C: 1<<1 * D: 1<<0 */ ! uint8 weight; int32 valcrc; /* XXX: pg_crc32 would be a more appropriate data type, but we use comparisons to signedintegers in the code. They would need to be changed as well. */ /* pointer to text value of operand, must correlate with WordEntry */
pgsql-patches by date: