tsvector extraction patch - Mailing list pgsql-hackers
From | Hans-Juergen Schoenig -- PostgreSQL |
---|---|
Subject | tsvector extraction patch |
Date | |
Msg-id | 4A4CC071.2080300@cybertec.at Whole thread Raw |
Responses |
Re: tsvector extraction patch
Re: tsvector extraction patch |
List | pgsql-hackers |
hello,

i made a small patch which i found useful for my personal tasks.
it would be nice to see this in 8.5. if not core then maybe contrib.
it transforms a tsvector to table format which is really nice for text processing and comparison.

test=# SELECT * FROM tsvcontent(to_tsvector('english', 'i am pretty sure this is a good patch'));
  lex   | rank
--------+------
 good   |    8
 patch  |    9
 pretti |    3
 sure   |    4
(4 rows)

many thanks,

    hans

--
Cybertec Schoenig & Schoenig GmbH
Reyergasse 9 / 2
A-2700 Wiener Neustadt
Web: www.postgresql-support.de

diff -dcrpN postgresql-8.4.0.old/contrib/Makefile postgresql-8.4.0/contrib/Makefile
*** postgresql-8.4.0.old/contrib/Makefile	2009-03-26 00:20:01.000000000 +0100
--- postgresql-8.4.0/contrib/Makefile	2009-06-29 11:03:04.000000000 +0200
*************** WANTED_DIRS = \
*** 39,44 ****
--- 39,45 ----
  		tablefunc	\
  		test_parser	\
  		tsearch2	\
+ 		tsvcontent	\
  		vacuumlo
  
  ifeq ($(with_openssl),yes)
diff -dcrpN postgresql-8.4.0.old/contrib/tsvcontent/Makefile postgresql-8.4.0/contrib/tsvcontent/Makefile
*** postgresql-8.4.0.old/contrib/tsvcontent/Makefile	1970-01-01 01:00:00.000000000 +0100
--- postgresql-8.4.0/contrib/tsvcontent/Makefile	2009-06-29 11:20:21.000000000 +0200
***************
*** 0 ****
--- 1,19 ----
+ # $PostgreSQL: pgsql/contrib/tablefunc/Makefile,v 1.9 2007/11/10 23:59:51 momjian Exp $
+
+ MODULES = tsvcontent
+ DATA_built = tsvcontent.sql
+ DATA = uninstall_tsvcontent.sql
+
+
+ SHLIB_LINK += $(filter -lm, $(LIBS))
+
+ ifdef USE_PGXS
+ PG_CONFIG = pg_config
+ PGXS := $(shell $(PG_CONFIG) --pgxs)
+ include $(PGXS)
+ else
+ subdir = contrib/tsvcontent
+ top_builddir = ../..
+ include $(top_builddir)/src/Makefile.global
+ include $(top_srcdir)/contrib/contrib-global.mk
+ endif
diff -dcrpN postgresql-8.4.0.old/contrib/tsvcontent/tsvcontent.c postgresql-8.4.0/contrib/tsvcontent/tsvcontent.c
*** postgresql-8.4.0.old/contrib/tsvcontent/tsvcontent.c	1970-01-01 01:00:00.000000000 +0100
--- postgresql-8.4.0/contrib/tsvcontent/tsvcontent.c	2009-06-29 11:18:35.000000000 +0200
***************
*** 0 ****
--- 1,174 ----
+ #include "postgres.h"
+
+ #include "fmgr.h"
+ #include "funcapi.h"
+ #include "miscadmin.h"
+ #include "executor/spi.h"
+ #include "lib/stringinfo.h"
+ #include "nodes/nodes.h"
+ #include "utils/builtins.h"
+ #include "utils/lsyscache.h"
+ #include "utils/syscache.h"
+ #include "utils/memutils.h"
+ #include "tsearch/ts_type.h"
+ #include "tsearch/ts_utils.h"
+ #include "catalog/pg_type.h"
+
+ #include "tsvcontent.h"
+
+ PG_MODULE_MAGIC;
+
+ PG_FUNCTION_INFO_V1(tsvcontent);
+
+ /*
+  * tsvcontent(vec tsvector) - expand a tsvector into a set of rows.
+  *
+  * Returns one (lexeme, position) row per position stored for each lexeme;
+  * a lexeme stored without positions yields a single row with position 0.
+  * Weight labels attached to positions are not reported.
+  *
+  * All rows are prepared on the first call, in the multi-call memory
+  * context; every later call just hands out the next prepared row.
+  */
+ Datum
+ tsvcontent(PG_FUNCTION_ARGS)
+ {
+ 	FuncCallContext *funcctx;
+ 	ts_to_txt_fctx *fctx;
+
+ 	/* stuff done only on the first call of the function */
+ 	if (SRF_IS_FIRSTCALL())
+ 	{
+ 		TSVector	in;
+ 		WordEntry  *entries;
+ 		char	   *lexemes;
+ 		TupleDesc	tupdesc;
+ 		MemoryContext oldcontext;
+ 		int			i,
+ 					j;
+
+ 		/* create a function context for cross-call persistence */
+ 		funcctx = SRF_FIRSTCALL_INIT();
+
+ 		/*
+ 		 * Detoast the argument only here, in the short-lived calling
+ 		 * context: everything needed later is copied out of it below, so
+ 		 * there is no point in fetching it again on every call.
+ 		 */
+ 		in = PG_GETARG_TSVECTOR(0);
+ 		entries = ARRPTR(in);
+ 		lexemes = STRPTR(in);
+
+ 		/* switch to memory context appropriate for multiple function calls */
+ 		oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
+
+ 		switch (get_call_result_type(fcinfo, NULL, &tupdesc))
+ 		{
+ 			case TYPEFUNC_COMPOSITE:
+ 				/* success */
+ 				break;
+ 			case TYPEFUNC_RECORD:
+ 				/* failed to determine actual type of RECORD */
+ 				ereport(ERROR,
+ 						(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ 						 errmsg("function returning record called in context "
+ 								"that cannot accept type record")));
+ 				break;
+ 			default:
+ 				/* result type isn't composite */
+ 				elog(ERROR, "return type must be a row type");
+ 				break;
+ 		}
+
+ 		/*
+ 		 * Keep a persistent copy of the tupdesc and bless it, so that the
+ 		 * tuples built from it later can be returned as Datums.
+ 		 */
+ 		funcctx->tuple_desc = BlessTupleDesc(CreateTupleDescCopy(tupdesc));
+
+ 		/* allocate memory */
+ 		fctx = (ts_to_txt_fctx *) palloc(sizeof(ts_to_txt_fctx));
+
+ 		/* count the rows: one per position, or one for a bare lexeme */
+ 		fctx->n_tsvt = 0;
+ 		for (i = 0; i < in->size; i++)
+ 		{
+ 			if (entries[i].haspos)
+ 				fctx->n_tsvt += POSDATALEN(in, &entries[i]);
+ 			else
+ 				fctx->n_tsvt++;
+ 		}
+
+ 		fctx->tsvt = (tsvec_tuple *) palloc(fctx->n_tsvt * sizeof(tsvec_tuple));
+
+ 		for (i = 0, j = 0; i < in->size; i++)
+ 		{
+ 			WordEntry  *we = &entries[i];
+ 			char	   *txt;
+
+ 			/* one NUL-terminated copy per lexeme, shared by all its rows */
+ 			txt = (char *) palloc(we->len + 1);
+ 			memcpy(txt, lexemes + we->pos, we->len);
+ 			txt[we->len] = '\0';
+
+ 			if (we->haspos)
+ 			{
+ 				/*
+ 				 * The position vector starts at SHORTALIGN(pos + len) in
+ 				 * the lexeme area, which is what POSDATALEN/POSDATAPTR
+ 				 * compute.  Aligning the length alone misses it by one
+ 				 * byte whenever the lexeme starts at an odd offset.
+ 				 */
+ 				int			npos = POSDATALEN(in, we);
+ 				WordEntryPos *posv = POSDATAPTR(in, we);
+ 				int			o;
+
+ 				for (o = 0; o < npos; o++)
+ 				{
+ 					fctx->tsvt[j].txt = txt;
+ 					/* strip the weight bits stored next to the position */
+ 					fctx->tsvt[j].pos = WEP_GETPOS(posv[o]);
+ 					j++;
+ 				}
+ 			}
+ 			else
+ 			{
+ 				fctx->tsvt[j].txt = txt;
+ 				fctx->tsvt[j].pos = 0;
+ 				j++;
+ 			}
+ 		}
+
+ 		/* total number of tuples to be returned */
+ 		funcctx->max_calls = fctx->n_tsvt;
+ 		funcctx->user_fctx = fctx;
+
+ 		MemoryContextSwitchTo(oldcontext);
+ 	}
+
+ 	funcctx = SRF_PERCALL_SETUP();
+ 	fctx = (ts_to_txt_fctx *) funcctx->user_fctx;
+
+ 	/* are there any records inside the tsvector left? */
+ 	if (funcctx->call_cntr < funcctx->max_calls)
+ 	{
+ 		tsvec_tuple *row = &fctx->tsvt[funcctx->call_cntr];
+ 		Datum		result[2];
+ 		bool		isnull[2] = {false, false};
+ 		HeapTuple	tuple;
+
+ 		result[0] = CStringGetTextDatum(row->txt);
+ 		result[1] = Int32GetDatum(row->pos);
+
+ 		tuple = heap_form_tuple(funcctx->tuple_desc, result, isnull);
+
+ 		/* send the result */
+ 		SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple));
+ 	}
+ 	else
+ 	{
+ 		/* do when there is no more left */
+ 		SRF_RETURN_DONE(funcctx);
+ 	}
+ }
+
diff -dcrpN postgresql-8.4.0.old/contrib/tsvcontent/tsvcontent.h postgresql-8.4.0/contrib/tsvcontent/tsvcontent.h
*** postgresql-8.4.0.old/contrib/tsvcontent/tsvcontent.h	1970-01-01 01:00:00.000000000 +0100
--- postgresql-8.4.0/contrib/tsvcontent/tsvcontent.h	2009-06-29 11:18:13.000000000 +0200
***************
*** 0 ****
--- 1,22 ----
+ #ifndef TSVCONTENT_H
+ #define TSVCONTENT_H
+
+ #include "fmgr.h"
+
+ /* one output row: a lexeme and one of its positions (0 if none stored) */
+ typedef struct
+ {
+ 	char	   *txt;			/* NUL-terminated lexeme; may be shared by rows */
+ 	int			pos;
+ } tsvec_tuple;
+
+ /* cross-call state of tsvcontent(): the rows prepared on the first call */
+ typedef struct
+ {
+ 	int			n_tsvt;			/* number of entries in tsvt */
+ 	tsvec_tuple *tsvt;
+ } ts_to_txt_fctx;
+
+ extern Datum tsvcontent(PG_FUNCTION_ARGS);
+
+ #endif   /* TSVCONTENT_H */
diff -dcrpN postgresql-8.4.0.old/contrib/tsvcontent/tsvcontent.sql.in postgresql-8.4.0/contrib/tsvcontent/tsvcontent.sql.in
*** postgresql-8.4.0.old/contrib/tsvcontent/tsvcontent.sql.in	1970-01-01 01:00:00.000000000 +0100
--- postgresql-8.4.0/contrib/tsvcontent/tsvcontent.sql.in	2009-06-29 11:19:04.000000000 +0200
***************
*** 0 ****
--- 1,6 ----
+ CREATE TYPE tsvcontent AS (lex text, rank integer);
+
+ -- List words in "tsvector format" and their occurrences found in a tsvector.
+ CREATE OR REPLACE FUNCTION tsvcontent(vec tsvector) RETURNS SETOF tsvcontent
+ AS '$libdir/tsvcontent', 'tsvcontent'
+ LANGUAGE C STRICT;
pgsql-hackers by date: