1 files changed, 887 insertions, 0 deletions
diff --git a/src/backend/tsearch/dict_thesaurus.c b/src/backend/tsearch/dict_thesaurus.c
new file mode 100644
index 0000000000..8c544ad4f8
--- /dev/null
+++ b/src/backend/tsearch/dict_thesaurus.c
@@ -0,0 +1,887 @@
+/*-------------------------------------------------------------------------
+ *
+ * dict_thesaurus.c
+ *		Thesaurus dictionary: phrase to phrase substitution
+ *
+ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ *	  $PostgreSQL: pgsql/src/backend/tsearch/dict_thesaurus.c,v 1.1 2007/08/21 01:11:18 tgl Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "catalog/namespace.h"
+#include "storage/fd.h"
+#include "tsearch/ts_cache.h"
+#include "tsearch/ts_locale.h"
+#include "tsearch/ts_public.h"
+#include "tsearch/ts_utils.h"
+#include "utils/builtins.h"
+
+
+/*
+ * Temporarily, we use TSLexeme.flags for internal use...
+ */
+#define DT_USEASIS		0x1000
+/*
+ * One occurrence of a sample lexeme inside a thesaurus rule.
+ *
+ * Occurrences of the same lexeme are chained through nextentry when the
+ * dictionary is compiled.  nextvariant chains the rules that are still
+ * candidates while a phrase is being matched.
+ */
+typedef struct LexemeInfo
+{
+	uint16		idsubst;		/* entry's number in DictThesaurus->subst */
+	uint16		posinsubst;		/* pos info in entry */
+	uint16		tnvariant;		/* total num lexemes in one variant */
+	struct LexemeInfo *nextentry;	/* next occurrence of the same lexeme */
+	struct LexemeInfo *nextvariant; /* next rule in the list of candidates */
+} LexemeInfo;
+/*
+ * A lexeme and the places where it occurs in the sample part of the rules.
+ */
+typedef struct
+{
+	char	   *lexeme;			/* palloc'd text; NULL stands for a stop word
+								 * once the dictionary is compiled */
+	LexemeInfo *entries;		/* occurrences, chained through nextentry */
+} TheLexeme;
+/*
+ * The substitution (right-hand side) of one thesaurus rule.
+ */
+typedef struct
+{
+	uint16		lastlexeme;		/* position of the last sample lexeme, i.e.
+								 * number of lexemes to substitute minus 1 */
+	uint16		reslen;			/* number of entries in res, set when the
+								 * substitution is compiled */
+	TSLexeme   *res;			/* prepared substituted result */
+} TheSubstitute;
+/*
+ * Private data of one thesaurus dictionary.
+ */
+typedef struct
+{
+	/* subdictionary to normalize lexemes */
+	Oid			subdictOid;
+	TSDictionaryCacheEntry *subdict;
+
+	/* Array to search lexeme by exact match */
+	TheLexeme  *wrds;
+	int			nwrds;			/* number of used elements in wrds */
+	int			ntwrds;			/* number of allocated elements in wrds */
+
+	/*
+	 * Storage of substituted result, n-th element is for n-th expression
+	 */
+	TheSubstitute *subst;
+	int			nsubst;			/* allocated size while the file is being
+								 * read, number of rules afterwards */
+} DictThesaurus;
+
+
+/*
+ * newLexeme
+ *		Append the sample word [b, e) to d->wrds.
+ *
+ * The word is stored as a NUL-terminated copy together with a single
+ * LexemeInfo recording the rule number (idsubst) and the position of the
+ * word within the rule's sample phrase (posinsubst).  d->wrds is created on
+ * first use and doubled whenever it is full.
+ */
+static void
+newLexeme(DictThesaurus * d, char *b, char *e, uint16 idsubst, uint16 posinsubst)
+{
+	int			wordlen = e - b;
+	TheLexeme  *slot;
+	LexemeInfo *occurrence;
+
+	/* make room for one more element */
+	if (d->nwrds >= d->ntwrds)
+	{
+		if (d->ntwrds != 0)
+		{
+			d->ntwrds *= 2;
+			d->wrds = (TheLexeme *) repalloc(d->wrds, sizeof(TheLexeme) * d->ntwrds);
+		}
+		else
+		{
+			d->ntwrds = 16;
+			d->wrds = (TheLexeme *) palloc(sizeof(TheLexeme) * d->ntwrds);
+		}
+	}
+
+	slot = &d->wrds[d->nwrds++];
+
+	/* private copy of the word text */
+	slot->lexeme = palloc(wordlen + 1);
+	memcpy(slot->lexeme, b, wordlen);
+	slot->lexeme[wordlen] = '\0';
+
+	/* the word's only occurrence so far */
+	occurrence = (LexemeInfo *) palloc(sizeof(LexemeInfo));
+	occurrence->nextentry = NULL;
+	occurrence->idsubst = idsubst;
+	occurrence->posinsubst = posinsubst;
+	slot->entries = occurrence;
+}
+/*
+ * addWrd
+ *		Append the word [b, e) to the substitution of rule number idsubst.
+ *
+ * nwrd is the index of the word within the substitution; nwrd == 0 starts a
+ * new rule, which resets the static fill counters and enlarges d->subst if
+ * needed.  posinsubst is the number of sample lexemes read for the rule and
+ * useasis requests that the word be flagged with DT_USEASIS.
+ *
+ * Because the fill counters are static, the words of one rule must be added
+ * by consecutive calls starting with nwrd == 0.
+ */
+static void
+addWrd(DictThesaurus * d, char *b, char *e, uint16 idsubst, uint16 nwrd, uint16 posinsubst, bool useasis)
+{
+	static int	nres = 0;		/* words stored so far for the current rule */
+	static int	ntres = 0;		/* allocated size of the current res array */
+	TheSubstitute *ptr;
+
+	if (nwrd == 0)
+	{
+		nres = ntres = 0;
+		/* first word of a rule: make sure d->subst has a slot for it */
+		if (idsubst >= d->nsubst)
+		{
+			if (d->nsubst == 0)
+			{
+				d->nsubst = 16;
+				d->subst = (TheSubstitute *) palloc(sizeof(TheSubstitute) * d->nsubst);
+			}
+			else
+			{
+				d->nsubst *= 2;
+				d->subst = (TheSubstitute *) repalloc(d->subst, sizeof(TheSubstitute) * d->nsubst);
+			}
+		}
+	}
+
+	ptr = d->subst + idsubst;
+
+	ptr->lastlexeme = posinsubst - 1;
+	/* need room for the new word plus the terminating entry */
+	if (nres + 1 >= ntres)
+	{
+		if (ntres == 0)
+		{
+			ntres = 2;
+			ptr->res = (TSLexeme *) palloc(sizeof(TSLexeme) * ntres);
+		}
+		else
+		{
+			ntres *= 2;
+			ptr->res = (TSLexeme *) repalloc(ptr->res, sizeof(TSLexeme) * ntres);
+		}
+
+	}
+	/* store a NUL-terminated copy of the word */
+	ptr->res[nres].lexeme = palloc(e - b + 1);
+	memcpy(ptr->res[nres].lexeme, b, e - b);
+	ptr->res[nres].lexeme[e - b] = '\0';
+
+	ptr->res[nres].nvariant = nwrd;
+	if (useasis)
+		ptr->res[nres].flags = DT_USEASIS;
+	else
+		ptr->res[nres].flags = 0;
+	/* keep the array terminated by an entry with a NULL lexeme */
+	ptr->res[++nres].lexeme = NULL;
+}
+/* States of the line parser in thesaurusRead() */
+#define TR_WAITLEX	1			/* before a sample word or the ':' delimiter */
+#define TR_INLEX	2			/* inside a sample word */
+#define TR_WAITSUBS 3			/* before a substitution word */
+#define TR_INSUBS	4			/* inside a substitution word */
+
+/*
+ * thesaurusRead
+ *		Load the thesaurus file designated by filename into d.
+ *
+ * The name is resolved by get_tsearch_config_filename() with the "ths"
+ * extension.  Every line that is neither empty nor a comment (first
+ * non-blank character is '#') must have the form
+ *
+ *		sample word(s) : substitution word(s)
+ *
+ * Sample words are appended to d->wrds through newLexeme(), substitution
+ * words to d->subst through addWrd().  A substitution word prefixed with
+ * '*' is flagged to be used as is; a leading '\' is dropped from the word.
+ * On return d->nsubst is the number of rules read.
+ *
+ * An unreadable file or a malformed line is reported with ereport(ERROR).
+ */
+static void
+thesaurusRead(char *filename, DictThesaurus * d)
+{
+	FILE	   *fh;
+	char		str[BUFSIZ];
+	int			lineno = 0;
+	uint16		idsubst = 0;
+	bool		useasis = false;
+
+	filename = get_tsearch_config_filename(filename, "ths");
+	fh = AllocateFile(filename, "r");
+	if (!fh)
+		ereport(ERROR,
+				(errcode(ERRCODE_CONFIG_FILE_ERROR),
+				 errmsg("could not open thesaurus file \"%s\": %m",
+						filename)));
+
+	while (fgets(str, sizeof(str), fh))
+	{
+		char	   *ptr,
+				   *recoded;
+		int			state = TR_WAITLEX;
+		char	   *beginwrd = NULL;
+		uint16		posinsubst = 0;
+		uint16		nwrd = 0;
+
+		/*
+		 * The file is taken to be UTF-8, while everything below (t_isspace,
+		 * pg_mblen, the subdictionary) works on text in the database
+		 * encoding.  So the conversion must go from UTF-8 to the database
+		 * encoding; pg_do_encoding_conversion() takes the source encoding
+		 * first.  The result is str itself when no conversion is needed.
+		 */
+		ptr = recoded = (char *) pg_do_encoding_conversion((unsigned char *) str, strlen(str),
+											 PG_UTF8, GetDatabaseEncoding());
+		if (recoded == NULL)
+			elog(ERROR, "encoding conversion failed");
+
+		lineno++;
+
+		/*
+		 * Skip leading whitespace, then ignore empty lines and comments.  The
+		 * test has to look at the first non-blank character (ptr), not at the
+		 * start of the line, and the converted copy must not be leaked.
+		 */
+		while (*ptr && t_isspace(ptr))
+			ptr += pg_mblen(ptr);
+		if (*ptr == '\0' || t_iseq(ptr, '#') || t_iseq(ptr, '\n') || t_iseq(ptr, '\r'))
+		{
+			if (recoded != str)
+				pfree(recoded);
+			continue;
+		}
+
+		while (*ptr)
+		{
+			if (state == TR_WAITLEX)
+			{
+				if (t_iseq(ptr, ':'))
+				{
+					/* a rule needs at least one sample word */
+					if (posinsubst == 0)
+					{
+						FreeFile(fh);
+						ereport(ERROR,
+								(errcode(ERRCODE_CONFIG_FILE_ERROR),
+								 errmsg("unexpected delimiter at line %d of thesaurus file \"%s\"",
+										lineno, filename)));
+					}
+					state = TR_WAITSUBS;
+				}
+				else if (!t_isspace(ptr))
+				{
+					beginwrd = ptr;
+					state = TR_INLEX;
+				}
+			}
+			else if (state == TR_INLEX)
+			{
+				/* a sample word ends at the delimiter or at whitespace */
+				if (t_iseq(ptr, ':'))
+				{
+					newLexeme(d, beginwrd, ptr, idsubst, posinsubst++);
+					state = TR_WAITSUBS;
+				}
+				else if (t_isspace(ptr))
+				{
+					newLexeme(d, beginwrd, ptr, idsubst, posinsubst++);
+					state = TR_WAITLEX;
+				}
+			}
+			else if (state == TR_WAITSUBS)
+			{
+				if (t_iseq(ptr, '*'))
+				{
+					/* '*' prefix: the word is used as is */
+					useasis = true;
+					state = TR_INSUBS;
+					beginwrd = ptr + pg_mblen(ptr);
+				}
+				else if (t_iseq(ptr, '\\'))
+				{
+					/* '\' prefix: the word starts after the backslash */
+					useasis = false;
+					state = TR_INSUBS;
+					beginwrd = ptr + pg_mblen(ptr);
+				}
+				else if (!t_isspace(ptr))
+				{
+					useasis = false;
+					beginwrd = ptr;
+					state = TR_INSUBS;
+				}
+			}
+			else if (state == TR_INSUBS)
+			{
+				if (t_isspace(ptr))
+				{
+					/* a prefix character must be followed by a word */
+					if (ptr == beginwrd)
+						ereport(ERROR,
+								(errcode(ERRCODE_CONFIG_FILE_ERROR),
+								 errmsg("unexpected end of line or lexeme at line %d of thesaurus file \"%s\"",
+										lineno, filename)));
+					addWrd(d, beginwrd, ptr, idsubst, nwrd++, posinsubst, useasis);
+					state = TR_WAITSUBS;
+				}
+			}
+			else
+				elog(ERROR, "unrecognized thesaurus state: %d", state);
+
+			ptr += pg_mblen(ptr);
+		}
+
+		/* the last substitution word may end at the end of the buffer */
+		if (state == TR_INSUBS)
+		{
+			if (ptr == beginwrd)
+				ereport(ERROR,
+						(errcode(ERRCODE_CONFIG_FILE_ERROR),
+						 errmsg("unexpected end of line or lexeme at line %d of thesaurus file \"%s\"",
+								lineno, filename)));
+			addWrd(d, beginwrd, ptr, idsubst, nwrd++, posinsubst, useasis);
+		}
+
+		idsubst++;
+
+		/* a rule needs both a sample and a substitution */
+		if (!(nwrd && posinsubst))
+		{
+			FreeFile(fh);
+			ereport(ERROR,
+					(errcode(ERRCODE_CONFIG_FILE_ERROR),
+					 errmsg("unexpected end of line at line %d of thesaurus file \"%s\"",
+							lineno, filename)));
+		}
+
+		if (recoded != str)
+			pfree(recoded);
+	}
+
+	/* from here on nsubst is the number of rules, not the allocated size */
+	d->nsubst = idsubst;
+
+	FreeFile(fh);
+}
+
+/*
+ * addCompiledLexeme
+ *		Append one normalized lexeme to the array newwrds and return the
+ *		array, which may have been moved by repalloc.
+ *
+ * *nnw is the number of used elements and *tnm the allocated size; both are
+ * updated as needed.  When lexeme (or its text) is NULL, the new element
+ * gets a NULL text and a variant size of 1; otherwise the text is copied and
+ * tnvariant is recorded.  idsubst and posinsubst are taken from src.
+ */
+static TheLexeme *
+addCompiledLexeme(TheLexeme * newwrds, int *nnw, int *tnm, TSLexeme * lexeme, LexemeInfo * src, uint16 tnvariant)
+{
+	bool		hastext = (lexeme != NULL && lexeme->lexeme != NULL);
+	TheLexeme  *slot;
+	LexemeInfo *occurrence;
+
+	/* double the array when it is full */
+	if (*nnw >= *tnm)
+	{
+		*tnm *= 2;
+		newwrds = (TheLexeme *) repalloc(newwrds, sizeof(TheLexeme) * *tnm);
+	}
+
+	slot = &newwrds[*nnw];
+	occurrence = (LexemeInfo *) palloc(sizeof(LexemeInfo));
+
+	slot->lexeme = hastext ? pstrdup(lexeme->lexeme) : NULL;
+
+	occurrence->tnvariant = hastext ? tnvariant : 1;
+	occurrence->idsubst = src->idsubst;
+	occurrence->posinsubst = src->posinsubst;
+	occurrence->nextentry = NULL;
+	slot->entries = occurrence;
+
+	(*nnw)++;
+	return newwrds;
+}
+
+/*
+ * cmpLexemeInfo
+ *		Three-way comparison of two occurrences by idsubst, then posinsubst,
+ *		then tnvariant.  Returns 0 if either argument is NULL.
+ */
+static int
+cmpLexemeInfo(LexemeInfo * a, LexemeInfo * b)
+{
+	if (a == NULL || b == NULL)
+		return 0;
+
+	if (a->idsubst != b->idsubst)
+		return (a->idsubst > b->idsubst) ? 1 : -1;
+
+	if (a->posinsubst != b->posinsubst)
+		return (a->posinsubst > b->posinsubst) ? 1 : -1;
+
+	if (a->tnvariant != b->tnvariant)
+		return (a->tnvariant > b->tnvariant) ? 1 : -1;
+
+	return 0;
+}
+
+/*
+ * cmpLexeme
+ *		Compare two elements by lexeme text with strcmp().  An element
+ *		without text (NULL lexeme) sorts after every element that has one;
+ *		two such elements are equal.
+ */
+static int
+cmpLexeme(TheLexeme * a, TheLexeme * b)
+{
+	if (a->lexeme != NULL && b->lexeme != NULL)
+		return strcmp(a->lexeme, b->lexeme);
+
+	/* at least one side has no text */
+	if (a->lexeme == NULL && b->lexeme == NULL)
+		return 0;
+
+	return (a->lexeme == NULL) ? 1 : -1;
+}
+
+/*
+ * cmpLexemeQ
+ *		Adapter giving cmpLexeme() the comparator signature that bsearch()
+ *		expects.
+ */
+static int
+cmpLexemeQ(const void *a, const void *b)
+{
+	TheLexeme  *left = (TheLexeme *) a;
+	TheLexeme  *right = (TheLexeme *) b;
+
+	return cmpLexeme(left, right);
+}
+
+/*
+ * cmpTheLexeme
+ *		qsort() comparator: order elements by lexeme text and, for equal
+ *		texts, by their first occurrence in the reverse of cmpLexemeInfo()
+ *		order.
+ */
+static int
+cmpTheLexeme(const void *a, const void *b)
+{
+	TheLexeme  *left = (TheLexeme *) a;
+	TheLexeme  *right = (TheLexeme *) b;
+	int			bytext = cmpLexeme(left, right);
+
+	if (bytext == 0)
+		return -cmpLexemeInfo(left->entries, right->entries);
+
+	return bytext;
+}
+
+/*
+ * compileTheLexeme
+ *		Replace the raw sample words in d->wrds by their normalized form.
+ *
+ * Every word is passed through the subdictionary's lexize function:
+ *	- a NULL result (word not recognized) raises an error;
+ *	- a result without lexemes (stop word) produces a NOTICE and is stored
+ *	  as an element with a NULL lexeme;
+ *	- otherwise every returned lexeme is stored, tagged with the number of
+ *	  lexemes that share its nvariant.
+ *
+ * The new array is then sorted with cmpTheLexeme() and elements with equal
+ * text are merged into one element whose occurrences are chained through
+ * nextentry.  d->wrds, d->nwrds and d->ntwrds are updated; the old array
+ * and its contents are freed.
+ */
+static void
+compileTheLexeme(DictThesaurus * d)
+{
+	int			i,
+				nnw = 0,
+				tnm = 16;
+	TheLexeme  *newwrds = (TheLexeme *) palloc(sizeof(TheLexeme) * tnm),
+			   *ptrwrds;
+
+	for (i = 0; i < d->nwrds; i++)
+	{
+		TSLexeme   *ptr;
+
+		ptr = (TSLexeme *) DatumGetPointer(FunctionCall4(&(d->subdict->lexize),
+									   PointerGetDatum(d->subdict->dictData),
+										  PointerGetDatum(d->wrds[i].lexeme),
+									Int32GetDatum(strlen(d->wrds[i].lexeme)),
+													 PointerGetDatum(NULL)));
+
+		if (!(ptr && ptr->lexeme))
+		{
+			if (!ptr)
+				elog(ERROR, "thesaurus word-sample \"%s\" isn't recognized by subdictionary (rule %d)",
+					 d->wrds[i].lexeme, d->wrds[i].entries->idsubst + 1);
+			else
+				elog(NOTICE, "thesaurus word-sample \"%s\" is recognized as stop-word, assign any stop-word (rule %d)",
+					 d->wrds[i].lexeme, d->wrds[i].entries->idsubst + 1);
+
+			/* a stop word is represented by an element with NULL text */
+			newwrds = addCompiledLexeme(newwrds, &nnw, &tnm, NULL, d->wrds[i].entries, 0);
+		}
+		else
+		{
+			/* process the result one variant (run of equal nvariant) at a time */
+			while (ptr->lexeme)
+			{
+				TSLexeme   *remptr = ptr + 1;
+				int			tnvar = 1;
+				int			curvar = ptr->nvariant;
+
+				/* compute n words in one variant */
+				while (remptr->lexeme)
+				{
+					if (remptr->nvariant != (remptr - 1)->nvariant)
+						break;
+					tnvar++;
+					remptr++;
+				}
+
+				/* store every lexeme of the variant, tagged with its size */
+				remptr = ptr;
+				while (remptr->lexeme && remptr->nvariant == curvar)
+				{
+					newwrds = addCompiledLexeme(newwrds, &nnw, &tnm, remptr, d->wrds[i].entries, tnvar);
+					remptr++;
+				}
+
+				ptr = remptr;
+			}
+		}
+
+		pfree(d->wrds[i].lexeme);
+		pfree(d->wrds[i].entries);
+	}
+
+	/*
+	 * d->wrds is still NULL if the thesaurus file contained no rules, and
+	 * pfree() must not be called with a NULL pointer.
+	 */
+	if (d->wrds)
+		pfree(d->wrds);
+	d->wrds = newwrds;
+	d->nwrds = nnw;
+	d->ntwrds = tnm;
+
+	if (d->nwrds > 1)
+	{
+		qsort(d->wrds, d->nwrds, sizeof(TheLexeme), cmpTheLexeme);
+
+		/*
+		 * uniq: newwrds is the last kept element, ptrwrds the element being
+		 * examined.  An element whose text equals that of the kept one gives
+		 * its occurrence to the kept element (pushed on the front of the
+		 * nextentry chain), unless the occurrence is an exact duplicate.
+		 */
+		newwrds = d->wrds;
+		ptrwrds = d->wrds + 1;
+		while (ptrwrds - d->wrds < d->nwrds)
+		{
+			if (cmpLexeme(ptrwrds, newwrds) == 0)
+			{
+				if (cmpLexemeInfo(ptrwrds->entries, newwrds->entries))
+				{
+					ptrwrds->entries->nextentry = newwrds->entries;
+					newwrds->entries = ptrwrds->entries;
+				}
+				else
+					pfree(ptrwrds->entries);
+
+				if (ptrwrds->lexeme)
+					pfree(ptrwrds->lexeme);
+			}
+			else
+			{
+				newwrds++;
+				*newwrds = *ptrwrds;
+			}
+
+			ptrwrds++;
+		}
+
+		/* shrink the array to the elements that were kept */
+		d->nwrds = newwrds - d->wrds + 1;
+		d->wrds = (TheLexeme *) repalloc(d->wrds, sizeof(TheLexeme) * d->nwrds);
+	}
+}
+/*
+ * compileTheSubstitute
+ *		Convert the raw substitution words of every rule into the final
+ *		TSLexeme array kept in d->subst[i].res.
+ *
+ * Words flagged DT_USEASIS are copied unchanged (with the flag cleared); all
+ * other words are passed through the subdictionary's lexize function.  A
+ * word that yields no lexemes (stop word) is dropped with a NOTICE, a word
+ * the subdictionary returns NULL for raises an error, and so does a rule
+ * whose substitution ends up empty.  The number of resulting lexemes is
+ * stored in reslen and the raw array is freed.
+ */
+static void
+compileTheSubstitute(DictThesaurus * d)
+{
+	int			i;
+
+	for (i = 0; i < d->nsubst; i++)
+	{
+		TSLexeme   *rem = d->subst[i].res,	/* raw words, freed at the end */
+				   *outptr,
+				   *inptr;
+		int			n = 2;		/* allocated size of the new res array */
+
+		outptr = d->subst[i].res = (TSLexeme *) palloc(sizeof(TSLexeme) * n);
+		outptr->lexeme = NULL;
+		inptr = rem;
+
+		while (inptr && inptr->lexeme)
+		{
+			TSLexeme   *lexized,
+						tmplex[2];
+
+			if (inptr->flags & DT_USEASIS)
+			{					/* do not lexize */
+				tmplex[0] = *inptr;
+				tmplex[0].flags = 0;
+				tmplex[1].lexeme = NULL;
+				lexized = tmplex;
+			}
+			else
+			{
+				lexized = (TSLexeme *) DatumGetPointer(
+													   FunctionCall4(
+													   &(d->subdict->lexize),
+									   PointerGetDatum(d->subdict->dictData),
+											  PointerGetDatum(inptr->lexeme),
+										Int32GetDatum(strlen(inptr->lexeme)),
+														PointerGetDatum(NULL)
+																	 )
+					);
+			}
+
+			if (lexized && lexized->lexeme)
+			{
+				/*
+				 * Index of the first lexeme produced by this word, or -1 if
+				 * the word is the first one in the output.
+				 */
+				int			toset = (lexized->lexeme && outptr != d->subst[i].res) ? (outptr - d->subst[i].res) : -1;
+
+				while (lexized->lexeme)
+				{
+					/* grow the output array; outptr must follow the move */
+					if (outptr - d->subst[i].res + 1 >= n)
+					{
+						int			diff = outptr - d->subst[i].res;
+
+						n *= 2;
+						d->subst[i].res = (TSLexeme *) repalloc(d->subst[i].res, sizeof(TSLexeme) * n);
+						outptr = d->subst[i].res + diff;
+					}
+
+					*outptr = *lexized;
+					outptr->lexeme = pstrdup(lexized->lexeme);
+
+					outptr++;
+					lexized++;
+				}
+				/* flag the first lexeme of every word except the first one */
+				if (toset > 0)
+					d->subst[i].res[toset].flags |= TSL_ADDPOS;
+			}
+			else if (lexized)
+			{
+				elog(NOTICE, "thesaurus word \"%s\" in substitution is a stop-word, ignored (rule %d)", inptr->lexeme, i + 1);
+			}
+			else
+			{
+				elog(ERROR, "thesaurus word \"%s\" in substitution isn't recognized (rule %d)", inptr->lexeme, i + 1);
+			}
+
+			if (inptr->lexeme)
+				pfree(inptr->lexeme);
+			inptr++;
+		}
+
+		if (outptr == d->subst[i].res)
+			elog(ERROR, "all words in thesaurus substitution are stop words (rule %d)", i + 1);
+
+		d->subst[i].reslen = outptr - d->subst[i].res;
+
+		pfree(rem);
+	}
+}
+
+/*
+ * thesaurus_init
+ *		Dictionary init method: build a DictThesaurus from the option text
+ *		passed as argument 0.
+ *
+ * Recognized options (compared case-insensitively) are DictFile, the
+ * thesaurus file to load, and Dictionary, the name of the subdictionary.
+ * Both are required and each may be given only once; any other option, a
+ * missing option or a NULL argument raises an error.  The file is read as
+ * soon as its option is seen; the lexemes and substitutions are compiled
+ * once the subdictionary has been looked up.
+ *
+ * Returns a pointer to the new DictThesaurus.
+ */
+Datum
+thesaurus_init(PG_FUNCTION_ARGS)
+{
+	DictThesaurus *dict;
+	Map		   *options,
+			   *opt;
+	text	   *optiontext;
+	char	   *subdictname = NULL;
+	bool		fileloaded = false;
+
+	/* init functions must defend against NULLs for themselves */
+	if (PG_ARGISNULL(0) || PG_GETARG_POINTER(0) == NULL)
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("NULL config not allowed for Thesaurus")));
+	optiontext = PG_GETARG_TEXT_P(0);
+
+	parse_keyvalpairs(optiontext, &options);
+	PG_FREE_IF_COPY(optiontext, 0);
+
+	dict = (DictThesaurus *) palloc0(sizeof(DictThesaurus));
+
+	/* the option list ends with an entry whose key is NULL */
+	for (opt = options; opt->key; opt++)
+	{
+		if (pg_strcasecmp("DictFile", opt->key) == 0)
+		{
+			if (fileloaded)
+				ereport(ERROR,
+						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+						 errmsg("multiple DictFile parameters")));
+			thesaurusRead(opt->value, dict);
+			fileloaded = true;
+		}
+		else if (pg_strcasecmp("Dictionary", opt->key) == 0)
+		{
+			if (subdictname)
+				ereport(ERROR,
+						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+						 errmsg("multiple Dictionary parameters")));
+			subdictname = pstrdup(opt->value);
+		}
+		else
+		{
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+					 errmsg("unrecognized Thesaurus parameter: \"%s\"",
+							opt->key)));
+		}
+
+		/* the pair has been consumed */
+		pfree(opt->key);
+		pfree(opt->value);
+	}
+	pfree(options);
+
+	if (!fileloaded)
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("missing DictFile parameter")));
+	if (!subdictname)
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("missing Dictionary parameter")));
+
+	dict->subdictOid = TSDictionaryGetDictid(stringToQualifiedNameList(subdictname), false);
+	dict->subdict = lookup_ts_dictionary_cache(dict->subdictOid);
+
+	compileTheLexeme(dict);
+	compileTheSubstitute(dict);
+
+	PG_RETURN_POINTER(dict);
+}
+
+/*
+ * findTheLexeme
+ *		Look up lexeme in d->wrds by binary search and return its list of
+ *		occurrences, or NULL if the array is empty or has no such element.
+ *		A NULL lexeme looks up the element without text.
+ */
+static LexemeInfo *
+findTheLexeme(DictThesaurus * d, char *lexeme)
+{
+	TheLexeme	probe;
+	TheLexeme  *hit;
+
+	if (d->nwrds == 0)
+		return NULL;
+
+	probe.lexeme = lexeme;
+	probe.entries = NULL;
+
+	hit = bsearch(&probe, d->wrds, d->nwrds, sizeof(TheLexeme), cmpLexemeQ);
+
+	return hit ? hit->entries : NULL;
+}
+
+/*
+ * matchIdSubst
+ *		Does the candidate list stored (chained through nextvariant) contain
+ *		rule idsubst?  An empty list matches every rule.
+ */
+static bool
+matchIdSubst(LexemeInfo * stored, uint16 idsubst)
+{
+	if (stored == NULL)
+		return true;
+
+	while (stored != NULL)
+	{
+		if (stored->idsubst == idsubst)
+			return true;
+		stored = stored->nextvariant;
+	}
+
+	return false;
+}
+/*
+ * findVariant
+ *		Add to the candidate list "in" the rules that are matched at position
+ *		curpos by all lexemes of one variant, and return the new list head.
+ *
+ * newin[0 .. newn-1] are the occurrence lists (chained through nextentry) of
+ * the newn lexemes of the variant.  A rule is accepted when every list has
+ * an occurrence of it with posinsubst == curpos and tnvariant == newn, when
+ * matchIdSubst(stored, ...) accepts it, and when it is not yet in "in".  An
+ * accepted occurrence is pushed on the front of "in" through nextvariant.
+ *
+ * The lists are walked as if ordered by ascending idsubst, and the pointers
+ * in newin[] are advanced, i.e. modified, during the search.  The function
+ * returns as soon as one of the lists is exhausted.
+ */
+static LexemeInfo *
+findVariant(LexemeInfo * in, LexemeInfo * stored, uint16 curpos, LexemeInfo ** newin, int newn)
+{
+	for (;;)
+	{
+		int			i;
+		LexemeInfo *ptr = newin[0];		/* rule currently aimed at */
+
+		for (i = 0; i < newn; i++)
+		{
+			/* skip occurrences belonging to earlier rules */
+			while (newin[i] && newin[i]->idsubst < ptr->idsubst)
+				newin[i] = newin[i]->nextentry;
+
+			if (newin[i] == NULL)
+				return in;
+
+			/* this list is already past the rule: aim at its rule, restart */
+			if (newin[i]->idsubst > ptr->idsubst)
+			{
+				ptr = newin[i];
+				i = -1;
+				continue;
+			}
+
+			/* within the rule, look for the right position and variant size */
+			while (newin[i]->idsubst == ptr->idsubst)
+			{
+				if (newin[i]->posinsubst == curpos && newin[i]->tnvariant == newn)
+				{
+					ptr = newin[i];
+					break;
+				}
+
+				newin[i] = newin[i]->nextentry;
+				if (newin[i] == NULL)
+					return in;
+			}
+
+			/* no suitable occurrence in this rule: aim at the next, restart */
+			if (newin[i]->idsubst != ptr->idsubst)
+			{
+				ptr = newin[i];
+				i = -1;
+				continue;
+			}
+		}
+
+		if (i == newn && matchIdSubst(stored, ptr->idsubst) && (in == NULL || !matchIdSubst(in, ptr->idsubst)))
+		{						/* found */
+
+			ptr->nextvariant = in;
+			in = ptr;
+		}
+
+		/* step forward */
+		for (i = 0; i < newn; i++)
+			newin[i] = newin[i]->nextentry;
+	}
+
+	return NULL;				/* not reached: the loop above only returns */
+}
+
+/*
+ * copyTSLexeme
+ *		Return a freshly palloc'd copy of the substitution ts->res, including
+ *		copies of the lexeme strings, terminated by an entry with a NULL
+ *		lexeme.
+ */
+static TSLexeme *
+copyTSLexeme(TheSubstitute * ts)
+{
+	uint16		count = ts->reslen;
+	TSLexeme   *copy = (TSLexeme *) palloc(sizeof(TSLexeme) * (count + 1));
+	uint16		k = 0;
+
+	while (k < count)
+	{
+		copy[k] = ts->res[k];
+		copy[k].lexeme = pstrdup(ts->res[k].lexeme);
+		k++;
+	}
+
+	/* terminator */
+	copy[count].lexeme = NULL;
+
+	return copy;
+}
+
+/*
+ * checkMatch
+ *		Search the candidate list info (chained through nextvariant) for a
+ *		rule whose last sample lexeme is at position curpos.
+ *
+ * Returns a copy of the first such rule's substitution, or NULL if there is
+ * none.  *moreres is set to true if any candidate examined, including the
+ * returned one, has a successor in the list, and to false otherwise.
+ */
+static TSLexeme *
+checkMatch(DictThesaurus * d, LexemeInfo * info, uint16 curpos, bool *moreres)
+{
+	LexemeInfo *cand;
+
+	*moreres = false;
+
+	for (cand = info; cand != NULL; cand = cand->nextvariant)
+	{
+		Assert(cand->idsubst < d->nsubst);
+
+		if (cand->nextvariant != NULL)
+			*moreres = true;
+
+		if (d->subst[cand->idsubst].lastlexeme == curpos)
+			return copyTSLexeme(&d->subst[cand->idsubst]);
+	}
+
+	return NULL;
+}
+/*
+ * thesaurus_lexize
+ *		Dictionary lexize method; called once per word of a phrase.
+ *
+ * Argument 0 is the DictThesaurus.  Arguments 1 and 2 are handed unchanged
+ * to the subdictionary's lexize function.  Argument 3 must be a DictSubState
+ * that carries the matching state from word to word: private holds the list
+ * of candidate rules left by the previous call, and getnext is set on return
+ * as described below.
+ *
+ * Result:
+ *	- NULL immediately if dstate->isend is set;
+ *	- NULL with getnext = false if no rule can match the word at this
+ *	  position;
+ *	- a copy of the substitution if a candidate rule ends at this position,
+ *	  with getnext taken from checkMatch()'s moreres;
+ *	- NULL with getnext = true if candidates remain but none ends here.
+ */
+Datum
+thesaurus_lexize(PG_FUNCTION_ARGS)
+{
+	DictThesaurus *d = (DictThesaurus *) PG_GETARG_POINTER(0);
+	DictSubState *dstate = (DictSubState *) PG_GETARG_POINTER(3);
+	TSLexeme   *res = NULL;
+	LexemeInfo *stored,			/* candidates left by the previous word */
+			   *info = NULL;	/* candidates after this word */
+	uint16		curpos = 0;		/* position of this word in the sample */
+	bool		moreres = false;
+
+	if (PG_NARGS() < 4 || dstate == NULL)
+		elog(ERROR, "forbidden call of thesaurus or nested call");
+
+	if (dstate->isend)
+		PG_RETURN_POINTER(NULL);
+	stored = (LexemeInfo *) dstate->private;
+
+	if (stored)
+		curpos = stored->posinsubst + 1;
+	/* refresh the cached subdictionary entry if it has been invalidated */
+	if (!d->subdict->isvalid)
+		d->subdict = lookup_ts_dictionary_cache(d->subdictOid);
+
+	res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(d->subdict->lexize),
+									   PointerGetDatum(d->subdict->dictData),
+													 PG_GETARG_DATUM(1),
+													 PG_GETARG_DATUM(2),
+													 PointerGetDatum(NULL)));
+
+	if (res && res->lexeme)
+	{
+		TSLexeme   *ptr = res,
+				   *basevar;
+		/* handle the subdictionary output one variant at a time */
+		while (ptr->lexeme)
+		{
+			uint16		nv = ptr->nvariant;
+			uint16		i,
+						nlex = 0;
+			LexemeInfo **infos;
+
+			basevar = ptr;
+			/* the variant is the run of lexemes sharing nvariant */
+			while (ptr->lexeme && nv == ptr->nvariant)
+			{
+				nlex++;
+				ptr++;
+			}
+
+			infos = (LexemeInfo **) palloc(sizeof(LexemeInfo *) * nlex);
+			for (i = 0; i < nlex; i++)
+				if ((infos[i] = findTheLexeme(d, basevar[i].lexeme)) == NULL)
+					break;
+
+			if (i < nlex)
+			{
+				/* no chance to find */
+				pfree(infos);
+				continue;
+			}
+
+			info = findVariant(info, stored, curpos, infos, nlex);
+		}
+	}
+	else if (res)
+	{							/* stop-word */
+		LexemeInfo *infos = findTheLexeme(d, NULL);
+
+		info = findVariant(NULL, stored, curpos, &infos, 1);
+	}
+	else
+	{
+		info = NULL;			/* word isn't recognized */
+	}
+	/* remember the surviving candidates for the next word */
+	dstate->private = (void *) info;
+
+	if (!info)
+	{
+		dstate->getnext = false;
+		PG_RETURN_POINTER(NULL);
+	}
+	/* does one of the candidates end at this word? */
+	if ((res = checkMatch(d, info, curpos, &moreres)) != NULL)
+	{
+		dstate->getnext = moreres;
+		PG_RETURN_POINTER(res);
+	}
+	/* candidates remain, but none is complete yet */
+	dstate->getnext = true;
+
+	PG_RETURN_POINTER(NULL);
+}