Fix serious performance problems in json(b) to_tsvector().

In an off-list followup to bug #14745, Bob Jones complained that to_tsvector() on a 2MB jsonb value took an unreasonable amount of time and space --- enough to draw the wrath of the OOM killer on his machine. On my machine, his example proved to require upwards of 18 seconds and 4GB, which seemed pretty bogus considering that to_tsvector() on the same data treated as text took just a couple hundred msec and 10 or so MB.

On investigation, the problem is that the implementation scans each string element of the json(b) and converts it to tsvector separately, then applies tsvector_concat() to join those separate tsvectors. The unreasonable memory usage came from leaking every single one of the transient tsvectors --- but even without that mistake, this is an O(N^2) or worse algorithm, because tsvector_concat() has to repeatedly process the words coming from earlier elements.

We can fix it by accumulating all the lexeme data and applying make_tsvector() just once. As a side benefit, that also makes the desired adjustment of lexeme positions far cheaper, because we can just tweak the running "pos" counter between JSON elements.

In passing, try to make the explanation of that tweak more intelligible. (I didn't think that a barely-readable comment far removed from the actual code was helpful.) And do some minor other code beautification.
author: Tom Lane <tgl@sss.pgh.pa.us> 2017-07-18 12:45:51 -0400
committer: Tom Lane <tgl@sss.pgh.pa.us> 2017-07-18 12:45:51 -0400
commit: b4c6d31c0be0f5c42a75d50afcf13bdd392db4a1 (patch)
tree: 74c9c8037187322adbb80c4859b044c3c639943d /src
parent: fb9bd4b0469e06d96c8cfff86d231401b0916736 (diff)
download: postgresql-b4c6d31c0be0f5c42a75d50afcf13bdd392db4a1.tar.gz
2 files changed, 58 insertions, 71 deletions
diff --git a/src/backend/tsearch/to_tsany.c b/src/backend/tsearch/to_tsany.c
index 6400440756..b410a49908 100644
--- a/src/backend/tsearch/to_tsany.c
+++ b/src/backend/tsearch/to_tsany.c
@@ -28,11 +28,11 @@ typedef struct MorphOpaque
 typedef struct TSVectorBuildState
 {
 	ParsedText *prs;
-	TSVector	result;
 	Oid			cfgId;
 } TSVectorBuildState;
 
-static void add_to_tsvector(void *state, char *elem_value, int elem_len);
+static void add_to_tsvector(void *_state, char *elem_value, int elem_len);
+
 
 Datum
 get_current_ts_config(PG_FUNCTION_ARGS)
@@ -270,34 +270,33 @@ jsonb_to_tsvector_byid(PG_FUNCTION_ARGS)
 {
 	Oid			cfgId = PG_GETARG_OID(0);
 	Jsonb	   *jb = PG_GETARG_JSONB(1);
+	TSVector	result;
 	TSVectorBuildState state;
-	ParsedText *prs = (ParsedText *) palloc(sizeof(ParsedText));
+	ParsedText	prs;
 
-	prs->words = NULL;
-	state.result = NULL;
+	prs.words = NULL;
+	prs.curwords = 0;
+	state.prs = &prs;
 	state.cfgId = cfgId;
-	state.prs = prs;
 
-	iterate_jsonb_string_values(jb, &state, (JsonIterateStringValuesAction) add_to_tsvector);
+	iterate_jsonb_string_values(jb, &state, add_to_tsvector);
 
-	PG_FREE_IF_COPY(jb, 1);
-
-	if (state.result == NULL)
+	if (prs.curwords > 0)
+		result = make_tsvector(&prs);
+	else
 	{
 		/*
-		 * There weren't any string elements in jsonb, so wee need to return
-		 * an empty vector
+		 * There weren't any string elements in jsonb, so we need to return an
+		 * empty vector
 		 */
-
-		if (prs->words != NULL)
-			pfree(prs->words);
-
-		state.result = palloc(CALCDATASIZE(0, 0));
-		SET_VARSIZE(state.result, CALCDATASIZE(0, 0));
-		state.result->size = 0;
+		result = palloc(CALCDATASIZE(0, 0));
+		SET_VARSIZE(result, CALCDATASIZE(0, 0));
+		result->size = 0;
 	}
 
-	PG_RETURN_TSVECTOR(state.result);
+	PG_FREE_IF_COPY(jb, 1);
+
+	PG_RETURN_TSVECTOR(result);
 }
 
 Datum
@@ -317,33 +316,33 @@ json_to_tsvector_byid(PG_FUNCTION_ARGS)
 {
 	Oid			cfgId = PG_GETARG_OID(0);
 	text	   *json = PG_GETARG_TEXT_P(1);
+	TSVector	result;
 	TSVectorBuildState state;
-	ParsedText *prs = (ParsedText *) palloc(sizeof(ParsedText));
+	ParsedText	prs;
 
-	prs->words = NULL;
-	state.result = NULL;
+	prs.words = NULL;
+	prs.curwords = 0;
+	state.prs = &prs;
 	state.cfgId = cfgId;
-	state.prs = prs;
 
-	iterate_json_string_values(json, &state, (JsonIterateStringValuesAction) add_to_tsvector);
+	iterate_json_string_values(json, &state, add_to_tsvector);
 
-	PG_FREE_IF_COPY(json, 1);
-	if (state.result == NULL)
+	if (prs.curwords > 0)
+		result = make_tsvector(&prs);
+	else
 	{
 		/*
-		 * There weren't any string elements in json, so wee need to return an
+		 * There weren't any string elements in json, so we need to return an
 		 * empty vector
 		 */
-
-		if (prs->words != NULL)
-			pfree(prs->words);
-
-		state.result = palloc(CALCDATASIZE(0, 0));
-		SET_VARSIZE(state.result, CALCDATASIZE(0, 0));
-		state.result->size = 0;
+		result = palloc(CALCDATASIZE(0, 0));
+		SET_VARSIZE(result, CALCDATASIZE(0, 0));
+		result->size = 0;
 	}
 
-	PG_RETURN_TSVECTOR(state.result);
+	PG_FREE_IF_COPY(json, 1);
+
+	PG_RETURN_TSVECTOR(result);
 }
 
 Datum
@@ -359,45 +358,42 @@ json_to_tsvector(PG_FUNCTION_ARGS)
 }
 
 /*
- * Extend current TSVector from _state with a new one,
- * build over a json(b) element.
+ * Parse lexemes in an element of a json(b) value, add to TSVectorBuildState.
  */
 static void
 add_to_tsvector(void *_state, char *elem_value, int elem_len)
 {
 	TSVectorBuildState *state = (TSVectorBuildState *) _state;
 	ParsedText *prs = state->prs;
-	TSVector	item_vector;
-	int			i;
+	int32		prevwords;
 
-	prs->lenwords = elem_len / 6;
-	if (prs->lenwords == 0)
-		prs->lenwords = 2;
+	if (prs->words == NULL)
+	{
+		/*
+		 * First time through: initialize words array to a reasonable size.
+		 * (parsetext() will realloc it bigger as needed.)
+		 */
+		prs->lenwords = Max(elem_len / 6, 64);
+		prs->words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs->lenwords);
+		prs->curwords = 0;
+		prs->pos = 0;
+	}
 
-	prs->words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs->lenwords);
-	prs->curwords = 0;
-	prs->pos = 0;
+	prevwords = prs->curwords;
 
 	parsetext(state->cfgId, prs, elem_value, elem_len);
 
-	if (prs->curwords)
-	{
-		if (state->result != NULL)
-		{
-			for (i = 0; i < prs->curwords; i++)
-				prs->words[i].pos.pos = prs->words[i].pos.pos + TS_JUMP;
-
-			item_vector = make_tsvector(prs);
-
-			state->result = (TSVector) DirectFunctionCall2(tsvector_concat,
-														   TSVectorGetDatum(state->result),
-														   PointerGetDatum(item_vector));
-		}
-		else
-			state->result = make_tsvector(prs);
-	}
+	/*
+	 * If we extracted any words from this JSON element, advance pos to create
+	 * an artificial break between elements.  This is because we don't want
+	 * phrase searches to think that the last word in this element is adjacent
+	 * to the first word in the next one.
+	 */
+	if (prs->curwords > prevwords)
+		prs->pos += 1;
 }
 
+
 /*
  * to_tsquery
  */
diff --git a/src/include/tsearch/ts_type.h b/src/include/tsearch/ts_type.h
index 2885bc0153..30d7c4bccd 100644
--- a/src/include/tsearch/ts_type.h
+++ b/src/include/tsearch/ts_type.h
@@ -86,15 +86,6 @@ typedef struct
 #define MAXNUMPOS	(256)
 #define LIMITPOS(x) ( ( (x) >= MAXENTRYPOS ) ? (MAXENTRYPOS-1) : (x) )
 
-/*
- * In case if a TSVector contains several parts and we want to treat them as
- * separate, it's necessary to add an artificial increment to position of each
- * lexeme from every next part. It's required to avoid the situation when
- * tsquery can find a phrase consisting of lexemes from two of such parts.
- * TS_JUMP defined a value of this increment.
- */
-#define TS_JUMP 1
-
 /* This struct represents a complete tsvector datum */
 typedef struct
 {
author	Tom Lane <tgl@sss.pgh.pa.us>	2017-07-18 12:45:51 -0400
committer	Tom Lane <tgl@sss.pgh.pa.us>	2017-07-18 12:45:51 -0400
commit	b4c6d31c0be0f5c42a75d50afcf13bdd392db4a1 (patch)
tree	74c9c8037187322adbb80c4859b044c3c639943d /src
parent	fb9bd4b0469e06d96c8cfff86d231401b0916736 (diff)
download	postgresql-b4c6d31c0be0f5c42a75d50afcf13bdd392db4a1.tar.gz