4 files changed, 343 insertions, 16 deletions
diff --git a/src/backend/regex/README b/src/backend/regex/README
new file mode 100644
index 0000000000..3fd58c0001
--- /dev/null
+++ b/src/backend/regex/README
@@ -0,0 +1,291 @@
+Implementation notes about Henry Spencer's regex library
+========================================================
+
+If Henry ever had any internals documentation, he didn't publish it.
+So this file is an attempt to reverse-engineer some docs.
+
+General source-file layout
+--------------------------
+
+There are four separately-compilable source files, each exposing exactly
+one exported function:
+	regcomp.c: pg_regcomp
+	regexec.c: pg_regexec
+	regerror.c: pg_regerror
+	regfree.c: pg_regfree
+(The pg_ prefixes were added by the Postgres project to distinguish this
+library version from any similar one that might be present on a particular
+system.  They'd need to be removed or replaced in any standalone version
+of the library.)
+
+There are additional source files regc_*.c that are #include'd in regcomp,
+and similarly additional source files rege_*.c that are #include'd in
+regexec.  This was done to avoid exposing internal symbols globally;
+all functions not meant to be part of the library API are static.
+
+(Actually the above is a lie in one respect: there is one more global
+symbol, pg_set_regex_collation in regcomp.  It is not meant to be part of
+the API, but it has to be global because both regcomp and regexec call it.
+It'd be better to get rid of that, as well as the static variables it
+sets, in favor of keeping the needed locale state in the regex structs.
+We have not done this yet for lack of a design for how to add
+application-specific state to the structs.)
+
+What's where in src/backend/regex/:
+
+regcomp.c		Top-level regex compilation code
+regc_color.c		Color map management
+regc_cvec.c		Character vector (cvec) management
+regc_lex.c		Lexer
+regc_nfa.c		NFA handling
+regc_locale.c		Application-specific locale code from Tcl project
+regc_pg_locale.c	Postgres-added application-specific locale code
+regexec.c		Top-level regex execution code
+rege_dfa.c		DFA creation and execution
+regerror.c		pg_regerror: generate text for a regex error code
+regfree.c		pg_regfree: API to free a no-longer-needed regex_t
+
+The locale-specific code is concerned primarily with case-folding and with
+expanding locale-specific character classes, such as [[:alnum:]].  It
+really needs refactoring if this is ever to become a standalone library.
+
+The header files for the library are in src/include/regex/:
+
+regcustom.h		Customizes library for particular application
+regerrs.h		Error message list
+regex.h			Exported API
+regguts.h		Internals declarations
+
+
+DFAs, NFAs, and all that
+------------------------
+
+This library is a hybrid DFA/NFA regex implementation.  (If you've never
+heard either of those terms, get thee to a first-year comp sci textbook.)
+It might not be clear at first glance what that really means and how it
+relates to what you'll see in the code.  Here's what really happens:
+
+* Initial parsing of a regex generates an NFA representation, with a number
+of states approximately proportional to the length of the regexp.
+
+* The NFA is then optimized into a "compact NFA" representation, which is
+basically the same data but without fields that are not going to be needed
+at runtime.  We do a little bit of cleanup too, such as removing
+unreachable states that might be created as a result of the rather naive
+transformation done by initial parsing.  The cNFA representation is what
+is passed from regcomp to regexec.
+
+* Unlike traditional NFA-based regex engines, we do not execute directly
+from the NFA representation, as that would require backtracking and so be
+very slow in some cases.  Rather, we execute a DFA, which ideally can
+process an input string in linear time (O(M) for M characters of input)
+without backtracking.  Each state of the DFA corresponds to a set of
+states of the NFA, that is all the states that the NFA might have been in
+upon reaching the current point in the input string.  Therefore, an NFA
+with N states might require as many as 2^N states in the corresponding
+DFA, which could easily require unreasonable amounts of memory.  We deal
+with this by materializing states of the DFA lazily (only when needed) and
+keeping them in a limited-size cache.  Because the same DFA state may have
+to be rebuilt repeatedly, this approach is not truly O(M) time; the worst
+case is as much as O(M*N).  That's still far better than the
+worst case for a backtracking NFA engine.
+
+If that were the end of it, we'd just say this is a DFA engine, with the
+use of NFAs being merely an implementation detail.  However, a DFA engine
+cannot handle some important regex features such as capturing parens and
+back-references.  If the parser finds that a regex uses these features
+(collectively called "messy cases" in the code), then we have to use
+NFA-style backtracking search after all.
+
+When using the NFA mode, the representation constructed by the parser
+consists of a tree of sub-expressions ("subre"s).  Leaf tree nodes are
+either plain regular expressions (which are executed as DFAs in the manner
+described above) or back-references (which try to match the input to some
+previous substring).  Non-leaf nodes are capture nodes (which save the
+location of the substring currently matching their child node) or
+concatenation or alternation nodes.  At execution time, the executor
+recursively scans the tree.  At concatenation or alternation nodes,
+it considers each possible alternative way of matching the input string,
+i.e., each place where the string could be split for a concatenation, or each
+child node for an alternation.  It tries the next alternative if the match
+fails according to the child nodes.  This is exactly the sort of
+backtracking search done by a traditional NFA regex engine.  If there are
+many tree levels it can get very slow.
+
+But all is not lost: we can still be smarter than the average pure NFA
+engine.  To do this, each subre node has an associated DFA, which
+represents what the node could possibly match insofar as a mathematically
+pure regex can describe that, which basically means "no backrefs".
+Before we perform any search of possible alternative sub-matches, we run
+the DFA to see if it thinks the proposed substring could possibly match.
+If not, we can reject the match immediately without iterating through many
+possibilities.
+
+As an example, consider the regex "(a[bc]+)\1".  The compiled
+representation will have a top-level concatenation subre node.  Its left
+child is a capture node, and the child of that is a plain DFA node for
+"a[bc]+".  The concatenation's right child is a backref node for \1.
+The DFA associated with the concatenation node will be "a[bc]+a[bc]+",
+where the backref has been replaced by a copy of the DFA for its referent
+expression.  When executed, the concatenation node will have to search for
+a possible division of the input string that allows its two child nodes to
+each match their part of the string (and although this specific case can
+only succeed when the division is at the middle, the code does not know
+that, nor would it be true in general).  However, we can first run the DFA
+and quickly reject any input that doesn't contain two a's and some number
+of b's and c's.  If the DFA doesn't match, there is no need to recurse to
+the two child nodes for each possible string division point.  In many
+cases, this prefiltering makes the search run much faster than a pure NFA
+engine could do.  It is this behavior that justifies using the phrase
+"hybrid DFA/NFA engine" to describe Spencer's library.
+
+
+Colors and colormapping
+-----------------------
+
+In many common regex patterns, there are large numbers of characters that
+can be treated alike by the execution engine.  A simple example is the
+pattern "[[:alpha:]][[:alnum:]]*" for an identifier.  Basically the engine
+only needs to care whether an input symbol is a letter, a digit, or other.
+We could build the NFA or DFA with a separate arc for each possible letter
+and digit, but that's very wasteful of space and not so cheap to execute
+either, especially when dealing with Unicode which can have thousands of
+letters.  Instead, the parser builds a "color map" that maps each possible
+input symbol to a "color", or equivalence class.  The NFA or DFA
+representation then has arcs labeled with colors, not specific input
+symbols.  At execution, the first thing the executor does with each input
+symbol is to look up its color in the color map, and then everything else
+works from the color only.
+
+To build the colormap, we start by assigning every possible input symbol
+the color WHITE, which means "other" (that is, at the end of parsing, the
+symbols that are still WHITE are those not explicitly referenced anywhere
+in the regex).  When we see a simple literal character or a bracket
+expression in the regex, we want to assign that character, or all the
+characters represented by the bracket expression, a unique new color that
+can be used to label the NFA arc corresponding to the state transition for
+matching this character or bracket expression.  The basic idea is:
+first, change the color assigned to a character to some new value;
+second, run through all the existing arcs in the partially-built NFA,
+and for each one referencing the character's old color, add a parallel
+arc referencing its new color (this keeps the reassignment from changing
+the semantics of what we already built); and third, add a new arc with
+the character's new color to the current pair of NFA states, denoting
+that seeing this character allows the state transition to be made.
+
+This is complicated a bit by not wanting to create more colors
+(equivalence classes) than absolutely necessary.  In particular, if a
+bracket expression mentions two characters that had the same color before,
+they should still share the same color after we process the bracket, since
+there is still not a need to distinguish them.  But we do need to
+distinguish them from other characters that previously had the same color
+yet are not listed in the bracket expression.  To mechanize this, the code
+has a concept of "parent colors" and "subcolors", where a color's subcolor
+is the new color that we are giving to any characters of that color while
+parsing the current atom.  (The word "parent" is a bit unfortunate here,
+because it suggests a long-lived relationship, but a subcolor link really
+only lasts for the duration of parsing a single atom.)  In other words,
+a subcolor link means that we are in the process of splitting the parent
+color into two colors (equivalence classes), depending on whether or not
+each member character should be included by the current regex atom.
+
+As an example, suppose we have the regex "a\d\wx".  Initially all possible
+character codes are labeled WHITE (color 0).  To parse the atom "a", we
+create a new color (1), update "a"'s color map entry to 1, and create an
+arc labeled 1 between the first two states of the NFA.  Now we see \d,
+which is really a bracket expression containing the digits "0"-"9".
+First we process "0", which is currently WHITE, so we create a new color
+(2), update "0"'s color map entry to 2, and create an arc labeled 2
+between the second and third states of the NFA.  We also mark color WHITE
+as having the subcolor 2, which means that future relabelings of WHITE
+characters should also select 2 as the new color.  Thus, when we process
+"1", we won't create a new color but re-use 2.  We update "1"'s color map
+entry to 2, and then find that we don't need a new arc because there is
+already one labeled 2 between the second and third states of the NFA.
+Similarly for the other 8 digits, so there will be only one arc labeled 2
+between NFA states 2 and 3 for all members of this bracket expression.
+At completion of processing of the bracket expression, we call okcolors()
+which breaks all the existing parent/subcolor links; there is no longer a
+marker saying that WHITE characters should be relabeled 2.  (Note:
+actually, we did the same creation and clearing of a subcolor link for the
+primitive atom "a", but it didn't do anything very interesting.)  Now we
+come to the "\w" bracket expression, which for simplicity we assume expands
+to just "[a-z0-9]".  We process "a", but observe that it is already the
+sole member of its color 1.  This means there is no need to subdivide that
+equivalence class more finely, so we do not create any new color.  We just
+make an arc labeled 1 between the third and fourth NFA states.  Next we
+process "b", which is WHITE and far from the only WHITE character, so we
+create a new color (3), link that as WHITE's subcolor, relabel "b" as
+color 3, and make an arc labeled 3.  As we process "c" through "z", each
+is relabeled from WHITE to 3, but no new arc is needed.  Now we come to
+"0", which is not the only member of its color 2, so we suppose that a new
+color is needed and create color 4.  We link 4 as subcolor of 2, relabel
+"0" as color 4 in the map, and add an arc for color 4.  Next "1" through
+"9" are similarly relabeled as color 4, with no additional arcs needed.
+Having finished the bracket expression, we call okcolors(), which breaks
+the subcolor links.  okcolors() further observes that we have removed
+every member of color 2 (the previous color of the digit characters).
+Therefore, it runs through the partial NFA built so far and relabels arcs
+labeled 2 to color 4; in particular the arc from NFA state 2 to state 3 is
+relabeled color 4.  Then it frees up color 2, since we have no more use
+for that color.  We now have an NFA in which transitions for digits are
+consistently labeled with color 4.  Last, we come to the atom "x".
+"x" is currently labeled with color 3, and it's not the only member of
+that color, so we realize that we now need to distinguish "x" from other
+letters when we did not before.  We create a new color, which might have
+been 5 but instead we recycle the unused color 2.  "x" is relabeled 2 in
+the color map and 2 is linked as the subcolor of 3, and we add an arc for
+2 between states 4 and 5 of the NFA.  Now we call okcolors(), which breaks
+the subcolor link between colors 3 and 2 and notices that both colors are
+nonempty.  Therefore, it also runs through the existing NFA arcs and adds
+an additional arc labeled 2 wherever there is an arc labeled 3; this
+action ensures that characters of color 2 (i.e., "x") will still be
+considered as allowing any transitions they did before.  We are now done
+parsing the regex, and we have these final color assignments:
+	color 1: "a"
+	color 2: "x"
+	color 3: other letters ("b"-"z" except "x")
+	color 4: digits
+and the NFA has these arcs:
+	states 1 -> 2 on color 1 (hence, "a" only)
+	states 2 -> 3 on color 4 (digits)
+	states 3 -> 4 on colors 1, 3, 4, and 2 (covering all \w characters)
+	states 4 -> 5 on color 2 ("x" only)
+which can be seen to be a correct representation of the regex.
+
+Given this summary, we can see we need the following operations for
+colors:
+
+* A fast way to look up the current color assignment for any character
+  code.  (This is needed during both parsing and execution, while the
+  remaining operations are needed only during parsing.)
+* A way to alter the color assignment for any given character code.
+* We must track the number of characters currently assigned to each
+  color, so that we can detect empty and singleton colors.
+* We must track all existing NFA arcs of a given color, so that we
+  can relabel them at need, or add parallel arcs of a new color when
+  an existing color has to be subdivided.
+
+The last two of these are handled with the "struct colordesc" array and
+the "colorchain" links in NFA arc structs.  The color map proper (that
+is, the per-character lookup array) is handled as a multi-level tree,
+with each tree level indexed by one byte of a character's value.  The
+code arranges to not have more than one copy of bottom-level tree pages
+that are all-the-same-color.
+
+Unfortunately, this design does not seem terribly efficient for common
+cases such as a tree in which all Unicode letters are colored the same,
+because there aren't that many places where we get a whole page all the
+same color, except at the end of the map.  (It also strikes me that given
+PG's current restrictions on the range of Unicode values, we could use a
+3-level rather than 4-level tree; but there's no provision for that in
+regguts.h at the moment.)
+
+A bigger problem is that it just doesn't seem very reasonable to have to
+consider each Unicode letter separately at regex parse time for a regex
+such as "\w"; more than likely, a huge percentage of those codes will
+never be seen at runtime.  We need to fix things so that locale-based
+character classes are somehow processed "symbolically" without making a
+full expansion of their contents at parse time.  This would mean that we'd
+have to be ready to call iswalpha() at runtime, but if that only happens
+for high-code-value characters, it shouldn't be a big performance hit.
diff --git a/src/backend/regex/regc_cvec.c b/src/backend/regex/regc_cvec.c
index fb6f06b524..580a693161 100644
--- a/src/backend/regex/regc_cvec.c
+++ b/src/backend/regex/regc_cvec.c
@@ -77,6 +77,7 @@ static void
 addchr(struct cvec * cv,		/* character vector */
 	   chr c)					/* character to add */
 {
+	assert(cv->nchrs < cv->chrspace);
 	cv->chrs[cv->nchrs++] = (chr) c;
 }
 
@@ -95,17 +96,27 @@ addrange(struct cvec * cv,		/* character vector */
 }
 
 /*
- * getcvec - get a cvec, remembering it as v->cv
+ * getcvec - get a transient cvec, initialized to empty
+ *
+ * The returned cvec is valid only until the next call of getcvec, which
+ * typically will recycle the space.  Callers should *not* free the cvec
+ * explicitly; it will be cleaned up when the struct vars is destroyed.
+ *
+ * This is typically used while interpreting bracket expressions.  In that
+ * usage the cvec is only needed momentarily until we build arcs from it,
+ * so transientness is a convenient behavior.
  */
 static struct cvec *
 getcvec(struct vars * v,		/* context */
 		int nchrs,				/* to hold this many chrs... */
 		int nranges)			/* ... and this many ranges */
 {
+	/* recycle existing transient cvec if large enough */
 	if (v->cv != NULL && nchrs <= v->cv->chrspace &&
 		nranges <= v->cv->rangespace)
 		return clearcvec(v->cv);
 
+	/* nope, make a new one */
 	if (v->cv != NULL)
 		freecvec(v->cv);
 	v->cv = newcvec(nchrs, nranges);
diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c
index bd4d4c3761..4f9da5b046 100644
--- a/src/backend/regex/regcomp.c
+++ b/src/backend/regex/regcomp.c
@@ -356,6 +356,7 @@ pg_regcomp(regex_t *re,
 	ZAPCNFA(g->search);
 	v->nfa = newnfa(v, v->cm, (struct nfa *) NULL);
 	CNOERR();
+	/* set up a reasonably-sized transient cvec for getcvec usage */
 	v->cv = newcvec(100, 20);
 	if (v->cv == NULL)
 		return freev(v, REG_ESPACE);
diff --git a/src/include/regex/regguts.h b/src/include/regex/regguts.h
index 0cced701db..fb6789b560 100644
--- a/src/include/regex/regguts.h
+++ b/src/include/regex/regguts.h
@@ -181,34 +181,52 @@ union tree
 #define tcolor	colors.ccolor
 #define tptr	ptrs.pptr
 
-/* internal per-color descriptor structure for the color machinery */
+/*
+ * Per-color data structure for the compile-time color machinery
+ *
+ * If "sub" is not NOSUB then it is the number of the color's current
+ * subcolor, i.e., we are in the process of dividing this color (character
+ * equivalence class) into two colors.  See src/backend/regex/README for
+ * discussion of subcolors.
+ *
+ * Currently-unused colors have the FREECOL bit set and are linked into a
+ * freelist using their "sub" fields, but only if their color numbers are
+ * less than colormap.max.  Any array entries beyond "max" are just garbage.
+ */
 struct colordesc
 {
 	uchr		nchrs;			/* number of chars of this color */
-	color		sub;			/* open subcolor (if any); free chain ptr */
-#define  NOSUB	 COLORLESS
-	struct arc *arcs;			/* color chain */
-	int			flags;
+	color		sub;			/* open subcolor, if any; or free-chain ptr */
+#define  NOSUB	 COLORLESS		/* value of "sub" when no open subcolor */
+	struct arc *arcs;			/* chain of all arcs of this color */
+	int			flags;			/* bit values defined next */
 #define  FREECOL 01				/* currently free */
 #define  PSEUDO  02				/* pseudocolor, no real chars */
 #define  UNUSEDCOLOR(cd) ((cd)->flags&FREECOL)
 	union tree *block;			/* block of solid color, if any */
 };
 
-/* the color map itself */
+/*
+ * The color map itself
+ *
+ * Only the "tree" part is used at execution time, and that only via the
+ * GETCOLOR() macro.  Possibly that should be separated from the compile-time
+ * data.
+ */
 struct colormap
 {
 	int			magic;
 #define  CMMAGIC 0x876
 	struct vars *v;				/* for compile error reporting */
-	size_t		ncds;			/* number of colordescs */
-	size_t		max;			/* highest in use */
+	size_t		ncds;			/* allocated length of colordescs array */
+	size_t		max;			/* highest color number currently in use */
 	color		free;			/* beginning of free chain (if non-0) */
-	struct colordesc *cd;
+	struct colordesc *cd;		/* pointer to array of colordescs */
 #define  CDEND(cm)	 (&(cm)->cd[(cm)->max + 1])
+	/* If we need up to NINLINECDS, we store them here to save a malloc */
 #define  NINLINECDS  ((size_t)10)
 	struct colordesc cdspace[NINLINECDS];
-	union tree	tree[NBYTS];	/* tree top, plus fill blocks */
+	union tree	tree[NBYTS];	/* tree top, plus lower-level fill blocks */
 };
 
 /* optimization magic to do fast chr->color mapping */
@@ -229,19 +247,25 @@ struct colormap
 
 
 /*
- * Interface definitions for locale-interface functions in locale.c.
+ * Interface definitions for locale-interface functions in regc_locale.c.
  */
 
-/* Representation of a set of characters. */
+/*
+ * Representation of a set of characters.  chrs[] represents individual
+ * code points, ranges[] represents ranges in the form min..max inclusive.
+ *
+ * Note that in cvecs gotten from newcvec() and intended to be freed by
+ * freecvec(), both arrays of chrs are after the end of the struct, not
+ * separately malloc'd; so chrspace and rangespace are effectively immutable.
+ */
 struct cvec
 {
 	int			nchrs;			/* number of chrs */
-	int			chrspace;		/* number of chrs possible */
+	int			chrspace;		/* number of chrs allocated in chrs[] */
 	chr		   *chrs;			/* pointer to vector of chrs */
 	int			nranges;		/* number of ranges (chr pairs) */
-	int			rangespace;		/* number of chrs possible */
+	int			rangespace;		/* number of ranges allocated in ranges[] */
 	chr		   *ranges;			/* pointer to vector of chr pairs */
-	/* both batches of chrs are on the end */
 };