Refactor regex character class parsing in [:name:]

The re_wctype function is used in three separate places, and in all of those places almost identical code extracting the name from [:name:] surrounds it. Furthermore, re_wctype requires a NUL-terminated string, so the name of the character class is copied to a temporary buffer.

The code duplication and unnecessary memory copying can be avoided by pushing the responsibility of parsing the whole [:name:] sequence to the function.

Furthermore, since the function now has access to the length of the character class name (since it’s doing the parsing), it can take advantage of that information by skipping some string comparisons and using a constant-length memcmp instead of strcmp, which needs to take care of NUL bytes.

* src/regex.c (re_wctype): Delete function. Replace it with:
(re_wctype_parse): New function which parses a whole [:name:] string and returns a RECC_* constant or -1 if the string is not of [:name:] format.
(regex_compile): Use re_wctype_parse.
* src/syntax.c (skip_chars): Use re_wctype_parse.
author: Michal Nazarewicz <mina86@mina86.com> 2016-07-17 03:09:38 +0200
committer: Michal Nazarewicz <mina86@mina86.com> 2016-08-02 15:39:10 +0200
commit: 4538a5e37e8dacde4b3e828d832c4c558a146912 (patch)
tree: 43a158bf0635a01bf5946730ac439fd0b3b8f606 /src/syntax.c
parent: e7257061317c604492d20f26f312b9e925aa1860 (diff)
download: emacs-4538a5e37e8dacde4b3e828d832c4c558a146912.tar.gz
1 files changed, 26 insertions, 70 deletions
diff --git a/src/syntax.c b/src/syntax.c
index f8d987b377c..667de402ec4 100644
--- a/src/syntax.c
+++ b/src/syntax.c
@@ -1691,44 +1691,22 @@ skip_chars (bool forwardp, Lisp_Object string, Lisp_Object lim,
       /* At first setup fastmap.  */
       while (i_byte < size_byte)
 	{
-	  c = str[i_byte++];
-
-	  if (handle_iso_classes && c == '['
-	      && i_byte < size_byte
-	      && str[i_byte] == ':')
+	  if (handle_iso_classes)
 	    {
-	      const unsigned char *class_beg = str + i_byte + 1;
-	      const unsigned char *class_end = class_beg;
-	      const unsigned char *class_limit = str + size_byte - 2;
-	      /* Leave room for the null.  */
-	      unsigned char class_name[CHAR_CLASS_MAX_LENGTH + 1];
-	      re_wctype_t cc;
-
-	      if (class_limit - class_beg > CHAR_CLASS_MAX_LENGTH)
-		class_limit = class_beg + CHAR_CLASS_MAX_LENGTH;
-
-	      while (class_end < class_limit
-		     && *class_end >= 'a' && *class_end <= 'z')
-		class_end++;
-
-	      if (class_end == class_beg
-		  || *class_end != ':' || class_end[1] != ']')
-		goto not_a_class_name;
-
-	      memcpy (class_name, class_beg, class_end - class_beg);
-	      class_name[class_end - class_beg] = 0;
-
-	      cc = re_wctype (class_name);
+	      const unsigned char *ch = str + i_byte;
+	      re_wctype_t cc = re_wctype_parse (&ch, size_byte - i_byte);
 	      if (cc == 0)
 		error ("Invalid ISO C character class");
-
-	      iso_classes = Fcons (make_number (cc), iso_classes);
-
-	      i_byte = class_end + 2 - str;
-	      continue;
+	      if (cc != -1)
+		{
+		  iso_classes = Fcons (make_number (cc), iso_classes);
+		  i_byte = ch - str;
+		  continue;
+		}
 	    }
 
-	not_a_class_name:
+	  c = str[i_byte++];
+
 	  if (c == '\\')
 	    {
 	      if (i_byte == size_byte)
@@ -1808,54 +1786,32 @@ skip_chars (bool forwardp, Lisp_Object string, Lisp_Object lim,
       while (i_byte < size_byte)
 	{
 	  int leading_code = str[i_byte];
-	  c = STRING_CHAR_AND_LENGTH (str + i_byte, len);
-	  i_byte += len;
 
-	  if (handle_iso_classes && c == '['
-	      && i_byte < size_byte
-	      && STRING_CHAR (str + i_byte) == ':')
+	  if (handle_iso_classes)
 	    {
-	      const unsigned char *class_beg = str + i_byte + 1;
-	      const unsigned char *class_end = class_beg;
-	      const unsigned char *class_limit = str + size_byte - 2;
-	      /* Leave room for the null.	 */
-	      unsigned char class_name[CHAR_CLASS_MAX_LENGTH + 1];
-	      re_wctype_t cc;
-
-	      if (class_limit - class_beg > CHAR_CLASS_MAX_LENGTH)
-		class_limit = class_beg + CHAR_CLASS_MAX_LENGTH;
-
-	      while (class_end < class_limit
-		     && *class_end >= 'a' && *class_end <= 'z')
-		class_end++;
-
-	      if (class_end == class_beg
-		  || *class_end != ':' || class_end[1] != ']')
-		goto not_a_class_name_multibyte;
-
-	      memcpy (class_name, class_beg, class_end - class_beg);
-	      class_name[class_end - class_beg] = 0;
-
-	      cc = re_wctype (class_name);
+	      const unsigned char *ch = str + i_byte;
+	      re_wctype_t cc = re_wctype_parse (&ch, size_byte - i_byte);
 	      if (cc == 0)
 		error ("Invalid ISO C character class");
-
-	      iso_classes = Fcons (make_number (cc), iso_classes);
-
-	      i_byte = class_end + 2 - str;
-	      continue;
+	      if (cc != -1)
+		{
+		  iso_classes = Fcons (make_number (cc), iso_classes);
+		  i_byte = ch - str;
+		  continue;
+		}
 	    }
 
-	not_a_class_name_multibyte:
-	  if (c == '\\')
+	  if (leading_code== '\\')
 	    {
-	      if (i_byte == size_byte)
+	      if (++i_byte == size_byte)
 		break;
 
 	      leading_code = str[i_byte];
-	      c = STRING_CHAR_AND_LENGTH (str + i_byte, len);
-	      i_byte += len;
 	    }
+	  c = STRING_CHAR_AND_LENGTH (str + i_byte, len);
+	  i_byte += len;
+
+
 	  /* Treat `-' as range character only if another character
 	     follows.  */
 	  if (i_byte + 1 < size_byte
author	Michal Nazarewicz <mina86@mina86.com>	2016-07-17 03:09:38 +0200
committer	Michal Nazarewicz <mina86@mina86.com>	2016-08-02 15:39:10 +0200
commit	4538a5e37e8dacde4b3e828d832c4c558a146912 (patch)
tree	43a158bf0635a01bf5946730ac439fd0b3b8f606 /src/syntax.c
parent	e7257061317c604492d20f26f312b9e925aa1860 (diff)
download	emacs-4538a5e37e8dacde4b3e828d832c4c558a146912.tar.gz