experiment with additional lookup tables to reduce logic that occurs in the tight loop (branch: lxrperf_multiple_tables)

author: Lloyd Hilaiel <lloyd@hilaiel.com> 2011-04-25 15:04:31 -0600
committer: Lloyd Hilaiel <lloyd@hilaiel.com> 2011-04-25 15:04:31 -0600
commit: 24d46a675e6bb61f09a6b20881d567eec7f06b12 (patch)
tree: b530f16a85183021fa4f7630384951a00f89d964
parent: bb396891e9f43570b892804b9d7da3f6dd823fc3 (diff)
download: yajl-lxrperf_multiple_tables.tar.gz
1 files changed, 142 insertions, 58 deletions
diff --git a/src/yajl_lex.c b/src/yajl_lex.c
index b098e6a..48fe829 100644
--- a/src/yajl_lex.c
+++ b/src/yajl_lex.c
@@ -24,7 +24,7 @@
 
 #ifdef YAJL_LEXER_DEBUG
 static const char *
-tokToStr(yajl_tok tok) 
+tokToStr(yajl_tok tok)
 {
     switch (tok) {
         case yajl_tok_bool: return "bool";
@@ -53,13 +53,13 @@ tokToStr(yajl_tok tok)
  * the network or disk).  This makes the lexer more complex.  The
  * responsibility of the lexer is to handle transparently the case where
  * a chunk boundary falls in the middle of a token.  This is
- * accomplished is via a buffer and a character reading abstraction. 
+ * accomplished via a buffer and a character reading abstraction.
  *
  * Overview of implementation
  *
  * When we lex to end of input string before end of token is hit, we
  * copy all of the input text composing the token into our lexBuf.
- * 
+ *
  * Every time we read a character, we do so through the readChar function.
  * readChar's responsibility is to handle pulling all chars from the buffer
  * before pulling chars from input text
@@ -74,7 +74,7 @@ struct yajl_lexer_t {
     yajl_lex_error error;
 
     /* a input buffer to handle the case where a token is spread over
-     * multiple chunks */ 
+     * multiple chunks */
     yajl_buf buf;
 
     /* in the case where we have data in the lexBuf, bufOff holds
@@ -178,6 +178,93 @@ static const char charLookupTable[256] =
        NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC
 };
 
+static const char scanningLookupTableUTFChecking[256] = /* selected by yajl_string_scan when utf8check is nonzero */
+{ /* indexed by byte value; nonzero = scan may step over the byte, 0 = scan stops (0x00-0x1f, '"', '\\', 0x80-0xff) */
+/*00*/ 0    , 0    , 0    , 0    , 0    , 0    , 0    , 0    ,
+/*08*/ 0    , 0    , 0    , 0    , 0    , 0    , 0    , 0    ,
+/*10*/ 0    , 0    , 0    , 0    , 0    , 0    , 0    , 0    ,
+/*18*/ 0    , 0    , 0    , 0    , 0    , 0    , 0    , 0    ,
+
+/*20*/ 1      , 1      , 0      , 1      , 1      , 1      , 1      , 1      , /* 0x22 '"' is the zero entry */
+/*28*/ 1      , 1      , 1      , 1      , 1      , 1      , 1      , 1      ,
+/*30*/ 1      , 1      , 1      , 1      , 1      , 1      , 1      , 1      ,
+/*38*/ 1      , 1      , 1      , 1      , 1      , 1      , 1      , 1      ,
+
+/*40*/ 1      , 1      , 1      , 1      , 1      , 1      , 1      , 1      ,
+/*48*/ 1      , 1      , 1      , 1      , 1      , 1      , 1      , 1      ,
+/*50*/ 1      , 1      , 1      , 1      , 1      , 1      , 1      , 1      ,
+/*58*/ 1      , 1      , 1      , 1      , 0      , 1      , 1      , 1      , /* 0x5c '\\' is the zero entry */
+
+/*60*/ 1      , 1      , 1      , 1      , 1      , 1      , 1      , 1      ,
+/*68*/ 1      , 1      , 1      , 1      , 1      , 1      , 1      , 1      ,
+/*70*/ 1      , 1      , 1      , 1      , 1      , 1      , 1      , 1      ,
+/*78*/ 1      , 1      , 1      , 1      , 1      , 1      , 1      , 1      ,
+
+/*80*/ 0    , 0    , 0    , 0    , 0    , 0    , 0    , 0    ,
+/*88*/ 0    , 0    , 0    , 0    , 0    , 0    , 0    , 0    ,
+/*90*/ 0    , 0    , 0    , 0    , 0    , 0    , 0    , 0    ,
+/*98*/ 0    , 0    , 0    , 0    , 0    , 0    , 0    , 0    ,
+
+/*a0*/ 0    , 0    , 0    , 0    , 0    , 0    , 0    , 0    ,
+/*a8*/ 0    , 0    , 0    , 0    , 0    , 0    , 0    , 0    ,
+/*b0*/ 0    , 0    , 0    , 0    , 0    , 0    , 0    , 0    ,
+/*b8*/ 0    , 0    , 0    , 0    , 0    , 0    , 0    , 0    ,
+
+/*c0*/ 0    , 0    , 0    , 0    , 0    , 0    , 0    , 0    ,
+/*c8*/ 0    , 0    , 0    , 0    , 0    , 0    , 0    , 0    ,
+/*d0*/ 0    , 0    , 0    , 0    , 0    , 0    , 0    , 0    ,
+/*d8*/ 0    , 0    , 0    , 0    , 0    , 0    , 0    , 0    ,
+
+/*e0*/ 0    , 0    , 0    , 0    , 0    , 0    , 0    , 0    ,
+/*e8*/ 0    , 0    , 0    , 0    , 0    , 0    , 0    , 0    ,
+/*f0*/ 0    , 0    , 0    , 0    , 0    , 0    , 0    , 0    ,
+/*f8*/ 0    , 0    , 0    , 0    , 0    , 0    , 0    , 0
+};
+
+
+static const char scanningLookupTableNoUTFChecking[256] = /* selected by yajl_string_scan when utf8check is zero */
+{ /* indexed by byte value; nonzero = scan may step over the byte, 0 = scan stops (0x00-0x1f, '"', '\\') */
+/*00*/ 0    , 0    , 0    , 0    , 0    , 0    , 0    , 0    ,
+/*08*/ 0    , 0    , 0    , 0    , 0    , 0    , 0    , 0    ,
+/*10*/ 0    , 0    , 0    , 0    , 0    , 0    , 0    , 0    ,
+/*18*/ 0    , 0    , 0    , 0    , 0    , 0    , 0    , 0    ,
+
+/*20*/ 1      , 1      , 0      , 1      , 1      , 1      , 1      , 1      , /* 0x22 '"' is the zero entry */
+/*28*/ 1      , 1      , 1      , 1      , 1      , 1      , 1      , 1      ,
+/*30*/ 1      , 1      , 1      , 1      , 1      , 1      , 1      , 1      ,
+/*38*/ 1      , 1      , 1      , 1      , 1      , 1      , 1      , 1      ,
+
+/*40*/ 1      , 1      , 1      , 1      , 1      , 1      , 1      , 1      ,
+/*48*/ 1      , 1      , 1      , 1      , 1      , 1      , 1      , 1      ,
+/*50*/ 1      , 1      , 1      , 1      , 1      , 1      , 1      , 1      ,
+/*58*/ 1      , 1      , 1      , 1      , 0      , 1      , 1      , 1      , /* 0x5c '\\' is the zero entry */
+
+/*60*/ 1      , 1      , 1      , 1      , 1      , 1      , 1      , 1      ,
+/*68*/ 1      , 1      , 1      , 1      , 1      , 1      , 1      , 1      ,
+/*70*/ 1      , 1      , 1      , 1      , 1      , 1      , 1      , 1      ,
+/*78*/ 1      , 1      , 1      , 1      , 1      , 1      , 1      , 1      ,
+
+/*80*/ 1    , 1    , 1    , 1    , 1    , 1    , 1    , 1    ,
+/*88*/ 1    , 1    , 1    , 1    , 1    , 1    , 1    , 1    ,
+/*90*/ 1    , 1    , 1    , 1    , 1    , 1    , 1    , 1    ,
+/*98*/ 1    , 1    , 1    , 1    , 1    , 1    , 1    , 1    ,
+
+/*a0*/ 1    , 1    , 1    , 1    , 1    , 1    , 1    , 1    ,
+/*a8*/ 1    , 1    , 1    , 1    , 1    , 1    , 1    , 1    ,
+/*b0*/ 1    , 1    , 1    , 1    , 1    , 1    , 1    , 1    ,
+/*b8*/ 1    , 1    , 1    , 1    , 1    , 1    , 1    , 1    ,
+
+/*c0*/ 1    , 1    , 1    , 1    , 1    , 1    , 1    , 1    ,
+/*c8*/ 1    , 1    , 1    , 1    , 1    , 1    , 1    , 1    ,
+/*d0*/ 1    , 1    , 1    , 1    , 1    , 1    , 1    , 1    ,
+/*d8*/ 1    , 1    , 1    , 1    , 1    , 1    , 1    , 1    ,
+
+/*e0*/ 1    , 1    , 1    , 1    , 1    , 1    , 1    , 1    ,
+/*e8*/ 1    , 1    , 1    , 1    , 1    , 1    , 1    , 1    ,
+/*f0*/ 1    , 1    , 1    , 1    , 1    , 1    , 1    , 1    ,
+/*f8*/ 1    , 1    , 1    , 1    , 1    , 1    , 1    , 1
+};
+
 /** process a variable length utf8 encoded codepoint.
  *
  *  returns:
@@ -186,7 +273,7 @@ static const char charLookupTable[256] =
  *    yajl_tok_eof - if end of input was hit before validation could
  *                   complete
  *    yajl_tok_error - if invalid utf8 was encountered
- * 
+ *
  *  NOTE: on error the offset will point to the first char of the
  *  invalid utf8 */
 #define UTF8_CHECK_EOF if (*offset >= jsonTextLen) { return yajl_tok_eof; }
@@ -200,7 +287,7 @@ yajl_lex_utf8_char(yajl_lexer lexer, const unsigned char * jsonText,
         /* single byte */
         return yajl_tok_string;
     } else if ((curChar >> 5) == 0x6) {
-        /* two byte */ 
+        /* two byte */
         UTF8_CHECK_EOF;
         curChar = readChar(lexer, jsonText, offset);
         if ((curChar >> 6) == 0x2) return yajl_tok_string;
@@ -226,7 +313,7 @@ yajl_lex_utf8_char(yajl_lexer lexer, const unsigned char * jsonText,
                 if ((curChar >> 6) == 0x2) return yajl_tok_string;
             }
         }
-    } 
+    }
 
     return yajl_tok_error;
 }
@@ -254,13 +341,10 @@ if (*offset >= jsonTextLen) { \
 static size_t
 yajl_string_scan(const unsigned char * buf, size_t len, int utf8check)
 {
-    unsigned char mask = IJC|NFP|(utf8check ? NUC : 0);
+    const char * table = (utf8check ? scanningLookupTableUTFChecking : /* tables are 'const char', so is the pointer */
+                          scanningLookupTableNoUTFChecking);
     size_t skip = 0;
-    while (skip < len && !(charLookupTable[*buf] & mask))
-    {
-        skip++;
-        buf++;
-    }
+    while (skip < len && table[buf[skip]]) skip++; /* count leading bytes (at most len) whose table entry is nonzero */
     return skip;
 }
 
@@ -279,7 +363,7 @@ yajl_lex_string(yajl_lexer lexer, const unsigned char * jsonText,
         {
             const unsigned char * p;
             size_t len;
-            
+
             if ((lexer->bufInUse && yajl_buf_len(lexer->buf) &&
                  lexer->bufOff < yajl_buf_len(lexer->buf)))
             {
@@ -287,8 +371,8 @@ yajl_lex_string(yajl_lexer lexer, const unsigned char * jsonText,
                      (lexer->bufOff));
                 len = yajl_buf_len(lexer->buf) - lexer->bufOff;
                 lexer->bufOff += yajl_string_scan(p, len, lexer->validateUTF8);
-            }                
-            else if (*offset < jsonTextLen) 
+            }
+            else if (*offset < jsonTextLen)
             {
                 p = jsonText + *offset;
                 len = jsonTextLen - *offset;
@@ -316,8 +400,8 @@ yajl_lex_string(yajl_lexer lexer, const unsigned char * jsonText,
                 unsigned int i = 0;
 
                 for (i=0;i<4;i++) {
-                    STR_CHECK_EOF;                
-                    curChar = readChar(lexer, jsonText, offset);                
+                    STR_CHECK_EOF;
+                    curChar = readChar(lexer, jsonText, offset);
                     if (!(charLookupTable[curChar] & VHC)) {
                         /* back up to offending char */
                         unreadChar(lexer, offset);
@@ -329,8 +413,8 @@ yajl_lex_string(yajl_lexer lexer, const unsigned char * jsonText,
                 /* back up to offending char */
                 unreadChar(lexer, offset);
                 lexer->error = yajl_lex_string_invalid_escaped_char;
-                goto finish_string_lex;                
-            } 
+                goto finish_string_lex;
+            }
         }
         /* when not validating UTF8 it's a simple table lookup to determine
          * if the present character is invalid */
@@ -338,29 +422,29 @@ yajl_lex_string(yajl_lexer lexer, const unsigned char * jsonText,
             /* back up to offending char */
             unreadChar(lexer, offset);
             lexer->error = yajl_lex_string_invalid_json_char;
-            goto finish_string_lex;                
+            goto finish_string_lex;
         }
         /* when in validate UTF8 mode we need to do some extra work */
         else if (lexer->validateUTF8) {
             yajl_tok t = yajl_lex_utf8_char(lexer, jsonText, jsonTextLen,
                                             offset, curChar);
-            
+
             if (t == yajl_tok_eof) {
                 tok = yajl_tok_eof;
                 goto finish_string_lex;
             } else if (t == yajl_tok_error) {
                 lexer->error = yajl_lex_string_invalid_utf8;
                 goto finish_string_lex;
-            } 
+            }
         }
-        /* accept it, and move on */ 
+        /* accept it, and move on */
     }
   finish_string_lex:
     /* tell our buddy, the parser, wether he needs to process this string
      * again */
     if (hasEscapes && tok == yajl_tok_string) {
         tok = yajl_tok_string_with_escapes;
-    } 
+    }
 
     return tok;
 }
@@ -379,23 +463,23 @@ yajl_lex_number(yajl_lexer lexer, const unsigned char * jsonText,
 
     yajl_tok tok = yajl_tok_integer;
 
-    RETURN_IF_EOF;    
+    RETURN_IF_EOF;
     c = readChar(lexer, jsonText, offset);
 
     /* optional leading minus */
     if (c == '-') {
-        RETURN_IF_EOF;    
-        c = readChar(lexer, jsonText, offset); 
+        RETURN_IF_EOF;
+        c = readChar(lexer, jsonText, offset);
     }
 
     /* a single zero, or a series of integers */
     if (c == '0') {
-        RETURN_IF_EOF;    
-        c = readChar(lexer, jsonText, offset); 
+        RETURN_IF_EOF;
+        c = readChar(lexer, jsonText, offset);
     } else if (c >= '1' && c <= '9') {
         do {
-            RETURN_IF_EOF;    
-            c = readChar(lexer, jsonText, offset); 
+            RETURN_IF_EOF;
+            c = readChar(lexer, jsonText, offset);
         } while (c >= '0' && c <= '9');
     } else {
         unreadChar(lexer, offset);
@@ -406,15 +490,15 @@ yajl_lex_number(yajl_lexer lexer, const unsigned char * jsonText,
     /* optional fraction (indicates this is floating point) */
     if (c == '.') {
         int numRd = 0;
-        
+
         RETURN_IF_EOF;
-        c = readChar(lexer, jsonText, offset); 
+        c = readChar(lexer, jsonText, offset);
 
         while (c >= '0' && c <= '9') {
             numRd++;
             RETURN_IF_EOF;
-            c = readChar(lexer, jsonText, offset); 
-        } 
+            c = readChar(lexer, jsonText, offset);
+        }
 
         if (!numRd) {
             unreadChar(lexer, offset);
@@ -427,18 +511,18 @@ yajl_lex_number(yajl_lexer lexer, const unsigned char * jsonText,
     /* optional exponent (indicates this is floating point) */
     if (c == 'e' || c == 'E') {
         RETURN_IF_EOF;
-        c = readChar(lexer, jsonText, offset); 
+        c = readChar(lexer, jsonText, offset);
 
         /* optional sign */
         if (c == '+' || c == '-') {
             RETURN_IF_EOF;
-            c = readChar(lexer, jsonText, offset); 
+            c = readChar(lexer, jsonText, offset);
         }
 
         if (c >= '0' && c <= '9') {
             do {
                 RETURN_IF_EOF;
-                c = readChar(lexer, jsonText, offset); 
+                c = readChar(lexer, jsonText, offset);
             } while (c >= '0' && c <= '9');
         } else {
             unreadChar(lexer, offset);
@@ -447,10 +531,10 @@ yajl_lex_number(yajl_lexer lexer, const unsigned char * jsonText,
         }
         tok = yajl_tok_double;
     }
-    
+
     /* we always go "one too far" */
     unreadChar(lexer, offset);
-    
+
     return tok;
 }
 
@@ -462,24 +546,24 @@ yajl_lex_comment(yajl_lexer lexer, const unsigned char * jsonText,
 
     yajl_tok tok = yajl_tok_comment;
 
-    RETURN_IF_EOF;    
+    RETURN_IF_EOF;
     c = readChar(lexer, jsonText, offset);
 
     /* either slash or star expected */
     if (c == '/') {
         /* now we throw away until end of line */
         do {
-            RETURN_IF_EOF;    
-            c = readChar(lexer, jsonText, offset); 
+            RETURN_IF_EOF;
+            c = readChar(lexer, jsonText, offset);
         } while (c != '\n');
     } else if (c == '*') {
-        /* now we throw away until end of comment */        
+        /* now we throw away until end of comment */
         for (;;) {
-            RETURN_IF_EOF;    
-            c = readChar(lexer, jsonText, offset); 
+            RETURN_IF_EOF;
+            c = readChar(lexer, jsonText, offset);
             if (c == '*') {
-                RETURN_IF_EOF;    
-                c = readChar(lexer, jsonText, offset);                 
+                RETURN_IF_EOF;
+                c = readChar(lexer, jsonText, offset);
                 if (c == '/') {
                     break;
                 } else {
@@ -491,7 +575,7 @@ yajl_lex_comment(yajl_lexer lexer, const unsigned char * jsonText,
         lexer->error = yajl_lex_invalid_char;
         tok = yajl_tok_error;
     }
-    
+
     return tok;
 }
 
@@ -599,7 +683,7 @@ yajl_lex_lex(yajl_lexer lexer, const unsigned char * jsonText,
                 goto lexed;
             }
             case '-':
-            case '0': case '1': case '2': case '3': case '4': 
+            case '0': case '1': case '2': case '3': case '4':
             case '5': case '6': case '7': case '8': case '9': {
                 /* integer parsing wants to start from the beginning */
                 unreadChar(lexer, offset);
@@ -626,11 +710,11 @@ yajl_lex_lex(yajl_lexer lexer, const unsigned char * jsonText,
                                        jsonTextLen, offset);
                 if (tok == yajl_tok_comment) {
                     /* "error" is silly, but that's the initial
-                     * state of tok.  guilty until proven innocent. */  
+                     * state of tok.  guilty until proven innocent. */
                     tok = yajl_tok_error;
                     yajl_buf_clear(lexer->buf);
                     lexer->bufInUse = 0;
-                    startOffset = *offset; 
+                    startOffset = *offset;
                     break;
                 }
                 /* hit error or eof, bail */
@@ -651,7 +735,7 @@ yajl_lex_lex(yajl_lexer lexer, const unsigned char * jsonText,
         lexer->bufInUse = 1;
         yajl_buf_append(lexer->buf, jsonText + startOffset, *offset - startOffset);
         lexer->bufOff = 0;
-        
+
         if (tok != yajl_tok_eof) {
             *outBuf = yajl_buf_data(lexer->buf);
             *outLen = yajl_buf_len(lexer->buf);
@@ -667,7 +751,7 @@ yajl_lex_lex(yajl_lexer lexer, const unsigned char * jsonText,
     {
         assert(*outLen >= 2);
         (*outBuf)++;
-        *outLen -= 2; 
+        *outLen -= 2;
     }
 
 
@@ -698,7 +782,7 @@ yajl_lex_error_to_string(yajl_lex_error error)
         case yajl_lex_string_invalid_escaped_char:
             return "inside a string, '\\' occurs before a character "
                    "which it may not.";
-        case yajl_lex_string_invalid_json_char:            
+        case yajl_lex_string_invalid_json_char:
             return "invalid character inside string.";
         case yajl_lex_string_invalid_hex_char:
             return "invalid (non-hex) character occurs after '\\u' inside "
@@ -751,13 +835,13 @@ yajl_tok yajl_lex_peek(yajl_lexer lexer, const unsigned char * jsonText,
     size_t bufOff = lexer->bufOff;
     unsigned int bufInUse = lexer->bufInUse;
     yajl_tok tok;
-    
+
     tok = yajl_lex_lex(lexer, jsonText, jsonTextLen, &offset,
                        &outBuf, &outLen);
 
     lexer->bufOff = bufOff;
     lexer->bufInUse = bufInUse;
     yajl_buf_truncate(lexer->buf, bufLen);
-    
+
     return tok;
 }
author	Lloyd Hilaiel <lloyd@hilaiel.com>	2011-04-25 15:04:31 -0600
committer	Lloyd Hilaiel <lloyd@hilaiel.com>	2011-04-25 15:04:31 -0600
commit	24d46a675e6bb61f09a6b20881d567eec7f06b12 (patch)
tree	b530f16a85183021fa4f7630384951a00f89d964
parent	bb396891e9f43570b892804b9d7da3f6dd823fc3 (diff)
download	yajl-lxrperf_multiple_tables.tar.gz