From 567874b9181ebdef8a55f93137bc934280f84378 Mon Sep 17 00:00:00 2001
From: Lloyd Hilaiel <lloyd@hilaiel.com>
Date: Mon, 25 Apr 2011 15:22:38 -0600
Subject: experiment with skipping 2 bytes at a time while string scanning

---
 src/yajl_lex.c | 141 +++++++++++++++++++++++++++++++++++----------------------
 1 file changed, 86 insertions(+), 55 deletions(-)

diff --git a/src/yajl_lex.c b/src/yajl_lex.c
index b098e6a..d4ad27f 100644
--- a/src/yajl_lex.c
+++ b/src/yajl_lex.c
@@ -24,7 +24,7 @@
 
 #ifdef YAJL_LEXER_DEBUG
 static const char *
-tokToStr(yajl_tok tok) 
+tokToStr(yajl_tok tok)
 {
     switch (tok) {
         case yajl_tok_bool: return "bool";
@@ -53,13 +53,13 @@ tokToStr(yajl_tok tok)
  * the network or disk).  This makes the lexer more complex.  The
  * responsibility of the lexer is to handle transparently the case where
  * a chunk boundary falls in the middle of a token.  This is
- * accomplished is via a buffer and a character reading abstraction. 
+ * accomplished via a buffer and a character reading abstraction.
  *
  * Overview of implementation
  *
  * When we lex to end of input string before end of token is hit, we
  * copy all of the input text composing the token into our lexBuf.
- * 
+ *
  * Every time we read a character, we do so through the readChar function.
  * readChar's responsibility is to handle pulling all chars from the buffer
  * before pulling chars from input text
@@ -74,7 +74,7 @@ struct yajl_lexer_t {
     yajl_lex_error error;
 
     /* a input buffer to handle the case where a token is spread over
-     * multiple chunks */ 
+     * multiple chunks */
     yajl_buf buf;
 
     /* in the case where we have data in the lexBuf, bufOff holds
@@ -178,6 +178,29 @@ static const char charLookupTable[256] =
        NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC
 };
 
+/* OR of the charLookupTable flags of every possible pair of bytes, indexed
+ * by (first byte) + (second byte << 8).  Each entry is symmetric in its two
+ * bytes, so a lookup gives the same answer whichever byte is taken as the
+ * low one.  Backed by static storage: nothing to allocate, check for
+ * failure, or free. */
+static unsigned char shortLookupStorage[256 * 256];
+
+/* NULL until buildShortTable() has completely filled the table */
+static unsigned char * shortLookupTable = NULL;
+/* Fill shortLookupStorage, then publish it through shortLookupTable.
+ * NOTE: nothing here synchronises concurrent first calls. */
+static void buildShortTable(void)
+{
+    int i,j;
+
+    for (i=0;i<256;i++) {
+        for (j=0;j<256;j++) {
+            shortLookupStorage[i + (j << 8)] = charLookupTable[i] | charLookupTable[j];
+        }
+    }
+    shortLookupTable = shortLookupStorage;
+}
+
 /** process a variable length utf8 encoded codepoint.
  *
  *  returns:
@@ -186,7 +209,7 @@ static const char charLookupTable[256] =
  *    yajl_tok_eof - if end of input was hit before validation could
  *                   complete
  *    yajl_tok_error - if invalid utf8 was encountered
- * 
+ *
  *  NOTE: on error the offset will point to the first char of the
  *  invalid utf8 */
 #define UTF8_CHECK_EOF if (*offset >= jsonTextLen) { return yajl_tok_eof; }
@@ -200,7 +223,7 @@ yajl_lex_utf8_char(yajl_lexer lexer, const unsigned char * jsonText,
         /* single byte */
         return yajl_tok_string;
     } else if ((curChar >> 5) == 0x6) {
-        /* two byte */ 
+        /* two byte */
         UTF8_CHECK_EOF;
         curChar = readChar(lexer, jsonText, offset);
         if ((curChar >> 6) == 0x2) return yajl_tok_string;
@@ -226,7 +249,7 @@ yajl_lex_utf8_char(yajl_lexer lexer, const unsigned char * jsonText,
                 if ((curChar >> 6) == 0x2) return yajl_tok_string;
             }
         }
-    } 
+    }
 
     return yajl_tok_error;
 }
@@ -254,13 +277,19 @@ if (*offset >= jsonTextLen) { \
 static size_t
 yajl_string_scan(const unsigned char * buf, size_t len, int utf8check)
 {
-    unsigned char mask = IJC|NFP|(utf8check ? NUC : 0);
+    /* returns how many leading bytes were verified free of the mask flags */
+    unsigned char mask = IJC|(utf8check ? NUC : 0);
     size_t skip = 0;
-    while (skip < len && !(charLookupTable[*buf] & mask))
-    {
+    if ((size_t) buf & 0x1) { /* odd address: take one byte to 2-byte align */
+        if (!len || charLookupTable[*buf] & mask) return 0;
         skip++;
         buf++;
     }
+    /* table must already be built; indexed by byte values, not a 16-bit load */
+    while (1+skip < len && !(shortLookupTable[buf[0] | (buf[1] << 8)] & mask)) {
+        skip+=2;
+        buf+=2;
+    }
     return skip;
 }
 
@@ -271,6 +300,8 @@ yajl_lex_string(yajl_lexer lexer, const unsigned char * jsonText,
     yajl_tok tok = yajl_tok_error;
     int hasEscapes = 0;
 
+    if (!shortLookupTable) buildShortTable();
+
     for (;;) {
         unsigned char curChar;
 
@@ -279,7 +310,7 @@ yajl_lex_string(yajl_lexer lexer, const unsigned char * jsonText,
         {
             const unsigned char * p;
             size_t len;
-            
+
             if ((lexer->bufInUse && yajl_buf_len(lexer->buf) &&
                  lexer->bufOff < yajl_buf_len(lexer->buf)))
             {
@@ -287,8 +318,8 @@ yajl_lex_string(yajl_lexer lexer, const unsigned char * jsonText,
                      (lexer->bufOff));
                 len = yajl_buf_len(lexer->buf) - lexer->bufOff;
                 lexer->bufOff += yajl_string_scan(p, len, lexer->validateUTF8);
-            }                
-            else if (*offset < jsonTextLen) 
+            }
+            else if (*offset < jsonTextLen)
             {
                 p = jsonText + *offset;
                 len = jsonTextLen - *offset;
@@ -316,8 +347,8 @@ yajl_lex_string(yajl_lexer lexer, const unsigned char * jsonText,
                 unsigned int i = 0;
 
                 for (i=0;i<4;i++) {
-                    STR_CHECK_EOF;                
-                    curChar = readChar(lexer, jsonText, offset);                
+                    STR_CHECK_EOF;
+                    curChar = readChar(lexer, jsonText, offset);
                     if (!(charLookupTable[curChar] & VHC)) {
                         /* back up to offending char */
                         unreadChar(lexer, offset);
@@ -329,8 +360,8 @@ yajl_lex_string(yajl_lexer lexer, const unsigned char * jsonText,
                 /* back up to offending char */
                 unreadChar(lexer, offset);
                 lexer->error = yajl_lex_string_invalid_escaped_char;
-                goto finish_string_lex;                
-            } 
+                goto finish_string_lex;
+            }
         }
         /* when not validating UTF8 it's a simple table lookup to determine
          * if the present character is invalid */
@@ -338,29 +369,29 @@ yajl_lex_string(yajl_lexer lexer, const unsigned char * jsonText,
             /* back up to offending char */
             unreadChar(lexer, offset);
             lexer->error = yajl_lex_string_invalid_json_char;
-            goto finish_string_lex;                
+            goto finish_string_lex;
         }
         /* when in validate UTF8 mode we need to do some extra work */
         else if (lexer->validateUTF8) {
             yajl_tok t = yajl_lex_utf8_char(lexer, jsonText, jsonTextLen,
                                             offset, curChar);
-            
+
             if (t == yajl_tok_eof) {
                 tok = yajl_tok_eof;
                 goto finish_string_lex;
             } else if (t == yajl_tok_error) {
                 lexer->error = yajl_lex_string_invalid_utf8;
                 goto finish_string_lex;
-            } 
+            }
         }
-        /* accept it, and move on */ 
+        /* accept it, and move on */
     }
   finish_string_lex:
     /* tell our buddy, the parser, wether he needs to process this string
      * again */
     if (hasEscapes && tok == yajl_tok_string) {
         tok = yajl_tok_string_with_escapes;
-    } 
+    }
 
     return tok;
 }
@@ -379,23 +410,23 @@ yajl_lex_number(yajl_lexer lexer, const unsigned char * jsonText,
 
     yajl_tok tok = yajl_tok_integer;
 
-    RETURN_IF_EOF;    
+    RETURN_IF_EOF;
     c = readChar(lexer, jsonText, offset);
 
     /* optional leading minus */
     if (c == '-') {
-        RETURN_IF_EOF;    
-        c = readChar(lexer, jsonText, offset); 
+        RETURN_IF_EOF;
+        c = readChar(lexer, jsonText, offset);
     }
 
     /* a single zero, or a series of integers */
     if (c == '0') {
-        RETURN_IF_EOF;    
-        c = readChar(lexer, jsonText, offset); 
+        RETURN_IF_EOF;
+        c = readChar(lexer, jsonText, offset);
     } else if (c >= '1' && c <= '9') {
         do {
-            RETURN_IF_EOF;    
-            c = readChar(lexer, jsonText, offset); 
+            RETURN_IF_EOF;
+            c = readChar(lexer, jsonText, offset);
         } while (c >= '0' && c <= '9');
     } else {
         unreadChar(lexer, offset);
@@ -406,15 +437,15 @@ yajl_lex_number(yajl_lexer lexer, const unsigned char * jsonText,
     /* optional fraction (indicates this is floating point) */
     if (c == '.') {
         int numRd = 0;
-        
+
         RETURN_IF_EOF;
-        c = readChar(lexer, jsonText, offset); 
+        c = readChar(lexer, jsonText, offset);
 
         while (c >= '0' && c <= '9') {
             numRd++;
             RETURN_IF_EOF;
-            c = readChar(lexer, jsonText, offset); 
-        } 
+            c = readChar(lexer, jsonText, offset);
+        }
 
         if (!numRd) {
             unreadChar(lexer, offset);
@@ -427,18 +458,18 @@ yajl_lex_number(yajl_lexer lexer, const unsigned char * jsonText,
     /* optional exponent (indicates this is floating point) */
     if (c == 'e' || c == 'E') {
         RETURN_IF_EOF;
-        c = readChar(lexer, jsonText, offset); 
+        c = readChar(lexer, jsonText, offset);
 
         /* optional sign */
         if (c == '+' || c == '-') {
             RETURN_IF_EOF;
-            c = readChar(lexer, jsonText, offset); 
+            c = readChar(lexer, jsonText, offset);
         }
 
         if (c >= '0' && c <= '9') {
             do {
                 RETURN_IF_EOF;
-                c = readChar(lexer, jsonText, offset); 
+                c = readChar(lexer, jsonText, offset);
             } while (c >= '0' && c <= '9');
         } else {
             unreadChar(lexer, offset);
@@ -447,10 +478,10 @@ yajl_lex_number(yajl_lexer lexer, const unsigned char * jsonText,
         }
         tok = yajl_tok_double;
     }
-    
+
     /* we always go "one too far" */
     unreadChar(lexer, offset);
-    
+
     return tok;
 }
 
@@ -462,24 +493,24 @@ yajl_lex_comment(yajl_lexer lexer, const unsigned char * jsonText,
 
     yajl_tok tok = yajl_tok_comment;
 
-    RETURN_IF_EOF;    
+    RETURN_IF_EOF;
     c = readChar(lexer, jsonText, offset);
 
     /* either slash or star expected */
     if (c == '/') {
         /* now we throw away until end of line */
         do {
-            RETURN_IF_EOF;    
-            c = readChar(lexer, jsonText, offset); 
+            RETURN_IF_EOF;
+            c = readChar(lexer, jsonText, offset);
         } while (c != '\n');
     } else if (c == '*') {
-        /* now we throw away until end of comment */        
+        /* now we throw away until end of comment */
         for (;;) {
-            RETURN_IF_EOF;    
-            c = readChar(lexer, jsonText, offset); 
+            RETURN_IF_EOF;
+            c = readChar(lexer, jsonText, offset);
             if (c == '*') {
-                RETURN_IF_EOF;    
-                c = readChar(lexer, jsonText, offset);                 
+                RETURN_IF_EOF;
+                c = readChar(lexer, jsonText, offset);
                 if (c == '/') {
                     break;
                 } else {
@@ -491,7 +522,7 @@ yajl_lex_comment(yajl_lexer lexer, const unsigned char * jsonText,
         lexer->error = yajl_lex_invalid_char;
         tok = yajl_tok_error;
     }
-    
+
     return tok;
 }
 
@@ -599,7 +630,7 @@ yajl_lex_lex(yajl_lexer lexer, const unsigned char * jsonText,
                 goto lexed;
             }
             case '-':
-            case '0': case '1': case '2': case '3': case '4': 
+            case '0': case '1': case '2': case '3': case '4':
             case '5': case '6': case '7': case '8': case '9': {
                 /* integer parsing wants to start from the beginning */
                 unreadChar(lexer, offset);
@@ -626,11 +657,11 @@ yajl_lex_lex(yajl_lexer lexer, const unsigned char * jsonText,
                                        jsonTextLen, offset);
                 if (tok == yajl_tok_comment) {
                     /* "error" is silly, but that's the initial
-                     * state of tok.  guilty until proven innocent. */  
+                     * state of tok.  guilty until proven innocent. */
                     tok = yajl_tok_error;
                     yajl_buf_clear(lexer->buf);
                     lexer->bufInUse = 0;
-                    startOffset = *offset; 
+                    startOffset = *offset;
                     break;
                 }
                 /* hit error or eof, bail */
@@ -651,7 +682,7 @@ yajl_lex_lex(yajl_lexer lexer, const unsigned char * jsonText,
         lexer->bufInUse = 1;
         yajl_buf_append(lexer->buf, jsonText + startOffset, *offset - startOffset);
         lexer->bufOff = 0;
-        
+
         if (tok != yajl_tok_eof) {
             *outBuf = yajl_buf_data(lexer->buf);
             *outLen = yajl_buf_len(lexer->buf);
@@ -667,7 +698,7 @@ yajl_lex_lex(yajl_lexer lexer, const unsigned char * jsonText,
     {
         assert(*outLen >= 2);
         (*outBuf)++;
-        *outLen -= 2; 
+        *outLen -= 2;
     }
 
 
@@ -698,7 +729,7 @@ yajl_lex_error_to_string(yajl_lex_error error)
         case yajl_lex_string_invalid_escaped_char:
             return "inside a string, '\\' occurs before a character "
                    "which it may not.";
-        case yajl_lex_string_invalid_json_char:            
+        case yajl_lex_string_invalid_json_char:
             return "invalid character inside string.";
         case yajl_lex_string_invalid_hex_char:
             return "invalid (non-hex) character occurs after '\\u' inside "
@@ -751,13 +782,13 @@ yajl_tok yajl_lex_peek(yajl_lexer lexer, const unsigned char * jsonText,
     size_t bufOff = lexer->bufOff;
     unsigned int bufInUse = lexer->bufInUse;
     yajl_tok tok;
-    
+
     tok = yajl_lex_lex(lexer, jsonText, jsonTextLen, &offset,
                        &outBuf, &outLen);
 
     lexer->bufOff = bufOff;
     lexer->bufInUse = bufInUse;
     yajl_buf_truncate(lexer->buf, bufLen);
-    
+
     return tok;
 }
-- 
cgit v1.2.1