From 567874b9181ebdef8a55f93137bc934280f84378 Mon Sep 17 00:00:00 2001 From: Lloyd Hilaiel Date: Mon, 25 Apr 2011 15:22:38 -0600 Subject: experiment with skipping 2 bytes at a time while string scanning --- src/yajl_lex.c | 141 +++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 86 insertions(+), 55 deletions(-) diff --git a/src/yajl_lex.c b/src/yajl_lex.c index b098e6a..d4ad27f 100644 --- a/src/yajl_lex.c +++ b/src/yajl_lex.c @@ -24,7 +24,7 @@ #ifdef YAJL_LEXER_DEBUG static const char * -tokToStr(yajl_tok tok) +tokToStr(yajl_tok tok) { switch (tok) { case yajl_tok_bool: return "bool"; @@ -53,13 +53,13 @@ tokToStr(yajl_tok tok) * the network or disk). This makes the lexer more complex. The * responsibility of the lexer is to handle transparently the case where * a chunk boundary falls in the middle of a token. This is - * accomplished is via a buffer and a character reading abstraction. + * accomplished is via a buffer and a character reading abstraction. * * Overview of implementation * * When we lex to end of input string before end of token is hit, we * copy all of the input text composing the token into our lexBuf. - * + * * Every time we read a character, we do so through the readChar function. * readChar's responsibility is to handle pulling all chars from the buffer * before pulling chars from input text @@ -74,7 +74,7 @@ struct yajl_lexer_t { yajl_lex_error error; /* a input buffer to handle the case where a token is spread over - * multiple chunks */ + * multiple chunks */ yajl_buf buf; /* in the case where we have data in the lexBuf, bufOff holds @@ -178,6 +178,29 @@ static const char charLookupTable[256] = NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC }; +static unsigned char * shortLookupTable = NULL; +static void buildShortTable(void) +{ + union { + struct { + unsigned char low; + unsigned char high; + } c; + unsigned short s; + } scs; + int i,j; + + shortLookupTable = (unsigned char *) calloc(sizeof(unsigned char), 256 * 256); + + for (i=0;i<256;i++) { + scs.c.low = charLookupTable[i]; + for (j=0;j<256;j++) { + shortLookupTable[i + (j << 8)] = charLookupTable[i] | charLookupTable[j]; + } + } +// printf("built table\n"); +} + /** process a variable length utf8 encoded codepoint. * * returns: @@ -186,7 +209,7 @@ static const char charLookupTable[256] = * yajl_tok_eof - if end of input was hit before validation could * complete * yajl_tok_error - if invalid utf8 was encountered - * + * * NOTE: on error the offset will point to the first char of the * invalid utf8 */ #define UTF8_CHECK_EOF if (*offset >= jsonTextLen) { return yajl_tok_eof; } @@ -200,7 +223,7 @@ yajl_lex_utf8_char(yajl_lexer lexer, const unsigned char * jsonText, /* single byte */ return yajl_tok_string; } else if ((curChar >> 5) == 0x6) { - /* two byte */ + /* two byte */ UTF8_CHECK_EOF; curChar = readChar(lexer, jsonText, offset); if ((curChar >> 6) == 0x2) return yajl_tok_string; @@ -226,7 +249,7 @@ yajl_lex_utf8_char(yajl_lexer lexer, const unsigned char * jsonText, if ((curChar >> 6) == 0x2) return yajl_tok_string; } } - } + } return yajl_tok_error; } @@ -254,13 +277,19 @@ if (*offset >= jsonTextLen) { \ static size_t yajl_string_scan(const unsigned char * buf, size_t len, int utf8check) { - unsigned char mask = IJC|NFP|(utf8check ? NUC : 0); + unsigned char mask = IJC|(utf8check ? NUC : 0); size_t skip = 0; - while (skip < len && !(charLookupTable[*buf] & mask)) - { + + /* 2byte align */ + if ((unsigned long) buf & 0x1) { + if (!len || charLookupTable[*buf] & mask) return 0; skip++; buf++; } + while (1+skip < len && !(shortLookupTable[*((unsigned short *) buf)] & mask)) { + skip+=2; + buf+=2; + } return skip; } @@ -271,6 +300,8 @@ yajl_lex_string(yajl_lexer lexer, const unsigned char * jsonText, yajl_tok tok = yajl_tok_error; int hasEscapes = 0; + if (!shortLookupTable) buildShortTable(); + for (;;) { unsigned char curChar; @@ -279,7 +310,7 @@ yajl_lex_string(yajl_lexer lexer, const unsigned char * jsonText, { const unsigned char * p; size_t len; - + if ((lexer->bufInUse && yajl_buf_len(lexer->buf) && lexer->bufOff < yajl_buf_len(lexer->buf))) { @@ -287,8 +318,8 @@ yajl_lex_string(yajl_lexer lexer, const unsigned char * jsonText, (lexer->bufOff)); len = yajl_buf_len(lexer->buf) - lexer->bufOff; lexer->bufOff += yajl_string_scan(p, len, lexer->validateUTF8); - } - else if (*offset < jsonTextLen) + } + else if (*offset < jsonTextLen) { p = jsonText + *offset; len = jsonTextLen - *offset; @@ -316,8 +347,8 @@ yajl_lex_string(yajl_lexer lexer, const unsigned char * jsonText, unsigned int i = 0; for (i=0;i<4;i++) { - STR_CHECK_EOF; - curChar = readChar(lexer, jsonText, offset); + STR_CHECK_EOF; + curChar = readChar(lexer, jsonText, offset); if (!(charLookupTable[curChar] & VHC)) { /* back up to offending char */ unreadChar(lexer, offset); @@ -329,8 +360,8 @@ yajl_lex_string(yajl_lexer lexer, const unsigned char * jsonText, /* back up to offending char */ unreadChar(lexer, offset); lexer->error = yajl_lex_string_invalid_escaped_char; - goto finish_string_lex; - } + goto finish_string_lex; + } } /* when not validating UTF8 it's a simple table lookup to determine * if the present character is invalid */ @@ -338,29 +369,29 @@ yajl_lex_string(yajl_lexer lexer, const unsigned char * jsonText, /* back up to offending char */ unreadChar(lexer, offset); lexer->error = yajl_lex_string_invalid_json_char; - goto finish_string_lex; + goto finish_string_lex; } /* when in validate UTF8 mode we need to do some extra work */ else if (lexer->validateUTF8) { yajl_tok t = yajl_lex_utf8_char(lexer, jsonText, jsonTextLen, offset, curChar); - + if (t == yajl_tok_eof) { tok = yajl_tok_eof; goto finish_string_lex; } else if (t == yajl_tok_error) { lexer->error = yajl_lex_string_invalid_utf8; goto finish_string_lex; - } + } } - /* accept it, and move on */ + /* accept it, and move on */ } finish_string_lex: /* tell our buddy, the parser, wether he needs to process this string * again */ if (hasEscapes && tok == yajl_tok_string) { tok = yajl_tok_string_with_escapes; - } + } return tok; } @@ -379,23 +410,23 @@ yajl_lex_number(yajl_lexer lexer, const unsigned char * jsonText, yajl_tok tok = yajl_tok_integer; - RETURN_IF_EOF; + RETURN_IF_EOF; c = readChar(lexer, jsonText, offset); /* optional leading minus */ if (c == '-') { - RETURN_IF_EOF; - c = readChar(lexer, jsonText, offset); + RETURN_IF_EOF; + c = readChar(lexer, jsonText, offset); } /* a single zero, or a series of integers */ if (c == '0') { - RETURN_IF_EOF; - c = readChar(lexer, jsonText, offset); + RETURN_IF_EOF; + c = readChar(lexer, jsonText, offset); } else if (c >= '1' && c <= '9') { do { - RETURN_IF_EOF; - c = readChar(lexer, jsonText, offset); + RETURN_IF_EOF; + c = readChar(lexer, jsonText, offset); } while (c >= '0' && c <= '9'); } else { unreadChar(lexer, offset); @@ -406,15 +437,15 @@ yajl_lex_number(yajl_lexer lexer, const unsigned char * jsonText, /* optional fraction (indicates this is floating point) */ if (c == '.') { int numRd = 0; - + RETURN_IF_EOF; - c = readChar(lexer, jsonText, offset); + c = readChar(lexer, jsonText, offset); while (c >= '0' && c <= '9') { numRd++; RETURN_IF_EOF; - c = readChar(lexer, jsonText, offset); - } + c = readChar(lexer, jsonText, offset); + } if (!numRd) { unreadChar(lexer, offset); @@ -427,18 +458,18 @@ yajl_lex_number(yajl_lexer lexer, const unsigned char * jsonText, /* optional exponent (indicates this is floating point) */ if (c == 'e' || c == 'E') { RETURN_IF_EOF; - c = readChar(lexer, jsonText, offset); + c = readChar(lexer, jsonText, offset); /* optional sign */ if (c == '+' || c == '-') { RETURN_IF_EOF; - c = readChar(lexer, jsonText, offset); + c = readChar(lexer, jsonText, offset); } if (c >= '0' && c <= '9') { do { RETURN_IF_EOF; - c = readChar(lexer, jsonText, offset); + c = readChar(lexer, jsonText, offset); } while (c >= '0' && c <= '9'); } else { unreadChar(lexer, offset); @@ -447,10 +478,10 @@ yajl_lex_number(yajl_lexer lexer, const unsigned char * jsonText, } tok = yajl_tok_double; } - + /* we always go "one too far" */ unreadChar(lexer, offset); - + return tok; } @@ -462,24 +493,24 @@ yajl_lex_comment(yajl_lexer lexer, const unsigned char * jsonText, yajl_tok tok = yajl_tok_comment; - RETURN_IF_EOF; + RETURN_IF_EOF; c = readChar(lexer, jsonText, offset); /* either slash or star expected */ if (c == '/') { /* now we throw away until end of line */ do { - RETURN_IF_EOF; - c = readChar(lexer, jsonText, offset); + RETURN_IF_EOF; + c = readChar(lexer, jsonText, offset); } while (c != '\n'); } else if (c == '*') { - /* now we throw away until end of comment */ + /* now we throw away until end of comment */ for (;;) { - RETURN_IF_EOF; - c = readChar(lexer, jsonText, offset); + RETURN_IF_EOF; + c = readChar(lexer, jsonText, offset); if (c == '*') { - RETURN_IF_EOF; - c = readChar(lexer, jsonText, offset); + RETURN_IF_EOF; + c = readChar(lexer, jsonText, offset); if (c == '/') { break; } else { @@ -491,7 +522,7 @@ yajl_lex_comment(yajl_lexer lexer, const unsigned char * jsonText, lexer->error = yajl_lex_invalid_char; tok = yajl_tok_error; } - + return tok; } @@ -599,7 +630,7 @@ yajl_lex_lex(yajl_lexer lexer, const unsigned char * jsonText, goto lexed; } case '-': - case '0': case '1': case '2': case '3': case '4': + case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': { /* integer parsing wants to start from the beginning */ unreadChar(lexer, offset); @@ -626,11 +657,11 @@ yajl_lex_lex(yajl_lexer lexer, const unsigned char * jsonText, jsonTextLen, offset); if (tok == yajl_tok_comment) { /* "error" is silly, but that's the initial - * state of tok. guilty until proven innocent. */ + * state of tok. guilty until proven innocent. */ tok = yajl_tok_error; yajl_buf_clear(lexer->buf); lexer->bufInUse = 0; - startOffset = *offset; + startOffset = *offset; break; } /* hit error or eof, bail */ @@ -651,7 +682,7 @@ yajl_lex_lex(yajl_lexer lexer, const unsigned char * jsonText, lexer->bufInUse = 1; yajl_buf_append(lexer->buf, jsonText + startOffset, *offset - startOffset); lexer->bufOff = 0; - + if (tok != yajl_tok_eof) { *outBuf = yajl_buf_data(lexer->buf); *outLen = yajl_buf_len(lexer->buf); @@ -667,7 +698,7 @@ yajl_lex_lex(yajl_lexer lexer, const unsigned char * jsonText, { assert(*outLen >= 2); (*outBuf)++; - *outLen -= 2; + *outLen -= 2; } @@ -698,7 +729,7 @@ yajl_lex_error_to_string(yajl_lex_error error) case yajl_lex_string_invalid_escaped_char: return "inside a string, '\\' occurs before a character " "which it may not."; - case yajl_lex_string_invalid_json_char: + case yajl_lex_string_invalid_json_char: return "invalid character inside string."; case yajl_lex_string_invalid_hex_char: return "invalid (non-hex) character occurs after '\\u' inside " @@ -751,13 +782,13 @@ yajl_tok yajl_lex_peek(yajl_lexer lexer, const unsigned char * jsonText, size_t bufOff = lexer->bufOff; unsigned int bufInUse = lexer->bufInUse; yajl_tok tok; - + tok = yajl_lex_lex(lexer, jsonText, jsonTextLen, &offset, &outBuf, &outLen); lexer->bufOff = bufOff; lexer->bufInUse = bufInUse; yajl_buf_truncate(lexer->buf, bufLen); - + return tok; } -- cgit v1.2.1