diff options
author | Brent Miller <bdmiller@yahoo-inc.com> | 2009-08-20 06:50:22 +0000 |
---|---|---|
committer | Brent Miller <bdmiller@yahoo-inc.com> | 2009-08-20 06:50:22 +0000 |
commit | 126ad95fc4895412b631f90657e72e8c75780667 (patch) | |
tree | 5bf6ff58c58e35bb146492a11a6d7c22d740f6b3 | |
parent | f8663fc16fc1082b72ab721f5a6b3781543293a8 (diff) | |
download | json-c-126ad95fc4895412b631f90657e72e8c75780667.tar.gz |
* Add handling of surrogate pairs
git-svn-id: http://svn.metaparadigm.com/svn/json-c/trunk@53 327403b1-1117-474d-bef2-5cb71233fd97
-rw-r--r-- | ChangeLog | 2 |
-rw-r--r-- | Makefile.am | 5 |
-rw-r--r-- | json_tokener.c | 102 |
-rw-r--r-- | test4.c | 44 |
4 files changed, 133 insertions, 20 deletions
@@ -1,3 +1,5 @@ + * Add handling of surrogate pairs (json_tokener.c, test4.c, Makefile.am) + Brent Miller, bdmiller at yahoo dash inc dot com * Correction to comment describing printbuf_memappend in printbuf.h Brent Miller, bdmiller at yahoo dash inc dot com diff --git a/Makefile.am b/Makefile.am index fbedab8..1c1a9ba 100644 --- a/Makefile.am +++ b/Makefile.am @@ -31,7 +31,7 @@ libjson_la_SOURCES = \ linkhash.c \ printbuf.c -check_PROGRAMS = test1 test2 test3 +check_PROGRAMS = test1 test2 test3 test4 test1_SOURCES = test1.c test1_LDADD = $(lib_LTLIBRARIES) @@ -41,3 +41,6 @@ test2_LDADD = $(lib_LTLIBRARIES) test3_SOURCES = test3.c test3_LDADD = $(lib_LTLIBRARIES) + +test4_SOURCES = test4.c +test4_LDADD = $(lib_LTLIBRARIES) diff --git a/json_tokener.c b/json_tokener.c index 04f11ba..8d0b5dc 100644 --- a/json_tokener.c +++ b/json_tokener.c @@ -58,6 +58,12 @@ const char* json_tokener_errors[] = { "expected comment", }; +/* Stuff for decoding unicode sequences */ +#define IS_HIGH_SURROGATE(uc) (((uc) & 0xFC00) == 0xD800) +#define IS_LOW_SURROGATE(uc) (((uc) & 0xFC00) == 0xDC00) +#define DECODE_SURROGATE_PAIR(hi,lo) ((((hi) & 0x3FF) << 10) + ((lo) & 0x3FF) + 0x10000) +static unsigned char utf8_replacement_char[3] = { 0xEF, 0xBF, 0xBD }; + struct json_tokener* json_tokener_new(void) { @@ -176,6 +182,7 @@ char* strndup(const char* str, size_t n) #define ADVANCE_CHAR(str, tok) \ ( ++(str), ((tok)->char_offset)++, c) + /* End optimization macro defs */ @@ -398,40 +405,97 @@ struct json_object* json_tokener_parse_ex(struct json_tokener *tok, break; case json_tokener_state_escape_unicode: - /* Note that the following code is inefficient for handling large - * chunks of extended chars, calling printbuf_memappend() once - * for each multi-byte character of input. - * This is a good area for future optimization. 
- */ { - /* Advance until we change state */ + unsigned int got_hi_surrogate = 0; + + /* Handle a 4-byte sequence, or two sequences if a surrogate pair */ while(1) { if(strchr(json_hex_chars, c)) { tok->ucs_char += ((unsigned int)hexdigit(c) << ((3-tok->st_pos++)*4)); if(tok->st_pos == 4) { - unsigned char utf_out[3]; + unsigned char unescaped_utf[4]; + + if (got_hi_surrogate) { + if (IS_LOW_SURROGATE(tok->ucs_char)) { + /* Recalculate the ucs_char, then fall thru to process normally */ + tok->ucs_char = DECODE_SURROGATE_PAIR(got_hi_surrogate, tok->ucs_char); + } else { + /* Hi surrogate was not followed by a low surrogate */ + /* Replace the hi and process the rest normally */ + printbuf_memappend_fast(tok->pb, (char*)utf8_replacement_char, 3); + } + got_hi_surrogate = 0; + } + if (tok->ucs_char < 0x80) { - utf_out[0] = tok->ucs_char; - printbuf_memappend_fast(tok->pb, (char*)utf_out, 1); + unescaped_utf[0] = tok->ucs_char; + printbuf_memappend_fast(tok->pb, (char*)unescaped_utf, 1); } else if (tok->ucs_char < 0x800) { - utf_out[0] = 0xc0 | (tok->ucs_char >> 6); - utf_out[1] = 0x80 | (tok->ucs_char & 0x3f); - printbuf_memappend_fast(tok->pb, (char*)utf_out, 2); + unescaped_utf[0] = 0xc0 | (tok->ucs_char >> 6); + unescaped_utf[1] = 0x80 | (tok->ucs_char & 0x3f); + printbuf_memappend_fast(tok->pb, (char*)unescaped_utf, 2); + } else if (IS_HIGH_SURROGATE(tok->ucs_char)) { + /* Got a high surrogate. Remember it and look for the + * the beginning of another sequence, which should be the + * low surrogate. + */ + got_hi_surrogate = tok->ucs_char; + /* Not at end, and the next two chars should be "\u" */ + if ((tok->char_offset+1 != len) && + (tok->char_offset+2 != len) && + (str[1] == '\\') && + (str[2] == 'u')) + { + ADVANCE_CHAR(str, tok); + ADVANCE_CHAR(str, tok); + + /* Advance to the first char of the next sequence and + * continue processing with the next sequence. 
+ */ + if (!ADVANCE_CHAR(str, tok) || !POP_CHAR(c, tok)) { + printbuf_memappend_fast(tok->pb, (char*)utf8_replacement_char, 3); + goto out; + } + tok->ucs_char = 0; + tok->st_pos = 0; + continue; /* other json_tokener_state_escape_unicode */ + } else { + /* Got a high surrogate without another sequence following + * it. Put a replacement char in for the hi surrogate + * and pretend we finished. + */ + printbuf_memappend_fast(tok->pb, (char*)utf8_replacement_char, 3); + } + } else if (IS_LOW_SURROGATE(tok->ucs_char)) { + /* Got a low surrogate not preceded by a high */ + printbuf_memappend_fast(tok->pb, (char*)utf8_replacement_char, 3); + } else if (tok->ucs_char < 0x10000) { + unescaped_utf[0] = 0xe0 | (tok->ucs_char >> 12); + unescaped_utf[1] = 0x80 | ((tok->ucs_char >> 6) & 0x3f); + unescaped_utf[2] = 0x80 | (tok->ucs_char & 0x3f); + printbuf_memappend_fast(tok->pb, (char*)unescaped_utf, 3); + } else if (tok->ucs_char < 0x110000) { + unescaped_utf[0] = 0xf0 | ((tok->ucs_char >> 18) & 0x07); + unescaped_utf[1] = 0x80 | ((tok->ucs_char >> 12) & 0x3f); + unescaped_utf[2] = 0x80 | ((tok->ucs_char >> 6) & 0x3f); + unescaped_utf[3] = 0x80 | (tok->ucs_char & 0x3f); + printbuf_memappend_fast(tok->pb, (char*)unescaped_utf, 4); } else { - utf_out[0] = 0xe0 | (tok->ucs_char >> 12); - utf_out[1] = 0x80 | ((tok->ucs_char >> 6) & 0x3f); - utf_out[2] = 0x80 | (tok->ucs_char & 0x3f); - printbuf_memappend_fast(tok->pb, (char*)utf_out, 3); - } + /* Don't know what we got--insert the replacement char */ + printbuf_memappend_fast(tok->pb, (char*)utf8_replacement_char, 3); + } state = saved_state; break; } } else { tok->err = json_tokener_error_parse_string; goto out; - } - if (!ADVANCE_CHAR(str, tok) || !POP_CHAR(c, tok)) + } + if (!ADVANCE_CHAR(str, tok) || !POP_CHAR(c, tok)) { + if (got_hi_surrogate) /* Clean up any pending chars */ + printbuf_memappend_fast(tok->pb, (char*)utf8_replacement_char, 3); goto out; + } } } break; @@ -0,0 +1,44 @@ +/* + * gcc -o utf8 utf8.c 
-I/home/y/include -L./.libs -ljson +*/ + +#include <stdio.h> +#include <string.h> +#include <json/json_object.h> +#include <json/json_tokener.h> + +void print_hex( const unsigned char* s) { + const unsigned char *iter = s; + unsigned char ch; + while ((ch = *iter++) != 0) { + if( ',' != ch) + printf("%x ", ch); + else + printf( ","); + } + printf("\n"); +} + +int main() { + const char *input = "\"\\ud840\\udd26,\\ud840\\udd27,\\ud800\\udd26,\\ud800\\udd27\""; + const char *expected = "\xF0\xA0\x84\xA6,\xF0\xA0\x84\xA7,\xF0\x90\x84\xA6,\xF0\x90\x84\xA7"; + struct json_object *parse_result = json_tokener_parse((char*)input); + const char *unjson = json_object_get_string(parse_result); + + printf("input: %s\n", input); + + int strings_match = !strcmp( expected, unjson); + if (strings_match) { + printf("JSON parse result is correct: %s\n", unjson); + printf("PASS\n"); + return(0); + } else { + printf("JSON parse result doesn't match expected string\n"); + printf("expected string bytes: "); + print_hex( expected); + printf("parsed string bytes: "); + print_hex( unjson); + printf("FAIL\n"); + return(1); + } +} |