From 8aa29de2ea2507c1995bb9777c87935f4618c460 Mon Sep 17 00:00:00 2001 From: Dave Beckett Date: Sat, 1 Nov 2014 20:34:48 -0700 Subject: (raptor_stringbuffer_append_turtle_string): Enforce URI restrictions. Add is_uri argument to distinguish. Report type label correctly. URIs may not have \t \b \n \r \f or raw ' ' or \u0020 or \u003C or \u003E Update all raptor_stringbuffer_append_turtle_string callers to pass URI flag. --- src/raptor_internal.h | 2 +- src/turtle_common.c | 98 ++++++++++++++++++++++++++++++++------------------- src/turtle_lexer.l | 12 +++---- 3 files changed, 68 insertions(+), 44 deletions(-) diff --git a/src/raptor_internal.h b/src/raptor_internal.h index 95433d27..8ec62e9f 100644 --- a/src/raptor_internal.h +++ b/src/raptor_internal.h @@ -1232,7 +1232,7 @@ typedef void (*raptor_simple_message_handler)(void *user_data, const char *messa /* turtle_common.c */ -RAPTOR_INTERNAL_API int raptor_stringbuffer_append_turtle_string(raptor_stringbuffer* stringbuffer, const unsigned char *text, size_t len, int delim, raptor_simple_message_handler error_handler, void *error_data); +RAPTOR_INTERNAL_API int raptor_stringbuffer_append_turtle_string(raptor_stringbuffer* stringbuffer, const unsigned char *text, size_t len, int delim, raptor_simple_message_handler error_handler, void *error_data, int is_uri); /* raptor_abbrev.c */ diff --git a/src/turtle_common.c b/src/turtle_common.c index 97d05cc6..a8625764 100644 --- a/src/turtle_common.c +++ b/src/turtle_common.c @@ -64,6 +64,8 @@ * * Turtle 2013 allows \ with -_~.!$&\'()*+,;=/?#@% * + * URIs may not have \t \b \n \r \f or raw ' ' or \u0020 or \u003C or \u003E + * * Return value: non-0 on failure **/ int @@ -71,33 +73,49 @@ raptor_stringbuffer_append_turtle_string(raptor_stringbuffer* stringbuffer, const unsigned char *text, size_t len, int delim, raptor_simple_message_handler error_handler, - void *error_data) + void *error_data, + int is_uri) { size_t i; const unsigned char *s; unsigned char *d; unsigned char *string = RAPTOR_MALLOC(unsigned char*, len + 1); - + const char* label = (is_uri ? "URI" : "string"); + if(!string) return -1; for(s = text, d = string, i = 0; i < len; s++, i++) { unsigned char c=*s; + if(c == ' ' && is_uri) { + error_handler(error_data, + "Turtle %s error - character '%c'", label, c); + RAPTOR_FREE(char*, string); + return 1; + } + if(c == '\\' ) { s++; i++; c = *s; - if(c == 'n') - *d++ = '\n'; - else if(c == 'r') - *d++ = '\r'; - else if(c == 't') - *d++ = '\t'; - else if(c == 'b') - *d++ = '\b'; - else if(c == 'f') - *d++ = '\f'; - else if(c == '\\' || c == delim || + if(c == 'n' || c == 'r' || c == 't' || c == 'b' || c == 'f') { + if(is_uri) { + error_handler(error_data, + "Turtle %s error - illegal URI escape '\\%c'", label, c); + RAPTOR_FREE(char*, string); + return 1; + } + if(c == 'n') + *d++ = '\n'; + else if(c == 'r') + *d++ = '\r'; + else if(c == 't') + *d++ = '\t'; + else if(c == 'b') + *d++ = '\b'; + else /* 'f' */ + *d++ = '\f'; + } else if(c == '\\' || c == delim || c == '-' || c == '_' || c == '~' || c == '.' || c == '!' || c == '$' || c == '&' || c == '\'' || c == '(' || c == ')' || c == '*' || c == '+' || c == ',' || c == ';' ||c == '=' || @@ -113,7 +131,7 @@ raptor_stringbuffer_append_turtle_string(raptor_stringbuffer* stringbuffer, s++; i++; if(i+ulen > len) { error_handler(error_data, - "Turtle string error - \\%c over end of line", c); + "Turtle %s error - \\%c over end of line", label, c); RAPTOR_FREE(char*, string); return 1; } @@ -122,8 +140,8 @@ raptor_stringbuffer_append_turtle_string(raptor_stringbuffer* stringbuffer, char cc = s[ii]; if(!isxdigit(RAPTOR_GOOD_CAST(char, cc))) { error_handler(error_data, - "Turtle string error - illegal hex digit %c in Unicode escape '%c%s...'", - cc, c, s); + "Turtle %s error - illegal hex digit %c in Unicode escape '%c%s...'", + label, cc, c, s); RAPTOR_FREE(char*, string); return 1; } @@ -132,8 +150,8 @@ raptor_stringbuffer_append_turtle_string(raptor_stringbuffer* stringbuffer, n = sscanf((const char*)s, ((ulen == 4) ? "%04lx" : "%08lx"), &unichar); if(n != 1) { error_handler(error_data, - "Turtle string error - illegal Unicode escape '%c%s...'", - c, s); + "Turtle %s error - illegal Unicode escape '%c%s...'", + label, c, s); RAPTOR_FREE(char*, string); return 1; } @@ -141,10 +159,16 @@ raptor_stringbuffer_append_turtle_string(raptor_stringbuffer* stringbuffer, s+= ulen-1; i+= ulen-1; + if(is_uri && (unichar == 0x0020 || unichar == 0x003C || unichar == 0x003E)) { + error_handler(error_data, + "Turtle %s error - illegal Unicode escape \\u%04lX in URI.", label, unichar); + break; + } + if(unichar > raptor_unicode_max_codepoint) { error_handler(error_data, - "Turtle string error - illegal Unicode character with code point #x%lX (max #x%lX).", - unichar, raptor_unicode_max_codepoint); + "Turtle %s error - illegal Unicode character with code point #x%lX (max #x%lX).", + label, unichar, raptor_unicode_max_codepoint); RAPTOR_FREE(char*, string); return 1; } @@ -153,8 +177,8 @@ raptor_stringbuffer_append_turtle_string(raptor_stringbuffer* stringbuffer, len-(d-string)); if(unichar_width < 0) { error_handler(error_data, - "Turtle string error - illegal Unicode character with code point #x%lX.", - unichar); + "Turtle %s error - illegal Unicode character with code point #x%lX.", + label, unichar); RAPTOR_FREE(char*, string); return 1; } @@ -163,8 +187,8 @@ raptor_stringbuffer_append_turtle_string(raptor_stringbuffer* stringbuffer, } else { /* don't handle \x where x isn't one of: \t \n \r \\ (delim) */ error_handler(error_data, - "Turtle string error - illegal escape \\%c (#x%02X) in \"%s\"", - c, c, text); + "Turtle %s error - illegal escape \\%c (#x%02X) in \"%s\"", + label, c, c, text); } } else *d++=c; @@ -182,13 +206,13 @@ raptor_stringbuffer_append_turtle_string(raptor_stringbuffer* stringbuffer, /** - * raptor_turtle_expand_name_escapes: - * @name: turtle name to decode + * raptor_turtle_expand_qname_escapes: + * @name: turtle qname string to decode * @len: length of name * @error_handler: error handling function * @error_data: error handler data * - * Expands Turtle escapes for the given name + * Expands Turtle escapes for the given turtle qname string * * The passed in string is handled according to the Turtle string * escape rules giving a UTF-8 encoded output of the Unicode codepoints. @@ -201,10 +225,10 @@ raptor_stringbuffer_append_turtle_string(raptor_stringbuffer* stringbuffer, * Return value: new length or 0 on failure **/ size_t -raptor_turtle_expand_name_escapes(unsigned char *name, - size_t len, - raptor_simple_message_handler error_handler, - void *error_data) +raptor_turtle_expand_qname_escapes(unsigned char *name, + size_t len, + raptor_simple_message_handler error_handler, + void *error_data) { size_t i; const unsigned char *s; @@ -245,7 +269,7 @@ raptor_turtle_expand_name_escapes(unsigned char *name, s++; i++; if(i+ulen > len) { error_handler(error_data, - "Turtle string error - \\%c over end of line", c); + "Turtle name error - \\%c over end of line", c); return 1; } @@ -253,7 +277,7 @@ raptor_turtle_expand_name_escapes(unsigned char *name, char cc = s[ii]; if(!isxdigit(RAPTOR_GOOD_CAST(char, cc))) { error_handler(error_data, - "Turtle string error - illegal hex digit %c in Unicode escape '%c%s...'", + "Turtle name error - illegal hex digit %c in Unicode escape '%c%s...'", cc, c, s); return 1; } @@ -262,7 +286,7 @@ raptor_turtle_expand_name_escapes(unsigned char *name, n = sscanf((const char*)s, ((ulen == 4) ? "%04lx" : "%08lx"), &unichar); if(n != 1) { error_handler(error_data, - "Turtle string error - illegal Uncode escape '%c%s...'", + "Turtle name error - illegal Uncode escape '%c%s...'", c, s); return 1; } @@ -272,7 +296,7 @@ raptor_turtle_expand_name_escapes(unsigned char *name, if(unichar > raptor_unicode_max_codepoint) { error_handler(error_data, - "Turtle string error - illegal Unicode character with code point #x%lX (max #x%lX).", + "Turtle name error - illegal Unicode character with code point #x%lX (max #x%lX).", unichar, raptor_unicode_max_codepoint); return 1; } @@ -281,7 +305,7 @@ raptor_turtle_expand_name_escapes(unsigned char *name, len - (d-name)); if(unichar_width < 0) { error_handler(error_data, - "Turtle string error - illegal Unicode character with code point #x%lX.", + "Turtle name error - illegal Unicode character with code point #x%lX.", unichar); return 1; } @@ -290,7 +314,7 @@ raptor_turtle_expand_name_escapes(unsigned char *name, } else { /* don't handle \x where x isn't one of: \t \n \r \\ (delim) */ error_handler(error_data, - "Turtle string error - illegal escape \\%c (#x%02X) in \"%s\"", + "Turtle name error - illegal escape \\%c (#x%02X) in \"%s\"", c, c, name); } } else diff --git a/src/turtle_lexer.l b/src/turtle_lexer.l index 08bdb62b..fb5bb665 100644 --- a/src/turtle_lexer.l +++ b/src/turtle_lexer.l @@ -290,7 +290,7 @@ EXPONENT [eE][+-]?[0-9]+ turtle_parser->lineno++; } - if(raptor_stringbuffer_append_turtle_string(turtle_parser->sb, (unsigned char*)yytext, yyleng, '"', (raptor_simple_message_handler)turtle_lexer_syntax_error, rdf_parser)) { /* " */ + if(raptor_stringbuffer_append_turtle_string(turtle_parser->sb, (unsigned char*)yytext, yyleng, '"', (raptor_simple_message_handler)turtle_lexer_syntax_error, rdf_parser, 0)) { /* " */ BEGIN(INITIAL); raptor_free_stringbuffer(turtle_parser->sb); turtle_parser->sb = NULL; @@ -353,7 +353,7 @@ EXPONENT [eE][+-]?[0-9]+ turtle_parser->lineno++; } - if(raptor_stringbuffer_append_turtle_string(turtle_parser->sb, (unsigned char*)yytext, yyleng, '"', (raptor_simple_message_handler)turtle_lexer_syntax_error, rdf_parser)) { /* " */ + if(raptor_stringbuffer_append_turtle_string(turtle_parser->sb, (unsigned char*)yytext, yyleng, '"', (raptor_simple_message_handler)turtle_lexer_syntax_error, rdf_parser, 0)) { /* " */ BEGIN(INITIAL); raptor_free_stringbuffer(turtle_parser->sb); turtle_parser->sb = NULL; @@ -444,7 +444,7 @@ EXPONENT [eE][+-]?[0-9]+ /* start at yytext + 1 to skip '<' and operate over * length-2 bytes to skip '<' and '>' */ - if(raptor_stringbuffer_append_turtle_string(sb, (unsigned char*)yytext+1, yyleng-2, '>', (raptor_simple_message_handler)turtle_lexer_syntax_error, rdf_parser)) { + if(raptor_stringbuffer_append_turtle_string(sb, (unsigned char*)yytext+1, yyleng-2, '>', (raptor_simple_message_handler)turtle_lexer_syntax_error, rdf_parser, 1)) { raptor_free_stringbuffer(sb); YY_FATAL_ERROR_EOF("raptor_stringbuffer_append_turtle_string failed"); } @@ -490,7 +490,7 @@ EXPONENT [eE][+-]?[0-9]+ sb = raptor_new_stringbuffer(); if(!sb) TURTLE_LEXER_OOM(); - if(raptor_stringbuffer_append_turtle_string(sb, (unsigned char*)yytext+1, yyleng-1, '>', (raptor_simple_message_handler)turtle_lexer_syntax_error, rdf_parser)) { + if(raptor_stringbuffer_append_turtle_string(sb, (unsigned char*)yytext+1, yyleng-1, '>', (raptor_simple_message_handler)turtle_lexer_syntax_error, rdf_parser, 1)) { raptor_free_stringbuffer(sb); YY_FATAL_ERROR_EOF("raptor_stringbuffer_append_turtle_string failed"); } @@ -560,8 +560,8 @@ turtle_copy_string_token(raptor_parser* rdf_parser, return NULL; rc = raptor_stringbuffer_append_turtle_string(sb, string, len, delim, - (raptor_simple_message_handler)turtle_lexer_syntax_error, - rdf_parser); + (raptor_simple_message_handler)turtle_lexer_syntax_error, + rdf_parser, 0); if(rc) { raptor_free_stringbuffer(sb); return NULL; -- cgit v1.2.1