diff options
author | jakub <jakub@138bc75d-0d04-0410-961f-82ee72b054a4> | 2009-10-19 21:41:15 +0000 |
---|---|---|
committer | jakub <jakub@138bc75d-0d04-0410-961f-82ee72b054a4> | 2009-10-19 21:41:15 +0000 |
commit | 538ba11a2394dd5e651ccc047aeced3ca4c95afd (patch) | |
tree | 2156a8d43e7b0cb0eb28776c7a735b606e1caea8 /libcpp | |
parent | 5e9082a7a2a00296343284b155667ae01ca967be (diff) | |
download | gcc-538ba11a2394dd5e651ccc047aeced3ca4c95afd.tar.gz |
* charset.c (cpp_init_iconv): Initialize utf8_cset_desc.
(_cpp_destroy_iconv): Destroy utf8_cset_desc, char16_cset_desc
and char32_cset_desc.
(converter_for_type): Handle CPP_UTF8STRING.
(cpp_interpret_string): Handle CPP_UTF8STRING and raw-strings.
* directives.c (get__Pragma_string): Handle CPP_UTF8STRING.
(parse_include): Reject raw strings.
* include/cpplib.h (CPP_UTF8STRING): New token type.
* internal.h (struct cpp_reader): Add utf8_cset_desc field.
* lex.c (lex_raw_string): New function.
(lex_string): Handle u8 string literals, call lex_raw_string
for raw string literals.
(_cpp_lex_direct): Call lex_string even for u8" and {,u,U,L,u8}R"
sequences.
* macro.c (stringify_arg): Handle CPP_UTF8STRING.
* c-common.c (c_parse_error): Handle CPP_UTF8STRING.
* c-lex.c (c_lex_with_flags): Likewise. Test C_LEX_STRING_NO_JOIN
instead of C_LEX_RAW_STRINGS.
(lex_string): Handle CPP_UTF8STRING.
* c-parser.c (c_parser_postfix_expression): Likewise.
* c-pragma.h (C_LEX_RAW_STRINGS): Rename to ...
(C_LEX_STRING_NO_JOIN): ... this.
* parser.c (cp_lexer_print_token, cp_parser_is_string_literal,
cp_parser_string_literal, cp_parser_primary_expression): Handle
CPP_UTF8STRING.
(cp_lexer_get_preprocessor_token): Use C_LEX_STRING_NO_JOIN instead
of C_LEX_RAW_STRINGS.
* gcc.dg/raw-string-1.c: New test.
* gcc.dg/raw-string-2.c: New test.
* gcc.dg/raw-string-3.c: New test.
* gcc.dg/raw-string-4.c: New test.
* gcc.dg/raw-string-5.c: New test.
* gcc.dg/raw-string-6.c: New test.
* gcc.dg/raw-string-7.c: New test.
* gcc.dg/utf8-1.c: New test.
* gcc.dg/utf8-2.c: New test.
* gcc.dg/utf-badconcat2.c: New test.
* gcc.dg/utf-dflt2.c: New test.
* gcc.dg/cpp/include6.c: New test.
* g++.dg/ext/raw-string-1.C: New test.
* g++.dg/ext/raw-string-2.C: New test.
* g++.dg/ext/raw-string-3.C: New test.
* g++.dg/ext/raw-string-4.C: New test.
* g++.dg/ext/raw-string-5.C: New test.
* g++.dg/ext/raw-string-6.C: New test.
* g++.dg/ext/raw-string-7.C: New test.
* g++.dg/ext/utf8-1.C: New test.
* g++.dg/ext/utf8-2.C: New test.
* g++.dg/ext/utf-badconcat2.C: New test.
* g++.dg/ext/utf-dflt2.C: New test.
git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@152995 138bc75d-0d04-0410-961f-82ee72b054a4
Diffstat (limited to 'libcpp')
-rw-r--r-- | libcpp/ChangeLog | 18 | ||||
-rw-r--r-- | libcpp/charset.c | 52 | ||||
-rw-r--r-- | libcpp/directives.c | 6 | ||||
-rw-r--r-- | libcpp/include/cpplib.h | 5 | ||||
-rw-r--r-- | libcpp/internal.h | 4 | ||||
-rw-r--r-- | libcpp/lex.c | 223 | ||||
-rw-r--r-- | libcpp/macro.c | 3 |
7 files changed, 295 insertions, 16 deletions
diff --git a/libcpp/ChangeLog b/libcpp/ChangeLog index 3259c56f46e..5946b29dc56 100644 --- a/libcpp/ChangeLog +++ b/libcpp/ChangeLog @@ -1,3 +1,21 @@ +2009-10-19 Jakub Jelinek <jakub@redhat.com> + + * charset.c (cpp_init_iconv): Initialize utf8_cset_desc. + (_cpp_destroy_iconv): Destroy utf8_cset_desc, char16_cset_desc + and char32_cset_desc. + (converter_for_type): Handle CPP_UTF8STRING. + (cpp_interpret_string): Handle CPP_UTF8STRING and raw-strings. + * directives.c (get__Pragma_string): Handle CPP_UTF8STRING. + (parse_include): Reject raw strings. + * include/cpplib.h (CPP_UTF8STRING): New token type. + * internal.h (struct cpp_reader): Add utf8_cset_desc field. + * lex.c (lex_raw_string): New function. + (lex_string): Handle u8 string literals, call lex_raw_string + for raw string literals. + (_cpp_lex_direct): Call lex_string even for u8" and {,u,U,L,u8}R" + sequences. + * macro.c (stringify_arg): Handle CPP_UTF8STRING. + 2009-10-14 Jakub Jelinek <jakub@redhat.com> PR preprocessor/41543 diff --git a/libcpp/charset.c b/libcpp/charset.c index bd24ec2490d..837ccd77aab 100644 --- a/libcpp/charset.c +++ b/libcpp/charset.c @@ -721,6 +721,8 @@ cpp_init_iconv (cpp_reader *pfile) pfile->narrow_cset_desc = init_iconv_desc (pfile, ncset, SOURCE_CHARSET); pfile->narrow_cset_desc.width = CPP_OPTION (pfile, char_precision); + pfile->utf8_cset_desc = init_iconv_desc (pfile, "UTF-8", SOURCE_CHARSET); + pfile->utf8_cset_desc.width = CPP_OPTION (pfile, char_precision); pfile->char16_cset_desc = init_iconv_desc (pfile, be ? 
"UTF-16BE" : "UTF-16LE", SOURCE_CHARSET); @@ -741,6 +743,12 @@ _cpp_destroy_iconv (cpp_reader *pfile) { if (pfile->narrow_cset_desc.func == convert_using_iconv) iconv_close (pfile->narrow_cset_desc.cd); + if (pfile->utf8_cset_desc.func == convert_using_iconv) + iconv_close (pfile->utf8_cset_desc.cd); + if (pfile->char16_cset_desc.func == convert_using_iconv) + iconv_close (pfile->char16_cset_desc.cd); + if (pfile->char32_cset_desc.func == convert_using_iconv) + iconv_close (pfile->char32_cset_desc.cd); if (pfile->wide_cset_desc.func == convert_using_iconv) iconv_close (pfile->wide_cset_desc.cd); } @@ -1339,6 +1347,8 @@ converter_for_type (cpp_reader *pfile, enum cpp_ttype type) { default: return pfile->narrow_cset_desc; + case CPP_UTF8STRING: + return pfile->utf8_cset_desc; case CPP_CHAR16: case CPP_STRING16: return pfile->char16_cset_desc; @@ -1373,7 +1383,47 @@ cpp_interpret_string (cpp_reader *pfile, const cpp_string *from, size_t count, for (i = 0; i < count; i++) { p = from[i].text; - if (*p == 'L' || *p == 'u' || *p == 'U') p++; + if (*p == 'u') + { + if (*++p == '8') + p++; + } + else if (*p == 'L' || *p == 'U') p++; + if (*p == 'R') + { + const uchar *prefix; + + /* Skip over 'R"'. */ + p += 2; + prefix = p; + while (*p != '[') + p++; + p++; + limit = from[i].text + from[i].len; + if (limit >= p + (p - prefix) + 1) + limit -= (p - prefix) + 1; + + for (;;) + { + base = p; + while (p < limit && (*p != '\\' || (p[1] != 'u' && p[1] != 'U'))) + p++; + if (p > base) + { + /* We have a run of normal characters; these can be fed + directly to convert_cset. */ + if (!APPLY_CONVERSION (cvt, base, p - base, &tbuf)) + goto fail; + } + if (p == limit) + break; + + p = convert_ucn (pfile, p + 1, limit, &tbuf, cvt); + } + + continue; + } + p++; /* Skip leading quote. */ limit = from[i].text + from[i].len - 1; /* Skip trailing quote. 
*/ diff --git a/libcpp/directives.c b/libcpp/directives.c index f9dba539ea2..01bb599e266 100644 --- a/libcpp/directives.c +++ b/libcpp/directives.c @@ -697,7 +697,8 @@ parse_include (cpp_reader *pfile, int *pangle_brackets, /* Allow macro expansion. */ header = get_token_no_padding (pfile); *location = header->src_loc; - if (header->type == CPP_STRING || header->type == CPP_HEADER_NAME) + if ((header->type == CPP_STRING && header->val.str.text[0] != 'R') + || header->type == CPP_HEADER_NAME) { fname = XNEWVEC (char, header->val.str.len - 1); memcpy (fname, header->val.str.text + 1, header->val.str.len - 2); @@ -1537,7 +1538,8 @@ get__Pragma_string (cpp_reader *pfile) if (string->type == CPP_EOF) _cpp_backup_tokens (pfile, 1); if (string->type != CPP_STRING && string->type != CPP_WSTRING - && string->type != CPP_STRING32 && string->type != CPP_STRING16) + && string->type != CPP_STRING32 && string->type != CPP_STRING16 + && string->type != CPP_UTF8STRING) return NULL; paren = get_token_no_padding (pfile); diff --git a/libcpp/include/cpplib.h b/libcpp/include/cpplib.h index df04668dda0..e95f01a412a 100644 --- a/libcpp/include/cpplib.h +++ b/libcpp/include/cpplib.h @@ -127,6 +127,7 @@ struct _cpp_file; TK(WSTRING, LITERAL) /* L"string" */ \ TK(STRING16, LITERAL) /* u"string" */ \ TK(STRING32, LITERAL) /* U"string" */ \ + TK(UTF8STRING, LITERAL) /* u8"string" */ \ TK(OBJC_STRING, LITERAL) /* @"string" - Objective-C */ \ TK(HEADER_NAME, LITERAL) /* <stdio.h> in #include */ \ \ @@ -728,10 +729,10 @@ extern const unsigned char *cpp_macro_definition (cpp_reader *, extern void _cpp_backup_tokens (cpp_reader *, unsigned int); extern const cpp_token *cpp_peek_token (cpp_reader *, int); -/* Evaluate a CPP_CHAR or CPP_WCHAR token. */ +/* Evaluate a CPP_*CHAR* token. */ extern cppchar_t cpp_interpret_charconst (cpp_reader *, const cpp_token *, unsigned int *, int *); -/* Evaluate a vector of CPP_STRING or CPP_WSTRING tokens. */ +/* Evaluate a vector of CPP_*STRING* tokens. 
*/ extern bool cpp_interpret_string (cpp_reader *, const cpp_string *, size_t, cpp_string *, enum cpp_ttype); diff --git a/libcpp/internal.h b/libcpp/internal.h index 21e51c6553c..aaa231c2ab1 100644 --- a/libcpp/internal.h +++ b/libcpp/internal.h @@ -397,6 +397,10 @@ struct cpp_reader struct cset_converter narrow_cset_desc; /* Descriptor for converting from the source character set to the + UTF-8 execution character set. */ + struct cset_converter utf8_cset_desc; + + /* Descriptor for converting from the source character set to the UTF-16 execution character set. */ struct cset_converter char16_cset_desc; diff --git a/libcpp/lex.c b/libcpp/lex.c index bab14a4baa3..55bffa9a326 100644 --- a/libcpp/lex.c +++ b/libcpp/lex.c @@ -617,12 +617,192 @@ create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base, token->val.str.text = dest; } +/* Lexes a raw string. The stored string contains the spelling, including + double quotes, delimiter string, '[' and ']', any leading + 'L', 'u', 'U' or 'u8' and 'R' modifier. It returns the type of the + literal, or CPP_OTHER if it was not properly terminated. + + The spelling is NUL-terminated, but it is not guaranteed that this + is the first NUL since embedded NULs are preserved. */ + +static void +lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base, + const uchar *cur) +{ + source_location saw_NUL = 0; + const uchar *raw_prefix; + unsigned int raw_prefix_len = 0; + enum cpp_ttype type; + size_t total_len = 0; + _cpp_buff *first_buff = NULL, *last_buff = NULL; + + type = (*base == 'L' ? CPP_WSTRING : + *base == 'U' ? CPP_STRING32 : + *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16) + : CPP_STRING); + + raw_prefix = cur + 1; + while (raw_prefix_len < 16) + { + switch (raw_prefix[raw_prefix_len]) + { + case ' ': case '[': case ']': case '\t': + case '\v': case '\f': case '\n': default: + break; + /* Basic source charset except the above chars. 
*/ + case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': + case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': + case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': + case 's': case 't': case 'u': case 'v': case 'w': case 'x': + case 'y': case 'z': + case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': + case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': + case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': + case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': + case 'Y': case 'Z': + case '0': case '1': case '2': case '3': case '4': case '5': + case '6': case '7': case '8': case '9': + case '_': case '{': case '}': case '#': case '(': case ')': + case '<': case '>': case '%': case ':': case ';': case '.': + case '?': case '*': case '+': case '-': case '/': case '^': + case '&': case '|': case '~': case '!': case '=': case ',': + case '\\': case '"': case '\'': + raw_prefix_len++; + continue; + } + break; + } + + if (raw_prefix[raw_prefix_len] != '[') + { + int col = CPP_BUF_COLUMN (pfile->buffer, raw_prefix + raw_prefix_len) + + 1; + if (raw_prefix_len == 16) + cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, col, + "raw string delimiter longer than 16 characters"); + else + cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, col, + "invalid character '%c' in raw string delimiter", + (int) raw_prefix[raw_prefix_len]); + pfile->buffer->cur = raw_prefix - 1; + create_literal (pfile, token, base, raw_prefix - 1 - base, CPP_OTHER); + return; + } + + cur = raw_prefix + raw_prefix_len + 1; + for (;;) + { + cppchar_t c = *cur++; + + if (c == ']' + && strncmp ((const char *) cur, (const char *) raw_prefix, + raw_prefix_len) == 0 + && cur[raw_prefix_len] == '"') + { + cur += raw_prefix_len + 1; + break; + } + else if (c == '\n') + { + if (pfile->state.in_directive + || pfile->state.parsing_args + || pfile->state.in_deferred_pragma) + { + cur--; + type = CPP_OTHER; + cpp_error_with_line (pfile, 
CPP_DL_ERROR, token->src_loc, 0, + "unterminated raw string"); + break; + } + + /* raw strings allow embedded non-escaped newlines, which + complicates this routine a lot. */ + if (first_buff == NULL) + { + total_len = cur - base; + first_buff = last_buff = _cpp_get_buff (pfile, total_len); + memcpy (BUFF_FRONT (last_buff), base, total_len); + raw_prefix = BUFF_FRONT (last_buff) + (raw_prefix - base); + BUFF_FRONT (last_buff) += total_len; + } + else + { + size_t len = cur - base; + size_t cur_len = len > BUFF_ROOM (last_buff) + ? BUFF_ROOM (last_buff) : len; + + total_len += len; + memcpy (BUFF_FRONT (last_buff), base, cur_len); + BUFF_FRONT (last_buff) += cur_len; + if (len > cur_len) + { + last_buff = _cpp_append_extend_buff (pfile, last_buff, + len - cur_len); + memcpy (BUFF_FRONT (last_buff), base + cur_len, + len - cur_len); + BUFF_FRONT (last_buff) += len - cur_len; + } + } + + if (pfile->buffer->cur < pfile->buffer->rlimit) + CPP_INCREMENT_LINE (pfile, 0); + pfile->buffer->need_line = true; + + if (!_cpp_get_fresh_line (pfile)) + { + source_location src_loc = token->src_loc; + token->type = CPP_EOF; + /* Tell the compiler the line number of the EOF token. 
*/ + token->src_loc = pfile->line_table->highest_line; + token->flags = BOL; + if (first_buff != NULL) + _cpp_release_buff (pfile, first_buff); + cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0, + "unterminated raw string"); + return; + } + + cur = base = pfile->buffer->cur; + } + else if (c == '\0' && !saw_NUL) + LINEMAP_POSITION_FOR_COLUMN (saw_NUL, pfile->line_table, + CPP_BUF_COLUMN (pfile->buffer, cur)); + } + + if (saw_NUL && !pfile->state.skipping) + cpp_error_with_line (pfile, CPP_DL_WARNING, saw_NUL, 0, + "null character(s) preserved in literal"); + + pfile->buffer->cur = cur; + if (first_buff == NULL) + create_literal (pfile, token, base, cur - base, type); + else + { + uchar *dest = _cpp_unaligned_alloc (pfile, total_len + (cur - base) + 1); + + token->type = type; + token->val.str.len = total_len + (cur - base); + token->val.str.text = dest; + last_buff = first_buff; + while (last_buff != NULL) + { + memcpy (dest, last_buff->base, + BUFF_FRONT (last_buff) - last_buff->base); + dest += BUFF_FRONT (last_buff) - last_buff->base; + last_buff = last_buff->next; + } + _cpp_release_buff (pfile, first_buff); + memcpy (dest, base, cur - base); + dest[cur - base] = '\0'; + } +} + /* Lexes a string, character constant, or angle-bracketed header file name. The stored string contains the spelling, including opening - quote and leading any leading 'L', 'u' or 'U'. It returns the type - of the literal, or CPP_OTHER if it was not properly terminated, or - CPP_LESS for an unterminated header name which must be relexed as - normal tokens. + quote and any leading 'L', 'u', 'U' or 'u8' and optional + 'R' modifier. It returns the type of the literal, or CPP_OTHER + if it was not properly terminated, or CPP_LESS for an unterminated + header name which must be relexed as normal tokens. The spelling is NUL-terminated, but it is not guaranteed that this is the first NUL since embedded NULs are preserved. 
*/ @@ -636,12 +816,24 @@ lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base) cur = base; terminator = *cur++; - if (terminator == 'L' || terminator == 'u' || terminator == 'U') + if (terminator == 'L' || terminator == 'U') terminator = *cur++; - if (terminator == '\"') + else if (terminator == 'u') + { + terminator = *cur++; + if (terminator == '8') + terminator = *cur++; + } + if (terminator == 'R') + { + lex_raw_string (pfile, token, base, cur); + return; + } + if (terminator == '"') type = (*base == 'L' ? CPP_WSTRING : *base == 'U' ? CPP_STRING32 : - *base == 'u' ? CPP_STRING16 : CPP_STRING); + *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16) + : CPP_STRING); else if (terminator == '\'') type = (*base == 'L' ? CPP_WCHAR : *base == 'U' ? CPP_CHAR32 : @@ -1101,10 +1293,21 @@ _cpp_lex_direct (cpp_reader *pfile) case 'L': case 'u': case 'U': - /* 'L', 'u' or 'U' may introduce wide characters or strings. */ + case 'R': + /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters, + wide strings or raw strings. 
*/ if (c == 'L' || CPP_OPTION (pfile, uliterals)) { - if (*buffer->cur == '\'' || *buffer->cur == '"') + if ((*buffer->cur == '\'' && c != 'R') + || *buffer->cur == '"' + || (*buffer->cur == 'R' + && c != 'R' + && buffer->cur[1] == '"' + && CPP_OPTION (pfile, uliterals)) + || (*buffer->cur == '8' + && c == 'u' + && (buffer->cur[1] == '"' + || (buffer->cur[1] == 'R' && buffer->cur[2] == '"')))) { lex_string (pfile, result, buffer->cur - 1); break; @@ -1120,7 +1323,7 @@ _cpp_lex_direct (cpp_reader *pfile) case 'y': case 'z': case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': case 'H': case 'I': case 'J': case 'K': - case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': + case 'M': case 'N': case 'O': case 'P': case 'Q': case 'S': case 'T': case 'V': case 'W': case 'X': case 'Y': case 'Z': result->type = CPP_NAME; diff --git a/libcpp/macro.c b/libcpp/macro.c index f31805955c6..1d284cf9f8a 100644 --- a/libcpp/macro.c +++ b/libcpp/macro.c @@ -379,7 +379,8 @@ stringify_arg (cpp_reader *pfile, macro_arg *arg) escape_it = (token->type == CPP_STRING || token->type == CPP_CHAR || token->type == CPP_WSTRING || token->type == CPP_WCHAR || token->type == CPP_STRING32 || token->type == CPP_CHAR32 - || token->type == CPP_STRING16 || token->type == CPP_CHAR16); + || token->type == CPP_STRING16 || token->type == CPP_CHAR16 + || token->type == CPP_UTF8STRING); /* Room for each char being written in octal, initial space and final quote and NUL. */ |