diff options
author | geoffk <geoffk@138bc75d-0d04-0410-961f-82ee72b054a4> | 2005-03-15 00:36:33 +0000 |
---|---|---|
committer | geoffk <geoffk@138bc75d-0d04-0410-961f-82ee72b054a4> | 2005-03-15 00:36:33 +0000 |
commit | bce471493dbd17218e3acbc35ee809986ba855c0 (patch) | |
tree | d3cd092701f32b8f84eec7a95a4e244aafcf795e /libcpp/charset.c | |
parent | eddd9b2eb64bd2a51a3d297225a8a1cac5a8a055 (diff) | |
download | gcc-bce471493dbd17218e3acbc35ee809986ba855c0.tar.gz |
Index: gcc/ChangeLog
2005-03-14 Geoffrey Keating <geoffk@apple.com>
* doc/cppopts.texi (-fexec-charset): Add concept index entry.
(-fwide-exec-charset): Likewise.
(-finput-charset): Likewise.
* doc/invoke.texi (Warning Options): Document -Wnormalized=.
* c-opts.c (c_common_handle_option): Handle -Wnormalized=.
* c.opt (Wnormalized): New.
Index: libcpp/ChangeLog
2005-03-14 Geoffrey Keating <geoffk@apple.com>
* init.c (cpp_create_reader): Default warn_normalize to normalized_C.
* charset.c: Update for new format of ucnid.h.
(ucn_valid_in_identifier): Update for new format of ucnid.h.
Add NST parameter, and update it; update callers.
(cpp_valid_ucn): Add NST parameter, update callers. Replace abort
with cpp_error.
(convert_ucn): Pass normalize_state to cpp_valid_ucn.
* internal.h (struct normalize_state): New.
(INITIAL_NORMALIZE_STATE): New.
(NORMALIZE_STATE_RESULT): New.
(NORMALIZE_STATE_UPDATE_IDNUM): New.
(_cpp_valid_ucn): New.
* lex.c (warn_about_normalization): New.
(forms_identifier_p): Add normalize_state parameter, update callers.
(lex_identifier): Add normalize_state parameter, update callers. Keep
the state current.
(lex_number): Likewise.
(_cpp_lex_direct): Pass normalize_state to subroutines. Check
it with warn_about_normalization.
* makeucnid.c: New.
* ucnid.h: Replace.
* ucnid.pl: Remove.
* ucnid.tab: Make appropriate for input to makeucnid.c. Remove
comments about obsolete version of C++.
* include/cpplib.h (enum cpp_normalize_level): New.
(struct cpp_options): Add warn_normalize field.
Index: gcc/testsuite/ChangeLog
2005-03-14 Geoffrey Keating <geoffk@apple.com>
* gcc.dg/cpp/normalize-1.c: New.
* gcc.dg/cpp/normalize-2.c: New.
* gcc.dg/cpp/normalize-3.c: New.
* gcc.dg/cpp/normalize-4.c: New.
* gcc.dg/cpp/ucnid-4.c: New.
* gcc.dg/cpp/ucnid-5.c: New.
* g++.dg/cpp/normalize-1.C: New.
* g++.dg/cpp/ucnid-1.C: New.
git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@96459 138bc75d-0d04-0410-961f-82ee72b054a4
Diffstat (limited to 'libcpp/charset.c')
-rw-r--r-- | libcpp/charset.c | 130 |
1 files changed, 110 insertions, 20 deletions
diff --git a/libcpp/charset.c b/libcpp/charset.c index cd25f10a2e6..f028b371440 100644 --- a/libcpp/charset.c +++ b/libcpp/charset.c @@ -22,7 +22,6 @@ Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #include "system.h" #include "cpplib.h" #include "internal.h" -#include "ucnid.h" /* Character set handling for C-family languages. @@ -786,43 +785,128 @@ width_to_mask (size_t width) return ((size_t) 1 << width) - 1; } +/* A large table of unicode character information. */ +enum { + /* Valid in a C99 identifier? */ + C99 = 1, + /* Valid in a C99 identifier, but not as the first character? */ + DIG = 2, + /* Valid in a C++ identifier? */ + CXX = 4, + /* NFC representation is not valid in an identifier? */ + CID = 8, + /* Might be valid NFC form? */ + NFC = 16, + /* Might be valid NFKC form? */ + NKC = 32, + /* Certain preceding characters might make it not valid NFC/NKFC form? */ + CTX = 64 +}; + +static const struct { + /* Bitmap of flags above. */ + unsigned char flags; + /* Combining class of the character. */ + unsigned char combine; + /* Last character in the range described by this entry. */ + unsigned short end; +} ucnranges[] = { +#include "ucnid.h" +}; + /* Returns 1 if C is valid in an identifier, 2 if C is valid except at the start of an identifier, and 0 if C is not valid in an identifier. We assume C has already gone through the checks of - _cpp_valid_ucn. The algorithm is a simple binary search on the - table defined in cppucnid.h. */ + _cpp_valid_ucn. Also update NST for C if returning nonzero. The + algorithm is a simple binary search on the table defined in + ucnid.h. */ static int -ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c) +ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c, + struct normalize_state *nst) { int mn, mx, md; - mn = -1; - mx = ARRAY_SIZE (ucnranges); - while (mx - mn > 1) + if (c > 0xFFFF) + return 0; + + mn = 0; + mx = ARRAY_SIZE (ucnranges) - 1; + while (mx != mn) { md = (mn + mx) / 2; - if (c < ucnranges[md].lo) + if (c <= ucnranges[md].end) mx = md; - else if (c > ucnranges[md].hi) - mn = md; else - goto found; + mn = md + 1; } - return 0; - found: /* When -pedantic, we require the character to have been listed by the standard for the current language. Otherwise, we accept the union of the acceptable sets for C++98 and C99. */ + if (! (ucnranges[mn].flags & (C99 | CXX))) + return 0; + if (CPP_PEDANTIC (pfile) - && ((CPP_OPTION (pfile, c99) && !(ucnranges[md].flags & C99)) + && ((CPP_OPTION (pfile, c99) && !(ucnranges[mn].flags & C99)) || (CPP_OPTION (pfile, cplusplus) - && !(ucnranges[md].flags & CXX)))) + && !(ucnranges[mn].flags & CXX)))) return 0; + /* Update NST. */ + if (ucnranges[mn].combine != 0 && ucnranges[mn].combine < nst->prev_class) + nst->level = normalized_none; + else if (ucnranges[mn].flags & CTX) + { + bool safe; + cppchar_t p = nst->previous; + + /* Easy cases from Bengali, Oriya, Tamil, Jannada, and Malayalam. */ + if (c == 0x09BE) + safe = p != 0x09C7; /* Use 09CB instead of 09C7 09BE. */ + else if (c == 0x0B3E) + safe = p != 0x0B47; /* Use 0B4B instead of 0B47 0B3E. */ + else if (c == 0x0BBE) + safe = p != 0x0BC6 && p != 0x0BC7; /* Use 0BCA/0BCB instead. */ + else if (c == 0x0CC2) + safe = p != 0x0CC6; /* Use 0CCA instead of 0CC6 0CC2. */ + else if (c == 0x0D3E) + safe = p != 0x0D46 && p != 0x0D47; /* Use 0D4A/0D4B instead. */ + /* For Hangul, characters in the range AC00-D7A3 are NFC/NFKC, + and are combined algorithmically from a sequence of the form + 1100-1112 1161-1175 11A8-11C2 + (if the third is not present, it is treated as 11A7, which is not + really a valid character). + Unfortunately, C99 allows (only) the NFC form, but C++ allows + only the combining characters. */ + else if (c >= 0x1161 && c <= 0x1175) + safe = p < 0x1100 || p > 0x1112; + else if (c >= 0x11A8 && c <= 0x11C2) + safe = (p < 0xAC00 || p > 0xD7A3 || (p - 0xAC00) % 28 != 0); + else + { + /* Uh-oh, someone updated ucnid.h without updating this code. */ + cpp_error (pfile, CPP_DL_ICE, "Character %x might not be NFKC", c); + safe = true; + } + if (!safe && c < 0x1161) + nst->level = normalized_none; + else if (!safe) + nst->level = MAX (nst->level, normalized_identifier_C); + } + else if (ucnranges[mn].flags & NKC) + ; + else if (ucnranges[mn].flags & NFC) + nst->level = MAX (nst->level, normalized_C); + else if (ucnranges[mn].flags & CID) + nst->level = MAX (nst->level, normalized_identifier_C); + else + nst->level = normalized_none; + nst->previous = c; + nst->prev_class = ucnranges[mn].combine; + /* In C99, UCN digits may not begin identifiers. */ - if (CPP_OPTION (pfile, c99) && (ucnranges[md].flags & DIG)) + if (CPP_OPTION (pfile, c99) && (ucnranges[mn].flags & DIG)) return 2; return 1; @@ -853,7 +937,8 @@ ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c) cppchar_t _cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr, - const uchar *limit, int identifier_pos) + const uchar *limit, int identifier_pos, + struct normalize_state *nst) { cppchar_t result, c; unsigned int length; @@ -873,7 +958,10 @@ _cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr, else if (str[-1] == 'U') length = 8; else - abort(); + { + cpp_error (pfile, CPP_DL_ICE, "In _cpp_valid_ucn but not a UCN"); + length = 4; + } result = 0; do @@ -915,10 +1003,11 @@ _cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr, CPP_OPTION (pfile, warn_dollars) = 0; cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number"); } + NORMALIZE_STATE_UPDATE_IDNUM (nst); } else if (identifier_pos) { - int validity = ucn_valid_in_identifier (pfile, result); + int validity = ucn_valid_in_identifier (pfile, result, nst); if (validity == 0) cpp_error (pfile, CPP_DL_ERROR, @@ -950,9 +1039,10 @@ convert_ucn (cpp_reader *pfile, const uchar *from, const uchar *limit, int rval; struct cset_converter cvt = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc; + struct normalize_state nst = INITIAL_NORMALIZE_STATE; from++; /* Skip u/U. */ - ucn = _cpp_valid_ucn (pfile, &from, limit, 0); + ucn = _cpp_valid_ucn (pfile, &from, limit, 0, &nst); rval = one_cppchar_to_utf8 (ucn, &bufp, &bytesleft); if (rval) |