diff options
Diffstat (limited to 'gcc/go/gofrontend/lex.cc')
-rw-r--r-- | gcc/go/gofrontend/lex.cc | 55 |
1 files changed, 54 insertions, 1 deletions
diff --git a/gcc/go/gofrontend/lex.cc b/gcc/go/gofrontend/lex.cc index 5b7ce6869e6..6add84ed1f7 100644 --- a/gcc/go/gofrontend/lex.cc +++ b/gcc/go/gofrontend/lex.cc @@ -722,7 +722,16 @@ Lex::next_token() unsigned int ci; bool issued_error; this->lineoff_ = p - this->linebuf_; - this->advance_one_utf8_char(p, &ci, &issued_error); + const char *pnext = this->advance_one_utf8_char(p, &ci, + &issued_error); + + // Ignore byte order mark at start of file. + if (ci == 0xfeff) + { + p = pnext; + break; + } + if (Lex::is_unicode_letter(ci)) return this->gather_identifier(); @@ -831,6 +840,14 @@ Lex::advance_one_utf8_char(const char* p, unsigned int* value, *issued_error = true; return p + 1; } + + // Warn about byte order mark, except at start of file. + if (*value == 0xfeff && (this->lineno_ != 1 || this->lineoff_ != 0)) + { + error_at(this->location(), "Unicode (UTF-8) BOM in middle of file"); + *issued_error = true; + } + return p + adv; } @@ -1295,6 +1312,12 @@ Lex::append_char(unsigned int v, bool is_character, std::string* str, // Turn it into the "replacement character". v = 0xfffd; } + if (v >= 0xd800 && v < 0xe000) + { + warning_at(location, 0, + "unicode code point 0x%x is invalid surrogate pair", v); + v = 0xfffd; + } if (v <= 0xffff) { buf[0] = 0xe0 + (v >> 12); @@ -1705,6 +1728,27 @@ struct Unicode_range unsigned int stride; }; +// A table of whitespace characters--Unicode code points classified as +// "Space", "C" locale whitespace characters, the "next line" control +// character (0085), the line separator (2028), the paragraph +// separator (2029), and the "zero-width non-break space" (feff). + +static const Unicode_range unicode_space[] = +{ + { 0x0009, 0x000d, 1 }, + { 0x0020, 0x0020, 1 }, + { 0x0085, 0x0085, 1 }, + { 0x00a0, 0x00a0, 1 }, + { 0x1680, 0x1680, 1 }, + { 0x180e, 0x180e, 1 }, + { 0x2000, 0x200a, 1 }, + { 0x2028, 0x2029, 1 }, + { 0x202f, 0x202f, 1 }, + { 0x205f, 0x205f, 1 }, + { 0x3000, 0x3000, 1 }, + { 0xfeff, 0xfeff, 1 }, +}; + // A table of Unicode digits--Unicode code points classified as // "Digit". @@ -2294,6 +2338,15 @@ Lex::is_in_unicode_range(unsigned int c, const Unicode_range* ranges, } } +// Return whether C is a space character. + +bool +Lex::is_unicode_space(unsigned int c) +{ + return Lex::is_in_unicode_range(c, unicode_space, + ARRAY_SIZE(unicode_space)); +} + // Return whether C is a Unicode digit--a Unicode code point // classified as "Digit". |