summaryrefslogtreecommitdiff
path: root/gcc/go/gofrontend/lex.cc
diff options
context:
space:
mode:
Diffstat (limited to 'gcc/go/gofrontend/lex.cc')
-rw-r--r--gcc/go/gofrontend/lex.cc55
1 files changed, 54 insertions, 1 deletions
diff --git a/gcc/go/gofrontend/lex.cc b/gcc/go/gofrontend/lex.cc
index 5b7ce6869e6..6add84ed1f7 100644
--- a/gcc/go/gofrontend/lex.cc
+++ b/gcc/go/gofrontend/lex.cc
@@ -722,7 +722,16 @@ Lex::next_token()
unsigned int ci;
bool issued_error;
this->lineoff_ = p - this->linebuf_;
- this->advance_one_utf8_char(p, &ci, &issued_error);
+ const char *pnext = this->advance_one_utf8_char(p, &ci,
+ &issued_error);
+
+ // Ignore byte order mark at start of file.
+ if (ci == 0xfeff)
+ {
+ p = pnext;
+ break;
+ }
+
if (Lex::is_unicode_letter(ci))
return this->gather_identifier();
@@ -831,6 +840,14 @@ Lex::advance_one_utf8_char(const char* p, unsigned int* value,
*issued_error = true;
return p + 1;
}
+
+ // Report an error for a byte order mark, except at start of file.
+ if (*value == 0xfeff && (this->lineno_ != 1 || this->lineoff_ != 0))
+ {
+ error_at(this->location(), "Unicode (UTF-8) BOM in middle of file");
+ *issued_error = true;
+ }
+
return p + adv;
}
@@ -1295,6 +1312,12 @@ Lex::append_char(unsigned int v, bool is_character, std::string* str,
// Turn it into the "replacement character".
v = 0xfffd;
}
+ if (v >= 0xd800 && v < 0xe000)
+ {
+ warning_at(location, 0,
+ "unicode code point 0x%x is invalid surrogate pair", v);
+ v = 0xfffd;
+ }
if (v <= 0xffff)
{
buf[0] = 0xe0 + (v >> 12);
@@ -1705,6 +1728,27 @@ struct Unicode_range
unsigned int stride;
};
+// A table of whitespace characters--Unicode code points classified as
+// "Space", the "C" locale whitespace characters, the "next line" control
+// character (0085), the line separator (2028), the paragraph separator
+// (2029), and the "zero-width non-break space" (feff, also the BOM).
+
+static const Unicode_range unicode_space[] =
+{
+ { 0x0009, 0x000d, 1 }, // TAB, LF, VT, FF, CR ("C" locale whitespace)
+ { 0x0020, 0x0020, 1 }, // SPACE
+ { 0x0085, 0x0085, 1 }, // NEXT LINE (NEL) control character
+ { 0x00a0, 0x00a0, 1 }, // NO-BREAK SPACE
+ { 0x1680, 0x1680, 1 }, // OGHAM SPACE MARK
+ { 0x180e, 0x180e, 1 }, // MONGOLIAN VOWEL SEPARATOR
+ { 0x2000, 0x200a, 1 }, // EN QUAD through HAIR SPACE
+ { 0x2028, 0x2029, 1 }, // LINE SEPARATOR, PARAGRAPH SEPARATOR
+ { 0x202f, 0x202f, 1 }, // NARROW NO-BREAK SPACE
+ { 0x205f, 0x205f, 1 }, // MEDIUM MATHEMATICAL SPACE
+ { 0x3000, 0x3000, 1 }, // IDEOGRAPHIC SPACE
+ { 0xfeff, 0xfeff, 1 }, // ZERO WIDTH NO-BREAK SPACE (byte order mark)
+};
+
// A table of Unicode digits--Unicode code points classified as
// "Digit".
@@ -2294,6 +2338,15 @@ Lex::is_in_unicode_range(unsigned int c, const Unicode_range* ranges,
}
}
+// Return whether code point C falls in a range of the unicode_space table.
+
+bool
+Lex::is_unicode_space(unsigned int c)
+{
+ return Lex::is_in_unicode_range(c, unicode_space,
+ ARRAY_SIZE(unicode_space));
+}
+
// Return whether C is a Unicode digit--a Unicode code point
// classified as "Digit".