1 files changed, 54 insertions, 1 deletions
diff --git a/gcc/go/gofrontend/lex.cc b/gcc/go/gofrontend/lex.cc
index 5b7ce6869e6..6add84ed1f7 100644
--- a/gcc/go/gofrontend/lex.cc
+++ b/gcc/go/gofrontend/lex.cc
@@ -722,7 +722,16 @@ Lex::next_token()
 		unsigned int ci;
 		bool issued_error;
 		this->lineoff_ = p - this->linebuf_;
-		this->advance_one_utf8_char(p, &ci, &issued_error);
+		const char *pnext = this->advance_one_utf8_char(p, &ci,
+								&issued_error);
+
+		// Skip a BOM; a misplaced one was already diagnosed.
+		if (ci == 0xfeff)
+		  {
+		    p = pnext;
+		    break;
+		  }
+
 		if (Lex::is_unicode_letter(ci))
 		  return this->gather_identifier();
 
@@ -831,6 +840,14 @@ Lex::advance_one_utf8_char(const char* p, unsigned int* value,
       *issued_error = true;
       return p + 1;
     }
+
+  // Report an error for a byte order mark, except at start of file.
+  if (*value == 0xfeff && (this->lineno_ != 1 || this->lineoff_ != 0))
+    {
+      error_at(this->location(), "Unicode (UTF-8) BOM in middle of file");
+      *issued_error = true;
+    }
+
   return p + adv;
 }
 
@@ -1295,6 +1312,12 @@ Lex::append_char(unsigned int v, bool is_character, std::string* str,
 	  // Turn it into the "replacement character".
 	  v = 0xfffd;
 	}
+      if (v >= 0xd800 && v < 0xe000)
+	{
+	  warning_at(location, 0,
+		     "unicode code point 0x%x is invalid surrogate pair", v);
+	  v = 0xfffd;
+	}
       if (v <= 0xffff)
 	{
 	  buf[0] = 0xe0 + (v >> 12);
@@ -1705,6 +1728,27 @@ struct Unicode_range
   unsigned int stride;
 };
 
+// Code points treated as whitespace, consulted by Lex::is_unicode_space:
+// Unicode "Space" code points, "C" locale whitespace, the "next line"
+// control character (0085), the line separator (2028), the paragraph
+// separator (2029), and the "zero-width non-break space" (feff).
+
+static const Unicode_range unicode_space[] =
+{
+  { 0x0009, 0x000d, 1 },  // "C" locale whitespace: tab through CR
+  { 0x0020, 0x0020, 1 },  // space
+  { 0x0085, 0x0085, 1 },  // next line (NEL)
+  { 0x00a0, 0x00a0, 1 },  // no-break space
+  { 0x1680, 0x1680, 1 },  // Ogham space mark
+  { 0x180e, 0x180e, 1 },  // Mongolian vowel separator
+  { 0x2000, 0x200a, 1 },  // en quad through hair space
+  { 0x2028, 0x2029, 1 },  // line separator, paragraph separator
+  { 0x202f, 0x202f, 1 },  // narrow no-break space
+  { 0x205f, 0x205f, 1 },  // medium mathematical space
+  { 0x3000, 0x3000, 1 },  // ideographic space
+  { 0xfeff, 0xfeff, 1 },  // zero-width no-break space (byte order mark)
+};
+
 // A table of Unicode digits--Unicode code points classified as
 // "Digit".
 
@@ -2294,6 +2338,15 @@ Lex::is_in_unicode_range(unsigned int c, const Unicode_range* ranges,
     }
 }
 
+// Return whether C is one of the code points listed in unicode_space.
+
+bool
+Lex::is_unicode_space(unsigned int c)
+{
+  return Lex::is_in_unicode_range(c, unicode_space,
+				  ARRAY_SIZE(unicode_space));
+}
+
 // Return whether C is a Unicode digit--a Unicode code point
 // classified as "Digit".