summary | refs | log | tree | commit | diff
path: root/libstdc++-v3
diff options
context:
space:
mode:
author: redi <redi@138bc75d-0d04-0410-961f-82ee72b054a4> 2017-03-16 15:27:51 +0000
committer: redi <redi@138bc75d-0d04-0410-961f-82ee72b054a4> 2017-03-16 15:27:51 +0000
commit: 8928efc42c0b386b88c91704cadecf4fd83b869b (patch)
tree: de8fd5a5cb199e551d99a1fede15c60f699ed6d8 /libstdc++-v3
parent: 04e802e9524420a06267f472c726801231d4be9d (diff)
download: gcc-8928efc42c0b386b88c91704cadecf4fd83b869b.tar.gz
PR libstdc++/79980 fix BOM detection, maxcode checks, UCS2 handling
PR libstdc++/79980
* include/bits/locale_conv.h (__do_str_codecvt): Set __count on error path.
* src/c++11/codecvt.cc (operator&=, operator|=, operator~): Overloads for manipulating codecvt_mode values.
(read_utf16_bom): Compare input to BOM constants instead of integral constants that depend on endianness. Take mode parameter by reference and adjust it, to distinguish between no BOM present and UTF-16BE BOM present.
(ucs4_in, ucs2_span, ucs4_span): Adjust calls to read_utf16_bom.
(surrogates): New enumeration type.
(utf16_in, utf16_out): Add surrogates parameter to choose between UTF-16 and UCS2 behaviour.
(utf16_span, ucs2_span): Use std::min not std::max.
(ucs2_out): Use std::min not std::max. Disallow surrogate pairs.
(ucs2_in): Likewise. Adjust calls to read_utf16_bom.
* testsuite/22_locale/codecvt/codecvt_utf16/79980.cc: New test.
* testsuite/22_locale/codecvt/codecvt_utf8/79980.cc: New test.
git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@246200 138bc75d-0d04-0410-961f-82ee72b054a4
Diffstat (limited to 'libstdc++-v3')
-rw-r--r-- libstdc++-v3/ChangeLog 19
-rw-r--r-- libstdc++-v3/include/bits/locale_conv.h 5
-rw-r--r-- libstdc++-v3/src/c++11/codecvt.cc 94
-rw-r--r-- libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/79980.cc 115
-rw-r--r-- libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8/79980.cc 94
5 files changed, 296 insertions, 31 deletions
diff --git a/libstdc++-v3/ChangeLog b/libstdc++-v3/ChangeLog
index 98735ca0c38..83f74efa390 100644
--- a/libstdc++-v3/ChangeLog
+++ b/libstdc++-v3/ChangeLog
@@ -1,5 +1,24 @@
2017-03-16 Jonathan Wakely <jwakely@redhat.com>
+ PR libstdc++/79980
+ * include/bits/locale_conv.h (__do_str_codecvt): Set __count on
+ error path.
+ * src/c++11/codecvt.cc (operator&=, operator|=, operator~): Overloads
+ for manipulating codecvt_mode values.
+ (read_utf16_bom): Compare input to BOM constants instead of integral
+ constants that depend on endianness. Take mode parameter by
+ reference and adjust it, to distinguish between no BOM present and
+ UTF-16BE BOM present.
+ (ucs4_in, ucs2_span, ucs4_span): Adjust calls to read_utf16_bom.
+ (surrogates): New enumeration type.
+ (utf16_in, utf16_out): Add surrogates parameter to choose between
+ UTF-16 and UCS2 behaviour.
+ (utf16_span, ucs2_span): Use std::min not std::max.
+ (ucs2_out): Use std::min not std::max. Disallow surrogate pairs.
+ (ucs2_in): Likewise. Adjust calls to read_utf16_bom.
+ * testsuite/22_locale/codecvt/codecvt_utf16/79980.cc: New test.
+ * testsuite/22_locale/codecvt/codecvt_utf8/79980.cc: New test.
+
PR libstdc++/79511
* src/c++11/codecvt.cc (write_utf16_code_point): Don't write 0xffff
as a surrogate pair.
diff --git a/libstdc++-v3/include/bits/locale_conv.h b/libstdc++-v3/include/bits/locale_conv.h
index cd8f1466480..9b952d45165 100644
--- a/libstdc++-v3/include/bits/locale_conv.h
+++ b/libstdc++-v3/include/bits/locale_conv.h
@@ -81,7 +81,10 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
&& (__outstr.size() - __outchars) < __maxlen);
if (__result == codecvt_base::error)
- return false;
+ {
+ __count = __next - __first;
+ return false;
+ }
if (__result == codecvt_base::noconv)
{
diff --git a/libstdc++-v3/src/c++11/codecvt.cc b/libstdc++-v3/src/c++11/codecvt.cc
index 9b63e2b79f9..a50804cc164 100644
--- a/libstdc++-v3/src/c++11/codecvt.cc
+++ b/libstdc++-v3/src/c++11/codecvt.cc
@@ -24,13 +24,27 @@
#include <codecvt>
#include <cstring> // std::memcpy, std::memcmp
-#include <bits/stl_algobase.h> // std::max
+#include <bits/stl_algobase.h> // std::min
#ifdef _GLIBCXX_USE_C99_STDINT_TR1
namespace std _GLIBCXX_VISIBILITY(default)
{
_GLIBCXX_BEGIN_NAMESPACE_VERSION
+ // The standard doesn't define these operators, which is annoying.
+ static underlying_type<codecvt_mode>::type
+ to_integer(codecvt_mode m)
+ { return static_cast<mode_t>(m); }
+
+ static codecvt_mode& operator&=(codecvt_mode& m, codecvt_mode n)
+ { return m = codecvt_mode(to_integer(m) & to_integer(n)); }
+
+ static codecvt_mode& operator|=(codecvt_mode& m, codecvt_mode n)
+ { return m = codecvt_mode(to_integer(m) | to_integer(n)); }
+
+ static codecvt_mode operator~(codecvt_mode m)
+ { return codecvt_mode(~to_integer(m)); }
+
namespace
{
// Largest code point that fits in a single UTF-16 code unit.
@@ -117,22 +131,26 @@ namespace
read_bom(from, utf8_bom);
}
- // If consume_header is set in mode update from.next to after any BOM.
- // Return little_endian iff the UTF-16LE BOM was present.
- codecvt_mode
- read_utf16_bom(range<const char16_t>& from, codecvt_mode mode)
+ // If consume_header is not set in mode, no effects.
+ // Otherwise, if *from.next is a UTF-16 BOM increment from.next and then:
+ // - if the UTF-16BE BOM was found unset little_endian in mode, or
+ // - if the UTF-16LE BOM was found set little_endian in mode.
+ void
+ read_utf16_bom(range<const char16_t>& from, codecvt_mode& mode)
{
if (mode & consume_header && from.size())
{
- if (*from.next == 0xFEFF)
- ++from.next;
- else if (*from.next == 0xFFFE)
+ if (!memcmp(from.next, utf16_bom, 2))
+ {
+ ++from.next;
+ mode &= ~little_endian;
+ }
+ else if (!memcmp(from.next, utf16le_bom, 2))
{
++from.next;
- return little_endian;
+ mode |= little_endian;
}
}
- return {};
}
// Read a codepoint from a UTF-8 multibyte sequence.
@@ -380,8 +398,7 @@ namespace
ucs4_in(range<const char16_t>& from, range<char32_t>& to,
unsigned long maxcode = max_code_point, codecvt_mode mode = {})
{
- if (read_utf16_bom(from, mode) == little_endian)
- mode = codecvt_mode(mode & little_endian);
+ read_utf16_bom(from, mode);
while (from.size() && to.size())
{
const char32_t codepoint = read_utf16_code_point(from, maxcode, mode);
@@ -413,11 +430,15 @@ namespace
return codecvt_base::ok;
}
- // utf8 -> utf16
+ // Flag indicating whether to process UTF-16 or UCS2
+ enum class surrogates { allowed, disallowed };
+
+ // utf8 -> utf16 (or utf8 -> ucs2 if s == surrogates::disallowed)
template<typename C>
codecvt_base::result
utf16_in(range<const char>& from, range<C>& to,
- unsigned long maxcode = max_code_point, codecvt_mode mode = {})
+ unsigned long maxcode = max_code_point, codecvt_mode mode = {},
+ surrogates s = surrogates::allowed)
{
read_utf8_bom(from, mode);
while (from.size() && to.size())
@@ -425,7 +446,12 @@ namespace
const char* const first = from.next;
const char32_t codepoint = read_utf8_code_point(from, maxcode);
if (codepoint == incomplete_mb_character)
- return codecvt_base::partial;
+ {
+ if (s == surrogates::allowed)
+ return codecvt_base::partial;
+ else
+ return codecvt_base::error; // No surrogates in UCS2
+ }
if (codepoint > maxcode)
return codecvt_base::error;
if (!write_utf16_code_point(to, codepoint, mode))
@@ -437,11 +463,12 @@ namespace
return codecvt_base::ok;
}
- // utf16 -> utf8
+ // utf16 -> utf8 (or ucs2 -> utf8 if s == surrogates::disallowed)
template<typename C>
codecvt_base::result
utf16_out(range<const C>& from, range<char>& to,
- unsigned long maxcode = max_code_point, codecvt_mode mode = {})
+ unsigned long maxcode = max_code_point, codecvt_mode mode = {},
+ surrogates s = surrogates::allowed)
{
if (!write_utf8_bom(to, mode))
return codecvt_base::partial;
@@ -451,6 +478,9 @@ namespace
int inc = 1;
if (is_high_surrogate(c))
{
+ if (s == surrogates::disallowed)
+ return codecvt_base::error; // No surrogates in UCS-2
+
if (from.size() < 2)
return codecvt_base::ok; // stop converting at this point
@@ -492,7 +522,7 @@ namespace
++count;
}
if (count+1 == max) // take one more character if it fits in a single unit
- read_utf8_code_point(from, std::max(max_single_utf16_unit, maxcode));
+ read_utf8_code_point(from, std::min(max_single_utf16_unit, maxcode));
return from.next;
}
@@ -501,7 +531,9 @@ namespace
ucs2_in(range<const char>& from, range<char16_t>& to,
char32_t maxcode = max_code_point, codecvt_mode mode = {})
{
- return utf16_in(from, to, std::max(max_single_utf16_unit, maxcode), mode);
+ // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
+ maxcode = std::min(max_single_utf16_unit, maxcode);
+ return utf16_in(from, to, maxcode, mode, surrogates::disallowed);
}
// ucs2 -> utf8
@@ -509,7 +541,9 @@ namespace
ucs2_out(range<const char16_t>& from, range<char>& to,
char32_t maxcode = max_code_point, codecvt_mode mode = {})
{
- return utf16_out(from, to, std::max(max_single_utf16_unit, maxcode), mode);
+ // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
+ maxcode = std::min(max_single_utf16_unit, maxcode);
+ return utf16_out(from, to, maxcode, mode, surrogates::disallowed);
}
// ucs2 -> utf16
@@ -537,14 +571,14 @@ namespace
ucs2_in(range<const char16_t>& from, range<char16_t>& to,
char32_t maxcode = max_code_point, codecvt_mode mode = {})
{
- if (read_utf16_bom(from, mode) == little_endian)
- mode = codecvt_mode(mode & little_endian);
- maxcode = std::max(max_single_utf16_unit, maxcode);
+ read_utf16_bom(from, mode);
+ // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
+ maxcode = std::min(max_single_utf16_unit, maxcode);
while (from.size() && to.size())
{
const char32_t c = read_utf16_code_point(from, maxcode, mode);
if (c == incomplete_mb_character)
- return codecvt_base::partial;
+ return codecvt_base::error; // UCS-2 only supports single units.
if (c > maxcode)
return codecvt_base::error;
*to.next++ = c;
@@ -557,9 +591,9 @@ namespace
char32_t maxcode, codecvt_mode mode)
{
range<const char16_t> from{ begin, end };
- if (read_utf16_bom(from, mode) == little_endian)
- mode = codecvt_mode(mode & little_endian);
- maxcode = std::max(max_single_utf16_unit, maxcode);
+ read_utf16_bom(from, mode);
+ // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
+ maxcode = std::min(max_single_utf16_unit, maxcode);
char32_t c = 0;
while (max-- && c <= maxcode)
c = read_utf16_code_point(from, maxcode, mode);
@@ -572,7 +606,8 @@ namespace
{
range<const char> from{ begin, end };
read_utf8_bom(from, mode);
- maxcode = std::max(max_single_utf16_unit, maxcode);
+ // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
+ maxcode = std::min(max_single_utf16_unit, maxcode);
char32_t c = 0;
while (max-- && c <= maxcode)
c = read_utf8_code_point(from, maxcode);
@@ -598,8 +633,7 @@ namespace
char32_t maxcode = max_code_point, codecvt_mode mode = {})
{
range<const char16_t> from{ begin, end };
- if (read_utf16_bom(from, mode) == little_endian)
- mode = codecvt_mode(mode & little_endian);
+ read_utf16_bom(from, mode);
char32_t c = 0;
while (max-- && c <= maxcode)
c = read_utf16_code_point(from, maxcode, mode);
diff --git a/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/79980.cc b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/79980.cc
new file mode 100644
index 00000000000..9383818d86b
--- /dev/null
+++ b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/79980.cc
@@ -0,0 +1,115 @@
+// Copyright (C) 2017 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library. This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License along
+// with this library; see the file COPYING3. If not see
+// <http://www.gnu.org/licenses/>.
+
+// { dg-do run { target c++11 } }
+
+#include <locale>
+#include <codecvt>
+#include <testsuite_hooks.h>
+
+// PR libstdc++/79980
+
+constexpr std::codecvt_mode mode(std::codecvt_mode m)
+{ return static_cast<std::codecvt_mode>(m | std::consume_header); }
+
+template<typename WCh, unsigned long Max = 0x10FFFF,
+ std::codecvt_mode Mode = std::consume_header>
+ using Conv
+ = std::wstring_convert<std::codecvt_utf16<WCh, Max, mode(Mode)>, WCh>;
+
+void
+test01()
+{
+ const char src[] = "\xFE\xFF\xAB\xCD";
+ Conv<char16_t> conv;
+ auto dst = conv.from_bytes(src, src+4);
+ VERIFY( dst[0] == 0xabcd );
+}
+
+void
+test02()
+{
+ const char src[] = "\xFF\xFE\xAB\xCD";
+ Conv<char16_t> conv;
+ auto dst = conv.from_bytes(src, src+4);
+ VERIFY( dst[0] == 0xcdab );
+}
+
+void
+test03()
+{
+ const char src[] = "\xFE\xFF\xAB\xCD";
+ Conv<char16_t, 0x10FFFF, std::little_endian> conv;
+ auto dst = conv.from_bytes(src, src+4);
+ VERIFY( dst[0] == 0xabcd );
+}
+
+void
+test04()
+{
+ const char src[] = "\xFF\xFE\xAB\xCD";
+ Conv<char16_t, 0x10FFFF, std::little_endian> conv;
+ auto dst = conv.from_bytes(src, src+4);
+ VERIFY( dst[0] == 0xcdab );
+}
+
+void
+test05()
+{
+ const char src[] = "\0\x61\xAB\xCD"; // character greater than 0x00FF
+ Conv<char16_t, 0xFF> conv("to_bytes failed", u"from_bytes failed");
+ std::u16string result = conv.from_bytes(src, src+4);
+ VERIFY( result == u"from_bytes failed" );
+ VERIFY( conv.converted() == 2 );
+}
+
+void
+test06()
+{
+ const char src[] = "\0\x61\xAB\xCD";
+ Conv<char16_t> conv("to_bytes failed", u"from_bytes failed");
+ std::u16string result = conv.from_bytes(src, src+3); // incomplete character
+ VERIFY( result == u"from_bytes failed" );
+ VERIFY( conv.converted() == 2 );
+}
+
+void
+test07()
+{
+ Conv<char16_t> conv("to_bytes failed", u"from_bytes failed");
+ // ucs2 to utf-16 conversion should fail on invalid ucs2 input:
+ std::u16string utf16 = u"1234\U00001111\U0001ffff";
+ auto out = conv.to_bytes(utf16);
+ VERIFY( out == "to_bytes failed" );
+ VERIFY( conv.converted() == 5 );
+
+ // And should also fail on incomplete surrogate pair (not return partial):
+ out = conv.to_bytes(utf16.substr(0, utf16.size()-1));
+ VERIFY( out == "to_bytes failed" );
+ VERIFY( conv.converted() == 5 );
+}
+
+int main()
+{
+ test01();
+ test02();
+ test03();
+ test04();
+ test05();
+ test06();
+ test07();
+}
diff --git a/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8/79980.cc b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8/79980.cc
new file mode 100644
index 00000000000..1251acb85be
--- /dev/null
+++ b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8/79980.cc
@@ -0,0 +1,94 @@
+// Copyright (C) 2017 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library. This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License along
+// with this library; see the file COPYING3. If not see
+// <http://www.gnu.org/licenses/>.
+
+// { dg-do run { target c++11 } }
+
+#include <codecvt>
+#include <locale>
+#include <string>
+#include <testsuite_hooks.h>
+
+using std::wstring_convert;
+using std::codecvt_utf8;
+
+void
+test01()
+{
+ std::string src = u8"1234\U00001111\U0001ffff";
+ wstring_convert<codecvt_utf8<char16_t>, char16_t> c("bad", u"BAD");
+
+ // utf-8 to ucs2 conversion should fail on character outside BMP
+ auto ucs2 = c.from_bytes(src);
+ VERIFY( ucs2 == u"BAD" );
+ VERIFY( c.converted() == 7 );
+
+ // ucs2 to utf-8 conversion should fail on invalid ucs2 input:
+ std::u16string utf16 = u"1234\U00001111\U0001ffff";
+ auto out = c.to_bytes(utf16);
+ VERIFY( out == "bad" );
+ VERIFY( c.converted() == 5 );
+
+ // And should also fail on incomplete surrogate pair (not return partial):
+ out = c.to_bytes(utf16.substr(0, utf16.size()-1));
+ VERIFY( out == "bad" );
+ VERIFY( c.converted() == 5 );
+}
+
+void
+test02()
+{
+ std::string src = u8"1234\U00001111\U0001ffff";
+ wstring_convert<codecvt_utf8<char16_t, 0x1000>, char16_t> c("bad", u"BAD");
+
+ // utf-8 to ucs2 conversion should fail on character above Maxcode=0x1000
+ auto ucs2 = c.from_bytes(src);
+ VERIFY( ucs2 == u"BAD" );
+ VERIFY( c.converted() == 4 );
+}
+
+void
+test03()
+{
+ std::string src = u8"1234\U00001111\U0001ffff";
+ wstring_convert<codecvt_utf8<char32_t, 0x10000>, char32_t> c("bad", U"BAD");
+
+ // utf-8 to ucs4 conversion should fail on character above Maxcode=0x10000
+ auto ucs4 = c.from_bytes(src);
+ VERIFY( ucs4 == U"BAD" );
+ VERIFY( c.converted() == 7 );
+}
+
+void
+test04()
+{
+ std::string src = u8"1234\U00001111\U0001ffff";
+ wstring_convert<codecvt_utf8<char32_t, 0x1000>, char32_t> c("bad", U"BAD");
+
+ // utf-8 to ucs4 conversion should fail on character above Maxcode=0x1000
+ auto ucs4 = c.from_bytes(src);
+ VERIFY( ucs4 == U"BAD" );
+ VERIFY( c.converted() == 4 );
+}
+
+int
+main()
+{
+ test01();
+ test02();
+ test03();
+ test04();
+}