diff options
author | Chris Loer <chris.loer@gmail.com> | 2018-06-27 15:01:54 -0700 |
---|---|---|
committer | Chris Loer <chris.loer@mapbox.com> | 2018-07-03 10:03:05 -0700 |
commit | b9d3ecc990ccac102bcfde0e848a4f31b739ad54 (patch) | |
tree | b9bee9a41cb37fe754410c37eb3ff256ad7e4e94 /src | |
parent | 251f5e605f1f1bb3e56115f8cef66cacabfd9d83 (diff) | |
download | qtlocation-mapboxgl-b9d3ecc990ccac102bcfde0e848a4f31b739ad54.tar.gz |
[core] Introduce LanguageTag for parsing BCP 47 tags
Diffstat (limited to 'src')
-rw-r--r-- | src/mbgl/text/language_tag.cpp | 237 | ||||
-rw-r--r-- | src/mbgl/text/language_tag.hpp | 44 |
2 files changed, 281 insertions, 0 deletions
diff --git a/src/mbgl/text/language_tag.cpp b/src/mbgl/text/language_tag.cpp
new file mode 100644
index 0000000000..4c2712f103
--- /dev/null
+++ b/src/mbgl/text/language_tag.cpp
@@ -0,0 +1,237 @@
+#include <mbgl/text/language_tag.hpp>
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunknown-pragmas"
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wshadow"
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wshorten-64-to-32"
+#pragma clang diagnostic ignored "-Wtautological-constant-compare"
+#include <boost/spirit/include/qi.hpp>
+#include <boost/spirit/include/phoenix_core.hpp>
+#include <boost/spirit/include/phoenix_operator.hpp>
+#pragma clang diagnostic pop
+#pragma GCC diagnostic pop
+
+#include <sstream>
+
+/*
+    ABNF for BCP 47 from: https://tools.ietf.org/html/bcp47
+
+    Language-Tag  = langtag             ; normal language tags
+                  / privateuse          ; private use tag
+                  / grandfathered       ; grandfathered tags NOT IMPLEMENTED
+
+    langtag       = language
+                    ["-" script]
+                    ["-" region]
+                    *("-" variant)
+                    *("-" extension)
+                    ["-" privateuse]
+
+    language      = 2*3ALPHA            ; shortest ISO 639 code
+                    ["-" extlang]       ; sometimes followed by
+                                        ; extended language subtags
+                  / 4ALPHA              ; or reserved for future use
+                  / 5*8ALPHA            ; or registered language subtag
+
+    extlang       = 3ALPHA              ; selected ISO 639 codes
+                    *2("-" 3ALPHA)      ; permanently reserved
+
+    script        = 4ALPHA              ; ISO 15924 code
+
+    region        = 2ALPHA              ; ISO 3166-1 code
+                  / 3DIGIT              ; UN M.49 code
+
+    variant       = 5*8alphanum         ; registered variants
+                  / (DIGIT 3alphanum)
+
+    extension     = singleton 1*("-" (2*8alphanum))
+
+                                        ; Single alphanumerics
+                                        ; "x" reserved for private use
+    singleton     = DIGIT               ; 0 - 9
+                  / %x41-57             ; A - W
+                  / %x59-5A             ; Y - Z
+                  / %x61-77             ; a - w
+                  / %x79-7A             ; y - z
+
+    privateuse    = "x" 1*("-" (1*8alphanum))
+
+    grandfathered = irregular           ; non-redundant tags registered
+                  / regular             ; during the RFC 3066 era
+
+    irregular     = "en-GB-oed"         ; irregular tags do not match
+                  / "i-ami"             ; the 'langtag' production and
+                  / "i-bnn"             ; would not otherwise be
+                  / "i-default"         ; considered 'well-formed'
+                  / "i-enochian"        ; These tags are all valid,
+                  / "i-hak"             ; but most are deprecated
+                  / "i-klingon"         ; in favor of more modern
+                  / "i-lux"             ; subtags or subtag
+                  / "i-mingo"           ; combination
+                  / "i-navajo"
+                  / "i-pwn"
+                  / "i-tao"
+                  / "i-tay"
+                  / "i-tsu"
+                  / "sgn-BE-FR"
+                  / "sgn-BE-NL"
+                  / "sgn-CH-DE"
+
+    regular       = "art-lojban"        ; these tags match the 'langtag'
+                  / "cel-gaulish"       ; production, but their subtags
+                  / "no-bok"            ; are not extended language
+                  / "no-nyn"            ; or variant subtags: their meaning
+                  / "zh-guoyu"          ; is defined by their registration
+                  / "zh-hakka"          ; and all of these are deprecated
+                  / "zh-min"            ; in favor of a more modern
+                  / "zh-min-nan"        ; subtag or sequence of subtags
+                  / "zh-xiang"
+
+    alphanum      = (ALPHA / DIGIT)     ; letters and numbers
+
+*/
+
+namespace mbgl {
+
+namespace qi = boost::spirit::qi;
+namespace phoenix = boost::phoenix;
+namespace ascii = boost::spirit::ascii;
+
+// Boost.Spirit Qi grammar for the BCP 47 ABNF above. While matching, semantic
+// actions copy the language, script and region subtags into `languageTag`;
+// all other subtags are matched but not stored.
+template <typename Iterator>
+struct bcp47_parser : qi::grammar<Iterator>
+{
+    bcp47_parser() : bcp47_parser::base_type(start)
+    {
+        using qi::lit;
+        using qi::repeat;
+        using qi::eoi;
+        using ascii::char_;
+        using ascii::no_case;
+        using ascii::digit;
+        using ascii::alnum;
+        using ascii::alpha;
+
+        using boost::spirit::qi::_1;
+
+        // NOTE(review): `grandfathered` is tried only after `langtag` and `privateuse` fail, so tags
+        // whose prefix already matches `langtag` (e.g. "en-GB-oed") look unreachable; also confirm
+        // that no_case[] reaches the literals inside the separately defined sub-rules.
+        start %= no_case[langtag | privateuse | grandfathered];
+
+        langtag %= (language) [phoenix::ref(languageTag.language) = _1]
+                   >> -("-" >> (script)[phoenix::ref(languageTag.script) = _1])
+                   >> -("-" >> (region)[phoenix::ref(languageTag.region) = _1])
+                   >> *("-" >> variant)
+                   >> *("-" >> extension)
+                   >> -("-" >> privateuse);
+
+        language %= (repeat(2,3)[alpha] >> -("-" >> extlang)) // shortest ISO 639 code
+                                                              // sometimes followed by extended language subtags
+                    | repeat(4)[alpha] // or reserved for future use
+                    | repeat(5,8)[alpha]; // or registered language subtag
+
+        // We add lookaheads for "-"/eoi so that spurious matches on subtags don't prevent backtracking
+        extlang = repeat(3)[alpha] >> (&lit('-') | eoi) >> repeat(0,2)["-" >> repeat(3)[alpha] >> (&lit('-') | eoi)];
+
+        script = repeat(4)[alpha] >> (&lit('-') | eoi);
+
+        region = (repeat(2)[alpha] | repeat(3)[digit]) >> (&lit('-') | eoi);
+
+        variant = (repeat(5,8)[alnum] | (digit >> repeat(3)[alnum])) >> (&lit('-') | eoi); // DIGIT 3alphanum, as in the ABNF
+
+        extension = singleton >> +("-" >> repeat(2,8)[alnum]) >> (&lit('-') | eoi);
+
+        singleton = digit | char_('a','w') | char_('y','z'); // "no-case" handles A-W and Y-Z
+
+        privateuse = "x" >> +("-" >> repeat(1,8)[alnum]) >> (&lit('-') | eoi);
+
+        grandfathered = regular | irregular;
+
+        irregular = lit("en-GB-oed")
+                    | "i-ami"
+                    | "i-bnn"
+                    | "i-default"
+                    | "i-enochian"
+                    | "i-hak"
+                    | "i-klingon"
+                    | "i-lux"
+                    | "i-mingo"
+                    | "i-navajo"
+                    | "i-pwn"
+                    | "i-tao"
+                    | "i-tay"
+                    | "i-tsu"
+                    | "sgn-BE-FR"
+                    | "sgn-BE-NL"
+                    | "sgn-CH-DE";
+
+        regular = lit("art-lojban")
+                  | "cel-gaulish"
+                  | "no-bok"
+                  | "no-nyn"
+                  | "zh-guoyu"
+                  | "zh-hakka"
+                  | "zh-min"
+                  | "zh-min-nan"
+                  | "zh-xiang";
+    }
+
+    qi::rule<Iterator> start;
+    qi::rule<Iterator> langtag;
+    qi::rule<Iterator, std::string()> language;
+    qi::rule<Iterator> extlang;
+    qi::rule<Iterator, std::string()> script;
+    qi::rule<Iterator, std::string()> region;
+    qi::rule<Iterator> variant;
+    qi::rule<Iterator> extension;
+    qi::rule<Iterator> singleton;
+    qi::rule<Iterator> privateuse;
+    qi::rule<Iterator> grandfathered;
+    qi::rule<Iterator> irregular;
+    qi::rule<Iterator> regular;
+
+    LanguageTag languageTag; // written by the semantic actions in `langtag`
+};
+
+// Parses `bcp47Tag` into its language/script/region components. Input that the
+// grammar rejects, or matches only in part, yields an empty LanguageTag.
+LanguageTag LanguageTag::fromBCP47(const std::string& bcp47Tag) {
+    using Parser = bcp47_parser<std::string::const_iterator>;
+    Parser parser;
+    auto iter = bcp47Tag.begin();
+    const auto end = bcp47Tag.end();
+    // Invalid tags (no match, or unparsed trailing input) are treated as empty/"default"
+    if (!qi::parse(iter, end, parser) || iter != end) {
+        return LanguageTag();
+    }
+    return parser.languageTag;
+}
+
+LanguageTag::LanguageTag(optional<std::string> language_, optional<std::string> script_, optional<std::string> region_)
+    : language(std::move(language_))
+    , script(std::move(script_))
+    , region(std::move(region_))
+{}
+
+// Serializes as "language[-script][-region]"; without a language the result is "".
+std::string LanguageTag::toBCP47() const {
+    if (!language) {
+        // BCP 47 requires a language, but we're matching implementations that accept ""
+        // to mean something like "default"
+        return std::string();
+    }
+    std::string bcp47 = *language;
+    if (script) {
+        bcp47.append("-").append(*script);
+    }
+    if (region) {
+        bcp47.append("-").append(*region);
+    }
+    return bcp47;
+}
+} // end namespace mbgl
diff --git a/src/mbgl/text/language_tag.hpp b/src/mbgl/text/language_tag.hpp
new file mode 100644
index 0000000000..7a6a16531f
--- /dev/null
+++ b/src/mbgl/text/language_tag.hpp
@@ -0,0 +1,44 @@
+#pragma once
+
+#include <mbgl/util/optional.hpp>
+
+#include <string>
+
+/*
+    Use LanguageTag to go back and forth between BCP 47 language tags
+    and their component language/script/region.
+
+    This implementation accepts but will not round-trip additional
+    variant/extension/privateuse/grandfathered information in a BCP 47 tag.
+
+    Why implement this?
+        Mapbox Style Spec specifies locales with BCP 47
+        Android and Intl.Collator implementations speak BCP 47 out of the box
+        Darwin implementation requires translation to "Language Identifier"
+        We're OK with not supporting extensions, but we want to successfully
+        parse any valid BCP 47 tag and get out the base language/script/region.
+
+    Mozilla's version: https://dxr.mozilla.org/mozilla-central/source/intl/locale/MozLocale.cpp
+        Looks like it actually supports a subset of BCP 47.
+        See https://bugzilla.mozilla.org/show_bug.cgi?id=bcp47
+
+    Chromium is based on ICU version: https://ssl.icu-project.org/apiref/icu4c/uloc_8h.html
+        Getting all the locale information is overkill for us, we just want
+        language/script/region.
+ */
+
+namespace mbgl {
+
+struct LanguageTag {
+    LanguageTag() = default;
+    LanguageTag(optional<std::string> language_, optional<std::string> script_, optional<std::string> region_);
+
+    static LanguageTag fromBCP47(const std::string& bcp47Tag); // empty tag if bcp47Tag is not fully parseable
+    std::string toBCP47() const;                               // "" if no language is set
+
+    optional<std::string> language; // ISO 639
+    optional<std::string> script;   // ISO 15924
+    optional<std::string> region;   // ISO 3166-1 || UN M.49
+};
+
+} // end namespace mbgl