diff options
author | Aarni Koskela <akx@iki.fi> | 2018-05-28 12:12:23 +0300 |
---|---|---|
committer | Aarni Koskela <akx@iki.fi> | 2018-05-28 12:21:42 +0300 |
commit | 77dc9d4024b78c2339f7cf3bff1a2e8be8e2d0f7 (patch) | |
tree | c06dca648ac95e59b6c1e11d4bc54ca6b7629738 /scripts | |
parent | a72cdf171a9e656ef7d8f31ebb3216c0b4e96556 (diff) | |
download | babel-77dc9d4024b78c2339f7cf3bff1a2e8be8e2d0f7.tar.gz |
import_cldr: ignore formatting rules for non-Latin numbering systems [branch: number-sys-import]
Previously the script could have inadvertently merged formatting rules between numbering systems due to the XML selectors used. This makes sure only Latin rules are used for the time being. When support for other numbering systems is properly added (see #470), these checks can be changed.
Diffstat (limited to 'scripts')
-rwxr-xr-x | scripts/import_cldr.py | 157 |
1 file changed, 101 insertions, 56 deletions
diff --git a/scripts/import_cldr.py b/scripts/import_cldr.py index cd0ec37..60aa6c2 100755 --- a/scripts/import_cldr.py +++ b/scripts/import_cldr.py @@ -389,6 +389,8 @@ def _process_local_datas(sup, srcdir, destdir, force=False, dump_json=False): territory != '001' and territory or None ])) + data['locale_id'] = locale_id + if locale_id in plural_rules: data['plural_form'] = plural_rules[locale_id] if locale_id in ordinal_rules: @@ -430,6 +432,31 @@ def _process_local_datas(sup, srcdir, destdir, force=False, dump_json=False): write_datafile(data_filename, data, dump_json=dump_json) +def _should_skip_number_elem(data, elem): + """ + Figure out whether the numbering-containing element `elem` is in a currently + non-supported (i.e. currently non-Latin) numbering system. + + If it is, a warning is raised. + + :param data: The root data element, for formatting the warning. + :param elem: Element with `numberSystem` key + :return: Boolean + """ + number_system = elem.get('numberSystem', 'latn') + + if number_system != 'latn': + log('%s: Unsupported number system "%s" in <%s numberSystem="%s">' % ( + data['locale_id'], + number_system, + elem.tag, + number_system, + )) + return True + + return False + + def _should_skip_elem(elem, type=None, dest=None): """ Check whether the given element should be skipped. 
@@ -701,59 +728,73 @@ def parse_calendar_datetime_skeletons(data, calendar): def parse_number_symbols(data, tree): number_symbols = data.setdefault('number_symbols', {}) - for elem in tree.findall('.//numbers/symbols/*'): - if _should_skip_elem(elem): + for symbol_elem in tree.findall('.//numbers/symbols'): + if _should_skip_number_elem(data, symbol_elem): # TODO: Support other number systems continue - number_symbols[elem.tag] = text_type(elem.text) + + for elem in symbol_elem.findall('./*'): + if _should_skip_elem(elem): + continue + number_symbols[elem.tag] = text_type(elem.text) def parse_decimal_formats(data, tree): decimal_formats = data.setdefault('decimal_formats', {}) - for elem in tree.findall('.//decimalFormats/decimalFormatLength'): - length_type = elem.attrib.get('type') - if _should_skip_elem(elem, length_type, decimal_formats): + for df_elem in tree.findall('.//decimalFormats'): + if _should_skip_number_elem(data, df_elem): # TODO: Support other number systems continue - if elem.findall('./alias'): - # TODO map the alias to its target - continue - for pattern_el in elem.findall('./decimalFormat/pattern'): - pattern_type = pattern_el.attrib.get('type') - pattern = numbers.parse_pattern(text_type(pattern_el.text)) - if pattern_type: - # This is a compact decimal format, see: - # http://www.unicode.org/reports/tr35/tr35-45/tr35-numbers.html#Compact_Number_Formats - - # These are mapped into a `compact_decimal_formats` dictionary - # with the format {length: {count: {multiplier: pattern}}}. - - # TODO: Add support for formatting them. - compact_decimal_formats = data.setdefault('compact_decimal_formats', {}) - length_map = compact_decimal_formats.setdefault(length_type, {}) - length_count_map = length_map.setdefault(pattern_el.attrib['count'], {}) - length_count_map[pattern_type] = pattern - else: - # Regular decimal format. 
- decimal_formats[length_type] = pattern + for elem in df_elem.findall('./decimalFormatLength'): + length_type = elem.attrib.get('type') + if _should_skip_elem(elem, length_type, decimal_formats): + continue + if elem.findall('./alias'): + # TODO map the alias to its target + continue + for pattern_el in elem.findall('./decimalFormat/pattern'): + pattern_type = pattern_el.attrib.get('type') + pattern = numbers.parse_pattern(text_type(pattern_el.text)) + if pattern_type: + # This is a compact decimal format, see: + # http://www.unicode.org/reports/tr35/tr35-45/tr35-numbers.html#Compact_Number_Formats + + # These are mapped into a `compact_decimal_formats` dictionary + # with the format {length: {count: {multiplier: pattern}}}. + + # TODO: Add support for formatting them. + compact_decimal_formats = data.setdefault('compact_decimal_formats', {}) + length_map = compact_decimal_formats.setdefault(length_type, {}) + length_count_map = length_map.setdefault(pattern_el.attrib['count'], {}) + length_count_map[pattern_type] = pattern + else: + # Regular decimal format. 
+ decimal_formats[length_type] = pattern def parse_scientific_formats(data, tree): scientific_formats = data.setdefault('scientific_formats', {}) - for elem in tree.findall('.//scientificFormats/scientificFormatLength'): - type = elem.attrib.get('type') - if _should_skip_elem(elem, type, scientific_formats): + for sf_elem in tree.findall('.//scientificFormats'): + if _should_skip_number_elem(data, sf_elem): # TODO: Support other number systems continue - pattern = text_type(elem.findtext('scientificFormat/pattern')) - scientific_formats[type] = numbers.parse_pattern(pattern) + for elem in sf_elem.findall('./scientificFormatLength'): + type = elem.attrib.get('type') + if _should_skip_elem(elem, type, scientific_formats): + continue + pattern = text_type(elem.findtext('scientificFormat/pattern')) + scientific_formats[type] = numbers.parse_pattern(pattern) def parse_percent_formats(data, tree): percent_formats = data.setdefault('percent_formats', {}) - for elem in tree.findall('.//percentFormats/percentFormatLength'): - type = elem.attrib.get('type') - if _should_skip_elem(elem, type, percent_formats): + + for pf_elem in tree.findall('.//percentFormats'): + if _should_skip_number_elem(data, pf_elem): # TODO: Support other number systems continue - pattern = text_type(elem.findtext('percentFormat/pattern')) - percent_formats[type] = numbers.parse_pattern(pattern) + for elem in pf_elem.findall('.//percentFormatLength'): + type = elem.attrib.get('type') + if _should_skip_elem(elem, type, percent_formats): + continue + pattern = text_type(elem.findtext('percentFormat/pattern')) + percent_formats[type] = numbers.parse_pattern(pattern) def parse_currency_names(data, tree): @@ -837,25 +878,29 @@ def parse_interval_formats(data, tree): def parse_currency_formats(data, tree): currency_formats = data.setdefault('currency_formats', {}) - for length_elem in tree.findall('.//currencyFormats/currencyFormatLength'): - curr_length_type = length_elem.attrib.get('type') - for elem in 
length_elem.findall('currencyFormat'): - type = elem.attrib.get('type') - if curr_length_type: - # Handle `<currencyFormatLength type="short">`, etc. - # TODO(3.x): use nested dicts instead of colon-separated madness - type = '%s:%s' % (type, curr_length_type) - if _should_skip_elem(elem, type, currency_formats): - continue - for child in elem.getiterator(): - if child.tag == 'alias': - currency_formats[type] = Alias( - _translate_alias(['currency_formats', elem.attrib['type']], - child.attrib['path']) - ) - elif child.tag == 'pattern': - pattern = text_type(child.text) - currency_formats[type] = numbers.parse_pattern(pattern) + for currency_format in tree.findall('.//currencyFormats'): + if _should_skip_number_elem(data, currency_format): # TODO: Support other number systems + continue + + for length_elem in currency_format.findall('./currencyFormatLength'): + curr_length_type = length_elem.attrib.get('type') + for elem in length_elem.findall('currencyFormat'): + type = elem.attrib.get('type') + if curr_length_type: + # Handle `<currencyFormatLength type="short">`, etc. + # TODO(3.x): use nested dicts instead of colon-separated madness + type = '%s:%s' % (type, curr_length_type) + if _should_skip_elem(elem, type, currency_formats): + continue + for child in elem.getiterator(): + if child.tag == 'alias': + currency_formats[type] = Alias( + _translate_alias(['currency_formats', elem.attrib['type']], + child.attrib['path']) + ) + elif child.tag == 'pattern': + pattern = text_type(child.text) + currency_formats[type] = numbers.parse_pattern(pattern) def parse_day_period_rules(tree): |