import_cldr: ignore formatting rules for non-Latin numbering systems [number-sys-import]

Previously the script could have inadvertently merged formatting rules between numbering systems due to the XML selectors used. This makes sure only Latin rules are used for the time being. When support for other numbering systems is properly added (see #470), these checks can be changed.
author: Aarni Koskela <akx@iki.fi> 2018-05-28 12:12:23 +0300
committer: Aarni Koskela <akx@iki.fi> 2018-05-28 12:21:42 +0300
commit: 77dc9d4024b78c2339f7cf3bff1a2e8be8e2d0f7 (patch)
tree: c06dca648ac95e59b6c1e11d4bc54ca6b7629738 /scripts
parent: a72cdf171a9e656ef7d8f31ebb3216c0b4e96556 (diff)
download: babel-77dc9d4024b78c2339f7cf3bff1a2e8be8e2d0f7.tar.gz
1 file changed, 101 insertions, 56 deletions
diff --git a/scripts/import_cldr.py b/scripts/import_cldr.py
index cd0ec37..60aa6c2 100755
--- a/scripts/import_cldr.py
+++ b/scripts/import_cldr.py
@@ -389,6 +389,8 @@ def _process_local_datas(sup, srcdir, destdir, force=False, dump_json=False):
             territory != '001' and territory or None
         ]))
 
+        data['locale_id'] = locale_id
+
         if locale_id in plural_rules:
             data['plural_form'] = plural_rules[locale_id]
         if locale_id in ordinal_rules:
@@ -430,6 +432,31 @@ def _process_local_datas(sup, srcdir, destdir, force=False, dump_json=False):
         write_datafile(data_filename, data, dump_json=dump_json)
 
 
+def _should_skip_number_elem(data, elem):
+    """
+    Figure out whether the numbering-containing element `elem` is in a currently
+    non-supported (i.e. currently non-Latin) numbering system.
+
+    If it is, a warning is raised.
+
+    :param data: The root data element, for formatting the warning.
+    :param elem: Element with `numberSystem` key
+    :return: Boolean
+    """
+    number_system = elem.get('numberSystem', 'latn')
+
+    if number_system != 'latn':
+        log('%s: Unsupported number system "%s" in <%s numberSystem="%s">' % (
+            data['locale_id'],
+            number_system,
+            elem.tag,
+            number_system,
+        ))
+        return True
+
+    return False
+
+
 def _should_skip_elem(elem, type=None, dest=None):
     """
     Check whether the given element should be skipped.
@@ -701,59 +728,73 @@ def parse_calendar_datetime_skeletons(data, calendar):
 
 def parse_number_symbols(data, tree):
     number_symbols = data.setdefault('number_symbols', {})
-    for elem in tree.findall('.//numbers/symbols/*'):
-        if _should_skip_elem(elem):
+    for symbol_elem in tree.findall('.//numbers/symbols'):
+        if _should_skip_number_elem(data, symbol_elem):  # TODO: Support other number systems
             continue
-        number_symbols[elem.tag] = text_type(elem.text)
+
+        for elem in symbol_elem.findall('./*'):
+            if _should_skip_elem(elem):
+                continue
+            number_symbols[elem.tag] = text_type(elem.text)
 
 
 def parse_decimal_formats(data, tree):
     decimal_formats = data.setdefault('decimal_formats', {})
-    for elem in tree.findall('.//decimalFormats/decimalFormatLength'):
-        length_type = elem.attrib.get('type')
-        if _should_skip_elem(elem, length_type, decimal_formats):
+    for df_elem in tree.findall('.//decimalFormats'):
+        if _should_skip_number_elem(data, df_elem):  # TODO: Support other number systems
             continue
-        if elem.findall('./alias'):
-            # TODO map the alias to its target
-            continue
-        for pattern_el in elem.findall('./decimalFormat/pattern'):
-            pattern_type = pattern_el.attrib.get('type')
-            pattern = numbers.parse_pattern(text_type(pattern_el.text))
-            if pattern_type:
-                # This is a compact decimal format, see:
-                # http://www.unicode.org/reports/tr35/tr35-45/tr35-numbers.html#Compact_Number_Formats
-
-                # These are mapped into a `compact_decimal_formats` dictionary
-                # with the format {length: {count: {multiplier: pattern}}}.
-
-                # TODO: Add support for formatting them.
-                compact_decimal_formats = data.setdefault('compact_decimal_formats', {})
-                length_map = compact_decimal_formats.setdefault(length_type, {})
-                length_count_map = length_map.setdefault(pattern_el.attrib['count'], {})
-                length_count_map[pattern_type] = pattern
-            else:
-                # Regular decimal format.
-                decimal_formats[length_type] = pattern
+        for elem in df_elem.findall('./decimalFormatLength'):
+            length_type = elem.attrib.get('type')
+            if _should_skip_elem(elem, length_type, decimal_formats):
+                continue
+            if elem.findall('./alias'):
+                # TODO map the alias to its target
+                continue
+            for pattern_el in elem.findall('./decimalFormat/pattern'):
+                pattern_type = pattern_el.attrib.get('type')
+                pattern = numbers.parse_pattern(text_type(pattern_el.text))
+                if pattern_type:
+                    # This is a compact decimal format, see:
+                    # http://www.unicode.org/reports/tr35/tr35-45/tr35-numbers.html#Compact_Number_Formats
+
+                    # These are mapped into a `compact_decimal_formats` dictionary
+                    # with the format {length: {count: {multiplier: pattern}}}.
+
+                    # TODO: Add support for formatting them.
+                    compact_decimal_formats = data.setdefault('compact_decimal_formats', {})
+                    length_map = compact_decimal_formats.setdefault(length_type, {})
+                    length_count_map = length_map.setdefault(pattern_el.attrib['count'], {})
+                    length_count_map[pattern_type] = pattern
+                else:
+                    # Regular decimal format.
+                    decimal_formats[length_type] = pattern
 
 
 def parse_scientific_formats(data, tree):
     scientific_formats = data.setdefault('scientific_formats', {})
-    for elem in tree.findall('.//scientificFormats/scientificFormatLength'):
-        type = elem.attrib.get('type')
-        if _should_skip_elem(elem, type, scientific_formats):
+    for sf_elem in tree.findall('.//scientificFormats'):
+        if _should_skip_number_elem(data, sf_elem):  # TODO: Support other number systems
             continue
-        pattern = text_type(elem.findtext('scientificFormat/pattern'))
-        scientific_formats[type] = numbers.parse_pattern(pattern)
+        for elem in sf_elem.findall('./scientificFormatLength'):
+            type = elem.attrib.get('type')
+            if _should_skip_elem(elem, type, scientific_formats):
+                continue
+            pattern = text_type(elem.findtext('scientificFormat/pattern'))
+            scientific_formats[type] = numbers.parse_pattern(pattern)
 
 
 def parse_percent_formats(data, tree):
     percent_formats = data.setdefault('percent_formats', {})
-    for elem in tree.findall('.//percentFormats/percentFormatLength'):
-        type = elem.attrib.get('type')
-        if _should_skip_elem(elem, type, percent_formats):
+
+    for pf_elem in tree.findall('.//percentFormats'):
+        if _should_skip_number_elem(data, pf_elem):  # TODO: Support other number systems
             continue
-        pattern = text_type(elem.findtext('percentFormat/pattern'))
-        percent_formats[type] = numbers.parse_pattern(pattern)
+        for elem in pf_elem.findall('.//percentFormatLength'):
+            type = elem.attrib.get('type')
+            if _should_skip_elem(elem, type, percent_formats):
+                continue
+            pattern = text_type(elem.findtext('percentFormat/pattern'))
+            percent_formats[type] = numbers.parse_pattern(pattern)
 
 
 def parse_currency_names(data, tree):
@@ -837,25 +878,29 @@ def parse_interval_formats(data, tree):
 
 def parse_currency_formats(data, tree):
     currency_formats = data.setdefault('currency_formats', {})
-    for length_elem in tree.findall('.//currencyFormats/currencyFormatLength'):
-        curr_length_type = length_elem.attrib.get('type')
-        for elem in length_elem.findall('currencyFormat'):
-            type = elem.attrib.get('type')
-            if curr_length_type:
-                # Handle `<currencyFormatLength type="short">`, etc.
-                # TODO(3.x): use nested dicts instead of colon-separated madness
-                type = '%s:%s' % (type, curr_length_type)
-            if _should_skip_elem(elem, type, currency_formats):
-                continue
-            for child in elem.getiterator():
-                if child.tag == 'alias':
-                    currency_formats[type] = Alias(
-                        _translate_alias(['currency_formats', elem.attrib['type']],
-                                         child.attrib['path'])
-                    )
-                elif child.tag == 'pattern':
-                    pattern = text_type(child.text)
-                    currency_formats[type] = numbers.parse_pattern(pattern)
+    for currency_format in tree.findall('.//currencyFormats'):
+        if _should_skip_number_elem(data, currency_format):  # TODO: Support other number systems
+            continue
+
+        for length_elem in currency_format.findall('./currencyFormatLength'):
+            curr_length_type = length_elem.attrib.get('type')
+            for elem in length_elem.findall('currencyFormat'):
+                type = elem.attrib.get('type')
+                if curr_length_type:
+                    # Handle `<currencyFormatLength type="short">`, etc.
+                    # TODO(3.x): use nested dicts instead of colon-separated madness
+                    type = '%s:%s' % (type, curr_length_type)
+                if _should_skip_elem(elem, type, currency_formats):
+                    continue
+                for child in elem.getiterator():
+                    if child.tag == 'alias':
+                        currency_formats[type] = Alias(
+                            _translate_alias(['currency_formats', elem.attrib['type']],
+                                             child.attrib['path'])
+                        )
+                    elif child.tag == 'pattern':
+                        pattern = text_type(child.text)
+                        currency_formats[type] = numbers.parse_pattern(pattern)
 
 
 def parse_day_period_rules(tree):
author	Aarni Koskela <akx@iki.fi>	2018-05-28 12:12:23 +0300
committer	Aarni Koskela <akx@iki.fi>	2018-05-28 12:21:42 +0300
commit	77dc9d4024b78c2339f7cf3bff1a2e8be8e2d0f7 (patch)
tree	c06dca648ac95e59b6c1e11d4bc54ca6b7629738 /scripts
parent	a72cdf171a9e656ef7d8f31ebb3216c0b4e96556 (diff)
download	babel-77dc9d4024b78c2339f7cf3bff1a2e8be8e2d0f7.tar.gz