diff options
author | Seth M Morton <seth.m.morton@gmail.com> | 2017-08-18 21:08:01 -0700 |
---|---|---|
committer | Seth M Morton <seth.m.morton@gmail.com> | 2017-08-18 23:24:27 -0700 |
commit | 3a75ddb0dda38e59bd1e034390933ec39a1ab0ff (patch) | |
tree | 71dfc73c6591701a57190fbffef6ab72c716530d | |
parent | c2f4b5d8adc2c1fddc2968c75fae04175f96061f (diff) | |
download | natsort-3a75ddb0dda38e59bd1e034390933ec39a1ab0ff.tar.gz |
Add unicode normalization to all input.
All unicode input now gets 'NFD' normalization, which ensures that
all characters that look the same are represented by the same code
points. 'NFD' was chosen because it is the expanded form, which will
cause (for example) 'é' to be placed immediately after 'e' rather than
after 'z'.
Users can choose 'NFKD' with ns.COMPATIBILITYNORMALIZE (or ns.CN) which
will change certain characters to their compatible (and often ASCII)
representation. This may be useful to force numbers in odd
representations to be transformed to ASCII which will potentially give
better sorting orders.
This will close issue #44.
-rw-r--r-- | natsort/ns_enum.py | 38 | ||||
-rw-r--r-- | natsort/utils.py | 15 | ||||
-rw-r--r-- | test_natsort/test_input_string_transform_factory.py | 34 | ||||
-rw-r--r-- | test_natsort/test_natsorted.py | 6 | ||||
-rw-r--r-- | test_natsort/test_utils.py | 1 |
5 files changed, 64 insertions, 30 deletions
diff --git a/natsort/ns_enum.py b/natsort/ns_enum.py index e5ffbf5..37a00de 100644 --- a/natsort/ns_enum.py +++ b/natsort/ns_enum.py @@ -39,7 +39,7 @@ class ns(object): This is a shortcut for ``ns.FLOAT | ns.SIGNED``, which is useful when attempting to sort real numbers. NOEXP, N - Tell `natsort` to not search for exponents as part of the number. + Tell `natsort` to not search for exponents as part of a float number. For example, with `NOEXP` the number "5.6E5" would be interpreted as `5.6`, `"E"`, and `5` instead of `560000`. PATH, P @@ -51,6 +51,13 @@ class ns(object): sorted properly; 'Folder/' will be placed at the end, not at the front. It is the same as setting the old `as_path` option to `True`. + COMPATIBILITYNORMALIZE, CN + Use the "NFKD" unicode normalization form on input rather than the + default "NFD". This will transform characters such as '⑦' into + '7'. Please see https://stackoverflow.com/a/7934397/1399279, + https://stackoverflow.com/a/7931547/1399279, + and http://unicode.org/reports/tr15/ full details into unicode + normalization. LOCALE, L Tell `natsort` to be locale-aware when sorting. This includes both proper sorting of alphabetical characters as well as proper @@ -129,20 +136,21 @@ class ns(object): # The below are options. The values are stored as powers of two # so bitmasks can be used to extract the user's requested options. 
- FLOAT = F = 1 << 0 - SIGNED = S = 1 << 1 - REAL = R = FLOAT | SIGNED - NOEXP = N = 1 << 2 - PATH = P = 1 << 3 - LOCALEALPHA = LA = 1 << 4 - LOCALENUM = LN = 1 << 5 - LOCALE = L = LOCALEALPHA | LOCALENUM - IGNORECASE = IC = 1 << 6 - LOWERCASEFIRST = LF = 1 << 7 - GROUPLETTERS = G = 1 << 8 - UNGROUPLETTERS = UG = 1 << 9 - CAPITALFIRST = C = UNGROUPLETTERS - NANLAST = NL = 1 << 10 + FLOAT = F = 1 << 0 + SIGNED = S = 1 << 1 + REAL = R = FLOAT | SIGNED + NOEXP = N = 1 << 2 + PATH = P = 1 << 3 + LOCALEALPHA = LA = 1 << 4 + LOCALENUM = LN = 1 << 5 + LOCALE = L = LOCALEALPHA | LOCALENUM + IGNORECASE = IC = 1 << 6 + LOWERCASEFIRST = LF = 1 << 7 + GROUPLETTERS = G = 1 << 8 + UNGROUPLETTERS = UG = 1 << 9 + CAPITALFIRST = C = UNGROUPLETTERS + NANLAST = NL = 1 << 10 + COMPATIBILITYNORMALIZE = CN = 1 << 11 # The below are private options for internal use only. _NUMERIC_ONLY = REAL | NOEXP diff --git a/natsort/utils.py b/natsort/utils.py index c21d3b4..28f1487 100644 --- a/natsort/utils.py +++ b/natsort/utils.py @@ -54,6 +54,7 @@ from itertools import chain as ichain from collections import deque from functools import partial, reduce from operator import methodcaller +from unicodedata import normalize # Local imports. from natsort.ns_enum import ns @@ -267,11 +268,23 @@ def _input_string_transform_factory(alg): # Shortcuts. lowfirst = alg & ns.LOWERCASEFIRST dumb = alg & ns._DUMB + normalization_form = 'NFKD' if alg & ns.COMPATIBILITYNORMALIZE else 'NFD' + + if NEWPY: + careful_normalize = partial(normalize, normalization_form) + else: + def careful_normalize(x): + """Normalize unicode input.""" + if isinstance(x, py23_str): # unicode + return normalize(normalization_form, x) + else: + return x # Build the chain of functions to execute in order. 
- function_chain = [] + function_chain = [careful_normalize] if (dumb and not lowfirst) or (lowfirst and not dumb): function_chain.append(methodcaller('swapcase')) + if alg & ns.IGNORECASE: if NEWPY: function_chain.append(methodcaller('casefold')) diff --git a/test_natsort/test_input_string_transform_factory.py b/test_natsort/test_input_string_transform_factory.py index 3dbd843..97acf21 100644 --- a/test_natsort/test_input_string_transform_factory.py +++ b/test_natsort/test_input_string_transform_factory.py @@ -5,6 +5,7 @@ from __future__ import unicode_literals import pytest import locale from operator import methodcaller +from unicodedata import normalize from natsort.ns_enum import ns from natsort.utils import _input_string_transform_factory from natsort.compat.py23 import NEWPY @@ -28,12 +29,22 @@ from hypothesis.strategies import ( def test_input_string_transform_factory_is_no_op_for_no_alg_options_examples(): x = 'feijGGAd' - assert _input_string_transform_factory(0)(x) is x + assert _input_string_transform_factory(0)(x) == x @given(text()) -def test_input_string_transform_factory_is_no_op_for_no_alg_options(x): - assert _input_string_transform_factory(0)(x) is x +def test_input_string_transform_factory_is_no_op_for_no_alg_options_except_normalization(x): + assert _input_string_transform_factory(0)(x) == normalize('NFD', x) + + +def test_input_string_transform_factory_performs_compatibility_normalization_with_COMPATIBILITYNORMALIZE_examples(): + x = '⑦' + assert _input_string_transform_factory(ns.COMPATIBILITYNORMALIZE)(x) == '7' + + +@given(text()) +def test_input_string_transform_factory_performs_compatibility_normalization_with_COMPATIBILITYNORMALIZE(x): + assert _input_string_transform_factory(ns.COMPATIBILITYNORMALIZE)(x) == normalize('NFKD', x) def test_input_string_transform_factory_performs_casefold_with_IGNORECASE_examples(): @@ -47,9 +58,9 @@ def test_input_string_transform_factory_performs_casefold_with_IGNORECASE_exampl @given(text()) def 
test_input_string_transform_factory_performs_casefold_with_IGNORECASE(x): if NEWPY: - assert _input_string_transform_factory(ns.IGNORECASE)(x) == x.casefold() + assert _input_string_transform_factory(ns.IGNORECASE)(x) == normalize('NFD', x).casefold() else: - assert _input_string_transform_factory(ns.IGNORECASE)(x) == x.lower() + assert _input_string_transform_factory(ns.IGNORECASE)(x) == normalize('NFD', x).lower() def test_input_string_transform_factory_performs_swapcase_with_DUMB_examples(): @@ -59,7 +70,7 @@ def test_input_string_transform_factory_performs_swapcase_with_DUMB_examples(): @given(text()) def test_input_string_transform_factory_performs_swapcase_with_DUMB(x): - assert _input_string_transform_factory(ns._DUMB)(x) == x.swapcase() + assert _input_string_transform_factory(ns._DUMB)(x) == normalize('NFD', x).swapcase() def test_input_string_transform_factory_performs_swapcase_with_LOWERCASEFIRST_example(): @@ -69,18 +80,17 @@ def test_input_string_transform_factory_performs_swapcase_with_LOWERCASEFIRST_ex @given(text()) def test_input_string_transform_factory_performs_swapcase_with_LOWERCASEFIRST(x): - x = 'feijGGAd' - assert _input_string_transform_factory(ns.LOWERCASEFIRST)(x) == x.swapcase() + assert _input_string_transform_factory(ns.LOWERCASEFIRST)(x) == normalize('NFD', x).swapcase() def test_input_string_transform_factory_is_no_op_with_both_LOWERCASEFIRST_AND_DUMB_example(): x = 'feijGGAd' - assert _input_string_transform_factory(ns._DUMB | ns.LOWERCASEFIRST)(x) is x + assert _input_string_transform_factory(ns._DUMB | ns.LOWERCASEFIRST)(x) == x @given(text()) def test_input_string_transform_factory_is_no_op_with_both_LOWERCASEFIRST_AND_DUMB(x): - assert _input_string_transform_factory(ns._DUMB | ns.LOWERCASEFIRST)(x) is x + assert _input_string_transform_factory(ns._DUMB | ns.LOWERCASEFIRST)(x) == normalize('NFD', x) def test_input_string_transform_factory_performs_swapcase_and_casefold_both_LOWERCASEFIRST_AND_IGNORECASE_example(): @@ -94,9 
+104,9 @@ def test_input_string_transform_factory_performs_swapcase_and_casefold_both_LOWE @given(text()) def test_input_string_transform_factory_performs_swapcase_and_casefold_both_LOWERCASEFIRST_AND_IGNORECASE(x): if NEWPY: - assert _input_string_transform_factory(ns.IGNORECASE | ns.LOWERCASEFIRST)(x) == x.swapcase().casefold() + assert _input_string_transform_factory(ns.IGNORECASE | ns.LOWERCASEFIRST)(x) == normalize('NFD', x).swapcase().casefold() else: - assert _input_string_transform_factory(ns.IGNORECASE | ns.LOWERCASEFIRST)(x) == x.swapcase().lower() + assert _input_string_transform_factory(ns.IGNORECASE | ns.LOWERCASEFIRST)(x) == normalize('NFD', x).swapcase().lower() def test_input_string_transform_factory_removes_thousands_separator_with_LOCALE_example(): diff --git a/test_natsort/test_natsorted.py b/test_natsort/test_natsorted.py index 146997a..388e209 100644 --- a/test_natsort/test_natsorted.py +++ b/test_natsort/test_natsorted.py @@ -80,8 +80,10 @@ def test_natsorted_returns_sorted_list_with_mixed_type_input_and_does_not_raise_ def test_natsorted_with_mixed_input_returns_sorted_results_without_error(): + a = ['0', 'Á', '2', 'Z'] + assert natsorted(a) == ['0', '2', 'Á', 'Z'] a = ['2', 'ä', 'b', 1.5, 3] - assert natsorted(a) == [1.5, '2', 3, 'b', 'ä'] + assert natsorted(a) == [1.5, '2', 3, 'ä', 'b'] def test_natsorted_with_nan_input_returns_sorted_results_with_nan_last_with_NANLAST(): @@ -240,7 +242,7 @@ def test_natsorted_with_LOCALE_and_de_setting_returns_results_sorted_by_de_langu def test_natsorted_with_LOCALE_and_mixed_input_returns_sorted_results_without_error(): load_locale('en_US') a = ['0', 'Á', '2', 'Z'] - assert natsorted(a) == ['0', '2', 'Z', 'Á'] + assert natsorted(a, alg=ns.LOCALE) == ['0', '2', 'Á', 'Z'] a = ['2', 'ä', 'b', 1.5, 3] assert natsorted(a, alg=ns.LOCALE) == [1.5, '2', 3, 'ä', 'b'] locale.setlocale(locale.LC_ALL, str('')) diff --git a/test_natsort/test_utils.py b/test_natsort/test_utils.py index 934757a..f1cffa2 100644 --- 
a/test_natsort/test_utils.py +++ b/test_natsort/test_utils.py @@ -149,6 +149,7 @@ def test_ns_enum_values_have_are_as_expected(): assert ns.CAPITALFIRST == ns.C assert ns.UNGROUPLETTERS == ns.CAPITALFIRST assert ns.NANLAST == ns.NL + assert ns.COMPATIBILITYNORMALIZE == ns.CN # Convenience assert ns.LOCALE == ns.LOCALEALPHA | ns.LOCALENUM |