Add unicode normalization to all input.

All unicode input now gets 'NFD' normalization, which ensures that all characters that look the same are represented by the same code points. 'NFD' was chosen because it is the expanded (decomposed) form, which will cause (for example) 'é' to be placed immediately after 'e' rather than after 'z'. Users can choose 'NFKD' with ns.COMPATIBILITYNORMALIZE (or ns.CN), which will change certain characters to their compatible (and often ASCII) representation. This may be useful to force numbers in odd representations to be transformed to ASCII, which will potentially give better sorting orders. This will close issue #44.
author: Seth M Morton <seth.m.morton@gmail.com> 2017-08-18 21:08:01 -0700
committer: Seth M Morton <seth.m.morton@gmail.com> 2017-08-18 23:24:27 -0700
commit: 3a75ddb0dda38e59bd1e034390933ec39a1ab0ff (patch)
tree: 71dfc73c6591701a57190fbffef6ab72c716530d
parent: c2f4b5d8adc2c1fddc2968c75fae04175f96061f (diff)
download: natsort-3a75ddb0dda38e59bd1e034390933ec39a1ab0ff.tar.gz
5 files changed, 64 insertions, 30 deletions
diff --git a/natsort/ns_enum.py b/natsort/ns_enum.py
index e5ffbf5..37a00de 100644
--- a/natsort/ns_enum.py
+++ b/natsort/ns_enum.py
@@ -39,7 +39,7 @@ class ns(object):
         This is a shortcut for ``ns.FLOAT | ns.SIGNED``, which is useful
         when attempting to sort real numbers.
     NOEXP, N
-        Tell `natsort` to not search for exponents as part of the number.
+        Tell `natsort` to not search for exponents as part of a float number.
         For example, with `NOEXP` the number "5.6E5" would be interpreted
         as `5.6`, `"E"`, and `5` instead of `560000`.
     PATH, P
@@ -51,6 +51,13 @@ class ns(object):
         sorted properly; 'Folder/' will be placed at the end, not at the
         front. It is the same as setting the old `as_path` option to
         `True`.
+    COMPATIBILITYNORMALIZE, CN
+        Use the "NFKD" unicode normalization form on input rather than the
+        default "NFD". This will transform characters such as '⑦' into
+        '7'. Please see https://stackoverflow.com/a/7934397/1399279,
+        https://stackoverflow.com/a/7931547/1399279,
+        and http://unicode.org/reports/tr15/ for full details on unicode
+        normalization.
     LOCALE, L
         Tell `natsort` to be locale-aware when sorting. This includes both
         proper sorting of alphabetical characters as well as proper
@@ -129,20 +136,21 @@ class ns(object):
 
     # The below are options. The values are stored as powers of two
     # so bitmasks can be used to extract the user's requested options.
-    FLOAT            = F  = 1 << 0
-    SIGNED           = S  = 1 << 1
-    REAL             = R  = FLOAT | SIGNED
-    NOEXP            = N  = 1 << 2
-    PATH             = P  = 1 << 3
-    LOCALEALPHA      = LA = 1 << 4
-    LOCALENUM        = LN = 1 << 5
-    LOCALE           = L  = LOCALEALPHA | LOCALENUM
-    IGNORECASE       = IC = 1 << 6
-    LOWERCASEFIRST   = LF = 1 << 7
-    GROUPLETTERS     = G  = 1 << 8
-    UNGROUPLETTERS   = UG = 1 << 9
-    CAPITALFIRST     = C  = UNGROUPLETTERS
-    NANLAST          = NL = 1 << 10
+    FLOAT                  = F  = 1 << 0
+    SIGNED                 = S  = 1 << 1
+    REAL                   = R  = FLOAT | SIGNED
+    NOEXP                  = N  = 1 << 2
+    PATH                   = P  = 1 << 3
+    LOCALEALPHA            = LA = 1 << 4
+    LOCALENUM              = LN = 1 << 5
+    LOCALE                 = L  = LOCALEALPHA | LOCALENUM
+    IGNORECASE             = IC = 1 << 6
+    LOWERCASEFIRST         = LF = 1 << 7
+    GROUPLETTERS           = G  = 1 << 8
+    UNGROUPLETTERS         = UG = 1 << 9
+    CAPITALFIRST           = C  = UNGROUPLETTERS
+    NANLAST                = NL = 1 << 10
+    COMPATIBILITYNORMALIZE = CN = 1 << 11
 
     # The below are private options for internal use only.
     _NUMERIC_ONLY    = REAL | NOEXP
diff --git a/natsort/utils.py b/natsort/utils.py
index c21d3b4..28f1487 100644
--- a/natsort/utils.py
+++ b/natsort/utils.py
@@ -54,6 +54,7 @@ from itertools import chain as ichain
 from collections import deque
 from functools import partial, reduce
 from operator import methodcaller
+from unicodedata import normalize
 
 # Local imports.
 from natsort.ns_enum import ns
@@ -267,11 +268,23 @@ def _input_string_transform_factory(alg):
     # Shortcuts.
     lowfirst = alg & ns.LOWERCASEFIRST
     dumb = alg & ns._DUMB
+    normalization_form = 'NFKD' if alg & ns.COMPATIBILITYNORMALIZE else 'NFD'
+
+    if NEWPY:
+        careful_normalize = partial(normalize, normalization_form)
+    else:
+        def careful_normalize(x):
+            """Normalize unicode input."""
+            if isinstance(x, py23_str):  # unicode
+                return normalize(normalization_form, x)
+            else:
+                return x
 
     # Build the chain of functions to execute in order.
-    function_chain = []
+    function_chain = [careful_normalize]
     if (dumb and not lowfirst) or (lowfirst and not dumb):
         function_chain.append(methodcaller('swapcase'))
+
     if alg & ns.IGNORECASE:
         if NEWPY:
             function_chain.append(methodcaller('casefold'))
diff --git a/test_natsort/test_input_string_transform_factory.py b/test_natsort/test_input_string_transform_factory.py
index 3dbd843..97acf21 100644
--- a/test_natsort/test_input_string_transform_factory.py
+++ b/test_natsort/test_input_string_transform_factory.py
@@ -5,6 +5,7 @@ from __future__ import unicode_literals
 import pytest
 import locale
 from operator import methodcaller
+from unicodedata import normalize
 from natsort.ns_enum import ns
 from natsort.utils import _input_string_transform_factory
 from natsort.compat.py23 import NEWPY
@@ -28,12 +29,22 @@ from hypothesis.strategies import (
 
 def test_input_string_transform_factory_is_no_op_for_no_alg_options_examples():
     x = 'feijGGAd'
-    assert _input_string_transform_factory(0)(x) is x
+    assert _input_string_transform_factory(0)(x) == x
 
 
 @given(text())
-def test_input_string_transform_factory_is_no_op_for_no_alg_options(x):
-    assert _input_string_transform_factory(0)(x) is x
+def test_input_string_transform_factory_is_no_op_for_no_alg_options_except_normalization(x):
+    assert _input_string_transform_factory(0)(x) == normalize('NFD', x)
+
+
+def test_input_string_transform_factory_performs_compatibility_normalization_with_COMPATIBILITYNORMALIZE_examples():
+    x = '⑦'
+    assert _input_string_transform_factory(ns.COMPATIBILITYNORMALIZE)(x) == '7'
+
+
+@given(text())
+def test_input_string_transform_factory_performs_compatibility_normalization_with_COMPATIBILITYNORMALIZE(x):
+    assert _input_string_transform_factory(ns.COMPATIBILITYNORMALIZE)(x) == normalize('NFKD', x)
 
 
 def test_input_string_transform_factory_performs_casefold_with_IGNORECASE_examples():
@@ -47,9 +58,9 @@ def test_input_string_transform_factory_performs_casefold_with_IGNORECASE_exampl
 @given(text())
 def test_input_string_transform_factory_performs_casefold_with_IGNORECASE(x):
     if NEWPY:
-        assert _input_string_transform_factory(ns.IGNORECASE)(x) == x.casefold()
+        assert _input_string_transform_factory(ns.IGNORECASE)(x) == normalize('NFD', x).casefold()
     else:
-        assert _input_string_transform_factory(ns.IGNORECASE)(x) == x.lower()
+        assert _input_string_transform_factory(ns.IGNORECASE)(x) == normalize('NFD', x).lower()
 
 
 def test_input_string_transform_factory_performs_swapcase_with_DUMB_examples():
@@ -59,7 +70,7 @@ def test_input_string_transform_factory_performs_swapcase_with_DUMB_examples():
 
 @given(text())
 def test_input_string_transform_factory_performs_swapcase_with_DUMB(x):
-    assert _input_string_transform_factory(ns._DUMB)(x) == x.swapcase()
+    assert _input_string_transform_factory(ns._DUMB)(x) == normalize('NFD', x).swapcase()
 
 
 def test_input_string_transform_factory_performs_swapcase_with_LOWERCASEFIRST_example():
@@ -69,18 +80,17 @@ def test_input_string_transform_factory_performs_swapcase_with_LOWERCASEFIRST_ex
 
 @given(text())
 def test_input_string_transform_factory_performs_swapcase_with_LOWERCASEFIRST(x):
-    x = 'feijGGAd'
-    assert _input_string_transform_factory(ns.LOWERCASEFIRST)(x) == x.swapcase()
+    assert _input_string_transform_factory(ns.LOWERCASEFIRST)(x) == normalize('NFD', x).swapcase()
 
 
 def test_input_string_transform_factory_is_no_op_with_both_LOWERCASEFIRST_AND_DUMB_example():
     x = 'feijGGAd'
-    assert _input_string_transform_factory(ns._DUMB | ns.LOWERCASEFIRST)(x) is x
+    assert _input_string_transform_factory(ns._DUMB | ns.LOWERCASEFIRST)(x) == x
 
 
 @given(text())
 def test_input_string_transform_factory_is_no_op_with_both_LOWERCASEFIRST_AND_DUMB(x):
-    assert _input_string_transform_factory(ns._DUMB | ns.LOWERCASEFIRST)(x) is x
+    assert _input_string_transform_factory(ns._DUMB | ns.LOWERCASEFIRST)(x) == normalize('NFD', x)
 
 
 def test_input_string_transform_factory_performs_swapcase_and_casefold_both_LOWERCASEFIRST_AND_IGNORECASE_example():
@@ -94,9 +104,9 @@ def test_input_string_transform_factory_performs_swapcase_and_casefold_both_LOWE
 @given(text())
 def test_input_string_transform_factory_performs_swapcase_and_casefold_both_LOWERCASEFIRST_AND_IGNORECASE(x):
     if NEWPY:
-        assert _input_string_transform_factory(ns.IGNORECASE | ns.LOWERCASEFIRST)(x) == x.swapcase().casefold()
+        assert _input_string_transform_factory(ns.IGNORECASE | ns.LOWERCASEFIRST)(x) == normalize('NFD', x).swapcase().casefold()
     else:
-        assert _input_string_transform_factory(ns.IGNORECASE | ns.LOWERCASEFIRST)(x) == x.swapcase().lower()
+        assert _input_string_transform_factory(ns.IGNORECASE | ns.LOWERCASEFIRST)(x) == normalize('NFD', x).swapcase().lower()
 
 
 def test_input_string_transform_factory_removes_thousands_separator_with_LOCALE_example():
diff --git a/test_natsort/test_natsorted.py b/test_natsort/test_natsorted.py
index 146997a..388e209 100644
--- a/test_natsort/test_natsorted.py
+++ b/test_natsort/test_natsorted.py
@@ -80,8 +80,10 @@ def test_natsorted_returns_sorted_list_with_mixed_type_input_and_does_not_raise_
 
 
 def test_natsorted_with_mixed_input_returns_sorted_results_without_error():
+    a = ['0', 'Á', '2', 'Z']
+    assert natsorted(a) == ['0', '2', 'Á', 'Z']
     a = ['2', 'ä', 'b', 1.5, 3]
-    assert natsorted(a) == [1.5, '2', 3, 'b', 'ä']
+    assert natsorted(a) == [1.5, '2', 3, 'ä', 'b']
 
 
 def test_natsorted_with_nan_input_returns_sorted_results_with_nan_last_with_NANLAST():
@@ -240,7 +242,7 @@ def test_natsorted_with_LOCALE_and_de_setting_returns_results_sorted_by_de_langu
 def test_natsorted_with_LOCALE_and_mixed_input_returns_sorted_results_without_error():
     load_locale('en_US')
     a = ['0', 'Á', '2', 'Z']
-    assert natsorted(a) == ['0', '2', 'Z', 'Á']
+    assert natsorted(a, alg=ns.LOCALE) == ['0', '2', 'Á', 'Z']
     a = ['2', 'ä', 'b', 1.5, 3]
     assert natsorted(a, alg=ns.LOCALE) == [1.5, '2', 3, 'ä', 'b']
     locale.setlocale(locale.LC_ALL, str(''))
diff --git a/test_natsort/test_utils.py b/test_natsort/test_utils.py
index 934757a..f1cffa2 100644
--- a/test_natsort/test_utils.py
+++ b/test_natsort/test_utils.py
@@ -149,6 +149,7 @@ def test_ns_enum_values_have_are_as_expected():
     assert ns.CAPITALFIRST == ns.C
     assert ns.UNGROUPLETTERS == ns.CAPITALFIRST
     assert ns.NANLAST == ns.NL
+    assert ns.COMPATIBILITYNORMALIZE == ns.CN
 
     # Convenience
     assert ns.LOCALE == ns.LOCALEALPHA | ns.LOCALENUM
author	Seth M Morton <seth.m.morton@gmail.com>	2017-08-18 21:08:01 -0700
committer	Seth M Morton <seth.m.morton@gmail.com>	2017-08-18 23:24:27 -0700
commit	3a75ddb0dda38e59bd1e034390933ec39a1ab0ff (patch)
tree	71dfc73c6591701a57190fbffef6ab72c716530d
parent	c2f4b5d8adc2c1fddc2968c75fae04175f96061f (diff)
download	natsort-3a75ddb0dda38e59bd1e034390933ec39a1ab0ff.tar.gz