summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSeth Morton <seth.m.morton@gmail.com>2017-08-19 00:38:28 -0700
committerGitHub <noreply@github.com>2017-08-19 00:38:28 -0700
commit11b7f8d624d43036c9471498e0a5b870485fef51 (patch)
tree71c0385502dc0522c23099d177e59316cc40b142
parentc2f4b5d8adc2c1fddc2968c75fae04175f96061f (diff)
parent059de483de650d91853de8103e31fae849e40493 (diff)
downloadnatsort-11b7f8d624d43036c9471498e0a5b870485fef51.tar.gz
Merge pull request #45 from SethMMorton/unicode-normalization
Add normalization to incoming Unicode data
-rw-r--r--README.rst1
-rw-r--r--docs/source/howitworks.rst89
-rw-r--r--natsort/ns_enum.py38
-rw-r--r--natsort/utils.py22
-rw-r--r--test_natsort/slow_splitters.py2
-rw-r--r--test_natsort/test_natsorted.py14
-rw-r--r--test_natsort/test_utils.py1
7 files changed, 141 insertions, 26 deletions
diff --git a/README.rst b/README.rst
index 3a283f2..61ddb96 100644
--- a/README.rst
+++ b/README.rst
@@ -234,6 +234,7 @@ Other Useful Things
+++++++++++++++++++
- recursively descend into lists of lists
+ - automatic unicode normalization of input data
- `controlling the case-sensitivity <http://natsort.readthedocs.io/en/stable/examples.html#case-sort>`_
- `sorting file paths correctly <http://natsort.readthedocs.io/en/stable/examples.html#path-sort>`_
- `allow custom sorting keys <http://natsort.readthedocs.io/en/stable/examples.html#custom-sort>`_
diff --git a/docs/source/howitworks.rst b/docs/source/howitworks.rst
index f15db8f..22de2a5 100644
--- a/docs/source/howitworks.rst
+++ b/docs/source/howitworks.rst
@@ -655,12 +655,14 @@ StdLib there can't be too many dragons, right?
- https://github.com/SethMMorton/natsort/issues/22
- https://github.com/SethMMorton/natsort/issues/23
- https://github.com/SethMMorton/natsort/issues/36
+ - https://github.com/SethMMorton/natsort/issues/44
- https://bugs.python.org/issue2481
- https://bugs.python.org/issue23195
- - http://stackoverflow.com/questions/3412933/python-not-sorting-unicode-properly-strcoll-doesnt-help
- - http://stackoverflow.com/questions/22203550/sort-dictionary-by-key-using-locale-collation
- - http://stackoverflow.com/questions/33459384/unicode-character-not-in-range-when-calling-locale-strxfrm
- - http://stackoverflow.com/questions/36431810/sort-numeric-lines-with-thousand-separators
+ - https://stackoverflow.com/questions/3412933/python-not-sorting-unicode-properly-strcoll-doesnt-help
+ - https://stackoverflow.com/questions/22203550/sort-dictionary-by-key-using-locale-collation
+ - https://stackoverflow.com/questions/33459384/unicode-character-not-in-range-when-calling-locale-strxfrm
+ - https://stackoverflow.com/questions/36431810/sort-numeric-lines-with-thousand-separators
+ - https://stackoverflow.com/questions/45734562/how-can-i-get-a-reasonable-string-sorting-with-python
These can be summed up as follows:
@@ -787,6 +789,84 @@ the ``else:`` block of :func:`coerce_to_int`/:func:`coerce_to_float`.
Of course, applying both *LOWERCASEFIRST* and *GROUPLETTERS* is just
a matter of turning on both functions.
+Basic Unicode Support
++++++++++++++++++++++
+
+Unicode is hard and complicated. Here's an example.
+
+.. code-block:: python
+
+ >>> b = [b'\x66', b'\x65', b'\xc3\xa9', b'\x65\xcc\x81', b'\x61', b'\x7a']
+ >>> a = [x.decode('utf8') for x in b]
+ >>> a # doctest: +SKIP
+ ['f', 'e', 'é', 'é', 'a', 'z']
+ >>> sorted(a) # doctest: +SKIP
+ ['a', 'e', 'é', 'f', 'z', 'é']
+
+
+There are more than one way to represent the character 'é' in Unicode.
+In fact, many characters have multiple representations. This is a challenge
+because comparing the two representations would return ``False`` even though
+they *look* the same.
+
+.. code-block:: python
+
+ >>> a[2] == a[3]
+ False
+
+Alas, since characters are compared based on the numerical value of their
+representation, sorting Unicode often gives unexpected results (like seeing
+'é' come both *before* and *after* 'z').
+
+The original approach that :mod:`natsort` took with respect to non-ASCII
+Unicode characters was to say "just use
+the :mod:`locale` or :mod:`PyICU` library" and then cross it's fingers
+and hope those libraries take care of it. As you will find in the following
+sections, that comes with its own baggage, and turned out to not always work anyway
+(see https://stackoverflow.com/q/45734562/1399279). A more robust approach is to
+handle the Unicode out-of-the-box without invoking a heavy-handed library
+like :mod:`locale` or :mod:`PyICU`. To do this, we must use *normalization*.
+
+To fully understand Unicode normalization, `check out some official Unicode documentation`_.
+Just kidding... that's too much text. The following StackOverflow answers do
+a good job at explaining Unicode normalization in simple terms:
+https://stackoverflow.com/a/7934397/1399279 and
+https://stackoverflow.com/a/7931547/1399279. Put simply, normalization
+ensures that Unicode characters with multiple representations are in
+some canonical and consistent representation so that (for example) comparisons
+of the characters can be performed in a sane way. The following discussion
+assumes you at least read the StackOverflow answers.
+
+Looking back at our 'é' example, we can see that the two versions were
+constructed with the byte strings ``b'\xc3\xa9'`` and ``b'\x65\xcc\x81'``.
+The former representation is actually
+`LATIN SMALL LETTER E WITH ACUTE <http://www.fileformat.info/info/unicode/char/e9/index.htm>`_
+and is a single character in the Unicode standard. This is known as the
+*compressed form* and corresponds to the 'NFC' normalization scheme.
+The latter representation is actually the letter 'e' followed by
+`COMBINING ACUTE ACCENT <http://www.fileformat.info/info/unicode/char/0301/index.htm>`_
+and so is two characters in the Unicode standard. This is known as the
+*decompressed form* and corresponds to the 'NFD' normalization scheme.
+Since the first character in the decompressed form is actually the letter 'e',
+when compared to other ASCII characters it fits where you might expect.
+Unfortunately, all Unicode compressed form characters come after the
+ASCII characters and so they always will be placed after 'z' when sorting.
+
+It seems that most Unicode data is stored and shared in the compressed form
+which makes it challenging to sort. This can be solved by normalizing all
+incoming Unicode data to the decompressed form ('NFD') and *then* sorting.
+
+.. code-block:: python
+
+ >>> import unicodedata
+ >>> c = [unicodedata.normalize('NFD', x) for x in a]
+ >>> c # doctest: +SKIP
+ ['f', 'e', 'é', 'é', 'a', 'z']
+ >>> sorted(c) # doctest: +SKIP
+ ['a', 'e', 'é', 'é', 'f', 'z']
+
+Huzzah! Sane sorting without having to resort to :mod:`locale`!
+
Using Locale to Compare Strings
+++++++++++++++++++++++++++++++
@@ -1052,3 +1132,4 @@ what the rest of the world assumes.
.. _Thousands separator support: https://github.com/SethMMorton/natsort/issues/36
.. _really good: https://hypothesis.readthedocs.io/en/latest/
.. _testing strategy: http://doc.pytest.org/en/latest/
+.. _check out some official Unicode documentation: http://unicode.org/reports/tr15/
diff --git a/natsort/ns_enum.py b/natsort/ns_enum.py
index e5ffbf5..37a00de 100644
--- a/natsort/ns_enum.py
+++ b/natsort/ns_enum.py
@@ -39,7 +39,7 @@ class ns(object):
This is a shortcut for ``ns.FLOAT | ns.SIGNED``, which is useful
when attempting to sort real numbers.
NOEXP, N
- Tell `natsort` to not search for exponents as part of the number.
+ Tell `natsort` to not search for exponents as part of a float number.
For example, with `NOEXP` the number "5.6E5" would be interpreted
as `5.6`, `"E"`, and `5` instead of `560000`.
PATH, P
@@ -51,6 +51,13 @@ class ns(object):
sorted properly; 'Folder/' will be placed at the end, not at the
front. It is the same as setting the old `as_path` option to
`True`.
+ COMPATIBILITYNORMALIZE, CN
+ Use the "NFKD" unicode normalization form on input rather than the
+ default "NFD". This will transform characters such as '⑦' into
+ '7'. Please see https://stackoverflow.com/a/7934397/1399279,
+ https://stackoverflow.com/a/7931547/1399279,
+ and http://unicode.org/reports/tr15/ full details into unicode
+ normalization.
LOCALE, L
Tell `natsort` to be locale-aware when sorting. This includes both
proper sorting of alphabetical characters as well as proper
@@ -129,20 +136,21 @@ class ns(object):
# The below are options. The values are stored as powers of two
# so bitmasks can be used to extract the user's requested options.
- FLOAT = F = 1 << 0
- SIGNED = S = 1 << 1
- REAL = R = FLOAT | SIGNED
- NOEXP = N = 1 << 2
- PATH = P = 1 << 3
- LOCALEALPHA = LA = 1 << 4
- LOCALENUM = LN = 1 << 5
- LOCALE = L = LOCALEALPHA | LOCALENUM
- IGNORECASE = IC = 1 << 6
- LOWERCASEFIRST = LF = 1 << 7
- GROUPLETTERS = G = 1 << 8
- UNGROUPLETTERS = UG = 1 << 9
- CAPITALFIRST = C = UNGROUPLETTERS
- NANLAST = NL = 1 << 10
+ FLOAT = F = 1 << 0
+ SIGNED = S = 1 << 1
+ REAL = R = FLOAT | SIGNED
+ NOEXP = N = 1 << 2
+ PATH = P = 1 << 3
+ LOCALEALPHA = LA = 1 << 4
+ LOCALENUM = LN = 1 << 5
+ LOCALE = L = LOCALEALPHA | LOCALENUM
+ IGNORECASE = IC = 1 << 6
+ LOWERCASEFIRST = LF = 1 << 7
+ GROUPLETTERS = G = 1 << 8
+ UNGROUPLETTERS = UG = 1 << 9
+ CAPITALFIRST = C = UNGROUPLETTERS
+ NANLAST = NL = 1 << 10
+ COMPATIBILITYNORMALIZE = CN = 1 << 11
# The below are private options for internal use only.
_NUMERIC_ONLY = REAL | NOEXP
diff --git a/natsort/utils.py b/natsort/utils.py
index c21d3b4..c33de1d 100644
--- a/natsort/utils.py
+++ b/natsort/utils.py
@@ -54,6 +54,7 @@ from itertools import chain as ichain
from collections import deque
from functools import partial, reduce
from operator import methodcaller
+from unicodedata import normalize
# Local imports.
from natsort.ns_enum import ns
@@ -118,6 +119,22 @@ def _no_op(x):
return x
+def _normalize_input_factory(alg):
+ """Create a function that will normalize unicode input data."""
+ normalization_form = 'NFKD' if alg & ns.COMPATIBILITYNORMALIZE else 'NFD'
+
+ if NEWPY:
+ return partial(normalize, normalization_form)
+ else:
+ def func(x):
+ """Normalize unicode input."""
+ if isinstance(x, py23_str): # unicode
+ return normalize(normalization_form, x)
+ else:
+ return x
+ return func
+
+
def _natsort_key(val, key, string_func, bytes_func, num_func):
"""\
Key to sort strings and numbers naturally.
@@ -208,11 +225,13 @@ def _parse_string_factory(alg, sep, splitter,
# sometimes after.
orig_after_xfrm = not (alg & ns._DUMB and alg & ns.LOCALEALPHA)
original_func = input_transform if orig_after_xfrm else _no_op
+ normalize_input = _normalize_input_factory(alg)
- def func(x, original_func=original_func):
+ def func(x):
# Apply string input transformation function and return to x.
# Original function is usually a no-op, but some algorithms require it
# to also be the transformation function.
+ x = normalize_input(x)
x, original = input_transform(x), original_func(x)
x = splitter(x) # Split string into components.
x = py23_filter(None, x) # Remove empty strings.
@@ -272,6 +291,7 @@ def _input_string_transform_factory(alg):
function_chain = []
if (dumb and not lowfirst) or (lowfirst and not dumb):
function_chain.append(methodcaller('swapcase'))
+
if alg & ns.IGNORECASE:
if NEWPY:
function_chain.append(methodcaller('casefold'))
diff --git a/test_natsort/slow_splitters.py b/test_natsort/slow_splitters.py
index 8a98948..aef329e 100644
--- a/test_natsort/slow_splitters.py
+++ b/test_natsort/slow_splitters.py
@@ -19,6 +19,7 @@ SplitElement = collections.namedtuple('SplitElement',
def int_splitter(iterable, signed, sep):
"""Alternate (slow) method to split a string into numbers."""
+ iterable = unicodedata.normalize('NFD', iterable)
split_by_digits = itertools.groupby(iterable, lambda a: a.isdigit())
split_by_digits = refine_split_grouping(split_by_digits)
split = int_splitter_iter(split_by_digits, signed)
@@ -32,6 +33,7 @@ def float_splitter(iterable, signed, exp, sep):
def number_tester(x):
return x.isdigit() or unicodedata.numeric(x, None) is not None
+ iterable = unicodedata.normalize('NFD', iterable)
split_by_digits = itertools.groupby(iterable, number_tester)
split_by_digits = peekable(refine_split_grouping(split_by_digits))
split = float_splitter_iter(split_by_digits, signed, exp)
diff --git a/test_natsort/test_natsorted.py b/test_natsort/test_natsorted.py
index 146997a..fcbf75b 100644
--- a/test_natsort/test_natsorted.py
+++ b/test_natsort/test_natsorted.py
@@ -80,8 +80,10 @@ def test_natsorted_returns_sorted_list_with_mixed_type_input_and_does_not_raise_
def test_natsorted_with_mixed_input_returns_sorted_results_without_error():
+ a = ['0', 'Á', '2', 'Z']
+ assert natsorted(a) == ['0', '2', 'Á', 'Z']
a = ['2', 'ä', 'b', 1.5, 3]
- assert natsorted(a) == [1.5, '2', 3, 'b', 'ä']
+ assert natsorted(a) == [1.5, '2', 3, 'ä', 'b']
def test_natsorted_with_nan_input_returns_sorted_results_with_nan_last_with_NANLAST():
@@ -240,7 +242,7 @@ def test_natsorted_with_LOCALE_and_de_setting_returns_results_sorted_by_de_langu
def test_natsorted_with_LOCALE_and_mixed_input_returns_sorted_results_without_error():
load_locale('en_US')
a = ['0', 'Á', '2', 'Z']
- assert natsorted(a) == ['0', '2', 'Z', 'Á']
+ assert natsorted(a, alg=ns.LOCALE) == ['0', '2', 'Á', 'Z']
a = ['2', 'ä', 'b', 1.5, 3]
assert natsorted(a, alg=ns.LOCALE) == [1.5, '2', 3, 'ä', 'b']
locale.setlocale(locale.LC_ALL, str(''))
@@ -249,16 +251,16 @@ def test_natsorted_with_LOCALE_and_mixed_input_returns_sorted_results_without_er
def test_natsorted_with_LOCALE_and_UNGROUPLETTERS_and_mixed_input_returns_sorted_results_without_error():
load_locale('en_US')
a = ['0', 'Á', '2', 'Z']
- assert natsorted(a, alg=ns.LOCALE | ns.UNGROUPLETTERS) == ['0', '2', 'Z', 'Á']
+ assert natsorted(a, alg=ns.LOCALE | ns.UNGROUPLETTERS) == ['0', '2', 'Á', 'Z']
a = ['2', 'ä', 'b', 1.5, 3]
- assert natsorted(a, alg=ns.LOCALE | ns.UNGROUPLETTERS) == [1.5, '2', 3, 'b', 'ä']
+ assert natsorted(a, alg=ns.LOCALE | ns.UNGROUPLETTERS) == [1.5, '2', 3, 'ä', 'b']
locale.setlocale(locale.LC_ALL, str(''))
def test_natsorted_with_PATH_and_LOCALE_and_UNGROUPLETTERS_and_mixed_input_returns_sorted_results_without_error():
load_locale('en_US')
a = ['0', 'Á', '2', 'Z']
- assert natsorted(a, alg=ns.PATH | ns.LOCALE | ns.UNGROUPLETTERS) == ['0', '2', 'Z', 'Á']
+ assert natsorted(a, alg=ns.PATH | ns.LOCALE | ns.UNGROUPLETTERS) == ['0', '2', 'Á', 'Z']
a = ['2', 'ä', 'b', 1.5, 3]
- assert natsorted(a, alg=ns.PATH | ns.LOCALE | ns.UNGROUPLETTERS) == [1.5, '2', 3, 'b', 'ä']
+ assert natsorted(a, alg=ns.PATH | ns.LOCALE | ns.UNGROUPLETTERS) == [1.5, '2', 3, 'ä', 'b']
locale.setlocale(locale.LC_ALL, str(''))
diff --git a/test_natsort/test_utils.py b/test_natsort/test_utils.py
index 934757a..f1cffa2 100644
--- a/test_natsort/test_utils.py
+++ b/test_natsort/test_utils.py
@@ -149,6 +149,7 @@ def test_ns_enum_values_have_are_as_expected():
assert ns.CAPITALFIRST == ns.C
assert ns.UNGROUPLETTERS == ns.CAPITALFIRST
assert ns.NANLAST == ns.NL
+ assert ns.COMPATIBILITYNORMALIZE == ns.CN
# Convenience
assert ns.LOCALE == ns.LOCALEALPHA | ns.LOCALENUM