diff options
Diffstat (limited to 'natsort/natsort.py')
-rw-r--r-- | natsort/natsort.py | 396 |
1 files changed, 9 insertions, 387 deletions
diff --git a/natsort/natsort.py b/natsort/natsort.py index 82b84df..d3d6f8a 100644 --- a/natsort/natsort.py +++ b/natsort/natsort.py @@ -15,400 +15,20 @@ See the README or the natsort homepage for more details. from __future__ import (print_function, division, unicode_literals, absolute_import) -import re -from os import curdir, pardir -from os.path import split, splitext +# Std lib. imports. from operator import itemgetter from functools import partial -from itertools import islice from warnings import warn -from locale import localeconv -# If the user has fastnumbers installed, they will get great speed -# benefits. If not, we simulate the functions here. -try: - from fastnumbers import fast_float, fast_int, isreal -except ImportError: - from .fake_fastnumbers import fast_float, fast_int, isreal -from .locale_help import locale_convert, grouper, lowercase, swapcase -from .py23compat import u_format, py23_str, py23_zip +# Local imports. +from natsort.utils import _natsort_key, _args_to_enum +from natsort.ns_enum import ns +from natsort.py23compat import u_format # Make sure the doctest works for either python2 or python3 __doc__ = u_format(__doc__) -class ns(object): - """ - Enum to control the `natsort` algorithm. - - This class acts like an enum to control the `natsort` algorithm. The - user may select several options simultaneously by or'ing the options - together. For example, to choose ``ns.INT``, `ns.PATH``, and - ``ns.LOCALE``, you could do ``ns.INT | ns.LOCALE | ns.PATH``. - - Each option has a shortened 1- or 2-letter form. - - .. warning:: On some systems, the underlying C library that - Python's locale module uses is broken. On these - systems it is recommended that you install - `PyICU <https://pypi.python.org/pypi/PyICU>`_ - if you wish to use `LOCALE`. - Please validate that `LOCALE` works as - expected on your target system, and if not you - should add - `PyICU <https://pypi.python.org/pypi/PyICU>`_ - as a dependency. - - Attributes - ---------- - FLOAT, F - The default - parse numbers as floats. - INT, I - Tell `natsort` to parse numbers as ints. - UNSIGNED, U - Tell `natsort` to ignore any sign (i.e. "-" or "+") to the - immediate left of a number. It is the same as setting the old - `signed` option to `False`. - VERSION, V - This is a shortcut for ``ns.INT | ns.UNSIGNED``, which is useful - when attempting to sort version numbers. It is the same as - setting the old `number_type` option to `None`. - DIGIT, D - Same as `VERSION` above. - NOEXP, N - Tell `natsort` to not search for exponents as part of the number. - For example, with `NOEXP` the number "5.6E5" would be interpreted - as `5.6`, `"E"`, and `5`. It is the same as setting the old `exp` - option to `False`. - PATH, P - Tell `natsort` to interpret strings as filesystem paths, so they - will be split according to the filesystem separator - (i.e. ‘/’ on UNIX, ‘\’ on Windows), as well as splitting on the - file extension, if any. Without this, lists of file paths like - ``['Folder/', 'Folder (1)/', 'Folder (10)/']`` will not be sorted - properly; 'Folder/' will be placed at the end, not at the front. - It is the same as setting the old `as_path` option to `True`. - LOCALE, L - Tell `natsort` to be locale-aware when sorting strings (everything - that was not converted to a number). Your sorting results will vary - depending on your current locale. Generally, the `GROUPLETTERS` - option is needed with `LOCALE` because the `locale` library - groups the letters in the same manner (although you may still - need `GROUPLETTERS` if there are numbers in your strings). - IGNORECASE, IC - Tell `natsort` to ignore case when sorting. For example, - ``['Banana', 'apple', 'banana', 'Apple']`` would be sorted as - ``['apple', 'Apple', 'Banana', 'banana']``. - LOWERCASEFIRST, LF - Tell `natsort` to put lowercase letters before uppercase letters - when sorting. For example, - ``['Banana', 'apple', 'banana', 'Apple']`` would be sorted as - ``['apple', 'banana', 'Apple', 'Banana']`` (the default order - would be ``['Apple', 'Banana', 'apple', 'banana']`` which is - the order from a purely ordinal sort). - Useless when used with `IGNORECASE`. - GROUPLETTERS, G - Tell `natsort` to group lowercase and uppercase letters together - when sorting. For example, - ``['Banana', 'apple', 'banana', 'Apple']`` would be sorted as - ``['Apple', 'apple', 'Banana', 'banana']``. - Useless when used with `IGNORECASE`; use with `LOWERCASEFIRST` - to reverse the order of upper and lower case. - TYPESAFE, T - Try hard to avoid "unorderable types" error on Python 3. It - is the same as setting the old `py3_safe` option to `True`. - - Notes - ----- - If using `LOCALE`, you may find that if you do not explicitly set - the locale your results may not be as you expect... I have found that - it depends on the system you are on. To do this is straightforward - (in the below example I use 'en_US.UTF-8', but you should use your - locale):: - - >>> import locale - >>> # The 'str' call is only to get around a bug on Python 2.x - >>> # where 'setlocale' does not expect unicode strings (ironic, - >>> # right?) - >>> locale.setlocale(locale.LC_ALL, str('en_US.UTF-8')) - 'en_US.UTF-8' - - It is preferred that you do this before importing `natsort`. - If you use `PyICU <https://pypi.python.org/pypi/PyICU>`_ (see warning - above) then you should not need to do this. - - """ - pass - - -# Sort algorithm "enum" values. -_nsdict = {'FLOAT': 0, 'F': 0, - 'INT': 1, 'I': 1, - 'UNSIGNED': 2, 'U': 2, - 'VERSION': 3, 'V': 3, # Shortcut for INT | UNSIGNED - 'DIGIT': 3, 'D': 3, # Shortcut for INT | UNSIGNED - 'NOEXP': 4, 'N': 4, - 'PATH': 8, 'P': 8, - 'LOCALE': 16, 'L': 16, - 'IGNORECASE': 32, 'IC': 32, - 'LOWERCASEFIRST': 64, 'LF': 64, - 'GROUPLETTERS': 128, 'G': 128, - 'TYPESAFE': 1024, 'T': 1024, - } -# Populate the ns class with the _nsdict values. -for x, y in _nsdict.items(): - setattr(ns, x, y) - -# Group algorithm types for easy extraction -_NUMBER_ALGORITHMS = ns.FLOAT | ns.INT | ns.UNSIGNED | ns.NOEXP -_CASE_ALGORITHMS = ns.IGNORECASE | ns.LOWERCASEFIRST | ns.GROUPLETTERS -_ALL_BUT_PATH = (ns.F | ns.I | ns.U | ns.N | ns.L | - ns.IC | ns.LF | ns.G | ns.TYPESAFE) - -# The regex that locates floats -_float_sign_exp_re = re.compile(r'([-+]?\d*\.?\d+(?:[eE][-+]?\d+)?)', re.U) -_float_nosign_exp_re = re.compile(r'(\d*\.?\d+(?:[eE][-+]?\d+)?)', re.U) -_float_sign_noexp_re = re.compile(r'([-+]?\d*\.?\d+)', re.U) -_float_nosign_noexp_re = re.compile(r'(\d*\.?\d+)', re.U) -_float_sign_exp_re_c = re.compile(r'([-+]?\d*[.,]?\d+(?:[eE][-+]?\d+)?)', re.U) -_float_nosign_exp_re_c = re.compile(r'(\d*[.,]?\d+(?:[eE][-+]?\d+)?)', re.U) -_float_sign_noexp_re_c = re.compile(r'([-+]?\d*[.,]?\d+)', re.U) -_float_nosign_noexp_re_c = re.compile(r'(\d*[.,]?\d+)', re.U) - -# Integer regexes -_int_nosign_re = re.compile(r'(\d+)', re.U) -_int_sign_re = re.compile(r'([-+]?\d+)', re.U) - -# This dict will help select the correct regex and number conversion function. -_regex_and_num_function_chooser = { - (ns.F, '.'): (_float_sign_exp_re, fast_float), - (ns.F | ns.N, '.'): (_float_sign_noexp_re, fast_float), - (ns.F | ns.U, '.'): (_float_nosign_exp_re, fast_float), - (ns.F | ns.U | ns.N, '.'): (_float_nosign_noexp_re, fast_float), - (ns.I, '.'): (_int_sign_re, fast_int), - (ns.I | ns.N, '.'): (_int_sign_re, fast_int), - (ns.I | ns.U, '.'): (_int_nosign_re, fast_int), - (ns.I | ns.U | ns.N, '.'): (_int_nosign_re, fast_int), - (ns.F, ','): (_float_sign_exp_re_c, fast_float), - (ns.F | ns.N, ','): (_float_sign_noexp_re_c, fast_float), - (ns.F | ns.U, ','): (_float_nosign_exp_re_c, fast_float), - (ns.F | ns.U | ns.N, ','): (_float_nosign_noexp_re_c, fast_float), - (ns.I, ','): (_int_sign_re, fast_int), - (ns.I | ns.N, ','): (_int_sign_re, fast_int), - (ns.I | ns.U, ','): (_int_nosign_re, fast_int), - (ns.I | ns.U | ns.N, ','): (_int_nosign_re, fast_int), -} - - -def _args_to_enum(number_type, signed, exp, as_path, py3_safe): - """A function to convert input booleans to an enum-type argument.""" - alg = 0 - if number_type is not float: - msg = "The 'number_type' argument is depreciated as of 3.5.0, " - msg += "please use 'alg=ns.FLOAT', 'alg=ns.INT', or 'alg=ns.VERSION'" - warn(msg, DeprecationWarning) - alg |= (_nsdict['INT'] * bool(number_type in (int, None))) - alg |= (_nsdict['UNSIGNED'] * (number_type is None)) - if signed is not None: - msg = "The 'signed' argument is depreciated as of 3.5.0, " - msg += "please use 'alg=ns.UNSIGNED'." - warn(msg, DeprecationWarning) - alg |= (_nsdict['UNSIGNED'] * (not signed)) - if exp is not None: - msg = "The 'exp' argument is depreciated as of 3.5.0, " - msg += "please use 'alg=ns.NOEXP'." - warn(msg, DeprecationWarning) - alg |= (_nsdict['NOEXP'] * (not exp)) - if as_path is not None: - msg = "The 'as_path' argument is depreciated as of 3.5.0, " - msg += "please use 'alg=ns.PATH'." - warn(msg, DeprecationWarning) - alg |= (_nsdict['PATH'] * as_path) - if py3_safe is not None: - msg = "The 'py3_safe' argument is depreciated as of 3.5.0, " - msg += "please use 'alg=ns.TYPESAFE'." - warn(msg, DeprecationWarning) - alg |= (_nsdict['TYPESAFE'] * py3_safe) - return alg - - -def _input_parser(s, regex, numconv, py3_safe, use_locale, group_letters): - """Helper to parse the string input into numbers and strings.""" - - # Split the input string by numbers. - # If the input is not a string, TypeError is raised. - s = regex.split(s) - - # Now convert the numbers to numbers, and leave strings as strings. - # Take into account locale if needed, and group letters if needed. - # Remove empty strings from the list. - if use_locale: - s = [locale_convert(x, numconv, group_letters) for x in s if x] - elif group_letters: - s = [grouper(x, numconv) for x in s if x] - else: - s = [numconv(x) for x in s if x] - - # If the list begins with a number, lead with an empty string. - # This is used to get around the "unorderable types" issue. - if not s: # Return empty tuple for empty results. - return () - elif isreal(s[0]): - s = [''] + s - - # The _py3_safe function inserts "" between numbers in the list, - # and is used to get around "unorderable types" in complex cases. - # It is a separate function that needs to be requested specifically - # because it is expensive to call. - return _py3_safe(s) if py3_safe else s - - -def _path_splitter(s, _d_match=re.compile(r'\.\d').match): - """Split a string into its path components. Assumes a string is a path.""" - path_parts = [] - p_append = path_parts.append - path_location = s - - # Continue splitting the path from the back until we have reached - # '..' or '.', or until there is nothing left to split. - while path_location != curdir and path_location != pardir: - parent_path = path_location - path_location, child_path = split(parent_path) - if path_location == parent_path: - break - p_append(child_path) - - # This last append is the base path. - # Only append if the string is non-empty. - if path_location: - p_append(path_location) - - # We created this list in reversed order, so we now correct the order. - path_parts.reverse() - - # Now, split off the file extensions using a similar method to above. - # Continue splitting off file extensions until we reach a decimal number - # or there are no more extensions. - base = path_parts.pop() - base_parts = [] - b_append = base_parts.append - while True: - front = base - base, ext = splitext(front) - if _d_match(ext) or not ext: - # Reset base to before the split if the split is invalid. - base = front - break - b_append(ext) - b_append(base) - base_parts.reverse() - - # Return the split parent paths and then the split basename. - return path_parts + base_parts - - -def _py3_safe(parsed_list): - """Insert '' between two numbers.""" - length = len(parsed_list) - if length < 2: - return parsed_list - else: - new_list = [parsed_list[0]] - nl_append = new_list.append - for before, after in py23_zip(islice(parsed_list, 0, length-1), - islice(parsed_list, 1, None)): - if isreal(before) and isreal(after): - nl_append("") - nl_append(after) - return new_list - - -def _natsort_key(val, key, alg): - """\ - Key to sort strings and numbers naturally. - - It works by separating out the numbers from the strings. This function for - internal use only. See the natsort_keygen documentation for details of each - parameter. - - Parameters - ---------- - val : {str, unicode} - key : callable - alg : ns enum - - Returns - ------- - out : tuple - The modified value with numbers extracted. - - """ - - # Convert the arguments to the proper input tuple - try: - use_locale = alg & _nsdict['LOCALE'] - inp_options = (alg & _NUMBER_ALGORITHMS, - localeconv()['decimal_point'] if use_locale else '.') - except TypeError: - msg = "_natsort_key: 'alg' argument must be from the enum 'ns'" - raise ValueError(msg+', got {0}'.format(py23_str(alg))) - - # Get the proper regex and conversion function. - try: - regex, num_function = _regex_and_num_function_chooser[inp_options] - except KeyError: # pragma: no cover - if inp_options[1] not in ('.', ','): # pragma: no cover - raise ValueError("_natsort_key: currently natsort only supports " - "the decimal separators '.' and ','. " - "Please file a bug report.") - else: - raise - else: - # Apply key if needed. - if key is not None: - val = key(val) - - # If this is a path, convert it. - # An AttrubuteError is raised if not a string. - split_as_path = False - if alg & _nsdict['PATH']: - try: - val = _path_splitter(val) - except AttributeError: - pass - else: - # Record that this string was split as a path so that - # we don't set PATH in the recursive call. - split_as_path = True - - # Assume the input are strings, which is the most common case. - # Apply the string modification if needed. - try: - if alg & _nsdict['LOWERCASEFIRST']: - val = swapcase(val) - if alg & _nsdict['IGNORECASE']: - val = lowercase(val) - return tuple(_input_parser(val, - regex, - num_function, - alg & _nsdict['TYPESAFE'], - use_locale, - alg & _nsdict['GROUPLETTERS'])) - except TypeError: - # If not strings, assume it is an iterable that must - # be parsed recursively. Do not apply the key recursively. - # If this string was split as a path, turn off 'PATH'. - try: - was_path = alg & _nsdict['PATH'] - newalg = alg & _ALL_BUT_PATH - newalg |= (was_path * (not split_as_path)) - return tuple([_natsort_key(x, None, newalg) for x in val]) - # If there is still an error, it must be a number. - # Return as-is, with a leading empty string. - except TypeError: - return (('', val,),) if alg & _nsdict['PATH'] else ('', val,) - - @u_format def natsort_key(val, key=None, number_type=float, signed=None, exp=None, as_path=None, py3_safe=None, alg=0): @@ -689,7 +309,8 @@ def natsorted(seq, key=None, number_type=float, signed=None, exp=None, See Also -------- natsort_keygen : Generates the key that makes natural sorting possible. - versorted : A wrapper for ``natsorted(seq, number_type=None)``. + versorted : A wrapper for ``natsorted(seq, alg=ns.VERSION)``. + humansorted : A wrapper for ``natsorted(seq, alg=ns.LOCALE)``. index_natsorted : Returns the sorted indexes from `natsorted`. Examples @@ -711,7 +332,7 @@ def natsorted(seq, key=None, number_type=float, signed=None, exp=None, if 'unorderable types' in str(e): return sorted(seq, reverse=reverse, key=natsort_keygen(key, - alg=alg | _nsdict['TYPESAFE'])) + alg=alg | ns.TYPESAFE)) else: # Re-raise if the problem was not "unorderable types" raise @@ -1128,6 +749,7 @@ def order_by_index(seq, index, iter=False): -------- index_natsorted index_versorted + index_humansorted Examples -------- |