summary | refs | log | tree | commit | diff
path: root/natsort/natsort.py
diff options
context:
space:
mode:
Diffstat (limited to 'natsort/natsort.py')
-rw-r--r-- natsort/natsort.py 396
1 files changed, 9 insertions, 387 deletions
diff --git a/natsort/natsort.py b/natsort/natsort.py
index 82b84df..d3d6f8a 100644
--- a/natsort/natsort.py
+++ b/natsort/natsort.py
@@ -15,400 +15,20 @@ See the README or the natsort homepage for more details.
from __future__ import (print_function, division,
unicode_literals, absolute_import)
-import re
-from os import curdir, pardir
-from os.path import split, splitext
+# Std lib. imports.
from operator import itemgetter
from functools import partial
-from itertools import islice
from warnings import warn
-from locale import localeconv
-# If the user has fastnumbers installed, they will get great speed
-# benefits. If not, we simulate the functions here.
-try:
- from fastnumbers import fast_float, fast_int, isreal
-except ImportError:
- from .fake_fastnumbers import fast_float, fast_int, isreal
-from .locale_help import locale_convert, grouper, lowercase, swapcase
-from .py23compat import u_format, py23_str, py23_zip
+# Local imports.
+from natsort.utils import _natsort_key, _args_to_enum
+from natsort.ns_enum import ns
+from natsort.py23compat import u_format
# Make sure the doctest works for either python2 or python3
__doc__ = u_format(__doc__)
-class ns(object):
- """
- Enum to control the `natsort` algorithm.
-
- This class acts like an enum to control the `natsort` algorithm. The
- user may select several options simultaneously by or'ing the options
- together. For example, to choose ``ns.INT``, ``ns.PATH``, and
- ``ns.LOCALE``, you could do ``ns.INT | ns.LOCALE | ns.PATH``.
-
- Each option has a shortened 1- or 2-letter form.
-
- .. warning:: On some systems, the underlying C library that
- Python's locale module uses is broken. On these
- systems it is recommended that you install
- `PyICU <https://pypi.python.org/pypi/PyICU>`_
- if you wish to use `LOCALE`.
- Please validate that `LOCALE` works as
- expected on your target system, and if not you
- should add
- `PyICU <https://pypi.python.org/pypi/PyICU>`_
- as a dependency.
-
- Attributes
- ----------
- FLOAT, F
- The default - parse numbers as floats.
- INT, I
- Tell `natsort` to parse numbers as ints.
- UNSIGNED, U
- Tell `natsort` to ignore any sign (i.e. "-" or "+") to the
- immediate left of a number. It is the same as setting the old
- `signed` option to `False`.
- VERSION, V
- This is a shortcut for ``ns.INT | ns.UNSIGNED``, which is useful
- when attempting to sort version numbers. It is the same as
- setting the old `number_type` option to `None`.
- DIGIT, D
- Same as `VERSION` above.
- NOEXP, N
- Tell `natsort` to not search for exponents as part of the number.
- For example, with `NOEXP` the number "5.6E5" would be interpreted
- as `5.6`, `"E"`, and `5`. It is the same as setting the old `exp`
- option to `False`.
- PATH, P
- Tell `natsort` to interpret strings as filesystem paths, so they
- will be split according to the filesystem separator
- (i.e. ‘/’ on UNIX, ‘\’ on Windows), as well as splitting on the
- file extension, if any. Without this, lists of file paths like
- ``['Folder/', 'Folder (1)/', 'Folder (10)/']`` will not be sorted
- properly; 'Folder/' will be placed at the end, not at the front.
- It is the same as setting the old `as_path` option to `True`.
- LOCALE, L
- Tell `natsort` to be locale-aware when sorting strings (everything
- that was not converted to a number). Your sorting results will vary
- depending on your current locale. Generally, the `GROUPLETTERS`
- option is needed with `LOCALE` because the `locale` library
- groups the letters in the same manner (although you may still
- need `GROUPLETTERS` if there are numbers in your strings).
- IGNORECASE, IC
- Tell `natsort` to ignore case when sorting. For example,
- ``['Banana', 'apple', 'banana', 'Apple']`` would be sorted as
- ``['apple', 'Apple', 'Banana', 'banana']``.
- LOWERCASEFIRST, LF
- Tell `natsort` to put lowercase letters before uppercase letters
- when sorting. For example,
- ``['Banana', 'apple', 'banana', 'Apple']`` would be sorted as
- ``['apple', 'banana', 'Apple', 'Banana']`` (the default order
- would be ``['Apple', 'Banana', 'apple', 'banana']`` which is
- the order from a purely ordinal sort).
- Useless when used with `IGNORECASE`.
- GROUPLETTERS, G
- Tell `natsort` to group lowercase and uppercase letters together
- when sorting. For example,
- ``['Banana', 'apple', 'banana', 'Apple']`` would be sorted as
- ``['Apple', 'apple', 'Banana', 'banana']``.
- Useless when used with `IGNORECASE`; use with `LOWERCASEFIRST`
- to reverse the order of upper and lower case.
- TYPESAFE, T
- Try hard to avoid "unorderable types" error on Python 3. It
- is the same as setting the old `py3_safe` option to `True`.
-
- Notes
- -----
- If using `LOCALE`, you may find that if you do not explicitly set
- the locale your results may not be as you expect... I have found that
- it depends on the system you are on. To do this is straightforward
- (in the below example I use 'en_US.UTF-8', but you should use your
- locale)::
-
- >>> import locale
- >>> # The 'str' call is only to get around a bug on Python 2.x
- >>> # where 'setlocale' does not expect unicode strings (ironic,
- >>> # right?)
- >>> locale.setlocale(locale.LC_ALL, str('en_US.UTF-8'))
- 'en_US.UTF-8'
-
- It is preferred that you do this before importing `natsort`.
- If you use `PyICU <https://pypi.python.org/pypi/PyICU>`_ (see warning
- above) then you should not need to do this.
-
- """
- pass
-
-
-# Sort algorithm "enum" values.
-_nsdict = {'FLOAT': 0, 'F': 0,
- 'INT': 1, 'I': 1,
- 'UNSIGNED': 2, 'U': 2,
- 'VERSION': 3, 'V': 3, # Shortcut for INT | UNSIGNED
- 'DIGIT': 3, 'D': 3, # Shortcut for INT | UNSIGNED
- 'NOEXP': 4, 'N': 4,
- 'PATH': 8, 'P': 8,
- 'LOCALE': 16, 'L': 16,
- 'IGNORECASE': 32, 'IC': 32,
- 'LOWERCASEFIRST': 64, 'LF': 64,
- 'GROUPLETTERS': 128, 'G': 128,
- 'TYPESAFE': 1024, 'T': 1024,
- }
-# Populate the ns class with the _nsdict values.
-for x, y in _nsdict.items():
- setattr(ns, x, y)
-
-# Group algorithm types for easy extraction
-_NUMBER_ALGORITHMS = ns.FLOAT | ns.INT | ns.UNSIGNED | ns.NOEXP
-_CASE_ALGORITHMS = ns.IGNORECASE | ns.LOWERCASEFIRST | ns.GROUPLETTERS
-_ALL_BUT_PATH = (ns.F | ns.I | ns.U | ns.N | ns.L |
- ns.IC | ns.LF | ns.G | ns.TYPESAFE)
-
-# The regex that locates floats
-_float_sign_exp_re = re.compile(r'([-+]?\d*\.?\d+(?:[eE][-+]?\d+)?)', re.U)
-_float_nosign_exp_re = re.compile(r'(\d*\.?\d+(?:[eE][-+]?\d+)?)', re.U)
-_float_sign_noexp_re = re.compile(r'([-+]?\d*\.?\d+)', re.U)
-_float_nosign_noexp_re = re.compile(r'(\d*\.?\d+)', re.U)
-_float_sign_exp_re_c = re.compile(r'([-+]?\d*[.,]?\d+(?:[eE][-+]?\d+)?)', re.U)
-_float_nosign_exp_re_c = re.compile(r'(\d*[.,]?\d+(?:[eE][-+]?\d+)?)', re.U)
-_float_sign_noexp_re_c = re.compile(r'([-+]?\d*[.,]?\d+)', re.U)
-_float_nosign_noexp_re_c = re.compile(r'(\d*[.,]?\d+)', re.U)
-
-# Integer regexes
-_int_nosign_re = re.compile(r'(\d+)', re.U)
-_int_sign_re = re.compile(r'([-+]?\d+)', re.U)
-
-# This dict will help select the correct regex and number conversion function.
-_regex_and_num_function_chooser = {
- (ns.F, '.'): (_float_sign_exp_re, fast_float),
- (ns.F | ns.N, '.'): (_float_sign_noexp_re, fast_float),
- (ns.F | ns.U, '.'): (_float_nosign_exp_re, fast_float),
- (ns.F | ns.U | ns.N, '.'): (_float_nosign_noexp_re, fast_float),
- (ns.I, '.'): (_int_sign_re, fast_int),
- (ns.I | ns.N, '.'): (_int_sign_re, fast_int),
- (ns.I | ns.U, '.'): (_int_nosign_re, fast_int),
- (ns.I | ns.U | ns.N, '.'): (_int_nosign_re, fast_int),
- (ns.F, ','): (_float_sign_exp_re_c, fast_float),
- (ns.F | ns.N, ','): (_float_sign_noexp_re_c, fast_float),
- (ns.F | ns.U, ','): (_float_nosign_exp_re_c, fast_float),
- (ns.F | ns.U | ns.N, ','): (_float_nosign_noexp_re_c, fast_float),
- (ns.I, ','): (_int_sign_re, fast_int),
- (ns.I | ns.N, ','): (_int_sign_re, fast_int),
- (ns.I | ns.U, ','): (_int_nosign_re, fast_int),
- (ns.I | ns.U | ns.N, ','): (_int_nosign_re, fast_int),
-}
-
-
-def _args_to_enum(number_type, signed, exp, as_path, py3_safe):
- """A function to convert input booleans to an enum-type argument."""
- alg = 0
- if number_type is not float:
- msg = "The 'number_type' argument is depreciated as of 3.5.0, "
- msg += "please use 'alg=ns.FLOAT', 'alg=ns.INT', or 'alg=ns.VERSION'"
- warn(msg, DeprecationWarning)
- alg |= (_nsdict['INT'] * bool(number_type in (int, None)))
- alg |= (_nsdict['UNSIGNED'] * (number_type is None))
- if signed is not None:
- msg = "The 'signed' argument is depreciated as of 3.5.0, "
- msg += "please use 'alg=ns.UNSIGNED'."
- warn(msg, DeprecationWarning)
- alg |= (_nsdict['UNSIGNED'] * (not signed))
- if exp is not None:
- msg = "The 'exp' argument is depreciated as of 3.5.0, "
- msg += "please use 'alg=ns.NOEXP'."
- warn(msg, DeprecationWarning)
- alg |= (_nsdict['NOEXP'] * (not exp))
- if as_path is not None:
- msg = "The 'as_path' argument is depreciated as of 3.5.0, "
- msg += "please use 'alg=ns.PATH'."
- warn(msg, DeprecationWarning)
- alg |= (_nsdict['PATH'] * as_path)
- if py3_safe is not None:
- msg = "The 'py3_safe' argument is depreciated as of 3.5.0, "
- msg += "please use 'alg=ns.TYPESAFE'."
- warn(msg, DeprecationWarning)
- alg |= (_nsdict['TYPESAFE'] * py3_safe)
- return alg
-
-
-def _input_parser(s, regex, numconv, py3_safe, use_locale, group_letters):
- """Helper to parse the string input into numbers and strings."""
-
- # Split the input string by numbers.
- # If the input is not a string, TypeError is raised.
- s = regex.split(s)
-
- # Now convert the numbers to numbers, and leave strings as strings.
- # Take into account locale if needed, and group letters if needed.
- # Remove empty strings from the list.
- if use_locale:
- s = [locale_convert(x, numconv, group_letters) for x in s if x]
- elif group_letters:
- s = [grouper(x, numconv) for x in s if x]
- else:
- s = [numconv(x) for x in s if x]
-
- # If the list begins with a number, lead with an empty string.
- # This is used to get around the "unorderable types" issue.
- if not s: # Return empty tuple for empty results.
- return ()
- elif isreal(s[0]):
- s = [''] + s
-
- # The _py3_safe function inserts "" between numbers in the list,
- # and is used to get around "unorderable types" in complex cases.
- # It is a separate function that needs to be requested specifically
- # because it is expensive to call.
- return _py3_safe(s) if py3_safe else s
-
-
-def _path_splitter(s, _d_match=re.compile(r'\.\d').match):
- """Split a string into its path components. Assumes a string is a path."""
- path_parts = []
- p_append = path_parts.append
- path_location = s
-
- # Continue splitting the path from the back until we have reached
- # '..' or '.', or until there is nothing left to split.
- while path_location != curdir and path_location != pardir:
- parent_path = path_location
- path_location, child_path = split(parent_path)
- if path_location == parent_path:
- break
- p_append(child_path)
-
- # This last append is the base path.
- # Only append if the string is non-empty.
- if path_location:
- p_append(path_location)
-
- # We created this list in reversed order, so we now correct the order.
- path_parts.reverse()
-
- # Now, split off the file extensions using a similar method to above.
- # Continue splitting off file extensions until we reach a decimal number
- # or there are no more extensions.
- base = path_parts.pop()
- base_parts = []
- b_append = base_parts.append
- while True:
- front = base
- base, ext = splitext(front)
- if _d_match(ext) or not ext:
- # Reset base to before the split if the split is invalid.
- base = front
- break
- b_append(ext)
- b_append(base)
- base_parts.reverse()
-
- # Return the split parent paths and then the split basename.
- return path_parts + base_parts
-
-
-def _py3_safe(parsed_list):
- """Insert '' between two numbers."""
- length = len(parsed_list)
- if length < 2:
- return parsed_list
- else:
- new_list = [parsed_list[0]]
- nl_append = new_list.append
- for before, after in py23_zip(islice(parsed_list, 0, length-1),
- islice(parsed_list, 1, None)):
- if isreal(before) and isreal(after):
- nl_append("")
- nl_append(after)
- return new_list
-
-
-def _natsort_key(val, key, alg):
- """\
- Key to sort strings and numbers naturally.
-
- It works by separating out the numbers from the strings. This function for
- internal use only. See the natsort_keygen documentation for details of each
- parameter.
-
- Parameters
- ----------
- val : {str, unicode}
- key : callable
- alg : ns enum
-
- Returns
- -------
- out : tuple
- The modified value with numbers extracted.
-
- """
-
- # Convert the arguments to the proper input tuple
- try:
- use_locale = alg & _nsdict['LOCALE']
- inp_options = (alg & _NUMBER_ALGORITHMS,
- localeconv()['decimal_point'] if use_locale else '.')
- except TypeError:
- msg = "_natsort_key: 'alg' argument must be from the enum 'ns'"
- raise ValueError(msg+', got {0}'.format(py23_str(alg)))
-
- # Get the proper regex and conversion function.
- try:
- regex, num_function = _regex_and_num_function_chooser[inp_options]
- except KeyError: # pragma: no cover
- if inp_options[1] not in ('.', ','): # pragma: no cover
- raise ValueError("_natsort_key: currently natsort only supports "
- "the decimal separators '.' and ','. "
- "Please file a bug report.")
- else:
- raise
- else:
- # Apply key if needed.
- if key is not None:
- val = key(val)
-
- # If this is a path, convert it.
- # An AttributeError is raised if not a string.
- split_as_path = False
- if alg & _nsdict['PATH']:
- try:
- val = _path_splitter(val)
- except AttributeError:
- pass
- else:
- # Record that this string was split as a path so that
- # we don't set PATH in the recursive call.
- split_as_path = True
-
- # Assume the input are strings, which is the most common case.
- # Apply the string modification if needed.
- try:
- if alg & _nsdict['LOWERCASEFIRST']:
- val = swapcase(val)
- if alg & _nsdict['IGNORECASE']:
- val = lowercase(val)
- return tuple(_input_parser(val,
- regex,
- num_function,
- alg & _nsdict['TYPESAFE'],
- use_locale,
- alg & _nsdict['GROUPLETTERS']))
- except TypeError:
- # If not strings, assume it is an iterable that must
- # be parsed recursively. Do not apply the key recursively.
- # If this string was split as a path, turn off 'PATH'.
- try:
- was_path = alg & _nsdict['PATH']
- newalg = alg & _ALL_BUT_PATH
- newalg |= (was_path * (not split_as_path))
- return tuple([_natsort_key(x, None, newalg) for x in val])
- # If there is still an error, it must be a number.
- # Return as-is, with a leading empty string.
- except TypeError:
- return (('', val,),) if alg & _nsdict['PATH'] else ('', val,)
-
-
@u_format
def natsort_key(val, key=None, number_type=float, signed=None, exp=None,
as_path=None, py3_safe=None, alg=0):
@@ -689,7 +309,8 @@ def natsorted(seq, key=None, number_type=float, signed=None, exp=None,
See Also
--------
natsort_keygen : Generates the key that makes natural sorting possible.
- versorted : A wrapper for ``natsorted(seq, number_type=None)``.
+ versorted : A wrapper for ``natsorted(seq, alg=ns.VERSION)``.
+ humansorted : A wrapper for ``natsorted(seq, alg=ns.LOCALE)``.
index_natsorted : Returns the sorted indexes from `natsorted`.
Examples
@@ -711,7 +332,7 @@ def natsorted(seq, key=None, number_type=float, signed=None, exp=None,
if 'unorderable types' in str(e):
return sorted(seq, reverse=reverse,
key=natsort_keygen(key,
- alg=alg | _nsdict['TYPESAFE']))
+ alg=alg | ns.TYPESAFE))
else:
# Re-raise if the problem was not "unorderable types"
raise
@@ -1128,6 +749,7 @@ def order_by_index(seq, index, iter=False):
--------
index_natsorted
index_versorted
+ index_humansorted
Examples
--------