summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Seth M Morton <seth.m.morton@gmail.com> 2014-09-25 19:52:58 -0700
committer: Seth M Morton <seth.m.morton@gmail.com> 2014-09-25 19:52:58 -0700
commit: 9ae05d5847e5487d833562e5c8adea774d931749 (patch)
tree: 8d26659b88e79cb40ec39a9bd491e19c774e5b96
parent: 9fdd503da47f0207a67528d26cb44482f0456047 (diff)
download: natsort-9ae05d5847e5487d833562e5c8adea774d931749.tar.gz
Refactored utility functions out of natsort.py.
The utility functions and variables have been refactored out of the natsort.py module and into the utils.py module. This has the benefit of placing only user-facing code in natsort.py, and also making it more obvious that most of the utility functions went into building the _natsort_key, and were not used by the user-facing functions. The hope is that this makes natsort easier to maintain.
-rw-r--r-- natsort/__main__.py 3
-rw-r--r-- natsort/natsort.py 390
-rw-r--r-- natsort/ns_enum.py 128
-rw-r--r-- natsort/utils.py 277
-rw-r--r-- test_natsort/test_natsort.py 9
5 files changed, 415 insertions, 392 deletions
diff --git a/natsort/__main__.py b/natsort/__main__.py
index f55c3fe..03c4366 100644
--- a/natsort/__main__.py
+++ b/natsort/__main__.py
@@ -4,7 +4,8 @@ from __future__ import (print_function, division,
import sys
-from .natsort import natsorted, _regex_and_num_function_chooser, ns
+from .natsort import natsorted, ns
+from .utils import _regex_and_num_function_chooser
from ._version import __version__
from .py23compat import py23_str
diff --git a/natsort/natsort.py b/natsort/natsort.py
index 76d75f6..e448356 100644
--- a/natsort/natsort.py
+++ b/natsort/natsort.py
@@ -15,400 +15,18 @@ See the README or the natsort homepage for more details.
from __future__ import (print_function, division,
unicode_literals, absolute_import)
-import re
-from os import curdir, pardir
-from os.path import split, splitext
from operator import itemgetter
from functools import partial
-from itertools import islice
from warnings import warn
-from locale import localeconv
-# If the user has fastnumbers installed, they will get great speed
-# benefits. If not, we simulate the functions here.
-try:
- from fastnumbers import fast_float, fast_int, isreal
-except ImportError:
- from .fake_fastnumbers import fast_float, fast_int, isreal
-from .locale_help import locale_convert, grouper
-from .py23compat import u_format, py23_str, py23_zip
+from .utils import _natsort_key, _args_to_enum
+from .ns_enum import ns
+from .py23compat import u_format
# Make sure the doctest works for either python2 or python3
__doc__ = u_format(__doc__)
-class ns(object):
- """
- Enum to control the `natsort` algorithm.
-
- This class acts like an enum to control the `natsort` algorithm. The
- user may select several options simultaneously by or'ing the options
- together. For example, to choose ``ns.INT``, `ns.PATH``, and
- ``ns.LOCALE``, you could do ``ns.INT | ns.LOCALE | ns.PATH``.
-
- Each option has a shortened 1- or 2-letter form.
-
- .. warning:: On some systems, the underlying C library that
- Python's locale module uses is broken. On these
- systems it is recommended that you install
- `PyICU <https://pypi.python.org/pypi/PyICU>`_
- if you wish to use `LOCALE`.
- Please validate that `LOCALE` works as
- expected on your target system, and if not you
- should add
- `PyICU <https://pypi.python.org/pypi/PyICU>`_
- as a dependency.
-
- Attributes
- ----------
- FLOAT, F
- The default - parse numbers as floats.
- INT, I
- Tell `natsort` to parse numbers as ints.
- UNSIGNED, U
- Tell `natsort` to ignore any sign (i.e. "-" or "+") to the
- immediate left of a number. It is the same as setting the old
- `signed` option to `False`.
- VERSION, V
- This is a shortcut for ``ns.INT | ns.UNSIGNED``, which is useful
- when attempting to sort version numbers. It is the same as
- setting the old `number_type` option to `None`.
- DIGIT, D
- Same as `VERSION` above.
- NOEXP, N
- Tell `natsort` to not search for exponents as part of the number.
- For example, with `NOEXP` the number "5.6E5" would be interpreted
- as `5.6`, `"E"`, and `5`. It is the same as setting the old `exp`
- option to `False`.
- PATH, P
- Tell `natsort` to interpret strings as filesystem paths, so they
- will be split according to the filesystem separator
- (i.e. ‘/’ on UNIX, ‘\’ on Windows), as well as splitting on the
- file extension, if any. Without this, lists of file paths like
- ``['Folder/', 'Folder (1)/', 'Folder (10)/']`` will not be sorted
- properly; 'Folder/' will be placed at the end, not at the front.
- It is the same as setting the old `as_path` option to `True`.
- LOCALE, L
- Tell `natsort` to be locale-aware when sorting strings (everything
- that was not converted to a number). Your sorting results will vary
- depending on your current locale. Generally, the `GROUPLETTERS`
- option is needed with `LOCALE` because the `locale` library
- groups the letters in the same manner (although you may still
- need `GROUPLETTERS` if there are numbers in your strings).
- IGNORECASE, IC
- Tell `natsort` to ignore case when sorting. For example,
- ``['Banana', 'apple', 'banana', 'Apple']`` would be sorted as
- ``['apple', 'Apple', 'Banana', 'banana']``.
- LOWERCASEFIRST, LF
- Tell `natsort` to put lowercase letters before uppercase letters
- when sorting. For example,
- ``['Banana', 'apple', 'banana', 'Apple']`` would be sorted as
- ``['apple', 'banana', 'Apple', 'Banana']`` (the default order
- would be ``['Apple', 'Banana', 'apple', 'banana']`` which is
- the order from a purely ordinal sort).
- Useless when used with `IGNORECASE`.
- GROUPLETTERS, G
- Tell `natsort` to group lowercase and uppercase letters together
- when sorting. For example,
- ``['Banana', 'apple', 'banana', 'Apple']`` would be sorted as
- ``['Apple', 'apple', 'Banana', 'banana']``.
- Useless when used with `IGNORECASE`; use with `LOWERCASEFIRST`
- to reverse the order of upper and lower case.
- TYPESAFE, T
- Try hard to avoid "unorderable types" error on Python 3. It
- is the same as setting the old `py3_safe` option to `True`.
-
- Notes
- -----
- If using `LOCALE`, you may find that if you do not explicitly set
- the locale your results may not be as you expect... I have found that
- it depends on the system you are on. To do this is straightforward
- (in the below example I use 'en_US.UTF-8', but you should use your
- locale)::
-
- >>> import locale
- >>> # The 'str' call is only to get around a bug on Python 2.x
- >>> # where 'setlocale' does not expect unicode strings (ironic,
- >>> # right?)
- >>> locale.setlocale(locale.LC_ALL, str('en_US.UTF-8'))
- 'en_US.UTF-8'
-
- It is preferred that you do this before importing `natsort`.
- If you use `PyICU <https://pypi.python.org/pypi/PyICU>`_ (see warning
- above) then you should not need to do this.
-
- """
- pass
-
-
-# Sort algorithm "enum" values.
-_nsdict = {'FLOAT': 0, 'F': 0,
- 'INT': 1, 'I': 1,
- 'UNSIGNED': 2, 'U': 2,
- 'VERSION': 3, 'V': 3, # Shortcut for INT | UNSIGNED
- 'DIGIT': 3, 'D': 3, # Shortcut for INT | UNSIGNED
- 'NOEXP': 4, 'N': 4,
- 'PATH': 8, 'P': 8,
- 'LOCALE': 16, 'L': 16,
- 'IGNORECASE': 32, 'IC': 32,
- 'LOWERCASEFIRST': 64, 'LF': 64,
- 'GROUPLETTERS': 128, 'G': 128,
- 'TYPESAFE': 1024, 'T': 1024,
- }
-# Populate the ns class with the _nsdict values.
-for x, y in _nsdict.items():
- setattr(ns, x, y)
-
-# Group algorithm types for easy extraction
-_NUMBER_ALGORITHMS = ns.FLOAT | ns.INT | ns.UNSIGNED | ns.NOEXP
-_CASE_ALGORITHMS = ns.IGNORECASE | ns.LOWERCASEFIRST | ns.GROUPLETTERS
-_ALL_BUT_PATH = (ns.F | ns.I | ns.U | ns.N | ns.L |
- ns.IC | ns.LF | ns.G | ns.TYPESAFE)
-
-# The regex that locates floats
-_float_sign_exp_re = re.compile(r'([-+]?\d*\.?\d+(?:[eE][-+]?\d+)?)', re.U)
-_float_nosign_exp_re = re.compile(r'(\d*\.?\d+(?:[eE][-+]?\d+)?)', re.U)
-_float_sign_noexp_re = re.compile(r'([-+]?\d*\.?\d+)', re.U)
-_float_nosign_noexp_re = re.compile(r'(\d*\.?\d+)', re.U)
-_float_sign_exp_re_c = re.compile(r'([-+]?\d*[.,]?\d+(?:[eE][-+]?\d+)?)', re.U)
-_float_nosign_exp_re_c = re.compile(r'(\d*[.,]?\d+(?:[eE][-+]?\d+)?)', re.U)
-_float_sign_noexp_re_c = re.compile(r'([-+]?\d*[.,]?\d+)', re.U)
-_float_nosign_noexp_re_c = re.compile(r'(\d*[.,]?\d+)', re.U)
-
-# Integer regexes
-_int_nosign_re = re.compile(r'(\d+)', re.U)
-_int_sign_re = re.compile(r'([-+]?\d+)', re.U)
-
-# This dict will help select the correct regex and number conversion function.
-_regex_and_num_function_chooser = {
- (ns.F, '.'): (_float_sign_exp_re, fast_float),
- (ns.F | ns.N, '.'): (_float_sign_noexp_re, fast_float),
- (ns.F | ns.U, '.'): (_float_nosign_exp_re, fast_float),
- (ns.F | ns.U | ns.N, '.'): (_float_nosign_noexp_re, fast_float),
- (ns.I, '.'): (_int_sign_re, fast_int),
- (ns.I | ns.N, '.'): (_int_sign_re, fast_int),
- (ns.I | ns.U, '.'): (_int_nosign_re, fast_int),
- (ns.I | ns.U | ns.N, '.'): (_int_nosign_re, fast_int),
- (ns.F, ','): (_float_sign_exp_re_c, fast_float),
- (ns.F | ns.N, ','): (_float_sign_noexp_re_c, fast_float),
- (ns.F | ns.U, ','): (_float_nosign_exp_re_c, fast_float),
- (ns.F | ns.U | ns.N, ','): (_float_nosign_noexp_re_c, fast_float),
- (ns.I, ','): (_int_sign_re, fast_int),
- (ns.I | ns.N, ','): (_int_sign_re, fast_int),
- (ns.I | ns.U, ','): (_int_nosign_re, fast_int),
- (ns.I | ns.U | ns.N, ','): (_int_nosign_re, fast_int),
-}
-
-
-def _args_to_enum(number_type, signed, exp, as_path, py3_safe):
- """A function to convert input booleans to an enum-type argument."""
- alg = 0
- if number_type is not float:
- msg = "The 'number_type' argument is depreciated as of 3.5.0, "
- msg += "please use 'alg=ns.FLOAT', 'alg=ns.INT', or 'alg=ns.VERSION'"
- warn(msg, DeprecationWarning)
- alg |= (_nsdict['INT'] * bool(number_type in (int, None)))
- alg |= (_nsdict['UNSIGNED'] * (number_type is None))
- if signed is not None:
- msg = "The 'signed' argument is depreciated as of 3.5.0, "
- msg += "please use 'alg=ns.UNSIGNED'."
- warn(msg, DeprecationWarning)
- alg |= (_nsdict['UNSIGNED'] * (not signed))
- if exp is not None:
- msg = "The 'exp' argument is depreciated as of 3.5.0, "
- msg += "please use 'alg=ns.NOEXP'."
- warn(msg, DeprecationWarning)
- alg |= (_nsdict['NOEXP'] * (not exp))
- if as_path is not None:
- msg = "The 'as_path' argument is depreciated as of 3.5.0, "
- msg += "please use 'alg=ns.PATH'."
- warn(msg, DeprecationWarning)
- alg |= (_nsdict['PATH'] * as_path)
- if py3_safe is not None:
- msg = "The 'py3_safe' argument is depreciated as of 3.5.0, "
- msg += "please use 'alg=ns.TYPESAFE'."
- warn(msg, DeprecationWarning)
- alg |= (_nsdict['TYPESAFE'] * py3_safe)
- return alg
-
-
-def _input_parser(s, regex, numconv, py3_safe, use_locale, group_letters):
- """Helper to parse the string input into numbers and strings."""
-
- # Split the input string by numbers.
- # If the input is not a string, TypeError is raised.
- s = regex.split(s)
-
- # Now convert the numbers to numbers, and leave strings as strings.
- # Take into account locale if needed, and group letters if needed.
- # Remove empty strings from the list.
- if use_locale:
- s = [locale_convert(x, numconv, group_letters) for x in s if x]
- elif group_letters:
- s = [grouper(x, numconv) for x in s if x]
- else:
- s = [numconv(x) for x in s if x]
-
- # If the list begins with a number, lead with an empty string.
- # This is used to get around the "unorderable types" issue.
- if not s: # Return empty tuple for empty results.
- return ()
- elif isreal(s[0]):
- s = [''] + s
-
- # The _py3_safe function inserts "" between numbers in the list,
- # and is used to get around "unorderable types" in complex cases.
- # It is a separate function that needs to be requested specifically
- # because it is expensive to call.
- return _py3_safe(s) if py3_safe else s
-
-
-def _path_splitter(s, _d_match=re.compile(r'\.\d').match):
- """Split a string into its path components. Assumes a string is a path."""
- path_parts = []
- p_append = path_parts.append
- path_location = s
-
- # Continue splitting the path from the back until we have reached
- # '..' or '.', or until there is nothing left to split.
- while path_location != curdir and path_location != pardir:
- parent_path = path_location
- path_location, child_path = split(parent_path)
- if path_location == parent_path:
- break
- p_append(child_path)
-
- # This last append is the base path.
- # Only append if the string is non-empty.
- if path_location:
- p_append(path_location)
-
- # We created this list in reversed order, so we now correct the order.
- path_parts.reverse()
-
- # Now, split off the file extensions using a similar method to above.
- # Continue splitting off file extensions until we reach a decimal number
- # or there are no more extensions.
- base = path_parts.pop()
- base_parts = []
- b_append = base_parts.append
- while True:
- front = base
- base, ext = splitext(front)
- if _d_match(ext) or not ext:
- # Reset base to before the split if the split is invalid.
- base = front
- break
- b_append(ext)
- b_append(base)
- base_parts.reverse()
-
- # Return the split parent paths and then the split basename.
- return path_parts + base_parts
-
-
-def _py3_safe(parsed_list):
- """Insert '' between two numbers."""
- length = len(parsed_list)
- if length < 2:
- return parsed_list
- else:
- new_list = [parsed_list[0]]
- nl_append = new_list.append
- for before, after in py23_zip(islice(parsed_list, 0, length-1),
- islice(parsed_list, 1, None)):
- if isreal(before) and isreal(after):
- nl_append("")
- nl_append(after)
- return new_list
-
-
-def _natsort_key(val, key, alg):
- """\
- Key to sort strings and numbers naturally.
-
- It works by separating out the numbers from the strings. This function for
- internal use only. See the natsort_keygen documentation for details of each
- parameter.
-
- Parameters
- ----------
- val : {str, unicode}
- key : callable
- alg : ns enum
-
- Returns
- -------
- out : tuple
- The modified value with numbers extracted.
-
- """
-
- # Convert the arguments to the proper input tuple
- try:
- use_locale = alg & _nsdict['LOCALE']
- inp_options = (alg & _NUMBER_ALGORITHMS,
- localeconv()['decimal_point'] if use_locale else '.')
- except TypeError:
- msg = "_natsort_key: 'alg' argument must be from the enum 'ns'"
- raise ValueError(msg+', got {0}'.format(py23_str(alg)))
-
- # Get the proper regex and conversion function.
- try:
- regex, num_function = _regex_and_num_function_chooser[inp_options]
- except KeyError: # pragma: no cover
- if inp_options[1] not in ('.', ','): # pragma: no cover
- raise ValueError("_natsort_key: currently natsort only supports "
- "the decimal separators '.' and ','. "
- "Please file a bug report.")
- else:
- raise
- else:
- # Apply key if needed.
- if key is not None:
- val = key(val)
-
- # If this is a path, convert it.
- # An AttrubuteError is raised if not a string.
- split_as_path = False
- if alg & _nsdict['PATH']:
- try:
- val = _path_splitter(val)
- except AttributeError:
- pass
- else:
- # Record that this string was split as a path so that
- # we don't set PATH in the recursive call.
- split_as_path = True
-
- # Assume the input are strings, which is the most common case.
- # Apply the string modification if needed.
- try:
- if alg & _nsdict['LOWERCASEFIRST']:
- val = val.swapcase()
- if alg & _nsdict['IGNORECASE']:
- val = val.lower()
- return tuple(_input_parser(val,
- regex,
- num_function,
- alg & _nsdict['TYPESAFE'],
- use_locale,
- alg & _nsdict['GROUPLETTERS']))
- except (TypeError, AttributeError):
- # If not strings, assume it is an iterable that must
- # be parsed recursively. Do not apply the key recursively.
- # If this string was split as a path, turn off 'PATH'.
- try:
- was_path = alg & _nsdict['PATH']
- newalg = alg & _ALL_BUT_PATH
- newalg |= (was_path * (not split_as_path))
- return tuple([_natsort_key(x, None, newalg) for x in val])
- # If there is still an error, it must be a number.
- # Return as-is, with a leading empty string.
- except TypeError:
- return (('', val,),) if alg & _nsdict['PATH'] else ('', val,)
-
-
@u_format
def natsort_key(val, key=None, number_type=float, signed=None, exp=None,
as_path=None, py3_safe=None, alg=0):
@@ -712,7 +330,7 @@ def natsorted(seq, key=None, number_type=float, signed=None, exp=None,
if 'unorderable types' in str(e):
return sorted(seq, reverse=reverse,
key=natsort_keygen(key,
- alg=alg | _nsdict['TYPESAFE']))
+ alg=alg | ns.TYPESAFE))
else:
# Re-raise if the problem was not "unorderable types"
raise
diff --git a/natsort/ns_enum.py b/natsort/ns_enum.py
new file mode 100644
index 0000000..d83ef16
--- /dev/null
+++ b/natsort/ns_enum.py
@@ -0,0 +1,128 @@
+# -*- coding: utf-8 -*-
+"""This module defines the "ns" enum for natsort."""
+
+from __future__ import (print_function, division,
+ unicode_literals, absolute_import)
+
+
+class ns(object):
+ """
+ Enum to control the `natsort` algorithm.
+
+ This class acts like an enum to control the `natsort` algorithm. The
+ user may select several options simultaneously by or'ing the options
+ together. For example, to choose ``ns.INT``, ``ns.PATH``, and
+ ``ns.LOCALE``, you could do ``ns.INT | ns.LOCALE | ns.PATH``.
+
+ Each option has a shortened 1- or 2-letter form.
+
+ .. warning:: On some systems, the underlying C library that
+ Python's locale module uses is broken. On these
+ systems it is recommended that you install
+ `PyICU <https://pypi.python.org/pypi/PyICU>`_
+ if you wish to use `LOCALE`.
+ Please validate that `LOCALE` works as
+ expected on your target system, and if not you
+ should add
+ `PyICU <https://pypi.python.org/pypi/PyICU>`_
+ as a dependency.
+
+ Attributes
+ ----------
+ FLOAT, F
+ The default - parse numbers as floats.
+ INT, I
+ Tell `natsort` to parse numbers as ints.
+ UNSIGNED, U
+ Tell `natsort` to ignore any sign (i.e. "-" or "+") to the
+ immediate left of a number. It is the same as setting the old
+ `signed` option to `False`.
+ VERSION, V
+ This is a shortcut for ``ns.INT | ns.UNSIGNED``, which is useful
+ when attempting to sort version numbers. It is the same as
+ setting the old `number_type` option to `None`.
+ DIGIT, D
+ Same as `VERSION` above.
+ NOEXP, N
+ Tell `natsort` to not search for exponents as part of the number.
+ For example, with `NOEXP` the number "5.6E5" would be interpreted
+ as `5.6`, `"E"`, and `5`. It is the same as setting the old `exp`
+ option to `False`.
+ PATH, P
+ Tell `natsort` to interpret strings as filesystem paths, so they
+ will be split according to the filesystem separator
+ (i.e. ‘/’ on UNIX, ‘\’ on Windows), as well as splitting on the
+ file extension, if any. Without this, lists of file paths like
+ ``['Folder/', 'Folder (1)/', 'Folder (10)/']`` will not be sorted
+ properly; 'Folder/' will be placed at the end, not at the front.
+ It is the same as setting the old `as_path` option to `True`.
+ LOCALE, L
+ Tell `natsort` to be locale-aware when sorting strings (everything
+ that was not converted to a number). Your sorting results will vary
+ depending on your current locale. Generally, the `GROUPLETTERS`
+ option is needed with `LOCALE` because the `locale` library
+ groups the letters in the same manner (although you may still
+ need `GROUPLETTERS` if there are numbers in your strings).
+ IGNORECASE, IC
+ Tell `natsort` to ignore case when sorting. For example,
+ ``['Banana', 'apple', 'banana', 'Apple']`` would be sorted as
+ ``['apple', 'Apple', 'Banana', 'banana']``.
+ LOWERCASEFIRST, LF
+ Tell `natsort` to put lowercase letters before uppercase letters
+ when sorting. For example,
+ ``['Banana', 'apple', 'banana', 'Apple']`` would be sorted as
+ ``['apple', 'banana', 'Apple', 'Banana']`` (the default order
+ would be ``['Apple', 'Banana', 'apple', 'banana']`` which is
+ the order from a purely ordinal sort).
+ Useless when used with `IGNORECASE`.
+ GROUPLETTERS, G
+ Tell `natsort` to group lowercase and uppercase letters together
+ when sorting. For example,
+ ``['Banana', 'apple', 'banana', 'Apple']`` would be sorted as
+ ``['Apple', 'apple', 'Banana', 'banana']``.
+ Useless when used with `IGNORECASE`; use with `LOWERCASEFIRST`
+ to reverse the order of upper and lower case.
+ TYPESAFE, T
+ Try hard to avoid "unorderable types" error on Python 3. It
+ is the same as setting the old `py3_safe` option to `True`.
+
+ Notes
+ -----
+ If using `LOCALE`, you may find that if you do not explicitly set
+ the locale your results may not be as you expect... I have found that
+ it depends on the system you are on. To do this is straightforward
+ (in the below example I use 'en_US.UTF-8', but you should use your
+ locale)::
+
+ >>> import locale
+ >>> # The 'str' call is only to get around a bug on Python 2.x
+ >>> # where 'setlocale' does not expect unicode strings (ironic,
+ >>> # right?)
+ >>> locale.setlocale(locale.LC_ALL, str('en_US.UTF-8'))
+ 'en_US.UTF-8'
+
+ It is preferred that you do this before importing `natsort`.
+ If you use `PyICU <https://pypi.python.org/pypi/PyICU>`_ (see warning
+ above) then you should not need to do this.
+
+ """
+ pass
+
+
+# Sort algorithm "enum" values.
+_nsdict = {'FLOAT': 0, 'F': 0,
+ 'INT': 1, 'I': 1,
+ 'UNSIGNED': 2, 'U': 2,
+ 'VERSION': 3, 'V': 3, # Shortcut for INT | UNSIGNED
+ 'DIGIT': 3, 'D': 3, # Shortcut for INT | UNSIGNED
+ 'NOEXP': 4, 'N': 4,
+ 'PATH': 8, 'P': 8,
+ 'LOCALE': 16, 'L': 16,
+ 'IGNORECASE': 32, 'IC': 32,
+ 'LOWERCASEFIRST': 64, 'LF': 64,
+ 'GROUPLETTERS': 128, 'G': 128,
+ 'TYPESAFE': 1024, 'T': 1024,
+ }
+# Populate the ns class with the _nsdict values.
+for x, y in _nsdict.items():
+ setattr(ns, x, y)
diff --git a/natsort/utils.py b/natsort/utils.py
new file mode 100644
index 0000000..4308611
--- /dev/null
+++ b/natsort/utils.py
@@ -0,0 +1,277 @@
+# -*- coding: utf-8 -*-
+"""
+Utilities and definitions for natsort, mostly all used to define
+the _natsort_key function.
+
+"""
+
+from __future__ import (print_function, division,
+ unicode_literals, absolute_import)
+
+# Std. lib imports.
+import re
+from warnings import warn
+from os import curdir, pardir
+from os.path import split, splitext
+from itertools import islice
+from locale import localeconv
+
+# Local imports.
+from .locale_help import locale_convert, grouper
+from .py23compat import py23_str, py23_zip
+from .ns_enum import ns, _nsdict
+
+# If the user has fastnumbers installed, they will get great speed
+# benefits. If not, we simulate the functions here.
+try:
+ from fastnumbers import fast_float, fast_int, isreal
+except ImportError:
+ from .fake_fastnumbers import fast_float, fast_int, isreal
+
+# Group algorithm types for easy extraction
+_NUMBER_ALGORITHMS = ns.FLOAT | ns.INT | ns.UNSIGNED | ns.NOEXP
+_ALL_BUT_PATH = (ns.F | ns.I | ns.U | ns.N | ns.L |
+ ns.IC | ns.LF | ns.G | ns.TYPESAFE)
+
+# The regex that locates floats
+_float_sign_exp_re = re.compile(r'([-+]?\d*\.?\d+(?:[eE][-+]?\d+)?)', re.U)
+_float_nosign_exp_re = re.compile(r'(\d*\.?\d+(?:[eE][-+]?\d+)?)', re.U)
+_float_sign_noexp_re = re.compile(r'([-+]?\d*\.?\d+)', re.U)
+_float_nosign_noexp_re = re.compile(r'(\d*\.?\d+)', re.U)
+_float_sign_exp_re_c = re.compile(r'([-+]?\d*[.,]?\d+(?:[eE][-+]?\d+)?)', re.U)
+_float_nosign_exp_re_c = re.compile(r'(\d*[.,]?\d+(?:[eE][-+]?\d+)?)', re.U)
+_float_sign_noexp_re_c = re.compile(r'([-+]?\d*[.,]?\d+)', re.U)
+_float_nosign_noexp_re_c = re.compile(r'(\d*[.,]?\d+)', re.U)
+
+# Integer regexes
+_int_nosign_re = re.compile(r'(\d+)', re.U)
+_int_sign_re = re.compile(r'([-+]?\d+)', re.U)
+
+# This dict will help select the correct regex and number conversion function.
+_regex_and_num_function_chooser = {
+ (ns.F, '.'): (_float_sign_exp_re, fast_float),
+ (ns.F | ns.N, '.'): (_float_sign_noexp_re, fast_float),
+ (ns.F | ns.U, '.'): (_float_nosign_exp_re, fast_float),
+ (ns.F | ns.U | ns.N, '.'): (_float_nosign_noexp_re, fast_float),
+ (ns.I, '.'): (_int_sign_re, fast_int),
+ (ns.I | ns.N, '.'): (_int_sign_re, fast_int),
+ (ns.I | ns.U, '.'): (_int_nosign_re, fast_int),
+ (ns.I | ns.U | ns.N, '.'): (_int_nosign_re, fast_int),
+ (ns.F, ','): (_float_sign_exp_re_c, fast_float),
+ (ns.F | ns.N, ','): (_float_sign_noexp_re_c, fast_float),
+ (ns.F | ns.U, ','): (_float_nosign_exp_re_c, fast_float),
+ (ns.F | ns.U | ns.N, ','): (_float_nosign_noexp_re_c, fast_float),
+ (ns.I, ','): (_int_sign_re, fast_int),
+ (ns.I | ns.N, ','): (_int_sign_re, fast_int),
+ (ns.I | ns.U, ','): (_int_nosign_re, fast_int),
+ (ns.I | ns.U | ns.N, ','): (_int_nosign_re, fast_int),
+}
+
+
+def _args_to_enum(number_type, signed, exp, as_path, py3_safe):
+ """A function to convert input booleans to an enum-type argument."""
+ alg = 0
+ if number_type is not float:
+ msg = "The 'number_type' argument is depreciated as of 3.5.0, "
+ msg += "please use 'alg=ns.FLOAT', 'alg=ns.INT', or 'alg=ns.VERSION'"
+ warn(msg, DeprecationWarning)
+ alg |= (_nsdict['INT'] * bool(number_type in (int, None)))
+ alg |= (_nsdict['UNSIGNED'] * (number_type is None))
+ if signed is not None:
+ msg = "The 'signed' argument is depreciated as of 3.5.0, "
+ msg += "please use 'alg=ns.UNSIGNED'."
+ warn(msg, DeprecationWarning)
+ alg |= (_nsdict['UNSIGNED'] * (not signed))
+ if exp is not None:
+ msg = "The 'exp' argument is depreciated as of 3.5.0, "
+ msg += "please use 'alg=ns.NOEXP'."
+ warn(msg, DeprecationWarning)
+ alg |= (_nsdict['NOEXP'] * (not exp))
+ if as_path is not None:
+ msg = "The 'as_path' argument is depreciated as of 3.5.0, "
+ msg += "please use 'alg=ns.PATH'."
+ warn(msg, DeprecationWarning)
+ alg |= (_nsdict['PATH'] * as_path)
+ if py3_safe is not None:
+ msg = "The 'py3_safe' argument is depreciated as of 3.5.0, "
+ msg += "please use 'alg=ns.TYPESAFE'."
+ warn(msg, DeprecationWarning)
+ alg |= (_nsdict['TYPESAFE'] * py3_safe)
+ return alg
+
+
+def _input_parser(s, regex, numconv, py3_safe, use_locale, group_letters):
+ """Helper to parse the string input into numbers and strings."""
+
+ # Split the input string by numbers.
+ # If the input is not a string, TypeError is raised.
+ s = regex.split(s)
+
+ # Now convert the numbers to numbers, and leave strings as strings.
+ # Take into account locale if needed, and group letters if needed.
+ # Remove empty strings from the list.
+ if use_locale:
+ s = [locale_convert(x, numconv, group_letters) for x in s if x]
+ elif group_letters:
+ s = [grouper(x, numconv) for x in s if x]
+ else:
+ s = [numconv(x) for x in s if x]
+
+ # If the list begins with a number, lead with an empty string.
+ # This is used to get around the "unorderable types" issue.
+ if not s: # Return empty tuple for empty results.
+ return ()
+ elif isreal(s[0]):
+ s = [''] + s
+
+ # The _py3_safe function inserts "" between numbers in the list,
+ # and is used to get around "unorderable types" in complex cases.
+ # It is a separate function that needs to be requested specifically
+ # because it is expensive to call.
+ return _py3_safe(s) if py3_safe else s
+
+
+def _path_splitter(s, _d_match=re.compile(r'\.\d').match):
+ """Split a string into its path components. Assumes a string is a path."""
+ path_parts = []
+ p_append = path_parts.append
+ path_location = s
+
+ # Continue splitting the path from the back until we have reached
+ # '..' or '.', or until there is nothing left to split.
+ while path_location != curdir and path_location != pardir:
+ parent_path = path_location
+ path_location, child_path = split(parent_path)
+ if path_location == parent_path:
+ break
+ p_append(child_path)
+
+ # This last append is the base path.
+ # Only append if the string is non-empty.
+ if path_location:
+ p_append(path_location)
+
+ # We created this list in reversed order, so we now correct the order.
+ path_parts.reverse()
+
+ # Now, split off the file extensions using a similar method to above.
+ # Continue splitting off file extensions until we reach a decimal number
+ # or there are no more extensions.
+ base = path_parts.pop()
+ base_parts = []
+ b_append = base_parts.append
+ while True:
+ front = base
+ base, ext = splitext(front)
+ if _d_match(ext) or not ext:
+ # Reset base to before the split if the split is invalid.
+ base = front
+ break
+ b_append(ext)
+ b_append(base)
+ base_parts.reverse()
+
+ # Return the split parent paths and then the split basename.
+ return path_parts + base_parts
+
+
+def _py3_safe(parsed_list):
+ """Insert '' between two numbers."""
+ length = len(parsed_list)
+ if length < 2:
+ return parsed_list
+ else:
+ new_list = [parsed_list[0]]
+ nl_append = new_list.append
+ for before, after in py23_zip(islice(parsed_list, 0, length-1),
+ islice(parsed_list, 1, None)):
+ if isreal(before) and isreal(after):
+ nl_append("")
+ nl_append(after)
+ return new_list
+
+
+def _natsort_key(val, key, alg):
+ """\
+ Key to sort strings and numbers naturally.
+
+ It works by separating out the numbers from the strings. This function for
+ internal use only. See the natsort_keygen documentation for details of each
+ parameter.
+
+ Parameters
+ ----------
+ val : {str, unicode}
+ key : callable
+ alg : ns enum
+
+ Returns
+ -------
+ out : tuple
+ The modified value with numbers extracted.
+
+ """
+
+ # Convert the arguments to the proper input tuple
+ try:
+ use_locale = alg & _nsdict['LOCALE']
+ inp_options = (alg & _NUMBER_ALGORITHMS,
+ localeconv()['decimal_point'] if use_locale else '.')
+ except TypeError:
+ msg = "_natsort_key: 'alg' argument must be from the enum 'ns'"
+ raise ValueError(msg+', got {0}'.format(py23_str(alg)))
+
+ # Get the proper regex and conversion function.
+ try:
+ regex, num_function = _regex_and_num_function_chooser[inp_options]
+ except KeyError: # pragma: no cover
+ if inp_options[1] not in ('.', ','): # pragma: no cover
+ raise ValueError("_natsort_key: currently natsort only supports "
+ "the decimal separators '.' and ','. "
+ "Please file a bug report.")
+ else:
+ raise
+ else:
+ # Apply key if needed.
+ if key is not None:
+ val = key(val)
+
+ # If this is a path, convert it.
+ # An AttributeError is raised if not a string.
+ split_as_path = False
+ if alg & _nsdict['PATH']:
+ try:
+ val = _path_splitter(val)
+ except AttributeError:
+ pass
+ else:
+ # Record that this string was split as a path so that
+ # we don't set PATH in the recursive call.
+ split_as_path = True
+
+ # Assume the input are strings, which is the most common case.
+ # Apply the string modification if needed.
+ try:
+ if alg & _nsdict['LOWERCASEFIRST']:
+ val = val.swapcase()
+ if alg & _nsdict['IGNORECASE']:
+ val = val.lower()
+ return tuple(_input_parser(val,
+ regex,
+ num_function,
+ alg & _nsdict['TYPESAFE'],
+ use_locale,
+ alg & _nsdict['GROUPLETTERS']))
+ except (TypeError, AttributeError):
+ # If not strings, assume it is an iterable that must
+ # be parsed recursively. Do not apply the key recursively.
+ # If this string was split as a path, turn off 'PATH'.
+ try:
+ was_path = alg & _nsdict['PATH']
+ newalg = alg & _ALL_BUT_PATH
+ newalg |= (was_path * (not split_as_path))
+ return tuple([_natsort_key(x, None, newalg) for x in val])
+ # If there is still an error, it must be a number.
+ # Return as-is, with a leading empty string.
+ except TypeError:
+ return (('', val,),) if alg & _nsdict['PATH'] else ('', val,)
diff --git a/test_natsort/test_natsort.py b/test_natsort/test_natsort.py
index 1f41fb6..d1e1f52 100644
--- a/test_natsort/test_natsort.py
+++ b/test_natsort/test_natsort.py
@@ -9,11 +9,10 @@ import locale
from operator import itemgetter
from pytest import raises
from natsort import natsorted, index_natsorted, natsort_key, versorted, index_versorted
-from natsort import humansorted, index_humansorted, natsort_keygen, order_by_index
-from natsort.natsort import _input_parser, _py3_safe, _natsort_key, _args_to_enum
-from natsort.natsort import _float_sign_exp_re, _float_nosign_exp_re, _float_sign_noexp_re
-from natsort.natsort import _float_nosign_noexp_re, _int_nosign_re, _int_sign_re
-from natsort.natsort import ns
+from natsort import humansorted, index_humansorted, natsort_keygen, order_by_index, ns
+from natsort.utils import _input_parser, _py3_safe, _natsort_key, _args_to_enum
+from natsort.utils import _float_sign_exp_re, _float_nosign_exp_re, _float_sign_noexp_re
+from natsort.utils import _float_nosign_noexp_re, _int_nosign_re, _int_sign_re
from natsort.locale_help import use_pyicu
try: