summaryrefslogtreecommitdiff
path: root/natsort/utils.py
diff options
context:
space:
mode:
Diffstat (limited to 'natsort/utils.py')
-rw-r--r--natsort/utils.py277
1 files changed, 277 insertions, 0 deletions
diff --git a/natsort/utils.py b/natsort/utils.py
new file mode 100644
index 0000000..add749f
--- /dev/null
+++ b/natsort/utils.py
@@ -0,0 +1,277 @@
+# -*- coding: utf-8 -*-
+"""
+Utilities and definitions for natsort, mostly all used to define
+the _natsort_key function.
+
+"""
+
+from __future__ import (print_function, division,
+ unicode_literals, absolute_import)
+
+# Std. lib imports.
+import re
+from warnings import warn
+from os import curdir, pardir
+from os.path import split, splitext
+from itertools import islice
+from locale import localeconv
+
+# Local imports.
+from natsort.locale_help import locale_convert, grouper
+from natsort.py23compat import py23_str, py23_zip
+from natsort.ns_enum import ns, _nsdict
+
+# If the user has fastnumbers installed, they will get great speed
+# benefits. If not, we simulate the functions here.
+try:
+ from fastnumbers import fast_float, fast_int, isreal
+except ImportError:
+ from natsort.fake_fastnumbers import fast_float, fast_int, isreal
+
# Masks used with '&' to pull groups of related options out of an 'alg' value.
_NUMBER_ALGORITHMS = ns.FLOAT | ns.INT | ns.UNSIGNED | ns.NOEXP
_ALL_BUT_PATH = (
    ns.F | ns.I | ns.U | ns.N | ns.L | ns.IC | ns.LF | ns.G | ns.TYPESAFE
)

# Regular expressions that locate floats when '.' is the decimal separator.
# Naming scheme: "sign"/"nosign" tells whether a leading '+' or '-' is
# captured as part of the number, and "exp"/"noexp" tells whether a
# trailing exponent is captured as part of the number.
_float_sign_exp_re = re.compile(
    r'([-+]?\d*\.?\d+(?:[eE][-+]?\d+)?)', flags=re.UNICODE)
_float_nosign_exp_re = re.compile(
    r'(\d*\.?\d+(?:[eE][-+]?\d+)?)', flags=re.UNICODE)
_float_sign_noexp_re = re.compile(
    r'([-+]?\d*\.?\d+)', flags=re.UNICODE)
_float_nosign_noexp_re = re.compile(
    r'(\d*\.?\d+)', flags=re.UNICODE)

# The same four float regular expressions, but accepting either '.' or ','
# as the decimal separator.
_float_sign_exp_re_c = re.compile(
    r'([-+]?\d*[.,]?\d+(?:[eE][-+]?\d+)?)', flags=re.UNICODE)
_float_nosign_exp_re_c = re.compile(
    r'(\d*[.,]?\d+(?:[eE][-+]?\d+)?)', flags=re.UNICODE)
_float_sign_noexp_re_c = re.compile(
    r'([-+]?\d*[.,]?\d+)', flags=re.UNICODE)
_float_nosign_noexp_re_c = re.compile(
    r'(\d*[.,]?\d+)', flags=re.UNICODE)

# Regular expressions that locate integers, without and with a sign.
_int_nosign_re = re.compile(r'(\d+)', flags=re.UNICODE)
_int_sign_re = re.compile(r'([-+]?\d+)', flags=re.UNICODE)

# Lookup table keyed on (number options, decimal separator) that yields the
# (regex, conversion function) pair to use for that combination.
# The integer entries ignore both NOEXP and the decimal separator.
_regex_and_num_function_chooser = {
    # Decimal separator is '.'.
    (ns.F, '.'):
        (_float_sign_exp_re, fast_float),
    (ns.F | ns.N, '.'):
        (_float_sign_noexp_re, fast_float),
    (ns.F | ns.U, '.'):
        (_float_nosign_exp_re, fast_float),
    (ns.F | ns.U | ns.N, '.'):
        (_float_nosign_noexp_re, fast_float),
    (ns.I, '.'):
        (_int_sign_re, fast_int),
    (ns.I | ns.N, '.'):
        (_int_sign_re, fast_int),
    (ns.I | ns.U, '.'):
        (_int_nosign_re, fast_int),
    (ns.I | ns.U | ns.N, '.'):
        (_int_nosign_re, fast_int),
    # Decimal separator is ','.
    (ns.F, ','):
        (_float_sign_exp_re_c, fast_float),
    (ns.F | ns.N, ','):
        (_float_sign_noexp_re_c, fast_float),
    (ns.F | ns.U, ','):
        (_float_nosign_exp_re_c, fast_float),
    (ns.F | ns.U | ns.N, ','):
        (_float_nosign_noexp_re_c, fast_float),
    (ns.I, ','):
        (_int_sign_re, fast_int),
    (ns.I | ns.N, ','):
        (_int_sign_re, fast_int),
    (ns.I | ns.U, ','):
        (_int_nosign_re, fast_int),
    (ns.I | ns.U | ns.N, ','):
        (_int_nosign_re, fast_int),
}
+
+
def _args_to_enum(number_type, signed, exp, as_path, py3_safe):
    """Convert the deprecated keyword options into an enum-type argument.

    Every argument that differs from its "not given" value (``float`` for
    `number_type`, ``None`` for all others) emits a DeprecationWarning and
    may switch on the matching flag from ``_nsdict``.

    Parameters
    ----------
    number_type : {float, int, None}
        ``int`` selects INT; ``None`` selects both INT and UNSIGNED.
    signed : {None, bool}
        A false value selects UNSIGNED.
    exp : {None, bool}
        A false value selects NOEXP.
    as_path : {None, bool}
        A true value selects PATH.
    py3_safe : {None, bool}
        A true value selects TYPESAFE.

    Returns
    -------
    alg : int
        The bitwise OR of every selected flag, or 0 if none was selected.

    """
    alg = 0

    if number_type is not float:
        warn("The 'number_type' argument is deprecated as of 3.5.0, "
             "please use 'alg=ns.FLOAT', 'alg=ns.INT', or 'alg=ns.VERSION'",
             DeprecationWarning)
        if number_type in (int, None):
            alg |= _nsdict['INT']
        if number_type is None:
            alg |= _nsdict['UNSIGNED']

    if signed is not None:
        warn("The 'signed' argument is deprecated as of 3.5.0, "
             "please use 'alg=ns.UNSIGNED'.",
             DeprecationWarning)
        if not signed:
            alg |= _nsdict['UNSIGNED']

    if exp is not None:
        warn("The 'exp' argument is deprecated as of 3.5.0, "
             "please use 'alg=ns.NOEXP'.",
             DeprecationWarning)
        if not exp:
            alg |= _nsdict['NOEXP']

    # For the two options below, test truthiness instead of multiplying the
    # flag by the argument: a truthy value that is not exactly True (such as
    # 2 or a non-empty string) must still set exactly this one flag.
    if as_path is not None:
        warn("The 'as_path' argument is deprecated as of 3.5.0, "
             "please use 'alg=ns.PATH'.",
             DeprecationWarning)
        if as_path:
            alg |= _nsdict['PATH']

    if py3_safe is not None:
        warn("The 'py3_safe' argument is deprecated as of 3.5.0, "
             "please use 'alg=ns.TYPESAFE'.",
             DeprecationWarning)
        if py3_safe:
            alg |= _nsdict['TYPESAFE']

    return alg
+
+
def _input_parser(s, regex, numconv, py3_safe, use_locale, group_letters):
    """Break the input string into its string and number components.

    The string is split with `regex`, empty pieces are discarded, and each
    remaining piece is converted with `numconv` (going through the locale
    or letter-grouping helpers when requested). An empty tuple is returned
    if nothing remains; otherwise a list is returned.

    """
    # A TypeError is raised here if the input is not a string.
    pieces = [piece for piece in regex.split(s) if piece]

    # Turn the numeric pieces into numbers and leave the text as text,
    # honoring the locale and letter-grouping options.
    if use_locale:
        parsed = [locale_convert(piece, numconv, group_letters)
                  for piece in pieces]
    elif group_letters:
        parsed = [grouper(piece, numconv) for piece in pieces]
    else:
        parsed = [numconv(piece) for piece in pieces]

    # Nothing left to sort on.
    if not parsed:
        return ()

    # A result that starts with a number gets a leading empty string.
    # This avoids the "unorderable types" issue.
    if isreal(parsed[0]):
        parsed = [''] + parsed

    # Separating adjacent numbers with "" is expensive, so it is only done
    # when explicitly requested.
    if py3_safe:
        return _py3_safe(parsed)
    return parsed
+
+
+def _path_splitter(s, _d_match=re.compile(r'\.\d').match):
+ """Split a string into its path components. Assumes a string is a path."""
+ path_parts = []
+ p_append = path_parts.append
+ path_location = s
+
+ # Continue splitting the path from the back until we have reached
+ # '..' or '.', or until there is nothing left to split.
+ while path_location != curdir and path_location != pardir:
+ parent_path = path_location
+ path_location, child_path = split(parent_path)
+ if path_location == parent_path:
+ break
+ p_append(child_path)
+
+ # This last append is the base path.
+ # Only append if the string is non-empty.
+ if path_location:
+ p_append(path_location)
+
+ # We created this list in reversed order, so we now correct the order.
+ path_parts.reverse()
+
+ # Now, split off the file extensions using a similar method to above.
+ # Continue splitting off file extensions until we reach a decimal number
+ # or there are no more extensions.
+ base = path_parts.pop()
+ base_parts = []
+ b_append = base_parts.append
+ while True:
+ front = base
+ base, ext = splitext(front)
+ if _d_match(ext) or not ext:
+ # Reset base to before the split if the split is invalid.
+ base = front
+ break
+ b_append(ext)
+ b_append(base)
+ base_parts.reverse()
+
+ # Return the split parent paths and then the split basename.
+ return path_parts + base_parts
+
+
def _py3_safe(parsed_list):
    """Return the input with '' placed between each pair of adjacent numbers.

    Inputs with fewer than two elements are returned unchanged (the very
    same object); otherwise a new list is built.

    """
    if len(parsed_list) < 2:
        return parsed_list

    padded = []
    previous_is_number = False
    for element in parsed_list:
        current_is_number = isreal(element)
        # Two numbers in a row get an empty string between them.
        if previous_is_number and current_is_number:
            padded.append("")
        padded.append(element)
        previous_is_number = current_is_number
    return padded
+
+
def _natsort_key(val, key, alg):
    """\
    Key to sort strings and numbers naturally.

    It works by separating out the numbers from the strings. This function
    is for internal use only. See the natsort_keygen documentation for
    details of each parameter.

    Parameters
    ----------
    val : {str, unicode}
        The value to convert into a sort key. A value that is not a string
        is treated as an iterable and parsed element by element; if it is
        not iterable either, it is returned with a leading empty string.
    key : callable
        If not None, it is applied to `val` before any other processing.
        It is not applied again to the elements of an iterable.
    alg : ns enum
        The combination of `ns` options to use.

    Returns
    -------
    out : tuple
        The modified value with numbers extracted.

    Raises
    ------
    ValueError
        If `alg` cannot be combined with the `ns` flags, or if the locale
        reports a decimal separator other than '.' or ','.

    """

    # Convert the arguments to the proper input tuple.
    try:
        use_locale = alg & _nsdict['LOCALE']
        inp_options = (alg & _NUMBER_ALGORITHMS,
                       localeconv()['decimal_point'] if use_locale else '.')
    except TypeError:
        msg = "_natsort_key: 'alg' argument must be from the enum 'ns'"
        raise ValueError(msg+', got {0}'.format(py23_str(alg)))

    # Get the proper regex and conversion function.
    try:
        regex, num_function = _regex_and_num_function_chooser[inp_options]
    except KeyError:  # pragma: no cover
        if inp_options[1] not in ('.', ','):  # pragma: no cover
            raise ValueError("_natsort_key: currently natsort only supports "
                             "the decimal separators '.' and ','. "
                             "Please file a bug report.")
        raise

    # Apply key if needed.
    if key is not None:
        val = key(val)

    # If this is a path, convert it.
    # Input that is not a string makes the path functions raise an
    # AttributeError or a TypeError, depending on the Python version;
    # either way the value is left as-is and handled below.
    split_as_path = False
    if alg & _nsdict['PATH']:
        try:
            val = _path_splitter(val)
        except (AttributeError, TypeError):
            pass
        else:
            # Record that this string was split as a path so that
            # we don't set PATH in the recursive call.
            split_as_path = True

    # Assume the input are strings, which is the most common case.
    # Apply the string modification if needed.
    try:
        if alg & _nsdict['LOWERCASEFIRST']:
            val = val.swapcase()
        if alg & _nsdict['IGNORECASE']:
            val = val.lower()
        return tuple(_input_parser(val,
                                   regex,
                                   num_function,
                                   alg & _nsdict['TYPESAFE'],
                                   use_locale,
                                   alg & _nsdict['GROUPLETTERS']))
    except (TypeError, AttributeError):
        # If not strings, assume it is an iterable that must
        # be parsed recursively. Do not apply the key recursively.
        # If this string was split as a path, turn off 'PATH'.
        try:
            was_path = alg & _nsdict['PATH']
            newalg = alg & _ALL_BUT_PATH
            newalg |= (was_path * (not split_as_path))
            return tuple([_natsort_key(x, None, newalg) for x in val])
        # If there is still an error, it must be a number.
        # Return as-is, with a leading empty string.
        except TypeError:
            return (('', val,),) if alg & _nsdict['PATH'] else ('', val,)