diff options
Diffstat (limited to 'natsort/utils.py')
-rw-r--r-- | natsort/utils.py | 277 |
1 files changed, 277 insertions, 0 deletions
diff --git a/natsort/utils.py b/natsort/utils.py new file mode 100644 index 0000000..add749f --- /dev/null +++ b/natsort/utils.py @@ -0,0 +1,277 @@ +# -*- coding: utf-8 -*- +""" +Utilities and definitions for natsort, mostly all used to define +the _natsort_key function. + +""" + +from __future__ import (print_function, division, + unicode_literals, absolute_import) + +# Std. lib imports. +import re +from warnings import warn +from os import curdir, pardir +from os.path import split, splitext +from itertools import islice +from locale import localeconv + +# Local imports. +from natsort.locale_help import locale_convert, grouper +from natsort.py23compat import py23_str, py23_zip +from natsort.ns_enum import ns, _nsdict + +# If the user has fastnumbers installed, they will get great speed +# benefits. If not, we simulate the functions here. +try: + from fastnumbers import fast_float, fast_int, isreal +except ImportError: + from natsort.fake_fastnumbers import fast_float, fast_int, isreal + +# Group algorithm types for easy extraction +_NUMBER_ALGORITHMS = ns.FLOAT | ns.INT | ns.UNSIGNED | ns.NOEXP +_ALL_BUT_PATH = (ns.F | ns.I | ns.U | ns.N | ns.L | + ns.IC | ns.LF | ns.G | ns.TYPESAFE) + +# The regex that locates floats +_float_sign_exp_re = re.compile(r'([-+]?\d*\.?\d+(?:[eE][-+]?\d+)?)', re.U) +_float_nosign_exp_re = re.compile(r'(\d*\.?\d+(?:[eE][-+]?\d+)?)', re.U) +_float_sign_noexp_re = re.compile(r'([-+]?\d*\.?\d+)', re.U) +_float_nosign_noexp_re = re.compile(r'(\d*\.?\d+)', re.U) +_float_sign_exp_re_c = re.compile(r'([-+]?\d*[.,]?\d+(?:[eE][-+]?\d+)?)', re.U) +_float_nosign_exp_re_c = re.compile(r'(\d*[.,]?\d+(?:[eE][-+]?\d+)?)', re.U) +_float_sign_noexp_re_c = re.compile(r'([-+]?\d*[.,]?\d+)', re.U) +_float_nosign_noexp_re_c = re.compile(r'(\d*[.,]?\d+)', re.U) + +# Integer regexes +_int_nosign_re = re.compile(r'(\d+)', re.U) +_int_sign_re = re.compile(r'([-+]?\d+)', re.U) + +# This dict will help select the correct regex and number conversion function. +_regex_and_num_function_chooser = { + (ns.F, '.'): (_float_sign_exp_re, fast_float), + (ns.F | ns.N, '.'): (_float_sign_noexp_re, fast_float), + (ns.F | ns.U, '.'): (_float_nosign_exp_re, fast_float), + (ns.F | ns.U | ns.N, '.'): (_float_nosign_noexp_re, fast_float), + (ns.I, '.'): (_int_sign_re, fast_int), + (ns.I | ns.N, '.'): (_int_sign_re, fast_int), + (ns.I | ns.U, '.'): (_int_nosign_re, fast_int), + (ns.I | ns.U | ns.N, '.'): (_int_nosign_re, fast_int), + (ns.F, ','): (_float_sign_exp_re_c, fast_float), + (ns.F | ns.N, ','): (_float_sign_noexp_re_c, fast_float), + (ns.F | ns.U, ','): (_float_nosign_exp_re_c, fast_float), + (ns.F | ns.U | ns.N, ','): (_float_nosign_noexp_re_c, fast_float), + (ns.I, ','): (_int_sign_re, fast_int), + (ns.I | ns.N, ','): (_int_sign_re, fast_int), + (ns.I | ns.U, ','): (_int_nosign_re, fast_int), + (ns.I | ns.U | ns.N, ','): (_int_nosign_re, fast_int), +} + + +def _args_to_enum(number_type, signed, exp, as_path, py3_safe): + """A function to convert input booleans to an enum-type argument.""" + alg = 0 + if number_type is not float: + msg = "The 'number_type' argument is depreciated as of 3.5.0, " + msg += "please use 'alg=ns.FLOAT', 'alg=ns.INT', or 'alg=ns.VERSION'" + warn(msg, DeprecationWarning) + alg |= (_nsdict['INT'] * bool(number_type in (int, None))) + alg |= (_nsdict['UNSIGNED'] * (number_type is None)) + if signed is not None: + msg = "The 'signed' argument is depreciated as of 3.5.0, " + msg += "please use 'alg=ns.UNSIGNED'." + warn(msg, DeprecationWarning) + alg |= (_nsdict['UNSIGNED'] * (not signed)) + if exp is not None: + msg = "The 'exp' argument is depreciated as of 3.5.0, " + msg += "please use 'alg=ns.NOEXP'." + warn(msg, DeprecationWarning) + alg |= (_nsdict['NOEXP'] * (not exp)) + if as_path is not None: + msg = "The 'as_path' argument is depreciated as of 3.5.0, " + msg += "please use 'alg=ns.PATH'." + warn(msg, DeprecationWarning) + alg |= (_nsdict['PATH'] * as_path) + if py3_safe is not None: + msg = "The 'py3_safe' argument is depreciated as of 3.5.0, " + msg += "please use 'alg=ns.TYPESAFE'." + warn(msg, DeprecationWarning) + alg |= (_nsdict['TYPESAFE'] * py3_safe) + return alg + + +def _input_parser(s, regex, numconv, py3_safe, use_locale, group_letters): + """Helper to parse the string input into numbers and strings.""" + + # Split the input string by numbers. + # If the input is not a string, TypeError is raised. + s = regex.split(s) + + # Now convert the numbers to numbers, and leave strings as strings. + # Take into account locale if needed, and group letters if needed. + # Remove empty strings from the list. + if use_locale: + s = [locale_convert(x, numconv, group_letters) for x in s if x] + elif group_letters: + s = [grouper(x, numconv) for x in s if x] + else: + s = [numconv(x) for x in s if x] + + # If the list begins with a number, lead with an empty string. + # This is used to get around the "unorderable types" issue. + if not s: # Return empty tuple for empty results. + return () + elif isreal(s[0]): + s = [''] + s + + # The _py3_safe function inserts "" between numbers in the list, + # and is used to get around "unorderable types" in complex cases. + # It is a separate function that needs to be requested specifically + # because it is expensive to call. + return _py3_safe(s) if py3_safe else s + + +def _path_splitter(s, _d_match=re.compile(r'\.\d').match): + """Split a string into its path components. Assumes a string is a path.""" + path_parts = [] + p_append = path_parts.append + path_location = s + + # Continue splitting the path from the back until we have reached + # '..' or '.', or until there is nothing left to split. + while path_location != curdir and path_location != pardir: + parent_path = path_location + path_location, child_path = split(parent_path) + if path_location == parent_path: + break + p_append(child_path) + + # This last append is the base path. + # Only append if the string is non-empty. + if path_location: + p_append(path_location) + + # We created this list in reversed order, so we now correct the order. + path_parts.reverse() + + # Now, split off the file extensions using a similar method to above. + # Continue splitting off file extensions until we reach a decimal number + # or there are no more extensions. + base = path_parts.pop() + base_parts = [] + b_append = base_parts.append + while True: + front = base + base, ext = splitext(front) + if _d_match(ext) or not ext: + # Reset base to before the split if the split is invalid. + base = front + break + b_append(ext) + b_append(base) + base_parts.reverse() + + # Return the split parent paths and then the split basename. + return path_parts + base_parts + + +def _py3_safe(parsed_list): + """Insert '' between two numbers.""" + length = len(parsed_list) + if length < 2: + return parsed_list + else: + new_list = [parsed_list[0]] + nl_append = new_list.append + for before, after in py23_zip(islice(parsed_list, 0, length-1), + islice(parsed_list, 1, None)): + if isreal(before) and isreal(after): + nl_append("") + nl_append(after) + return new_list + + +def _natsort_key(val, key, alg): + """\ + Key to sort strings and numbers naturally. + + It works by separating out the numbers from the strings. This function for + internal use only. See the natsort_keygen documentation for details of each + parameter. + + Parameters + ---------- + val : {str, unicode} + key : callable + alg : ns enum + + Returns + ------- + out : tuple + The modified value with numbers extracted. + + """ + + # Convert the arguments to the proper input tuple + try: + use_locale = alg & _nsdict['LOCALE'] + inp_options = (alg & _NUMBER_ALGORITHMS, + localeconv()['decimal_point'] if use_locale else '.') + except TypeError: + msg = "_natsort_key: 'alg' argument must be from the enum 'ns'" + raise ValueError(msg+', got {0}'.format(py23_str(alg))) + + # Get the proper regex and conversion function. + try: + regex, num_function = _regex_and_num_function_chooser[inp_options] + except KeyError: # pragma: no cover + if inp_options[1] not in ('.', ','): # pragma: no cover + raise ValueError("_natsort_key: currently natsort only supports " + "the decimal separators '.' and ','. " + "Please file a bug report.") + else: + raise + else: + # Apply key if needed. + if key is not None: + val = key(val) + + # If this is a path, convert it. + # An AttrubuteError is raised if not a string. + split_as_path = False + if alg & _nsdict['PATH']: + try: + val = _path_splitter(val) + except AttributeError: + pass + else: + # Record that this string was split as a path so that + # we don't set PATH in the recursive call. + split_as_path = True + + # Assume the input are strings, which is the most common case. + # Apply the string modification if needed. + try: + if alg & _nsdict['LOWERCASEFIRST']: + val = val.swapcase() + if alg & _nsdict['IGNORECASE']: + val = val.lower() + return tuple(_input_parser(val, + regex, + num_function, + alg & _nsdict['TYPESAFE'], + use_locale, + alg & _nsdict['GROUPLETTERS'])) + except (TypeError, AttributeError): + # If not strings, assume it is an iterable that must + # be parsed recursively. Do not apply the key recursively. + # If this string was split as a path, turn off 'PATH'. + try: + was_path = alg & _nsdict['PATH'] + newalg = alg & _ALL_BUT_PATH + newalg |= (was_path * (not split_as_path)) + return tuple([_natsort_key(x, None, newalg) for x in val]) + # If there is still an error, it must be a number. + # Return as-is, with a leading empty string. + except TypeError: + return (('', val,),) if alg & _nsdict['PATH'] else ('', val,) |