natsort/utils.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277

# -*- coding: utf-8 -*-
"""
Utilities and definitions for natsort, mostly all used to define
the _natsort_key function.

"""

from __future__ import (print_function, division,
                        unicode_literals, absolute_import)

# Std. lib imports.
import re
from warnings import warn
from os import curdir, pardir
from os.path import split, splitext
from itertools import islice
from locale import localeconv

# Local imports.
from natsort.locale_help import locale_convert, grouper
from natsort.py23compat import py23_str, py23_zip
from natsort.ns_enum import ns, _nsdict

# If the user has fastnumbers installed, they will get great speed
# benefits. If not, we simulate the functions here.
try:
    from fastnumbers import fast_float, fast_int, isreal
except ImportError:
    from natsort.fake_fastnumbers import fast_float, fast_int, isreal

# Group algorithm types for easy extraction
_NUMBER_ALGORITHMS = ns.FLOAT | ns.INT | ns.UNSIGNED | ns.NOEXP
_ALL_BUT_PATH = (ns.F | ns.I | ns.U | ns.N | ns.L |
                 ns.IC | ns.LF | ns.G | ns.TYPESAFE)

# The regex that locates floats
_float_sign_exp_re = re.compile(r'([-+]?\d*\.?\d+(?:[eE][-+]?\d+)?)', re.U)
_float_nosign_exp_re = re.compile(r'(\d*\.?\d+(?:[eE][-+]?\d+)?)', re.U)
_float_sign_noexp_re = re.compile(r'([-+]?\d*\.?\d+)', re.U)
_float_nosign_noexp_re = re.compile(r'(\d*\.?\d+)', re.U)
_float_sign_exp_re_c = re.compile(r'([-+]?\d*[.,]?\d+(?:[eE][-+]?\d+)?)', re.U)
_float_nosign_exp_re_c = re.compile(r'(\d*[.,]?\d+(?:[eE][-+]?\d+)?)', re.U)
_float_sign_noexp_re_c = re.compile(r'([-+]?\d*[.,]?\d+)', re.U)
_float_nosign_noexp_re_c = re.compile(r'(\d*[.,]?\d+)', re.U)

# Integer regexes
_int_nosign_re = re.compile(r'(\d+)', re.U)
_int_sign_re = re.compile(r'([-+]?\d+)', re.U)

# This dict will help select the correct regex and number conversion function.
_regex_and_num_function_chooser = {
    (ns.F, '.'):               (_float_sign_exp_re,     fast_float),
    (ns.F | ns.N, '.'):        (_float_sign_noexp_re,   fast_float),
    (ns.F | ns.U, '.'):        (_float_nosign_exp_re,   fast_float),
    (ns.F | ns.U | ns.N, '.'): (_float_nosign_noexp_re, fast_float),
    (ns.I, '.'):               (_int_sign_re,   fast_int),
    (ns.I | ns.N, '.'):        (_int_sign_re,   fast_int),
    (ns.I | ns.U, '.'):        (_int_nosign_re, fast_int),
    (ns.I | ns.U | ns.N, '.'): (_int_nosign_re, fast_int),
    (ns.F, ','):               (_float_sign_exp_re_c,     fast_float),
    (ns.F | ns.N, ','):        (_float_sign_noexp_re_c,   fast_float),
    (ns.F | ns.U, ','):        (_float_nosign_exp_re_c,   fast_float),
    (ns.F | ns.U | ns.N, ','): (_float_nosign_noexp_re_c, fast_float),
    (ns.I, ','):               (_int_sign_re,   fast_int),
    (ns.I | ns.N, ','):        (_int_sign_re,   fast_int),
    (ns.I | ns.U, ','):        (_int_nosign_re, fast_int),
    (ns.I | ns.U | ns.N, ','): (_int_nosign_re, fast_int),
}


def _args_to_enum(number_type, signed, exp, as_path, py3_safe):
    """A function to convert input booleans to an enum-type argument."""
    alg = 0
    if number_type is not float:
        msg = "The 'number_type' argument is depreciated as of 3.5.0, "
        msg += "please use 'alg=ns.FLOAT', 'alg=ns.INT', or 'alg=ns.VERSION'"
        warn(msg, DeprecationWarning)
        alg |= (_nsdict['INT'] * bool(number_type in (int, None)))
        alg |= (_nsdict['UNSIGNED'] * (number_type is None))
    if signed is not None:
        msg = "The 'signed' argument is depreciated as of 3.5.0, "
        msg += "please use 'alg=ns.UNSIGNED'."
        warn(msg, DeprecationWarning)
        alg |= (_nsdict['UNSIGNED'] * (not signed))
    if exp is not None:
        msg = "The 'exp' argument is depreciated as of 3.5.0, "
        msg += "please use 'alg=ns.NOEXP'."
        warn(msg, DeprecationWarning)
        alg |= (_nsdict['NOEXP'] * (not exp))
    if as_path is not None:
        msg = "The 'as_path' argument is depreciated as of 3.5.0, "
        msg += "please use 'alg=ns.PATH'."
        warn(msg, DeprecationWarning)
        alg |= (_nsdict['PATH'] * as_path)
    if py3_safe is not None:
        msg = "The 'py3_safe' argument is depreciated as of 3.5.0, "
        msg += "please use 'alg=ns.TYPESAFE'."
        warn(msg, DeprecationWarning)
        alg |= (_nsdict['TYPESAFE'] * py3_safe)
    return alg


def _input_parser(s, regex, numconv, py3_safe, use_locale, group_letters):
    """Helper to parse the string input into numbers and strings."""

    # Split the input string by numbers.
    # If the input is not a string, TypeError is raised.
    s = regex.split(s)

    # Now convert the numbers to numbers, and leave strings as strings.
    # Take into account locale if needed, and group letters if needed.
    # Remove empty strings from the list.
    if use_locale:
        s = [locale_convert(x, numconv, group_letters) for x in s if x]
    elif group_letters:
        s = [grouper(x, numconv) for x in s if x]
    else:
        s = [numconv(x) for x in s if x]

    # If the list begins with a number, lead with an empty string.
    # This is used to get around the "unorderable types" issue.
    if not s:  # Return empty tuple for empty results.
        return ()
    elif isreal(s[0]):
        s = [''] + s

    # The _py3_safe function inserts "" between numbers in the list,
    # and is used to get around "unorderable types" in complex cases.
    # It is a separate function that needs to be requested specifically
    # because it is expensive to call.
    return _py3_safe(s) if py3_safe else s


def _path_splitter(s, _d_match=re.compile(r'\.\d').match):
    """Split a string into its path components. Assumes a string is a path."""
    path_parts = []
    p_append = path_parts.append
    path_location = s

    # Continue splitting the path from the back until we have reached
    # '..' or '.', or until there is nothing left to split.
    while path_location != curdir and path_location != pardir:
        parent_path = path_location
        path_location, child_path = split(parent_path)
        if path_location == parent_path:
            break
        p_append(child_path)

    # This last append is the base path.
    # Only append if the string is non-empty.
    if path_location:
        p_append(path_location)

    # We created this list in reversed order, so we now correct the order.
    path_parts.reverse()

    # Now, split off the file extensions using a similar method to above.
    # Continue splitting off file extensions until we reach a decimal number
    # or there are no more extensions.
    base = path_parts.pop()
    base_parts = []
    b_append = base_parts.append
    while True:
        front = base
        base, ext = splitext(front)
        if _d_match(ext) or not ext:
            # Reset base to before the split if the split is invalid.
            base = front
            break
        b_append(ext)
    b_append(base)
    base_parts.reverse()

    # Return the split parent paths and then the split basename.
    return path_parts + base_parts


def _py3_safe(parsed_list):
    """Insert '' between two numbers."""
    length = len(parsed_list)
    if length < 2:
        return parsed_list
    else:
        new_list = [parsed_list[0]]
        nl_append = new_list.append
        for before, after in py23_zip(islice(parsed_list, 0, length-1),
                                      islice(parsed_list, 1, None)):
            if isreal(before) and isreal(after):
                nl_append("")
            nl_append(after)
        return new_list


def _natsort_key(val, key, alg):
    """\
    Key to sort strings and numbers naturally.

    It works by separating out the numbers from the strings. This function for
    internal use only. See the natsort_keygen documentation for details of each
    parameter.

    Parameters
    ----------
    val : {str, unicode}
    key : callable
    alg : ns enum

    Returns
    -------
    out : tuple
        The modified value with numbers extracted.

    """

    # Convert the arguments to the proper input tuple
    try:
        use_locale = alg & _nsdict['LOCALE']
        inp_options = (alg & _NUMBER_ALGORITHMS,
                       localeconv()['decimal_point'] if use_locale else '.')
    except TypeError:
        msg = "_natsort_key: 'alg' argument must be from the enum 'ns'"
        raise ValueError(msg+', got {0}'.format(py23_str(alg)))

    # Get the proper regex and conversion function.
    try:
        regex, num_function = _regex_and_num_function_chooser[inp_options]
    except KeyError:  # pragma: no cover
        if inp_options[1] not in ('.', ','):  # pragma: no cover
            raise ValueError("_natsort_key: currently natsort only supports "
                             "the decimal separators '.' and ','. "
                             "Please file a bug report.")
        else:
            raise
    else:
        # Apply key if needed.
        if key is not None:
            val = key(val)

        # If this is a path, convert it.
        # An AttrubuteError is raised if not a string.
        split_as_path = False
        if alg & _nsdict['PATH']:
            try:
                val = _path_splitter(val)
            except AttributeError:
                pass
            else:
                # Record that this string was split as a path so that
                # we don't set PATH in the recursive call.
                split_as_path = True

        # Assume the input are strings, which is the most common case.
        # Apply the string modification if needed.
        try:
            if alg & _nsdict['LOWERCASEFIRST']:
                val = val.swapcase()
            if alg & _nsdict['IGNORECASE']:
                val = val.lower()
            return tuple(_input_parser(val,
                                       regex,
                                       num_function,
                                       alg & _nsdict['TYPESAFE'],
                                       use_locale,
                                       alg & _nsdict['GROUPLETTERS']))
        except (TypeError, AttributeError):
            # If not strings, assume it is an iterable that must
            # be parsed recursively. Do not apply the key recursively.
            # If this string was split as a path, turn off 'PATH'.
            try:
                was_path = alg & _nsdict['PATH']
                newalg = alg & _ALL_BUT_PATH
                newalg |= (was_path * (not split_as_path))
                return tuple([_natsort_key(x, None, newalg) for x in val])
            # If there is still an error, it must be a number.
            # Return as-is, with a leading empty string.
            except TypeError:
                return (('', val,),) if alg & _nsdict['PATH'] else ('', val,)