diff options
author | Seth M Morton <seth.m.morton@gmail.com> | 2014-08-12 18:18:58 -0700 |
---|---|---|
committer | Seth M Morton <seth.m.morton@gmail.com> | 2014-08-12 23:04:26 -0700 |
commit | d94de806ce6ee78cd15a78b8a746adb287b06cd9 (patch) | |
tree | 8b333be964eb7024f6daf9ed9dd0bba1e97b3e53 | |
parent | 5840e801f2175798a91f197c4cfd2c03fa683f8d (diff) | |
download | natsort-d94de806ce6ee78cd15a78b8a746adb287b06cd9.tar.gz |
Adopted new fastnumbers module.
This module quickly confers strings to numbers, and will speed up
the natsort algorithm significantly (if it is installed). In case it is
not installed, a "fake_fastnumbers" module is added to natsort that
simulates the needed functions.
It is found that up to 30% gain in efficiency can be had by using
the fastnumbers module.
-rw-r--r-- | .coveragerc | 3 | ||||
-rw-r--r-- | natsort/fake_fastnumbers.py | 30 | ||||
-rw-r--r-- | natsort/natsort.py | 71 | ||||
-rw-r--r-- | test_natsort/test_fake_fastnumbers.py | 26 | ||||
-rw-r--r-- | test_natsort/test_natsort.py | 61 |
5 files changed, 123 insertions, 68 deletions
diff --git a/.coveragerc b/.coveragerc index 1bbfe9d..8622bd1 100644 --- a/.coveragerc +++ b/.coveragerc @@ -9,6 +9,9 @@ exclude_lines = raise NotImplementedError raise$ + # Don't complain about alternate imports + except ImportError + # Don't complain if non-runnable code isn't run: if 0: if __name__ == .__main__.: diff --git a/natsort/fake_fastnumbers.py b/natsort/fake_fastnumbers.py new file mode 100644 index 0000000..15d7e88 --- /dev/null +++ b/natsort/fake_fastnumbers.py @@ -0,0 +1,30 @@ +# -*- coding: utf-8 -*- +"""\ +This module is intended to replicate some of the functionality +from the fastnumbers module in the event that module is not +installed. +""" +from __future__ import (print_function, division, + unicode_literals, absolute_import) + +import re + +float_re = re.compile(r'[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?$') +int_re = re.compile(r'[-+]?\d+$') + + +def fast_float(x, regex_matcher=float_re.match): + """Convert a string to a float quickly""" + return float(x) if regex_matcher(x) else x + + +def fast_int(x, regex_matcher=int_re.match): + """\ + Convert a string to a int quickly, return input as-is if not possible. + """ + return int(x) if regex_matcher(x) else x + + +def isreal(x, ntypes=set([int, float])): + """Returns true if the input is a real number, false otherwise.""" + return type(x) in ntypes diff --git a/natsort/natsort.py b/natsort/natsort.py index c67ec3f..5974199 100644 --- a/natsort/natsort.py +++ b/natsort/natsort.py @@ -23,6 +23,13 @@ from functools import partial from itertools import islice from warnings import warn +# If the user has fastnumbers installed, they will get great speed +# benefits. If not, we simulate the functions here. +try: + from fastnumbers import fast_float, fast_int, isreal +except ImportError: + from .fake_fastnumbers import fast_float, fast_int, isreal + from .py23compat import u_format, py23_str, py23_zip # Make sure the doctest works for either python2 or python3 @@ -38,54 +45,37 @@ int_nosign_re = re.compile(r'(\d+)') int_sign_re = re.compile(r'([-+]?\d+)') # This dict will help select the correct regex and number conversion function. regex_and_num_function_chooser = { - (float, True, True): (float_sign_exp_re, float), - (float, True, False): (float_sign_noexp_re, float), - (float, False, True): (float_nosign_exp_re, float), - (float, False, False): (float_nosign_noexp_re, float), - (int, True, True): (int_sign_re, int), - (int, True, False): (int_sign_re, int), - (int, False, True): (int_nosign_re, int), - (int, False, False): (int_nosign_re, int), - (None, True, True): (int_nosign_re, int), - (None, True, False): (int_nosign_re, int), - (None, False, True): (int_nosign_re, int), - (None, False, False): (int_nosign_re, int), + (float, True, True): (float_sign_exp_re, fast_float), + (float, True, False): (float_sign_noexp_re, fast_float), + (float, False, True): (float_nosign_exp_re, fast_float), + (float, False, False): (float_nosign_noexp_re, fast_float), + (int, True, True): (int_sign_re, fast_int), + (int, True, False): (int_sign_re, fast_int), + (int, False, True): (int_nosign_re, fast_int), + (int, False, False): (int_nosign_re, fast_int), + (None, True, True): (int_nosign_re, fast_int), + (None, True, False): (int_nosign_re, fast_int), + (None, False, True): (int_nosign_re, fast_int), + (None, False, False): (int_nosign_re, fast_int), } -# Number types. I have to use set([...]) and not {...} -# because I am supporting Python 2.6. -number_types = set([float, int]) - -# This regex is to make sure we don't mistake a number for a file extension -decimal = re.compile(r'\.\d') - def _number_finder(s, regex, numconv, py3_safe): """Helper to split numbers""" - # Split the input string by numbers. - # If there are no splits, return now. - # If the input is not a string, ValueError is raised. + # Split the input string by numbers. If there are no splits, return now. + # If the input is not a string, TypeError is raised. s = regex.split(s) if len(s) == 1: return tuple(s) # Now convert the numbers to numbers, and leave strings as strings. # Remove empty strings from the list. - # Profiling showed that using regex here is much faster than - # try/except with the numconv function. - r = regex.match - s = [numconv(x) if r(x) else x for x in s if x] + s = [numconv(x) for x in s if x] # If the list begins with a number, lead with an empty string. # This is used to get around the "unorderable types" issue. - # The most common case will be a string at the front of the - # list, and in that case the try/except method is faster than - # using isinstance. This was chosen at the expense of the less - # common case of a number being at the front of the list. - try: - s[0][0] # str supports indexing, but not numbers - except TypeError: + if isreal(s[0]): s = [''] + s # The _py3_safe function inserts "" between numbers in the list, @@ -95,11 +85,12 @@ def _number_finder(s, regex, numconv, py3_safe): return _py3_safe(s) if py3_safe else s -def _path_splitter(s): +def _path_splitter(s, _d_match=re.compile(r'\.\d').match): """Split a string into its path components. Assumes a string is a path.""" path_parts = [] p_append = path_parts.append path_location = s + # Continue splitting the path from the back until we have reached # '..' or '.', or until there is nothing left to split. while path_location != curdir and path_location != pardir: @@ -108,29 +99,32 @@ def _path_splitter(s): if path_location == parent_path: break p_append(child_path) + # This last append is the base path. # Only append if the string is non-empty. if path_location: p_append(path_location) + # We created this list in reversed order, so we now correct the order. path_parts.reverse() + # Now, split off the file extensions using a similar method to above. # Continue splitting off file extensions until we reach a decimal number # or there are no more extensions. base = path_parts.pop() base_parts = [] b_append = base_parts.append - d_match = decimal.match while True: front = base base, ext = splitext(front) - if d_match(ext) or not ext: + if _d_match(ext) or not ext: # Reset base to before the split if the split is invalid. base = front break b_append(ext) b_append(base) base_parts.reverse() + # Return the split parent paths and then the split basename. return path_parts + base_parts @@ -143,12 +137,9 @@ def _py3_safe(parsed_list): else: new_list = [parsed_list[0]] nl_append = new_list.append - ntypes = number_types for before, after in py23_zip(islice(parsed_list, 0, length-1), islice(parsed_list, 1, None)): - # I realize that isinstance is favored over type, but - # in this case type is SO MUCH FASTER than isinstance!! - if type(before) in ntypes and type(after) in ntypes: + if isreal(before) and isreal(after): nl_append("") nl_append(after) return new_list diff --git a/test_natsort/test_fake_fastnumbers.py b/test_natsort/test_fake_fastnumbers.py new file mode 100644 index 0000000..29ba9af --- /dev/null +++ b/test_natsort/test_fake_fastnumbers.py @@ -0,0 +1,26 @@ +# -*- coding: utf-8 -*- +"""\ +Test the fake fastnumbers module. +""" +from natsort.fake_fastnumbers import fast_float, fast_int, isreal + + +def test_fast_float(): + assert fast_float('45.8') == 45.8 + assert fast_float('-45') == -45.0 + assert fast_float('45.8e-2') == 45.8e-2 + assert fast_float('invalid') == 'invalid' + + +def test_fast_int(): + assert fast_int('45.8') == '45.8' + assert fast_int('-45') == -45 + assert fast_int('+45') == 45 + assert fast_int('invalid') == 'invalid' + + +def test_isreal(): + assert not isreal('45.8') + assert isreal(-45) + assert isreal(45.8e-2) + assert not isreal('invalid') diff --git a/test_natsort/test_natsort.py b/test_natsort/test_natsort.py index 0eeed12..264b508 100644 --- a/test_natsort/test_natsort.py +++ b/test_natsort/test_natsort.py @@ -11,25 +11,30 @@ from natsort.natsort import _number_finder, _py3_safe, _natsort_key from natsort.natsort import float_sign_exp_re, float_nosign_exp_re, float_sign_noexp_re from natsort.natsort import float_nosign_noexp_re, int_nosign_re, int_sign_re +try: + from fastnumbers import fast_float, fast_int +except ImportError: + from natsort.fake_fastnumbers import fast_float, fast_int + def test_number_finder(): - assert _number_finder('a5+5.034e-1', float_sign_exp_re, float, False) == ['a', 5.0, 0.5034] - assert _number_finder('a5+5.034e-1', float_nosign_exp_re, float, False) == ['a', 5.0, '+', 0.5034] - assert _number_finder('a5+5.034e-1', float_sign_noexp_re, float, False) == ['a', 5.0, 5.034, 'e', -1.0] - assert _number_finder('a5+5.034e-1', float_nosign_noexp_re, float, False) == ['a', 5.0, '+', 5.034, 'e-', 1.0] - assert _number_finder('a5+5.034e-1', int_nosign_re, int, False) == ['a', 5, '+', 5, '.', 34, 'e-', 1] - assert _number_finder('a5+5.034e-1', int_sign_re, int, False) == ['a', 5, 5, '.', 34, 'e', -1] + assert _number_finder('a5+5.034e-1', float_sign_exp_re, fast_float, False) == ['a', 5.0, 0.5034] + assert _number_finder('a5+5.034e-1', float_nosign_exp_re, fast_float, False) == ['a', 5.0, '+', 0.5034] + assert _number_finder('a5+5.034e-1', float_sign_noexp_re, fast_float, False) == ['a', 5.0, 5.034, 'e', -1.0] + assert _number_finder('a5+5.034e-1', float_nosign_noexp_re, fast_float, False) == ['a', 5.0, '+', 5.034, 'e-', 1.0] + assert _number_finder('a5+5.034e-1', int_nosign_re, fast_int, False) == ['a', 5, '+', 5, '.', 34, 'e-', 1] + assert _number_finder('a5+5.034e-1', int_sign_re, fast_int, False) == ['a', 5, 5, '.', 34, 'e', -1] - assert _number_finder('a5+5.034e-1', float_sign_exp_re, float, True) == ['a', 5.0, '', 0.5034] - assert _number_finder('a5+5.034e-1', float_nosign_exp_re, float, True) == ['a', 5.0, '+', 0.5034] - assert _number_finder('a5+5.034e-1', float_sign_noexp_re, float, True) == ['a', 5.0, '', 5.034, 'e', -1.0] - assert _number_finder('a5+5.034e-1', float_nosign_noexp_re, float, True) == ['a', 5.0, '+', 5.034, 'e-', 1.0] - assert _number_finder('a5+5.034e-1', int_nosign_re, int, True) == ['a', 5, '+', 5, '.', 34, 'e-', 1] - assert _number_finder('a5+5.034e-1', int_sign_re, int, True) == ['a', 5, '', 5, '.', 34, 'e', -1] + assert _number_finder('a5+5.034e-1', float_sign_exp_re, fast_float, True) == ['a', 5.0, '', 0.5034] + assert _number_finder('a5+5.034e-1', float_nosign_exp_re, fast_float, True) == ['a', 5.0, '+', 0.5034] + assert _number_finder('a5+5.034e-1', float_sign_noexp_re, fast_float, True) == ['a', 5.0, '', 5.034, 'e', -1.0] + assert _number_finder('a5+5.034e-1', float_nosign_noexp_re, fast_float, True) == ['a', 5.0, '+', 5.034, 'e-', 1.0] + assert _number_finder('a5+5.034e-1', int_nosign_re, fast_int, True) == ['a', 5, '+', 5, '.', 34, 'e-', 1] + assert _number_finder('a5+5.034e-1', int_sign_re, fast_int, True) == ['a', 5, '', 5, '.', 34, 'e', -1] - assert _number_finder('6a5+5.034e-1', float_sign_exp_re, float, False) == ['', 6.0, 'a', 5.0, 0.5034] - assert _number_finder('6a5+5.034e-1', float_sign_exp_re, float, True) == ['', 6.0, 'a', 5.0, '', 0.5034] + assert _number_finder('6a5+5.034e-1', float_sign_exp_re, fast_float, False) == ['', 6.0, 'a', 5.0, 0.5034] + assert _number_finder('6a5+5.034e-1', float_sign_exp_re, fast_float, True) == ['', 6.0, 'a', 5.0, '', 0.5034] def test_py3_safe(): @@ -47,21 +52,21 @@ def test_natsort_key_private(): assert a == ['num2', 'num3', 'num5'] # The below illustrates how the key works, and how the different options affect sorting. - assert _natsort_key('a-5.034e1') == ('a', -50.34) - assert _natsort_key('a-5.034e1', number_type=float, signed=True, exp=True) == ('a', -50.34) - assert _natsort_key('a-5.034e1', number_type=float, signed=True, exp=False) == ('a', -5.034, 'e', 1.0) - assert _natsort_key('a-5.034e1', number_type=float, signed=False, exp=True) == ('a-', 50.34) - assert _natsort_key('a-5.034e1', number_type=float, signed=False, exp=False) == ('a-', 5.034, 'e', 1.0) - assert _natsort_key('a-5.034e1', number_type=int) == ('a', -5, '.', 34, 'e', 1) - assert _natsort_key('a-5.034e1', number_type=int, signed=False) == ('a-', 5, '.', 34, 'e', 1) - assert _natsort_key('a-5.034e1', number_type=None) == _natsort_key('a-5.034e1', number_type=int, signed=False) - assert _natsort_key('a-5.034e1', key=lambda x: x.upper()) == ('A', -50.34) + assert _natsort_key('a-5.034e2') == ('a', -503.4) + assert _natsort_key('a-5.034e2', number_type=float, signed=True, exp=True) == ('a', -503.4) + assert _natsort_key('a-5.034e2', number_type=float, signed=True, exp=False) == ('a', -5.034, 'e', 2.0) + assert _natsort_key('a-5.034e2', number_type=float, signed=False, exp=True) == ('a-', 503.4) + assert _natsort_key('a-5.034e2', number_type=float, signed=False, exp=False) == ('a-', 5.034, 'e', 2.0) + assert _natsort_key('a-5.034e2', number_type=int) == ('a', -5, '.', 34, 'e', 2) + assert _natsort_key('a-5.034e2', number_type=int, signed=False) == ('a-', 5, '.', 34, 'e', 2) + assert _natsort_key('a-5.034e2', number_type=None) == _natsort_key('a-5.034e2', number_type=int, signed=False) + assert _natsort_key('a-5.034e2', key=lambda x: x.upper()) == ('A', -503.4) # Iterables are parsed recursively so you can sort lists of lists. - assert _natsort_key(('a1', 'a-5.034e1')) == (('a', 1.0), ('a', -50.34)) - assert _natsort_key(('a1', 'a-5.034e1'), number_type=None) == (('a', 1), ('a-', 5, '.', 34, 'e', 1)) + assert _natsort_key(('a1', 'a-5.034e2')) == (('a', 1.0), ('a', -503.4)) + assert _natsort_key(('a1', 'a-5.034e2'), number_type=None) == (('a', 1), ('a-', 5, '.', 34, 'e', 2)) # A key is applied before recursion, but not in the recursive calls. - assert _natsort_key(('a1', 'a-5.034e1'), key=itemgetter(1)) == ('a', -50.34) + assert _natsort_key(('a1', 'a-5.034e2'), key=itemgetter(1)) == ('a', -503.4) # Strings that lead with a number get an empty string at the front of the tuple. # This is designed to get around the "unorderable types" issue. @@ -100,10 +105,10 @@ def test_natsort_key_public(): # But it raises a depreciation warning with warnings.catch_warnings(record=True) as w: warnings.simplefilter("always") - assert natsort_key('a-5.034e1') == _natsort_key('a-5.034e1') + assert natsort_key('a-5.034e2') == _natsort_key('a-5.034e2') assert len(w) == 1 assert "natsort_key is depreciated as of 3.4.0, please use natsort_keygen" in str(w[-1].message) - assert natsort_key('a-5.034e1', number_type=float, signed=False, exp=False) == _natsort_key('a-5.034e1', number_type=float, signed=False, exp=False) + assert natsort_key('a-5.034e2', number_type=float, signed=False, exp=False) == _natsort_key('a-5.034e2', number_type=float, signed=False, exp=False) # It is called for each element in a list when sorting with warnings.catch_warnings(record=True) as w: |