Adopted new fastnumbers module.

This module quickly confers strings to numbers, and will speed up the natsort algorithm significantly (if it is installed). In case it is not installed, a "fake_fastnumbers" module is added to natsort that simulates the needed functions. It is found that up to 30% gain in efficiency can be had by using the fastnumbers module.
author: Seth M Morton <seth.m.morton@gmail.com> 2014-08-12 18:18:58 -0700
committer: Seth M Morton <seth.m.morton@gmail.com> 2014-08-12 23:04:26 -0700
commit: d94de806ce6ee78cd15a78b8a746adb287b06cd9 (patch)
tree: 8b333be964eb7024f6daf9ed9dd0bba1e97b3e53
parent: 5840e801f2175798a91f197c4cfd2c03fa683f8d (diff)
download: natsort-d94de806ce6ee78cd15a78b8a746adb287b06cd9.tar.gz
5 files changed, 123 insertions, 68 deletions
diff --git a/.coveragerc b/.coveragerc
index 1bbfe9d..8622bd1 100644
--- a/.coveragerc
+++ b/.coveragerc
@@ -9,6 +9,9 @@ exclude_lines =
     raise NotImplementedError
     raise$
 
+    # Don't complain about alternate imports
+    except ImportError
+
     # Don't complain if non-runnable code isn't run:
     if 0:
     if __name__ == .__main__.:
diff --git a/natsort/fake_fastnumbers.py b/natsort/fake_fastnumbers.py
new file mode 100644
index 0000000..15d7e88
--- /dev/null
+++ b/natsort/fake_fastnumbers.py
@@ -0,0 +1,30 @@
+# -*- coding: utf-8 -*-
+"""\
+This module is intended to replicate some of the functionality
+from the fastnumbers module in the event that module is not
+installed.
+"""
+from __future__ import (print_function, division,
+                        unicode_literals, absolute_import)
+
+import re
+
+float_re = re.compile(r'[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?$')
+int_re = re.compile(r'[-+]?\d+$')
+
+
+def fast_float(x, regex_matcher=float_re.match):
+    """Convert a string to a float quickly"""
+    return float(x) if regex_matcher(x) else x
+
+
+def fast_int(x, regex_matcher=int_re.match):
+    """\
+    Convert a string to a int quickly, return input as-is if not possible.
+    """
+    return int(x) if regex_matcher(x) else x
+
+
+def isreal(x, ntypes=set([int, float])):
+    """Returns true if the input is a real number, false otherwise."""
+    return type(x) in ntypes
diff --git a/natsort/natsort.py b/natsort/natsort.py
index c67ec3f..5974199 100644
--- a/natsort/natsort.py
+++ b/natsort/natsort.py
@@ -23,6 +23,13 @@ from functools import partial
 from itertools import islice
 from warnings import warn
 
+# If the user has fastnumbers installed, they will get great speed
+# benefits.  If not, we simulate the functions here.
+try:
+    from fastnumbers import fast_float, fast_int, isreal
+except ImportError:
+    from .fake_fastnumbers import fast_float, fast_int, isreal
+
 from .py23compat import u_format, py23_str, py23_zip
 
 # Make sure the doctest works for either python2 or python3
@@ -38,54 +45,37 @@ int_nosign_re = re.compile(r'(\d+)')
 int_sign_re = re.compile(r'([-+]?\d+)')
 # This dict will help select the correct regex and number conversion function.
 regex_and_num_function_chooser = {
-    (float, True,  True):  (float_sign_exp_re,     float),
-    (float, True,  False): (float_sign_noexp_re,   float),
-    (float, False, True):  (float_nosign_exp_re,   float),
-    (float, False, False): (float_nosign_noexp_re, float),
-    (int,   True,  True):  (int_sign_re,   int),
-    (int,   True,  False): (int_sign_re,   int),
-    (int,   False, True):  (int_nosign_re, int),
-    (int,   False, False): (int_nosign_re, int),
-    (None,  True,  True):  (int_nosign_re, int),
-    (None,  True,  False): (int_nosign_re, int),
-    (None,  False, True):  (int_nosign_re, int),
-    (None,  False, False): (int_nosign_re, int),
+    (float, True,  True):  (float_sign_exp_re,     fast_float),
+    (float, True,  False): (float_sign_noexp_re,   fast_float),
+    (float, False, True):  (float_nosign_exp_re,   fast_float),
+    (float, False, False): (float_nosign_noexp_re, fast_float),
+    (int,   True,  True):  (int_sign_re,   fast_int),
+    (int,   True,  False): (int_sign_re,   fast_int),
+    (int,   False, True):  (int_nosign_re, fast_int),
+    (int,   False, False): (int_nosign_re, fast_int),
+    (None,  True,  True):  (int_nosign_re, fast_int),
+    (None,  True,  False): (int_nosign_re, fast_int),
+    (None,  False, True):  (int_nosign_re, fast_int),
+    (None,  False, False): (int_nosign_re, fast_int),
 }
 
-# Number types.  I have to use set([...]) and not {...}
-# because I am supporting Python 2.6.
-number_types = set([float, int])
-
-# This regex is to make sure we don't mistake a number for a file extension
-decimal = re.compile(r'\.\d')
-
 
 def _number_finder(s, regex, numconv, py3_safe):
     """Helper to split numbers"""
 
-    # Split the input string by numbers.
-    # If there are no splits, return now.
-    # If the input is not a string, ValueError is raised.
+    # Split the input string by numbers. If there are no splits, return now.
+    # If the input is not a string, TypeError is raised.
     s = regex.split(s)
     if len(s) == 1:
         return tuple(s)
 
     # Now convert the numbers to numbers, and leave strings as strings.
     # Remove empty strings from the list.
-    # Profiling showed that using regex here is much faster than
-    # try/except with the numconv function.
-    r = regex.match
-    s = [numconv(x) if r(x) else x for x in s if x]
+    s = [numconv(x) for x in s if x]
 
     # If the list begins with a number, lead with an empty string.
     # This is used to get around the "unorderable types" issue.
-    # The most common case will be a string at the front of the
-    # list, and in that case the try/except method is faster than
-    # using isinstance. This was chosen at the expense of the less
-    # common case of a number being at the front of the list.
-    try:
-        s[0][0]  # str supports indexing, but not numbers
-    except TypeError:
+    if isreal(s[0]):
         s = [''] + s
 
     # The _py3_safe function inserts "" between numbers in the list,
@@ -95,11 +85,12 @@ def _number_finder(s, regex, numconv, py3_safe):
     return _py3_safe(s) if py3_safe else s
 
 
-def _path_splitter(s):
+def _path_splitter(s, _d_match=re.compile(r'\.\d').match):
     """Split a string into its path components. Assumes a string is a path."""
     path_parts = []
     p_append = path_parts.append
     path_location = s
+
     # Continue splitting the path from the back until we have reached
     # '..' or '.', or until there is nothing left to split.
     while path_location != curdir and path_location != pardir:
@@ -108,29 +99,32 @@ def _path_splitter(s):
         if path_location == parent_path:
             break
         p_append(child_path)
+
     # This last append is the base path.
     # Only append if the string is non-empty.
     if path_location:
         p_append(path_location)
+
     # We created this list in reversed order, so we now correct the order.
     path_parts.reverse()
+
     # Now, split off the file extensions using a similar method to above.
     # Continue splitting off file extensions until we reach a decimal number
     # or there are no more extensions.
     base = path_parts.pop()
     base_parts = []
     b_append = base_parts.append
-    d_match = decimal.match
     while True:
         front = base
         base, ext = splitext(front)
-        if d_match(ext) or not ext:
+        if _d_match(ext) or not ext:
             # Reset base to before the split if the split is invalid.
             base = front
             break
         b_append(ext)
     b_append(base)
     base_parts.reverse()
+
     # Return the split parent paths and then the split basename.
     return path_parts + base_parts
 
@@ -143,12 +137,9 @@ def _py3_safe(parsed_list):
     else:
         new_list = [parsed_list[0]]
         nl_append = new_list.append
-        ntypes = number_types
         for before, after in py23_zip(islice(parsed_list, 0, length-1),
                                       islice(parsed_list, 1, None)):
-            # I realize that isinstance is favored over type, but
-            # in this case type is SO MUCH FASTER than isinstance!!
-            if type(before) in ntypes and type(after) in ntypes:
+            if isreal(before) and isreal(after):
                 nl_append("")
             nl_append(after)
         return new_list
diff --git a/test_natsort/test_fake_fastnumbers.py b/test_natsort/test_fake_fastnumbers.py
new file mode 100644
index 0000000..29ba9af
--- /dev/null
+++ b/test_natsort/test_fake_fastnumbers.py
@@ -0,0 +1,26 @@
+# -*- coding: utf-8 -*-
+"""\
+Test the fake fastnumbers module.
+"""
+from natsort.fake_fastnumbers import fast_float, fast_int, isreal
+
+
+def test_fast_float():
+    assert fast_float('45.8') == 45.8
+    assert fast_float('-45') == -45.0
+    assert fast_float('45.8e-2') == 45.8e-2
+    assert fast_float('invalid') == 'invalid'
+
+
+def test_fast_int():
+    assert fast_int('45.8') == '45.8'
+    assert fast_int('-45') == -45
+    assert fast_int('+45') == 45
+    assert fast_int('invalid') == 'invalid'
+
+
+def test_isreal():
+    assert not isreal('45.8')
+    assert isreal(-45)
+    assert isreal(45.8e-2)
+    assert not isreal('invalid')
diff --git a/test_natsort/test_natsort.py b/test_natsort/test_natsort.py
index 0eeed12..264b508 100644
--- a/test_natsort/test_natsort.py
+++ b/test_natsort/test_natsort.py
@@ -11,25 +11,30 @@ from natsort.natsort import _number_finder, _py3_safe, _natsort_key
 from natsort.natsort import float_sign_exp_re, float_nosign_exp_re, float_sign_noexp_re
 from natsort.natsort import float_nosign_noexp_re, int_nosign_re, int_sign_re
 
+try:
+    from fastnumbers import fast_float, fast_int
+except ImportError:
+    from natsort.fake_fastnumbers import fast_float, fast_int
+
 
 def test_number_finder():
 
-    assert _number_finder('a5+5.034e-1', float_sign_exp_re,     float, False) == ['a', 5.0, 0.5034]
-    assert _number_finder('a5+5.034e-1', float_nosign_exp_re,   float, False) == ['a', 5.0, '+', 0.5034]
-    assert _number_finder('a5+5.034e-1', float_sign_noexp_re,   float, False) == ['a', 5.0, 5.034, 'e', -1.0]
-    assert _number_finder('a5+5.034e-1', float_nosign_noexp_re, float, False) == ['a', 5.0, '+', 5.034, 'e-', 1.0]
-    assert _number_finder('a5+5.034e-1', int_nosign_re,         int,   False) == ['a', 5, '+', 5, '.', 34, 'e-', 1]
-    assert _number_finder('a5+5.034e-1', int_sign_re,           int,   False) == ['a', 5, 5, '.', 34, 'e', -1]
+    assert _number_finder('a5+5.034e-1', float_sign_exp_re,     fast_float, False) == ['a', 5.0, 0.5034]
+    assert _number_finder('a5+5.034e-1', float_nosign_exp_re,   fast_float, False) == ['a', 5.0, '+', 0.5034]
+    assert _number_finder('a5+5.034e-1', float_sign_noexp_re,   fast_float, False) == ['a', 5.0, 5.034, 'e', -1.0]
+    assert _number_finder('a5+5.034e-1', float_nosign_noexp_re, fast_float, False) == ['a', 5.0, '+', 5.034, 'e-', 1.0]
+    assert _number_finder('a5+5.034e-1', int_nosign_re,         fast_int,   False) == ['a', 5, '+', 5, '.', 34, 'e-', 1]
+    assert _number_finder('a5+5.034e-1', int_sign_re,           fast_int,   False) == ['a', 5, 5, '.', 34, 'e', -1]
 
-    assert _number_finder('a5+5.034e-1', float_sign_exp_re,     float, True) == ['a', 5.0, '', 0.5034]
-    assert _number_finder('a5+5.034e-1', float_nosign_exp_re,   float, True) == ['a', 5.0, '+', 0.5034]
-    assert _number_finder('a5+5.034e-1', float_sign_noexp_re,   float, True) == ['a', 5.0, '', 5.034, 'e', -1.0]
-    assert _number_finder('a5+5.034e-1', float_nosign_noexp_re, float, True) == ['a', 5.0, '+', 5.034, 'e-', 1.0]
-    assert _number_finder('a5+5.034e-1', int_nosign_re,         int,   True) == ['a', 5, '+', 5, '.', 34, 'e-', 1]
-    assert _number_finder('a5+5.034e-1', int_sign_re,           int,   True) == ['a', 5, '', 5, '.', 34, 'e', -1]
+    assert _number_finder('a5+5.034e-1', float_sign_exp_re,     fast_float, True) == ['a', 5.0, '', 0.5034]
+    assert _number_finder('a5+5.034e-1', float_nosign_exp_re,   fast_float, True) == ['a', 5.0, '+', 0.5034]
+    assert _number_finder('a5+5.034e-1', float_sign_noexp_re,   fast_float, True) == ['a', 5.0, '', 5.034, 'e', -1.0]
+    assert _number_finder('a5+5.034e-1', float_nosign_noexp_re, fast_float, True) == ['a', 5.0, '+', 5.034, 'e-', 1.0]
+    assert _number_finder('a5+5.034e-1', int_nosign_re,         fast_int,   True) == ['a', 5, '+', 5, '.', 34, 'e-', 1]
+    assert _number_finder('a5+5.034e-1', int_sign_re,           fast_int,   True) == ['a', 5, '', 5, '.', 34, 'e', -1]
 
-    assert _number_finder('6a5+5.034e-1', float_sign_exp_re,    float, False) == ['', 6.0, 'a', 5.0, 0.5034]
-    assert _number_finder('6a5+5.034e-1', float_sign_exp_re,    float, True)  == ['', 6.0, 'a', 5.0, '', 0.5034]
+    assert _number_finder('6a5+5.034e-1', float_sign_exp_re,    fast_float, False) == ['', 6.0, 'a', 5.0, 0.5034]
+    assert _number_finder('6a5+5.034e-1', float_sign_exp_re,    fast_float, True)  == ['', 6.0, 'a', 5.0, '', 0.5034]
 
 
 def test_py3_safe():
@@ -47,21 +52,21 @@ def test_natsort_key_private():
     assert a == ['num2', 'num3', 'num5']
 
     # The below illustrates how the key works, and how the different options affect sorting.
-    assert _natsort_key('a-5.034e1')                                             == ('a', -50.34)
-    assert _natsort_key('a-5.034e1', number_type=float, signed=True,  exp=True)  == ('a', -50.34)
-    assert _natsort_key('a-5.034e1', number_type=float, signed=True,  exp=False) == ('a', -5.034, 'e', 1.0)
-    assert _natsort_key('a-5.034e1', number_type=float, signed=False, exp=True)  == ('a-', 50.34)
-    assert _natsort_key('a-5.034e1', number_type=float, signed=False, exp=False) == ('a-', 5.034, 'e', 1.0)
-    assert _natsort_key('a-5.034e1', number_type=int)                            == ('a', -5, '.', 34, 'e', 1)
-    assert _natsort_key('a-5.034e1', number_type=int, signed=False)              == ('a-', 5, '.', 34, 'e', 1)
-    assert _natsort_key('a-5.034e1', number_type=None) == _natsort_key('a-5.034e1', number_type=int, signed=False)
-    assert _natsort_key('a-5.034e1', key=lambda x: x.upper()) == ('A', -50.34)
+    assert _natsort_key('a-5.034e2')                                             == ('a', -503.4)
+    assert _natsort_key('a-5.034e2', number_type=float, signed=True,  exp=True)  == ('a', -503.4)
+    assert _natsort_key('a-5.034e2', number_type=float, signed=True,  exp=False) == ('a', -5.034, 'e', 2.0)
+    assert _natsort_key('a-5.034e2', number_type=float, signed=False, exp=True)  == ('a-', 503.4)
+    assert _natsort_key('a-5.034e2', number_type=float, signed=False, exp=False) == ('a-', 5.034, 'e', 2.0)
+    assert _natsort_key('a-5.034e2', number_type=int)                            == ('a', -5, '.', 34, 'e', 2)
+    assert _natsort_key('a-5.034e2', number_type=int, signed=False)              == ('a-', 5, '.', 34, 'e', 2)
+    assert _natsort_key('a-5.034e2', number_type=None) == _natsort_key('a-5.034e2', number_type=int, signed=False)
+    assert _natsort_key('a-5.034e2', key=lambda x: x.upper()) == ('A', -503.4)
 
     # Iterables are parsed recursively so you can sort lists of lists.
-    assert _natsort_key(('a1', 'a-5.034e1')) == (('a', 1.0), ('a', -50.34))
-    assert _natsort_key(('a1', 'a-5.034e1'), number_type=None) == (('a', 1), ('a-', 5, '.', 34, 'e', 1))
+    assert _natsort_key(('a1', 'a-5.034e2')) == (('a', 1.0), ('a', -503.4))
+    assert _natsort_key(('a1', 'a-5.034e2'), number_type=None) == (('a', 1), ('a-', 5, '.', 34, 'e', 2))
     # A key is applied before recursion, but not in the recursive calls.
-    assert _natsort_key(('a1', 'a-5.034e1'), key=itemgetter(1)) == ('a', -50.34)
+    assert _natsort_key(('a1', 'a-5.034e2'), key=itemgetter(1)) == ('a', -503.4)
 
     # Strings that lead with a number get an empty string at the front of the tuple.
     # This is designed to get around the "unorderable types" issue.
@@ -100,10 +105,10 @@ def test_natsort_key_public():
     # But it raises a depreciation warning
     with warnings.catch_warnings(record=True) as w:
         warnings.simplefilter("always")
-        assert natsort_key('a-5.034e1') == _natsort_key('a-5.034e1')
+        assert natsort_key('a-5.034e2') == _natsort_key('a-5.034e2')
         assert len(w) == 1
         assert "natsort_key is depreciated as of 3.4.0, please use natsort_keygen" in str(w[-1].message)
-        assert natsort_key('a-5.034e1', number_type=float, signed=False, exp=False) == _natsort_key('a-5.034e1', number_type=float, signed=False, exp=False)
+        assert natsort_key('a-5.034e2', number_type=float, signed=False, exp=False) == _natsort_key('a-5.034e2', number_type=float, signed=False, exp=False)
 
     # It is called for each element in a list when sorting
     with warnings.catch_warnings(record=True) as w:
author	Seth M Morton <seth.m.morton@gmail.com>	2014-08-12 18:18:58 -0700
committer	Seth M Morton <seth.m.morton@gmail.com>	2014-08-12 23:04:26 -0700
commit	d94de806ce6ee78cd15a78b8a746adb287b06cd9 (patch)
tree	8b333be964eb7024f6daf9ed9dd0bba1e97b3e53
parent	5840e801f2175798a91f197c4cfd2c03fa683f8d (diff)
download	natsort-d94de806ce6ee78cd15a78b8a746adb287b06cd9.tar.gz