summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSeth M Morton <seth.m.morton@gmail.com>2014-08-12 18:18:58 -0700
committerSeth M Morton <seth.m.morton@gmail.com>2014-08-12 23:04:26 -0700
commitd94de806ce6ee78cd15a78b8a746adb287b06cd9 (patch)
tree8b333be964eb7024f6daf9ed9dd0bba1e97b3e53
parent5840e801f2175798a91f197c4cfd2c03fa683f8d (diff)
downloadnatsort-d94de806ce6ee78cd15a78b8a746adb287b06cd9.tar.gz
Adopted new fastnumbers module.
This module quickly confers strings to numbers, and will speed up the natsort algorithm significantly (if it is installed). In case it is not installed, a "fake_fastnumbers" module is added to natsort that simulates the needed functions. It is found that up to 30% gain in efficiency can be had by using the fastnumbers module.
-rw-r--r--.coveragerc3
-rw-r--r--natsort/fake_fastnumbers.py30
-rw-r--r--natsort/natsort.py71
-rw-r--r--test_natsort/test_fake_fastnumbers.py26
-rw-r--r--test_natsort/test_natsort.py61
5 files changed, 123 insertions, 68 deletions
diff --git a/.coveragerc b/.coveragerc
index 1bbfe9d..8622bd1 100644
--- a/.coveragerc
+++ b/.coveragerc
@@ -9,6 +9,9 @@ exclude_lines =
raise NotImplementedError
raise$
+ # Don't complain about alternate imports
+ except ImportError
+
# Don't complain if non-runnable code isn't run:
if 0:
if __name__ == .__main__.:
diff --git a/natsort/fake_fastnumbers.py b/natsort/fake_fastnumbers.py
new file mode 100644
index 0000000..15d7e88
--- /dev/null
+++ b/natsort/fake_fastnumbers.py
@@ -0,0 +1,30 @@
+# -*- coding: utf-8 -*-
+"""\
+This module is intended to replicate some of the functionality
+from the fastnumbers module in the event that module is not
+installed.
+"""
+from __future__ import (print_function, division,
+ unicode_literals, absolute_import)
+
+import re
+
+float_re = re.compile(r'[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?$')
+int_re = re.compile(r'[-+]?\d+$')
+
+
+def fast_float(x, regex_matcher=float_re.match):
+ """Convert a string to a float quickly"""
+ return float(x) if regex_matcher(x) else x
+
+
+def fast_int(x, regex_matcher=int_re.match):
+ """\
+ Convert a string to a int quickly, return input as-is if not possible.
+ """
+ return int(x) if regex_matcher(x) else x
+
+
+def isreal(x, ntypes=set([int, float])):
+ """Returns true if the input is a real number, false otherwise."""
+ return type(x) in ntypes
diff --git a/natsort/natsort.py b/natsort/natsort.py
index c67ec3f..5974199 100644
--- a/natsort/natsort.py
+++ b/natsort/natsort.py
@@ -23,6 +23,13 @@ from functools import partial
from itertools import islice
from warnings import warn
+# If the user has fastnumbers installed, they will get great speed
+# benefits. If not, we simulate the functions here.
+try:
+ from fastnumbers import fast_float, fast_int, isreal
+except ImportError:
+ from .fake_fastnumbers import fast_float, fast_int, isreal
+
from .py23compat import u_format, py23_str, py23_zip
# Make sure the doctest works for either python2 or python3
@@ -38,54 +45,37 @@ int_nosign_re = re.compile(r'(\d+)')
int_sign_re = re.compile(r'([-+]?\d+)')
# This dict will help select the correct regex and number conversion function.
regex_and_num_function_chooser = {
- (float, True, True): (float_sign_exp_re, float),
- (float, True, False): (float_sign_noexp_re, float),
- (float, False, True): (float_nosign_exp_re, float),
- (float, False, False): (float_nosign_noexp_re, float),
- (int, True, True): (int_sign_re, int),
- (int, True, False): (int_sign_re, int),
- (int, False, True): (int_nosign_re, int),
- (int, False, False): (int_nosign_re, int),
- (None, True, True): (int_nosign_re, int),
- (None, True, False): (int_nosign_re, int),
- (None, False, True): (int_nosign_re, int),
- (None, False, False): (int_nosign_re, int),
+ (float, True, True): (float_sign_exp_re, fast_float),
+ (float, True, False): (float_sign_noexp_re, fast_float),
+ (float, False, True): (float_nosign_exp_re, fast_float),
+ (float, False, False): (float_nosign_noexp_re, fast_float),
+ (int, True, True): (int_sign_re, fast_int),
+ (int, True, False): (int_sign_re, fast_int),
+ (int, False, True): (int_nosign_re, fast_int),
+ (int, False, False): (int_nosign_re, fast_int),
+ (None, True, True): (int_nosign_re, fast_int),
+ (None, True, False): (int_nosign_re, fast_int),
+ (None, False, True): (int_nosign_re, fast_int),
+ (None, False, False): (int_nosign_re, fast_int),
}
-# Number types. I have to use set([...]) and not {...}
-# because I am supporting Python 2.6.
-number_types = set([float, int])
-
-# This regex is to make sure we don't mistake a number for a file extension
-decimal = re.compile(r'\.\d')
-
def _number_finder(s, regex, numconv, py3_safe):
"""Helper to split numbers"""
- # Split the input string by numbers.
- # If there are no splits, return now.
- # If the input is not a string, ValueError is raised.
+ # Split the input string by numbers. If there are no splits, return now.
+ # If the input is not a string, TypeError is raised.
s = regex.split(s)
if len(s) == 1:
return tuple(s)
# Now convert the numbers to numbers, and leave strings as strings.
# Remove empty strings from the list.
- # Profiling showed that using regex here is much faster than
- # try/except with the numconv function.
- r = regex.match
- s = [numconv(x) if r(x) else x for x in s if x]
+ s = [numconv(x) for x in s if x]
# If the list begins with a number, lead with an empty string.
# This is used to get around the "unorderable types" issue.
- # The most common case will be a string at the front of the
- # list, and in that case the try/except method is faster than
- # using isinstance. This was chosen at the expense of the less
- # common case of a number being at the front of the list.
- try:
- s[0][0] # str supports indexing, but not numbers
- except TypeError:
+ if isreal(s[0]):
s = [''] + s
# The _py3_safe function inserts "" between numbers in the list,
@@ -95,11 +85,12 @@ def _number_finder(s, regex, numconv, py3_safe):
return _py3_safe(s) if py3_safe else s
-def _path_splitter(s):
+def _path_splitter(s, _d_match=re.compile(r'\.\d').match):
"""Split a string into its path components. Assumes a string is a path."""
path_parts = []
p_append = path_parts.append
path_location = s
+
# Continue splitting the path from the back until we have reached
# '..' or '.', or until there is nothing left to split.
while path_location != curdir and path_location != pardir:
@@ -108,29 +99,32 @@ def _path_splitter(s):
if path_location == parent_path:
break
p_append(child_path)
+
# This last append is the base path.
# Only append if the string is non-empty.
if path_location:
p_append(path_location)
+
# We created this list in reversed order, so we now correct the order.
path_parts.reverse()
+
# Now, split off the file extensions using a similar method to above.
# Continue splitting off file extensions until we reach a decimal number
# or there are no more extensions.
base = path_parts.pop()
base_parts = []
b_append = base_parts.append
- d_match = decimal.match
while True:
front = base
base, ext = splitext(front)
- if d_match(ext) or not ext:
+ if _d_match(ext) or not ext:
# Reset base to before the split if the split is invalid.
base = front
break
b_append(ext)
b_append(base)
base_parts.reverse()
+
# Return the split parent paths and then the split basename.
return path_parts + base_parts
@@ -143,12 +137,9 @@ def _py3_safe(parsed_list):
else:
new_list = [parsed_list[0]]
nl_append = new_list.append
- ntypes = number_types
for before, after in py23_zip(islice(parsed_list, 0, length-1),
islice(parsed_list, 1, None)):
- # I realize that isinstance is favored over type, but
- # in this case type is SO MUCH FASTER than isinstance!!
- if type(before) in ntypes and type(after) in ntypes:
+ if isreal(before) and isreal(after):
nl_append("")
nl_append(after)
return new_list
diff --git a/test_natsort/test_fake_fastnumbers.py b/test_natsort/test_fake_fastnumbers.py
new file mode 100644
index 0000000..29ba9af
--- /dev/null
+++ b/test_natsort/test_fake_fastnumbers.py
@@ -0,0 +1,26 @@
+# -*- coding: utf-8 -*-
+"""\
+Test the fake fastnumbers module.
+"""
+from natsort.fake_fastnumbers import fast_float, fast_int, isreal
+
+
+def test_fast_float():
+ assert fast_float('45.8') == 45.8
+ assert fast_float('-45') == -45.0
+ assert fast_float('45.8e-2') == 45.8e-2
+ assert fast_float('invalid') == 'invalid'
+
+
+def test_fast_int():
+ assert fast_int('45.8') == '45.8'
+ assert fast_int('-45') == -45
+ assert fast_int('+45') == 45
+ assert fast_int('invalid') == 'invalid'
+
+
+def test_isreal():
+ assert not isreal('45.8')
+ assert isreal(-45)
+ assert isreal(45.8e-2)
+ assert not isreal('invalid')
diff --git a/test_natsort/test_natsort.py b/test_natsort/test_natsort.py
index 0eeed12..264b508 100644
--- a/test_natsort/test_natsort.py
+++ b/test_natsort/test_natsort.py
@@ -11,25 +11,30 @@ from natsort.natsort import _number_finder, _py3_safe, _natsort_key
from natsort.natsort import float_sign_exp_re, float_nosign_exp_re, float_sign_noexp_re
from natsort.natsort import float_nosign_noexp_re, int_nosign_re, int_sign_re
+try:
+ from fastnumbers import fast_float, fast_int
+except ImportError:
+ from natsort.fake_fastnumbers import fast_float, fast_int
+
def test_number_finder():
- assert _number_finder('a5+5.034e-1', float_sign_exp_re, float, False) == ['a', 5.0, 0.5034]
- assert _number_finder('a5+5.034e-1', float_nosign_exp_re, float, False) == ['a', 5.0, '+', 0.5034]
- assert _number_finder('a5+5.034e-1', float_sign_noexp_re, float, False) == ['a', 5.0, 5.034, 'e', -1.0]
- assert _number_finder('a5+5.034e-1', float_nosign_noexp_re, float, False) == ['a', 5.0, '+', 5.034, 'e-', 1.0]
- assert _number_finder('a5+5.034e-1', int_nosign_re, int, False) == ['a', 5, '+', 5, '.', 34, 'e-', 1]
- assert _number_finder('a5+5.034e-1', int_sign_re, int, False) == ['a', 5, 5, '.', 34, 'e', -1]
+ assert _number_finder('a5+5.034e-1', float_sign_exp_re, fast_float, False) == ['a', 5.0, 0.5034]
+ assert _number_finder('a5+5.034e-1', float_nosign_exp_re, fast_float, False) == ['a', 5.0, '+', 0.5034]
+ assert _number_finder('a5+5.034e-1', float_sign_noexp_re, fast_float, False) == ['a', 5.0, 5.034, 'e', -1.0]
+ assert _number_finder('a5+5.034e-1', float_nosign_noexp_re, fast_float, False) == ['a', 5.0, '+', 5.034, 'e-', 1.0]
+ assert _number_finder('a5+5.034e-1', int_nosign_re, fast_int, False) == ['a', 5, '+', 5, '.', 34, 'e-', 1]
+ assert _number_finder('a5+5.034e-1', int_sign_re, fast_int, False) == ['a', 5, 5, '.', 34, 'e', -1]
- assert _number_finder('a5+5.034e-1', float_sign_exp_re, float, True) == ['a', 5.0, '', 0.5034]
- assert _number_finder('a5+5.034e-1', float_nosign_exp_re, float, True) == ['a', 5.0, '+', 0.5034]
- assert _number_finder('a5+5.034e-1', float_sign_noexp_re, float, True) == ['a', 5.0, '', 5.034, 'e', -1.0]
- assert _number_finder('a5+5.034e-1', float_nosign_noexp_re, float, True) == ['a', 5.0, '+', 5.034, 'e-', 1.0]
- assert _number_finder('a5+5.034e-1', int_nosign_re, int, True) == ['a', 5, '+', 5, '.', 34, 'e-', 1]
- assert _number_finder('a5+5.034e-1', int_sign_re, int, True) == ['a', 5, '', 5, '.', 34, 'e', -1]
+ assert _number_finder('a5+5.034e-1', float_sign_exp_re, fast_float, True) == ['a', 5.0, '', 0.5034]
+ assert _number_finder('a5+5.034e-1', float_nosign_exp_re, fast_float, True) == ['a', 5.0, '+', 0.5034]
+ assert _number_finder('a5+5.034e-1', float_sign_noexp_re, fast_float, True) == ['a', 5.0, '', 5.034, 'e', -1.0]
+ assert _number_finder('a5+5.034e-1', float_nosign_noexp_re, fast_float, True) == ['a', 5.0, '+', 5.034, 'e-', 1.0]
+ assert _number_finder('a5+5.034e-1', int_nosign_re, fast_int, True) == ['a', 5, '+', 5, '.', 34, 'e-', 1]
+ assert _number_finder('a5+5.034e-1', int_sign_re, fast_int, True) == ['a', 5, '', 5, '.', 34, 'e', -1]
- assert _number_finder('6a5+5.034e-1', float_sign_exp_re, float, False) == ['', 6.0, 'a', 5.0, 0.5034]
- assert _number_finder('6a5+5.034e-1', float_sign_exp_re, float, True) == ['', 6.0, 'a', 5.0, '', 0.5034]
+ assert _number_finder('6a5+5.034e-1', float_sign_exp_re, fast_float, False) == ['', 6.0, 'a', 5.0, 0.5034]
+ assert _number_finder('6a5+5.034e-1', float_sign_exp_re, fast_float, True) == ['', 6.0, 'a', 5.0, '', 0.5034]
def test_py3_safe():
@@ -47,21 +52,21 @@ def test_natsort_key_private():
assert a == ['num2', 'num3', 'num5']
# The below illustrates how the key works, and how the different options affect sorting.
- assert _natsort_key('a-5.034e1') == ('a', -50.34)
- assert _natsort_key('a-5.034e1', number_type=float, signed=True, exp=True) == ('a', -50.34)
- assert _natsort_key('a-5.034e1', number_type=float, signed=True, exp=False) == ('a', -5.034, 'e', 1.0)
- assert _natsort_key('a-5.034e1', number_type=float, signed=False, exp=True) == ('a-', 50.34)
- assert _natsort_key('a-5.034e1', number_type=float, signed=False, exp=False) == ('a-', 5.034, 'e', 1.0)
- assert _natsort_key('a-5.034e1', number_type=int) == ('a', -5, '.', 34, 'e', 1)
- assert _natsort_key('a-5.034e1', number_type=int, signed=False) == ('a-', 5, '.', 34, 'e', 1)
- assert _natsort_key('a-5.034e1', number_type=None) == _natsort_key('a-5.034e1', number_type=int, signed=False)
- assert _natsort_key('a-5.034e1', key=lambda x: x.upper()) == ('A', -50.34)
+ assert _natsort_key('a-5.034e2') == ('a', -503.4)
+ assert _natsort_key('a-5.034e2', number_type=float, signed=True, exp=True) == ('a', -503.4)
+ assert _natsort_key('a-5.034e2', number_type=float, signed=True, exp=False) == ('a', -5.034, 'e', 2.0)
+ assert _natsort_key('a-5.034e2', number_type=float, signed=False, exp=True) == ('a-', 503.4)
+ assert _natsort_key('a-5.034e2', number_type=float, signed=False, exp=False) == ('a-', 5.034, 'e', 2.0)
+ assert _natsort_key('a-5.034e2', number_type=int) == ('a', -5, '.', 34, 'e', 2)
+ assert _natsort_key('a-5.034e2', number_type=int, signed=False) == ('a-', 5, '.', 34, 'e', 2)
+ assert _natsort_key('a-5.034e2', number_type=None) == _natsort_key('a-5.034e2', number_type=int, signed=False)
+ assert _natsort_key('a-5.034e2', key=lambda x: x.upper()) == ('A', -503.4)
# Iterables are parsed recursively so you can sort lists of lists.
- assert _natsort_key(('a1', 'a-5.034e1')) == (('a', 1.0), ('a', -50.34))
- assert _natsort_key(('a1', 'a-5.034e1'), number_type=None) == (('a', 1), ('a-', 5, '.', 34, 'e', 1))
+ assert _natsort_key(('a1', 'a-5.034e2')) == (('a', 1.0), ('a', -503.4))
+ assert _natsort_key(('a1', 'a-5.034e2'), number_type=None) == (('a', 1), ('a-', 5, '.', 34, 'e', 2))
# A key is applied before recursion, but not in the recursive calls.
- assert _natsort_key(('a1', 'a-5.034e1'), key=itemgetter(1)) == ('a', -50.34)
+ assert _natsort_key(('a1', 'a-5.034e2'), key=itemgetter(1)) == ('a', -503.4)
# Strings that lead with a number get an empty string at the front of the tuple.
# This is designed to get around the "unorderable types" issue.
@@ -100,10 +105,10 @@ def test_natsort_key_public():
# But it raises a depreciation warning
with warnings.catch_warnings(record=True) as w:
warnings.simplefilter("always")
- assert natsort_key('a-5.034e1') == _natsort_key('a-5.034e1')
+ assert natsort_key('a-5.034e2') == _natsort_key('a-5.034e2')
assert len(w) == 1
assert "natsort_key is depreciated as of 3.4.0, please use natsort_keygen" in str(w[-1].message)
- assert natsort_key('a-5.034e1', number_type=float, signed=False, exp=False) == _natsort_key('a-5.034e1', number_type=float, signed=False, exp=False)
+ assert natsort_key('a-5.034e2', number_type=float, signed=False, exp=False) == _natsort_key('a-5.034e2', number_type=float, signed=False, exp=False)
# It is called for each element in a list when sorting
with warnings.catch_warnings(record=True) as w: