author     Leo Hemsted <leo.hemsted@digital.cabinet-office.gov.uk>  2017-12-19 16:28:56 +0000
committer  Leo Hemsted <leo.hemsted@digital.cabinet-office.gov.uk>  2017-12-20 13:18:30 +0000
commit     4b5ab3344cc88c69c1045e2c488173dfa867bb7f (patch)
tree       4b9a4ea7065c876b7d81774d7661debab8b68621
parent     5fc7b506b3e6c36b44534f73c18ffe168d73dede (diff)
download   smartypants-git-4b5ab3344cc88c69c1045e2c488173dfa867bb7f.tar.gz
use re.match instead of re.search to improve performance
If there isn't a match for the second half of the tag regex, the regex engine backtracks through the entire string character by character. One way to solve this is to use match instead of search. search, if it doesn't find a match starting at char 0, tries again starting at char 1, and so on and so forth - so if there's no match it takes O(n^2) quadratic time to complete. match, on the other hand, makes a single attempt anchored at the start position. Since the pattern starts with a greedy capturing group that consumes everything before a tag, anchoring at the start is safe.
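As a quick illustration (not part of the commit), a minimal sketch of the pathological case, using the same regex shape as the module's tag_soup; the 10000-character tag-free input mirrors the regression test added below:

    import re
    import timeit

    # Same shape as the tokenizer's tag regex: a greedy prefix group, then a tag.
    tag_soup = re.compile(r'([^<]*)(<!--.*?--\s*>|<[^>]*>)', re.S)

    text = 'a' * 10000  # pathological input: contains no tags at all

    # search() retries the pattern at offset 0, 1, 2, ... before giving up,
    # so a failed scan costs roughly O(n^2).
    t_search = timeit.timeit(lambda: tag_soup.search(text), number=1)

    # match() makes a single attempt anchored at offset 0, so failure is O(n).
    t_match = timeit.timeit(lambda: tag_soup.match(text), number=1)

    print('search: %.4fs  match: %.6fs' % (t_search, t_match))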
-rw-r--r--   CHANGES.rst      1
-rwxr-xr-x   smartypants.py   6
-rw-r--r--   tests/test.py   11
3 files changed, 15 insertions, 3 deletions
diff --git a/CHANGES.rst b/CHANGES.rst
index 133dc8c..2627e49 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -46,6 +46,7 @@ Releases 2.0 and greater
Development
-----------
+* use `re.match` instead of `re.search` to improve performance on large strings
Release 2.0.0 (2016-12-28T06:18:42Z)
------------------------------------
diff --git a/smartypants.py b/smartypants.py
index 55506ac..c39f409 100755
--- a/smartypants.py
+++ b/smartypants.py
@@ -15,7 +15,7 @@ smartypants module
__author__ = 'Leo Hemsted'
__author_email__ = 'leohemsted@gmail.com'
-__version__ = '2.0.0'
+__version__ = '2.0.1'
__license__ = 'BSD License'
__url__ = 'https://github.com/leohemsted/smartypants.py'
__description__ = 'Python with the SmartyPants'
@@ -570,7 +570,7 @@ def _tokenize(text):
tag_soup = re.compile(r'([^<]*)(<!--.*?--\s*>|<[^>]*>)', re.S)
- token_match = tag_soup.search(text)
+ token_match = tag_soup.match(text)
previous_end = 0
while token_match:
@@ -600,7 +600,7 @@ def _tokenize(text):
tokens.append([type_, tag])
previous_end = token_match.end()
- token_match = tag_soup.search(text, token_match.end())
+ token_match = tag_soup.match(text, token_match.end())
if previous_end < len(text):
tokens.append(['text', text[previous_end:]])
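For context, compiled patterns accept a pos argument (tag_soup.match(text, pos)), which anchors each subsequent attempt where the previous token ended. A simplified sketch of the resulting loop, with the comment/tag type detection of the real _tokenize collapsed to a single 'tag' label:

    import re

    def tokenize_sketch(text):
        # Simplified from smartypants' _tokenize: split text into
        # ['text', ...] and ['tag', ...] tokens.
        tag_soup = re.compile(r'([^<]*)(<!--.*?--\s*>|<[^>]*>)', re.S)
        tokens = []
        previous_end = 0
        token_match = tag_soup.match(text)
        while token_match:
            if token_match.group(1):
                tokens.append(['text', token_match.group(1)])
            tokens.append(['tag', token_match.group(2)])
            previous_end = token_match.end()
            # Anchored at the end of the last token: no rescan on failure.
            token_match = tag_soup.match(text, token_match.end())
        if previous_end < len(text):
            tokens.append(['text', text[previous_end:]])
        return tokens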
diff --git a/tests/test.py b/tests/test.py
index 840bfa0..2c1a0ea 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -5,6 +5,7 @@
import doctest
import unittest
+from time import time
import smartypants
from smartypants import smartypants as sp
@@ -147,6 +148,16 @@ document.write('<a href="' + href + '">' + linktext + "</a>");
self.assertEqual(sp('"quote here"', Attr.set1 | Attr.s),
'"quote here"')
+ def test_incredibly_long_string(self):
+ # make sure it doesn't take too long with a long string
+ start = time()
+
+ sp("a" * 10000)
+
+ end = time()
+
+ self.assertLess(end - start, 0.2)
+
def load_tests(loader, tests, pattern):