author     Leo Hemsted <leo.hemsted@digital.cabinet-office.gov.uk>  2017-12-19 16:28:56 +0000
committer  Leo Hemsted <leo.hemsted@digital.cabinet-office.gov.uk>  2017-12-20 13:18:30 +0000
commit     4b5ab3344cc88c69c1045e2c488173dfa867bb7f (patch)
tree       4b9a4ea7065c876b7d81774d7661debab8b68621
parent     5fc7b506b3e6c36b44534f73c18ffe168d73dede (diff)
download   smartypants-git-4b5ab3344cc88c69c1045e2c488173dfa867bb7f.tar.gz
use re.match instead of re.search to improve performance
If there isn't a match for the second half of the tag regex, the regex engine backtracks through the entire string character by character. One way to solve this is to use match instead of search. search, if it doesn't find a match starting at char 0, tries again starting at char 1, and so on and so forth - so if there's no match it takes O(n^2) quadratic time to complete. match, on the other hand, makes a single attempt anchored at the start position. Since the pattern starts with a greedy capturing group that consumes everything before a tag, anchoring at the start is safe.
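As a quick illustration (not part of the commit), a minimal sketch of the pathological case, using the same regex shape as the module's tag_soup; the 10000-character tag-free input mirrors the regression test added below:

    import re
    import timeit

    # Same shape as the tokenizer's tag regex: a greedy prefix group, then a tag.
    tag_soup = re.compile(r'([^<]*)(<!--.*?--\s*>|<[^>]*>)', re.S)

    text = 'a' * 10000  # pathological input: contains no tags at all

    # search() retries the pattern at offset 0, 1, 2, ... before giving up,
    # so a failed scan costs roughly O(n^2).
    t_search = timeit.timeit(lambda: tag_soup.search(text), number=1)

    # match() makes a single attempt anchored at offset 0, so failure is O(n).
    t_match = timeit.timeit(lambda: tag_soup.match(text), number=1)

    print('search: %.4fs  match: %.6fs' % (t_search, t_match))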
-rw-r--r--   CHANGES.rst      1
-rwxr-xr-x   smartypants.py   6
-rw-r--r--   tests/test.py   11
3 files changed, 15 insertions, 3 deletions
diff --git a/CHANGES.rst b/CHANGES.rst
index 133dc8c..2627e49 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -46,6 +46,7 @@ Releases 2.0 and greater
Development
-----------
+* use `re.match` instead of `re.search` to improve performance on large strings
Release 2.0.0 (2016-12-28T06:18:42Z)
------------------------------------
diff --git a/smartypants.py b/smartypants.py
index 55506ac..c39f409 100755
--- a/smartypants.py
+++ b/smartypants.py
@@ -15,7 +15,7 @@ smartypants module
__author__ = 'Leo Hemsted'
__author_email__ = 'leohemsted@gmail.com'
-__version__ = '2.0.0'
+__version__ = '2.0.1'
__license__ = 'BSD License'
__url__ = 'https://github.com/leohemsted/smartypants.py'
__description__ = 'Python with the SmartyPants'
@@ -570,7 +570,7 @@ def _tokenize(text):
tag_soup = re.compile(r'([^<]*)(<!--.*?--\s*>|<[^>]*>)', re.S)
- token_match = tag_soup.search(text)
+ token_match = tag_soup.match(text)
previous_end = 0
while token_match:
@@ -600,7 +600,7 @@ def _tokenize(text):
tokens.append([type_, tag])
previous_end = token_match.end()
- token_match = tag_soup.search(text, token_match.end())
+ token_match = tag_soup.match(text, token_match.end())
if previous_end < len(text):
tokens.append(['text', text[previous_end:]])
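For context, compiled patterns accept a pos argument (tag_soup.match(text, pos)), which anchors each subsequent attempt where the previous token ended. A simplified sketch of the resulting loop, with the comment/tag type detection of the real _tokenize collapsed to a single 'tag' label:

    import re

    def tokenize_sketch(text):
        # Simplified from smartypants' _tokenize: split text into
        # ['text', ...] and ['tag', ...] tokens.
        tag_soup = re.compile(r'([^<]*)(<!--.*?--\s*>|<[^>]*>)', re.S)
        tokens = []
        previous_end = 0
        token_match = tag_soup.match(text)
        while token_match:
            if token_match.group(1):
                tokens.append(['text', token_match.group(1)])
            tokens.append(['tag', token_match.group(2)])
            previous_end = token_match.end()
            # Anchored at the end of the last token: no rescan on failure.
            token_match = tag_soup.match(text, token_match.end())
        if previous_end < len(text):
            tokens.append(['text', text[previous_end:]])
        return tokens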
diff --git a/tests/test.py b/tests/test.py
index 840bfa0..2c1a0ea 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -5,6 +5,7 @@
import doctest
import unittest
+from time import time
import smartypants
from smartypants import smartypants as sp
@@ -147,6 +148,16 @@ document.write('<a href="' + href + '">' + linktext + "</a>");
self.assertEqual(sp('"quote here"', Attr.set1 | Attr.s),
'"quote here"')
+ def test_incredibly_long_string(self):
+ # make sure it doesn't take too long with a long string
+ start = time()
+
+ sp("a" * 10000)
+
+ end = time()
+
+ self.assertLess(end - start, 0.2)
+
def load_tests(loader, tests, pattern):