author     Leo Hemsted <leo.hemsted@digital.cabinet-office.gov.uk>   2017-12-19 16:28:56 +0000
committer  Leo Hemsted <leo.hemsted@digital.cabinet-office.gov.uk>   2017-12-20 13:18:30 +0000
commit     4b5ab3344cc88c69c1045e2c488173dfa867bb7f (patch)
tree       4b9a4ea7065c876b7d81774d7661debab8b68621
parent     5fc7b506b3e6c36b44534f73c18ffe168d73dede (diff)
use re.match instead of re.search to improve performance
If there isn't a match for the second half of the tag regex, the regex
engine backtracks through the entire string character by character.

One way to solve this is to use match instead of search. search, if it
doesn't find a match starting at char 0, tries again starting at char 1,
and so on, so if there's no match it takes quadratic (O(n^2)) time to
complete. match, on the other hand, only attempts a match anchored at
the given starting position. Since the pattern starts with a greedy
capturing group that consumes everything before a tag, anchoring there
is safe and no tags are missed.
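
A rough illustration of the cost difference (a sketch, not part of the
commit; the pattern and the 10,000-character input are taken from the
change below, and the timings will vary by machine):

    import re
    import timeit

    # the tag regex from smartypants._tokenize: optional leading text,
    # then an HTML comment or tag
    tag_soup = re.compile(r'([^<]*)(<!--.*?--\s*>|<[^>]*>)', re.S)

    text = 'a' * 10000  # worst case: no '<' anywhere, so no match exists

    # search() retries at every later position after each failed attempt,
    # so a non-matching string costs roughly quadratic work
    print(timeit.timeit(lambda: tag_soup.search(text), number=10))

    # match() is anchored at the starting position and fails after a
    # single linear pass; the leading ([^<]*) still consumes any text
    # that comes before a tag
    print(timeit.timeit(lambda: tag_soup.match(text), number=10))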
-rw-r--r--   CHANGES.rst    |  1
-rwxr-xr-x   smartypants.py |  6
-rw-r--r--   tests/test.py  | 11
3 files changed, 15 insertions, 3 deletions
diff --git a/CHANGES.rst b/CHANGES.rst
index 133dc8c..2627e49 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -46,6 +46,7 @@ Releases 2.0 and greater
 Development
 -----------
 
+* use `re.match` instead of `re.search` to improve performance on large strings
 
 Release 2.0.0 (2016-12-28T06:18:42Z)
 ------------------------------------
diff --git a/smartypants.py b/smartypants.py
index 55506ac..c39f409 100755
--- a/smartypants.py
+++ b/smartypants.py
@@ -15,7 +15,7 @@ smartypants module
 
 __author__ = 'Leo Hemsted'
 __author_email__ = 'leohemsted@gmail.com'
-__version__ = '2.0.0'
+__version__ = '2.0.1'
 __license__ = 'BSD License'
 __url__ = 'https://github.com/leohemsted/smartypants.py'
 __description__ = 'Python with the SmartyPants'
@@ -570,7 +570,7 @@ def _tokenize(text):
 
     tag_soup = re.compile(r'([^<]*)(<!--.*?--\s*>|<[^>]*>)', re.S)
 
-    token_match = tag_soup.search(text)
+    token_match = tag_soup.match(text)
 
     previous_end = 0
     while token_match:
@@ -600,7 +600,7 @@ def _tokenize(text):
             tokens.append([type_, tag])
 
         previous_end = token_match.end()
-        token_match = tag_soup.search(text, token_match.end())
+        token_match = tag_soup.match(text, token_match.end())
 
     if previous_end < len(text):
         tokens.append(['text', text[previous_end:]])
diff --git a/tests/test.py b/tests/test.py
index 840bfa0..2c1a0ea 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -5,6 +5,7 @@
 
 import doctest
 import unittest
+from time import time
 
 import smartypants
 from smartypants import smartypants as sp
@@ -147,6 +148,16 @@ document.write('<a href="' + href + '">' + linktext + "</a>");
         self.assertEqual(sp('"quote here"', Attr.set1 | Attr.s),
                          '"quote here"')
 
+    def test_incredibly_long_string(self):
+        # make sure it doesn't take too long with a long string
+        start = time()
+
+        sp("a" * 10000)
+
+        end = time()
+
+        self.assertLess(end - start, 0.2)
+
 
 
 def load_tests(loader, tests, pattern):
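
For completeness, a small sketch (not part of the commit) of why the
anchored match() is safe for this pattern: the greedy leading group picks
up the text before each tag, so the loop in _tokenize finds the same
tokens that search() would. The example string here is purely
illustrative.

    import re

    tag_soup = re.compile(r'([^<]*)(<!--.*?--\s*>|<[^>]*>)', re.S)

    text = 'some text <b>bold</b> trailing'

    # first anchored match captures the leading text plus the first tag
    m = tag_soup.match(text)
    print(m.groups())            # ('some text ', '<b>')

    # the loop in _tokenize continues from the end of the previous match
    m = tag_soup.match(text, m.end())
    print(m.groups())            # ('bold', '</b>')

    # no tag left in ' trailing', so the next match() is None and the
    # remaining text is appended as a plain text token
    print(tag_soup.match(text, m.end()))   # None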