diff options
author | David Lord <davidism@gmail.com> | 2021-01-31 07:54:40 -0800 |
---|---|---|
committer | David Lord <davidism@gmail.com> | 2021-01-31 07:54:40 -0800 |
commit | 49d4d539d3b61c9bbccd98d3293107fc6d188b65 (patch) | |
tree | 38514dfe5ebd11401609be2bde066efa8c5474e2 | |
parent | eeca0fecc3318d43f61bc340ad61db641b861ade (diff) | |
download | jinja2-urlize-speedup.tar.gz |
speed up urlize matchingurlize-speedup
-rw-r--r-- | src/jinja2/utils.py | 107 |
1 files changed, 56 insertions, 51 deletions
diff --git a/src/jinja2/utils.py b/src/jinja2/utils.py index b422ba9..6afca81 100644 --- a/src/jinja2/utils.py +++ b/src/jinja2/utils.py @@ -6,6 +6,8 @@ import warnings from collections import deque from random import choice from random import randrange +from string import ascii_letters as _letters +from string import digits as _digits from threading import Lock from markupsafe import escape @@ -16,20 +18,6 @@ from ._compat import string_types from ._compat import text_type from ._compat import url_quote -_word_split_re = re.compile(r"(\s+)") -_punctuation_re = re.compile( - "^(?P<lead>(?:%s)*)(?P<middle>.*?)(?P<trail>(?:%s)*)$" - % ( - "|".join(map(re.escape, ("(", "<", "<"))), - "|".join(map(re.escape, (".", ",", ")", ">", "\n", ">"))), - ) -) -_simple_email_re = re.compile(r"^\S+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9._-]+$") -_striptags_re = re.compile(r"(<!--.*?-->|<[^>]*>)") -_entity_re = re.compile(r"&([^;]+);") -_letters = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" -_digits = "0123456789" - # special singleton representing missing values for the runtime missing = type("MissingType", (), {"__repr__": lambda x: "missing"})() @@ -210,48 +198,65 @@ def urlize(text, trim_url_limit=None, rel=None, target=None): and (x[:limit] + (len(x) >= limit and "..." or "")) or x ) - words = _word_split_re.split(text_type(escape(text))) + words = re.split(r"(\s+)", text_type(escape(text))) rel_attr = rel and ' rel="%s"' % text_type(escape(rel)) or "" target_attr = target and ' target="%s"' % escape(target) or "" for i, word in enumerate(words): - match = _punctuation_re.match(word) + head, middle, tail = "", word, "" + match = re.match(r"^([(<]|<)+", middle) + if match: - lead, middle, trail = match.groups() - if middle.startswith("www.") or ( - "@" not in middle - and not middle.startswith("http://") - and not middle.startswith("https://") - and len(middle) > 0 - and middle[0] in _letters + _digits - and ( - middle.endswith(".org") - or middle.endswith(".net") - or middle.endswith(".com") - ) - ): - middle = '<a href="http://%s"%s%s>%s</a>' % ( - middle, - rel_attr, - target_attr, - trim_url(middle), - ) - if middle.startswith("http://") or middle.startswith("https://"): - middle = '<a href="%s"%s%s>%s</a>' % ( - middle, - rel_attr, - target_attr, - trim_url(middle), - ) - if ( - "@" in middle - and not middle.startswith("www.") - and ":" not in middle - and _simple_email_re.match(middle) - ): - middle = '<a href="mailto:%s">%s</a>' % (middle, middle) - if lead + middle + trail != word: - words[i] = lead + middle + trail + head = match.group() + middle = middle[match.end() :] + + # Unlike lead, which is anchored to the start of the string, + # need to check that the string ends with any of the characters + # before trying to match all of them, to avoid backtracking. + if middle.endswith((")", ">", ".", ",", "\n", ">")): + match = re.search(r"([)>.,\n]|>)+$", middle) + + if match: + tail = match.group() + middle = middle[: match.start()] + + if middle.startswith("www.") or ( + "@" not in middle + and not middle.startswith("http://") + and not middle.startswith("https://") + and len(middle) > 0 + and middle[0] in _letters + _digits + and ( + middle.endswith(".org") + or middle.endswith(".net") + or middle.endswith(".com") + ) + ): + middle = '<a href="http://%s"%s%s>%s</a>' % ( + middle, + rel_attr, + target_attr, + trim_url(middle), + ) + + if middle.startswith("http://") or middle.startswith("https://"): + middle = '<a href="%s"%s%s>%s</a>' % ( + middle, + rel_attr, + target_attr, + trim_url(middle), + ) + + if ( + "@" in middle + and not middle.startswith("www.") + and ":" not in middle + and re.match(r"^\S+@\w[\w.-]*\.\w+$", middle) + ): + middle = '<a href="mailto:%s">%s</a>' % (middle, middle) + + words[i] = head + middle + tail + return u"".join(words) |