Improve and extend urlize

author: Bebleo <james.warne@outlook.com> 2020-04-19 05:42:12 -0400
committer: David Lord <davidism@gmail.com> 2021-01-30 06:25:03 -0800
commit: d504e1d1e2798d7b4661462b9ef4cd77dd270ff9 (patch)
tree: 7b97d883edb56768cc5d72351dfe9f8946dff61d
parent: c3b34a06f340234939df5ad77bbe6327ca7fc3f0 (diff)
download: jinja2-d504e1d1e2798d7b4661462b9ef4cd77dd270ff9.tar.gz
6 files changed, 129 insertions, 27 deletions
diff --git a/docs/api.rst b/docs/api.rst
index ec083a8..9189642 100644
--- a/docs/api.rst
+++ b/docs/api.rst
@@ -550,6 +550,10 @@ Example::
     The default target that is issued for links from the `urlize` filter
     if no other target is defined by the call explicitly.
 
+``urlize.extra_uri_schemes``:
+    Additional URI scheme prefixes that will generate links from the
+    `urlize` filter in addition to http://, https://, and mailto:.
+
 ``json.dumps_function``:
     If this is set to a value other than `None` then the `tojson` filter
     will dump with this function instead of the default one.  Note that
diff --git a/src/jinja2/defaults.py b/src/jinja2/defaults.py
index 1f0b0ab..d582836 100644
--- a/src/jinja2/defaults.py
+++ b/src/jinja2/defaults.py
@@ -35,6 +35,7 @@ DEFAULT_POLICIES = {
     "compiler.ascii_str": True,
     "urlize.rel": "noopener",
     "urlize.target": None,
+    "urlize.extra_uri_schemes": None,
     "truncate.leeway": 5,
     "json.dumps_function": None,
     "json.dumps_kwargs": {"sort_keys": True},
diff --git a/src/jinja2/filters.py b/src/jinja2/filters.py
index 7a554a0..0d1639f 100644
--- a/src/jinja2/filters.py
+++ b/src/jinja2/filters.py
@@ -20,6 +20,7 @@ from .utils import urlize
 
 _word_re = re.compile(r"\w+")
 _word_beginning_split_re = re.compile(r"([-\s({\[<]+)")
+_uri_scheme_re = re.compile(r"^([\w\.\+-]{2,}:(/){0,2})$")
 
 
 def contextfilter(f):
@@ -569,7 +570,13 @@ def do_pprint(value):
 
 @evalcontextfilter
 def do_urlize(
-    eval_ctx, value, trim_url_limit=None, nofollow=False, target=None, rel=None
+    eval_ctx,
+    value,
+    trim_url_limit=None,
+    nofollow=False,
+    target=None,
+    rel=None,
+    extra_uri_schemes=None,
 ):
     """Converts URLs in plain text into clickable links.
 
@@ -589,18 +596,44 @@ def do_urlize(
 
        {{ mytext|urlize(40, target='_blank') }}
 
+    If *extra_uri_schemes* is given, links are also generated for those
+    scheme prefixes, in addition to http://, https://, and mailto:.
+
+    .. sourcecode:: jinja
+
+        {{ mytext|urlize(extra_uri_schemes=['tel:', 'ftp://']) }}
+            links are generated for tel and ftp.
+
     .. versionchanged:: 2.8
        The ``target`` parameter was added.
+
+    .. versionchanged:: 3.0
+       The ``extra_uri_schemes`` parameter was added.
     """
     policies = eval_ctx.environment.policies
+
     rel = set((rel or "").split() or [])
     if nofollow:
         rel.add("nofollow")
     rel.update((policies["urlize.rel"] or "").split())
+    rel = " ".join(sorted(rel)) or None
+
     if target is None:
         target = policies["urlize.target"]
-    rel = " ".join(sorted(rel)) or None
-    rv = urlize(value, trim_url_limit, rel=rel, target=target)
+
+    if extra_uri_schemes is None:
+        extra_uri_schemes = policies["urlize.extra_uri_schemes"] or []
+    for uri_scheme in extra_uri_schemes:
+        if _uri_scheme_re.fullmatch(uri_scheme) is None:
+            raise FilterArgumentError(f"{uri_scheme} is not a valid URI scheme prefix.")
+
+    rv = urlize(
+        value,
+        trim_url_limit,
+        rel=rel,
+        target=target,
+        extra_uri_schemes=extra_uri_schemes,
+    )
     if eval_ctx.autoescape:
         rv = Markup(rv)
     return rv
diff --git a/src/jinja2/utils.py b/src/jinja2/utils.py
index 8ee0295..9ab5eb0 100644
--- a/src/jinja2/utils.py
+++ b/src/jinja2/utils.py
@@ -17,6 +17,14 @@ _trail_pattern = "|".join(map(re.escape, (".", ",", ")", ">", "\n", "&gt;")))
 _punctuation_re = re.compile(
     fr"^(?P<lead>(?:{_lead_pattern})*)(?P<middle>.*?)(?P<trail>(?:{_trail_pattern})*)$"
 )
+_simple_http_https_re = re.compile(
+    r"^((https?://|www\.)(([\w%-]+\.)+)?([a-z]{2,63}|xn--[\w%]{2,59})|"
+    r"([\w%-]{2,63}\.)+(com|net|int|edu|gov|org|info|mil)|"
+    r"(https?://)((([\d]{1,3})(\.[\d]{1,3}){3})|"
+    r"(\[([\da-f]{0,4}:){2}([\da-f]{0,4}:?){1,6}\])))"
+    r"(?::[\d]{1,5})?(?:[/?#]\S*)?$",
+    re.IGNORECASE,
+)
 _simple_email_re = re.compile(r"^\S+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9._-]+$")
 _striptags_re = re.compile(r"(<!--.*?-->|<[^>]*>)")
 _entity_re = re.compile(r"&([^;]+);")
@@ -175,11 +183,11 @@ def pformat(obj):
     return pformat(obj)
 
 
-def urlize(text, trim_url_limit=None, rel=None, target=None):
+def urlize(text, trim_url_limit=None, rel=None, target=None, extra_uri_schemes=None):
     """Converts any URLs in text into clickable links. Works on http://,
-    https:// and www. links. Links can have trailing punctuation (periods,
-    commas, close-parens) and leading punctuation (opening parens) and
-    it'll still do the right thing.
+    https://, www., mailto:, and email links. Links can have trailing
+    punctuation (periods, commas, close-parens) and leading punctuation
+    (opening parens) and it'll still do the right thing.
 
     If trim_url_limit is not None, the URLs in link text will be limited
     to trim_url_limit characters.
@@ -188,6 +196,13 @@ def urlize(text, trim_url_limit=None, rel=None, target=None):
     attribute.
 
     If target is not None, a target attribute will be added to the link.
+
+    Known Limitations:
+    -   Will not urlize emails or mailto: links if they include header fields
+        (for example, mailto:address@example.com?cc=copy@example.com).
+
+    .. versionchanged:: 3.0
+        Adds limited support for mailto: links
     """
 
     def trim_url(x, limit=trim_url_limit):
@@ -204,26 +219,30 @@ def urlize(text, trim_url_limit=None, rel=None, target=None):
         match = _punctuation_re.match(word)
         if match:
             lead, middle, trail = match.groups()
-            if middle.startswith("www.") or (
-                "@" not in middle
-                and not middle.startswith("http://")
-                and not middle.startswith("https://")
-                and len(middle) > 0
-                and middle[0] in _letters + _digits
-                and (
-                    middle.endswith(".org")
-                    or middle.endswith(".net")
-                    or middle.endswith(".com")
-                )
-            ):
-                middle = (
-                    f'<a href="http://{middle}"{rel_attr}{target_attr}>'
-                    f"{trim_url(middle)}</a>"
-                )
-            if middle.startswith("http://") or middle.startswith("https://"):
-                middle = (
-                    f'<a href="{middle}"{rel_attr}{target_attr}>{trim_url(middle)}</a>'
-                )
+            # fix for mismatched opening and closing parentheses
+            pairs = [("(", ")"), ("<", ">"), ("&lt;", "&gt;")]
+            for start_char in re.findall(_lead_pattern, middle):
+                end_char = next(c for o, c in pairs if o == start_char)
+                while (
+                    middle.count(start_char) > middle.count(end_char)
+                    and end_char in trail
+                ):
+                    end_char_index = trail.index(end_char)
+                    middle = middle + trail[: end_char_index + len(end_char)]
+                    trail = trail[end_char_index + len(end_char) :]
+
+            if _simple_http_https_re.match(middle):
+                if middle.startswith("https://") or middle.startswith("http://"):
+                    middle = (
+                        f'<a href="{middle}"{rel_attr}{target_attr}>'
+                        f"{trim_url(middle)}</a>"
+                    )
+                else:
+                    middle = (
+                        f'<a href="https://{middle}"{rel_attr}{target_attr}>'
+                        f"{trim_url(middle)}</a>"
+                    )
+
             if (
                 "@" in middle
                 and not middle.startswith("www.")
@@ -231,8 +250,21 @@ def urlize(text, trim_url_limit=None, rel=None, target=None):
                 and _simple_email_re.match(middle)
             ):
                 middle = f'<a href="mailto:{middle}">{middle}</a>'
+            if middle.startswith("mailto:") and _simple_email_re.match(middle[7:]):
+                middle = f'<a href="{middle}">{middle[7:]}</a>'
+
+            if extra_uri_schemes is not None:
+                schemes = {x for x in extra_uri_schemes if middle.startswith(x)}
+                for uri_scheme in schemes:
+                    if len(middle) > len(uri_scheme):
+                        middle = (
+                            f'<a href="{middle}"{rel_attr}{target_attr}>'
+                            f"{middle}</a>"
+                        )
+
             if lead + middle + trail != word:
                 words[i] = lead + middle + trail
+
     return "".join(words)
 
 
diff --git a/tests/test_filters.py b/tests/test_filters.py
index 8087a24..bf00f06 100644
--- a/tests/test_filters.py
+++ b/tests/test_filters.py
@@ -337,11 +337,23 @@ class TestFilter:
         assert tmpl.render() == "FOO"
 
     def test_urlize(self, env):
+        tmpl = env.from_string('{{ "foo example.org bar"|urlize }}')
+        assert tmpl.render() == (
+            'foo <a href="https://example.org" rel="noopener">' "example.org</a> bar"
+        )
         tmpl = env.from_string('{{ "foo http://www.example.com/ bar"|urlize }}')
         assert tmpl.render() == (
             'foo <a href="http://www.example.com/" rel="noopener">'
             "http://www.example.com/</a> bar"
         )
+        tmpl = env.from_string('{{ "foo mailto:email@example.com bar"|urlize }}')
+        assert tmpl.render() == (
+            'foo <a href="mailto:email@example.com">email@example.com</a> bar'
+        )
+        tmpl = env.from_string('{{ "foo email@example.com bar"|urlize }}')
+        assert tmpl.render() == (
+            'foo <a href="mailto:email@example.com">email@example.com</a> bar'
+        )
 
     def test_urlize_rel_policy(self):
         env = Environment()
@@ -361,6 +373,17 @@ class TestFilter:
             "http://www.example.com/</a> bar"
         )
 
+    def test_urlize_extra_uri_schemes_parameter(self, env):
+        tmpl = env.from_string(
+            '{{ "foo tel:+1-514-555-1234 ftp://localhost bar"|'
+            'urlize(extra_uri_schemes=["tel:", "ftp:"]) }}'
+        )
+        assert tmpl.render() == (
+            'foo <a href="tel:+1-514-555-1234" rel="noopener">'
+            'tel:+1-514-555-1234</a> <a href="ftp://localhost" rel="noopener">'
+            "ftp://localhost</a> bar"
+        )
+
     def test_wordcount(self, env):
         tmpl = env.from_string('{{ "foo bar baz"|wordcount }}')
         assert tmpl.render() == "3"
diff --git a/tests/test_regression.py b/tests/test_regression.py
index d052f43..21a6d92 100644
--- a/tests/test_regression.py
+++ b/tests/test_regression.py
@@ -109,6 +109,15 @@ class TestBug:
             "http://www.example.org/&lt;foo</a>"
         )
 
+    def test_urlize_filter_closing_punctuation(self, env):
+        tmpl = env.from_string(
+            '{{ "(see http://www.example.org/?page=subj_<desc.h>)"|urlize }}'
+        )
+        assert tmpl.render() == (
+            '(see <a href="http://www.example.org/?page=subj_&lt;desc.h&gt;" '
+            'rel="noopener">http://www.example.org/?page=subj_&lt;desc.h&gt;</a>)'
+        )
+
     def test_loop_call_loop(self, env):
         tmpl = env.from_string(
             """
author	Bebleo <james.warne@outlook.com>	2020-04-19 05:42:12 -0400
committer	David Lord <davidism@gmail.com>	2021-01-30 06:25:03 -0800
commit	d504e1d1e2798d7b4661462b9ef4cd77dd270ff9 (patch)
tree	7b97d883edb56768cc5d72351dfe9f8946dff61d
parent	c3b34a06f340234939df5ad77bbe6327ca7fc3f0 (diff)
download	jinja2-d504e1d1e2798d7b4661462b9ef4cd77dd270ff9.tar.gz