summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAron Griffis <aron@arongriffis.com>2013-09-19 21:07:03 -0400
committerAron Griffis <aron@arongriffis.com>2013-09-19 21:07:03 -0400
commitdd8aa7cac51dc8f0f2c273a0c2082fa8ed465193 (patch)
tree605861cf4cd995749ab97b3739226b6fbde68d37
parent68e926a0dba69b96f908226bda12fa9d78f5aa1e (diff)
downloadsmartypants-dd8aa7cac51dc8f0f2c273a0c2082fa8ed465193.tar.gz
Fix the primitive tokenizer to handle HTML comments that contain tags; otherwise, it generates completely broken HTML by turning the end-comment into an en-dash.
-rwxr-xr-xsmartypants.py2
-rw-r--r--tests/test.py7
2 files changed, 8 insertions, 1 deletions
diff --git a/smartypants.py b/smartypants.py
index aec885b..f8d56f3 100755
--- a/smartypants.py
+++ b/smartypants.py
@@ -709,7 +709,7 @@ def _tokenize(text):
tokens = []
- tag_soup = re.compile('([^<]*)(<[^>]*>)')
+ tag_soup = re.compile(r"""(?s)([^<]*)(<!--.*?--\s*>|<[^>]*>)""")
token_match = tag_soup.search(text)
diff --git a/tests/test.py b/tests/test.py
index ffd20ac..d0622de 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -84,6 +84,13 @@ document.write('<a href="' + href + '">' + linktext + "</a>");
"is python code.</p>")
self.assertEqual(T, E)
+ def test_comments(self):
+
+ self.assertEqual(sp("--"), "&#8212;")
+ self.assertEqual(sp("-->"), "&#8212;>")
+ self.assertEqual(sp("<!-- comment -->"), "<!-- comment -->")
+ self.assertEqual(sp("<!-- <li>Fee-fi-of-fum</li> -->"), "<!-- <li>Fee-fi-of-fum</li> -->")
+
def test_ordinal_numbers(self):
self.assertEqual(sp("21st century"), "21st century") # no effect.