summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorYu-Jie Lin <livibetter@gmail.com>2013-09-21 05:24:32 +0800
committerYu-Jie Lin <livibetter@gmail.com>2013-09-21 05:24:32 +0800
commitb00f2b51b83234df327cfc6e1117a9da2caef668 (patch)
tree99188b7d824e2eff51c3239e17ed9fe045b1409a
parent0134cea881640d9583339d997d517652de32450e (diff)
downloadsmartypants-git-b00f2b51b83234df327cfc6e1117a9da2caef668.tar.gz
fix double hyphens inside a comment not turning it into a non-comment, add more tests
In both HTML4 and HTML5, two adjacent hyphens should not (HTML4) or must not (HTML5) appear inside a comment. This change makes sure that if they do appear inside a comment, the entire comment is treated as text, which means it will be converted.
-rw-r--r--CHANGES.rst6
-rwxr-xr-xsmartypants.py24
-rw-r--r--tests/test.py36
3 files changed, 59 insertions, 7 deletions
diff --git a/CHANGES.rst b/CHANGES.rst
index 8c7ba50..d62459e 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -14,8 +14,10 @@ Versions without timestamps mean they are future releases.
and ``processEscapes``
development:
- - fix ``_tokenize`` turning ``--`` of HTML comment end tag ``-->``
- (pull request #1)
+ - fix ``_tokenize`` not handling HTML comments properly
+
+ This fix includes pull request #1, modified to handle ``--``
+ appearing inside a comment, which makes the comment not a comment.
1.8.2: 2013-08-28T11:38:42Z
- add documentation generation
diff --git a/smartypants.py b/smartypants.py
index f8d56f3..1580d26 100755
--- a/smartypants.py
+++ b/smartypants.py
@@ -709,7 +709,7 @@ def _tokenize(text):
tokens = []
- tag_soup = re.compile(r"""(?s)([^<]*)(<!--.*?--\s*>|<[^>]*>)""")
+ tag_soup = re.compile(r'([^<]*)(<!--.*?--\s*>|<[^>]*>)', re.S)
token_match = tag_soup.search(text)
@@ -718,7 +718,27 @@ def _tokenize(text):
if token_match.group(1):
tokens.append(['text', token_match.group(1)])
- tokens.append(['tag', token_match.group(2)])
+ # If -- appears in the text part of a comment, it is not a comment, so
+ # it is treated as text and should be converted.
+ #
+ # In HTML4 [1]:
+ # [...] Authors should avoid putting two or more adjacent hyphens
+ # inside comments.
+ #
+ # In HTML5 [2]:
+ # [...] the comment may have text, with the additional restriction
+ # that the text must not [...], nor contain two consecutive U+002D
+ # HYPHEN-MINUS characters (--)
+ #
+ # [1]: http://www.w3.org/TR/REC-html40/intro/sgmltut.html#h-3.2.4
+ # [2]: http://www.w3.org/TR/html5/syntax.html#comments
+ tag = token_match.group(2)
+ type_ = 'tag'
+ if tag.startswith('<!--'):
+ # remove --[white space]> from the end of tag
+ if '--' in tag[4:].rstrip('>').rstrip().rstrip('-'):
+ type_ = 'text'
+ tokens.append([type_, tag])
previous_end = token_match.end()
token_match = tag_soup.search(text, token_match.end())
diff --git a/tests/test.py b/tests/test.py
index 85e8af2..6b2c5da 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -88,9 +88,39 @@ document.write('<a href="' + href + '">' + linktext + "</a>");
self.assertEqual(sp("--"), "&#8212;")
self.assertEqual(sp("-->"), "&#8212;>")
- self.assertEqual(sp("<!-- comment -->"), "<!-- comment -->")
- self.assertEqual(sp("<!-- <li>Fee-fi-of-fum</li> -->"),
- "<!-- <li>Fee-fi-of-fum</li> -->")
+ self.assertEqual(sp("-- \t >"), "&#8212; \t >")
+
+ TEXT = '<!-- "foo" --> blah--blah <!-- "bar" -->'
+ T = sp(TEXT)
+ E = '<!-- "foo" --> blah&#8212;blah <!-- "bar" -->'
+ self.assertEqual(T, E)
+
+ TEXT = (
+ '<p>foo -- "bar"<!-- foo-bar\n'
+ '<p>blah "this"</p>\n'
+ '-->\n'
+ '</p>'
+ )
+
+ T = sp(TEXT)
+ E = (
+ '<p>foo &#8212; &#8220;bar&#8221;<!-- foo-bar\n'
+ '<p>blah "this"</p>\n'
+ '-->\n'
+ '</p>'
+ )
+ self.assertEqual(T, E)
+
+ # nothing should be converted
+ for TEXT in ('<!-- comment -->',
+ '<!-- <li>Fee-fi-of-fum</li> -->',
+ '<!-- "foo" --> <!-- "bar" -->'):
+ self.assertEqual(sp(TEXT), TEXT)
+
+ # not comments
+ self.assertEqual(sp('<!-- -- -->'), '<!&#8212; &#8212; &#8212;>')
+ self.assertEqual(sp('<!-- -- -- \t >'),
+ '<!&#8212; &#8212; &#8212; \t >')
def test_ordinal_numbers(self):