fix `--` inside an HTML comment not causing it to be treated as a non-comment; add more tests

In both HTML4 and HTML5, a comment should not (HTML4) or must not (HTML5) contain two consecutive hyphens. This change makes sure that if two consecutive hyphens appear inside a comment, the entire comment is treated as text rather than as a tag, which means it will be converted.
author: Yu-Jie Lin <livibetter@gmail.com> 2013-09-21 05:24:32 +0800
committer: Yu-Jie Lin <livibetter@gmail.com> 2013-09-21 05:24:32 +0800
commit: b00f2b51b83234df327cfc6e1117a9da2caef668 (patch)
tree: 99188b7d824e2eff51c3239e17ed9fe045b1409a
parent: 0134cea881640d9583339d997d517652de32450e (diff)
download: smartypants-git-b00f2b51b83234df327cfc6e1117a9da2caef668.tar.gz
3 files changed, 59 insertions, 7 deletions
diff --git a/CHANGES.rst b/CHANGES.rst
index 8c7ba50..d62459e 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -14,8 +14,10 @@ Versions without timestamps mean they are future releases.
       and ``processEscapes``
 
 development:
-    - fix ``_tokenize`` turning ``--`` of HTML comment end tag ``-->``
-      (pull request #1)
+    - fix ``_tokenize`` can not handle HTML comment properly
+
+        This fix includes pull request #1 with modification for handling
+        ``--`` appears in a comment, which makes the comment not a comment.
 
 1.8.2: 2013-08-28T11:38:42Z
     - add documentation generation
diff --git a/smartypants.py b/smartypants.py
index f8d56f3..1580d26 100755
--- a/smartypants.py
+++ b/smartypants.py
@@ -709,7 +709,7 @@ def _tokenize(text):
 
     tokens = []
 
-    tag_soup = re.compile(r"""(?s)([^<]*)(<!--.*?--\s*>|<[^>]*>)""")
+    tag_soup = re.compile(r'([^<]*)(<!--.*?--\s*>|<[^>]*>)', re.S)
 
     token_match = tag_soup.search(text)
 
@@ -718,7 +718,27 @@ def _tokenize(text):
         if token_match.group(1):
             tokens.append(['text', token_match.group(1)])
 
-        tokens.append(['tag', token_match.group(2)])
+        # if -- in text part of comment, then it's not a comment, therefore it
+        # should be converted.
+        #
+        # In HTML4 [1]:
+        #   [...] Authors should avoid putting two or more adjacent hyphens
+        #   inside comments.
+        #
+        # In HTML5 [2]:
+        #   [...] the comment may have text, with the additional restriction
+        #   that the text must not [...], nor contain two consecutive U+002D
+        #   HYPHEN-MINUS characters (--)
+        #
+        # [1]: http://www.w3.org/TR/REC-html40/intro/sgmltut.html#h-3.2.4
+        # [2]: http://www.w3.org/TR/html5/syntax.html#comments
+        tag = token_match.group(2)
+        type_ = 'tag'
+        if tag.startswith('<!--'):
+            # remove --[white space]> from the end of tag
+            if '--' in tag[4:].rstrip('>').rstrip().rstrip('-'):
+                type_ = 'text'
+        tokens.append([type_, tag])
 
         previous_end = token_match.end()
         token_match = tag_soup.search(text, token_match.end())
diff --git a/tests/test.py b/tests/test.py
index 85e8af2..6b2c5da 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -88,9 +88,39 @@ document.write('<a href="' + href + '">' + linktext + "</a>");
 
         self.assertEqual(sp("--"), "&#8212;")
         self.assertEqual(sp("-->"), "&#8212;>")
-        self.assertEqual(sp("<!-- comment -->"), "<!-- comment -->")
-        self.assertEqual(sp("<!-- <li>Fee-fi-of-fum</li> -->"),
-                         "<!-- <li>Fee-fi-of-fum</li> -->")
+        self.assertEqual(sp("-- \t  >"), "&#8212; \t  >")
+
+        TEXT = '<!-- "foo" --> blah--blah <!-- "bar" -->'
+        T = sp(TEXT)
+        E = '<!-- "foo" --> blah&#8212;blah <!-- "bar" -->'
+        self.assertEqual(T, E)
+
+        TEXT = (
+            '<p>foo -- "bar"<!-- foo-bar\n'
+            '<p>blah "this"</p>\n'
+            '-->\n'
+            '</p>'
+        )
+
+        T = sp(TEXT)
+        E = (
+            '<p>foo &#8212; &#8220;bar&#8221;<!-- foo-bar\n'
+            '<p>blah "this"</p>\n'
+            '-->\n'
+            '</p>'
+        )
+        self.assertEqual(T, E)
+
+        # nothing should be converted
+        for TEXT in ('<!-- comment -->',
+                     '<!-- <li>Fee-fi-of-fum</li> -->',
+                     '<!-- "foo" --> <!-- "bar" -->'):
+            self.assertEqual(sp(TEXT), TEXT)
+
+        # not comments
+        self.assertEqual(sp('<!-- -- -->'), '<!&#8212; &#8212; &#8212;>')
+        self.assertEqual(sp('<!-- -- -- \t >'),
+                         '<!&#8212; &#8212; &#8212; \t >')
 
     def test_ordinal_numbers(self):
author	Yu-Jie Lin <livibetter@gmail.com>	2013-09-21 05:24:32 +0800
committer	Yu-Jie Lin <livibetter@gmail.com>	2013-09-21 05:24:32 +0800
commit	b00f2b51b83234df327cfc6e1117a9da2caef668 (patch)
tree	99188b7d824e2eff51c3239e17ed9fe045b1409a
parent	0134cea881640d9583339d997d517652de32450e (diff)
download	smartypants-git-b00f2b51b83234df327cfc6e1117a9da2caef668.tar.gz