diff options
author | Yu-Jie Lin <livibetter@gmail.com> | 2013-09-21 05:24:32 +0800 |
---|---|---|
committer | Yu-Jie Lin <livibetter@gmail.com> | 2013-09-21 05:24:32 +0800 |
commit | b00f2b51b83234df327cfc6e1117a9da2caef668 (patch) | |
tree | 99188b7d824e2eff51c3239e17ed9fe045b1409a | |
parent | 0134cea881640d9583339d997d517652de32450e (diff) | |
download | smartypants-git-b00f2b51b83234df327cfc6e1117a9da2caef668.tar.gz |
fix `--` inside an HTML comment not causing the comment to be treated as text, add more tests
In both HTML4 and HTML5, two adjacent hyphens should not (HTML4) or must not (HTML5) appear inside a comment. This change makes sure that if they do appear inside a comment, the entire comment is treated as text, which means it will be converted.
-rw-r--r-- | CHANGES.rst | 6 | ||||
-rwxr-xr-x | smartypants.py | 24 | ||||
-rw-r--r-- | tests/test.py | 36 |
3 files changed, 59 insertions, 7 deletions
diff --git a/CHANGES.rst b/CHANGES.rst index 8c7ba50..d62459e 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -14,8 +14,10 @@ Versions without timestamps mean they are future releases. and ``processEscapes`` development: - - fix ``_tokenize`` turning ``--`` of HTML comment end tag ``-->`` - (pull request #1) + - fix ``_tokenize`` can not handle HTML comment properly + + This fix includes pull request #1 with modification for handling + ``--`` appears in a comment, which makes the comment not a comment. 1.8.2: 2013-08-28T11:38:42Z - add documentation generation diff --git a/smartypants.py b/smartypants.py index f8d56f3..1580d26 100755 --- a/smartypants.py +++ b/smartypants.py @@ -709,7 +709,7 @@ def _tokenize(text): tokens = [] - tag_soup = re.compile(r"""(?s)([^<]*)(<!--.*?--\s*>|<[^>]*>)""") + tag_soup = re.compile(r'([^<]*)(<!--.*?--\s*>|<[^>]*>)', re.S) token_match = tag_soup.search(text) @@ -718,7 +718,27 @@ def _tokenize(text): if token_match.group(1): tokens.append(['text', token_match.group(1)]) - tokens.append(['tag', token_match.group(2)]) + # if -- in text part of comment, then it's not a comment, therefore it + # should be converted. + # + # In HTML4 [1]: + # [...] Authors should avoid putting two or more adjacent hyphens + # inside comments. + # + # In HTML5 [2]: + # [...] 
the comment may have text, with the additional restriction + # that the text must not [...], nor contain two consecutive U+002D + # HYPHEN-MINUS characters (--) + # + # [1]: http://www.w3.org/TR/REC-html40/intro/sgmltut.html#h-3.2.4 + # [2]: http://www.w3.org/TR/html5/syntax.html#comments + tag = token_match.group(2) + type_ = 'tag' + if tag.startswith('<!--'): + # remove --[white space]> from the end of tag + if '--' in tag[4:].rstrip('>').rstrip().rstrip('-'): + type_ = 'text' + tokens.append([type_, tag]) previous_end = token_match.end() token_match = tag_soup.search(text, token_match.end()) diff --git a/tests/test.py b/tests/test.py index 85e8af2..6b2c5da 100644 --- a/tests/test.py +++ b/tests/test.py @@ -88,9 +88,39 @@ document.write('<a href="' + href + '">' + linktext + "</a>"); self.assertEqual(sp("--"), "—") self.assertEqual(sp("-->"), "—>") - self.assertEqual(sp("<!-- comment -->"), "<!-- comment -->") - self.assertEqual(sp("<!-- <li>Fee-fi-of-fum</li> -->"), - "<!-- <li>Fee-fi-of-fum</li> -->") + self.assertEqual(sp("-- \t >"), "— \t >") + + TEXT = '<!-- "foo" --> blah--blah <!-- "bar" -->' + T = sp(TEXT) + E = '<!-- "foo" --> blah—blah <!-- "bar" -->' + self.assertEqual(T, E) + + TEXT = ( + '<p>foo -- "bar"<!-- foo-bar\n' + '<p>blah "this"</p>\n' + '-->\n' + '</p>' + ) + + T = sp(TEXT) + E = ( + '<p>foo — “bar”<!-- foo-bar\n' + '<p>blah "this"</p>\n' + '-->\n' + '</p>' + ) + self.assertEqual(T, E) + + # nothing should be converted + for TEXT in ('<!-- comment -->', + '<!-- <li>Fee-fi-of-fum</li> -->', + '<!-- "foo" --> <!-- "bar" -->'): + self.assertEqual(sp(TEXT), TEXT) + + # not comments + self.assertEqual(sp('<!-- -- -->'), '<!— — —>') + self.assertEqual(sp('<!-- -- -- \t >'), + '<!— — — \t >') def test_ordinal_numbers(self): |