summaryrefslogtreecommitdiff
path: root/docutils
diff options
context:
space:
mode:
authormilde <milde@929543f6-e4f2-0310-98a6-ba3bd3dd1d04>2012-11-18 22:11:49 +0000
committermilde <milde@929543f6-e4f2-0310-98a6-ba3bd3dd1d04>2012-11-18 22:11:49 +0000
commit0304e2ae239c44db70bcb85234a188749eba1b87 (patch)
tree1791ec6739e61a0d198e39ce594cd1bc1c60c0fb /docutils
parentf9d2bd91e4eba01350158ffd5b8b79753124d03f (diff)
downloaddocutils-0304e2ae239c44db70bcb85234a188749eba1b87.tar.gz
Smartquotes: correct "educating" of quotes around inline markup.
git-svn-id: http://svn.code.sf.net/p/docutils/code/trunk/docutils@7537 929543f6-e4f2-0310-98a6-ba3bd3dd1d04
Diffstat (limited to 'docutils')
-rw-r--r--docutils/transforms/universal.py47
-rw-r--r--docutils/utils/smartquotes.py384
2 files changed, 223 insertions, 208 deletions
diff --git a/docutils/transforms/universal.py b/docutils/transforms/universal.py
index 443b186fb..4f5626c1d 100644
--- a/docutils/transforms/universal.py
+++ b/docutils/transforms/universal.py
@@ -207,23 +207,46 @@ class SmartQuotes(Transform):
"""
Replace ASCII quotation marks with typographic form.
- Also replace multiple dashes with em-dashes and en-dashes.
+ Also replace multiple dashes with em-dash/en-dash characters.
"""
default_priority = 850
+ texttype = {True: 'literal',
+ False: 'plain'}
+
def apply(self):
if self.document.settings.smart_quotes is False:
return
- for node in self.document.traverse(nodes.Text):
- if isinstance(node.parent,
- (nodes.FixedTextElement,
- nodes.literal,
- nodes.Special,
- nodes.option_string
- )):
- # print "literal", node
+
+ # "Educate" quotes in normal text. Handle each block of text
+ # (TextElement node) as a unit to keep context around inline nodes:
+ for node in self.document.traverse(nodes.TextElement):
+ # skip preformatted text blocks and special elements:
+ if isinstance(node, (nodes.FixedTextElement, nodes.Special)):
+ continue
+ # nested TextElements are not "block-level" elements:
+ if isinstance(node.parent, nodes.TextElement):
continue
- newtext = smartquotes.smartyPants(node.astext(), attr='2')
- node.parent.replace(node, nodes.Text(newtext))
- # print "smartquote",
+
+ # list of text nodes in the "text block":
+ txtnodes = [txtnode for txtnode in node.traverse(nodes.Text)
+ if not isinstance(txtnode.parent,
+ nodes.option_string)]
+ # smartquotes.educate_tokens() iterates over
+ # ``(texttype, nodetext)`` tuples. `texttype` is "literal"
+ # or "plain" where "literal" text is not changed:
+ tokens = [(self.texttype[isinstance(txtnode.parent,
+ (nodes.literal,
+ nodes.math,
+ nodes.image,
+ nodes.raw,
+ nodes.problematic))],
+ txtnode.astext()) for txtnode in txtnodes]
+
+ # Iterator educating quotes in plain text
+ # 2 : set all, using old school en- and em- dash shortcuts
+ teacher = smartquotes.educate_tokens(tokens, attr='2')
+
+ for txtnode, newtext in zip(txtnodes, teacher):
+ txtnode.parent.replace(txtnode, nodes.Text(newtext))
diff --git a/docutils/utils/smartquotes.py b/docutils/utils/smartquotes.py
index b79993fa8..0eaa531cf 100644
--- a/docutils/utils/smartquotes.py
+++ b/docutils/utils/smartquotes.py
@@ -2,7 +2,7 @@
# -*- coding: utf8 -*-
# :Id: $Id$
-# :Copyright: © 2011 Günter Milde,
+# :Copyright: © 2010 Günter Milde,
# original `SmartyPants`_: © 2003 John Gruber
# smartypants.py: © 2004, 2007 Chad Miller
# :License: Released under the terms of the `2-Clause BSD license`_, in short:
@@ -313,6 +313,10 @@ proper character for closing single-quotes (``’``) by hand.
Version History
===============
+1.6.1: 2012-11-06
+ - Refactor code, code cleanup,
+ - `educate_tokens()` generator as interface for Docutils.
+
1.6: 2010-08-26
- Adaption to Docutils:
- Use Unicode instead of HTML entities,
@@ -352,6 +356,7 @@ Version History
default_smartypants_attr = "1"
+
import re
class smart(object):
@@ -371,7 +376,14 @@ class smart(object):
ellipsis = u'…' # "&#8230;" HORIZONTAL ELLIPSIS
def smartyPants(text, attr=default_smartypants_attr):
- convert_quot = False # translate &quot; entities into normal quotes?
+ """Main function for "traditional" use."""
+
+ return "".join([t for t in educate_tokens(tokenize(text), attr)])
+
+
+def educate_tokens(text_tokens, attr=default_smartypants_attr):
+ """Return iterator that "educates" `text_tokens`.
+ """
# Parse attributes:
# 0 : do nothing
@@ -388,142 +400,123 @@ def smartyPants(text, attr=default_smartypants_attr):
# e : ellipses
# w : convert &quot; entities to " for Dreamweaver users
- skipped_tag_stack = []
- do_dashes = "0"
- do_backticks = "0"
- do_quotes = "0"
- do_ellipses = "0"
- do_stupefy = "0"
-
- if attr == "0":
- # Do nothing.
- return text
- elif attr == "1":
- do_quotes = "1"
- do_backticks = "1"
- do_dashes = "1"
- do_ellipses = "1"
+ convert_quot = False # translate &quot; entities into normal quotes?
+ do_dashes = False
+ do_backticks = False
+ do_quotes = False
+ do_ellipses = False
+ do_stupefy = False
+
+ if attr == "0": # Do nothing.
+ yield text
+ elif attr == "1": # Do everything, turn all options on.
+ do_quotes = True
+ do_backticks = True
+ do_dashes = 1
+ do_ellipses = True
elif attr == "2":
# Do everything, turn all options on, use old school dash shorthand.
- do_quotes = "1"
- do_backticks = "1"
- do_dashes = "2"
- do_ellipses = "1"
+ do_quotes = True
+ do_backticks = True
+ do_dashes = 2
+ do_ellipses = True
elif attr == "3":
- # Do everything, turn all options on, use inverted old school dash shorthand.
- do_quotes = "1"
- do_backticks = "1"
- do_dashes = "3"
- do_ellipses = "1"
- elif attr == "-1":
- # Special "stupefy" mode.
- do_stupefy = "1"
+ # Do everything, use inverted old school dash shorthand.
+ do_quotes = True
+ do_backticks = True
+ do_dashes = 3
+ do_ellipses = True
+ elif attr == "-1": # Special "stupefy" mode.
+ do_stupefy = True
else:
- for c in attr:
- if c == "q": do_quotes = "1"
- elif c == "b": do_backticks = "1"
- elif c == "B": do_backticks = "2"
- elif c == "d": do_dashes = "1"
- elif c == "D": do_dashes = "2"
- elif c == "i": do_dashes = "3"
- elif c == "e": do_ellipses = "1"
- elif c == "w": convert_quot = "1"
- else:
- pass
- # ignore unknown option
-
- tokens = _tokenize(text)
- result = []
- in_pre = False
-
- prev_token_last_char = ""
- # This is a cheat, used to get some context
- # for one-character tokens that consist of
- # just a quote char. What we do is remember
- # the last character of the previous text
- # token, to use as context to curl single-
- # character quote tokens correctly.
-
- for cur_token in tokens:
+ if "q" in attr: do_quotes = True
+ if "b" in attr: do_backticks = True
+ if "B" in attr: do_backticks = 2
+ if "d" in attr: do_dashes = 1
+ if "D" in attr: do_dashes = 2
+ if "i" in attr: do_dashes = 3
+ if "e" in attr: do_ellipses = True
+ if "w" in attr: convert_quot = True
+
+ prev_token_last_char = " "
+ # Get context around inline mark-up. (Remember the last character of the
+ # previous text token, to use as context to curl single-character quote
+ # tokens correctly.)
+
+ for cur_token in text_tokens:
t = cur_token[1]
+
+ # skip HTML and/or XML tags (do not update last character)
+ if cur_token[0] == 'tag':
+ yield t
+ continue
+
last_char = t[-1:] # Remember last char of this token before processing.
- if not in_pre:
- oldstr = t
- t = processEscapes(t)
-
- if convert_quot != "0":
- t = re.sub('&quot;', '"', t)
-
- if do_dashes != "0":
- if do_dashes == "1":
- t = educateDashes(t)
- if do_dashes == "2":
- t = educateDashesOldSchool(t)
- if do_dashes == "3":
- t = educateDashesOldSchoolInverted(t)
-
- if do_ellipses != "0":
- t = educateEllipses(t)
-
- # Note: backticks need to be processed before quotes.
- if do_backticks != "0":
- t = educateBackticks(t)
-
- if do_backticks == "2":
- t = educateSingleBackticks(t)
-
- if do_quotes != "0":
- if t == "'":
- # Special case: single-character ' token
- if re.match("\S", prev_token_last_char):
- t = smart.rquote
- else:
- t = smart.lquote
- elif t == '"':
- # Special case: single-character " token
- if re.match("\S", prev_token_last_char):
- t = smart.rdquote
- else:
- t = smart.ldquote
-
- else:
- # Normal case:
- t = educateQuotes(t)
-
- if do_stupefy == "1":
- t = stupefyEntities(t)
+ # skip literal text (math, literal, raw, ...)
+ if cur_token[0] == 'literal':
+ yield t
+ continue
+
+ t = processEscapes(t)
+
+ if convert_quot:
+ t = re.sub('&quot;', '"', t)
+
+ if do_dashes == 1:
+ t = educateDashes(t)
+ elif do_dashes == 2:
+ t = educateDashesOldSchool(t)
+ elif do_dashes == 3:
+ t = educateDashesOldSchoolInverted(t)
+
+ if do_ellipses:
+ t = educateEllipses(t)
+
+ # Note: backticks need to be processed before quotes.
+ if do_backticks:
+ t = educateBackticks(t)
+
+ if do_backticks == 2:
+ t = educateSingleBackticks(t)
+
+ if do_quotes:
+ t = educateQuotes(prev_token_last_char+t)[1:]
+
+ if do_stupefy:
+ t = stupefyEntities(t)
+
+ # print prev_token_last_char, t.encode('utf8')
prev_token_last_char = last_char
- result.append(t)
- return "".join(result)
+ yield t
-def educateQuotes(str):
- """
- Parameter: String.
- Returns: The string, with "educated" curly quote characters.
+def educateQuotes(text):
+ """
+ Parameter: String (unicode or bytes).
+ Returns: The `text`, with "educated" curly quote characters.
Example input: "Isn't this fun?"
Example output: “Isn’t this fun?“;
"""
- oldstr = str
+ # oldtext = text
punct_class = r"""[!"#\$\%'()*+,-.\/:;<=>?\@\[\\\]\^_`{|}~]"""
# Special case if the very first character is a quote
# followed by punctuation at a non-word-break. Close the quotes by brute force:
- str = re.sub(r"""^'(?=%s\\B)""" % (punct_class,), smart.rquote, str)
- str = re.sub(r"""^"(?=%s\\B)""" % (punct_class,), smart.rdquote, str)
+ text = re.sub(r"""^'(?=%s\\B)""" % (punct_class,), smart.rquote, text)
+ text = re.sub(r"""^"(?=%s\\B)""" % (punct_class,), smart.rdquote, text)
# Special case for double sets of quotes, e.g.:
# <p>He said, "'Quoted' words in a larger quote."</p>
- str = re.sub(r""""'(?=\w)""", smart.ldquote+smart.lquote, str)
- str = re.sub(r"""'"(?=\w)""", smart.lquote+smart.ldquote, str)
+ text = re.sub(r""""'(?=\w)""", smart.ldquote+smart.lquote, text)
+ text = re.sub(r"""'"(?=\w)""", smart.lquote+smart.ldquote, text)
# Special case for decade abbreviations (the '80s):
- str = re.sub(r"""\b'(?=\d{2}s)""", smart.rquote, str)
+ text = re.sub(r"""\b'(?=\d{2}s)""", smart.rquote, text)
close_class = r"""[^\ \t\r\n\[\{\(\-]"""
dec_dashes = r"""&#8211;|&#8212;"""
@@ -541,24 +534,24 @@ def educateQuotes(str):
' # the quote
(?=\w) # followed by a word character
""" % (dec_dashes,), re.VERBOSE)
- str = opening_single_quotes_regex.sub(r'\1'+smart.lquote, str)
+ text = opening_single_quotes_regex.sub(r'\1'+smart.lquote, text)
closing_single_quotes_regex = re.compile(r"""
(%s)
'
(?!\s | s\b | \d)
""" % (close_class,), re.VERBOSE)
- str = closing_single_quotes_regex.sub(r'\1'+smart.rquote, str)
+ text = closing_single_quotes_regex.sub(r'\1'+smart.rquote, text)
closing_single_quotes_regex = re.compile(r"""
(%s)
'
(\s | s\b)
""" % (close_class,), re.VERBOSE)
- str = closing_single_quotes_regex.sub(r'\1%s\2' % smart.rquote, str)
+ text = closing_single_quotes_regex.sub(r'\1%s\2' % smart.rquote, text)
# Any remaining single quotes should be opening ones:
- str = re.sub(r"""'""", smart.lquote, str)
+ text = re.sub(r"""'""", smart.lquote, text)
# Get most opening double quotes:
opening_double_quotes_regex = re.compile(r"""
@@ -573,7 +566,7 @@ def educateQuotes(str):
" # the quote
(?=\w) # followed by a word character
""" % (dec_dashes,), re.VERBOSE)
- str = opening_double_quotes_regex.sub(r'\1'+smart.ldquote, str)
+ text = opening_double_quotes_regex.sub(r'\1'+smart.ldquote, text)
# Double closing quotes:
closing_double_quotes_regex = re.compile(r"""
@@ -581,81 +574,78 @@ def educateQuotes(str):
"
(?=\s)
""" % (close_class,), re.VERBOSE)
- str = closing_double_quotes_regex.sub(smart.rdquote, str)
+ text = closing_double_quotes_regex.sub(smart.rdquote, text)
closing_double_quotes_regex = re.compile(r"""
(%s) # character that indicates the quote should be closing
"
""" % (close_class,), re.VERBOSE)
- str = closing_double_quotes_regex.sub(r'\1'+smart.rdquote, str)
+ text = closing_double_quotes_regex.sub(r'\1'+smart.rdquote, text)
# Any remaining quotes should be opening ones.
- str = re.sub(r'"', smart.ldquote, str)
+ text = re.sub(r'"', smart.ldquote, text)
- return str
+ return text
-def educateBackticks(str):
+def educateBackticks(text):
"""
- Parameter: String.
- Returns: The string, with ``backticks'' -style double quotes
+ Parameter: String (unicode or bytes).
+ Returns: The `text`, with ``backticks'' -style double quotes
translated into HTML curly quote entities.
Example input: ``Isn't this fun?''
Example output: “Isn't this fun?“;
"""
- str = re.sub(r"""``""", smart.ldquote, str)
- str = re.sub(r"""''""", smart.rdquote, str)
- return str
+ text = re.sub(r"""``""", smart.ldquote, text)
+ text = re.sub(r"""''""", smart.rdquote, text)
+ return text
-def educateSingleBackticks(str):
+def educateSingleBackticks(text):
"""
- Parameter: String.
- Returns: The string, with `backticks' -style single quotes
+ Parameter: String (unicode or bytes).
+ Returns: The `text`, with `backticks' -style single quotes
translated into HTML curly quote entities.
Example input: `Isn't this fun?'
Example output: ‘Isn’t this fun?’
"""
- str = re.sub(r"""`""", smart.lquote, str)
- str = re.sub(r"""'""", smart.rquote, str)
- return str
+ text = re.sub(r"""`""", smart.lquote, text)
+ text = re.sub(r"""'""", smart.rquote, text)
+ return text
-def educateDashes(str):
+def educateDashes(text):
"""
- Parameter: String.
-
- Returns: The string, with each instance of "--" translated to
+ Parameter: String (unicode or bytes).
+ Returns: The `text`, with each instance of "--" translated to
an em-dash character.
"""
- str = re.sub(r"""---""", smart.endash, str) # en (yes, backwards)
- str = re.sub(r"""--""", smart.emdash, str) # em (yes, backwards)
- return str
+ text = re.sub(r"""---""", smart.endash, text) # en (yes, backwards)
+ text = re.sub(r"""--""", smart.emdash, text) # em (yes, backwards)
+ return text
-def educateDashesOldSchool(str):
+def educateDashesOldSchool(text):
"""
- Parameter: String.
-
- Returns: The string, with each instance of "--" translated to
+ Parameter: String (unicode or bytes).
+ Returns: The `text`, with each instance of "--" translated to
an en-dash character, and each "---" translated to
an em-dash character.
"""
- str = re.sub(r"""---""", smart.emdash, str) # em (yes, backwards)
- str = re.sub(r"""--""", smart.endash, str) # en (yes, backwards)
- return str
+ text = re.sub(r"""---""", smart.emdash, text) # em (yes, backwards)
+ text = re.sub(r"""--""", smart.endash, text) # en (yes, backwards)
+ return text
-def educateDashesOldSchoolInverted(str):
+def educateDashesOldSchoolInverted(text):
"""
- Parameter: String.
-
- Returns: The string, with each instance of "--" translated to
+ Parameter: String (unicode or bytes).
+ Returns: The `text`, with each instance of "--" translated to
an em-dash character, and each "---" translated to
an en-dash character. Two reasons why: First, unlike the
en- and em-dash syntax supported by
@@ -666,55 +656,55 @@ def educateDashesOldSchoolInverted(str):
the shortcut should be shorter to type. (Thanks to Aaron
Swartz for the idea.)
"""
- str = re.sub(r"""---""", smart.endash, str) # em
- str = re.sub(r"""--""", smart.emdash, str) # en
- return str
+ text = re.sub(r"""---""", smart.endash, text) # em
+ text = re.sub(r"""--""", smart.emdash, text) # en
+ return text
-def educateEllipses(str):
+def educateEllipses(text):
"""
- Parameter: String.
- Returns: The string, with each instance of "..." translated to
+ Parameter: String (unicode or bytes).
+ Returns: The `text`, with each instance of "..." translated to
an ellipsis character.
Example input: Huh...?
Example output: Huh&#8230;?
"""
- str = re.sub(r"""\.\.\.""", smart.ellipsis, str)
- str = re.sub(r"""\. \. \.""", smart.ellipsis, str)
- return str
+ text = re.sub(r"""\.\.\.""", smart.ellipsis, text)
+ text = re.sub(r"""\. \. \.""", smart.ellipsis, text)
+ return text
-def stupefyEntities(str):
+def stupefyEntities(text):
"""
- Parameter: String.
- Returns: The string, with each SmartyPants character translated to
+ Parameter: String (unicode or bytes).
+ Returns: The `text`, with each SmartyPants character translated to
its ASCII counterpart.
Example input: “Hello — world.”
Example output: "Hello -- world."
"""
- str = re.sub(smart.endash, "-", str) # en-dash
- str = re.sub(smart.emdash, "--", str) # em-dash
+ text = re.sub(smart.endash, "-", text) # en-dash
+ text = re.sub(smart.emdash, "--", text) # em-dash
- str = re.sub(smart.lquote, "'", str) # open single quote
- str = re.sub(smart.rquote, "'", str) # close single quote
+ text = re.sub(smart.lquote, "'", text) # open single quote
+ text = re.sub(smart.rquote, "'", text) # close single quote
- str = re.sub(smart.ldquote, '"', str) # open double quote
- str = re.sub(smart.rdquote, '"', str) # close double quote
+ text = re.sub(smart.ldquote, '"', text) # open double quote
+ text = re.sub(smart.rdquote, '"', text) # close double quote
- str = re.sub(smart.ellipsis, '...', str)# ellipsis
+ text = re.sub(smart.ellipsis, '...', text)# ellipsis
- return str
+ return text
-def processEscapes(str):
+def processEscapes(text):
r"""
- Parameter: String.
- Returns: The string, with after processing the following backslash
+ Parameter: String (unicode or bytes).
+ Returns: The `text`, with after processing the following backslash
escape sequences. This is useful if you want to force a "dumb"
quote or other character to appear.
@@ -727,24 +717,24 @@ def processEscapes(str):
\- &#45;
\` &#96;
"""
- str = re.sub(r"""\\\\""", r"""&#92;""", str)
- str = re.sub(r'''\\"''', r"""&#34;""", str)
- str = re.sub(r"""\\'""", r"""&#39;""", str)
- str = re.sub(r"""\\\.""", r"""&#46;""", str)
- str = re.sub(r"""\\-""", r"""&#45;""", str)
- str = re.sub(r"""\\`""", r"""&#96;""", str)
+ text = re.sub(r"""\\\\""", r"""&#92;""", text)
+ text = re.sub(r'''\\"''', r"""&#34;""", text)
+ text = re.sub(r"""\\'""", r"""&#39;""", text)
+ text = re.sub(r"""\\\.""", r"""&#46;""", text)
+ text = re.sub(r"""\\-""", r"""&#45;""", text)
+ text = re.sub(r"""\\`""", r"""&#96;""", text)
- return str
+ return text
-def _tokenize(str):
+def tokenize(text):
"""
Parameter: String containing HTML markup.
- Returns: Reference to an array of the tokens comprising the input
+ Returns: An iterator that yields the tokens comprising the input
string. Each token is either a tag (possibly with nested,
tags contained therein, such as <a href="<MTFoo>">, or a
- run of text between tags. Each element of the array is a
- two-element array; the first is either 'tag' or 'text';
+ run of text between tags. Each yielded element is a
+ two-element tuple; the first is either 'tag' or 'text';
the second is the actual value.
Based on the _tokenize() subroutine from Brad Choate's MTRegex plugin.
@@ -752,8 +742,8 @@ def _tokenize(str):
"""
pos = 0
- length = len(str)
- tokens = []
+ length = len(text)
+ # tokens = []
depth = 6
nested_tags = "|".join(['(?:<(?:[^<>]',] * depth) + (')*>)' * depth)
@@ -762,22 +752,20 @@ def _tokenize(str):
# %s # nested tags """ % (nested_tags,)
tag_soup = re.compile(r"""([^<]*)(<[^>]*>)""")
- token_match = tag_soup.search(str)
+ token_match = tag_soup.search(text)
previous_end = 0
while token_match is not None:
if token_match.group(1):
- tokens.append(['text', token_match.group(1)])
+ yield ('text', token_match.group(1))
- tokens.append(['tag', token_match.group(2)])
+ yield ('tag', token_match.group(2))
previous_end = token_match.end()
- token_match = tag_soup.search(str, token_match.end())
-
- if previous_end < len(str):
- tokens.append(['text', str[previous_end:]])
+ token_match = tag_soup.search(text, token_match.end())
- return tokens
+ if previous_end < len(text):
+ yield ('text', text[previous_end:])
@@ -796,7 +784,7 @@ if __name__ == "__main__":
print docstring_html
- # Unit test output goes out stderr. No worries.
+ # Unit test output goes out stderr.
import unittest
sp = smartyPants
@@ -819,6 +807,10 @@ if __name__ == "__main__":
def test_educated_quotes(self):
self.assertEqual(sp('''"Isn't this fun?"'''), u'“Isn’t this fun?”')
+ def test_html_tags(self):
+ text = '<a src="foo">more</a>'
+ self.assertEqual(sp(text), text)
+
unittest.main()