diff options
| author | milde <milde@929543f6-e4f2-0310-98a6-ba3bd3dd1d04> | 2020-01-26 22:08:25 +0000 |
|---|---|---|
| committer | milde <milde@929543f6-e4f2-0310-98a6-ba3bd3dd1d04> | 2020-01-26 22:08:25 +0000 |
| commit | c71966c4f8c707bbfc2ec50840cc070e98d118a9 (patch) | |
| tree | b61c250b5d89e33569e305a744938a78444cc46a | |
| parent | a5b16480cf8ac90f5f528740eac20760974180ad (diff) | |
| download | docutils-c71966c4f8c707bbfc2ec50840cc070e98d118a9.tar.gz | |
Fix [ 383 ] Smart quotes around opening and separator characters.
git-svn-id: http://svn.code.sf.net/p/docutils/code/trunk@8469 929543f6-e4f2-0310-98a6-ba3bd3dd1d04
| -rw-r--r-- | docutils/HISTORY.txt | 4 | ||||
| -rw-r--r-- | docutils/docutils/utils/smartquotes.py | 120 | ||||
| -rw-r--r-- | docutils/test/test_transforms/test_smartquotes.py | 45 |
3 files changed, 94 insertions, 75 deletions
diff --git a/docutils/HISTORY.txt b/docutils/HISTORY.txt index 5b6e9179c..bf4a1a4b3 100644 --- a/docutils/HISTORY.txt +++ b/docutils/HISTORY.txt @@ -27,6 +27,10 @@ Changes Since 0.16 * docutils/__init__.py - VersionInfo: ValueError for invalid values, fix comparison to tuples. + +* docutils/utils/smartquotes.py + + - Fix [ 383 ]: Smart quotes around opening and separator characters. Release 0.16 diff --git a/docutils/docutils/utils/smartquotes.py b/docutils/docutils/utils/smartquotes.py index 6b6809cac..72f093b98 100644 --- a/docutils/docutils/utils/smartquotes.py +++ b/docutils/docutils/utils/smartquotes.py @@ -228,22 +228,22 @@ apostrophes are used at the start of leading contractions. For example:: 'Twas the night before Christmas. In the case above, SmartyPants will turn the apostrophe into an opening -single-quote, when in fact it should be the `right single quotation mark` +secondary quote, when in fact it should be the `RIGHT SINGLE QUOTATION MARK` character which is also "the preferred character to use for apostrophe" (Unicode). I don't think this problem can be solved in the general case -- every word processor I've tried gets this wrong as well. In such cases, it's -best to use the proper character for closing single-quotes (’) by hand. +best to insert the `RIGHT SINGLE QUOTATION MARK` (’) by hand. -In English, the same character is used for apostrophe and closing single +In English, the same character is used for apostrophe and closing secondary quote (both plain and "smart" ones). For other locales (French, Italean, -Swiss, ...) "smart" single closing quotes differ from the curly apostrophe. +Swiss, ...) "smart" secondary closing quotes differ from the curly apostrophe. .. class:: language-fr Il dit : "C'est 'super' !" If the apostrophe is used at the end of a word, it cannot be distinguished -from a single quote by the algorithm. Therefore, a text like:: +from a secondary quote by the algorithm. Therefore, a text like:: .. 
class:: language-de-CH @@ -251,7 +251,7 @@ from a single quote by the algorithm. Therefore, a text like:: will get a single closing guillemet instead of an apostrophe. -This can be prevented by use use of the curly apostrophe character (’) in +This can be prevented by use of the `RIGHT SINGLE QUOTATION MARK` in the source:: - "Er sagt: 'Ich fass' es nicht.'" @@ -636,87 +636,83 @@ def educateQuotes(text, language='en'): """ smart = smartchars(language) - - punct_class = r"""[!"#\$\%'()*+,-.\/:;<=>?\@\[\\\]\^_`{|}~]""" - close_class = r"""[^\ \t\r\n\[\{\(\-]""" - open_class = u'[\u200B\u200C]' # ZWSP, ZWNJ - dec_dashes = r"""–|—""" + ch_classes = {'open': u'[(\[{]', # opening braces + 'close': r'[^\s]', # everything except whitespace + 'punct': r"""[-!"#\$\%'()*+,.\/:;<=>?\@\[\\\]\^_`{|}~]""", + 'dash': u'[-–—]' # hyphen and em/en dashes + + r'|&[mn]dash;|&\#8211;|&\#8212;|&\#x201[34];', + 'sep': u'[\\s\u200B\u200C]| ', # Whitespace, ZWSP, ZWNJ + } # Special case if the very first character is a quote - # followed by punctuation at a non-word-break. - # Close the quotes by brute force: - text = re.sub(r"^'(?=%s\\B)" % (punct_class,), smart.csquote, text) - text = re.sub(r'^"(?=%s\\B)' % (punct_class,), smart.cpquote, text) + # followed by punctuation at a non-word-break. Use closing quotes. + # TODO: example (when does this match?) + text = re.sub(r"^'(?=%s\\B)" % ch_classes['punct'], smart.csquote, text) + text = re.sub(r'^"(?=%s\\B)' % ch_classes['punct'], smart.cpquote, text) - # Special case for double sets of quotes, e.g.: - # <p>He said, "'Quoted' words in a larger quote."</p> + # Special case for adjacent quotes + # like "'Quoted' words in a larger quote." text = re.sub(r""""'(?=\w)""", smart.opquote+smart.osquote, text) text = re.sub(r"""'"(?=\w)""", smart.osquote+smart.opquote, text) + # Special case: "opening character" followed by quote, + # optional punctuation and space like "[", '(', or '-'. + text = re.sub(r"(%(open)s|%(dash)s)'(?=%(punct)s? 
)" % ch_classes, + r'\1%s'%smart.csquote, text) + text = re.sub(r'(%(open)s|%(dash)s)"(?=%(punct)s? )' % ch_classes, + r'\1%s'%smart.cpquote, text) + # Special case for decade abbreviations (the '80s): if language.startswith('en'): # TODO similar cases in other languages? text = re.sub(r"'(?=\d{2}s)", smart.apostrophe, text) - # Get most opening single quotes: - opening_single_quotes_regex = re.compile(u""" + # Get most opening secondary quotes: + opening_secondary_quotes_regex = re.compile(u""" (# ?<= # look behind fails: requires fixed-width pattern - \\s | # a whitespace char, or - %s | # another separating char, or - | # a non-breaking space entity, or - [\u2013 \u2014 ] | # literal dashes, or - -- | # dumb dashes, or - &[mn]dash; | # dash entities (named or - %s | # decimal or - &\\#x201[34]; # hex) + %(sep)s | # a whitespace char, or + %(open)s | # opening brace, or + %(dash)s # em/en-dash ) ' # the quote - (?=\\w) # followed by a word character - """ % (open_class, dec_dashes), re.VERBOSE | re.UNICODE) + (?=\\w|%(punct)s) # followed by a word character or punctuation + """ % ch_classes, re.VERBOSE | re.UNICODE) - text = opening_single_quotes_regex.sub(r'\1'+smart.osquote, text) + text = opening_secondary_quotes_regex.sub(r'\1'+smart.osquote, text) - # In many locales, single closing quotes are different from apostrophe: + # In many locales, secondary closing quotes are different from apostrophe: if smart.csquote != smart.apostrophe: apostrophe_regex = re.compile(r"(?<=(\w|\d))'(?=\w)", re.UNICODE) text = apostrophe_regex.sub(smart.apostrophe, text) # TODO: keep track of quoting level to recognize apostrophe in, e.g., # "Ich fass' es nicht." 
- closing_single_quotes_regex = re.compile(r""" - (?<=%s) - ' - """ % close_class, re.VERBOSE) - text = closing_single_quotes_regex.sub(smart.csquote, text) + closing_secondary_quotes_regex = re.compile(r"(?<!\s)'", re.UNICODE) + text = closing_secondary_quotes_regex.sub(smart.csquote, text) - # Any remaining single quotes should be opening ones: + # Any remaining secondary quotes should be opening ones: text = re.sub(r"""'""", smart.osquote, text) - # Get most opening double quotes: - opening_double_quotes_regex = re.compile(u""" + # Get most opening primary quotes: + opening_primary_quotes_regex = re.compile(u""" ( - \\s | # a whitespace char, or - %s | # another separating char, or - | # a non-breaking space entity, or - [\u2013 \u2014 ] | # literal dashes, or - -- | # dumb dashes, or - &[mn]dash; | # dash entities (named or - %s | # decimal or - &\\#x201[34]; # hex) + %(sep)s | # a whitespace char, or + %(open)s | # opening brace, or + %(dash)s # em/en-dash ) " # the quote - (?=\\w) # followed by a word character - """ % (open_class, dec_dashes), re.VERBOSE | re.UNICODE) + (?=\\w|%(punct)s) # followed by a word character or punctuation + """ % ch_classes, re.VERBOSE | re.UNICODE) - text = opening_double_quotes_regex.sub(r'\1'+smart.opquote, text) + text = opening_primary_quotes_regex.sub(r'\1'+smart.opquote, text) - # Double closing quotes: - closing_double_quotes_regex = re.compile(r""" + # primary closing quotes: + closing_primary_quotes_regex = re.compile(r""" ( - (?<=%s)" | # char indicating the quote should be closing + (?<!\s)" | # no whitespace before "(?=\s) # whitespace behind ) - """ % (close_class,), re.VERBOSE | re.UNICODE) - text = closing_double_quotes_regex.sub(smart.cpquote, text) + """, re.VERBOSE | re.UNICODE) + text = closing_primary_quotes_regex.sub(smart.cpquote, text) # Any remaining quotes should be opening ones. 
text = re.sub(r'"', smart.opquote, text) @@ -826,16 +822,16 @@ def stupefyEntities(text, language='en'): """ smart = smartchars(language) - text = re.sub(smart.endash, "-", text) # en-dash - text = re.sub(smart.emdash, "--", text) # em-dash + text = re.sub(smart.endash, "-", text) # en-dash + text = re.sub(smart.emdash, "--", text) # em-dash - text = re.sub(smart.osquote, "'", text) # open single quote - text = re.sub(smart.csquote, "'", text) # close single quote + text = re.sub(smart.osquote, "'", text) # open secondary quote + text = re.sub(smart.csquote, "'", text) # close secondary quote - text = re.sub(smart.opquote, '"', text) # open double quote - text = re.sub(smart.cpquote, '"', text) # close double quote + text = re.sub(smart.opquote, '"', text) # open primary quote + text = re.sub(smart.cpquote, '"', text) # close primary quote - text = re.sub(smart.ellipsis, '...', text)# ellipsis + text = re.sub(smart.ellipsis, '...', text) # ellipsis return text diff --git a/docutils/test/test_transforms/test_smartquotes.py b/docutils/test/test_transforms/test_smartquotes.py index fe8a24c32..4e17c8b89 100644 --- a/docutils/test/test_transforms/test_smartquotes.py +++ b/docutils/test/test_transforms/test_smartquotes.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # -*- coding: utf-8 -*- # $Id$ # @@ -18,7 +18,9 @@ Test module for universal.SmartQuotes transform. """ from __future__ import absolute_import -from . import DocutilsTestSupport # must be imported before docutils +if __name__ == '__main__': + import __init__ +from test_transforms import DocutilsTestSupport # must be imported before docutils from docutils.transforms.universal import SmartQuotes from docutils.parsers.rst import Parser @@ -26,7 +28,8 @@ from docutils.parsers.rst import Parser def suite(): parser = Parser() settings = {'smart_quotes': True, - 'trim_footnote_ref_space': True} + 'trim_footnote_ref_space': True, + 'report': 2} # TODO: why is this ignored when running as main? 
s = DocutilsTestSupport.TransformTestSuite( parser, suite_settings=settings) s.generateTests(totest) @@ -111,10 +114,11 @@ u"""\ f'(x) = df(x)/dx """], [u"""\ -Quotes preceded by -a word"a" and'a', -punctuation:"a",'a', +Closing quotes, if preceded by +wor"d char's +or punctuation:"a",'a';'a' (TODO: opening quotes if followed by word-char?). +Opening quotes after normal space "a" 'a', thin space "a" 'a', em space "a" 'a', @@ -123,17 +127,25 @@ ZWSP\u200B"a" and\u200B'a', ZWNJ\u200C"a" and\u200C'a', escaped space\\ "a" and\\ 'a', +hyphen -"a", -'a' —"a",—'a' -en dash–"a"–'a', -em dash—"a"—'a'. +en dash –"a"–'a', +em dash —"a"—'a'. + +opening brackets ("a") ('a') ["a"] ['a'] {"a"} {'a'} + +But not if followed by (optional punctuation and) whitespace: +"-", "–", "—", "(", "a[", "{" +'-', '–', '—', '((', '[', '{' """, u"""\ <document source="test data"> <paragraph> - Quotes preceded by - a word”a” and’a’, - punctuation:”a”,’a’, + Closing quotes, if preceded by + wor”d char’s + or punctuation:”a”,’a’;’a’ (TODO: opening quotes if followed by word-char?). <paragraph> + Opening quotes after normal space “a” ‘a’, thin space “a” ‘a’, em space “a” ‘a’, @@ -142,9 +154,16 @@ u"""\ ZWNJ\u200C“a” and\u200C‘a’, escaped space“a” and‘a’, <paragraph> + hyphen -“a”, -‘a’ —“a”,—‘a’ - en dash–“a”–‘a’, - em dash—“a”—‘a’. + en dash –“a”–‘a’, + em dash —“a”—‘a’. + <paragraph> + opening brackets (“a”) (‘a’) [“a”] [‘a’] {“a”} {‘a’} + <paragraph> + But not if followed by (optional punctuation and) whitespace: + “-”, “–”, “—”, “(”, “a[”, “{” + ‘-’, ‘–’, ‘—’, ‘((’, ‘[’, ‘{’ """], ["""\ Quotes and inline-elements: |
