diff options
| author | Serhiy Storchaka <storchaka@gmail.com> | 2016-10-25 14:44:54 +0300 | 
|---|---|---|
| committer | Serhiy Storchaka <storchaka@gmail.com> | 2016-10-25 14:44:54 +0300 | 
| commit | f3ebc9fe3fae3d44da4d0da9764ed7c033115f12 (patch) | |
| tree | 29214aa4fce9cf570c160441a43de526f7457eb5 /Lib/textwrap.py | |
| parent | 42bababba62023383291c7413a5d453374ecd933 (diff) | |
| download | cpython-git-f3ebc9fe3fae3d44da4d0da9764ed7c033115f12.tar.gz | |
Issue #20491: The textwrap.TextWrapper class now honors non-breaking spaces.
Based on patch by Kaarle Ritvanen.
Diffstat (limited to 'Lib/textwrap.py')
| -rw-r--r-- | Lib/textwrap.py | 27 | 
1 files changed, 13 insertions, 14 deletions
```diff
diff --git a/Lib/textwrap.py b/Lib/textwrap.py
index 05e030673a..0c18dc582e 100644
--- a/Lib/textwrap.py
+++ b/Lib/textwrap.py
@@ -10,13 +10,8 @@ import re
 
 __all__ = ['TextWrapper', 'wrap', 'fill', 'dedent', 'indent', 'shorten']
 
 # Hardcode the recognized whitespace characters to the US-ASCII
-# whitespace characters.  The main reason for doing this is that in
-# ISO-8859-1, 0xa0 is non-breaking whitespace, so in certain locales
-# that character winds up in string.whitespace.  Respecting
-# string.whitespace in those cases would 1) make textwrap treat 0xa0 the
-# same as any other whitespace char, which is clearly wrong (it's a
-# *non-breaking* space), 2) possibly cause problems with Unicode,
-# since 0xa0 is not in range(128).
+# whitespace characters.  The main reason for doing this is that
+# some Unicode spaces (like \u00a0) are non-breaking whitespaces.
 _whitespace = '\t\n\x0b\x0c\r '
 
 class TextWrapper:
@@ -81,29 +76,34 @@ class TextWrapper:
     # (after stripping out empty strings).
     word_punct = r'[\w!"\'&.,?]'
     letter = r'[^\d\W]'
+    whitespace = r'[%s]' % re.escape(_whitespace)
+    nowhitespace = '[^' + whitespace[1:]
     wordsep_re = re.compile(r'''
         ( # any whitespace
-          \s+
+          %(ws)s+
         | # em-dash between words
           (?<=%(wp)s) -{2,} (?=\w)
         | # word, possibly hyphenated
-          \S+? (?:
+          %(nws)s+? (?:
             # hyphenated word
               -(?: (?<=%(lt)s{2}-) | (?<=%(lt)s-%(lt)s-))
               (?= %(lt)s -? %(lt)s)
             | # end of word
-              (?=\s|\Z)
+              (?=%(ws)s|\Z)
             | # em-dash
               (?<=%(wp)s) (?=-{2,}\w)
             )
-        )''' % {'wp': word_punct, 'lt': letter}, re.VERBOSE)
-    del word_punct, letter
+        )''' % {'wp': word_punct, 'lt': letter,
+                'ws': whitespace, 'nws': nowhitespace},
+        re.VERBOSE)
+    del word_punct, letter, nowhitespace
 
     # This less funky little regex just split on recognized spaces. E.g.
     #   "Hello there -- you goof-ball, use the -b option!"
     # splits into
     #   Hello/ /there/ /--/ /you/ /goof-ball,/ /use/ /the/ /-b/ /option!/
-    wordsep_simple_re = re.compile(r'(\s+)')
+    wordsep_simple_re = re.compile(r'(%s+)' % whitespace)
+    del whitespace
 
     # XXX this is not locale- or charset-aware -- string.lowercase
     # is US-ASCII only (and therefore English-only)
@@ -112,7 +112,6 @@ class TextWrapper:
                                  r'[\"\']?'           # optional end-of-quote
                                  r'\Z')               # end of chunk
 
-
     def __init__(self,
                  width=70,
                  initial_indent="",
```
