summary | refs | log | tree | commit | diff
path: root/Lib/textwrap.py
diff options
context:
space:
mode:
author Serhiy Storchaka <storchaka@gmail.com> 2016-10-25 14:44:54 +0300
committer Serhiy Storchaka <storchaka@gmail.com> 2016-10-25 14:44:54 +0300
commit f3ebc9fe3fae3d44da4d0da9764ed7c033115f12 (patch)
tree 29214aa4fce9cf570c160441a43de526f7457eb5 /Lib/textwrap.py
parent 42bababba62023383291c7413a5d453374ecd933 (diff)
download cpython-git-f3ebc9fe3fae3d44da4d0da9764ed7c033115f12.tar.gz
Issue #20491: The textwrap.TextWrapper class now honors non-breaking spaces.
Based on patch by Kaarle Ritvanen.
Diffstat (limited to 'Lib/textwrap.py')
-rw-r--r-- Lib/textwrap.py 27
1 file changed, 13 insertions, 14 deletions
diff --git a/Lib/textwrap.py b/Lib/textwrap.py
index 05e030673a..0c18dc582e 100644
--- a/Lib/textwrap.py
+++ b/Lib/textwrap.py
@@ -10,13 +10,8 @@ import re
__all__ = ['TextWrapper', 'wrap', 'fill', 'dedent', 'indent', 'shorten']
# Hardcode the recognized whitespace characters to the US-ASCII
-# whitespace characters. The main reason for doing this is that in
-# ISO-8859-1, 0xa0 is non-breaking whitespace, so in certain locales
-# that character winds up in string.whitespace. Respecting
-# string.whitespace in those cases would 1) make textwrap treat 0xa0 the
-# same as any other whitespace char, which is clearly wrong (it's a
-# *non-breaking* space), 2) possibly cause problems with Unicode,
-# since 0xa0 is not in range(128).
+# whitespace characters. The main reason for doing this is that
+# some Unicode spaces (like \u00a0) are non-breaking whitespaces.
_whitespace = '\t\n\x0b\x0c\r '
class TextWrapper:
@@ -81,29 +76,34 @@ class TextWrapper:
# (after stripping out empty strings).
word_punct = r'[\w!"\'&.,?]'
letter = r'[^\d\W]'
+ whitespace = r'[%s]' % re.escape(_whitespace)
+ nowhitespace = '[^' + whitespace[1:]
wordsep_re = re.compile(r'''
( # any whitespace
- \s+
+ %(ws)s+
| # em-dash between words
(?<=%(wp)s) -{2,} (?=\w)
| # word, possibly hyphenated
- \S+? (?:
+ %(nws)s+? (?:
# hyphenated word
-(?: (?<=%(lt)s{2}-) | (?<=%(lt)s-%(lt)s-))
(?= %(lt)s -? %(lt)s)
| # end of word
- (?=\s|\Z)
+ (?=%(ws)s|\Z)
| # em-dash
(?<=%(wp)s) (?=-{2,}\w)
)
- )''' % {'wp': word_punct, 'lt': letter}, re.VERBOSE)
- del word_punct, letter
+ )''' % {'wp': word_punct, 'lt': letter,
+ 'ws': whitespace, 'nws': nowhitespace},
+ re.VERBOSE)
+ del word_punct, letter, nowhitespace
# This less funky little regex just split on recognized spaces. E.g.
# "Hello there -- you goof-ball, use the -b option!"
# splits into
# Hello/ /there/ /--/ /you/ /goof-ball,/ /use/ /the/ /-b/ /option!/
- wordsep_simple_re = re.compile(r'(\s+)')
+ wordsep_simple_re = re.compile(r'(%s+)' % whitespace)
+ del whitespace
# XXX this is not locale- or charset-aware -- string.lowercase
# is US-ASCII only (and therefore English-only)
@@ -112,7 +112,6 @@ class TextWrapper:
r'[\"\']?' # optional end-of-quote
r'\Z') # end of chunk
-
def __init__(self,
width=70,
initial_indent="",