summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Lib/email/_header_value_parser.py21
-rw-r--r--Lib/test/test_email/test__header_value_parser.py24
-rw-r--r--Lib/test/test_email/test_headerregistry.py3
-rw-r--r--Misc/NEWS.d/next/Library/2019-05-19-10-48-46.bpo-21315.PgXVqF.rst4
4 files changed, 49 insertions, 3 deletions
diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py
index 958ef5018c..18aecbffa7 100644
--- a/Lib/email/_header_value_parser.py
+++ b/Lib/email/_header_value_parser.py
@@ -97,6 +97,18 @@ EXTENDED_ATTRIBUTE_ENDS = ATTRIBUTE_ENDS - set('%')
def quote_string(value):
return '"'+str(value).replace('\\', '\\\\').replace('"', r'\"')+'"'
+# Match an RFC 2047 encoded word, which looks like =?utf-8?q?someword?=
+rfc2047_matcher = re.compile(r'''
+ =\? # literal =?
+ [^?]* # charset
+ \? # literal ?
+ [qQbB] # literal 'q' or 'b', case insensitive
+ \? # literal ?
+ .*? # encoded word
+ \?= # literal ?=
+''', re.VERBOSE | re.MULTILINE)
+
+
#
# TokenList and its subclasses
#
@@ -1050,6 +1062,10 @@ def get_encoded_word(value):
_validate_xtext(vtext)
ew.append(vtext)
text = ''.join(remainder)
+ # Encoded words should be followed by whitespace (WSP)
+ if value and value[0] not in WSP:
+ ew.defects.append(errors.InvalidHeaderDefect(
+ "missing trailing whitespace after encoded-word"))
return ew, value
def get_unstructured(value):
@@ -1102,6 +1118,11 @@ def get_unstructured(value):
unstructured.append(token)
continue
tok, *remainder = _wsp_splitter(value, 1)
+ # Split in the middle of an atom if there is an RFC 2047 encoded word
+ # which does not have WSP on both sides. The defect will be registered
+ # the next time through the loop.
+ if rfc2047_matcher.search(tok):
+ tok, *remainder = value.partition('=?')
vtext = ValueTerminal(tok, 'vtext')
_validate_xtext(vtext)
unstructured.append(vtext)
diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py
index 676732bb3d..693487bc96 100644
--- a/Lib/test/test_email/test__header_value_parser.py
+++ b/Lib/test/test_email/test__header_value_parser.py
@@ -118,7 +118,7 @@ class TestParser(TestParserMixin, TestEmailBase):
'=?us-ascii?q?first?==?utf-8?q?second?=',
'first',
'first',
- [],
+ [errors.InvalidHeaderDefect],
'=?utf-8?q?second?=')
def test_get_encoded_word_sets_extra_attributes(self):
@@ -361,6 +361,25 @@ class TestParser(TestParserMixin, TestEmailBase):
'=?utf-8?q?foo?==?utf-8?q?bar?=',
'foobar',
'foobar',
+ [errors.InvalidHeaderDefect,
+ errors.InvalidHeaderDefect],
+ '')
+
+ def test_get_unstructured_ew_without_leading_whitespace(self):
+ self._test_get_x(
+ self._get_unst,
+ 'nowhitespace=?utf-8?q?somevalue?=',
+ 'nowhitespacesomevalue',
+ 'nowhitespacesomevalue',
+ [errors.InvalidHeaderDefect],
+ '')
+
+ def test_get_unstructured_ew_without_trailing_whitespace(self):
+ self._test_get_x(
+ self._get_unst,
+ '=?utf-8?q?somevalue?=nowhitespace',
+ 'somevaluenowhitespace',
+ 'somevaluenowhitespace',
[errors.InvalidHeaderDefect],
'')
@@ -546,7 +565,8 @@ class TestParser(TestParserMixin, TestEmailBase):
'"=?utf-8?Q?not_really_valid?="',
'"not really valid"',
'not really valid',
- [errors.InvalidHeaderDefect],
+ [errors.InvalidHeaderDefect,
+ errors.InvalidHeaderDefect],
'')
# get_comment
diff --git a/Lib/test/test_email/test_headerregistry.py b/Lib/test/test_email/test_headerregistry.py
index d1007099f6..e6db3acedc 100644
--- a/Lib/test/test_email/test_headerregistry.py
+++ b/Lib/test/test_email/test_headerregistry.py
@@ -1180,7 +1180,8 @@ class TestAddressHeader(TestHeaderBase):
'rfc2047_atom_in_quoted_string_is_decoded':
('"=?utf-8?q?=C3=89ric?=" <foo@example.com>',
- [errors.InvalidHeaderDefect],
+ [errors.InvalidHeaderDefect,
+ errors.InvalidHeaderDefect],
'Éric <foo@example.com>',
'Éric',
'foo@example.com',
diff --git a/Misc/NEWS.d/next/Library/2019-05-19-10-48-46.bpo-21315.PgXVqF.rst b/Misc/NEWS.d/next/Library/2019-05-19-10-48-46.bpo-21315.PgXVqF.rst
new file mode 100644
index 0000000000..dd0dd7f72c
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2019-05-19-10-48-46.bpo-21315.PgXVqF.rst
@@ -0,0 +1,4 @@
+Email headers containing RFC 2047 encoded words are now parsed even when the
+surrounding whitespace is missing, and a defect is registered. Missing
+trailing whitespace after an encoded word is also now registered as a defect.
+