summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Doc/library/codecs.rst12
-rw-r--r--Lib/test/test_bytes.py4
-rw-r--r--Lib/test/test_codecs.py15
-rw-r--r--Lib/test/test_unicode.py6
-rw-r--r--Lib/test/test_unicodedata.py3
-rw-r--r--Misc/NEWS2
-rw-r--r--Objects/unicodeobject.c83
-rw-r--r--Python/codecs.c92
-rw-r--r--Python/marshal.c6
9 files changed, 202 insertions, 21 deletions
diff --git a/Doc/library/codecs.rst b/Doc/library/codecs.rst
index 4b6c7e5024..ab578ea281 100644
--- a/Doc/library/codecs.rst
+++ b/Doc/library/codecs.rst
@@ -323,6 +323,18 @@ and implemented by all standard Python codecs:
| | (only for encoding). |
+-------------------------+-----------------------------------------------+
+In addition, the following error handlers are specific to a single codec:
+
++------------------+---------+--------------------------------------------+
+| Value | Codec | Meaning |
++==================+=========+============================================+
+| ``'surrogates'`` | utf-8 | Allow encoding and decoding of surrogate |
+| | | codes in UTF-8. |
++------------------+---------+--------------------------------------------+
+
+.. versionadded:: 3.1
+ The ``'surrogates'`` error handler.
+
The set of allowed values can be extended via :meth:`register_error`.
diff --git a/Lib/test/test_bytes.py b/Lib/test/test_bytes.py
index a3ea40aa50..992f3d2eec 100644
--- a/Lib/test/test_bytes.py
+++ b/Lib/test/test_bytes.py
@@ -169,13 +169,13 @@ class BaseBytesTest(unittest.TestCase):
self.assertEqual(b[start:stop:step], self.type2test(L[start:stop:step]))
def test_encoding(self):
- sample = "Hello world\n\u1234\u5678\u9abc\udef0"
+ sample = "Hello world\n\u1234\u5678\u9abc"
for enc in ("utf8", "utf16"):
b = self.type2test(sample, enc)
self.assertEqual(b, self.type2test(sample.encode(enc)))
self.assertRaises(UnicodeEncodeError, self.type2test, sample, "latin1")
b = self.type2test(sample, "latin1", "ignore")
- self.assertEqual(b, self.type2test(sample[:-4], "utf-8"))
+ self.assertEqual(b, self.type2test(sample[:-3], "utf-8"))
def test_decode(self):
sample = "Hello world\n\u1234\u5678\u9abc\def0\def0"
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
index 1730dbe593..6706507335 100644
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -541,6 +541,17 @@ class UTF8Test(ReadTest):
self.check_state_handling_decode(self.encoding,
u, u.encode(self.encoding))
+def test_lone_surrogates(self):
+# Without an explicit error handler the utf-8 codec must refuse a lone
+# surrogate in both directions: U+D800 cannot be encoded, and its
+# three-byte form ED A0 80 cannot be decoded.
+self.assertRaises(UnicodeEncodeError, "\ud800".encode, "utf-8")
+self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "utf-8")
+
+def test_surrogates_handler(self):
+    # With the 'surrogates' error handler a lone surrogate round-trips
+    # through UTF-8 as its three-byte form (U+D800 <-> ED A0 80), and the
+    # surrounding ASCII text is left untouched.
+    self.assertEqual("abc\ud800def".encode("utf-8", "surrogates"),
+                     b"abc\xed\xa0\x80def")
+    self.assertEqual(b"abc\xed\xa0\x80def".decode("utf-8", "surrogates"),
+                     "abc\ud800def")
+    # The handler must also be reachable by name through the registry.
+    self.assertTrue(codecs.lookup_error("surrogates"))
+
class UTF7Test(ReadTest):
encoding = "utf-7"
@@ -1023,12 +1034,12 @@ class NameprepTest(unittest.TestCase):
# Skipped
continue
# The Unicode strings are given in UTF-8
- orig = str(orig, "utf-8")
+ orig = str(orig, "utf-8", "surrogates")
if prepped is None:
# Input contains prohibited characters
self.assertRaises(UnicodeError, nameprep, orig)
else:
- prepped = str(prepped, "utf-8")
+ prepped = str(prepped, "utf-8", "surrogates")
try:
self.assertEquals(nameprep(orig), prepped)
except Exception as e:
diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py
index 1fddc06c62..220a8eb26e 100644
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@@ -886,10 +886,10 @@ class UnicodeTest(
self.assertEqual('\u20ac'.encode('utf-8'), b'\xe2\x82\xac')
self.assertEqual('\ud800\udc02'.encode('utf-8'), b'\xf0\x90\x80\x82')
self.assertEqual('\ud84d\udc56'.encode('utf-8'), b'\xf0\xa3\x91\x96')
- self.assertEqual('\ud800'.encode('utf-8'), b'\xed\xa0\x80')
- self.assertEqual('\udc00'.encode('utf-8'), b'\xed\xb0\x80')
+ self.assertEqual('\ud800'.encode('utf-8', 'surrogates'), b'\xed\xa0\x80')
+ self.assertEqual('\udc00'.encode('utf-8', 'surrogates'), b'\xed\xb0\x80')
self.assertEqual(
- ('\ud800\udc02'*1000).encode('utf-8'),
+ ('\ud800\udc02'*1000).encode('utf-8', 'surrogates'),
b'\xf0\x90\x80\x82'*1000
)
self.assertEqual(
diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py
index aed8eaa0fe..b84aaaf5ec 100644
--- a/Lib/test/test_unicodedata.py
+++ b/Lib/test/test_unicodedata.py
@@ -13,6 +13,7 @@ import subprocess
import test.support
encoding = 'utf-8'
+errors = 'surrogates'
### Run tests
@@ -61,7 +62,7 @@ class UnicodeMethodsTest(unittest.TestCase):
(char + 'ABC').title(),
]
- h.update(''.join(data).encode(encoding))
+ h.update(''.join(data).encode(encoding, errors))
result = h.hexdigest()
self.assertEqual(result, self.expectedchecksum)
diff --git a/Misc/NEWS b/Misc/NEWS
index 7f22b0d5b5..f4116ad8e7 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -12,6 +12,8 @@ What's New in Python 3.1 beta 1?
Core and Builtins
-----------------
+- Issue #3672: Reject surrogates in utf-8 codec; add surrogates error handler.
+
- Issue #5883: In the io module, the BufferedIOBase and TextIOBase ABCs have
received a new method, detach(). detach() disconnects the underlying stream
from the buffer or text IO and returns it.
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 68d4fc41ae..cc70bad825 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -154,6 +154,11 @@ const unsigned char _Py_ascii_whitespace[] = {
0, 0, 0, 0, 0, 0, 0, 0
};
+/* Forward declaration: PyUnicode_EncodeUTF8() calls this helper, whose
+   definition appears further down in this file. */
+static PyObject *unicode_encode_call_errorhandler(const char *errors,
+PyObject **errorHandler,const char *encoding, const char *reason,
+const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
+Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
+
/* Same for linebreaks */
static unsigned char ascii_linebreak[] = {
0, 0, 0, 0, 0, 0, 0, 0,
@@ -2214,14 +2219,7 @@ PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
goto utf8Error;
}
ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
- if (ch < 0x0800) {
- /* Note: UTF-8 encodings of surrogates are considered
- legal UTF-8 sequences;
-
- XXX For wide builds (UCS-4) we should probably try
- to recombine the surrogates into a single code
- unit.
- */
+ if (ch < 0x0800 || (ch >= 0xd800 && ch <= 0xDFFF)) {
errmsg = "illegal encoding";
startinpos = s-starts;
endinpos = startinpos+3;
@@ -2328,6 +2326,8 @@ PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Py_ssize_t nallocated; /* number of result bytes allocated */
Py_ssize_t nneeded; /* number of result bytes needed */
char stackbuf[MAX_SHORT_UNICHARS * 4];
+ PyObject *errorHandler = NULL;
+ PyObject *exc = NULL;
assert(s != NULL);
assert(size >= 0);
@@ -2367,6 +2367,7 @@ PyUnicode_EncodeUTF8(const Py_UNICODE *s,
else {
/* Encode UCS2 Unicode ordinals */
if (ch < 0x10000) {
+#ifndef Py_UNICODE_WIDE
/* Special case: check for high surrogate */
if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
Py_UCS4 ch2 = s[i];
@@ -2379,6 +2380,36 @@ PyUnicode_EncodeUTF8(const Py_UNICODE *s,
}
/* Fall through: handles isolated high surrogates */
}
+#endif
+ if (ch >= 0xd800 && ch <= 0xdfff) {
+ Py_ssize_t newpos;
+ PyObject *rep;
+ char *prep;
+ int k;
+ rep = unicode_encode_call_errorhandler
+ (errors, &errorHandler, "utf-8", "surrogates not allowed",
+ s, size, &exc, i-1, i, &newpos);
+ if (!rep)
+ goto error;
+ /* Implementation limitations: only support error handler that return
+ bytes, and only support up to four replacement bytes. */
+ if (!PyBytes_Check(rep)) {
+ PyErr_SetString(PyExc_TypeError, "error handler should have returned bytes");
+ Py_DECREF(rep);
+ goto error;
+ }
+ if (PyBytes_Size(rep) > 4) {
+ PyErr_SetString(PyExc_TypeError, "error handler returned too many bytes");
+ Py_DECREF(rep);
+ goto error;
+ }
+ prep = PyBytes_AsString(rep);
+ for(k = PyBytes_Size(rep); k > 0; k--)
+ *p++ = *prep++;
+ Py_DECREF(rep);
+ continue;
+
+ }
*p++ = (char)(0xe0 | (ch >> 12));
*p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
*p++ = (char)(0x80 | (ch & 0x3f));
@@ -2405,7 +2436,14 @@ PyUnicode_EncodeUTF8(const Py_UNICODE *s,
assert(nneeded <= nallocated);
_PyBytes_Resize(&result, nneeded);
}
+ Py_XDECREF(errorHandler);
+ Py_XDECREF(exc);
return result;
+ error:
+ Py_XDECREF(errorHandler);
+ Py_XDECREF(exc);
+ Py_XDECREF(result);
+ return NULL;
#undef MAX_SHORT_UNICHARS
}
@@ -3897,7 +3935,7 @@ static PyObject *unicode_encode_call_errorhandler(const char *errors,
Py_ssize_t startpos, Py_ssize_t endpos,
Py_ssize_t *newpos)
{
- static char *argparse = "O!n;encoding error handler must return (str, int) tuple";
+ static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
PyObject *restuple;
PyObject *resunicode;
@@ -3918,15 +3956,20 @@ static PyObject *unicode_encode_call_errorhandler(const char *errors,
if (restuple == NULL)
return NULL;
if (!PyTuple_Check(restuple)) {
- PyErr_SetString(PyExc_TypeError, &argparse[4]);
+ PyErr_SetString(PyExc_TypeError, &argparse[3]);
Py_DECREF(restuple);
return NULL;
}
- if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
+ if (!PyArg_ParseTuple(restuple, argparse,
&resunicode, newpos)) {
Py_DECREF(restuple);
return NULL;
}
+ if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
+ PyErr_SetString(PyExc_TypeError, &argparse[3]);
+ Py_DECREF(restuple);
+ return NULL;
+ }
if (*newpos<0)
*newpos = size+*newpos;
if (*newpos<0 || *newpos>size) {
@@ -4064,6 +4107,12 @@ static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
collstart-startp, collend-startp, &newpos);
if (repunicode == NULL)
goto onError;
+ if (!PyUnicode_Check(repunicode)) {
+ /* Implementation limitation: byte results not supported yet. */
+ PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
+ Py_DECREF(repunicode);
+ goto onError;
+ }
/* need more space? (at least enough for what we
have+the replacement+the rest of the string, so
we won't have to check space for encodable characters) */
@@ -5027,6 +5076,12 @@ int charmap_encoding_error(
collstartpos, collendpos, &newpos);
if (repunicode == NULL)
return -1;
+ if (!PyUnicode_Check(repunicode)) {
+ /* Implementation limitation: byte results not supported yet. */
+ PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
+ Py_DECREF(repunicode);
+ return -1;
+ }
/* generate replacement */
repsize = PyUnicode_GET_SIZE(repunicode);
for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
@@ -5588,6 +5643,12 @@ int PyUnicode_EncodeDecimal(Py_UNICODE *s,
collstart-s, collend-s, &newpos);
if (repunicode == NULL)
goto onError;
+ if (!PyUnicode_Check(repunicode)) {
+ /* Implementation limitation: byte results not supported yet. */
+ PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
+ Py_DECREF(repunicode);
+ goto onError;
+ }
/* generate replacement */
repsize = PyUnicode_GET_SIZE(repunicode);
for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
diff --git a/Python/codecs.c b/Python/codecs.c
index ebddc09d7b..3f1412d00c 100644
--- a/Python/codecs.c
+++ b/Python/codecs.c
@@ -748,6 +748,85 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
}
}
+/* Implementation of the "surrogates" error handler.
+
+   Encoding (exc is a UnicodeEncodeError): every code unit in [start, end)
+   of the failing string must be a surrogate (U+D800..U+DFFF).  Each one is
+   written out as its three-byte UTF-8 form and the tuple (bytes, end) is
+   returned.
+
+   Decoding (exc is a UnicodeDecodeError): if at least three bytes remain at
+   the error position and they form a three-byte sequence (1110xxxx 10xxxxxx
+   10xxxxxx) whose value is a surrogate, the tuple (one-character str,
+   start+3) is returned.
+
+   In every other encode/decode case the original exception is re-raised
+   and NULL is returned.  For any other exception type
+   wrong_exception_type() is called and NULL is returned. */
+PyObject *PyCodec_SurrogateErrors(PyObject *exc)
+{
+    PyObject *restuple;
+    PyObject *object;
+    Py_ssize_t start;
+    Py_ssize_t end;
+    PyObject *res;
+    if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
+        Py_UNICODE *p;
+        Py_UNICODE *startp;
+        char *outp;
+        if (PyUnicodeEncodeError_GetStart(exc, &start))
+            return NULL;
+        if (PyUnicodeEncodeError_GetEnd(exc, &end))
+            return NULL;
+        if (!(object = PyUnicodeEncodeError_GetObject(exc)))
+            return NULL;
+        startp = PyUnicode_AS_UNICODE(object);
+        /* Each surrogate code unit becomes exactly three bytes. */
+        res = PyBytes_FromStringAndSize(NULL, 3*(end-start));
+        if (!res) {
+            Py_DECREF(object);
+            return NULL;
+        }
+        outp = PyBytes_AsString(res);
+        for (p = startp+start; p < startp+end; p++) {
+            Py_UNICODE ch = *p;
+            if (ch < 0xd800 || ch > 0xdfff) {
+                /* Not a surrogate, fail with original exception */
+                PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
+                Py_DECREF(res);
+                Py_DECREF(object);
+                return NULL;
+            }
+            *outp++ = (char)(0xe0 | (ch >> 12));
+            *outp++ = (char)(0x80 | ((ch >> 6) & 0x3f));
+            *outp++ = (char)(0x80 | (ch & 0x3f));
+        }
+        restuple = Py_BuildValue("(On)", res, end);
+        Py_DECREF(res);
+        Py_DECREF(object);
+        return restuple;
+    }
+    else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
+        unsigned char *p;
+        Py_UNICODE ch = 0;
+        if (PyUnicodeDecodeError_GetStart(exc, &start))
+            return NULL;
+        if (!(object = PyUnicodeDecodeError_GetObject(exc)))
+            return NULL;
+        if (!(p = (unsigned char*)PyBytes_AsString(object))) {
+            Py_DECREF(object);
+            return NULL;
+        }
+        /* Try decoding a single surrogate character. If
+           there are more, let the codec call us again. */
+        p += start;
+        /* Only inspect p[0..2] when three bytes are actually available, and
+           require ALL of them to carry the expected lead/continuation bits;
+           accepting any one of them would let malformed input through as a
+           surrogate. */
+        if (PyBytes_GET_SIZE(object) - start >= 3 &&
+            (p[0] & 0xf0) == 0xe0 &&
+            (p[1] & 0xc0) == 0x80 &&
+            (p[2] & 0xc0) == 0x80) {
+            /* it's a three-byte code */
+            ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
+            if (ch < 0xd800 || ch > 0xdfff)
+                /* it's not a surrogate - fail */
+                ch = 0;
+        }
+        Py_DECREF(object);
+        /* ch == 0 doubles as the "nothing decoded" marker. */
+        if (ch == 0) {
+            PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
+            return NULL;
+        }
+        return Py_BuildValue("(u#n)", &ch, 1, start+3);
+    }
+    else {
+        wrong_exception_type(exc);
+        return NULL;
+    }
+}
+
+
static PyObject *strict_errors(PyObject *self, PyObject *exc)
{
return PyCodec_StrictErrors(exc);
@@ -777,6 +856,11 @@ static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc)
return PyCodec_BackslashReplaceErrors(exc);
}
+/* Callable wrapper for the "surrogates" error handler: ignores self and
+   forwards the exception object to PyCodec_SurrogateErrors(). */
+static PyObject *surrogates_errors(PyObject *self, PyObject *exc)
+{
+return PyCodec_SurrogateErrors(exc);
+}
+
static int _PyCodecRegistry_Init(void)
{
static struct {
@@ -823,6 +907,14 @@ static int _PyCodecRegistry_Init(void)
backslashreplace_errors,
METH_O
}
+ },
+ {
+ "surrogates",
+ {
+ "surrogates",
+ surrogates_errors,
+ METH_O
+ }
}
};
diff --git a/Python/marshal.c b/Python/marshal.c
index bf7a26b5b2..4ad873eb77 100644
--- a/Python/marshal.c
+++ b/Python/marshal.c
@@ -312,7 +312,9 @@ w_object(PyObject *v, WFILE *p)
}
else if (PyUnicode_CheckExact(v)) {
PyObject *utf8;
- utf8 = PyUnicode_AsUTF8String(v);
+ utf8 = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(v),
+ PyUnicode_GET_SIZE(v),
+ "surrogates");
if (utf8 == NULL) {
p->depth--;
p->error = WFERR_UNMARSHALLABLE;
@@ -810,7 +812,7 @@ r_object(RFILE *p)
retval = NULL;
break;
}
- v = PyUnicode_DecodeUTF8(buffer, n, NULL);
+ v = PyUnicode_DecodeUTF8(buffer, n, "surrogates");
PyMem_DEL(buffer);
retval = v;
break;