diff options
Diffstat (limited to 'Objects/unicodeobject.c')
| -rw-r--r-- | Objects/unicodeobject.c | 259 |
1 files changed, 178 insertions, 81 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 168f9f9923..4fd0430631 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -42,6 +42,7 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. #include "Python.h" #include "ucnhash.h" #include "bytes_methods.h" +#include "stringlib/eq.h" #ifdef MS_WINDOWS #include <windows.h> @@ -292,6 +293,37 @@ static unsigned char ascii_linebreak[] = { #include "clinic/unicodeobject.c.h" +typedef enum { + _Py_ERROR_UNKNOWN=0, + _Py_ERROR_STRICT, + _Py_ERROR_SURROGATEESCAPE, + _Py_ERROR_SURROGATEPASS, + _Py_ERROR_REPLACE, + _Py_ERROR_IGNORE, + _Py_ERROR_XMLCHARREFREPLACE, + _Py_ERROR_OTHER +} _Py_error_handler; + +static _Py_error_handler +get_error_handler(const char *errors) +{ + if (errors == NULL) + return _Py_ERROR_STRICT; + if (strcmp(errors, "strict") == 0) + return _Py_ERROR_STRICT; + if (strcmp(errors, "surrogateescape") == 0) + return _Py_ERROR_SURROGATEESCAPE; + if (strcmp(errors, "surrogatepass") == 0) + return _Py_ERROR_SURROGATEPASS; + if (strcmp(errors, "ignore") == 0) + return _Py_ERROR_IGNORE; + if (strcmp(errors, "replace") == 0) + return _Py_ERROR_REPLACE; + if (strcmp(errors, "xmlcharrefreplace") == 0) + return _Py_ERROR_XMLCHARREFREPLACE; + return _Py_ERROR_OTHER; +} + /* The max unicode value is always 0x10FFFF while using the PEP-393 API. This function is kept for backward compatibility with the old API. */ Py_UNICODE @@ -3162,24 +3194,22 @@ wcstombs_errorpos(const wchar_t *wstr) static int locale_error_handler(const char *errors, int *surrogateescape) { - if (errors == NULL) { - *surrogateescape = 0; - return 0; - } - - if (strcmp(errors, "strict") == 0) { + _Py_error_handler error_handler = get_error_handler(errors); + switch (error_handler) + { + case _Py_ERROR_STRICT: *surrogateescape = 0; return 0; - } - if (strcmp(errors, "surrogateescape") == 0) { + case _Py_ERROR_SURROGATEESCAPE: *surrogateescape = 1; return 0; + default: + PyErr_Format(PyExc_ValueError, + "only 'strict' and 'surrogateescape' error handlers " + "are supported, not '%s'", + errors); + return -1; } - PyErr_Format(PyExc_ValueError, - "only 'strict' and 'surrogateescape' error handlers " - "are supported, not '%s'", - errors); - return -1; } PyObject * @@ -6390,7 +6420,7 @@ unicode_encode_call_errorhandler(const char *errors, static PyObject * unicode_encode_ucs1(PyObject *unicode, const char *errors, - unsigned int limit) + const Py_UCS4 limit) { /* input state */ Py_ssize_t pos=0, size; @@ -6404,11 +6434,9 @@ unicode_encode_ucs1(PyObject *unicode, Py_ssize_t ressize; const char *encoding = (limit == 256) ? "latin-1" : "ascii"; const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)"; - PyObject *errorHandler = NULL; + PyObject *error_handler_obj = NULL; PyObject *exc = NULL; - /* the following variable is used for caching string comparisons - * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ - int known_errorHandler = -1; + _Py_error_handler error_handler = _Py_ERROR_UNKNOWN; if (PyUnicode_READY(unicode) == -1) return NULL; @@ -6426,12 +6454,12 @@ unicode_encode_ucs1(PyObject *unicode, ressize = size; while (pos < size) { - Py_UCS4 c = PyUnicode_READ(kind, data, pos); + Py_UCS4 ch = PyUnicode_READ(kind, data, pos); /* can we encode this? */ - if (c<limit) { + if (ch < limit) { /* no overflow check, because we know that the space is enough */ - *str++ = (char)c; + *str++ = (char)ch; ++pos; } else { @@ -6442,38 +6470,35 @@ unicode_encode_ucs1(PyObject *unicode, Py_ssize_t collstart = pos; Py_ssize_t collend = pos; /* find all unecodable characters */ + while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit)) ++collend; + /* cache callback name lookup (if not done yet, i.e. it's the first error) */ - if (known_errorHandler==-1) { - if ((errors==NULL) || (!strcmp(errors, "strict"))) - known_errorHandler = 1; - else if (!strcmp(errors, "replace")) - known_errorHandler = 2; - else if (!strcmp(errors, "ignore")) - known_errorHandler = 3; - else if (!strcmp(errors, "xmlcharrefreplace")) - known_errorHandler = 4; - else - known_errorHandler = 0; - } - switch (known_errorHandler) { - case 1: /* strict */ + if (error_handler == _Py_ERROR_UNKNOWN) + error_handler = get_error_handler(errors); + + switch (error_handler) { + case _Py_ERROR_STRICT: raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason); goto onError; - case 2: /* replace */ - while (collstart++ < collend) - *str++ = '?'; /* fall through */ - case 3: /* ignore */ + + case _Py_ERROR_REPLACE: + memset(str, '?', collend - collstart); + str += (collend - collstart); + /* fall through ignore error handler */ + case _Py_ERROR_IGNORE: pos = collend; break; - case 4: /* xmlcharrefreplace */ + + case _Py_ERROR_XMLCHARREFREPLACE: respos = str - PyBytes_AS_STRING(res); requiredsize = respos; /* determine replacement size */ for (i = collstart; i < collend; ++i) { - Py_UCS4 ch = PyUnicode_READ(kind, data, i); Py_ssize_t incr; + + ch = PyUnicode_READ(kind, data, i); if (ch < 10) incr = 2+1+1; else if (ch < 100) @@ -6511,13 +6536,31 @@ unicode_encode_ucs1(PyObject *unicode, } pos = collend; break; + + case _Py_ERROR_SURROGATEESCAPE: + for (i = collstart; i < collend; ++i) { + ch = PyUnicode_READ(kind, data, i); + if (ch < 0xdc80 || 0xdcff < ch) { + /* Not a UTF-8b surrogate */ + break; + } + *str++ = (char)(ch - 0xdc00); + ++pos; + } + if (i >= collend) + break; + collstart = pos; + assert(collstart != collend); + /* fallback to general error handling */ + default: - repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, + repunicode = unicode_encode_call_errorhandler(errors, &error_handler_obj, encoding, reason, unicode, &exc, collstart, collend, &newpos); if (repunicode == NULL || (PyUnicode_Check(repunicode) && PyUnicode_READY(repunicode) == -1)) goto onError; + if (PyBytes_Check(repunicode)) { /* Directly copy bytes result to output. */ repsize = PyBytes_Size(repunicode); @@ -6541,6 +6584,7 @@ unicode_encode_ucs1(PyObject *unicode, Py_DECREF(repunicode); break; } + /* need more space? (at least enough for what we have+the replacement+the rest of the string, so we won't have to check space for encodable characters) */ @@ -6563,17 +6607,18 @@ unicode_encode_ucs1(PyObject *unicode, str = PyBytes_AS_STRING(res) + respos; ressize = requiredsize; } + /* check if there is anything unencodable in the replacement and copy it to the output */ for (i = 0; repsize-->0; ++i, ++str) { - c = PyUnicode_READ_CHAR(repunicode, i); - if (c >= limit) { + ch = PyUnicode_READ_CHAR(repunicode, i); + if (ch >= limit) { raise_encode_exception(&exc, encoding, unicode, pos, pos+1, reason); Py_DECREF(repunicode); goto onError; } - *str = (char)c; + *str = (char)ch; } pos = newpos; Py_DECREF(repunicode); @@ -6588,7 +6633,7 @@ unicode_encode_ucs1(PyObject *unicode, goto onError; } - Py_XDECREF(errorHandler); + Py_XDECREF(error_handler_obj); Py_XDECREF(exc); return res; @@ -6598,7 +6643,7 @@ unicode_encode_ucs1(PyObject *unicode, onError: Py_XDECREF(res); - Py_XDECREF(errorHandler); + Py_XDECREF(error_handler_obj); Py_XDECREF(exc); return NULL; } @@ -6658,8 +6703,9 @@ PyUnicode_DecodeASCII(const char *s, Py_ssize_t endinpos; Py_ssize_t outpos; const char *e; - PyObject *errorHandler = NULL; + PyObject *error_handler_obj = NULL; PyObject *exc = NULL; + _Py_error_handler error_handler = _Py_ERROR_UNKNOWN; if (size == 0) _Py_RETURN_UNICODE_EMPTY(); @@ -6688,12 +6734,42 @@ PyUnicode_DecodeASCII(const char *s, PyUnicode_WRITE(kind, data, writer.pos, c); writer.pos++; ++s; + continue; } - else { + + /* byte outsize range 0x00..0x7f: call the error handler */ + + if (error_handler == _Py_ERROR_UNKNOWN) + error_handler = get_error_handler(errors); + + switch (error_handler) + { + case _Py_ERROR_REPLACE: + case _Py_ERROR_SURROGATEESCAPE: + /* Fast-path: the error handler only writes one character, + but we may switch to UCS2 at the first write */ + if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0) + goto onError; + kind = writer.kind; + data = writer.data; + + if (error_handler == _Py_ERROR_REPLACE) + PyUnicode_WRITE(kind, data, writer.pos, 0xfffd); + else + PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00); + writer.pos++; + ++s; + break; + + case _Py_ERROR_IGNORE: + ++s; + break; + + default: startinpos = s-starts; endinpos = startinpos + 1; if (unicode_decode_call_errorhandler_writer( - errors, &errorHandler, + errors, &error_handler_obj, "ascii", "ordinal not in range(128)", &starts, &e, &startinpos, &endinpos, &exc, &s, &writer)) @@ -6702,13 +6778,13 @@ PyUnicode_DecodeASCII(const char *s, data = writer.data; } } - Py_XDECREF(errorHandler); + Py_XDECREF(error_handler_obj); Py_XDECREF(exc); return _PyUnicodeWriter_Finish(&writer); onError: _PyUnicodeWriter_Dealloc(&writer); - Py_XDECREF(errorHandler); + Py_XDECREF(error_handler_obj); Py_XDECREF(exc); return NULL; } @@ -8074,7 +8150,7 @@ static int charmap_encoding_error( PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping, PyObject **exceptionObject, - int *known_errorHandler, PyObject **errorHandler, const char *errors, + _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors, PyObject **res, Py_ssize_t *respos) { PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ @@ -8121,23 +8197,15 @@ charmap_encoding_error( } /* cache callback name lookup * (if not done yet, i.e. it's the first error) */ - if (*known_errorHandler==-1) { - if ((errors==NULL) || (!strcmp(errors, "strict"))) - *known_errorHandler = 1; - else if (!strcmp(errors, "replace")) - *known_errorHandler = 2; - else if (!strcmp(errors, "ignore")) - *known_errorHandler = 3; - else if (!strcmp(errors, "xmlcharrefreplace")) - *known_errorHandler = 4; - else - *known_errorHandler = 0; - } - switch (*known_errorHandler) { - case 1: /* strict */ + if (*error_handler == _Py_ERROR_UNKNOWN) + *error_handler = get_error_handler(errors); + + switch (*error_handler) { + case _Py_ERROR_STRICT: raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); return -1; - case 2: /* replace */ + + case _Py_ERROR_REPLACE: for (collpos = collstartpos; collpos<collendpos; ++collpos) { x = charmapencode_output('?', mapping, res, respos); if (x==enc_EXCEPTION) { @@ -8149,10 +8217,11 @@ charmap_encoding_error( } } /* fall through */ - case 3: /* ignore */ + case _Py_ERROR_IGNORE: *inpos = collendpos; break; - case 4: /* xmlcharrefreplace */ + + case _Py_ERROR_XMLCHARREFREPLACE: /* generate replacement (temporarily (mis)uses p) */ for (collpos = collstartpos; collpos < collendpos; ++collpos) { char buffer[2+29+1+1]; @@ -8170,8 +8239,9 @@ charmap_encoding_error( } *inpos = collendpos; break; + default: - repunicode = unicode_encode_call_errorhandler(errors, errorHandler, + repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj, encoding, reason, unicode, exceptionObject, collstartpos, collendpos, &newpos); if (repunicode == NULL) @@ -8234,12 +8304,9 @@ _PyUnicode_EncodeCharmap(PyObject *unicode, Py_ssize_t size; /* current output position */ Py_ssize_t respos = 0; - PyObject *errorHandler = NULL; + PyObject *error_handler_obj = NULL; PyObject *exc = NULL; - /* the following variable is used for caching string comparisons - * -1=not initialized, 0=unknown, 1=strict, 2=replace, - * 3=ignore, 4=xmlcharrefreplace */ - int known_errorHandler = -1; + _Py_error_handler error_handler = _Py_ERROR_UNKNOWN; void *data; int kind; @@ -8270,7 +8337,7 @@ _PyUnicode_EncodeCharmap(PyObject *unicode, if (x==enc_FAILED) { /* unencodable character */ if (charmap_encoding_error(unicode, &inpos, mapping, &exc, - &known_errorHandler, &errorHandler, errors, + &error_handler, &error_handler_obj, errors, &res, &respos)) { goto onError; } @@ -8286,13 +8353,13 @@ _PyUnicode_EncodeCharmap(PyObject *unicode, goto onError; Py_XDECREF(exc); - Py_XDECREF(errorHandler); + Py_XDECREF(error_handler_obj); return res; onError: Py_XDECREF(res); Py_XDECREF(exc); - Py_XDECREF(errorHandler); + Py_XDECREF(error_handler_obj); return NULL; } @@ -8618,7 +8685,7 @@ exit: return res; } -PyObject * +static PyObject * _PyUnicode_TranslateCharmap(PyObject *input, PyObject *mapping, const char *errors) @@ -10889,6 +10956,12 @@ PyUnicode_RichCompare(PyObject *left, PyObject *right, int op) } int +_PyUnicode_EQ(PyObject *aa, PyObject *bb) +{ + return unicode_eq(aa, bb); +} + +int PyUnicode_Contains(PyObject *container, PyObject *element) { PyObject *str, *sub; @@ -13258,7 +13331,9 @@ _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer, Py_ssize_t newlen; PyObject *newbuffer; - assert(length > 0); + /* ensure that the _PyUnicodeWriter_Prepare macro was used */ + assert((maxchar > writer->maxchar && length >= 0) + || length > 0); if (length > PY_SSIZE_T_MAX - writer->pos) { PyErr_NoMemory(); @@ -13325,6 +13400,28 @@ _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer, #undef OVERALLOCATE_FACTOR } +int +_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer, + enum PyUnicode_Kind kind) +{ + Py_UCS4 maxchar; + + /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */ + assert(writer->kind < kind); + + switch (kind) + { + case PyUnicode_1BYTE_KIND: maxchar = 0xff; break; + case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break; + case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break; + default: + assert(0 && "invalid kind"); + return -1; + } + + return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar); +} + Py_LOCAL_INLINE(int) _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch) { |
