summaryrefslogtreecommitdiff
path: root/Objects/unicodeobject.c
diff options
context:
space:
mode:
Diffstat (limited to 'Objects/unicodeobject.c')
-rw-r--r--Objects/unicodeobject.c750
1 files changed, 609 insertions, 141 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 5f56cf7db0..9d11546cb3 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -429,6 +429,10 @@ _PyUnicode_CheckConsistency(void *op, int check_content)
}
#endif
+#ifdef HAVE_MBCS
+static OSVERSIONINFOEX winver;
+#endif
+
/* --- Bloom Filters ----------------------------------------------------- */
/* stuff to implement simple "bloom filters" for Unicode characters.
@@ -6896,130 +6900,307 @@ PyUnicode_AsASCIIString(PyObject *unicode)
#define NEED_RETRY
#endif
-/* XXX This code is limited to "true" double-byte encodings, as
- a) it assumes an incomplete character consists of a single byte, and
- b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
- encodings, see IsDBCSLeadByteEx documentation. */
+#ifndef WC_ERR_INVALID_CHARS
+# define WC_ERR_INVALID_CHARS 0x0080
+#endif
+
+static char*
+code_page_name(UINT code_page, PyObject **obj)
+{
+ *obj = NULL;
+ if (code_page == CP_ACP)
+ return "mbcs";
+ if (code_page == CP_UTF7)
+ return "CP_UTF7";
+ if (code_page == CP_UTF8)
+ return "CP_UTF8";
+
+ *obj = PyBytes_FromFormat("cp%u", code_page);
+ if (*obj == NULL)
+ return NULL;
+ return PyBytes_AS_STRING(*obj);
+}
static int
-is_dbcs_lead_byte(const char *s, int offset)
+is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
{
const char *curr = s + offset;
+ const char *prev;
- if (IsDBCSLeadByte(*curr)) {
- const char *prev = CharPrev(s, curr);
- return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2);
- }
+ if (!IsDBCSLeadByteEx(code_page, *curr))
+ return 0;
+
+ prev = CharPrevExA(code_page, s, curr, 0);
+ if (prev == curr)
+ return 1;
+ /* FIXME: This code is limited to "true" double-byte encodings,
+ as it assumes an incomplete character consists of a single
+ byte. */
+ if (curr - prev == 2)
+ return 1;
+ if (!IsDBCSLeadByteEx(code_page, *prev))
+ return 1;
return 0;
}
+static DWORD
+decode_code_page_flags(UINT code_page)
+{
+ if (code_page == CP_UTF7) {
+ /* The CP_UTF7 decoder only supports flags=0 */
+ return 0;
+ }
+ else
+ return MB_ERR_INVALID_CHARS;
+}
+
/*
- * Decode MBCS string into unicode object. If 'final' is set, converts
- * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
+ * Decode a byte string from a Windows code page into unicode object in strict
+ * mode.
+ *
+ * Returns consumed size if succeed, returns -2 on decode error, or raise a
+ * WindowsError and returns -1 on other error.
*/
static int
-decode_mbcs(PyUnicodeObject **v,
- const char *s, /* MBCS string */
- int size, /* sizeof MBCS string */
- int final,
- const char *errors)
+decode_code_page_strict(UINT code_page,
+ PyUnicodeObject **v,
+ const char *in,
+ int insize)
{
- Py_UNICODE *p;
- Py_ssize_t n;
- DWORD usize;
- DWORD flags;
+ const DWORD flags = decode_code_page_flags(code_page);
+ Py_UNICODE *out;
+ DWORD outsize;
- assert(size >= 0);
+ /* First get the size of the result */
+ assert(insize > 0);
+ outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
+ if (outsize <= 0)
+ goto error;
- /* check and handle 'errors' arg */
- if (errors==NULL || strcmp(errors, "strict")==0)
- flags = MB_ERR_INVALID_CHARS;
- else if (strcmp(errors, "ignore")==0)
- flags = 0;
+ if (*v == NULL) {
+ /* Create unicode object */
+ *v = _PyUnicode_New(outsize);
+ if (*v == NULL)
+ return -1;
+ out = PyUnicode_AS_UNICODE(*v);
+ }
else {
- PyErr_Format(PyExc_ValueError,
- "mbcs encoding does not support errors='%s'",
- errors);
- return -1;
+ /* Extend unicode object */
+ Py_ssize_t n = PyUnicode_GET_SIZE(*v);
+ if (PyUnicode_Resize((PyObject**)v, n + outsize) < 0)
+ return -1;
+ out = PyUnicode_AS_UNICODE(*v) + n;
}
- /* Skip trailing lead-byte unless 'final' is set */
- if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
- --size;
+ /* Do the conversion */
+ outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
+ if (outsize <= 0)
+ goto error;
+ return insize;
- /* First get the size of the result */
- if (size > 0) {
- usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
- if (usize==0)
- goto mbcs_decode_error;
- } else
- usize = 0;
+error:
+ if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
+ return -2;
+ PyErr_SetFromWindowsErr(0);
+ return -1;
+}
+
+/*
+ * Decode a byte string from a code page into unicode object with an error
+ * handler.
+ *
+ * Returns consumed size if succeed, or raise a WindowsError or
+ * UnicodeDecodeError exception and returns -1 on error.
+ */
+static int
+decode_code_page_errors(UINT code_page,
+ PyUnicodeObject **v,
+ const char *in,
+ int size,
+ const char *errors)
+{
+ const char *startin = in;
+ const char *endin = in + size;
+ const DWORD flags = decode_code_page_flags(code_page);
+ /* Ideally, we should get reason from FormatMessage. This is the Windows
+ 2000 English version of the message. */
+ const char *reason = "No mapping for the Unicode character exists "
+ "in the target code page.";
+ /* each step cannot decode more than 1 character, but a character can be
+ represented as a surrogate pair */
+ wchar_t buffer[2], *startout, *out;
+ int insize, outsize;
+ PyObject *errorHandler = NULL;
+ PyObject *exc = NULL;
+ PyObject *encoding_obj = NULL;
+ char *encoding;
+ DWORD err;
+ int ret = -1;
+
+ assert(size > 0);
+
+ encoding = code_page_name(code_page, &encoding_obj);
+ if (encoding == NULL)
+ return -1;
+
+ if (errors == NULL || strcmp(errors, "strict") == 0) {
+ /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
+ UnicodeDecodeError. */
+ make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
+ if (exc != NULL) {
+ PyCodec_StrictErrors(exc);
+ Py_CLEAR(exc);
+ }
+ goto error;
+ }
if (*v == NULL) {
/* Create unicode object */
- *v = _PyUnicode_New(usize);
+ if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
+ PyErr_NoMemory();
+ goto error;
+ }
+ *v = _PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
if (*v == NULL)
- return -1;
- n = 0;
+ goto error;
+ startout = PyUnicode_AS_UNICODE(*v);
}
else {
/* Extend unicode object */
- n = PyUnicode_GET_SIZE(*v);
- if (PyUnicode_Resize((PyObject**)v, n + usize) < 0)
- return -1;
+ Py_ssize_t n = PyUnicode_GET_SIZE(*v);
+ if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
+ PyErr_NoMemory();
+ goto error;
+ }
+ if (PyUnicode_Resize((PyObject**)v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
+ goto error;
+ startout = PyUnicode_AS_UNICODE(*v) + n;
}
- /* Do the conversion */
- if (usize > 0) {
- p = PyUnicode_AS_UNICODE(*v) + n;
- if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
- goto mbcs_decode_error;
+ /* Decode the byte string character per character */
+ out = startout;
+ while (in < endin)
+ {
+ /* Decode a character */
+ insize = 1;
+ do
+ {
+ outsize = MultiByteToWideChar(code_page, flags,
+ in, insize,
+ buffer, Py_ARRAY_LENGTH(buffer));
+ if (outsize > 0)
+ break;
+ err = GetLastError();
+ if (err != ERROR_NO_UNICODE_TRANSLATION
+ && err != ERROR_INSUFFICIENT_BUFFER)
+ {
+ PyErr_SetFromWindowsErr(0);
+ goto error;
+ }
+ insize++;
+ }
+ /* 4=maximum length of a UTF-8 sequence */
+ while (insize <= 4 && (in + insize) <= endin);
+
+ if (outsize <= 0) {
+ Py_ssize_t startinpos, endinpos, outpos;
+
+ startinpos = in - startin;
+ endinpos = startinpos + 1;
+ outpos = out - PyUnicode_AS_UNICODE(*v);
+ if (unicode_decode_call_errorhandler(
+ errors, &errorHandler,
+ encoding, reason,
+ &startin, &endin, &startinpos, &endinpos, &exc, &in,
+ v, &outpos, &out))
+ {
+ goto error;
+ }
+ }
+ else {
+ in += insize;
+ memcpy(out, buffer, outsize * sizeof(wchar_t));
+ out += outsize;
}
}
- return size;
-mbcs_decode_error:
- /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
- we raise a UnicodeDecodeError - else it is a 'generic'
- windows error
- */
- if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
- /* Ideally, we should get reason from FormatMessage - this
- is the Windows 2000 English version of the message
- */
- PyObject *exc = NULL;
- const char *reason = "No mapping for the Unicode character exists "
- "in the target multi-byte code page.";
- make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
- if (exc != NULL) {
- PyCodec_StrictErrors(exc);
- Py_DECREF(exc);
+ /* write a NUL character at the end */
+ *out = 0;
+
+ /* Extend unicode object */
+ outsize = out - startout;
+ assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
+ if (PyUnicode_Resize((PyObject**)v, outsize) < 0)
+ goto error;
+ ret = 0;
+
+error:
+ Py_XDECREF(encoding_obj);
+ Py_XDECREF(errorHandler);
+ Py_XDECREF(exc);
+ return ret;
+}
+
+/*
+ * Decode a byte string from a Windows code page into unicode object. If
+ * 'final' is set, converts trailing lead-byte too.
+ *
+ * Returns consumed size if succeed, or raise a WindowsError or
+ * UnicodeDecodeError exception and returns -1 on error.
+ */
+static int
+decode_code_page(UINT code_page,
+ PyUnicodeObject **v,
+ const char *s, int size,
+ int final, const char *errors)
+{
+ int done;
+
+ /* Skip trailing lead-byte unless 'final' is set */
+ if (size == 0) {
+ if (*v == NULL) {
+ Py_INCREF(unicode_empty);
+ *v = (PyUnicodeObject*)unicode_empty;
+ if (*v == NULL)
+ return -1;
}
- } else {
- PyErr_SetFromWindowsErrWithFilename(0, NULL);
+ return 0;
}
- return -1;
+
+ if (!final && is_dbcs_lead_byte(code_page, s, size - 1))
+ --size;
+
+ done = decode_code_page_strict(code_page, v, s, size);
+ if (done == -2)
+ done = decode_code_page_errors(code_page, v, s, size, errors);
+ return done;
}
-PyObject *
-PyUnicode_DecodeMBCSStateful(const char *s,
- Py_ssize_t size,
- const char *errors,
- Py_ssize_t *consumed)
+static PyObject *
+decode_code_page_stateful(int code_page,
+ const char *s,
+ Py_ssize_t size,
+ const char *errors,
+ Py_ssize_t *consumed)
{
PyUnicodeObject *v = NULL;
int done;
+ if (code_page < 0) {
+ PyErr_SetString(PyExc_ValueError, "invalid code page number");
+ return NULL;
+ }
+
if (consumed)
*consumed = 0;
#ifdef NEED_RETRY
retry:
if (size > INT_MAX)
- done = decode_mbcs(&v, s, INT_MAX, 0, errors);
+ done = decode_code_page(code_page, &v, s, INT_MAX, 0, errors);
else
#endif
- done = decode_mbcs(&v, s, (int)size, !consumed, errors);
+ done = decode_code_page(code_page, &v, s, (int)size, !consumed, errors);
if (done < 0) {
Py_XDECREF(v);
@@ -7036,6 +7217,7 @@ PyUnicode_DecodeMBCSStateful(const char *s,
goto retry;
}
#endif
+
#ifndef DONT_MAKE_RESULT_READY
if (_PyUnicode_READY_REPLACE(&v)) {
Py_DECREF(v);
@@ -7047,6 +7229,25 @@ PyUnicode_DecodeMBCSStateful(const char *s,
}
PyObject *
+PyUnicode_DecodeCodePageStateful(int code_page,
+ const char *s,
+ Py_ssize_t size,
+ const char *errors,
+ Py_ssize_t *consumed)
+{
+ return decode_code_page_stateful(code_page, s, size, errors, consumed);
+}
+
+PyObject *
+PyUnicode_DecodeMBCSStateful(const char *s,
+ Py_ssize_t size,
+ const char *errors,
+ Py_ssize_t *consumed)
+{
+ return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
+}
+
+PyObject *
PyUnicode_DecodeMBCS(const char *s,
Py_ssize_t size,
const char *errors)
@@ -7054,105 +7255,342 @@ PyUnicode_DecodeMBCS(const char *s,
return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
}
+static DWORD
+encode_code_page_flags(UINT code_page, const char *errors)
+{
+ if (code_page == CP_UTF8) {
+ if (winver.dwMajorVersion >= 6)
+ /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
+ and later */
+ return WC_ERR_INVALID_CHARS;
+ else
+ /* CP_UTF8 only supports flags=0 on Windows older than Vista */
+ return 0;
+ }
+ else if (code_page == CP_UTF7) {
+ /* CP_UTF7 only supports flags=0 */
+ return 0;
+ }
+ else {
+ if (errors != NULL && strcmp(errors, "replace") == 0)
+ return 0;
+ else
+ return WC_NO_BEST_FIT_CHARS;
+ }
+}
+
/*
- * Convert unicode into string object (MBCS).
- * Returns 0 if succeed, -1 otherwise.
+ * Encode a Unicode string to a Windows code page into a byte string in strict
+ * mode.
+ *
+ * Returns consumed characters if succeed, returns -2 on encode error, or raise
+ * a WindowsError and returns -1 on other error.
*/
static int
-encode_mbcs(PyObject **repr,
- const Py_UNICODE *p, /* unicode */
- int size, /* size of unicode */
- const char* errors)
+encode_code_page_strict(UINT code_page, PyObject **outbytes,
+ const Py_UNICODE *p, const int size,
+ const char* errors)
{
BOOL usedDefaultChar = FALSE;
- BOOL *pusedDefaultChar;
- int mbcssize;
- Py_ssize_t n;
+ BOOL *pusedDefaultChar = &usedDefaultChar;
+ int outsize;
PyObject *exc = NULL;
- DWORD flags;
+ const DWORD flags = encode_code_page_flags(code_page, NULL);
+ char *out;
- assert(size >= 0);
+ assert(size > 0);
- /* check and handle 'errors' arg */
- if (errors==NULL || strcmp(errors, "strict")==0) {
- flags = WC_NO_BEST_FIT_CHARS;
+ if (code_page != CP_UTF8 && code_page != CP_UTF7)
pusedDefaultChar = &usedDefaultChar;
- } else if (strcmp(errors, "replace")==0) {
- flags = 0;
+ else
pusedDefaultChar = NULL;
- } else {
- PyErr_Format(PyExc_ValueError,
- "mbcs encoding does not support errors='%s'",
- errors);
- return -1;
- }
/* First get the size of the result */
- if (size > 0) {
- mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
- NULL, pusedDefaultChar);
- if (mbcssize == 0) {
- PyErr_SetFromWindowsErrWithFilename(0, NULL);
- return -1;
- }
- /* If we used a default char, then we failed! */
- if (pusedDefaultChar && *pusedDefaultChar)
- goto mbcs_encode_error;
- } else {
- mbcssize = 0;
- }
+ outsize = WideCharToMultiByte(code_page, flags,
+ p, size,
+ NULL, 0,
+ NULL, pusedDefaultChar);
+ if (outsize <= 0)
+ goto error;
+ /* If we used a default char, then we failed! */
+ if (pusedDefaultChar && *pusedDefaultChar)
+ return -2;
- if (*repr == NULL) {
+ if (*outbytes == NULL) {
/* Create string object */
- *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
- if (*repr == NULL)
+ *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
+ if (*outbytes == NULL)
return -1;
- n = 0;
+ out = PyBytes_AS_STRING(*outbytes);
}
else {
/* Extend string object */
- n = PyBytes_Size(*repr);
- if (_PyBytes_Resize(repr, n + mbcssize) < 0)
+ const Py_ssize_t n = PyBytes_Size(*outbytes);
+ if (outsize > PY_SSIZE_T_MAX - n) {
+ PyErr_NoMemory();
return -1;
+ }
+ if (_PyBytes_Resize(outbytes, n + outsize) < 0)
+ return -1;
+ out = PyBytes_AS_STRING(*outbytes) + n;
}
/* Do the conversion */
- if (size > 0) {
- char *s = PyBytes_AS_STRING(*repr) + n;
- if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
- NULL, pusedDefaultChar)) {
- PyErr_SetFromWindowsErrWithFilename(0, NULL);
- return -1;
+ outsize = WideCharToMultiByte(code_page, flags,
+ p, size,
+ out, outsize,
+ NULL, pusedDefaultChar);
+ if (outsize <= 0)
+ goto error;
+ if (pusedDefaultChar && *pusedDefaultChar)
+ return -2;
+ return 0;
+
+error:
+ if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
+ return -2;
+ PyErr_SetFromWindowsErr(0);
+ return -1;
+}
+
+/*
+ * Encode a Unicode string to a Windows code page into a byte string using a
+ * error handler.
+ *
+ * Returns consumed characters if succeed, or raise a WindowsError and returns
+ * -1 on other error.
+ */
+static int
+encode_code_page_errors(UINT code_page, PyObject **outbytes,
+ const Py_UNICODE *in, const int insize,
+ const char* errors)
+{
+ const DWORD flags = encode_code_page_flags(code_page, errors);
+ const Py_UNICODE *startin = in;
+ const Py_UNICODE *endin = in + insize;
+ /* Ideally, we should get reason from FormatMessage. This is the Windows
+ 2000 English version of the message. */
+ const char *reason = "invalid character";
+ /* 4=maximum length of a UTF-8 sequence */
+ char buffer[4];
+ BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
+ Py_ssize_t outsize;
+ char *out;
+ int charsize;
+ PyObject *errorHandler = NULL;
+ PyObject *exc = NULL;
+ PyObject *encoding_obj = NULL;
+ char *encoding;
+ int err;
+ Py_ssize_t startpos, newpos, newoutsize;
+ PyObject *rep;
+ int ret = -1;
+
+ assert(insize > 0);
+
+ encoding = code_page_name(code_page, &encoding_obj);
+ if (encoding == NULL)
+ return -1;
+
+ if (errors == NULL || strcmp(errors, "strict") == 0) {
+ /* The last error was ERROR_NO_UNICODE_TRANSLATION,
+ then we raise a UnicodeEncodeError. */
+ make_encode_exception(&exc, encoding, in, insize, 0, 0, reason);
+ if (exc != NULL) {
+ PyCodec_StrictErrors(exc);
+ Py_DECREF(exc);
}
- if (pusedDefaultChar && *pusedDefaultChar)
- goto mbcs_encode_error;
+ Py_XDECREF(encoding_obj);
+ return -1;
}
- return 0;
-mbcs_encode_error:
- raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
+ if (code_page != CP_UTF8 && code_page != CP_UTF7)
+ pusedDefaultChar = &usedDefaultChar;
+ else
+ pusedDefaultChar = NULL;
+
+ if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
+ PyErr_NoMemory();
+ goto error;
+ }
+ outsize = insize * Py_ARRAY_LENGTH(buffer);
+
+ if (*outbytes == NULL) {
+ /* Create string object */
+ *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
+ if (*outbytes == NULL)
+ goto error;
+ out = PyBytes_AS_STRING(*outbytes);
+ }
+ else {
+ /* Extend string object */
+ Py_ssize_t n = PyBytes_Size(*outbytes);
+ if (n > PY_SSIZE_T_MAX - outsize) {
+ PyErr_NoMemory();
+ goto error;
+ }
+ if (_PyBytes_Resize(outbytes, n + outsize) < 0)
+ goto error;
+ out = PyBytes_AS_STRING(*outbytes) + n;
+ }
+
+ /* Encode the string character per character */
+ while (in < endin)
+ {
+ if ((in + 2) <= endin
+ && 0xD800 <= in[0] && in[0] <= 0xDBFF
+ && 0xDC00 <= in[1] && in[1] <= 0xDFFF)
+ charsize = 2;
+ else
+ charsize = 1;
+
+ outsize = WideCharToMultiByte(code_page, flags,
+ in, charsize,
+ buffer, Py_ARRAY_LENGTH(buffer),
+ NULL, pusedDefaultChar);
+ if (outsize > 0) {
+ if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
+ {
+ in += charsize;
+ memcpy(out, buffer, outsize);
+ out += outsize;
+ continue;
+ }
+ }
+ else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
+ PyErr_SetFromWindowsErr(0);
+ goto error;
+ }
+
+ charsize = Py_MAX(charsize - 1, 1);
+ startpos = in - startin;
+ rep = unicode_encode_call_errorhandler(
+ errors, &errorHandler, encoding, reason,
+ startin, insize, &exc,
+ startpos, startpos + charsize, &newpos);
+ if (rep == NULL)
+ goto error;
+ in = startin + newpos;
+
+ if (PyBytes_Check(rep)) {
+ outsize = PyBytes_GET_SIZE(rep);
+ if (outsize != 1) {
+ Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
+ newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
+ if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
+ Py_DECREF(rep);
+ goto error;
+ }
+ out = PyBytes_AS_STRING(*outbytes) + offset;
+ }
+ memcpy(out, PyBytes_AS_STRING(rep), outsize);
+ out += outsize;
+ }
+ else {
+ Py_ssize_t i;
+ enum PyUnicode_Kind kind;
+ void *data;
+
+ if (PyUnicode_READY(rep) < 0) {
+ Py_DECREF(rep);
+ goto error;
+ }
+
+ outsize = PyUnicode_GET_LENGTH(rep);
+ if (outsize != 1) {
+ Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
+ newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
+ if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
+ Py_DECREF(rep);
+ goto error;
+ }
+ out = PyBytes_AS_STRING(*outbytes) + offset;
+ }
+ kind = PyUnicode_KIND(rep);
+ data = PyUnicode_DATA(rep);
+ for (i=0; i < outsize; i++) {
+ Py_UCS4 ch = PyUnicode_READ(kind, data, i);
+ if (ch > 127) {
+ raise_encode_exception(&exc,
+ encoding,
+ startin, insize,
+ startpos, startpos + charsize,
+ "unable to encode error handler result to ASCII");
+ Py_DECREF(rep);
+ goto error;
+ }
+ *out = (unsigned char)ch;
+ out++;
+ }
+ }
+ Py_DECREF(rep);
+ }
+ /* write a NUL byte */
+ *out = 0;
+ outsize = out - PyBytes_AS_STRING(*outbytes);
+ assert(outsize <= PyBytes_GET_SIZE(*outbytes));
+ if (_PyBytes_Resize(outbytes, outsize) < 0)
+ goto error;
+ ret = 0;
+
+error:
+ Py_XDECREF(encoding_obj);
+ Py_XDECREF(errorHandler);
Py_XDECREF(exc);
- return -1;
+ return ret;
}
-PyObject *
-PyUnicode_EncodeMBCS(const Py_UNICODE *p,
- Py_ssize_t size,
- const char *errors)
+/*
+ * Encode a Unicode string to a Windows code page into a byte string.
+ *
+ * Returns consumed characters if succeed, or raise a WindowsError and returns
+ * -1 on other error.
+ */
+static int
+encode_code_page_chunk(UINT code_page, PyObject **outbytes,
+ const Py_UNICODE *p, int size,
+ const char* errors)
+{
+ int done;
+
+ if (size == 0) {
+ if (*outbytes == NULL) {
+ *outbytes = PyBytes_FromStringAndSize(NULL, 0);
+ if (*outbytes == NULL)
+ return -1;
+ }
+ return 0;
+ }
+
+ done = encode_code_page_strict(code_page, outbytes, p, size, errors);
+ if (done == -2)
+ done = encode_code_page_errors(code_page, outbytes, p, size, errors);
+ return done;
+}
+
+static PyObject *
+encode_code_page(int code_page,
+ const Py_UNICODE *p, Py_ssize_t size,
+ const char *errors)
{
- PyObject *repr = NULL;
+ PyObject *outbytes = NULL;
int ret;
+ if (code_page < 0) {
+ PyErr_SetString(PyExc_ValueError, "invalid code page number");
+ return NULL;
+ }
+
#ifdef NEED_RETRY
retry:
if (size > INT_MAX)
- ret = encode_mbcs(&repr, p, INT_MAX, errors);
+ ret = encode_code_page_chunk(code_page, &outbytes, p, INT_MAX, errors);
else
#endif
- ret = encode_mbcs(&repr, p, (int)size, errors);
+ ret = encode_code_page_chunk(code_page, &outbytes, p, (int)size, errors);
if (ret < 0) {
- Py_XDECREF(repr);
+ Py_XDECREF(outbytes);
return NULL;
}
@@ -7164,7 +7602,28 @@ PyUnicode_EncodeMBCS(const Py_UNICODE *p,
}
#endif
- return repr;
+ return outbytes;
+}
+
+PyObject *
+PyUnicode_EncodeMBCS(const Py_UNICODE *p,
+ Py_ssize_t size,
+ const char *errors)
+{
+ return encode_code_page(CP_ACP, p, size, errors);
+}
+
+PyObject *
+PyUnicode_EncodeCodePage(int code_page,
+ PyObject *unicode,
+ const char *errors)
+{
+ const Py_UNICODE *p;
+ Py_ssize_t size;
+ p = PyUnicode_AsUnicodeAndSize(unicode, &size);
+ if (p == NULL)
+ return NULL;
+ return encode_code_page(code_page, p, size, errors);
}
PyObject *
@@ -13434,7 +13893,7 @@ PyTypeObject PyUnicode_Type = {
/* Initialize the Unicode implementation */
-void _PyUnicode_Init(void)
+int _PyUnicode_Init(void)
{
int i;
@@ -13467,6 +13926,15 @@ void _PyUnicode_Init(void)
Py_ARRAY_LENGTH(linebreak));
PyType_Ready(&EncodingMapType);
+
+#ifdef HAVE_MBCS
+ winver.dwOSVersionInfoSize = sizeof(winver);
+ if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
+ PyErr_SetFromWindowsErr(0);
+ return -1;
+ }
+#endif
+ return 0;
}
/* Finalize the Unicode implementation */