diff options
| author | Antoine Pitrou <solipsis@pitrou.net> | 2010-09-09 20:33:43 +0000 | 
|---|---|---|
| committer | Antoine Pitrou <solipsis@pitrou.net> | 2010-09-09 20:33:43 +0000 | 
| commit | c9a8df24cc8c95efb63b9820d9381ad2f54e45c5 (patch) | |
| tree | 590f0f94fd1907e7849a30f071ee6d27af1a3fbb /Python | |
| parent | 8e0bb6a1e2907797cd6e4b7cc90539904e54db7e (diff) | |
| download | cpython-git-c9a8df24cc8c95efb63b9820d9381ad2f54e45c5.tar.gz | |
Merged revisions 84655 via svnmerge from
svn+ssh://pythondev@svn.python.org/python/branches/py3k
........
  r84655 | antoine.pitrou | 2010-09-09 22:30:23 +0200 (jeu., 09 sept. 2010) | 6 lines
  Issue #9804: ascii() now always represents unicode surrogate pairs as
  a single `\UXXXXXXXX`, regardless of whether the character is printable
  or not.  Also, the "backslashreplace" error handler now joins surrogate
  pairs into a single character on UCS-2 builds.
........
Diffstat (limited to 'Python')
| -rw-r--r-- | Python/codecs.c | 26 | 
1 file changed, 20 insertions, 6 deletions
```diff
diff --git a/Python/codecs.c b/Python/codecs.c
index 04487a216c..45d99291f1 100644
--- a/Python/codecs.c
+++ b/Python/codecs.c
@@ -678,6 +678,13 @@ static Py_UNICODE hexdigits[] = {
 
 PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
 {
+#ifndef Py_UNICODE_WIDE
+#define IS_SURROGATE_PAIR(p, end) \
+    (*p >= 0xD800 && *p <= 0xDBFF && (p + 1) < end && \
+     *(p + 1) >= 0xDC00 && *(p + 1) <= 0xDFFF)
+#else
+#define IS_SURROGATE_PAIR(p, end) 0
+#endif
     if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
         PyObject *restuple;
         PyObject *object;
@@ -702,7 +709,12 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
             else
 #endif
             if (*p >= 0x100) {
-                ressize += 1+1+4;
+                if (IS_SURROGATE_PAIR(p, startp+end)) {
+                    ressize += 1+1+8;
+                    ++p;
+                }
+                else
+                    ressize += 1+1+4;
             }
             else
                 ressize += 1+1+2;
@@ -712,9 +724,12 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
             return NULL;
         for (p = startp+start, outp = PyUnicode_AS_UNICODE(res);
             p < startp+end; ++p) {
-            Py_UNICODE c = *p;
+            Py_UCS4 c = (Py_UCS4) *p;
             *outp++ = '\\';
-#ifdef Py_UNICODE_WIDE
+            if (IS_SURROGATE_PAIR(p, startp+end)) {
+                c = ((*p & 0x3FF) << 10) + (*(p + 1) & 0x3FF) + 0x10000;
+                ++p;
+            }
             if (c >= 0x00010000) {
                 *outp++ = 'U';
                 *outp++ = hexdigits[(c>>28)&0xf];
@@ -724,9 +739,7 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
                 *outp++ = hexdigits[(c>>12)&0xf];
                 *outp++ = hexdigits[(c>>8)&0xf];
             }
-            else
-#endif
-            if (c >= 0x100) {
+            else if (c >= 0x100) {
                 *outp++ = 'u';
                 *outp++ = hexdigits[(c>>12)&0xf];
                 *outp++ = hexdigits[(c>>8)&0xf];
@@ -746,6 +759,7 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
         wrong_exception_type(exc);
         return NULL;
     }
+#undef IS_SURROGATE_PAIR
 }
 
 /* This handler is declared static until someone demonstrates
```
