diff options
author | Stefan Behnel <stefan_ml@behnel.de> | 2014-10-10 11:24:02 +0200 |
---|---|---|
committer | Stefan Behnel <stefan_ml@behnel.de> | 2014-10-10 11:24:02 +0200 |
commit | 147f614dde703c8ea0859ad1052fdc3a39502dc7 (patch) | |
tree | d3ec7dcf08ef619faca9197dc45131953634927c | |
parent | f5188e8ef91fd71e30ad421781630ba3abe49b86 (diff) | |
download | cython-147f614dde703c8ea0859ad1052fdc3a39502dc7.tar.gz |
extend PyUnicode C-API declarations
--HG--
extra : transplant_source : r5f%D6%7E%D3%9C%0A%A0%1C%9FX%E0%F6ji%13T%00%A4
-rw-r--r-- | CHANGES.rst | 2 | ||||
-rw-r--r-- | Cython/Includes/cpython/unicode.pxd | 180 |
2 files changed, 160 insertions, 22 deletions
diff --git a/CHANGES.rst b/CHANGES.rst index a692d9fe9..da286ec64 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -8,6 +8,8 @@ Latest Features added -------------- +* Missing C-API declarations in ``cpython.unicode`` were added. + * Passing ``language='c++'`` into cythonize() globally enables C++ mode for all modules that were not passed as Extension objects (i.e. only source files and file patterns). diff --git a/Cython/Includes/cpython/unicode.pxd b/Cython/Includes/cpython/unicode.pxd index 32088d6ff..a8370e311 100644 --- a/Cython/Includes/cpython/unicode.pxd +++ b/Cython/Includes/cpython/unicode.pxd @@ -131,6 +131,131 @@ cdef extern from *: #Py_ssize_t PyUnicode_AsWideChar(object o, wchar_t *w, Py_ssize_t size) + +# Unicode Methods + + # Concat two strings giving a new Unicode string. + # Return value: New reference. + unicode PyUnicode_Concat(object left, object right) + + # Split a string giving a list of Unicode strings. If sep is NULL, + # splitting will be done at all whitespace substrings. Otherwise, + # splits occur at the given separator. At most maxsplit splits will + # be done. If negative, no limit is set. Separators are not included + # in the resulting list. + # Return value: New reference. + list PyUnicode_Split(object s, object sep, Py_ssize_t maxsplit) + + # Split a Unicode string at line breaks, returning a list of Unicode + # strings. CRLF is considered to be one line break. If keepend is 0, + # the Line break characters are not included in the resulting strings. + # Return value: New reference. + list PyUnicode_Splitlines(object s, bint keepend) + + # Translate a string by applying a character mapping table to it and + # return the resulting Unicode object. + # + # The mapping table must map Unicode ordinal integers to Unicode ordinal + # integers or None (causing deletion of the character). + # + # Mapping tables need only provide the __getitem__() interface; + # dictionaries and sequences work well. Unmapped character ordinals (ones + # which cause a LookupError) are left untouched and are copied as-is. + # + # errors has the usual meaning for codecs. It may be NULL which indicates + # to use the default error handling. + # Return value: New reference. + unicode PyUnicode_Translate(object str, object table, const char *errors) + + # Join a sequence of strings using the given separator and return the + # resulting Unicode string. + # Return value: New reference. + unicode PyUnicode_Join(object separator, object seq) + + # Return 1 if substr matches str[start:end] at the given tail end + # (direction == -1 means to do a prefix match, direction == 1 a + # suffix match), 0 otherwise. + # Return -1 if an error occurred. + Py_ssize_t PyUnicode_Tailmatch(object str, object substr, + Py_ssize_t start, Py_ssize_t end, int direction) except -1 + + # Return the first position of substr in str[start:end] using the given + # direction (direction == 1 means to do a forward search, direction == -1 + # a backward search). The return value is the index of the first match; + # a value of -1 indicates that no match was found, and -2 indicates that an + # error occurred and an exception has been set. + Py_ssize_t PyUnicode_Find(object str, object substr, Py_ssize_t start, Py_ssize_t end, int direction) except -2 + + # Return the first position of the character ch in str[start:end] using + # the given direction (direction == 1 means to do a forward search, + # direction == -1 a backward search). The return value is the index of + # the first match; a value of -1 indicates that no match was found, and + # -2 indicates that an error occurred and an exception has been set. + # New in version 3.3. + Py_ssize_t PyUnicode_FindChar(object str, Py_UCS4 ch, Py_ssize_t start, Py_ssize_t end, int direction) except -2 + + # Return the number of non-overlapping occurrences of substr in + # str[start:end]. Return -1 if an error occurred. + Py_ssize_t PyUnicode_Count(object str, object substr, Py_ssize_t start, Py_ssize_t end) except -1 + + # Replace at most maxcount occurrences of substr in str with replstr and + # return the resulting Unicode object. maxcount == -1 means replace all + # occurrences. + # Return value: New reference. + unicode PyUnicode_Replace(object str, object substr, object replstr, Py_ssize_t maxcount) + + # Compare two strings and return -1, 0, 1 for less than, + # equal, and greater than, respectively. + int PyUnicode_Compare(object left, object right) except? -1 + + # Compare a unicode object, uni, with string and return -1, 0, 1 for less than, + # equal, and greater than, respectively. It is best to pass only ASCII-encoded + # strings, but the function interprets the input string as ISO-8859-1 if it + # contains non-ASCII characters. + int PyUnicode_CompareWithASCIIString(object uni, char *string) except? -1 + + # Rich compare two unicode strings and return one of the following: + # + # NULL in case an exception was raised + # Py_True or Py_False for successful comparisons + # Py_NotImplemented in case the type combination is unknown + # + # Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in case + # the conversion of the arguments to Unicode fails with a UnicodeDecodeError. + # + # Possible values for op are Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, and Py_LE. + object PyUnicode_RichCompare(object left, object right, int op) + + # Return a new string object from format and args; this is analogous to + # format % args. + # Return value: New reference. + unicode PyUnicode_Format(object format, object args) + + # Check whether element is contained in container and return true or false + # accordingly. + # + # element has to coerce to a one element Unicode string. -1 is returned + # if there was an error. + int PyUnicode_Contains(object container, object element) except -1 + + # Intern the argument *string in place. The argument must be the address + # of a pointer variable pointing to a Python unicode string object. If + # there is an existing interned string that is the same as *string, it sets + # *string to it (decrementing the reference count of the old string object + # and incrementing the reference count of the interned string object), + # otherwise it leaves *string alone and interns it (incrementing its reference + # count). (Clarification: even though there is a lot of talk about reference + # counts, think of this function as reference-count-neutral; you own the object + # after the call if and only if you owned it before the call.) + #void PyUnicode_InternInPlace(PyObject **string) + + # A combination of PyUnicode_FromString() and PyUnicode_InternInPlace(), + # returning either a new unicode string object that has been interned, or + # a new ("owned") reference to an earlier interned string object with the + # same value. + unicode PyUnicode_InternFromString(const char *v) + + # Codecs # Create a Unicode object by decoding size bytes of the encoded @@ -161,22 +286,22 @@ cdef extern from *: # Create a Unicode object by decoding size bytes of the UTF-8 # encoded string s. Return NULL if an exception was raised by the # codec. - object PyUnicode_DecodeUTF8(char *s, Py_ssize_t size, char *errors) + unicode PyUnicode_DecodeUTF8(char *s, Py_ssize_t size, char *errors) # If consumed is NULL, behave like PyUnicode_DecodeUTF8(). If # consumed is not NULL, trailing incomplete UTF-8 byte sequences # will not be treated as an error. Those bytes will not be decoded # and the number of bytes that have been decoded will be stored in # consumed. New in version 2.4. - object PyUnicode_DecodeUTF8Stateful(char *s, Py_ssize_t size, char *errors, Py_ssize_t *consumed) + unicode PyUnicode_DecodeUTF8Stateful(char *s, Py_ssize_t size, char *errors, Py_ssize_t *consumed) # Encode the Py_UNICODE buffer of the given size using UTF-8 and # return a Python string object. Return NULL if an exception was # raised by the codec. - object PyUnicode_EncodeUTF8(Py_UNICODE *s, Py_ssize_t size, char *errors) + bytes PyUnicode_EncodeUTF8(Py_UNICODE *s, Py_ssize_t size, char *errors) # Encode a Unicode objects using UTF-8 and return the result as Python string object. Error handling is ``strict''. Return NULL if an exception was raised by the codec. - object PyUnicode_AsUTF8String(object unicode) + bytes PyUnicode_AsUTF8String(object unicode) # These are the UTF-16 codec APIs: @@ -198,7 +323,7 @@ cdef extern from *: # order at the. # # If byteorder is NULL, the codec starts in native order mode. - object PyUnicode_DecodeUTF16(char *s, Py_ssize_t size, char *errors, int *byteorder) + unicode PyUnicode_DecodeUTF16(char *s, Py_ssize_t size, char *errors, int *byteorder) # If consumed is NULL, behave like PyUnicode_DecodeUTF16(). If # consumed is not NULL, PyUnicode_DecodeUTF16Stateful() will not @@ -206,7 +331,7 @@ cdef extern from *: # number of bytes or a split surrogate pair) as an error. Those # bytes will not be decoded and the number of bytes that have been # decoded will be stored in consumed. New in version 2.4. - object PyUnicode_DecodeUTF16Stateful(char *s, Py_ssize_t size, char *errors, int *byteorder, Py_ssize_t *consumed) + unicode PyUnicode_DecodeUTF16Stateful(char *s, Py_ssize_t size, char *errors, int *byteorder, Py_ssize_t *consumed) # Return a Python string object holding the UTF-16 encoded value # of the Unicode data in s. If byteorder is not 0, output is @@ -223,13 +348,13 @@ cdef extern from *: # If Py_UNICODE_WIDE is defined, a single Py_UNICODE value may get # represented as a surrogate pair. If it is not defined, each # Py_UNICODE values is interpreted as an UCS-2 character. - object PyUnicode_EncodeUTF16(Py_UNICODE *s, Py_ssize_t size, char *errors, int byteorder) + bytes PyUnicode_EncodeUTF16(Py_UNICODE *s, Py_ssize_t size, char *errors, int byteorder) # Return a Python string using the UTF-16 encoding in native byte # order. The string always starts with a BOM mark. Error handling # is ``strict''. Return NULL if an exception was raised by the # codec. - object PyUnicode_AsUTF16String(object unicode) + bytes PyUnicode_AsUTF16String(object unicode) # These are the ``Unicode Escape'' codec APIs: @@ -270,17 +395,17 @@ cdef extern from *: # Create a Unicode object by decoding size bytes of the Latin-1 # encoded string s. Return NULL if an exception was raised by the # codec. - object PyUnicode_DecodeLatin1(char *s, Py_ssize_t size, char *errors) + unicode PyUnicode_DecodeLatin1(char *s, Py_ssize_t size, char *errors) # Encode the Py_UNICODE buffer of the given size using Latin-1 and - # return a Python string object. Return NULL if an exception was + # return a Python bytes object. Return NULL if an exception was # raised by the codec. - object PyUnicode_EncodeLatin1(Py_UNICODE *s, Py_ssize_t size, char *errors) + bytes PyUnicode_EncodeLatin1(Py_UNICODE *s, Py_ssize_t size, char *errors) # Encode a Unicode objects using Latin-1 and return the result as - # Python string object. Error handling is ``strict''. Return NULL + # Python bytes object. Error handling is ``strict''. Return NULL # if an exception was raised by the codec. - object PyUnicode_AsLatin1String(object unicode) + bytes PyUnicode_AsLatin1String(object unicode) # These are the ASCII codec APIs. Only 7-bit ASCII data is # accepted. All other codes generate errors. @@ -288,17 +413,17 @@ cdef extern from *: # Create a Unicode object by decoding size bytes of the ASCII # encoded string s. Return NULL if an exception was raised by the # codec. - object PyUnicode_DecodeASCII(char *s, Py_ssize_t size, char *errors) + unicode PyUnicode_DecodeASCII(char *s, Py_ssize_t size, char *errors) # Encode the Py_UNICODE buffer of the given size using ASCII and - # return a Python string object. Return NULL if an exception was + # return a Python bytes object. Return NULL if an exception was # raised by the codec. - object PyUnicode_EncodeASCII(Py_UNICODE *s, Py_ssize_t size, char *errors) + bytes PyUnicode_EncodeASCII(Py_UNICODE *s, Py_ssize_t size, char *errors) # Encode a Unicode objects using ASCII and return the result as - # Python string object. Error handling is ``strict''. Return NULL + # Python bytes object. Error handling is ``strict''. Return NULL # if an exception was raised by the codec. - object PyUnicode_AsASCIIString(object o) + bytes PyUnicode_AsASCIIString(object o) # These are the mapping codec APIs: # @@ -339,6 +464,8 @@ cdef extern from *: # Encode the Py_UNICODE buffer of the given size using the given # mapping object and return a Python string object. Return NULL if # an exception was raised by the codec. + # + # Deprecated since version 3.3, will be removed in version 4.0. object PyUnicode_EncodeCharmap(Py_UNICODE *s, Py_ssize_t size, object mapping, char *errors) # Encode a Unicode objects using the given mapping object and @@ -359,6 +486,8 @@ cdef extern from *: # dictionaries and sequences work well. Unmapped character # ordinals (ones which cause a LookupError) are left untouched and # are copied as-is. + # + # Deprecated since version 3.3, will be removed in version 4.0. object PyUnicode_TranslateCharmap(Py_UNICODE *s, Py_ssize_t size, object table, char *errors) @@ -371,21 +500,28 @@ cdef extern from *: # Create a Unicode object by decoding size bytes of the MBCS # encoded string s. Return NULL if an exception was raised by the # codec. - object PyUnicode_DecodeMBCS(char *s, Py_ssize_t size, char *errors) + unicode PyUnicode_DecodeMBCS(char *s, Py_ssize_t size, char *errors) # If consumed is NULL, behave like PyUnicode_DecodeMBCS(). If # consumed is not NULL, PyUnicode_DecodeMBCSStateful() will not # decode trailing lead byte and the number of bytes that have been # decoded will be stored in consumed. New in version 2.5. # NOTE: Python 2.x uses 'int' values for 'size' and 'consumed' (changed in 3.0) - object PyUnicode_DecodeMBCSStateful(char *s, Py_ssize_t size, char *errors, Py_ssize_t *consumed) + unicode PyUnicode_DecodeMBCSStateful(char *s, Py_ssize_t size, char *errors, Py_ssize_t *consumed) # Encode the Py_UNICODE buffer of the given size using MBCS and # return a Python string object. Return NULL if an exception was # raised by the codec. - object PyUnicode_EncodeMBCS(Py_UNICODE *s, Py_ssize_t size, char *errors) + bytes PyUnicode_EncodeMBCS(Py_UNICODE *s, Py_ssize_t size, char *errors) # Encode a Unicode objects using MBCS and return the result as # Python string object. Error handling is ``strict''. Return NULL # if an exception was raised by the codec. - object PyUnicode_AsMBCSString(object o) + bytes PyUnicode_AsMBCSString(object o) + + # Encode the Unicode object using the specified code page and return + # a Python bytes object. Return NULL if an exception was raised by the + # codec. Use CP_ACP code page to get the MBCS encoder. + # + # New in version 3.3. + bytes PyUnicode_EncodeCodePage(int code_page, object unicode, const char *errors) |