summary refs log tree commit diff
diff options
context:
space:
mode:
author Stefan Behnel <stefan_ml@behnel.de> 2014-10-10 11:24:02 +0200
committer Stefan Behnel <stefan_ml@behnel.de> 2014-10-10 11:24:02 +0200
commit 147f614dde703c8ea0859ad1052fdc3a39502dc7 (patch)
tree d3ec7dcf08ef619faca9197dc45131953634927c
parent f5188e8ef91fd71e30ad421781630ba3abe49b86 (diff)
download cython-147f614dde703c8ea0859ad1052fdc3a39502dc7.tar.gz
extend PyUnicode C-API declarations
--HG-- extra : transplant_source : r5f%D6%7E%D3%9C%0A%A0%1C%9FX%E0%F6ji%13T%00%A4
-rw-r--r-- CHANGES.rst 2
-rw-r--r-- Cython/Includes/cpython/unicode.pxd 180
2 files changed, 160 insertions, 22 deletions
diff --git a/CHANGES.rst b/CHANGES.rst
index a692d9fe9..da286ec64 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -8,6 +8,8 @@ Latest
Features added
--------------
+* Missing C-API declarations in ``cpython.unicode`` were added.
+
* Passing ``language='c++'`` into cythonize() globally enables C++ mode for
all modules that were not passed as Extension objects (i.e. only source
files and file patterns).
diff --git a/Cython/Includes/cpython/unicode.pxd b/Cython/Includes/cpython/unicode.pxd
index 32088d6ff..a8370e311 100644
--- a/Cython/Includes/cpython/unicode.pxd
+++ b/Cython/Includes/cpython/unicode.pxd
@@ -131,6 +131,131 @@ cdef extern from *:
#Py_ssize_t PyUnicode_AsWideChar(object o, wchar_t *w, Py_ssize_t size)
+
+# Unicode Methods
+
+ # Concat two strings giving a new Unicode string.
+ # Return value: New reference.
+ unicode PyUnicode_Concat(object left, object right)
+
+ # Split a string giving a list of Unicode strings. If sep is NULL,
+ # splitting will be done at all whitespace substrings. Otherwise,
+ # splits occur at the given separator. At most maxsplit splits will
+ # be done. If negative, no limit is set. Separators are not included
+ # in the resulting list.
+ # Return value: New reference.
+ list PyUnicode_Split(object s, object sep, Py_ssize_t maxsplit)
+
+ # Split a Unicode string at line breaks, returning a list of Unicode
+ # strings. CRLF is considered to be one line break. If keepend is 0,
+ # the line break characters are not included in the resulting strings.
+ # Return value: New reference.
+ list PyUnicode_Splitlines(object s, bint keepend)
+
+ # Translate a string by applying a character mapping table to it and
+ # return the resulting Unicode object.
+ #
+ # The mapping table must map Unicode ordinal integers to Unicode ordinal
+ # integers or None (causing deletion of the character).
+ #
+ # Mapping tables need only provide the __getitem__() interface;
+ # dictionaries and sequences work well. Unmapped character ordinals (ones
+ # which cause a LookupError) are left untouched and are copied as-is.
+ #
+ # errors has the usual meaning for codecs. It may be NULL which indicates
+ # to use the default error handling.
+ # Return value: New reference.
+ unicode PyUnicode_Translate(object str, object table, const char *errors)
+
+ # Join a sequence of strings using the given separator and return the
+ # resulting Unicode string.
+ # Return value: New reference.
+ unicode PyUnicode_Join(object separator, object seq)
+
+ # Return 1 if substr matches str[start:end] at the given tail end
+ # (direction == -1 means to do a prefix match, direction == 1 a
+ # suffix match), 0 otherwise.
+ # Return -1 if an error occurred.
+ Py_ssize_t PyUnicode_Tailmatch(object str, object substr,
+ Py_ssize_t start, Py_ssize_t end, int direction) except -1
+
+ # Return the first position of substr in str[start:end] using the given
+ # direction (direction == 1 means to do a forward search, direction == -1
+ # a backward search). The return value is the index of the first match;
+ # a value of -1 indicates that no match was found, and -2 indicates that an
+ # error occurred and an exception has been set.
+ Py_ssize_t PyUnicode_Find(object str, object substr, Py_ssize_t start, Py_ssize_t end, int direction) except -2
+
+ # Return the first position of the character ch in str[start:end] using
+ # the given direction (direction == 1 means to do a forward search,
+ # direction == -1 a backward search). The return value is the index of
+ # the first match; a value of -1 indicates that no match was found, and
+ # -2 indicates that an error occurred and an exception has been set.
+ # New in version 3.3.
+ Py_ssize_t PyUnicode_FindChar(object str, Py_UCS4 ch, Py_ssize_t start, Py_ssize_t end, int direction) except -2
+
+ # Return the number of non-overlapping occurrences of substr in
+ # str[start:end]. Return -1 if an error occurred.
+ Py_ssize_t PyUnicode_Count(object str, object substr, Py_ssize_t start, Py_ssize_t end) except -1
+
+ # Replace at most maxcount occurrences of substr in str with replstr and
+ # return the resulting Unicode object. maxcount == -1 means replace all
+ # occurrences.
+ # Return value: New reference.
+ unicode PyUnicode_Replace(object str, object substr, object replstr, Py_ssize_t maxcount)
+
+ # Compare two strings and return -1, 0, 1 for less than,
+ # equal, and greater than, respectively.
+ int PyUnicode_Compare(object left, object right) except? -1
+
+ # Compare a unicode object, uni, with string and return -1, 0, 1 for less than,
+ # equal, and greater than, respectively. It is best to pass only ASCII-encoded
+ # strings, but the function interprets the input string as ISO-8859-1 if it
+ # contains non-ASCII characters.
+ int PyUnicode_CompareWithASCIIString(object uni, char *string) except? -1
+
+ # Rich compare two unicode strings and return one of the following:
+ #
+ # NULL in case an exception was raised
+ # Py_True or Py_False for successful comparisons
+ # Py_NotImplemented in case the type combination is unknown
+ #
+ # Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in case
+ # the conversion of the arguments to Unicode fails with a UnicodeDecodeError.
+ #
+ # Possible values for op are Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, and Py_LE.
+ object PyUnicode_RichCompare(object left, object right, int op)
+
+ # Return a new string object from format and args; this is analogous to
+ # format % args.
+ # Return value: New reference.
+ unicode PyUnicode_Format(object format, object args)
+
+ # Check whether element is contained in container and return true or false
+ # accordingly.
+ #
+ # element has to coerce to a one-element Unicode string. -1 is returned
+ # if there was an error.
+ int PyUnicode_Contains(object container, object element) except -1
+
+ # Intern the argument *string in place. The argument must be the address
+ # of a pointer variable pointing to a Python unicode string object. If
+ # there is an existing interned string that is the same as *string, it sets
+ # *string to it (decrementing the reference count of the old string object
+ # and incrementing the reference count of the interned string object),
+ # otherwise it leaves *string alone and interns it (incrementing its reference
+ # count). (Clarification: even though there is a lot of talk about reference
+ # counts, think of this function as reference-count-neutral; you own the object
+ # after the call if and only if you owned it before the call.)
+ #void PyUnicode_InternInPlace(PyObject **string)
+
+ # A combination of PyUnicode_FromString() and PyUnicode_InternInPlace(),
+ # returning either a new unicode string object that has been interned, or
+ # a new ("owned") reference to an earlier interned string object with the
+ # same value.
+ unicode PyUnicode_InternFromString(const char *v)
+
+
# Codecs
# Create a Unicode object by decoding size bytes of the encoded
@@ -161,22 +286,22 @@ cdef extern from *:
# Create a Unicode object by decoding size bytes of the UTF-8
# encoded string s. Return NULL if an exception was raised by the
# codec.
- object PyUnicode_DecodeUTF8(char *s, Py_ssize_t size, char *errors)
+ unicode PyUnicode_DecodeUTF8(char *s, Py_ssize_t size, char *errors)
# If consumed is NULL, behave like PyUnicode_DecodeUTF8(). If
# consumed is not NULL, trailing incomplete UTF-8 byte sequences
# will not be treated as an error. Those bytes will not be decoded
# and the number of bytes that have been decoded will be stored in
# consumed. New in version 2.4.
- object PyUnicode_DecodeUTF8Stateful(char *s, Py_ssize_t size, char *errors, Py_ssize_t *consumed)
+ unicode PyUnicode_DecodeUTF8Stateful(char *s, Py_ssize_t size, char *errors, Py_ssize_t *consumed)
# Encode the Py_UNICODE buffer of the given size using UTF-8 and
# return a Python string object. Return NULL if an exception was
# raised by the codec.
- object PyUnicode_EncodeUTF8(Py_UNICODE *s, Py_ssize_t size, char *errors)
+ bytes PyUnicode_EncodeUTF8(Py_UNICODE *s, Py_ssize_t size, char *errors)
# Encode a Unicode objects using UTF-8 and return the result as Python string object. Error handling is ``strict''. Return NULL if an exception was raised by the codec.
- object PyUnicode_AsUTF8String(object unicode)
+ bytes PyUnicode_AsUTF8String(object unicode)
# These are the UTF-16 codec APIs:
@@ -198,7 +323,7 @@ cdef extern from *:
# order at the.
#
# If byteorder is NULL, the codec starts in native order mode.
- object PyUnicode_DecodeUTF16(char *s, Py_ssize_t size, char *errors, int *byteorder)
+ unicode PyUnicode_DecodeUTF16(char *s, Py_ssize_t size, char *errors, int *byteorder)
# If consumed is NULL, behave like PyUnicode_DecodeUTF16(). If
# consumed is not NULL, PyUnicode_DecodeUTF16Stateful() will not
@@ -206,7 +331,7 @@ cdef extern from *:
# number of bytes or a split surrogate pair) as an error. Those
# bytes will not be decoded and the number of bytes that have been
# decoded will be stored in consumed. New in version 2.4.
- object PyUnicode_DecodeUTF16Stateful(char *s, Py_ssize_t size, char *errors, int *byteorder, Py_ssize_t *consumed)
+ unicode PyUnicode_DecodeUTF16Stateful(char *s, Py_ssize_t size, char *errors, int *byteorder, Py_ssize_t *consumed)
# Return a Python string object holding the UTF-16 encoded value
# of the Unicode data in s. If byteorder is not 0, output is
@@ -223,13 +348,13 @@ cdef extern from *:
# If Py_UNICODE_WIDE is defined, a single Py_UNICODE value may get
# represented as a surrogate pair. If it is not defined, each
# Py_UNICODE values is interpreted as an UCS-2 character.
- object PyUnicode_EncodeUTF16(Py_UNICODE *s, Py_ssize_t size, char *errors, int byteorder)
+ bytes PyUnicode_EncodeUTF16(Py_UNICODE *s, Py_ssize_t size, char *errors, int byteorder)
# Return a Python string using the UTF-16 encoding in native byte
# order. The string always starts with a BOM mark. Error handling
# is ``strict''. Return NULL if an exception was raised by the
# codec.
- object PyUnicode_AsUTF16String(object unicode)
+ bytes PyUnicode_AsUTF16String(object unicode)
# These are the ``Unicode Escape'' codec APIs:
@@ -270,17 +395,17 @@ cdef extern from *:
# Create a Unicode object by decoding size bytes of the Latin-1
# encoded string s. Return NULL if an exception was raised by the
# codec.
- object PyUnicode_DecodeLatin1(char *s, Py_ssize_t size, char *errors)
+ unicode PyUnicode_DecodeLatin1(char *s, Py_ssize_t size, char *errors)
# Encode the Py_UNICODE buffer of the given size using Latin-1 and
- # return a Python string object. Return NULL if an exception was
+ # return a Python bytes object. Return NULL if an exception was
# raised by the codec.
- object PyUnicode_EncodeLatin1(Py_UNICODE *s, Py_ssize_t size, char *errors)
+ bytes PyUnicode_EncodeLatin1(Py_UNICODE *s, Py_ssize_t size, char *errors)
# Encode a Unicode objects using Latin-1 and return the result as
- # Python string object. Error handling is ``strict''. Return NULL
+ # Python bytes object. Error handling is ``strict''. Return NULL
# if an exception was raised by the codec.
- object PyUnicode_AsLatin1String(object unicode)
+ bytes PyUnicode_AsLatin1String(object unicode)
# These are the ASCII codec APIs. Only 7-bit ASCII data is
# accepted. All other codes generate errors.
@@ -288,17 +413,17 @@ cdef extern from *:
# Create a Unicode object by decoding size bytes of the ASCII
# encoded string s. Return NULL if an exception was raised by the
# codec.
- object PyUnicode_DecodeASCII(char *s, Py_ssize_t size, char *errors)
+ unicode PyUnicode_DecodeASCII(char *s, Py_ssize_t size, char *errors)
# Encode the Py_UNICODE buffer of the given size using ASCII and
- # return a Python string object. Return NULL if an exception was
+ # return a Python bytes object. Return NULL if an exception was
# raised by the codec.
- object PyUnicode_EncodeASCII(Py_UNICODE *s, Py_ssize_t size, char *errors)
+ bytes PyUnicode_EncodeASCII(Py_UNICODE *s, Py_ssize_t size, char *errors)
# Encode a Unicode objects using ASCII and return the result as
- # Python string object. Error handling is ``strict''. Return NULL
+ # Python bytes object. Error handling is ``strict''. Return NULL
# if an exception was raised by the codec.
- object PyUnicode_AsASCIIString(object o)
+ bytes PyUnicode_AsASCIIString(object o)
# These are the mapping codec APIs:
#
@@ -339,6 +464,8 @@ cdef extern from *:
# Encode the Py_UNICODE buffer of the given size using the given
# mapping object and return a Python string object. Return NULL if
# an exception was raised by the codec.
+ #
+ # Deprecated since version 3.3, will be removed in version 4.0.
object PyUnicode_EncodeCharmap(Py_UNICODE *s, Py_ssize_t size, object mapping, char *errors)
# Encode a Unicode objects using the given mapping object and
@@ -359,6 +486,8 @@ cdef extern from *:
# dictionaries and sequences work well. Unmapped character
# ordinals (ones which cause a LookupError) are left untouched and
# are copied as-is.
+ #
+ # Deprecated since version 3.3, will be removed in version 4.0.
object PyUnicode_TranslateCharmap(Py_UNICODE *s, Py_ssize_t size,
object table, char *errors)
@@ -371,21 +500,28 @@ cdef extern from *:
# Create a Unicode object by decoding size bytes of the MBCS
# encoded string s. Return NULL if an exception was raised by the
# codec.
- object PyUnicode_DecodeMBCS(char *s, Py_ssize_t size, char *errors)
+ unicode PyUnicode_DecodeMBCS(char *s, Py_ssize_t size, char *errors)
# If consumed is NULL, behave like PyUnicode_DecodeMBCS(). If
# consumed is not NULL, PyUnicode_DecodeMBCSStateful() will not
# decode trailing lead byte and the number of bytes that have been
# decoded will be stored in consumed. New in version 2.5.
# NOTE: Python 2.x uses 'int' values for 'size' and 'consumed' (changed in 3.0)
- object PyUnicode_DecodeMBCSStateful(char *s, Py_ssize_t size, char *errors, Py_ssize_t *consumed)
+ unicode PyUnicode_DecodeMBCSStateful(char *s, Py_ssize_t size, char *errors, Py_ssize_t *consumed)
# Encode the Py_UNICODE buffer of the given size using MBCS and
# return a Python string object. Return NULL if an exception was
# raised by the codec.
- object PyUnicode_EncodeMBCS(Py_UNICODE *s, Py_ssize_t size, char *errors)
+ bytes PyUnicode_EncodeMBCS(Py_UNICODE *s, Py_ssize_t size, char *errors)
# Encode a Unicode objects using MBCS and return the result as
# Python string object. Error handling is ``strict''. Return NULL
# if an exception was raised by the codec.
- object PyUnicode_AsMBCSString(object o)
+ bytes PyUnicode_AsMBCSString(object o)
+
+ # Encode the Unicode object using the specified code page and return
+ # a Python bytes object. Return NULL if an exception was raised by the
+ # codec. Use CP_ACP code page to get the MBCS encoder.
+ #
+ # New in version 3.3.
+ bytes PyUnicode_EncodeCodePage(int code_page, object unicode, const char *errors)