Marc-Andre Lemburg <mal@lemburg.com>:

API documentation for Unicode support from C.
author: Fred Drake <fdrake@acm.org> 2000-04-06 14:10:29 (GMT)
committer: Fred Drake <fdrake@acm.org> 2000-04-06 14:10:29 (GMT)
commit: a4cd2611f4d6848e7ca1a2de22abafd03c5c26c4 (patch)
tree: 194b2abdd8f3eed112b0e9af4d1f80d8df21fbfc /Doc
parent: 8b3ce9e099bca495029bdfdd42f4cb6602c5b621 (diff)
download: cpython-a4cd2611f4d6848e7ca1a2de22abafd03c5c26c4.zip
cpython-a4cd2611f4d6848e7ca1a2de22abafd03c5c26c4.tar.gz
cpython-a4cd2611f4d6848e7ca1a2de22abafd03c5c26c4.tar.bz2
1 files changed, 712 insertions, 0 deletions
diff --git a/Doc/api/api.tex b/Doc/api/api.tex
index 55a801d..d9e7fea 100644
--- a/Doc/api/api.tex
+++ b/Doc/api/api.tex
@@ -1899,6 +1899,718 @@ interned string object with the same value.
 \end{cfuncdesc}
 
 
+\subsection{Unicode Objects \label{unicodeObjects}}
+\sectionauthor{Marc-Andre Lemburg}{mal@lemburg.com}
+
+%--- Unicode Type -------------------------------------------------------
+
+These are the basic Unicode object types used for the Unicode
+implementation in Python:
+
+\begin{ctypedesc}{Py_UNICODE}
+This type represents a 16-bit unsigned storage type which is used by
+Python internally as basis for holding Unicode ordinals. On platforms
+where \ctype{wchar_t} is available and also has 16 bits,
+\ctype{Py_UNICODE} is a typedef alias for \ctype{wchar_t} to enhance
+native platform compatibility. On all other platforms,
+\ctype{Py_UNICODE} is a typedef alias for \ctype{unsigned short}.
+\end{ctypedesc}
+
+\begin{ctypedesc}{PyUnicodeObject}
+This subtype of \ctype{PyObject} represents a Python Unicode object.
+\end{ctypedesc}
+
+\begin{cvardesc}{PyTypeObject}{PyUnicode_Type}
+This instance of \ctype{PyTypeObject} represents the Python Unicode type.
+\end{cvardesc}
+
+%--- These are really C macros... is there a macrodesc TeX macro ?
+
+The following APIs are really C macros and can be used to do fast
+checks and to access internal read-only data of Unicode objects:
+
+\begin{cfuncdesc}{int}{PyUnicode_Check}{PyObject *o}
+Returns true if the object \var{o} is a Unicode object.
+\end{cfuncdesc}
+
+\begin{cfuncdesc}{int}{PyUnicode_GET_SIZE}{PyObject *o}
+Returns the size of the object.  \var{o} has to be a
+\ctype{PyUnicodeObject} (not checked).
+\end{cfuncdesc}
+
+\begin{cfuncdesc}{int}{PyUnicode_GET_DATA_SIZE}{PyObject *o}
+Returns the size of the object's internal buffer in bytes. \var{o} has
+to be a \ctype{PyUnicodeObject} (not checked).
+\end{cfuncdesc}
+
+\begin{cfuncdesc}{Py_UNICODE *}{PyUnicode_AS_UNICODE}{PyObject *o}
+Returns a pointer to the internal \ctype{Py_UNICODE} buffer of the
+object. \var{o} has to be a \ctype{PyUnicodeObject} (not checked).
+\end{cfuncdesc}
+
+\begin{cfuncdesc}{const char *}{PyUnicode_AS_DATA}{PyObject *o}
+Returns a (\ctype{const char *}) pointer to the internal buffer of the
+object. \var{o} has to be a \ctype{PyUnicodeObject} (not checked).
+\end{cfuncdesc}
+
+% --- Unicode character properties ---------------------------------------
+
+Unicode provides many different character properties. The most often
+needed ones are available through these macros which are mapped to C
+functions depending on the Python configuration.
+
+\begin{cfuncdesc}{int}{Py_UNICODE_ISSPACE}{Py_UNICODE ch}
+Returns 1/0 depending on whether \var{ch} is a whitespace character.
+\end{cfuncdesc}
+
+\begin{cfuncdesc}{int}{Py_UNICODE_ISLOWER}{Py_UNICODE ch}
+Returns 1/0 depending on whether \var{ch} is a lowercase character.
+\end{cfuncdesc}
+
+\begin{cfuncdesc}{int}{Py_UNICODE_ISUPPER}{Py_UNICODE ch}
+Returns 1/0 depending on whether \var{ch} is an uppercase character.
+\end{cfuncdesc}
+
+\begin{cfuncdesc}{int}{Py_UNICODE_ISTITLE}{Py_UNICODE ch}
+Returns 1/0 depending on whether \var{ch} is a titlecase character.
+\end{cfuncdesc}
+
+\begin{cfuncdesc}{int}{Py_UNICODE_ISLINEBREAK}{Py_UNICODE ch}
+Returns 1/0 depending on whether \var{ch} is a linebreak character.
+\end{cfuncdesc}
+
+\begin{cfuncdesc}{int}{Py_UNICODE_ISDECIMAL}{Py_UNICODE ch}
+Returns 1/0 depending on whether \var{ch} is a decimal character.
+\end{cfuncdesc}
+
+\begin{cfuncdesc}{int}{Py_UNICODE_ISDIGIT}{Py_UNICODE ch}
+Returns 1/0 depending on whether \var{ch} is a digit character.
+\end{cfuncdesc}
+
+\begin{cfuncdesc}{int}{Py_UNICODE_ISNUMERIC}{Py_UNICODE ch}
+Returns 1/0 depending on whether \var{ch} is a numeric character.
+\end{cfuncdesc}
+
+These APIs can be used for fast direct character conversions:
+
+\begin{cfuncdesc}{Py_UNICODE}{Py_UNICODE_TOLOWER}{Py_UNICODE ch}
+Returns the character \var{ch} converted to lower case.
+\end{cfuncdesc}
+
+\begin{cfuncdesc}{Py_UNICODE}{Py_UNICODE_TOUPPER}{Py_UNICODE ch}
+Returns the character \var{ch} converted to upper case.
+\end{cfuncdesc}
+
+\begin{cfuncdesc}{Py_UNICODE}{Py_UNICODE_TOTITLE}{Py_UNICODE ch}
+Returns the character \var{ch} converted to title case.
+\end{cfuncdesc}
+
+\begin{cfuncdesc}{int}{Py_UNICODE_TODECIMAL}{Py_UNICODE ch}
+Returns the character \var{ch} converted to a decimal positive integer.
+Returns -1 in case this is not possible. Does not raise exceptions.
+\end{cfuncdesc}
+
+\begin{cfuncdesc}{int}{Py_UNICODE_TODIGIT}{Py_UNICODE ch}
+Returns the character \var{ch} converted to a single digit integer.
+Returns -1 in case this is not possible. Does not raise exceptions.
+\end{cfuncdesc}
+
+\begin{cfuncdesc}{double}{Py_UNICODE_TONUMERIC}{Py_UNICODE ch}
+Returns the character \var{ch} converted to a (positive) double.
+Returns -1.0 in case this is not possible. Does not raise exceptions.
+\end{cfuncdesc}
+
+% --- Plain Py_UNICODE ---------------------------------------------------
+
+To create Unicode objects and access their basic sequence properties,
+use these APIs:
+
+\begin{cfuncdesc}{PyObject*}{PyUnicode_FromUnicode}{const Py_UNICODE *u,
+                                                    int size} 
+
+Create a Unicode Object from the \ctype{Py_UNICODE} buffer \var{u} of the
+given size. \var{u} may be \NULL{} which causes the contents to be
+undefined. It is the user's responsibility to fill in the needed data.
+The buffer is copied into the new object.
+\end{cfuncdesc}
+
+\begin{cfuncdesc}{Py_UNICODE *}{PyUnicode_AsUnicode}{PyObject *unicode}
+Return a read-only pointer to the Unicode object's internal
+\ctype{Py_UNICODE} buffer.
+\end{cfuncdesc}
+
+\begin{cfuncdesc}{int}{PyUnicode_GetSize}{PyObject *unicode}
+Return the length of the Unicode object.
+\end{cfuncdesc}
+
+\begin{cfuncdesc}{PyObject*}{PyUnicode_FromObject}{PyObject *obj}
+
+Coerce \var{obj} to a Unicode object and return a reference with
+incremented refcount.
+
+Coercion is done in the following way:
+\begin{enumerate}
+\item  Unicode objects are passed back as-is with incremented
+      refcount.
+
+\item String and other char buffer compatible objects are decoded
+      under the assumptions that they contain UTF-8 data. Decoding
+      is done in ``strict'' mode.
+
+\item All other objects raise an exception.
+\end{enumerate}
+The API returns \NULL{} in case of an error. The caller is responsible
+for decref'ing the returned objects.
+\end{cfuncdesc}
+
+% --- wchar_t support for platforms which support it ---------------------
+
+If the platform supports \ctype{wchar_t} and provides a header file
+\file{wchar.h}, Python can interface directly to this type using the
+following functions. Support is optimized if Python's own
+\ctype{Py_UNICODE} type is identical to the system's \ctype{wchar_t}.
+
+\begin{cfuncdesc}{PyObject*}{PyUnicode_FromWideChar}{const wchar_t *w,
+                                                     int size}
+Create a Unicode Object from the \ctype{wchar_t} buffer \var{w} of the
+given size. Returns \NULL{} on failure.
+\end{cfuncdesc}
+
+\begin{cfuncdesc}{int}{PyUnicode_AsWideChar}{PyUnicodeObject *unicode,
+                                             wchar_t *w,
+                                             int size}
+
+Copies the Unicode Object contents into the \ctype{wchar_t} buffer
+\var{w}.  At most \var{size} \ctype{wchar_t} characters are copied.
+Returns the number of \ctype{wchar_t} characters copied or -1 in case
+of an error.
+\end{cfuncdesc}
+
+
+\subsubsection{Builtin Codecs \label{builtinCodecs}}
+
+Python provides a set of builtin codecs which are written in C
+for speed. All of these codecs are directly usable via the
+following functions.
+
+Many of the following APIs take two arguments, \var{encoding} and
+\var{errors}. These parameters have the same semantics as the ones
+of the builtin \function{unicode()} Unicode object constructor.
+
+Setting \var{encoding} to \NULL{} causes the default encoding to be
+used, which is UTF-8.
+
+Error handling is set by \var{errors}, which may also be set to \NULL{},
+meaning to use the default handling defined for the codec. Default error
+handling for all builtin codecs is ``strict'' (ValueErrors are raised).
+
+The codecs all use a similar interface. For simplicity, only deviations
+from the following generic ones are documented.
+
+% --- Generic Codecs -----------------------------------------------------
+
+These are the generic codec APIs:
+
+\begin{cfuncdesc}{PyObject*}{PyUnicode_Decode}{const char *s,
+                                               int size,
+                                               const char *encoding,
+                                               const char *errors}
+
+Create a Unicode object by decoding \var{size} bytes of the encoded
+string \var{s}. \var{encoding} and \var{errors} have the same meaning
+as the parameters of the same name in the unicode() builtin
+function. The codec to be used is looked up using the Python codec
+registry. Returns \NULL{} in case an exception was raised by the
+codec.
+\end{cfuncdesc}
+
+\begin{cfuncdesc}{PyObject*}{PyUnicode_Encode}{const Py_UNICODE *s,
+                                               int size,
+                                               const char *encoding,
+                                               const char *errors}
+
+Encodes the \ctype{Py_UNICODE} buffer of the given size and returns a
+Python string object. \var{encoding} and \var{errors} have the same
+meaning as the parameters of the same name in the Unicode .encode()
+method. The codec to be used is looked up using the Python codec
+registry. Returns \NULL{} in case an exception was raised by the
+codec.
+\end{cfuncdesc}
+
+\begin{cfuncdesc}{PyObject*}{PyUnicode_AsEncodedString}{PyObject *unicode,
+                                               const char *encoding,
+                                               const char *errors}
+
+Encodes a Unicode object and returns the result as Python string
+object. \var{encoding} and \var{errors} have the same meaning as the
+parameters of the same name in the Unicode .encode() method. The codec
+to be used is looked up using the Python codec registry. Returns
+\NULL{} in case an exception was raised by the codec.
+\end{cfuncdesc}
+
+% --- UTF-8 Codecs -------------------------------------------------------
+
+These are the UTF-8 codec APIs:
+
+\begin{cfuncdesc}{PyObject*}{PyUnicode_DecodeUTF8}{const char *s,
+                                               int size,
+                                               const char *errors}
+
+Creates a Unicode object by decoding \var{size} bytes of the UTF-8
+encoded string \var{s}. Returns \NULL{} in case an exception was
+raised by the codec.
+\end{cfuncdesc}
+
+\begin{cfuncdesc}{PyObject*}{PyUnicode_EncodeUTF8}{const Py_UNICODE *s,
+                                               int size,
+                                               const char *errors}
+
+Encodes the \ctype{Py_UNICODE} buffer of the given size using UTF-8
+and returns a Python string object.  Returns \NULL{} in case an
+exception was raised by the codec.
+\end{cfuncdesc}
+
+\begin{cfuncdesc}{PyObject*}{PyUnicode_AsUTF8String}{PyObject *unicode}
+
+Encodes a Unicode object using UTF-8 and returns the result as Python
+string object. Error handling is ``strict''. Returns
+\NULL{} in case an exception was raised by the codec.
+\end{cfuncdesc}
+
+% --- UTF-16 Codecs ------------------------------------------------------ */
+
+These are the UTF-16 codec APIs:
+
+\begin{cfuncdesc}{PyObject*}{PyUnicode_DecodeUTF16}{const char *s,
+                                               int size,
+                                               const char *errors,
+                                               int *byteorder}
+
+Decodes \var{size} bytes from a UTF-16 encoded buffer string and
+returns the corresponding Unicode object.
+
+\var{errors} (if non-\NULL{}) defines the error handling. It defaults
+to ``strict''.
+
+If \var{byteorder} is non-\NULL{}, the decoder starts decoding using
+the given byte order:
+
+\begin{verbatim}
+   *byteorder == -1: little endian
+   *byteorder == 0:  native order
+   *byteorder == 1:  big endian
+\end{verbatim}
+
+and then switches according to all byte order marks (BOM) it finds in
+the input data. BOM marks are not copied into the resulting Unicode
+string.  After completion, \var{*byteorder} is set to the current byte
+order at the end of input data.
+
+If \var{byteorder} is \NULL{}, the codec starts in native order mode.
+
+Returns \NULL{} in case an exception was raised by the codec.
+\end{cfuncdesc}
+
+\begin{cfuncdesc}{PyObject*}{PyUnicode_EncodeUTF16}{const Py_UNICODE *s,
+                                               int size,
+                                               const char *errors,
+                                               int byteorder}
+
+Returns a Python string object holding the UTF-16 encoded value of the
+Unicode data in \var{s}.
+
+Output is written according to the following byte order, as selected
+by \var{byteorder}:
+
+\begin{verbatim}
+   byteorder == -1: little endian
+   byteorder == 0:  native byte order (writes a BOM mark)
+   byteorder == 1:  big endian
+\end{verbatim}
+
+If \var{byteorder} is 0, the output string will always start with the
+Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
+prepended.
+
+Note that \ctype{Py_UNICODE} data is being interpreted as UTF-16
+reduced to UCS-2. This trick makes it possible to add full UTF-16
+capabilities at a later point without compromising the APIs.
+
+Returns \NULL{} in case an exception was raised by the codec.
+\end{cfuncdesc}
+
+\begin{cfuncdesc}{PyObject*}{PyUnicode_AsUTF16String}{PyObject *unicode}
+
+Returns a Python string using the UTF-16 encoding in native byte
+order. The string always starts with a BOM mark. Error handling is
+``strict''. Returns \NULL{} in case an exception was raised by the
+codec.
+\end{cfuncdesc}
+
+% --- Unicode-Escape Codecs ----------------------------------------------
+
+These are the ``Unicode Escape'' codec APIs:
+
+\begin{cfuncdesc}{PyObject*}{PyUnicode_DecodeUnicodeEscape}{const char *s,
+                                               int size,
+                                               const char *errors}
+
+Creates a Unicode object by decoding \var{size} bytes of the Unicode-Escape
+encoded string \var{s}. Returns \NULL{} in case an exception was
+raised by the codec.
+\end{cfuncdesc}
+
+\begin{cfuncdesc}{PyObject*}{PyUnicode_EncodeUnicodeEscape}{const Py_UNICODE *s,
+                                               int size,
+                                               const char *errors}
+
+Encodes the \ctype{Py_UNICODE} buffer of the given size using Unicode-Escape
+and returns a Python string object.  Returns \NULL{} in case an
+exception was raised by the codec.
+\end{cfuncdesc}
+
+\begin{cfuncdesc}{PyObject*}{PyUnicode_AsUnicodeEscapeString}{PyObject *unicode}
+
+Encodes a Unicode object using Unicode-Escape and returns the result
+as Python string object. Error handling is ``strict''. Returns
+\NULL{} in case an exception was raised by the codec.
+\end{cfuncdesc}
+
+% --- Raw-Unicode-Escape Codecs ------------------------------------------
+
+These are the ``Raw Unicode Escape'' codec APIs:
+
+\begin{cfuncdesc}{PyObject*}{PyUnicode_DecodeRawUnicodeEscape}{const char *s,
+                                               int size,
+                                               const char *errors}
+
+Creates a Unicode object by decoding \var{size} bytes of the Raw-Unicode-Escape
+encoded string \var{s}. Returns \NULL{} in case an exception was
+raised by the codec.
+\end{cfuncdesc}
+
+\begin{cfuncdesc}{PyObject*}{PyUnicode_EncodeRawUnicodeEscape}{const Py_UNICODE *s,
+                                               int size,
+                                               const char *errors}
+
+Encodes the \ctype{Py_UNICODE} buffer of the given size using Raw-Unicode-Escape
+and returns a Python string object.  Returns \NULL{} in case an
+exception was raised by the codec.
+\end{cfuncdesc}
+
+\begin{cfuncdesc}{PyObject*}{PyUnicode_AsRawUnicodeEscapeString}{PyObject *unicode}
+
+Encodes a Unicode object using Raw-Unicode-Escape and returns the result
+as Python string object. Error handling is ``strict''. Returns
+\NULL{} in case an exception was raised by the codec.
+\end{cfuncdesc}
+
+% --- Latin-1 Codecs ----------------------------------------------------- 
+
+These are the Latin-1 codec APIs:
+
+Latin-1 corresponds to the first 256 Unicode ordinals and only these
+are accepted by the codecs during encoding.
+
+\begin{cfuncdesc}{PyObject*}{PyUnicode_DecodeLatin1}{const char *s,
+                                               int size,
+                                               const char *errors}
+
+Creates a Unicode object by decoding \var{size} bytes of the Latin-1
+encoded string \var{s}. Returns \NULL{} in case an exception was
+raised by the codec.
+\end{cfuncdesc}
+
+\begin{cfuncdesc}{PyObject*}{PyUnicode_EncodeLatin1}{const Py_UNICODE *s,
+                                               int size,
+                                               const char *errors}
+
+Encodes the \ctype{Py_UNICODE} buffer of the given size using Latin-1
+and returns a Python string object.  Returns \NULL{} in case an
+exception was raised by the codec.
+\end{cfuncdesc}
+
+\begin{cfuncdesc}{PyObject*}{PyUnicode_AsLatin1String}{PyObject *unicode}
+
+Encodes a Unicode object using Latin-1 and returns the result as
+Python string object. Error handling is ``strict''. Returns
+\NULL{} in case an exception was raised by the codec.
+\end{cfuncdesc}
+
+% --- ASCII Codecs ------------------------------------------------------- 
+
+These are the ASCII codec APIs:
+
+Only 7-bit ASCII data is accepted. All other codes generate errors.
+
+\begin{cfuncdesc}{PyObject*}{PyUnicode_DecodeASCII}{const char *s,
+                                               int size,
+                                               const char *errors}
+
+Creates a Unicode object by decoding \var{size} bytes of the ASCII
+encoded string \var{s}. Returns \NULL{} in case an exception was
+raised by the codec.
+\end{cfuncdesc}
+
+\begin{cfuncdesc}{PyObject*}{PyUnicode_EncodeASCII}{const Py_UNICODE *s,
+                                               int size,
+                                               const char *errors}
+
+Encodes the \ctype{Py_UNICODE} buffer of the given size using ASCII
+and returns a Python string object.  Returns \NULL{} in case an
+exception was raised by the codec.
+\end{cfuncdesc}
+
+\begin{cfuncdesc}{PyObject*}{PyUnicode_AsASCIIString}{PyObject *unicode}
+
+Encodes a Unicode object using ASCII and returns the result as Python
+string object. Error handling is ``strict''. Returns
+\NULL{} in case an exception was raised by the codec.
+\end{cfuncdesc}
+
+% --- Character Map Codecs ----------------------------------------------- 
+
+These are the mapping codec APIs:
+
+This codec is special in that it can be used to implement many
+different codecs (and this is in fact what was done to obtain most of
+the standard codecs included in the \module{encodings} package). The
+codec uses mappings to encode and decode characters.
+
+Decoding mappings must map single string characters to single Unicode
+characters, integers (which are then interpreted as Unicode ordinals)
+or None (meaning ``undefined mapping'' and causing an error).
+
+Encoding mappings must map single Unicode characters to single string
+characters, integers (which are then interpreted as Latin-1 ordinals)
+or None (meaning ``undefined mapping'' and causing an error).
+
+The mapping objects provided must only support the
+\method{__getitem__} mapping interface.
+
+If a character lookup fails with a LookupError, the character is
+copied as-is meaning that its ordinal value will be interpreted as
+Unicode or Latin-1 ordinal resp. Because of this, mappings only need
+to contain those mappings which map characters to different code
+points.
+
+\begin{cfuncdesc}{PyObject*}{PyUnicode_DecodeCharmap}{const char *s,
+                                               int size,
+                                               PyObject *mapping,
+                                               const char *errors}
+
+Creates a Unicode object by decoding \var{size} bytes of the encoded
+string \var{s} using the given \var{mapping} object.  Returns \NULL{}
+in case an exception was raised by the codec.
+\end{cfuncdesc}
+
+\begin{cfuncdesc}{PyObject*}{PyUnicode_EncodeCharmap}{const Py_UNICODE *s,
+                                               int size,
+                                               PyObject *mapping,
+                                               const char *errors}
+
+Encodes the \ctype{Py_UNICODE} buffer of the given size using the
+given \var{mapping} object and returns a Python string object.
+Returns \NULL{} in case an exception was raised by the codec.
+\end{cfuncdesc}
+
+\begin{cfuncdesc}{PyObject*}{PyUnicode_AsCharmapString}{PyObject *unicode,
+                                                        PyObject *mapping}
+
+Encodes a Unicode object using the given \var{mapping} object and
+returns the result as Python string object. Error handling is
+``strict''. Returns \NULL{} in case an exception was raised by the
+codec.
+\end{cfuncdesc}
+
+The following codec API is special in that it maps Unicode to Unicode.
+
+\begin{cfuncdesc}{PyObject*}{PyUnicode_TranslateCharmap}{const Py_UNICODE *s,
+                                               int size,
+                                               PyObject *table,
+                                               const char *errors}
+
+Translates a \ctype{Py_UNICODE} buffer of the given length by applying
+a character mapping \var{table} to it and returns the resulting
+Unicode object.
+
+The mapping \var{table} must map Unicode ordinal integers to Unicode
+ordinal integers or None (causing deletion of the character).
+
+Mapping tables must only provide the \method{__getitem__} interface,
+e.g. dictionaries or sequences. Unmapped character ordinals (ones
+which cause a LookupError) are left untouched and are copied as-is.
+
+Returns \NULL{} in case an exception was raised by the codec.
+\end{cfuncdesc}
+
+% --- MBCS codecs for Windows --------------------------------------------
+
+These are the MBCS codec APIs. They are currently only available on
+Windows and use the Win32 MBCS converters to implement the
+conversions.
+
+Note that MBCS (or DBCS) is a class of encodings, not just one.  The
+target encoding is defined by the user settings on the machine running
+the codec.
+
+\begin{cfuncdesc}{PyObject*}{PyUnicode_DecodeMBCS}{const char *s,
+                                               int size,
+                                               const char *errors}
+
+Creates a Unicode object by decoding \var{size} bytes of the MBCS
+encoded string \var{s}. Returns \NULL{} in case an exception was
+raised by the codec.
+\end{cfuncdesc}
+
+\begin{cfuncdesc}{PyObject*}{PyUnicode_EncodeMBCS}{const Py_UNICODE *s,
+                                               int size,
+                                               const char *errors}
+
+Encodes the \ctype{Py_UNICODE} buffer of the given size using MBCS
+and returns a Python string object.  Returns \NULL{} in case an
+exception was raised by the codec.
+\end{cfuncdesc}
+
+\begin{cfuncdesc}{PyObject*}{PyUnicode_AsMBCSString}{PyObject *unicode}
+
+Encodes a Unicode object using MBCS and returns the result as Python
+string object. Error handling is ``strict''. Returns
+\NULL{} in case an exception was raised by the codec.
+\end{cfuncdesc}
+
+% --- Methods & Slots ----------------------------------------------------
+
+\subsubsection{Methods and Slot Functions \label{unicodeMethodsAndSlots}}
+
+The following APIs are capable of handling Unicode objects and strings
+on input (we refer to them as strings in the descriptions) and return
+Unicode objects or integers as appropriate.
+
+They all return \NULL{} or -1 in case an exception occurs.
+
+\begin{cfuncdesc}{PyObject*}{PyUnicode_Concat}{PyObject *left,
+                                               PyObject *right}
+
+Concatenate two strings giving a new Unicode string.
+\end{cfuncdesc}
+
+\begin{cfuncdesc}{PyObject*}{PyUnicode_Split}{PyObject *s,
+                                              PyObject *sep,
+                                              int maxsplit}
+
+Split a string giving a list of Unicode strings.
+
+If \var{sep} is \NULL{}, splitting will be done at all whitespace
+substrings. Otherwise, splits occur at the given separator.
+
+At most \var{maxsplit} splits will be done. If negative, no limit is set.
+
+Separators are not included in the resulting list.
+\end{cfuncdesc}
+
+\begin{cfuncdesc}{PyObject*}{PyUnicode_Splitlines}{PyObject *s,
+                                                   int maxsplit}
+
+Split a string at line breaks, giving a list of Unicode strings.
+
+CRLF is considered to be one line break. Line breaks are not
+included in the resulting list.
+\end{cfuncdesc}
+
+\begin{cfuncdesc}{PyObject*}{PyUnicode_Translate}{PyObject *str,
+                                                  PyObject *table,
+                                                  const char *errors}
+
+Translate a string by applying a character mapping table to it and
+return the resulting Unicode object.
+
+The mapping table must map Unicode ordinal integers to Unicode ordinal
+integers or None (causing deletion of the character).
+
+Mapping tables must only provide the \method{__getitem__} interface,
+e.g. dictionaries or sequences. Unmapped character ordinals (ones
+which cause a LookupError) are left untouched and are copied as-is.
+
+\var{errors} has the usual meaning for codecs. It may be \NULL{}
+which indicates to use the default error handling.
+
+\end{cfuncdesc}
+
+\begin{cfuncdesc}{PyObject*}{PyUnicode_Join}{PyObject *separator,
+                                             PyObject *seq}
+
+Join a sequence of strings using the given separator and return
+the resulting Unicode string.
+\end{cfuncdesc}
+
+\begin{cfuncdesc}{int}{PyUnicode_Tailmatch}{PyObject *str,
+                                                  PyObject *substr,
+                                                  int start,
+                                                  int end,
+                                                  int direction}
+
+Return 1 if \var{substr} matches \var{str}[\var{start}:\var{end}] at
+the given tail end (\var{direction} == -1 means to do a prefix match,
+\var{direction} == 1 a suffix match), 0 otherwise.
+\end{cfuncdesc}
+
+\begin{cfuncdesc}{int}{PyUnicode_Find}{PyObject *str,
+                                                  PyObject *substr,
+                                                  int start,
+                                                  int end,
+                                                  int direction}
+
+Return the first position of \var{substr} in
+\var{str}[\var{start}:\var{end}] using the given \var{direction}
+(\var{direction} == 1 means to do a forward search,
+\var{direction} == -1 a backward search), or -1 if no match is found.
+\end{cfuncdesc}
+
+\begin{cfuncdesc}{int}{PyUnicode_Count}{PyObject *str,
+                                                  PyObject *substr,
+                                                  int start,
+                                                  int end}
+
+Count the number of occurrences of \var{substr} in
+\var{str}[\var{start}:\var{end}].
+\end{cfuncdesc}
+
+\begin{cfuncdesc}{PyObject*}{PyUnicode_Replace}{PyObject *str,
+                                                PyObject *substr,
+                                                PyObject *replstr,
+                                                int maxcount}
+
+Replace at most \var{maxcount} occurrences of \var{substr} in
+\var{str} with \var{replstr} and return the resulting Unicode object.
+\var{maxcount} == -1 means: replace all occurrences.
+\end{cfuncdesc}
+
+\begin{cfuncdesc}{int}{PyUnicode_Compare}{PyObject *left,
+                                                PyObject *right}
+
+Compare two strings and return -1, 0, 1 for less than, equal,
+greater than resp.
+\end{cfuncdesc}
+
+\begin{cfuncdesc}{PyObject*}{PyUnicode_Format}{PyObject *format,
+                                              PyObject *args}
+Returns a new string object from \var{format} and \var{args}.  Analogous
+to \code{\var{format} \% \var{args}}.  The \var{args} argument must be
+a tuple.
+\end{cfuncdesc}
+
+\begin{cfuncdesc}{int}{PyUnicode_Contains}{PyObject *container,
+                                           PyObject *element}
+
+Checks whether \var{element} is contained in \var{container} and
+returns 1/0 accordingly.
+
+\var{element} has to coerce to a one-element Unicode string. -1 is
+returned in case of an error.
+\end{cfuncdesc}
+
+
 \subsection{Buffer Objects \label{bufferObjects}}
 \sectionauthor{Greg Stein}{gstein@lyra.org}
author	Fred Drake <fdrake@acm.org>	2000-04-06 14:10:29 (GMT)
committer	Fred Drake <fdrake@acm.org>	2000-04-06 14:10:29 (GMT)
commit	a4cd2611f4d6848e7ca1a2de22abafd03c5c26c4 (patch)
tree	194b2abdd8f3eed112b0e9af4d1f80d8df21fbfc /Doc
parent	8b3ce9e099bca495029bdfdd42f4cb6602c5b621 (diff)
download	cpython-a4cd2611f4d6848e7ca1a2de22abafd03c5c26c4.zip cpython-a4cd2611f4d6848e7ca1a2de22abafd03c5c26c4.tar.gz cpython-a4cd2611f4d6848e7ca1a2de22abafd03c5c26c4.tar.bz2