diff options
author | Fred Drake <fdrake@acm.org> | 2000-04-06 14:10:29 (GMT) |
---|---|---|
committer | Fred Drake <fdrake@acm.org> | 2000-04-06 14:10:29 (GMT) |
commit | a4cd2611f4d6848e7ca1a2de22abafd03c5c26c4 (patch) | |
tree | 194b2abdd8f3eed112b0e9af4d1f80d8df21fbfc /Doc | |
parent | 8b3ce9e099bca495029bdfdd42f4cb6602c5b621 (diff) | |
download | cpython-a4cd2611f4d6848e7ca1a2de22abafd03c5c26c4.zip cpython-a4cd2611f4d6848e7ca1a2de22abafd03c5c26c4.tar.gz cpython-a4cd2611f4d6848e7ca1a2de22abafd03c5c26c4.tar.bz2 |
Marc-Andre Lemburg <mal@lemburg.com>:
API documentation for Unicode support from C.
Diffstat (limited to 'Doc')
-rw-r--r-- | Doc/api/api.tex | 712 |
1 files changed, 712 insertions, 0 deletions
diff --git a/Doc/api/api.tex b/Doc/api/api.tex index 55a801d..d9e7fea 100644 --- a/Doc/api/api.tex +++ b/Doc/api/api.tex @@ -1899,6 +1899,718 @@ interned string object with the same value. \end{cfuncdesc} +\subsection{Unicode Objects \label{unicodeObjects}} +\sectionauthor{Marc-Andre Lemburg}{mal@lemburg.com} + +%--- Unicode Type ------------------------------------------------------- + +These are the basic Unicode object types used for the Unicode +implementation in Python: + +\begin{ctypedesc}{Py_UNICODE} +This type represents a 16-bit unsigned storage type which is used by +Python internally as basis for holding Unicode ordinals. On platforms +where \ctype{wchar_t} is available and also has 16-bits, +\ctype{Py_UNICODE} is a typedef alias for \ctype{wchar_t} to enhance +native platform compatibility. On all other platforms, +\ctype{Py_UNICODE} is a typedef alias for \ctype{unsigned short}. +\end{ctypedesc} + +\begin{ctypedesc}{PyUnicodeObject} +This subtype of \ctype{PyObject} represents a Python Unicode object. +\end{ctypedesc} + +\begin{cvardesc}{PyTypeObject}{PyUnicode_Type} +This instance of \ctype{PyTypeObject} represents the Python Unicode type. +\end{cvardesc} + +%--- These are really C macros... is there a macrodesc TeX macro ? + +The following APIs are really C macros and can be used to do fast +checks and to access internal read-only data of Unicode objects: + +\begin{cfuncdesc}{int}{PyUnicode_Check}{PyObject *o} +Returns true if the object \var{o} is a Unicode object. +\end{cfuncdesc} + +\begin{cfuncdesc}{int}{PyUnicode_GET_SIZE}{PyObject *o} +Returns the size of the object. o has to be a +PyUnicodeObject (not checked). +\end{cfuncdesc} + +\begin{cfuncdesc}{int}{PyUnicode_GET_DATA_SIZE}{PyObject *o} +Returns the size of the object's internal buffer in bytes. o has to be +a PyUnicodeObject (not checked). +\end{cfuncdesc} + +\begin{cfuncdesc}{int}{PyUnicode_AS_UNICODE}{PyObject *o} +Returns a pointer to the internal Py_UNICODE buffer of the object. 
o +has to be a PyUnicodeObject (not checked). +\end{cfuncdesc} + +\begin{cfuncdesc}{int}{PyUnicode_AS_DATA}{PyObject *o} +Returns a (const char *) pointer to the internal buffer of the object. +o has to be a PyUnicodeObject (not checked). +\end{cfuncdesc} + +% --- Unicode character properties --------------------------------------- + +Unicode provides many different character properties. The most often +needed ones are available through these macros which are mapped to C +functions depending on the Python configuration. + +\begin{cfuncdesc}{int}{Py_UNICODE_ISSPACE}{Py_UNICODE ch} +Returns 1/0 depending on whether \var{ch} is a whitespace character. +\end{cfuncdesc} + +\begin{cfuncdesc}{int}{Py_UNICODE_ISLOWER}{Py_UNICODE ch} +Returns 1/0 depending on whether \var{ch} is a lowercase character. +\end{cfuncdesc} + +\begin{cfuncdesc}{int}{Py_UNICODE_ISUPPER}{Py_UNICODE ch} +Returns 1/0 depending on whether \var{ch} is a uppercase character. +\end{cfuncdesc} + +\begin{cfuncdesc}{int}{Py_UNICODE_ISTITLE}{Py_UNICODE ch} +Returns 1/0 depending on whether \var{ch} is a titlecase character. +\end{cfuncdesc} + +\begin{cfuncdesc}{int}{Py_UNICODE_ISLINEBREAK}{Py_UNICODE ch} +Returns 1/0 depending on whether \var{ch} is a linebreak character. +\end{cfuncdesc} + +\begin{cfuncdesc}{int}{Py_UNICODE_ISDECIMAL}{Py_UNICODE ch} +Returns 1/0 depending on whether \var{ch} is a decimal character. +\end{cfuncdesc} + +\begin{cfuncdesc}{int}{Py_UNICODE_ISDIGIT}{Py_UNICODE ch} +Returns 1/0 depending on whether \var{ch} is a digit character. +\end{cfuncdesc} + +\begin{cfuncdesc}{int}{Py_UNICODE_ISNUMERIC}{Py_UNICODE ch} +Returns 1/0 depending on whether \var{ch} is a numeric character. +\end{cfuncdesc} + +These APIs can be used for fast direct character conversions: + +\begin{cfuncdesc}{Py_UNICODE}{Py_UNICODE_TOLOWER}{Py_UNICODE ch} +Returns the character \var{ch} converted to lower case. 
+\end{cfuncdesc} + +\begin{cfuncdesc}{Py_UNICODE}{Py_UNICODE_TOUPPER}{Py_UNICODE ch} +Returns the character \var{ch} converted to upper case. +\end{cfuncdesc} + +\begin{cfuncdesc}{Py_UNICODE}{Py_UNICODE_TOTITLE}{Py_UNICODE ch} +Returns the character \var{ch} converted to title case. +\end{cfuncdesc} + +\begin{cfuncdesc}{int}{Py_UNICODE_TODECIMAL}{Py_UNICODE ch} +Returns the character \var{ch} converted to a decimal positive integer. +Returns -1 in case this is not possible. Does not raise exceptions. +\end{cfuncdesc} + +\begin{cfuncdesc}{int}{Py_UNICODE_TODIGIT}{Py_UNICODE ch} +Returns the character \var{ch} converted to a single digit integer. +Returns -1 in case this is not possible. Does not raise exceptions. +\end{cfuncdesc} + +\begin{cfuncdesc}{double}{Py_UNICODE_TONUMERIC}{Py_UNICODE ch} +Returns the character \var{ch} converted to a (positive) double. +Returns -1.0 in case this is not possible. Does not raise exceptions. +\end{cfuncdesc} + +% --- Plain Py_UNICODE --------------------------------------------------- + +To create Unicode objects and access their basic sequence properties, +use these APIs: + +\begin{cfuncdesc}{PyObject*}{PyUnicode_FromUnicode}{const Py_UNICODE *u, + int size} + +Create a Unicode Object from the \ctype{Py_UNICODE} buffer \var{u} of the +given size. \var{u} may be \NULL{} which causes the contents to be +undefined. It is the user's responsibility to fill in the needed data. +The buffer is copied into the new object. +\end{cfuncdesc} + +\begin{cfuncdesc}{Py_UNICODE *}{PyUnicode_AsUnicode}{PyObject *unicode} +Return a read-only pointer to the Unicode object's internal +\ctype{Py_UNICODE} buffer. +\end{cfuncdesc} + +\begin{cfuncdesc}{int}{PyUnicode_GetSize}{PyObject *unicode} +Return the length of the Unicode object. +\end{cfuncdesc} + +\begin{cfuncdesc}{PyObject*}{PyUnicode_FromObject}{PyObject *obj} + +Coerce \var{obj} to a Unicode object and return a reference with +incremented refcount. 
+ +Coercion is done in the following way: +\begin{enumerate} +\item Unicode objects are passed back as-is with incremented + refcount. + +\item String and other char buffer compatible objects are decoded + under the assumption that they contain UTF-8 data. Decoding + is done in ``strict'' mode. + +\item All other objects raise an exception. +\end{enumerate} +The API returns NULL in case of an error. The caller is responsible +for decref'ing the returned object. +\end{cfuncdesc} + +% --- wchar_t support for platforms which support it --------------------- + +If the platform supports \ctype{wchar_t} and provides a header file +wchar.h, Python can interface directly to this type using the +following functions. Support is optimized if Python's own +\ctype{Py_UNICODE} type is identical to the system's \ctype{wchar_t}. + +\begin{cfuncdesc}{PyObject*}{PyUnicode_FromWideChar}{const wchar_t *w, + int size} +Create a Unicode Object from the \ctype{wchar_t} buffer \var{w} of the +given size. Returns \NULL{} on failure. +\end{cfuncdesc} + +\begin{cfuncdesc}{int}{PyUnicode_AsWideChar}{PyUnicodeObject *unicode, + wchar_t *w, + int size} + +Copies the Unicode Object contents into the \ctype{wchar_t} buffer +\var{w}. At most \var{size} \ctype{wchar_t} characters are copied. +Returns the number of \ctype{wchar_t} characters copied or -1 in case +of an error. +\end{cfuncdesc} + + +\subsubsection{Builtin Codecs \label{builtinCodecs}} + +Python provides a set of builtin codecs which are written in C +for speed. All of these codecs are directly usable via the +following functions. + +Many of the following APIs take two arguments encoding and +errors. These parameters encoding and errors have the same semantics +as the ones of the builtin unicode() Unicode object constructor. + +Setting encoding to NULL causes the default encoding to be used which +is UTF-8. + +Error handling is set by errors which may also be set to NULL meaning +to use the default handling defined for the codec. 
Default error +handling for all builtin codecs is ``strict'' (ValueErrors are raised). + +The codecs all use a similar interface. Only deviation from the +following generic ones are documented for simplicity. + +% --- Generic Codecs ----------------------------------------------------- + +These are the generic codec APIs: + +\begin{cfuncdesc}{PyObject*}{PyUnicode_Decode}{const char *s, + int size, + const char *encoding, + const char *errors} + +Create a Unicode object by decoding \var{size} bytes of the encoded +string \var{s}. \var{encoding} and \var{errors} have the same meaning +as the parameters of the same name in the unicode() builtin +function. The codec to be used is looked up using the Python codec +registry. Returns \NULL{} in case an exception was raised by the +codec. +\end{cfuncdesc} + +\begin{cfuncdesc}{PyObject*}{PyUnicode_Encode}{const Py_UNICODE *s, + int size, + const char *encoding, + const char *errors} + +Encodes the \ctype{Py_UNICODE} buffer of the given size and returns a +Python string object. \var{encoding} and \var{errors} have the same +meaning as the parameters of the same name in the Unicode .encode() +method. The codec to be used is looked up using the Python codec +registry. Returns \NULL{} in case an exception was raised by the +codec. +\end{cfuncdesc} + +\begin{cfuncdesc}{PyObject*}{PyUnicode_AsEncodedString}{PyObject *unicode, + const char *encoding, + const char *errors} + +Encodes a Unicode object and returns the result as Python string +object. \var{encoding} and \var{errors} have the same meaning as the +parameters of the same name in the Unicode .encode() method. The codec +to be used is looked up using the Python codec registry. Returns +\NULL{} in case an exception was raised by the codec. 
+\end{cfuncdesc} + +% --- UTF-8 Codecs ------------------------------------------------------- + +These are the UTF-8 codec APIs: + +\begin{cfuncdesc}{PyObject*}{PyUnicode_DecodeUTF8}{const char *s, + int size, + const char *errors} + +Creates a Unicode object by decoding \var{size} bytes of the UTF-8 +encoded string \var{s}. Returns \NULL{} in case an exception was +raised by the codec. +\end{cfuncdesc} + +\begin{cfuncdesc}{PyObject*}{PyUnicode_EncodeUTF8}{const Py_UNICODE *s, + int size, + const char *errors} + +Encodes the \ctype{Py_UNICODE} buffer of the given size using UTF-8 +and returns a Python string object. Returns \NULL{} in case an +exception was raised by the codec. +\end{cfuncdesc} + +\begin{cfuncdesc}{PyObject*}{PyUnicode_AsUTF8String}{PyObject *unicode} + +Encodes a Unicode object using UTF-8 and returns the result as a Python +string object. Error handling is ``strict''. Returns +\NULL{} in case an exception was raised by the codec. +\end{cfuncdesc} + +% --- UTF-16 Codecs ------------------------------------------------------ + +These are the UTF-16 codec APIs: + +\begin{cfuncdesc}{PyObject*}{PyUnicode_DecodeUTF16}{const char *s, + int size, + const char *errors, + int *byteorder} + +Decodes \var{size} bytes from a UTF-16 encoded buffer string and +returns the corresponding Unicode object. + +\var{errors} (if non-NULL) defines the error handling. It defaults +to ``strict''. + +If \var{byteorder} is non-\NULL{}, the decoder starts decoding using +the given byte order: + +\begin{verbatim} + *byteorder == -1: little endian + *byteorder == 0: native order + *byteorder == 1: big endian +\end{verbatim} + +and then switches according to all byte order marks (BOM) it finds in +the input data. BOM marks are not copied into the resulting Unicode +string. After completion, \var{*byteorder} is set to the current byte +order at the end of input data. + +If \var{byteorder} is \NULL{}, the codec starts in native order mode. 
+ +Returns \NULL{} in case an exception was raised by the codec. +\end{cfuncdesc} + +\begin{cfuncdesc}{PyObject*}{PyUnicode_EncodeUTF16}{const Py_UNICODE *s, + int size, + const char *errors, + int byteorder} + +Returns a Python string object holding the UTF-16 encoded value of the +Unicode data in \var{s}. + +If \var{byteorder} is not 0, output is written according to the +following byte order: + +\begin{verbatim} + byteorder == -1: little endian + byteorder == 0: native byte order (writes a BOM mark) + byteorder == 1: big endian +\end{verbatim} + +If \var{byteorder} is 0, the output string will always start with the +Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is +prepended. + +Note that \ctype{Py_UNICODE} data is being interpreted as UTF-16 +reduced to UCS-2. This trick makes it possible to add full UTF-16 +capabilities at a later point without compromising the APIs. + +Returns \NULL{} in case an exception was raised by the codec. +\end{cfuncdesc} + +\begin{cfuncdesc}{PyObject*}{PyUnicode_AsUTF16String}{PyObject *unicode} + +Returns a Python string using the UTF-16 encoding in native byte +order. The string always starts with a BOM mark. Error handling is +``strict''. Returns \NULL{} in case an exception was raised by the +codec. +\end{cfuncdesc} + +% --- Unicode-Escape Codecs ---------------------------------------------- + +These are the ``Unicode Escape'' codec APIs: + +\begin{cfuncdesc}{PyObject*}{PyUnicode_DecodeUnicodeEscape}{const char *s, + int size, + const char *errors} + +Creates a Unicode object by decoding \var{size} bytes of the Unicode-Escape +encoded string \var{s}. Returns \NULL{} in case an exception was +raised by the codec. +\end{cfuncdesc} + +\begin{cfuncdesc}{PyObject*}{PyUnicode_EncodeUnicodeEscape}{const Py_UNICODE *s, + int size, + const char *errors} + +Encodes the \ctype{Py_UNICODE} buffer of the given size using Unicode-Escape +and returns a Python string object. 
Returns \NULL{} in case an +exception was raised by the codec. +\end{cfuncdesc} + +\begin{cfuncdesc}{PyObject*}{PyUnicode_AsUnicodeEscapeString}{PyObject *unicode} + +Encodes a Unicode object using Unicode-Escape and returns the result +as a Python string object. Error handling is ``strict''. Returns +\NULL{} in case an exception was raised by the codec. +\end{cfuncdesc} + +% --- Raw-Unicode-Escape Codecs ------------------------------------------ + +These are the ``Raw Unicode Escape'' codec APIs: + +\begin{cfuncdesc}{PyObject*}{PyUnicode_DecodeRawUnicodeEscape}{const char *s, + int size, + const char *errors} + +Creates a Unicode object by decoding \var{size} bytes of the Raw-Unicode-Escape +encoded string \var{s}. Returns \NULL{} in case an exception was +raised by the codec. +\end{cfuncdesc} + +\begin{cfuncdesc}{PyObject*}{PyUnicode_EncodeRawUnicodeEscape}{const Py_UNICODE *s, + int size, + const char *errors} + +Encodes the \ctype{Py_UNICODE} buffer of the given size using Raw-Unicode-Escape +and returns a Python string object. Returns \NULL{} in case an +exception was raised by the codec. +\end{cfuncdesc} + +\begin{cfuncdesc}{PyObject*}{PyUnicode_AsRawUnicodeEscapeString}{PyObject *unicode} + +Encodes a Unicode object using Raw-Unicode-Escape and returns the result +as a Python string object. Error handling is ``strict''. Returns +\NULL{} in case an exception was raised by the codec. +\end{cfuncdesc} + +% --- Latin-1 Codecs ----------------------------------------------------- + +These are the Latin-1 codec APIs: + +Latin-1 corresponds to the first 256 Unicode ordinals and only these +are accepted by the codecs during encoding. + +\begin{cfuncdesc}{PyObject*}{PyUnicode_DecodeLatin1}{const char *s, + int size, + const char *errors} + +Creates a Unicode object by decoding \var{size} bytes of the Latin-1 +encoded string \var{s}. Returns \NULL{} in case an exception was +raised by the codec. 
+\end{cfuncdesc} + +\begin{cfuncdesc}{PyObject*}{PyUnicode_EncodeLatin1}{const Py_UNICODE *s, + int size, + const char *errors} + +Encodes the \ctype{Py_UNICODE} buffer of the given size using Latin-1 +and returns a Python string object. Returns \NULL{} in case an +exception was raised by the codec. +\end{cfuncdesc} + +\begin{cfuncdesc}{PyObject*}{PyUnicode_AsLatin1String}{PyObject *unicode} + +Encodes a Unicode object using Latin-1 and returns the result as a +Python string object. Error handling is ``strict''. Returns +\NULL{} in case an exception was raised by the codec. +\end{cfuncdesc} + +% --- ASCII Codecs ------------------------------------------------------- + +These are the ASCII codec APIs: + +Only 7-bit ASCII data is accepted. All other codes generate errors. + +\begin{cfuncdesc}{PyObject*}{PyUnicode_DecodeASCII}{const char *s, + int size, + const char *errors} + +Creates a Unicode object by decoding \var{size} bytes of the ASCII +encoded string \var{s}. Returns \NULL{} in case an exception was +raised by the codec. +\end{cfuncdesc} + +\begin{cfuncdesc}{PyObject*}{PyUnicode_EncodeASCII}{const Py_UNICODE *s, + int size, + const char *errors} + +Encodes the \ctype{Py_UNICODE} buffer of the given size using ASCII +and returns a Python string object. Returns \NULL{} in case an +exception was raised by the codec. +\end{cfuncdesc} + +\begin{cfuncdesc}{PyObject*}{PyUnicode_AsASCIIString}{PyObject *unicode} + +Encodes a Unicode object using ASCII and returns the result as a Python +string object. Error handling is ``strict''. Returns +\NULL{} in case an exception was raised by the codec. +\end{cfuncdesc} + +% --- Character Map Codecs ----------------------------------------------- + +These are the mapping codec APIs: + +This codec is special in that it can be used to implement many +different codecs (and this is in fact what was done to obtain most of +the standard codecs included in the \module{encodings} package). 
The +codec uses mappings to encode and decode characters. + +Decoding mappings must map single string characters to single Unicode +characters, integers (which are then interpreted as Unicode ordinals) +or None (meaning ``undefined mapping'' and causing an error). + +Encoding mappings must map single Unicode characters to single string +characters, integers (which are then interpreted as Latin-1 ordinals) +or None (meaning ``undefined mapping'' and causing an error). + +The mapping objects provided must only support the __getitem__ mapping +interface. + +If a character lookup fails with a LookupError, the character is +copied as-is meaning that its ordinal value will be interpreted as +Unicode or Latin-1 ordinal respectively. Because of this, mappings only need +to contain those mappings which map characters to different code +points. + +\begin{cfuncdesc}{PyObject*}{PyUnicode_DecodeCharmap}{const char *s, + int size, + PyObject *mapping, + const char *errors} + +Creates a Unicode object by decoding \var{size} bytes of the encoded +string \var{s} using the given \var{mapping} object. Returns \NULL{} +in case an exception was raised by the codec. +\end{cfuncdesc} + +\begin{cfuncdesc}{PyObject*}{PyUnicode_EncodeCharmap}{const Py_UNICODE *s, + int size, + PyObject *mapping, + const char *errors} + +Encodes the \ctype{Py_UNICODE} buffer of the given size using the +given \var{mapping} object and returns a Python string object. +Returns \NULL{} in case an exception was raised by the codec. +\end{cfuncdesc} + +\begin{cfuncdesc}{PyObject*}{PyUnicode_AsCharmapString}{PyObject *unicode, + PyObject *mapping} + +Encodes a Unicode object using the given \var{mapping} object and +returns the result as a Python string object. Error handling is +``strict''. Returns \NULL{} in case an exception was raised by the +codec. +\end{cfuncdesc} + +The following codec API is special in that it maps Unicode to Unicode. 
+ +\begin{cfuncdesc}{PyObject*}{PyUnicode_TranslateCharmap}{const Py_UNICODE *s, + int size, + PyObject *table, + const char *errors} + +Translates a \ctype{Py_UNICODE} buffer of the given length by applying +a character mapping \var{table} to it and returns the resulting +Unicode object. + +The mapping \var{table} must map Unicode ordinal integers to Unicode +ordinal integers or None (causing deletion of the character). + +Mapping tables must only provide the __getitem__ interface, +e.g. dictionaries or sequences. Unmapped character ordinals (ones +which cause a LookupError) are left untouched and are copied as-is. + +Returns \NULL{} in case an exception was raised by the codec. +\end{cfuncdesc} + +% --- MBCS codecs for Windows -------------------------------------------- + +These are the MBCS codec APIs. They are currently only available on +Windows and use the Win32 MBCS converters to implement the +conversions. + +Note that MBCS (or DBCS) is a class of encodings, not just one. The +target encoding is defined by the user settings on the machine running +the codec. + +\begin{cfuncdesc}{PyObject*}{PyUnicode_DecodeMBCS}{const char *s, + int size, + const char *errors} + +Creates a Unicode object by decoding \var{size} bytes of the MBCS +encoded string \var{s}. Returns \NULL{} in case an exception was +raised by the codec. +\end{cfuncdesc} + +\begin{cfuncdesc}{PyObject*}{PyUnicode_EncodeMBCS}{const Py_UNICODE *s, + int size, + const char *errors} + +Encodes the \ctype{Py_UNICODE} buffer of the given size using MBCS +and returns a Python string object. Returns \NULL{} in case an +exception was raised by the codec. +\end{cfuncdesc} + +\begin{cfuncdesc}{PyObject*}{PyUnicode_AsMBCSString}{PyObject *unicode} + +Encodes a Unicode object using MBCS and returns the result as a Python +string object. Error handling is ``strict''. Returns +\NULL{} in case an exception was raised by the codec. 
+\end{cfuncdesc} + +% --- Methods & Slots ---------------------------------------------------- + +\subsubsection{Methods and Slot Functions \label{unicodeMethodsAndSlots}} + +The following APIs are capable of handling Unicode objects and strings +on input (we refer to them as strings in the descriptions) and return +Unicode objects or integers as appropriate. + +They all return \NULL{} or -1 in case an exception occurs. + +\begin{cfuncdesc}{PyObject*}{PyUnicode_Concat}{PyObject *left, + PyObject *right} + +Concatenate two strings giving a new Unicode string. +\end{cfuncdesc} + +\begin{cfuncdesc}{PyObject*}{PyUnicode_Split}{PyObject *s, + PyObject *sep, + int maxsplit} + +Split a string giving a list of Unicode strings. + +If \var{sep} is \NULL{}, splitting will be done at all whitespace +substrings. Otherwise, splits occur at the given separator. + +At most \var{maxsplit} splits will be done. If negative, no limit is set. + +Separators are not included in the resulting list. +\end{cfuncdesc} + +\begin{cfuncdesc}{PyObject*}{PyUnicode_Splitlines}{PyObject *s, + int maxsplit} + +Split a string at line breaks, giving a list of Unicode strings. + +CRLF is considered to be one line break. Line breaks are not +included in the resulting list. +\end{cfuncdesc} + +\begin{cfuncdesc}{PyObject*}{PyUnicode_Translate}{PyObject *str, + PyObject *table, + const char *errors} + +Translate a string by applying a character mapping table to it and +return the resulting Unicode object. + +The mapping table must map Unicode ordinal integers to Unicode ordinal +integers or None (causing deletion of the character). + +Mapping tables must only provide the __getitem__ interface, +e.g. dictionaries or sequences. Unmapped character ordinals (ones +which cause a LookupError) are left untouched and are copied as-is. + +\var{errors} has the usual meaning for codecs. It may be \NULL{} +which indicates to use the default error handling. 
+ +\end{cfuncdesc} + +\begin{cfuncdesc}{PyObject*}{PyUnicode_Join}{PyObject *separator, + PyObject *seq} + +Join a sequence of strings using the given separator and return +the resulting Unicode string. +\end{cfuncdesc} + +\begin{cfuncdesc}{int}{PyUnicode_Tailmatch}{PyObject *str, + PyObject *substr, + int start, + int end, + int direction} + +Return 1 if \var{substr} matches \var{str}[\var{start}:\var{end}] at +the given tail end (\var{direction} == -1 means to do a prefix match, +\var{direction} == 1 a suffix match), 0 otherwise. +\end{cfuncdesc} + +\begin{cfuncdesc}{int}{PyUnicode_Find}{PyObject *str, + PyObject *substr, + int start, + int end, + int direction} + +Return the first position of \var{substr} in +\var{str}[\var{start}:\var{end}] using the given \var{direction} +(\var{direction} == 1 means to do a forward search, +\var{direction} == -1 a backward search), or -1 if no match is found. +\end{cfuncdesc} + +\begin{cfuncdesc}{int}{PyUnicode_Count}{PyObject *str, + PyObject *substr, + int start, + int end} + +Count the number of occurrences of \var{substr} in +\var{str}[\var{start}:\var{end}]. +\end{cfuncdesc} + +\begin{cfuncdesc}{PyObject*}{PyUnicode_Replace}{PyObject *str, + PyObject *substr, + PyObject *replstr, + int maxcount} + +Replace at most \var{maxcount} occurrences of \var{substr} in +\var{str} with \var{replstr} and return the resulting Unicode object. +\var{maxcount} == -1 means: replace all occurrences. +\end{cfuncdesc} + +\begin{cfuncdesc}{int}{PyUnicode_Compare}{PyObject *left, + PyObject *right} + +Compare two strings and return -1, 0, 1 for less than, equal, +greater than respectively. +\end{cfuncdesc} + +\begin{cfuncdesc}{PyObject*}{PyUnicode_Format}{PyObject *format, + PyObject *args} +Returns a new string object from \var{format} and \var{args}. Analogous +to \code{\var{format} \% \var{args}}. The \var{args} argument must be +a tuple. 
+\end{cfuncdesc} + +\begin{cfuncdesc}{int}{PyUnicode_Contains}{PyObject *container, + PyObject *element} + +Checks whether \var{element} is contained in \var{container} and +returns 1/0 accordingly. + +\var{element} has to coerce to a one-element Unicode string. -1 is +returned in case of an error. +\end{cfuncdesc} + + \subsection{Buffer Objects \label{bufferObjects}} \sectionauthor{Greg Stein}{gstein@lyra.org} |