"""Python part of the warnings subsystem."""

# Note: function level imports should *not* be used
# in this module as it may cause import lock deadlock.
# See bug 683658.
import linecache
import sys

__all__ = ["warn", "showwarning", "formatwarning", "filterwarnings",
           "resetwarnings", "catch_warnings"]


def showwarning(message, category, filename, lineno, file=None, line=None):
    """Hook to write a warning to a file; replace if you like."""
    if file is None:
        file = sys.stderr
    try:
        file.write(formatwarning(message, category, filename, lineno, line))
    except IOError:
        pass # the file (probably stderr) is invalid - this warning gets lost.

def formatwarning(message, category, filename, lineno, line=None):
    """Function to format a warning the standard way."""
    s =  "%s:%s: %s: %s\n" % (filename, lineno, category.__name__, message)
    line = linecache.getline(filename, lineno) if line is None else line
    if line:
        line = line.strip()
        s += "  %s\n" % line
    return s

def filterwarnings(action, message="", category=Warning, module="", lineno=0,
                   append=False):
    """Insert an entry into the list of warnings filters (at the front).

    'action' -- one of "error", "ignore", "always", "default", "module",
                or "once"
    'message' -- a regex that the warning message must match
    'category' -- a class that the warning must be a subclass of
    'module' -- a regex that the module name must match
    'lineno' -- an integer line number, 0 matches all warnings
    'append' -- if true, append to the list of filters
    """
    import re
    assert action in ("error", "ignore", "always", "default", "module",
                      "once"), "invalid action: %r" % (action,)
    assert isinstance(message, str), "message must be a string"
    assert isinstance(category, type), "category must be a class"
    assert issubclass(category, Warning), "category must be a Warning subclass"
    assert isinstance(module, str), "module must be a string"
    assert isinstance(lineno, int) and lineno >= 0, \
           "lineno must be an int >= 0"
    item = (action, re.compile(message, re.I), category,
            re.compile(module), lineno)
    if append:
        filters.append(item)
    else:
        filters.insert(0, item)

def simplefilter(action, category=Warning, lineno=0, append=False):
    """Insert a simple entry into the list of warnings filters (at the front).

    A simple filter matches all modules and messages.
    'action' -- one of "error", "ignore", "always", "default", "module",
                or "once"
    'category' -- a class that the warning must be a subclass of
    'lineno' -- an integer line number, 0 matches all warnings
    'append' -- if true, append to the list of filters
    """
    assert action in ("error", "ignore", "always", "default", "module",
                      "once"), "invalid action: %r" % (action,)
    assert isinstance(lineno, int) and lineno >= 0, \
           "lineno must be an int >= 0"
    item = (action, None, category, None, lineno)
    if append:
        filters.append(item)
    else:
        filters.insert(0, item)

def resetwarnings():
    """Clear the list of warning filters, so that no filters are active."""
    filters[:] = []

class _OptionError(Exception):
    """Exception used by option processing helpers."""
    pass

# Helper to process -W options passed via sys.warnoptions
def _processoptions(args):
    for arg in args:
        try:
            _setoption(arg)
        except _OptionError as msg:
            print("Invalid -W option ignored:", msg, file=sys.stderr)

# Helper for _processoptions()
def _setoption(arg):
    import re
    parts = arg.split(':')
    if len(parts) > 5:
        raise _OptionError("too many fields (max 5): %r" % (arg,))
    while len(parts) < 5:
        parts.append('')
    action, message, category, module, lineno = [s.strip()
                                                 for s in parts]
    action = _getaction(action)
    message = re.escape(message)
    category = _getcategory(category)
    module = re.escape(module)
    if module:
        module = module + '$'
    if lineno:
        try:
            lineno = int(lineno)
            if lineno < 0:
                raise ValueError
        except (ValueError, OverflowError):
            raise _OptionError("invalid lineno %r" % (lineno,))
    else:
        lineno = 0
    filterwarnings(action, message, category, module, lineno)

# Helper for _setoption()
def _getaction(action):
    if not action:
        return "default"
    if action == "all": return "always" # Alias
    for a in ('default', 'always', 'ignore', 'module', 'once', 'error'):
        if a.startswith(action):
            return a
    raise _OptionError("invalid action: %r" % (action,))

# Helper for _setoption()
def _getcategory(category):
    import re
    if not category:
        return Warning
    if re.match("^[a-zA-Z0-9_]+$", category):
        try:
            cat = eval(category)
        except NameError:
            raise _OptionError("unknown warning category: %r" % (category,))
    else:
        i = category.rfind(".")
        module = category[:i]
        klass = category[i+1:]
        try:
            m = __import__(module, None, None, [klass])
        except ImportError:
            raise _OptionError("invalid module name: %r" % (module,))
        try:
            cat = getattr(m, klass)
        except AttributeError:
            raise _OptionError("unknown warning category: %r" % (category,))
    if not issubclass(cat, Warning):
        raise _OptionError("invalid warning category: %r" % (category,))
    return cat


# Code typically replaced by _warnings
def warn(message, category=None, stacklevel=1):
    """Issue a warning, or maybe ignore it or raise an exception."""
    # Check if message is already a Warning object
    if isinstance(message, Warning):
        category = message.__class__
    # Check category argument
    if category is None:
        category = UserWarning
    assert issubclass(category, Warning)
    # Get context information
    try:
        caller = sys._getframe(stacklevel)
    except ValueError:
        globals = sys.__dict__
        lineno = 1
    else:
        globals = caller.f_globals
        lineno = caller.f_lineno
    if '__name__' in globals:
        module = globals['__name__']
    else:
        module = "<string>"
    filename = globals.get('__file__')
    if filename:
        fnl = filename.lower()
        if fnl.endswith((".pyc", ".pyo")):
            filename = filename[:-1]
    else:
        if module == "__main__":
            try:
                filename = sys.argv[0]
            except AttributeError:
                # embedded interpreters don't have sys.argv, see bug #839151
                filename = '__main__'
        if not filename:
            filename = module
    registry = globals.setdefault("__warningregistry__", {})
    warn_explicit(message, category, filename, lineno, module, registry,
                  globals)

def warn_explicit(message, category, filename, lineno,
                  module=None, registry=None, module_globals=None):
    lineno = int(lineno)
    if module is None:
        module = filename or "<unknown>"
        if module[-3:].lower() == ".py":
            module = module[:-3] # XXX What about leading pathname?
    if registry is None:
        registry = {}
    if isinstance(message, Warning):
        text = str(message)
        category = message.__class__
    else:
        text = message
        message = category(message)
    key = (text, category, lineno)
    # Quick test for common case
    if registry.get(key):
        return
    # Search the filters
    for item in filters:
        action, msg, cat, mod, ln = item
        if ((msg is None or msg.match(text)) and
            issubclass(category, cat) and
            (mod is None or mod.match(module)) and
            (ln == 0 or lineno == ln)):
            break
    else:
        action = defaultaction
    # Early exit actions
    if action == "ignore":
        registry[key] = 1
        return

    # Prime the linecache for formatting, in case the
    # "file" is actually in a zipfile or something.
    linecache.getlines(filename, module_globals)

    if action == "error":
        raise message
    # Other actions
    if action == "once":
        registry[key] = 1
        oncekey = (text, category)
        if onceregistry.get(oncekey):
            return
        onceregistry[oncekey] = 1
    elif action == "always":
        pass
    elif action == "module":
        registry[key] = 1
        altkey = (text, category, 0)
        if registry.get(altkey):
            return
        registry[altkey] = 1
    elif action == "default":
        registry[key] = 1
    else:
        # Unrecognized actions are errors
        raise RuntimeError(
              "Unrecognized action (%r) in warnings.filters:\n %s" %
              (action, item))
    if not hasattr(showwarning, "__call__"):
        raise TypeError("warnings.showwarning() must be set to a "
                        "function or method")
    # Print message and context
    showwarning(message, category, filename, lineno)


class WarningMessage(object):

    """Holds the result of a single showwarning() call."""

    _WARNING_DETAILS = ("message", "category", "filename", "lineno", "file",
                        "line")

    def __init__(self, message, category, filename, lineno, file=None,
                    line=None):
        local_values = locals()
        for attr in self._WARNING_DETAILS:
            setattr(self, attr, local_values[attr])
        self._category_name = category.__name__ if category else None

    def __str__(self):
        return ("{message : %r, category : %r, filename : %r, lineno : %s, "
                    "line : %r}" % (self.message, self._category_name,
                                    self.filename, self.lineno, self.line))


class catch_warnings(object):

    """A context manager that copies and restores the warnings filter upon
    exiting the context.

    The 'record' argument specifies whether warnings should be captured by a
    custom implementation of warnings.showwarning() and be appended to a list
    returned by the context manager. Otherwise None is returned by the context
    manager. The objects appended to the list are arguments whose attributes
    mirror the arguments to showwarning().

    The 'module' argument is to specify an alternative module to the module
    named 'warnings' and imported under that name. This argument is only useful
    when testing the warnings module itself.

    """

    def __init__(self, *, record=False, module=None):
        """Specify whether to record warnings and if an alternative module
        should be used other than sys.modules['warnings'].

        For compatibility with Python 3.0, please consider all arguments to be
        keyword-only.

        """
        self._record = record
        self._module = sys.modules['warnings'] if module is None else module
        self._entered = False

    def __repr__(self):
        args = []
        if self._record:
            args.append("record=True")
        if self._module is not sys.modules['warnings']:
            args.append("module=%r" % self._module)
        name = type(self).__name__
        return "%s(%s)" % (name, ", ".join(args))

    def __enter__(self):
        if self._entered:
            raise RuntimeError("Cannot enter %r twice" % self)
        self._entered = True
        self._filters = self._module.filters
        self._module.filters = self._filters[:]
        self._showwarning = self._module.showwarning
        if self._record:
            log = []
            def showwarning(*args, **kwargs):
                log.append(WarningMessage(*args, **kwargs))
            self._module.showwarning = showwarning
            return log
        else:
            return None

    def __exit__(self, *exc_info):
        if not self._entered:
            raise RuntimeError("Cannot exit %r without entering first" % self)
        self._module.filters = self._filters
        self._module.showwarning = self._showwarning


# filters contains a sequence of filter 5-tuples
# The components of the 5-tuple are:
# - an action: error, ignore, always, default, module, or once
# - a compiled regex that must match the warning message
# - a class representing the warning category
# - a compiled regex that must match the module that is being warned
# - a line number for the line being warned, or 0 to mean any line
# If either of the compiled regexes is None, it matches anything.
_warnings_defaults = False
try:
    from _warnings import (filters, _defaultaction, _onceregistry,
                            warn, warn_explicit)
    defaultaction = _defaultaction
    onceregistry = _onceregistry
    _warnings_defaults = True
except ImportError:
    filters = []
    defaultaction = "default"
    onceregistry = {}


# Module initialization
_processoptions(sys.warnoptions)
if not _warnings_defaults:
    silence = [ImportWarning, PendingDeprecationWarning]
    silence.append(DeprecationWarning)
    for cls in silence:
        simplefilter("ignore", category=cls)
    bytes_warning = sys.flags.bytes_warning
    if bytes_warning > 1:
        bytes_action = "error"
    elif bytes_warning:
        bytes_action = "default"
    else:
        bytes_action = "ignore"
    simplefilter(bytes_action, category=BytesWarning, append=1)
    # resource usage warnings are enabled by default in pydebug mode
    if hasattr(sys, 'gettotalrefcount'):
        resource_action = "always"
    else:
        resource_action = "ignore"
    simplefilter(resource_action, category=ResourceWarning, append=1)

del _warnings_defaults
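
# A usage sketch (not part of the module above): how the public API defined
# here is typically driven from user code.
#
#   import warnings
#
#   with warnings.catch_warnings(record=True) as log:
#       warnings.simplefilter("always")        # show every warning, even duplicates
#       warnings.warn("something looks off")   # category defaults to UserWarning
#   assert issubclass(log[0].category, UserWarning)
#
#   # Outside the context manager the previous filters are restored;
#   # -W-style filters can also be installed programmatically:
#   warnings.filterwarnings("error", category=DeprecationWarning)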
   * ``'replace'`` Replace with a suitable replacement character

   * ``'xmlcharrefreplace'`` Replace with the appropriate XML character
     reference

   * ``'backslashreplace'`` Replace with backslashed escape sequences.

   The *errors* argument will be assigned to an attribute of the same name.
   Assigning to this attribute makes it possible to switch between different
   error handling strategies during the lifetime of the
   :class:`IncrementalEncoder` object.

   The set of allowed values for the *errors* argument can be extended with
   :func:`register_error`.


.. method:: encode(object[, final])

   Encodes *object* (taking the current state of the encoder into account)
   and returns the resulting encoded object.  If this is the last call to
   :meth:`encode` *final* must be true (the default is false).


.. method:: reset()

   Reset the encoder to the initial state.


.. method:: IncrementalEncoder.getstate()

   Return the current state of the encoder which must be an integer.  The
   implementation should make sure that ``0`` is the most common state.
   (States that are more complicated than integers can be converted into an
   integer by marshaling/pickling the state and encoding the bytes of the
   resulting string into an integer).


.. method:: IncrementalEncoder.setstate(state)

   Set the state of the encoder to *state*.  *state* must be an encoder state
   returned by :meth:`getstate`.


.. _incremental-decoder-objects:

IncrementalDecoder Objects
^^^^^^^^^^^^^^^^^^^^^^^^^^

The :class:`IncrementalDecoder` class is used for decoding an input in
multiple steps.  It defines the following methods which every incremental
decoder must define in order to be compatible with the Python codec registry.


.. class:: IncrementalDecoder([errors])

   Constructor for an :class:`IncrementalDecoder` instance.

   All incremental decoders must provide this constructor interface.  They are
   free to add additional keyword arguments, but only the ones defined here
   are used by the Python codec registry.

   The :class:`IncrementalDecoder` may implement different error handling
   schemes by providing the *errors* keyword argument.  These parameters are
   predefined:

   * ``'strict'`` Raise :exc:`ValueError` (or a subclass); this is the default.

   * ``'ignore'`` Ignore the character and continue with the next.

   * ``'replace'`` Replace with a suitable replacement character.

   The *errors* argument will be assigned to an attribute of the same name.
   Assigning to this attribute makes it possible to switch between different
   error handling strategies during the lifetime of the
   :class:`IncrementalDecoder` object.

   The set of allowed values for the *errors* argument can be extended with
   :func:`register_error`.


.. method:: decode(object[, final])

   Decodes *object* (taking the current state of the decoder into account)
   and returns the resulting decoded object.  If this is the last call to
   :meth:`decode` *final* must be true (the default is false).  If *final* is
   true the decoder must decode the input completely and must flush all
   buffers.  If this isn't possible (e.g. because of incomplete byte sequences
   at the end of the input) it must initiate error handling just like in the
   stateless case (which might raise an exception).


.. method:: reset()

   Reset the decoder to the initial state.


.. method:: getstate()

   Return the current state of the decoder.  This must be a tuple with two
   items, the first must be the buffer containing the still undecoded input.
   The second must be an integer and can be additional state info.  (The
   implementation should make sure that ``0`` is the most common additional
   state info.)  If this additional state info is ``0`` it must be possible to
   set the decoder to the state which has no input buffered and ``0`` as the
   additional state info, so that feeding the previously buffered input to the
   decoder returns it to the previous state without producing any output.
   (Additional state info that is more complicated than integers can be
   converted into an integer by marshaling/pickling the info and encoding the
   bytes of the resulting string into an integer.)


.. method:: setstate(state)

   Set the state of the decoder to *state*.  *state* must be a decoder state
   returned by :meth:`getstate`.
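
A minimal sketch of how an incremental decoder of this kind is typically
driven, assuming the standard ``'utf-8'`` codec obtained through
:func:`getincrementaldecoder`; a multi-byte sequence that is split across
chunks is only returned once it is complete::

   import codecs

   decoder = codecs.getincrementaldecoder('utf-8')(errors='strict')
   chunks = [b'Stra\xc3', b'\x9fe']        # U+00DF (0xc3 0x9f) is split across the chunks
   parts = []
   for i, chunk in enumerate(chunks):
       final = (i == len(chunks) - 1)      # tell the decoder when the input ends
       parts.append(decoder.decode(chunk, final))
   assert ''.join(parts) == 'Straße'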
The :class:`StreamWriter` and :class:`StreamReader` classes provide generic
working interfaces which can be used to implement new encoding submodules very
easily.  See :mod:`encodings.utf_8` for an example of how this is done.


.. _stream-writer-objects:

StreamWriter Objects
^^^^^^^^^^^^^^^^^^^^

The :class:`StreamWriter` class is a subclass of :class:`Codec` and defines
the following methods which every stream writer must define in order to be
compatible with the Python codec registry.

.. deprecated:: 3.3
   Use the built-in :class:`io.TextIOWrapper` class.


.. class:: StreamWriter(stream[, errors])

   Constructor for a :class:`StreamWriter` instance.

   All stream writers must provide this constructor interface.  They are free
   to add additional keyword arguments, but only the ones defined here are
   used by the Python codec registry.

   *stream* must be a file-like object open for writing binary data.

   The :class:`StreamWriter` may implement different error handling schemes by
   providing the *errors* keyword argument.  These parameters are predefined:

   * ``'strict'`` Raise :exc:`ValueError` (or a subclass); this is the default.

   * ``'ignore'`` Ignore the character and continue with the next.

   * ``'replace'`` Replace with a suitable replacement character

   * ``'xmlcharrefreplace'`` Replace with the appropriate XML character
     reference

   * ``'backslashreplace'`` Replace with backslashed escape sequences.

   The *errors* argument will be assigned to an attribute of the same name.
   Assigning to this attribute makes it possible to switch between different
   error handling strategies during the lifetime of the :class:`StreamWriter`
   object.

   The set of allowed values for the *errors* argument can be extended with
   :func:`register_error`.


.. method:: write(object)

   Writes the object's contents encoded to the stream.


.. method:: writelines(list)

   Writes the concatenated list of strings to the stream (possibly by reusing
   the :meth:`write` method).


.. method:: reset()

   Flushes and resets the codec buffers used for keeping state.

   Calling this method should ensure that the data on the output is put into
   a clean state that allows appending of new fresh data without having to
   rescan the whole stream to recover state.


In addition to the above methods, the :class:`StreamWriter` must also inherit
all other methods and attributes from the underlying stream.


.. _stream-reader-objects:

StreamReader Objects
^^^^^^^^^^^^^^^^^^^^

The :class:`StreamReader` class is a subclass of :class:`Codec` and defines
the following methods which every stream reader must define in order to be
compatible with the Python codec registry.

.. deprecated:: 3.3
   Use the built-in :class:`io.TextIOWrapper` class.


.. class:: StreamReader(stream[, errors])

   Constructor for a :class:`StreamReader` instance.

   All stream readers must provide this constructor interface.
   They are free to add additional keyword arguments, but only the ones
   defined here are used by the Python codec registry.

   *stream* must be a file-like object open for reading (binary) data.

   The :class:`StreamReader` may implement different error handling schemes by
   providing the *errors* keyword argument.  These parameters are defined:

   * ``'strict'`` Raise :exc:`ValueError` (or a subclass); this is the default.

   * ``'ignore'`` Ignore the character and continue with the next.

   * ``'replace'`` Replace with a suitable replacement character.

   The *errors* argument will be assigned to an attribute of the same name.
   Assigning to this attribute makes it possible to switch between different
   error handling strategies during the lifetime of the :class:`StreamReader`
   object.

   The set of allowed values for the *errors* argument can be extended with
   :func:`register_error`.


.. method:: read([size[, chars, [firstline]]])

   Decodes data from the stream and returns the resulting object.

   *chars* indicates the number of characters to read from the stream.
   :func:`read` will never return more than *chars* characters, but it might
   return less, if there are not enough characters available.

   *size* indicates the approximate maximum number of bytes to read from the
   stream for decoding purposes.  The decoder can modify this setting as
   appropriate.  The default value -1 indicates to read and decode as much as
   possible.  *size* is intended to prevent having to decode huge files in
   one step.

   *firstline* indicates that it would be sufficient to only return the first
   line, if there are decoding errors on later lines.

   The method should use a greedy read strategy meaning that it should read
   as much data as is allowed within the definition of the encoding and the
   given size, e.g. if optional encoding endings or state markers are
   available on the stream, these should be read too.


.. method:: readline([size[, keepends]])

   Read one line from the input stream and return the decoded data.

   *size*, if given, is passed as size argument to the stream's
   :meth:`readline` method.

   If *keepends* is false line-endings will be stripped from the lines
   returned.


.. method:: readlines([sizehint[, keepends]])

   Read all lines available on the input stream and return them as a list of
   lines.

   Line-endings are implemented using the codec's decoder method and are
   included in the list entries if *keepends* is true.

   *sizehint*, if given, is passed as the *size* argument to the stream's
   :meth:`read` method.


.. method:: reset()

   Resets the codec buffers used for keeping state.

   Note that no stream repositioning should take place.  This method is
   primarily intended to be able to recover from decoding errors.


In addition to the above methods, the :class:`StreamReader` must also inherit
all other methods and attributes from the underlying stream.
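
A short usage sketch for the two stream classes above, assuming the standard
``'utf-8'`` codec and an in-memory byte stream (for ordinary files the
:class:`io.TextIOWrapper` class mentioned in the deprecation notes is usually
preferable)::

   import codecs
   import io

   raw = io.BytesIO()
   writer = codecs.getwriter('utf-8')(raw)       # a StreamWriter around the byte stream
   writer.write('Grüße\n')                       # encoded on the way out
   writer.writelines(['a\n', 'b\n'])

   raw.seek(0)
   reader = codecs.getreader('utf-8')(raw)       # a StreamReader around the same bytes
   assert reader.readline() == 'Grüße\n'         # decoded on the way in
   assert reader.readlines() == ['a\n', 'b\n']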
The next two base classes are included for convenience.  They are not needed
by the codec registry, but may prove useful in practice.


.. _stream-reader-writer:

StreamReaderWriter Objects
^^^^^^^^^^^^^^^^^^^^^^^^^^

The :class:`StreamReaderWriter` allows wrapping streams which work in both
read and write modes.

The design is such that one can use the factory functions returned by the
:func:`lookup` function to construct the instance.

.. deprecated:: 3.3
   Use the :class:`io.TextIOWrapper` class.


.. class:: StreamReaderWriter(stream, Reader, Writer, errors)

   Creates a :class:`StreamReaderWriter` instance.  *stream* must be a
   file-like object.  *Reader* and *Writer* must be factory functions or
   classes providing the :class:`StreamReader` and :class:`StreamWriter`
   interface resp.  Error handling is done in the same way as defined for the
   stream readers and writers.

:class:`StreamReaderWriter` instances define the combined interfaces of
:class:`StreamReader` and :class:`StreamWriter` classes.  They inherit all
other methods and attributes from the underlying stream.


.. _stream-recoder-objects:

StreamRecoder Objects
^^^^^^^^^^^^^^^^^^^^^

The :class:`StreamRecoder` provides a frontend - backend view of encoding
data which is sometimes useful when dealing with different encoding
environments.

The design is such that one can use the factory functions returned by the
:func:`lookup` function to construct the instance.

.. deprecated:: 3.3


.. class:: StreamRecoder(stream, encode, decode, Reader, Writer, errors)

   Creates a :class:`StreamRecoder` instance which implements a two-way
   conversion: *encode* and *decode* work on the frontend (the input to
   :meth:`read` and output of :meth:`write`) while *Reader* and *Writer* work
   on the backend (reading and writing to the stream).

   You can use these objects to do transparent direct recodings from e.g.
   Latin-1 to UTF-8 and back.

   *stream* must be a file-like object.

   *encode*, *decode* must adhere to the :class:`Codec` interface.  *Reader*,
   *Writer* must be factory functions or classes providing objects of the
   :class:`StreamReader` and :class:`StreamWriter` interface respectively.

   *encode* and *decode* are needed for the frontend translation, *Reader* and
   *Writer* for the backend translation.

   Error handling is done in the same way as defined for the stream readers
   and writers.

:class:`StreamRecoder` instances define the combined interfaces of
:class:`StreamReader` and :class:`StreamWriter` classes.  They inherit all
other methods and attributes from the underlying stream.


.. _encodings-overview:

Encodings and Unicode
---------------------

Strings are stored internally as sequences of codepoints (to be precise as
:c:type:`Py_UNICODE` arrays).  Depending on the way Python is compiled (either
via ``--without-wide-unicode`` or ``--with-wide-unicode``, with the former
being the default) :c:type:`Py_UNICODE` is either a 16-bit or 32-bit data
type.

Once a string object is used outside of CPU and memory, CPU endianness and how
these arrays are stored as bytes become an issue.  Transforming a string
object into a sequence of bytes is called encoding and recreating the string
object from the sequence of bytes is known as decoding.  There are many
different methods for how this transformation can be done (these methods are
also called encodings).  The simplest method is to map the codepoints 0-255 to
the bytes ``0x0``-``0xff``.  This means that a string object that contains
codepoints above ``U+00FF`` can't be encoded with this method (which is called
``'latin-1'`` or ``'iso-8859-1'``).  :func:`str.encode` will raise a
:exc:`UnicodeEncodeError` that looks like this: ``UnicodeEncodeError:
'latin-1' codec can't encode character '\u1234' in position 3: ordinal not in
range(256)``.

There's another group of encodings (the so called charmap encodings) that
choose a different subset of all Unicode code points and how these codepoints
are mapped to the bytes ``0x0``-``0xff``.  To see how this is done simply open
e.g. :file:`encodings/cp1252.py` (which is an encoding that is used primarily
on Windows).  There's a string constant with 256 characters that shows you
which character is mapped to which byte value.
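
A small sketch of the two points above: encoding a character outside the
256-character range of such a codec fails, while decoding simply maps each
byte to the entry at that position of the codec's table::

   # Code points above U+00FF cannot be represented by a 256-character codec.
   try:
       'abc\u1234'.encode('latin-1')
   except UnicodeEncodeError as exc:
       print(exc)   # ... can't encode character '\u1234' in position 3 ...

   # Decoding maps each byte to the character stored at that position of the
   # codec's table; cp1252 places the EURO SIGN at byte 0x80, for example.
   assert b'\x80'.decode('cp1252') == '\u20ac'
   assert b'\xff'.decode('latin-1') == '\xff'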
All of these encodings can only encode 256 of the 65536 (or 1114111)
codepoints defined in Unicode.  A simple and straightforward way that can
store each Unicode code point is to store each codepoint as two consecutive
bytes.  There are two possibilities: Store the bytes in big endian or in
little endian order.  These two encodings are called UTF-16-BE and UTF-16-LE
respectively.  Their disadvantage is that if e.g. you use UTF-16-BE on a
little endian machine you will always have to swap bytes on encoding and
decoding.  UTF-16 avoids this problem: Bytes will always be in natural
endianness.  When these bytes are read by a CPU with a different endianness,
then bytes have to be swapped though.

To be able to detect the endianness of a UTF-16 byte sequence, there's the so
called BOM (the "Byte Order Mark").  This is the Unicode character ``U+FEFF``.
This character will be prepended to every UTF-16 byte sequence.  The byte
swapped version of this character (``0xFFFE``) is an illegal character that
may not appear in a Unicode text.  So when the first character in a UTF-16
byte sequence appears to be a ``U+FFFE`` the bytes have to be swapped on
decoding.  Unfortunately up to Unicode 4.0 the character ``U+FEFF`` had a
second purpose as a ``ZERO WIDTH NO-BREAK SPACE``: A character that has no
width and doesn't allow a word to be split.  It can e.g. be used to give hints
to a ligature algorithm.  With Unicode 4.0 using ``U+FEFF`` as a ``ZERO WIDTH
NO-BREAK SPACE`` has been deprecated (with ``U+2060`` (``WORD JOINER``)
assuming this role).  Nevertheless Unicode software still must be able to
handle ``U+FEFF`` in both roles: As a BOM it's a device to determine the
storage layout of the encoded bytes, and vanishes once the byte sequence has
been decoded into a string; as a ``ZERO WIDTH NO-BREAK SPACE`` it's a normal
character that will be decoded like any other.

There's another encoding that is able to encode the full range of Unicode
characters: UTF-8.  UTF-8 is an 8-bit encoding, which means there are no
issues with byte order in UTF-8.  Each byte in a UTF-8 byte sequence consists
of two parts: Marker bits (the most significant bits) and payload bits.  The
marker bits are a sequence of zero to six 1 bits followed by a 0 bit.  Unicode
characters are encoded like this (with x being payload bits, which when
concatenated give the Unicode character):

+-----------------------------------+----------------------------------------------+
| Range                             | Encoding                                     |
+===================================+==============================================+
| ``U-00000000`` ... ``U-0000007F`` | 0xxxxxxx                                     |
+-----------------------------------+----------------------------------------------+
| ``U-00000080`` ... ``U-000007FF`` | 110xxxxx 10xxxxxx                            |
+-----------------------------------+----------------------------------------------+
| ``U-00000800`` ... ``U-0000FFFF`` | 1110xxxx 10xxxxxx 10xxxxxx                   |
+-----------------------------------+----------------------------------------------+
| ``U-00010000`` ... ``U-001FFFFF`` | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx          |
+-----------------------------------+----------------------------------------------+
| ``U-00200000`` ... ``U-03FFFFFF`` | 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx |
+-----------------------------------+----------------------------------------------+
| ``U-04000000`` ... ``U-7FFFFFFF`` | 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx |
|                                   | 10xxxxxx                                     |
+-----------------------------------+----------------------------------------------+

The least significant bit of the Unicode character is the rightmost x bit.

As UTF-8 is an 8-bit encoding no BOM is required and any ``U+FEFF`` character
in the decoded string (even if it's the first character) is treated as a
``ZERO WIDTH NO-BREAK SPACE``.

Without external information it's impossible to reliably determine which
encoding was used for encoding a string.  Each charmap encoding can decode any
random byte sequence.  However that's not possible with UTF-8, as UTF-8 byte
sequences have a structure that doesn't allow arbitrary byte sequences.  To
increase the reliability with which a UTF-8 encoding can be detected,
Microsoft invented a variant of UTF-8 (that Python 2.5 calls ``"utf-8-sig"``)
for its Notepad program: Before any of the Unicode characters is written to
the file, a UTF-8 encoded BOM (which looks like this as a byte sequence:
``0xef``, ``0xbb``, ``0xbf``) is written.  As it's rather improbable that any
charmap encoded file starts with these byte values (which would e.g. map to

| LATIN SMALL LETTER I WITH DIAERESIS
| RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
| INVERTED QUESTION MARK

in iso-8859-1), this increases the probability that a utf-8-sig encoding can
be correctly guessed from the byte sequence.  So here the BOM is not used to
be able to determine the byte order used for generating the byte sequence, but
as a signature that helps in guessing the encoding.  On encoding the utf-8-sig
codec will write ``0xef``, ``0xbb``, ``0xbf`` as the first three bytes to the
file.  On decoding utf-8-sig will skip those three bytes if they appear as the
first three bytes in the file.


.. _standard-encodings:

Standard Encodings
------------------

Python comes with a number of codecs built-in, either implemented as C
functions or with dictionaries as mapping tables.  The following table lists
the codecs by name, together with a few common aliases, and the languages for
which the encoding is likely used.  Neither the list of aliases nor the list
of languages is meant to be exhaustive.  Notice that spelling alternatives
that only differ in case or use a hyphen instead of an underscore are also
valid aliases; therefore, e.g. ``'utf-8'`` is a valid alias for the
``'utf_8'`` codec.

.. impl-detail::

   Some common encodings can bypass the codecs lookup machinery to improve
   performance.  These optimization opportunities are only recognized by
   CPython for a limited set of aliases: utf-8, utf8, latin-1, latin1,
   iso-8859-1, mbcs (Windows only), ascii, utf-16, and utf-32.  Using
   alternative spellings for these encodings may result in slower execution.
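
A quick sketch of the alias handling described above, using :func:`lookup` to
normalize spelling variants to the codec's canonical name::

   import codecs

   # Case, hyphens and underscores are ignored when a codec is looked up,
   # so all of these spellings resolve to the same codec.
   assert codecs.lookup('UTF-8').name == codecs.lookup('utf_8').name == 'utf-8'
   assert codecs.lookup('latin_1').name == codecs.lookup('L1').name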
Many of the character sets support the same languages.  They vary in
individual characters (e.g. whether the EURO SIGN is supported or not), and in
the assignment of characters to code positions.  For the European languages in
particular, the following variants typically exist:

* an ISO 8859 codeset

* a Microsoft Windows code page, which is typically derived from an 8859
  codeset, but replaces control characters with additional graphic characters

* an IBM EBCDIC code page

* an IBM PC code page, which is ASCII compatible

+-----------------+--------------------------------+--------------------------------+
| Codec           | Aliases                        | Languages                      |
+=================+================================+================================+
| ascii           | 646, us-ascii                  | English                        |
+-----------------+--------------------------------+--------------------------------+
| big5            | big5-tw, csbig5                | Traditional Chinese            |
+-----------------+--------------------------------+--------------------------------+
| big5hkscs       | big5-hkscs, hkscs              | Traditional Chinese            |
+-----------------+--------------------------------+--------------------------------+
| cp037           | IBM037, IBM039                 | English                        |
+-----------------+--------------------------------+--------------------------------+
| cp424           | EBCDIC-CP-HE, IBM424           | Hebrew                         |
+-----------------+--------------------------------+--------------------------------+
| cp437           | 437, IBM437                    | English                        |
+-----------------+--------------------------------+--------------------------------+
| cp500           | EBCDIC-CP-BE, EBCDIC-CP-CH,    | Western Europe                 |
|                 | IBM500                         |                                |
+-----------------+--------------------------------+--------------------------------+
| cp720           |                                | Arabic                         |
+-----------------+--------------------------------+--------------------------------+
| cp737           |                                | Greek                          |
+-----------------+--------------------------------+--------------------------------+
| cp775           | IBM775                         | Baltic languages               |
+-----------------+--------------------------------+--------------------------------+
| cp850           | 850, IBM850                    | Western Europe                 |
+-----------------+--------------------------------+--------------------------------+
| cp852           | 852, IBM852                    | Central and Eastern Europe     |
+-----------------+--------------------------------+--------------------------------+
| cp855           | 855, IBM855                    | Bulgarian, Byelorussian,       |
|                 |                                | Macedonian, Russian, Serbian   |
+-----------------+--------------------------------+--------------------------------+
| cp856           |                                | Hebrew                         |
+-----------------+--------------------------------+--------------------------------+
| cp857           | 857, IBM857                    | Turkish                        |
+-----------------+--------------------------------+--------------------------------+
| cp858           | 858, IBM858                    | Western Europe                 |
+-----------------+--------------------------------+--------------------------------+
| cp860           | 860, IBM860                    | Portuguese                     |
+-----------------+--------------------------------+--------------------------------+
| cp861           | 861, CP-IS, IBM861             | Icelandic                      |
+-----------------+--------------------------------+--------------------------------+
| cp862           | 862, IBM862                    | Hebrew                         |
+-----------------+--------------------------------+--------------------------------+
| cp863           | 863, IBM863                    | Canadian                       |
+-----------------+--------------------------------+--------------------------------+
| cp864           | IBM864                         | Arabic                         |
+-----------------+--------------------------------+--------------------------------+
| cp865           | 865, IBM865                    | Danish, Norwegian              |
+-----------------+--------------------------------+--------------------------------+
| cp866           | 866, IBM866                    | Russian                        |
+-----------------+--------------------------------+--------------------------------+
| cp869           | 869, CP-GR, IBM869             | Greek                          |
+-----------------+--------------------------------+--------------------------------+
| cp874           |                                | Thai                           |
+-----------------+--------------------------------+--------------------------------+
| cp875           |                                | Greek                          |
+-----------------+--------------------------------+--------------------------------+
| cp932           | 932, ms932, mskanji, ms-kanji  | Japanese                       |
+-----------------+--------------------------------+--------------------------------+
| cp949           | 949, ms949, uhc                | Korean                         |
+-----------------+--------------------------------+--------------------------------+
| cp950           | 950, ms950                     | Traditional Chinese            |
+-----------------+--------------------------------+--------------------------------+
| cp1006          |                                | Urdu                           |
+-----------------+--------------------------------+--------------------------------+
| cp1026          | ibm1026                        | Turkish                        |
+-----------------+--------------------------------+--------------------------------+
| cp1140          | ibm1140                        | Western Europe                 |
+-----------------+--------------------------------+--------------------------------+
| cp1250          | windows-1250                   | Central and Eastern Europe     |
+-----------------+--------------------------------+--------------------------------+
| cp1251          | windows-1251                   | Bulgarian, Byelorussian,       |
|                 |                                | Macedonian, Russian, Serbian   |
+-----------------+--------------------------------+--------------------------------+
| cp1252          | windows-1252                   | Western Europe                 |
+-----------------+--------------------------------+--------------------------------+
| cp1253          | windows-1253                   | Greek                          |
+-----------------+--------------------------------+--------------------------------+
| cp1254          | windows-1254                   | Turkish                        |
+-----------------+--------------------------------+--------------------------------+
| cp1255          | windows-1255                   | Hebrew                         |
+-----------------+--------------------------------+--------------------------------+
| cp1256          | windows-1256                   | Arabic                         |
+-----------------+--------------------------------+--------------------------------+
| cp1257          | windows-1257                   | Baltic languages               |
+-----------------+--------------------------------+--------------------------------+
| cp1258          | windows-1258                   | Vietnamese                     |
+-----------------+--------------------------------+--------------------------------+
| euc_jp          | eucjp, ujis, u-jis             | Japanese                       |
+-----------------+--------------------------------+--------------------------------+
| euc_jis_2004    | jisx0213, eucjis2004           | Japanese                       |
+-----------------+--------------------------------+--------------------------------+
| euc_jisx0213    | eucjisx0213                    | Japanese                       |
+-----------------+--------------------------------+--------------------------------+
| euc_kr          | euckr, korean, ksc5601,        | Korean                         |
|                 | ks_c-5601, ks_c-5601-1987,     |                                |
|                 | ksx1001, ks_x-1001             |                                |
+-----------------+--------------------------------+--------------------------------+
| gb2312          | chinese, csiso58gb231280, euc- | Simplified Chinese             |
|                 | cn, euccn, eucgb2312-cn,       |                                |
|                 | gb2312-1980, gb2312-80, iso-   |                                |
|                 | ir-58                          |                                |
+-----------------+--------------------------------+--------------------------------+
| gbk             | 936, cp936, ms936              | Unified Chinese                |
+-----------------+--------------------------------+--------------------------------+
| gb18030         | gb18030-2000                   | Unified Chinese                |
+-----------------+--------------------------------+--------------------------------+
| hz              | hzgb, hz-gb, hz-gb-2312        | Simplified Chinese             |
+-----------------+--------------------------------+--------------------------------+
| iso2022_jp      | csiso2022jp, iso2022jp,        | Japanese                       |
|                 | iso-2022-jp                    |                                |
+-----------------+--------------------------------+--------------------------------+
| iso2022_jp_1    | iso2022jp-1, iso-2022-jp-1     | Japanese                       |
+-----------------+--------------------------------+--------------------------------+
| iso2022_jp_2    | iso2022jp-2, iso-2022-jp-2     | Japanese, Korean, Simplified   |
|                 |                                | Chinese, Western Europe, Greek |
+-----------------+--------------------------------+--------------------------------+
| iso2022_jp_2004 | iso2022jp-2004,                | Japanese                       |
|                 | iso-2022-jp-2004               |                                |
+-----------------+--------------------------------+--------------------------------+
| iso2022_jp_3    | iso2022jp-3, iso-2022-jp-3     | Japanese                       |
+-----------------+--------------------------------+--------------------------------+
| iso2022_jp_ext  | iso2022jp-ext, iso-2022-jp-ext | Japanese                       |
+-----------------+--------------------------------+--------------------------------+
| iso2022_kr      | csiso2022kr, iso2022kr,        | Korean                         |
|                 | iso-2022-kr                    |                                |
+-----------------+--------------------------------+--------------------------------+
| latin_1         | iso-8859-1, iso8859-1, 8859,   | West Europe                    |
|                 | cp819, latin, latin1, L1       |                                |
+-----------------+--------------------------------+--------------------------------+
| iso8859_2       | iso-8859-2, latin2, L2         | Central and Eastern Europe     |
+-----------------+--------------------------------+--------------------------------+
| iso8859_3       | iso-8859-3, latin3, L3         | Esperanto, Maltese             |
+-----------------+--------------------------------+--------------------------------+
| iso8859_4       | iso-8859-4, latin4, L4         | Baltic languages               |
+-----------------+--------------------------------+--------------------------------+
| iso8859_5       | iso-8859-5, cyrillic           | Bulgarian, Byelorussian,       |
|                 |                                | Macedonian, Russian, Serbian   |
+-----------------+--------------------------------+--------------------------------+
| iso8859_6       | iso-8859-6, arabic             | Arabic                         |
+-----------------+--------------------------------+--------------------------------+
| iso8859_7       | iso-8859-7, greek, greek8      | Greek                          |
+-----------------+--------------------------------+--------------------------------+
| iso8859_8       | iso-8859-8, hebrew             | Hebrew                         |
+-----------------+--------------------------------+--------------------------------+
| iso8859_9       | iso-8859-9, latin5, L5         | Turkish                        |
+-----------------+--------------------------------+--------------------------------+
| iso8859_10      | iso-8859-10, latin6, L6        | Nordic languages               |
+-----------------+--------------------------------+--------------------------------+
| iso8859_13      | iso-8859-13, latin7, L7        | Baltic languages               |
+-----------------+--------------------------------+--------------------------------+
| iso8859_14      | iso-8859-14, latin8, L8        | Celtic languages               |
+-----------------+--------------------------------+--------------------------------+
| iso8859_15      | iso-8859-15, latin9, L9        | Western Europe                 |
+-----------------+--------------------------------+--------------------------------+
| iso8859_16      | iso-8859-16, latin10, L10      | South-Eastern Europe           |
+-----------------+--------------------------------+--------------------------------+
| johab           | cp1361, ms1361                 | Korean                         |
+-----------------+--------------------------------+--------------------------------+
| koi8_r          |                                | Russian                        |
+-----------------+--------------------------------+--------------------------------+
| koi8_u          |                                | Ukrainian                      |
+-----------------+--------------------------------+--------------------------------+
| mac_cyrillic    | maccyrillic                    | Bulgarian, Byelorussian,       |
|                 |                                | Macedonian, Russian, Serbian   |
+-----------------+--------------------------------+--------------------------------+
| mac_greek       | macgreek                       | Greek                          |
+-----------------+--------------------------------+--------------------------------+
| mac_iceland     | maciceland                     | Icelandic                      |
+-----------------+--------------------------------+--------------------------------+
| mac_latin2      | maclatin2, maccentraleurope    | Central and Eastern Europe     |
+-----------------+--------------------------------+--------------------------------+
| mac_roman       | macroman, macintosh            | Western Europe                 |
+-----------------+--------------------------------+--------------------------------+
| mac_turkish     | macturkish                     | Turkish                        |
+-----------------+--------------------------------+--------------------------------+
| ptcp154         | csptcp154, pt154, cp154,       | Kazakh                         |
|                 | cyrillic-asian                 |                                |
+-----------------+--------------------------------+--------------------------------+
| shift_jis       | csshiftjis, shiftjis, sjis,    | Japanese                       |
|                 | s_jis                          |                                |
+-----------------+--------------------------------+--------------------------------+
| shift_jis_2004  | shiftjis2004, sjis_2004,       | Japanese                       |
|                 | sjis2004                       |                                |
+-----------------+--------------------------------+--------------------------------+
| shift_jisx0213  | shiftjisx0213, sjisx0213,      | Japanese                       |
|                 | s_jisx0213                     |                                |
+-----------------+--------------------------------+--------------------------------+
| utf_32          | U32, utf32                     | all languages                  |
+-----------------+--------------------------------+--------------------------------+
| utf_32_be       | UTF-32BE                       | all languages                  |
+-----------------+--------------------------------+--------------------------------+
| utf_32_le       | UTF-32LE                       | all languages                  |
+-----------------+--------------------------------+--------------------------------+
| utf_16          | U16, utf16                     | all languages                  |
+-----------------+--------------------------------+--------------------------------+
| utf_16_be       | UTF-16BE                       | all languages                  |
+-----------------+--------------------------------+--------------------------------+
| utf_16_le       | UTF-16LE                       | all languages                  |
+-----------------+--------------------------------+--------------------------------+
| utf_7           | U7, unicode-1-1-utf-7          | all languages                  |
+-----------------+--------------------------------+--------------------------------+
| utf_8           | U8, UTF, utf8                  | all languages                  |
+-----------------+--------------------------------+--------------------------------+
| utf_8_sig       |                                | all languages                  |
+-----------------+--------------------------------+--------------------------------+

.. XXX fix here, should be in above table

+--------------------+---------+---------------------------+
| Codec              | Aliases | Purpose                   |
+====================+=========+===========================+
| idna               |         | Implements :rfc:`3490`,   |
|                    |         | see also                  |
|                    |         | :mod:`encodings.idna`     |
+--------------------+---------+---------------------------+
| mbcs               | dbcs    | Windows only: Encode      |
|                    |         | operand according to the  |
|                    |         | ANSI codepage (CP_ACP)    |
+--------------------+---------+---------------------------+
| palmos             |         | Encoding of PalmOS 3.5    |
+--------------------+---------+---------------------------+
| punycode           |         | Implements :rfc:`3492`    |
+--------------------+---------+---------------------------+
| raw_unicode_escape |         | Produce a string that is  |
|                    |         | suitable as raw Unicode   |
|                    |         | literal in Python source  |
|                    |         | code                      |
+--------------------+---------+---------------------------+
| undefined          |         | Raise an exception for    |
|                    |         | all conversions. Can be   |
|                    |         | used as the system        |
|                    |         | encoding if no automatic  |
|                    |         | coercion between byte and |
|                    |         | Unicode strings is        |
|                    |         | desired.                  |
+--------------------+---------+---------------------------+
| unicode_escape     |         | Produce a string that is  |
|                    |         | suitable as Unicode       |
|                    |         | literal in Python source  |
|                    |         | code                      |
+--------------------+---------+---------------------------+
| unicode_internal   |         | Return the internal       |
|                    |         | representation of the     |
|                    |         | operand                   |
+--------------------+---------+---------------------------+

The following codecs provide bytes-to-bytes mappings.

+--------------------+---------------------------+---------------------------+
| Codec              | Aliases                   | Purpose                   |
+====================+===========================+===========================+
| base64_codec       | base64, base-64           | Convert operand to MIME   |
|                    |                           | base64                    |
+--------------------+---------------------------+---------------------------+
| bz2_codec          | bz2                       | Compress the operand      |
|                    |                           | using bz2                 |
+--------------------+---------------------------+---------------------------+
| hex_codec          | hex                       | Convert operand to        |
|                    |                           | hexadecimal               |
|                    |                           | representation, with two  |
|                    |                           | digits per byte           |
+--------------------+---------------------------+---------------------------+
| quopri_codec       | quopri, quoted-printable, | Convert operand to MIME   |
|                    | quotedprintable           | quoted printable          |
+--------------------+---------------------------+---------------------------+
| uu_codec           | uu                        | Convert the operand using |
|                    |                           | uuencode                  |
+--------------------+---------------------------+---------------------------+
| zlib_codec         | zip, zlib                 | Compress the operand      |
|                    |                           | using gzip                |
+--------------------+---------------------------+---------------------------+

The following codecs provide string-to-string mappings.

+--------------------+---------------------------+---------------------------+
| Codec              | Aliases                   | Purpose                   |
+====================+===========================+===========================+
| rot_13             | rot13                     | Returns the Caesar-cypher |
|                    |                           | encryption of the operand |
+--------------------+---------------------------+---------------------------+

.. versionadded:: 3.2
   bytes-to-bytes and string-to-string codecs.


:mod:`encodings.idna` --- Internationalized Domain Names in Applications
-------------------------------------------------------------------------

.. module:: encodings.idna
   :synopsis: Internationalized Domain Names implementation
.. moduleauthor:: Martin v. Löwis

This module implements :rfc:`3490` (Internationalized Domain Names in
Applications) and :rfc:`3492` (Nameprep: A Stringprep Profile for
Internationalized Domain Names (IDN)).  It builds upon the ``punycode``
encoding and :mod:`stringprep`.

These RFCs together define a protocol to support non-ASCII characters in
domain names.  A domain name containing non-ASCII characters (such as
``www.Alliancefrançaise.nu``) is converted into an ASCII-compatible encoding
(ACE, such as ``www.xn--alliancefranaise-npb.nu``).  The ACE form of the
domain name is then used in all places where arbitrary characters are not
allowed by the protocol, such as DNS queries, HTTP :mailheader:`Host` fields,
and so on.  This conversion is carried out in the application; if possible
invisible to the user: The application should transparently convert Unicode
domain labels to IDNA on the wire, and convert back ACE labels to Unicode
before presenting them to the user.
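
A small sketch of the conversion just described, using the ``idna`` codec that
is covered in the next paragraph and the example domain given above::

   domain = 'www.Alliancefrançaise.nu'
   ace = domain.encode('idna')              # Unicode labels -> ACE labels
   assert ace == b'www.xn--alliancefranaise-npb.nu'
   assert ace.decode('idna') == 'www.alliancefrançaise.nu'   # and back (nameprepped)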
Python supports this conversion in several ways:  the ``idna`` codec performs
conversion between Unicode and ACE, separating an input string into labels
based on the separator characters defined in `section 3.1`_ (1) of :rfc:`3490`
and converting each label to ACE as required, and conversely separating an
input byte string into labels based on the ``.`` separator and converting any
ACE labels found into unicode.  Furthermore, the :mod:`socket` module
transparently converts Unicode host names to ACE, so that applications need
not be concerned about converting host names themselves when they pass them to
the socket module.  On top of that, modules that have host names as function
parameters, such as :mod:`http.client` and :mod:`ftplib`, accept Unicode host
names (:mod:`http.client` then also transparently sends an IDNA hostname in
the :mailheader:`Host` field if it sends that field at all).

.. _section 3.1: http://tools.ietf.org/html/rfc3490#section-3.1

When receiving host names from the wire (such as in reverse name lookup), no
automatic conversion to Unicode is performed: Applications wishing to present
such host names to the user should decode them to Unicode.

The module :mod:`encodings.idna` also implements the nameprep procedure, which
performs certain normalizations on host names, to achieve case-insensitivity
of international domain names, and to unify similar characters.  The nameprep
functions can be used directly if desired.


.. function:: nameprep(label)

   Return the nameprepped version of *label*.  The implementation currently
   assumes query strings, so ``AllowUnassigned`` is true.


.. function:: ToASCII(label)

   Convert a label to ASCII, as specified in :rfc:`3490`.
   ``UseSTD3ASCIIRules`` is assumed to be false.


.. function:: ToUnicode(label)

   Convert a label to Unicode, as specified in :rfc:`3490`.


:mod:`encodings.mbcs` --- Windows ANSI codepage
-----------------------------------------------

.. module:: encodings.mbcs
   :synopsis: Windows ANSI codepage

Encode operand according to the ANSI codepage (CP_ACP).

This codec only supports ``'strict'`` and ``'replace'`` error handlers to
encode, and ``'strict'`` and ``'ignore'`` error handlers to decode.

Availability: Windows only.

.. versionchanged:: 3.2
   Before 3.2, the *errors* argument was ignored; ``'replace'`` was always
   used to encode, and ``'ignore'`` to decode.


:mod:`encodings.utf_8_sig` --- UTF-8 codec with BOM signature
-------------------------------------------------------------

.. module:: encodings.utf_8_sig
   :synopsis: UTF-8 codec with BOM signature
.. moduleauthor:: Walter Dörwald

This module implements a variant of the UTF-8 codec: On encoding a UTF-8
encoded BOM will be prepended to the UTF-8 encoded bytes.  For the stateful
encoder this is only done once (on the first write to the byte stream).  For
decoding an optional UTF-8 encoded BOM at the start of the data will be
skipped.
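
A short sketch of this behaviour::

   # Encoding prepends the UTF-8 encoded BOM ...
   assert 'abc'.encode('utf-8-sig') == b'\xef\xbb\xbfabc'

   # ... and decoding skips it if present, while plain UTF-8 keeps it as U+FEFF.
   assert b'\xef\xbb\xbfabc'.decode('utf-8-sig') == 'abc'
   assert b'\xef\xbb\xbfabc'.decode('utf-8') == '\ufeffabc'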