path: root/Lib/posixpath.py
blob: 0aa53feac2e5885d2dfb1f90efa9bf2c4147ff7e
"""Common operations on Posix pathnames.

Instead of importing this module directly, import os and refer to
this module as os.path.  The "os.path" name is an alias for this
module on Posix systems; on other systems (e.g. Mac, Windows),
os.path provides the same operations in a manner specific to that
platform, and is an alias to another module (e.g. macpath, ntpath).

Some of this can actually be useful on non-Posix systems too, e.g.
for manipulation of the pathname component of URLs.
"""

import os
import sys
import stat
import genericpath
from genericpath import *

__all__ = ["normcase","isabs","join","splitdrive","split","splitext",
           "basename","dirname","commonprefix","getsize","getmtime",
           "getatime","getctime","islink","exists","lexists","isdir","isfile",
           "ismount", "expanduser","expandvars","normpath","abspath",
           "samefile","sameopenfile","samestat",
           "curdir","pardir","sep","pathsep","defpath","altsep","extsep",
           "devnull","realpath","supports_unicode_filenames","relpath"]

# Strings representing various path-related bits and pieces.
# These are primarily for export; internally, they are hardcoded.
curdir = '.'
pardir = '..'
extsep = '.'
sep = '/'
pathsep = ':'
defpath = ':/bin:/usr/bin'
altsep = None
devnull = '/dev/null'

def _get_sep(path):
    if isinstance(path, bytes):
        return b'/'
    else:
        return '/'

# Normalize the case of a pathname.  Trivial in Posix, string.lower on Mac.
# On MS-DOS this may also turn slashes into backslashes; however, other
# normalizations (such as optimizing '../' away) are not allowed
# (another function should be defined to do that).

def normcase(s):
    """Normalize case of pathname.  Has no effect under Posix"""
    if not isinstance(s, (bytes, str)):
        raise TypeError("normcase() argument must be str or bytes, "
                        "not '{}'".format(s.__class__.__name__))
    return s


# Return whether a path is absolute.
# Trivial in Posix, harder on the Mac or MS-DOS.

def isabs(s):
    """Test whether a path is absolute"""
    sep = _get_sep(s)
    return s.startswith(sep)


# Join pathnames.
# Ignore the previous parts if a part is absolute.
# Insert a '/' unless the first part is empty or already ends in '/'.

def join(a, *p):
    """Join two or more pathname components, inserting '/' as needed.
    If any component is an absolute path, all previous path components
    will be discarded.  An empty last part will result in a path that
    ends with a separator."""
    sep = _get_sep(a)
    path = a
    try:
        for b in p:
            if b.startswith(sep):
                path = b
            elif not path or path.endswith(sep):
                path += b
            else:
                path += sep + b
    except TypeError:
        if all(isinstance(s, (str, bytes)) for s in (a,) + p):
            # Must have a mixture of text and binary data
            raise TypeError("Can't mix strings and bytes in path "
                            "components") from None
        raise
    return path
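
# Illustrative examples (not part of the original module; results assume
# POSIX semantics):
#   join('/usr', 'lib', 'python')  == '/usr/lib/python'
#   join('/usr', '/bin')           == '/bin'   # absolute part discards the rest
#   join('a', '')                  == 'a/'     # empty last part keeps the sep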


# Split a path in head (everything up to the last '/') and tail (the
# rest).  If the path ends in '/', tail will be empty.  If there is no
# '/' in the path, head will be empty.
# Trailing '/'es are stripped from head unless it is the root.

def split(p):
    """Split a pathname.  Returns tuple "(head, tail)" where "tail" is
    everything after the final slash.  Either part may be empty."""
    sep = _get_sep(p)
    i = p.rfind(sep) + 1
    head, tail = p[:i], p[i:]
    if head and head != sep*len(head):
        head = head.rstrip(sep)
    return head, tail
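
# Illustrative examples:
#   split('/usr/lib')  == ('/usr', 'lib')
#   split('/usr/lib/') == ('/usr/lib', '')   # trailing slash -> empty tail
#   split('lib')       == ('', 'lib')
#   split('/')         == ('/', '')          # the root is never stripped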


# Split a path in root and extension.
# The extension is everything starting at the last dot in the last
# pathname component; the root is everything before that.
# It is always true that root + ext == p.

def splitext(p):
    if isinstance(p, bytes):
        sep = b'/'
        extsep = b'.'
    else:
        sep = '/'
        extsep = '.'
    return genericpath._splitext(p, sep, None, extsep)
splitext.__doc__ = genericpath._splitext.__doc__
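
# Illustrative examples:
#   splitext('/a/b.py')    == ('/a/b', '.py')
#   splitext('a.tar.gz')   == ('a.tar', '.gz')    # only the last dot counts
#   splitext('/a/.bashrc') == ('/a/.bashrc', '')  # leading dots: no extension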

# Split a pathname into a drive specification and the rest of the
# path.  Useful on DOS/Windows/NT; on Unix, the drive is always empty.

def splitdrive(p):
    """Split a pathname into drive and path. On Posix, drive is always
    empty."""
    return p[:0], p


# Return the tail (basename) part of a path, same as split(path)[1].

def basename(p):
    """Returns the final component of a pathname"""
    sep = _get_sep(p)
    i = p.rfind(sep) + 1
    return p[i:]


# Return the head (dirname) part of a path, same as split(path)[0].

def dirname(p):
    """Returns the directory component of a pathname"""
    sep = _get_sep(p)
    i = p.rfind(sep) + 1
    head = p[:i]
    if head and head != sep*len(head):
        head = head.rstrip(sep)
    return head
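
# Illustrative examples for basename() and dirname():
#   basename('/usr/lib') == 'lib'
#   dirname('/usr/lib')  == '/usr'
#   dirname('/usr/lib/') == '/usr/lib'
#   dirname('lib')       == ''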


# Is a path a symbolic link?
# This will always return false on systems where os.lstat doesn't exist.

def islink(path):
    """Test whether a path is a symbolic link"""
    try:
        st = os.lstat(path)
    except (OSError, AttributeError):
        return False
    return stat.S_ISLNK(st.st_mode)

# Being true for dangling symbolic links is also useful.

def lexists(path):
    """Test whether a path exists.  Returns True for broken symbolic links"""
    try:
        os.lstat(path)
    except OSError:
        return False
    return True


# Is a path a mount point?
# (Does this work for all UNIXes?  Is it even guaranteed to work by Posix?)

def ismount(path):
    """Test whether a path is a mount point"""
    try:
        s1 = os.lstat(path)
    except OSError:
        # It doesn't exist -- so not a mount point. :-)
        return False
    else:
        # A symlink can never be a mount point
        if stat.S_ISLNK(s1.st_mode):
            return False

    if isinstance(path, bytes):
        parent = join(path, b'..')
    else:
        parent = join(path, '..')
    try:
        s2 = os.lstat(parent)
    except OSError:
        return False

    dev1 = s1.st_dev
    dev2 = s2.st_dev
    if dev1 != dev2:
        return True     # path/.. on a different device than path
    ino1 = s1.st_ino
    ino2 = s2.st_ino
    if ino1 == ino2:
        return True     # path/.. is the same i-node as path
    return False
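
# Illustrative example (filesystem-dependent, so only a sketch):
#   ismount('/')     is normally True on any POSIX system
#   ismount('/home') is True only if /home lives on its own device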


# Expand paths beginning with '~' or '~user'.
# '~' means $HOME; '~user' means that user's home directory.
# If the path doesn't begin with '~', or if the user or $HOME is unknown,
# the path is returned unchanged (leaving error reporting to whatever
# function is called with the expanded path as argument).
# See also module 'glob' for expansion of *, ? and [...] in pathnames.
# (A function should also be defined to do full *sh-style environment
# variable expansion.)

def expanduser(path):
    """Expand ~ and ~user constructions.  If user or $HOME is unknown,
    do nothing."""
    if isinstance(path, bytes):
        tilde = b'~'
    else:
        tilde = '~'
    if not path.startswith(tilde):
        return path
    sep = _get_sep(path)
    i = path.find(sep, 1)
    if i < 0:
        i = len(path)
    if i == 1:
        if 'HOME' not in os.environ:
            import pwd
            userhome = pwd.getpwuid(os.getuid()).pw_dir
        else:
            userhome = os.environ['HOME']
    else:
        import pwd
        name = path[1:i]
        if isinstance(name, bytes):
            name = str(name, 'ASCII')
        try:
            pwent = pwd.getpwnam(name)
        except KeyError:
            return path
        userhome = pwent.pw_dir
    if isinstance(path, bytes):
        userhome = os.fsencode(userhome)
        root = b'/'
    else:
        root = '/'
    userhome = userhome.rstrip(root)
    return (userhome + path[i:]) or root
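
# Illustrative examples (results depend on the environment; the home
# directory shown here is hypothetical):
#   with HOME=/home/guido:  expanduser('~/bin') == '/home/guido/bin'
#   expanduser('~nosuchuser/bin') is returned unchanged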


# Expand paths containing shell variable substitutions.
# This expands the forms $variable and ${variable} only.
# Non-existent variables are left unchanged.

_varprog = None
_varprogb = None

def expandvars(path):
    """Expand shell variables of form $var and ${var}.  Unknown variables
    are left unchanged."""
    global _varprog, _varprogb
    if isinstance(path, bytes):
        if b'$' not in path:
            return path
        if not _varprogb:
            import re
            _varprogb = re.compile(br'\$(\w+|\{[^}]*\})', re.ASCII)
        search = _varprogb.search
        start = b'{'
        end = b'}'
        environ = getattr(os, 'environb', None)
    else:
        if '$' not in path:
            return path
        if not _varprog:
            import re
            _varprog = re.compile(r'\$(\w+|\{[^}]*\})', re.ASCII)
        search = _varprog.search
        start = '{'
        end = '}'
        environ = os.environ
    i = 0
    while True:
        m = search(path, i)
        if not m:
            break
        i, j = m.span(0)
        name = m.group(1)
        if name.startswith(start) and name.endswith(end):
            name = name[1:-1]
        try:
            if environ is None:
                value = os.fsencode(os.environ[os.fsdecode(name)])
            else:
                value = environ[name]
        except KeyError:
            i = j
        else:
            tail = path[j:]
            path = path[:i] + value
            i = len(path)
            path += tail
    return path
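
# Illustrative examples (environment-dependent; the $HOME value is
# hypothetical):
#   with HOME=/home/guido:  expandvars('$HOME/bin')   == '/home/guido/bin'
#                           expandvars('${HOME}/bin') == '/home/guido/bin'
#   expandvars('$NO_SUCH_VAR') == '$NO_SUCH_VAR'   # unknown: left unchanged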


# Normalize a path, e.g. A//B, A/./B and A/foo/../B all become A/B.
# It should be understood that this may change the meaning of the path
# if it contains symbolic links!

def normpath(path):
    """Normalize path, eliminating double slashes, etc."""
    if isinstance(path, bytes):
        sep = b'/'
        empty = b''
        dot = b'.'
        dotdot = b'..'
    else:
        sep = '/'
        empty = ''
        dot = '.'
        dotdot = '..'
    if path == empty:
        return dot
    initial_slashes = path.startswith(sep)
    # POSIX allows one or two initial slashes, but treats three or more
    # as a single slash.
    # as single slash.
    if (initial_slashes and
        path.startswith(sep*2) and not path.startswith(sep*3)):
        initial_slashes = 2
    comps = path.split(sep)
    new_comps = []
    for comp in comps:
        if comp in (empty, dot):
            continue
        if (comp != dotdot or (not initial_slashes and not new_comps) or
             (new_comps and new_comps[-1] == dotdot)):
            new_comps.append(comp)
        elif new_comps:
            new_comps.pop()
    comps = new_comps
    path = sep.join(comps)
    if initial_slashes:
        path = sep*initial_slashes + path
    return path or dot
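
# Illustrative examples:
#   normpath('A//B')       == 'A/B'
#   normpath('A/./B')      == 'A/B'
#   normpath('A/foo/../B') == 'A/B'
#   normpath('//A')        == '//A'   # exactly two leading slashes survive
#   normpath('///A')       == '/A'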


def abspath(path):
    """Return an absolute path."""
    if not isabs(path):
        if isinstance(path, bytes):
            cwd = os.getcwdb()
        else:
            cwd = os.getcwd()
        path = join(cwd, path)
    return normpath(path)
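
# Illustrative example (the working directory is hypothetical):
#   with cwd == '/home/guido':  abspath('bin/../lib') == '/home/guido/lib'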


# Return a canonical path (i.e. the absolute location of a file on the
# filesystem).

def realpath(filename):
    """Return the canonical path of the specified filename, eliminating any
symbolic links encountered in the path."""
    path, ok = _joinrealpath(filename[:0], filename, {})
    return abspath(path)

# Join two paths, normalizing and eliminating any symbolic links
# encountered in the second path.
def _joinrealpath(path, rest, seen):
    if isinstance(path, bytes):
        sep = b'/'
        curdir = b'.'
        pardir = b'..'
    else:
        sep = '/'
        curdir = '.'
        pardir = '..'

    if isabs(rest):
        rest = rest[1:]
        path = sep

    while rest:
        name, _, rest = rest.partition(sep)
        if not name or name == curdir:
            # current dir
            continue
        if name == pardir:
            # parent dir
            if path:
                path, name = split(path)
                if name == pardir:
                    path = join(path, pardir, pardir)
            else:
                path = pardir
            continue
        newpath = join(path, name)
        if not islink(newpath):
            path = newpath
            continue
        # Resolve the symbolic link
        if newpath in seen:
            # Already seen this path
            path = seen[newpath]
            if path is not None:
                # use cached value
                continue
            # The symlink is not resolved, so we must have a symlink loop.
            # Return already resolved part + rest of the path unchanged.
            return join(newpath, rest), False
        seen[newpath] = None # not resolved symlink
        path, ok = _joinrealpath(path, os.readlink(newpath), seen)
        if not ok:
            return join(path, rest), False
        seen[newpath] = path # resolved symlink

    return path, True
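
# Illustrative example for realpath() (filesystem-dependent; the symlink
# is hypothetical):
#   if /usr/tmp is a symlink to /var/tmp:
#       realpath('/usr/tmp/foo') == '/var/tmp/foo'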


supports_unicode_filenames = (sys.platform == 'darwin')

def relpath(path, start=None):
    """Return a relative version of a path"""

    if not path:
        raise ValueError("no path specified")

    if isinstance(path, bytes):
        curdir = b'.'
        sep = b'/'
        pardir = b'..'
    else:
        curdir = '.'
        sep = '/'
        pardir = '..'

    if start is None:
        start = curdir

    start_list = [x for x in abspath(start).split(sep) if x]
    path_list = [x for x in abspath(path).split(sep) if x]

    # Work out how much of the filepath is shared by start and path.
    i = len(commonprefix([start_list, path_list]))

    rel_list = [pardir] * (len(start_list)-i) + path_list[i:]
    if not rel_list:
        return curdir
    return join(*rel_list)
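
# Illustrative examples (both paths are made absolute first, so results
# for relative arguments depend on the working directory):
#   relpath('/a/b/c', '/a/d') == '../b/c'
#   relpath('/a/b', '/a/b')   == '.'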