diff options
author | Serhiy Storchaka <storchaka@gmail.com> | 2020-06-30 06:33:22 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-06-30 06:33:22 (GMT) |
commit | 694d31e714074176f0c324f95948b75dc768c091 (patch) | |
tree | b42f6011379b2b5693196f2b7dc1237b8c1c0622 /Lib/idlelib | |
parent | 038dd0f79dc89566b01ba66a5a018266b2917a19 (diff) | |
download | cpython-694d31e714074176f0c324f95948b75dc768c091.zip cpython-694d31e714074176f0c324f95948b75dc768c091.tar.gz cpython-694d31e714074176f0c324f95948b75dc768c091.tar.bz2 |
bpo-41158: IDLE: rewrite the code for handling file encoding (GH-21215)
Diffstat (limited to 'Lib/idlelib')
-rw-r--r-- | Lib/idlelib/iomenu.py | 193 |
1 file changed, 41 insertions, 152 deletions
diff --git a/Lib/idlelib/iomenu.py b/Lib/idlelib/iomenu.py index 7f3f656..7641d86 100644 --- a/Lib/idlelib/iomenu.py +++ b/Lib/idlelib/iomenu.py @@ -1,10 +1,8 @@ -import codecs -from codecs import BOM_UTF8 import os -import re import shlex import sys import tempfile +import tokenize import tkinter.filedialog as tkFileDialog import tkinter.messagebox as tkMessageBox @@ -20,49 +18,6 @@ else: errors = 'surrogateescape' -coding_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII) -blank_re = re.compile(r'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII) - -def coding_spec(data): - """Return the encoding declaration according to PEP 263. - - When checking encoded data, only the first two lines should be passed - in to avoid a UnicodeDecodeError if the rest of the data is not unicode. - The first two lines would contain the encoding specification. - - Raise a LookupError if the encoding is declared but unknown. - """ - if isinstance(data, bytes): - # This encoding might be wrong. However, the coding - # spec must be ASCII-only, so any non-ASCII characters - # around here will be ignored. Decoding to Latin-1 should - # never fail (except for memory outage) - lines = data.decode('iso-8859-1') - else: - lines = data - # consider only the first two lines - if '\n' in lines: - lst = lines.split('\n', 2)[:2] - elif '\r' in lines: - lst = lines.split('\r', 2)[:2] - else: - lst = [lines] - for line in lst: - match = coding_re.match(line) - if match is not None: - break - if not blank_re.match(line): - return None - else: - return None - name = match.group(1) - try: - codecs.lookup(name) - except LookupError: - # The standard encoding error does not indicate the encoding - raise LookupError("Unknown encoding: "+name) - return name - class IOBinding: # One instance per editor Window so methods know which to save, close. 
@@ -78,7 +33,7 @@ class IOBinding: self.save_as) self.__id_savecopy = self.text.bind("<<save-copy-of-window-as-file>>", self.save_a_copy) - self.fileencoding = None + self.fileencoding = 'utf-8' self.__id_print = self.text.bind("<<print-window>>", self.print_window) def close(self): @@ -165,34 +120,44 @@ class IOBinding: self.text.focus_set() return "break" - eol = r"(\r\n)|\n|\r" # \r\n (Windows), \n (UNIX), or \r (Mac) - eol_re = re.compile(eol) eol_convention = os.linesep # default def loadfile(self, filename): try: - # open the file in binary mode so that we can handle - # end-of-line convention ourselves. - with open(filename, 'rb') as f: - two_lines = f.readline() + f.readline() - f.seek(0) - bytes = f.read() - except OSError as msg: - tkMessageBox.showerror("I/O Error", str(msg), parent=self.text) + try: + with tokenize.open(filename) as f: + chars = f.read() + fileencoding = f.encoding + eol_convention = f.newlines + converted = False + except (UnicodeDecodeError, SyntaxError): + # Wait for the editor window to appear + self.editwin.text.update() + enc = askstring( + "Specify file encoding", + "The file's encoding is invalid for Python 3.x.\n" + "IDLE will convert it to UTF-8.\n" + "What is the current encoding of the file?", + initialvalue='utf-8', + parent=self.editwin.text) + with open(filename, encoding=enc) as f: + chars = f.read() + fileencoding = f.encoding + eol_convention = f.newlines + converted = True + except OSError as err: + tkMessageBox.showerror("I/O Error", str(err), parent=self.text) return False - chars, converted = self._decode(two_lines, bytes) - if chars is None: + except UnicodeDecodeError: tkMessageBox.showerror("Decoding Error", "File %s\nFailed to Decode" % filename, parent=self.text) return False - # We now convert all end-of-lines to '\n's - firsteol = self.eol_re.search(chars) - if firsteol: - self.eol_convention = firsteol.group(0) - chars = self.eol_re.sub(r"\n", chars) + self.text.delete("1.0", "end") self.set_filename(None) 
+ self.fileencoding = fileencoding + self.eol_convention = eol_convention self.text.insert("1.0", chars) self.reset_undo() self.set_filename(filename) @@ -205,74 +170,6 @@ class IOBinding: self.updaterecentfileslist(filename) return True - def _decode(self, two_lines, bytes): - "Create a Unicode string." - chars = None - # Check presence of a UTF-8 signature first - if bytes.startswith(BOM_UTF8): - try: - chars = bytes[3:].decode("utf-8") - except UnicodeDecodeError: - # has UTF-8 signature, but fails to decode... - return None, False - else: - # Indicates that this file originally had a BOM - self.fileencoding = 'BOM' - return chars, False - # Next look for coding specification - try: - enc = coding_spec(two_lines) - except LookupError as name: - tkMessageBox.showerror( - title="Error loading the file", - message="The encoding '%s' is not known to this Python "\ - "installation. The file may not display correctly" % name, - parent = self.text) - enc = None - except UnicodeDecodeError: - return None, False - if enc: - try: - chars = str(bytes, enc) - self.fileencoding = enc - return chars, False - except UnicodeDecodeError: - pass - # Try ascii: - try: - chars = str(bytes, 'ascii') - self.fileencoding = None - return chars, False - except UnicodeDecodeError: - pass - # Try utf-8: - try: - chars = str(bytes, 'utf-8') - self.fileencoding = 'utf-8' - return chars, False - except UnicodeDecodeError: - pass - # Finally, try the locale's encoding. 
This is deprecated; - # the user should declare a non-ASCII encoding - try: - # Wait for the editor window to appear - self.editwin.text.update() - enc = askstring( - "Specify file encoding", - "The file's encoding is invalid for Python 3.x.\n" - "IDLE will convert it to UTF-8.\n" - "What is the current encoding of the file?", - initialvalue = encoding, - parent = self.editwin.text) - - if enc: - chars = str(bytes, enc) - self.fileencoding = None - return chars, True - except (UnicodeDecodeError, LookupError): - pass - return None, False # None on failure - def maybesave(self): if self.get_saved(): return "yes" @@ -360,38 +257,30 @@ class IOBinding: # text to us. Don't try to guess further. return chars # Preserve a BOM that might have been present on opening - if self.fileencoding == 'BOM': - return BOM_UTF8 + chars.encode("utf-8") + if self.fileencoding == 'utf-8-sig': + return chars.encode('utf-8-sig') # See whether there is anything non-ASCII in it. # If not, no need to figure out the encoding. 
try: return chars.encode('ascii') - except UnicodeError: + except UnicodeEncodeError: pass # Check if there is an encoding declared try: - # a string, let coding_spec slice it to the first two lines - enc = coding_spec(chars) - failed = None - except LookupError as msg: - failed = msg - enc = None - else: - if not enc: - # PEP 3120: default source encoding is UTF-8 - enc = 'utf-8' - if enc: - try: - return chars.encode(enc) - except UnicodeError: - failed = "Invalid encoding '%s'" % enc + encoded = chars.encode('ascii', 'replace') + enc, _ = tokenize.detect_encoding(io.BytesIO(encoded).readline) + return chars.encode(enc) + except SyntaxError as err: + failed = str(err) + except UnicodeEncodeError: + failed = "Invalid encoding '%s'" % enc tkMessageBox.showerror( "I/O Error", "%s.\nSaving as UTF-8" % failed, - parent = self.text) + parent=self.text) # Fallback: save as UTF-8, with BOM - ignoring the incorrect # declared encoding - return BOM_UTF8 + chars.encode("utf-8") + return chars.encode('utf-8-sig') def print_window(self, event): confirm = tkMessageBox.askokcancel( |