summary | refs | log | tree | commit | diff | stats
path: root/Lib/idlelib
diff options
context:
space:
mode:
author: Serhiy Storchaka <storchaka@gmail.com> 2020-06-30 06:33:22 (GMT)
committer: GitHub <noreply@github.com> 2020-06-30 06:33:22 (GMT)
commit: 694d31e714074176f0c324f95948b75dc768c091 (patch)
tree: b42f6011379b2b5693196f2b7dc1237b8c1c0622 /Lib/idlelib
parent: 038dd0f79dc89566b01ba66a5a018266b2917a19 (diff)
download: cpython-694d31e714074176f0c324f95948b75dc768c091.zip
cpython-694d31e714074176f0c324f95948b75dc768c091.tar.gz
cpython-694d31e714074176f0c324f95948b75dc768c091.tar.bz2
bpo-41158: IDLE: rewrite the code for handling file encoding (GH-21215)
Diffstat (limited to 'Lib/idlelib')
-rw-r--r-- Lib/idlelib/iomenu.py 193
1 files changed, 41 insertions, 152 deletions
diff --git a/Lib/idlelib/iomenu.py b/Lib/idlelib/iomenu.py
index 7f3f656..7641d86 100644
--- a/Lib/idlelib/iomenu.py
+++ b/Lib/idlelib/iomenu.py
@@ -1,10 +1,8 @@
-import codecs
-from codecs import BOM_UTF8
import os
-import re
import shlex
import sys
import tempfile
+import tokenize
import tkinter.filedialog as tkFileDialog
import tkinter.messagebox as tkMessageBox
@@ -20,49 +18,6 @@ else:
errors = 'surrogateescape'
-coding_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
-blank_re = re.compile(r'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
-
-def coding_spec(data):
- """Return the encoding declaration according to PEP 263.
-
- When checking encoded data, only the first two lines should be passed
- in to avoid a UnicodeDecodeError if the rest of the data is not unicode.
- The first two lines would contain the encoding specification.
-
- Raise a LookupError if the encoding is declared but unknown.
- """
- if isinstance(data, bytes):
- # This encoding might be wrong. However, the coding
- # spec must be ASCII-only, so any non-ASCII characters
- # around here will be ignored. Decoding to Latin-1 should
- # never fail (except for memory outage)
- lines = data.decode('iso-8859-1')
- else:
- lines = data
- # consider only the first two lines
- if '\n' in lines:
- lst = lines.split('\n', 2)[:2]
- elif '\r' in lines:
- lst = lines.split('\r', 2)[:2]
- else:
- lst = [lines]
- for line in lst:
- match = coding_re.match(line)
- if match is not None:
- break
- if not blank_re.match(line):
- return None
- else:
- return None
- name = match.group(1)
- try:
- codecs.lookup(name)
- except LookupError:
- # The standard encoding error does not indicate the encoding
- raise LookupError("Unknown encoding: "+name)
- return name
-
class IOBinding:
# One instance per editor Window so methods know which to save, close.
@@ -78,7 +33,7 @@ class IOBinding:
self.save_as)
self.__id_savecopy = self.text.bind("<<save-copy-of-window-as-file>>",
self.save_a_copy)
- self.fileencoding = None
+ self.fileencoding = 'utf-8'
self.__id_print = self.text.bind("<<print-window>>", self.print_window)
def close(self):
@@ -165,34 +120,44 @@ class IOBinding:
self.text.focus_set()
return "break"
- eol = r"(\r\n)|\n|\r" # \r\n (Windows), \n (UNIX), or \r (Mac)
- eol_re = re.compile(eol)
eol_convention = os.linesep # default
def loadfile(self, filename):
try:
- # open the file in binary mode so that we can handle
- # end-of-line convention ourselves.
- with open(filename, 'rb') as f:
- two_lines = f.readline() + f.readline()
- f.seek(0)
- bytes = f.read()
- except OSError as msg:
- tkMessageBox.showerror("I/O Error", str(msg), parent=self.text)
+ try:
+ with tokenize.open(filename) as f:
+ chars = f.read()
+ fileencoding = f.encoding
+ eol_convention = f.newlines
+ converted = False
+ except (UnicodeDecodeError, SyntaxError):
+ # Wait for the editor window to appear
+ self.editwin.text.update()
+ enc = askstring(
+ "Specify file encoding",
+ "The file's encoding is invalid for Python 3.x.\n"
+ "IDLE will convert it to UTF-8.\n"
+ "What is the current encoding of the file?",
+ initialvalue='utf-8',
+ parent=self.editwin.text)
+ with open(filename, encoding=enc) as f:
+ chars = f.read()
+ fileencoding = f.encoding
+ eol_convention = f.newlines
+ converted = True
+ except OSError as err:
+ tkMessageBox.showerror("I/O Error", str(err), parent=self.text)
return False
- chars, converted = self._decode(two_lines, bytes)
- if chars is None:
+ except UnicodeDecodeError:
tkMessageBox.showerror("Decoding Error",
"File %s\nFailed to Decode" % filename,
parent=self.text)
return False
- # We now convert all end-of-lines to '\n's
- firsteol = self.eol_re.search(chars)
- if firsteol:
- self.eol_convention = firsteol.group(0)
- chars = self.eol_re.sub(r"\n", chars)
+
self.text.delete("1.0", "end")
self.set_filename(None)
+ self.fileencoding = fileencoding
+ self.eol_convention = eol_convention
self.text.insert("1.0", chars)
self.reset_undo()
self.set_filename(filename)
@@ -205,74 +170,6 @@ class IOBinding:
self.updaterecentfileslist(filename)
return True
- def _decode(self, two_lines, bytes):
- "Create a Unicode string."
- chars = None
- # Check presence of a UTF-8 signature first
- if bytes.startswith(BOM_UTF8):
- try:
- chars = bytes[3:].decode("utf-8")
- except UnicodeDecodeError:
- # has UTF-8 signature, but fails to decode...
- return None, False
- else:
- # Indicates that this file originally had a BOM
- self.fileencoding = 'BOM'
- return chars, False
- # Next look for coding specification
- try:
- enc = coding_spec(two_lines)
- except LookupError as name:
- tkMessageBox.showerror(
- title="Error loading the file",
- message="The encoding '%s' is not known to this Python "\
- "installation. The file may not display correctly" % name,
- parent = self.text)
- enc = None
- except UnicodeDecodeError:
- return None, False
- if enc:
- try:
- chars = str(bytes, enc)
- self.fileencoding = enc
- return chars, False
- except UnicodeDecodeError:
- pass
- # Try ascii:
- try:
- chars = str(bytes, 'ascii')
- self.fileencoding = None
- return chars, False
- except UnicodeDecodeError:
- pass
- # Try utf-8:
- try:
- chars = str(bytes, 'utf-8')
- self.fileencoding = 'utf-8'
- return chars, False
- except UnicodeDecodeError:
- pass
- # Finally, try the locale's encoding. This is deprecated;
- # the user should declare a non-ASCII encoding
- try:
- # Wait for the editor window to appear
- self.editwin.text.update()
- enc = askstring(
- "Specify file encoding",
- "The file's encoding is invalid for Python 3.x.\n"
- "IDLE will convert it to UTF-8.\n"
- "What is the current encoding of the file?",
- initialvalue = encoding,
- parent = self.editwin.text)
-
- if enc:
- chars = str(bytes, enc)
- self.fileencoding = None
- return chars, True
- except (UnicodeDecodeError, LookupError):
- pass
- return None, False # None on failure
-
def maybesave(self):
if self.get_saved():
return "yes"
@@ -360,38 +257,30 @@ class IOBinding:
# text to us. Don't try to guess further.
return chars
# Preserve a BOM that might have been present on opening
- if self.fileencoding == 'BOM':
- return BOM_UTF8 + chars.encode("utf-8")
+ if self.fileencoding == 'utf-8-sig':
+ return chars.encode('utf-8-sig')
# See whether there is anything non-ASCII in it.
# If not, no need to figure out the encoding.
try:
return chars.encode('ascii')
- except UnicodeError:
+ except UnicodeEncodeError:
pass
# Check if there is an encoding declared
try:
- # a string, let coding_spec slice it to the first two lines
- enc = coding_spec(chars)
- failed = None
- except LookupError as msg:
- failed = msg
- enc = None
- else:
- if not enc:
- # PEP 3120: default source encoding is UTF-8
- enc = 'utf-8'
- if enc:
- try:
- return chars.encode(enc)
- except UnicodeError:
- failed = "Invalid encoding '%s'" % enc
+ encoded = chars.encode('ascii', 'replace')
+ enc, _ = tokenize.detect_encoding(io.BytesIO(encoded).readline)
+ return chars.encode(enc)
+ except SyntaxError as err:
+ failed = str(err)
+ except UnicodeEncodeError:
+ failed = "Invalid encoding '%s'" % enc
tkMessageBox.showerror(
"I/O Error",
"%s.\nSaving as UTF-8" % failed,
- parent = self.text)
+ parent=self.text)
# Fallback: save as UTF-8, with BOM - ignoring the incorrect
# declared encoding
- return BOM_UTF8 + chars.encode("utf-8")
+ return chars.encode('utf-8-sig')
def print_window(self, event):
confirm = tkMessageBox.askokcancel(