diff options
author | Benjamin Peterson <benjamin@python.org> | 2009-10-09 21:53:27 (GMT) |
---|---|---|
committer | Benjamin Peterson <benjamin@python.org> | 2009-10-09 21:53:27 (GMT) |
commit | 0c7f9c96f5fbe8c445ccd3e2ee315f2e2f45a3ca (patch) | |
tree | 0deb207baf2fc47748cb82897cbb5bb445a26dba /Lib/tokenize.py | |
parent | ae2fa6fad26cbbef8ad408b4f5188d2a7a9f8c89 (diff) | |
download | cpython-0c7f9c96f5fbe8c445ccd3e2ee315f2e2f45a3ca.zip cpython-0c7f9c96f5fbe8c445ccd3e2ee315f2e2f45a3ca.tar.gz cpython-0c7f9c96f5fbe8c445ccd3e2ee315f2e2f45a3ca.tar.bz2 |
Merged revisions 75299 via svnmerge from
svn+ssh://pythondev@svn.python.org/python/branches/py3k
........
r75299 | benjamin.peterson | 2009-10-09 16:43:09 -0500 (Fri, 09 Oct 2009) | 1 line
normalize latin-1 and utf-8 variant encodings like the builtin tokenizer does
........
Diffstat (limited to 'Lib/tokenize.py')
-rw-r--r-- | Lib/tokenize.py | 13 |
1 files changed, 12 insertions, 1 deletions
diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index f83bda5..fb58c6b 100644
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -279,6 +279,17 @@ def untokenize(iterable):
     return out
 
 
+def _get_normal_name(orig_enc):
+    """Imitates get_normal_name in tokenizer.c."""
+    # Only care about the first 12 characters.
+    enc = orig_enc[:12].lower().replace("_", "-")
+    if enc == "utf-8" or enc.startswith("utf-8-"):
+        return "utf-8"
+    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
+       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
+        return "iso-8859-1"
+    return orig_enc
+
 def detect_encoding(readline):
     """
     The detect_encoding() function is used to detect the encoding that should
@@ -313,7 +324,7 @@ def detect_encoding(readline):
         matches = cookie_re.findall(line_string)
         if not matches:
             return None
-        encoding = matches[0]
+        encoding = _get_normal_name(matches[0])
         try:
             codec = lookup(encoding)
         except LookupError: