Diffstat (limited to 'Lib/lib2to3/pgen2/tokenize.py')
-rw-r--r-- | Lib/lib2to3/pgen2/tokenize.py | 15
1 file changed, 13 insertions, 2 deletions
diff --git a/Lib/lib2to3/pgen2/tokenize.py b/Lib/lib2to3/pgen2/tokenize.py
index 799566b..4585ca3 100644
--- a/Lib/lib2to3/pgen2/tokenize.py
+++ b/Lib/lib2to3/pgen2/tokenize.py
@@ -231,6 +231,17 @@ class Untokenizer:
 
 cookie_re = re.compile("coding[:=]\s*([-\w.]+)")
 
+def _get_normal_name(orig_enc):
+    """Imitates get_normal_name in tokenizer.c."""
+    # Only care about the first 12 characters.
+    enc = orig_enc[:12].lower().replace("_", "-")
+    if enc == "utf-8" or enc.startswith("utf-8-"):
+        return "utf-8"
+    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
+       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
+        return "iso-8859-1"
+    return orig_enc
+
 def detect_encoding(readline):
     """
     The detect_encoding() function is used to detect the encoding that should
@@ -265,7 +276,7 @@ def detect_encoding(readline):
         matches = cookie_re.findall(line_string)
         if not matches:
             return None
-        encoding = matches[0]
+        encoding = _get_normal_name(matches[0])
         try:
             codec = lookup(encoding)
         except LookupError:
@@ -375,7 +386,7 @@ def generate_tokens(readline):
             column = 0
             while pos < max:                   # measure leading whitespace
                 if line[pos] == ' ': column = column + 1
-                elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize
+                elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
                 elif line[pos] == '\f': column = 0
                 else: break
                 pos = pos + 1
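
The new _get_normal_name() helper imitates get_normal_name() in CPython's
tokenizer.c: it lowercases the first twelve characters of the coding cookie,
maps underscores to hyphens, and collapses the recognized UTF-8 and Latin-1
spellings to one canonical name each, leaving everything else untouched. A
minimal, self-contained sketch of the effect, reusing the helper exactly as
added above (the sample cookie spellings are illustrative, not from the patch):

def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc

# UTF-8 spellings collapse to "utf-8" ...
assert _get_normal_name("UTF-8") == "utf-8"
assert _get_normal_name("utf_8") == "utf-8"
assert _get_normal_name("utf-8-sig") == "utf-8"
# ... Latin-1 spellings collapse to "iso-8859-1" ...
assert _get_normal_name("Latin-1") == "iso-8859-1"
assert _get_normal_name("ISO_8859_1") == "iso-8859-1"
# ... and unrecognized names pass through unchanged.
assert _get_normal_name("euc-jp") == "euc-jp"

Because the second hunk routes the matched cookie through this helper,
detect_encoding() now returns the normalized name rather than the literal
cookie text. A usage sketch, assuming a lib2to3 with this patch applied:

import io
from lib2to3.pgen2.tokenize import detect_encoding

source = b"# -*- coding: Latin-1 -*-\nx = 1\n"
encoding, lines = detect_encoding(io.BytesIO(source).readline)
assert encoding == "iso-8859-1"   # previously the literal "Latin-1"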
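
The last hunk is a Python 3 compatibility fix in generate_tokens(): under true
division (Python 3, or Python 2 with "from __future__ import division"),
column/tabsize produces a float, so the computed tab stop is the wrong type
and, in general, the wrong value; floor division restores the integer
arithmetic the tokenizer relies on. A small sketch of the rule with the
patched operator (next_tab_stop is a hypothetical name; tabsize is 8,
matching the tokenizer's module-level default):

tabsize = 8

def next_tab_stop(column):
    # The patched expression: a tab advances the cursor to the next
    # multiple of tabsize.
    return (column // tabsize + 1) * tabsize

assert next_tab_stop(0) == 8
assert next_tab_stop(7) == 8
assert next_tab_stop(8) == 16
# With "/" instead of "//", (7/8 + 1)*8 evaluates to 15.0 under true
# division: a float, and not a tab stop at all.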