Diffstat (limited to 'Lib/test/test_tokenize.py')
| Mode       | Path                      | Lines changed |
| -rw-r--r-- | Lib/test/test_tokenize.py | 589           |

1 file changed, 460 insertions, 129 deletions
diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index 489f68f..f9652ce 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -1,13 +1,12 @@ doctests = """ Tests for the tokenize module. - >>> import glob, random, sys - The tests can be really simple. Given a small fragment of source code, print out a table with tokens. The ENDMARK is omitted for brevity. >>> dump_tokens("1 + 1") + ENCODING 'utf-8' (0, 0) (0, 0) NUMBER '1' (1, 0) (1, 1) OP '+' (1, 2) (1, 3) NUMBER '1' (1, 4) (1, 5) @@ -15,6 +14,7 @@ brevity. >>> dump_tokens("if False:\\n" ... " # NL\\n" ... " True = False # NEWLINE\\n") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'if' (1, 0) (1, 2) NAME 'False' (1, 3) (1, 8) OP ':' (1, 8) (1, 9) @@ -34,32 +34,16 @@ brevity. ... x += 2 ... x += 5 ... \""" - - >>> for tok in generate_tokens(StringIO(indent_error_file).readline): pass + >>> readline = BytesIO(indent_error_file.encode('utf-8')).readline + >>> for tok in tokenize(readline): pass Traceback (most recent call last): ... IndentationError: unindent does not match any outer indentation level -Test roundtrip for `untokenize`. `f` is an open file or a string. The source -code in f is tokenized, converted back to source code via tokenize.untokenize(), -and tokenized again from the latter. The test fails if the second tokenization -doesn't match the first. - - >>> def roundtrip(f): - ... if isinstance(f, str): f = StringIO(f) - ... token_list = list(generate_tokens(f.readline)) - ... f.close() - ... tokens1 = [tok[:2] for tok in token_list] - ... new_text = untokenize(tokens1) - ... readline = iter(new_text.splitlines(1)).next - ... tokens2 = [tok[:2] for tok in generate_tokens(readline)] - ... return tokens1 == tokens2 - ... - There are some standard formatting practices that are easy to get right. >>> roundtrip("if x == 1:\\n" - ... " print x\\n") + ... " print(x)\\n") True >>> roundtrip("# This is a comment\\n# This also") @@ -68,26 +52,26 @@ There are some standard formatting practices that are easy to get right. Some people use different formatting conventions, which makes untokenize a little trickier. Note that this test involves trailing whitespace after the colon. Note that we use hex escapes to make the -two trailing blanks apperant in the expected output. +two trailing blanks apparent in the expected output. >>> roundtrip("if x == 1 : \\n" - ... " print x\\n") + ... " print(x)\\n") True - >>> f = test_support.findfile("tokenize_tests" + os.extsep + "txt") - >>> roundtrip(open(f)) + >>> f = support.findfile("tokenize_tests.txt") + >>> roundtrip(open(f, 'rb')) True >>> roundtrip("if x == 1:\\n" ... " # A comment by itself.\\n" - ... " print x # Comment here, too.\\n" + ... " print(x) # Comment here, too.\\n" ... " # Another comment.\\n" ... "after_if = True\\n") True >>> roundtrip("if (x # The comments need to go in the right place\\n" ... " == 1):\\n" - ... " print 'x==1'\\n") + ... " print('x==1')\\n") True >>> roundtrip("class Test: # A comment here\\n" @@ -102,8 +86,8 @@ Some error-handling code >>> roundtrip("try: import somemodule\\n" ... "except ImportError: # comment\\n" - ... " print 'Can not import' # comment2\\n" - ... "else: print 'Loaded'\\n") + ... " print('Can not import' # comment2\\n)" + ... 
"else: print('Loaded')\\n") True Balancing continuation @@ -123,27 +107,33 @@ Balancing continuation Ordinary integers and binary operators >>> dump_tokens("0xff <= 255") + ENCODING 'utf-8' (0, 0) (0, 0) NUMBER '0xff' (1, 0) (1, 4) OP '<=' (1, 5) (1, 7) NUMBER '255' (1, 8) (1, 11) >>> dump_tokens("0b10 <= 255") + ENCODING 'utf-8' (0, 0) (0, 0) NUMBER '0b10' (1, 0) (1, 4) OP '<=' (1, 5) (1, 7) NUMBER '255' (1, 8) (1, 11) - >>> dump_tokens("0o123 <= 0123") + >>> dump_tokens("0o123 <= 0O123") + ENCODING 'utf-8' (0, 0) (0, 0) NUMBER '0o123' (1, 0) (1, 5) OP '<=' (1, 6) (1, 8) - NUMBER '0123' (1, 9) (1, 13) - >>> dump_tokens("01234567 > ~0x15") - NUMBER '01234567' (1, 0) (1, 8) - OP '>' (1, 9) (1, 10) - OP '~' (1, 11) (1, 12) - NUMBER '0x15' (1, 12) (1, 16) - >>> dump_tokens("2134568 != 01231515") + NUMBER '0O123' (1, 9) (1, 14) + >>> dump_tokens("1234567 > ~0x15") + ENCODING 'utf-8' (0, 0) (0, 0) + NUMBER '1234567' (1, 0) (1, 7) + OP '>' (1, 8) (1, 9) + OP '~' (1, 10) (1, 11) + NUMBER '0x15' (1, 11) (1, 15) + >>> dump_tokens("2134568 != 1231515") + ENCODING 'utf-8' (0, 0) (0, 0) NUMBER '2134568' (1, 0) (1, 7) OP '!=' (1, 8) (1, 10) - NUMBER '01231515' (1, 11) (1, 19) - >>> dump_tokens("(-124561-1) & 0200000000") + NUMBER '1231515' (1, 11) (1, 18) + >>> dump_tokens("(-124561-1) & 200000000") + ENCODING 'utf-8' (0, 0) (0, 0) OP '(' (1, 0) (1, 1) OP '-' (1, 1) (1, 2) NUMBER '124561' (1, 2) (1, 8) @@ -151,17 +141,20 @@ Ordinary integers and binary operators NUMBER '1' (1, 9) (1, 10) OP ')' (1, 10) (1, 11) OP '&' (1, 12) (1, 13) - NUMBER '0200000000' (1, 14) (1, 24) + NUMBER '200000000' (1, 14) (1, 23) >>> dump_tokens("0xdeadbeef != -1") + ENCODING 'utf-8' (0, 0) (0, 0) NUMBER '0xdeadbeef' (1, 0) (1, 10) OP '!=' (1, 11) (1, 13) OP '-' (1, 14) (1, 15) NUMBER '1' (1, 15) (1, 16) - >>> dump_tokens("0xdeadc0de & 012345") + >>> dump_tokens("0xdeadc0de & 12345") + ENCODING 'utf-8' (0, 0) (0, 0) NUMBER '0xdeadc0de' (1, 0) (1, 10) OP '&' (1, 11) (1, 12) - NUMBER '012345' (1, 13) (1, 19) + NUMBER '12345' (1, 13) (1, 18) >>> dump_tokens("0xFF & 0x15 | 1234") + ENCODING 'utf-8' (0, 0) (0, 0) NUMBER '0xFF' (1, 0) (1, 4) OP '&' (1, 5) (1, 6) NUMBER '0x15' (1, 7) (1, 11) @@ -170,53 +163,64 @@ Ordinary integers and binary operators Long integers - >>> dump_tokens("x = 0L") + >>> dump_tokens("x = 0") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) - NUMBER '0L' (1, 4) (1, 6) + NUMBER '0' (1, 4) (1, 5) >>> dump_tokens("x = 0xfffffffffff") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) NUMBER '0xffffffffff (1, 4) (1, 17) - >>> dump_tokens("x = 123141242151251616110l") + >>> dump_tokens("x = 123141242151251616110") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) - NUMBER '123141242151 (1, 4) (1, 26) - >>> dump_tokens("x = -15921590215012591L") + NUMBER '123141242151 (1, 4) (1, 25) + >>> dump_tokens("x = -15921590215012591") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) OP '-' (1, 4) (1, 5) - NUMBER '159215902150 (1, 5) (1, 23) + NUMBER '159215902150 (1, 5) (1, 22) Floating point numbers >>> dump_tokens("x = 3.14159") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) NUMBER '3.14159' (1, 4) (1, 11) >>> dump_tokens("x = 314159.") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) NUMBER '314159.' 
(1, 4) (1, 11) >>> dump_tokens("x = .314159") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) NUMBER '.314159' (1, 4) (1, 11) >>> dump_tokens("x = 3e14159") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) NUMBER '3e14159' (1, 4) (1, 11) >>> dump_tokens("x = 3E123") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) NUMBER '3E123' (1, 4) (1, 9) >>> dump_tokens("x+y = 3e-1230") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '+' (1, 1) (1, 2) NAME 'y' (1, 2) (1, 3) OP '=' (1, 4) (1, 5) NUMBER '3e-1230' (1, 6) (1, 13) >>> dump_tokens("x = 3.14e159") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) NUMBER '3.14e159' (1, 4) (1, 12) @@ -224,6 +228,7 @@ Floating point numbers String literals >>> dump_tokens("x = ''; y = \\\"\\\"") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) STRING "''" (1, 4) (1, 6) @@ -232,6 +237,7 @@ String literals OP '=' (1, 10) (1, 11) STRING '""' (1, 12) (1, 14) >>> dump_tokens("x = '\\\"'; y = \\\"'\\\"") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) STRING '\\'"\\'' (1, 4) (1, 7) @@ -240,72 +246,53 @@ String literals OP '=' (1, 11) (1, 12) STRING '"\\'"' (1, 13) (1, 16) >>> dump_tokens("x = \\\"doesn't \\\"shrink\\\", does it\\\"") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) STRING '"doesn\\'t "' (1, 4) (1, 14) NAME 'shrink' (1, 14) (1, 20) STRING '", does it"' (1, 20) (1, 31) - >>> dump_tokens("x = u'abc' + U'ABC'") + >>> dump_tokens("x = 'abc' + 'ABC'") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) - STRING "u'abc'" (1, 4) (1, 10) - OP '+' (1, 11) (1, 12) - STRING "U'ABC'" (1, 13) (1, 19) - >>> dump_tokens('y = u"ABC" + U"ABC"') + STRING "'abc'" (1, 4) (1, 9) + OP '+' (1, 10) (1, 11) + STRING "'ABC'" (1, 12) (1, 17) + >>> dump_tokens('y = "ABC" + "ABC"') + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'y' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) - STRING 'u"ABC"' (1, 4) (1, 10) - OP '+' (1, 11) (1, 12) - STRING 'U"ABC"' (1, 13) (1, 19) - >>> dump_tokens("x = ur'abc' + Ur'ABC' + uR'ABC' + UR'ABC'") + STRING '"ABC"' (1, 4) (1, 9) + OP '+' (1, 10) (1, 11) + STRING '"ABC"' (1, 12) (1, 17) + >>> dump_tokens("x = r'abc' + r'ABC' + R'ABC' + R'ABC'") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) - STRING "ur'abc'" (1, 4) (1, 11) - OP '+' (1, 12) (1, 13) - STRING "Ur'ABC'" (1, 14) (1, 21) - OP '+' (1, 22) (1, 23) - STRING "uR'ABC'" (1, 24) (1, 31) - OP '+' (1, 32) (1, 33) - STRING "UR'ABC'" (1, 34) (1, 41) - >>> dump_tokens('y = ur"abc" + Ur"ABC" + uR"ABC" + UR"ABC"') + STRING "r'abc'" (1, 4) (1, 10) + OP '+' (1, 11) (1, 12) + STRING "r'ABC'" (1, 13) (1, 19) + OP '+' (1, 20) (1, 21) + STRING "R'ABC'" (1, 22) (1, 28) + OP '+' (1, 29) (1, 30) + STRING "R'ABC'" (1, 31) (1, 37) + >>> dump_tokens('y = r"abc" + r"ABC" + R"ABC" + R"ABC"') + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'y' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) - STRING 'ur"abc"' (1, 4) (1, 11) - OP '+' (1, 12) (1, 13) - STRING 'Ur"ABC"' (1, 14) (1, 21) - OP '+' (1, 22) (1, 23) - STRING 'uR"ABC"' (1, 24) (1, 31) - OP '+' (1, 32) (1, 33) - STRING 'UR"ABC"' (1, 34) (1, 41) - - >>> dump_tokens("b'abc' + B'abc'") - STRING "b'abc'" (1, 0) (1, 6) - OP '+' (1, 7) (1, 8) - STRING "B'abc'" (1, 9) (1, 15) - >>> dump_tokens('b"abc" + B"abc"') - STRING 'b"abc"' (1, 0) (1, 6) - OP '+' (1, 7) (1, 8) - STRING 'B"abc"' (1, 9) (1, 15) - >>> dump_tokens("br'abc' + bR'abc' + Br'abc' + 
BR'abc'") - STRING "br'abc'" (1, 0) (1, 7) - OP '+' (1, 8) (1, 9) - STRING "bR'abc'" (1, 10) (1, 17) - OP '+' (1, 18) (1, 19) - STRING "Br'abc'" (1, 20) (1, 27) - OP '+' (1, 28) (1, 29) - STRING "BR'abc'" (1, 30) (1, 37) - >>> dump_tokens('br"abc" + bR"abc" + Br"abc" + BR"abc"') - STRING 'br"abc"' (1, 0) (1, 7) - OP '+' (1, 8) (1, 9) - STRING 'bR"abc"' (1, 10) (1, 17) - OP '+' (1, 18) (1, 19) - STRING 'Br"abc"' (1, 20) (1, 27) - OP '+' (1, 28) (1, 29) - STRING 'BR"abc"' (1, 30) (1, 37) + STRING 'r"abc"' (1, 4) (1, 10) + OP '+' (1, 11) (1, 12) + STRING 'r"ABC"' (1, 13) (1, 19) + OP '+' (1, 20) (1, 21) + STRING 'R"ABC"' (1, 22) (1, 28) + OP '+' (1, 29) (1, 30) + STRING 'R"ABC"' (1, 31) (1, 37) Operators >>> dump_tokens("def d22(a, b, c=2, d=2, *k): pass") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'def' (1, 0) (1, 3) NAME 'd22' (1, 4) (1, 7) OP '(' (1, 7) (1, 8) @@ -327,6 +314,7 @@ Operators OP ':' (1, 27) (1, 28) NAME 'pass' (1, 29) (1, 33) >>> dump_tokens("def d01v_(a=1, *k, **w): pass") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'def' (1, 0) (1, 3) NAME 'd01v_' (1, 4) (1, 9) OP '(' (1, 9) (1, 10) @@ -347,6 +335,7 @@ Comparison >>> dump_tokens("if 1 < 1 > 1 == 1 >= 5 <= 0x15 <= 0x12 != " + ... "1 and 5 in 1 not in 1 is 1 or 5 is not 1: pass") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'if' (1, 0) (1, 2) NUMBER '1' (1, 3) (1, 4) OP '<' (1, 5) (1, 6) @@ -383,6 +372,7 @@ Comparison Shift >>> dump_tokens("x = 1 << 1 >> 5") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) NUMBER '1' (1, 4) (1, 5) @@ -393,7 +383,8 @@ Shift Additive - >>> dump_tokens("x = 1 - y + 15 - 01 + 0x124 + z + a[5]") + >>> dump_tokens("x = 1 - y + 15 - 1 + 0x124 + z + a[5]") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) NUMBER '1' (1, 4) (1, 5) @@ -402,20 +393,21 @@ Additive OP '+' (1, 10) (1, 11) NUMBER '15' (1, 12) (1, 14) OP '-' (1, 15) (1, 16) - NUMBER '01' (1, 17) (1, 19) - OP '+' (1, 20) (1, 21) - NUMBER '0x124' (1, 22) (1, 27) - OP '+' (1, 28) (1, 29) - NAME 'z' (1, 30) (1, 31) - OP '+' (1, 32) (1, 33) - NAME 'a' (1, 34) (1, 35) - OP '[' (1, 35) (1, 36) - NUMBER '5' (1, 36) (1, 37) - OP ']' (1, 37) (1, 38) + NUMBER '1' (1, 17) (1, 18) + OP '+' (1, 19) (1, 20) + NUMBER '0x124' (1, 21) (1, 26) + OP '+' (1, 27) (1, 28) + NAME 'z' (1, 29) (1, 30) + OP '+' (1, 31) (1, 32) + NAME 'a' (1, 33) (1, 34) + OP '[' (1, 34) (1, 35) + NUMBER '5' (1, 35) (1, 36) + OP ']' (1, 36) (1, 37) Multiplicative >>> dump_tokens("x = 1//1*1/5*12%0x12") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) NUMBER '1' (1, 4) (1, 5) @@ -433,6 +425,7 @@ Multiplicative Unary >>> dump_tokens("~1 ^ 1 & 1 |1 ^ -1") + ENCODING 'utf-8' (0, 0) (0, 0) OP '~' (1, 0) (1, 1) NUMBER '1' (1, 1) (1, 2) OP '^' (1, 3) (1, 4) @@ -445,6 +438,7 @@ Unary OP '-' (1, 16) (1, 17) NUMBER '1' (1, 17) (1, 18) >>> dump_tokens("-1*1/1+1*1//1 - ---1**1") + ENCODING 'utf-8' (0, 0) (0, 0) OP '-' (1, 0) (1, 1) NUMBER '1' (1, 1) (1, 2) OP '*' (1, 2) (1, 3) @@ -468,6 +462,7 @@ Unary Selector >>> dump_tokens("import sys, time\\nx = sys.modules['time'].time()") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'import' (1, 0) (1, 6) NAME 'sys' (1, 7) (1, 10) OP ',' (1, 10) (1, 11) @@ -489,6 +484,7 @@ Selector Methods >>> dump_tokens("@staticmethod\\ndef foo(x,y): pass") + ENCODING 'utf-8' (0, 0) (0, 0) OP '@' (1, 0) (1, 1) NAME 'staticmethod (1, 1) (1, 13) NEWLINE '\\n' (1, 13) (1, 14) @@ -520,22 +516,27 @@ Two string literals on the same line Test roundtrip on random python modules. 
pass the '-ucpu' option to process the full directory. - >>> + >>> import random >>> tempdir = os.path.dirname(f) or os.curdir >>> testfiles = glob.glob(os.path.join(tempdir, "test*.py")) - >>> if not test_support.is_resource_enabled("cpu"): +tokenize is broken on test_pep3131.py because regular expressions are broken on +the obscure unicode identifiers in it. *sigh* + >>> testfiles.remove(os.path.join(tempdir, "test_pep3131.py")) + >>> if not support.is_resource_enabled("cpu"): ... testfiles = random.sample(testfiles, 10) ... >>> for testfile in testfiles: - ... if not roundtrip(open(testfile)): - ... print "Roundtrip failed for file %s" % testfile + ... if not roundtrip(open(testfile, 'rb')): + ... print("Roundtrip failed for file %s" % testfile) ... break ... else: True True Evil tabs + >>> dump_tokens("def f():\\n\\tif x\\n \\tpass") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'def' (1, 0) (1, 3) NAME 'f' (1, 4) (1, 5) OP '(' (1, 5) (1, 6) @@ -553,44 +554,77 @@ Evil tabs Pathological whitespace (http://bugs.python.org/issue16152) >>> dump_tokens("@ ") + ENCODING 'utf-8' (0, 0) (0, 0) OP '@' (1, 0) (1, 1) -""" +Non-ascii identifiers + + >>> dump_tokens("Örter = 'places'\\ngrün = 'green'") + ENCODING 'utf-8' (0, 0) (0, 0) + NAME 'Örter' (1, 0) (1, 5) + OP '=' (1, 6) (1, 7) + STRING "'places'" (1, 8) (1, 16) + NEWLINE '\\n' (1, 16) (1, 17) + NAME 'grün' (2, 0) (2, 4) + OP '=' (2, 5) (2, 6) + STRING "'green'" (2, 7) (2, 14) +""" -from test import test_support -from tokenize import (untokenize, generate_tokens, NUMBER, NAME, OP, - STRING, ENDMARKER, tok_name) -from StringIO import StringIO -import os +from test import support +from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP, + STRING, ENDMARKER, tok_name, detect_encoding, + open as tokenize_open) +from io import BytesIO +from unittest import TestCase +import os, sys, glob def dump_tokens(s): """Print out the tokens in s in a table format. The ENDMARKER is omitted. """ - f = StringIO(s) - for type, token, start, end, line in generate_tokens(f.readline): + f = BytesIO(s.encode('utf-8')) + for type, token, start, end, line in tokenize(f.readline): if type == ENDMARKER: break type = tok_name[type] print("%(type)-10.10s %(token)-13.13r %(start)s %(end)s" % locals()) +def roundtrip(f): + """ + Test roundtrip for `untokenize`. `f` is an open file or a string. + The source code in f is tokenized, converted back to source code via + tokenize.untokenize(), and tokenized again from the latter. The test + fails if the second tokenization doesn't match the first. + """ + if isinstance(f, str): + f = BytesIO(f.encode('utf-8')) + try: + token_list = list(tokenize(f.readline)) + finally: + f.close() + tokens1 = [tok[:2] for tok in token_list] + new_bytes = untokenize(tokens1) + readline = (line for line in new_bytes.splitlines(1)).__next__ + tokens2 = [tok[:2] for tok in tokenize(readline)] + return tokens1 == tokens2 + # This is an example from the docs, set up as a doctest. def decistmt(s): """Substitute Decimals for floats in a string of statements. >>> from decimal import Decimal - >>> s = 'print +21.3e-5*-.1234/81.7' + >>> s = 'print(+21.3e-5*-.1234/81.7)' >>> decistmt(s) - "print +Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7')" + "print (+Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7'))" The format of the exponent is inherited from the platform C library. Known cases are "e-007" (Windows) and "e-07" (not Windows). 
Since - we're only showing 12 digits, and the 13th isn't close to 5, the + we're only showing 11 digits, and the 12th isn't close to 5, the rest of the output should be platform-independent. >>> exec(s) #doctest: +ELLIPSIS - -3.21716034272e-0...7 + -3.2171603427...e-0...7 Output from calculations with Decimal should be identical across all platforms. @@ -598,9 +632,8 @@ def decistmt(s): >>> exec(decistmt(s)) -3.217160342717258261933904529E-7 """ - result = [] - g = generate_tokens(StringIO(s).readline) # tokenize the string + g = tokenize(BytesIO(s.encode('utf-8')).readline) # tokenize the string for toknum, tokval, _, _, _ in g: if toknum == NUMBER and '.' in tokval: # replace NUMBER tokens result.extend([ @@ -611,15 +644,313 @@ def decistmt(s): ]) else: result.append((toknum, tokval)) - return untokenize(result) + return untokenize(result).decode('utf-8') -__test__ = {"doctests" : doctests, 'decistmt': decistmt} +class TestTokenizerAdheresToPep0263(TestCase): + """ + Test that tokenizer adheres to the coding behaviour stipulated in PEP 0263. + """ + def _testFile(self, filename): + path = os.path.join(os.path.dirname(__file__), filename) + return roundtrip(open(path, 'rb')) + + def test_utf8_coding_cookie_and_no_utf8_bom(self): + f = 'tokenize_tests-utf8-coding-cookie-and-no-utf8-bom-sig.txt' + self.assertTrue(self._testFile(f)) + + def test_latin1_coding_cookie_and_utf8_bom(self): + """ + As per PEP 0263, if a file starts with a utf-8 BOM signature, the only + allowed encoding for the comment is 'utf-8'. The text file used in + this test starts with a BOM signature, but specifies latin1 as the + coding, so verify that a SyntaxError is raised, which matches the + behaviour of the interpreter when it encounters a similar condition. + """ + f = 'tokenize_tests-latin1-coding-cookie-and-utf8-bom-sig.txt' + self.assertRaises(SyntaxError, self._testFile, f) + + def test_no_coding_cookie_and_utf8_bom(self): + f = 'tokenize_tests-no-coding-cookie-and-utf8-bom-sig-only.txt' + self.assertTrue(self._testFile(f)) + + def test_utf8_coding_cookie_and_utf8_bom(self): + f = 'tokenize_tests-utf8-coding-cookie-and-utf8-bom-sig.txt' + self.assertTrue(self._testFile(f)) + + def test_bad_coding_cookie(self): + self.assertRaises(SyntaxError, self._testFile, 'bad_coding.py') + self.assertRaises(SyntaxError, self._testFile, 'bad_coding2.py') + + +class Test_Tokenize(TestCase): + + def test__tokenize_decodes_with_specified_encoding(self): + literal = '"ЉЊЈЁЂ"' + line = literal.encode('utf-8') + first = False + def readline(): + nonlocal first + if not first: + first = True + return line + else: + return b'' + + # skip the initial encoding token and the end token + tokens = list(_tokenize(readline, encoding='utf-8'))[1:-1] + expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')] + self.assertEqual(tokens, expected_tokens, + "bytes not decoded with encoding") + + def test__tokenize_does_not_decode_with_encoding_none(self): + literal = '"ЉЊЈЁЂ"' + first = False + def readline(): + nonlocal first + if not first: + first = True + return literal + else: + return b'' + + # skip the end token + tokens = list(_tokenize(readline, encoding=None))[:-1] + expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')] + self.assertEqual(tokens, expected_tokens, + "string not tokenized when encoding is None") + + +class TestDetectEncoding(TestCase): + + def get_readline(self, lines): + index = 0 + def readline(): + nonlocal index + if index == len(lines): + raise StopIteration + line = lines[index] + index += 1 + return 
line + return readline + + def test_no_bom_no_encoding_cookie(self): + lines = ( + b'# something\n', + b'print(something)\n', + b'do_something(else)\n' + ) + encoding, consumed_lines = detect_encoding(self.get_readline(lines)) + self.assertEqual(encoding, 'utf-8') + self.assertEqual(consumed_lines, list(lines[:2])) + + def test_bom_no_cookie(self): + lines = ( + b'\xef\xbb\xbf# something\n', + b'print(something)\n', + b'do_something(else)\n' + ) + encoding, consumed_lines = detect_encoding(self.get_readline(lines)) + self.assertEqual(encoding, 'utf-8-sig') + self.assertEqual(consumed_lines, + [b'# something\n', b'print(something)\n']) + + def test_cookie_first_line_no_bom(self): + lines = ( + b'# -*- coding: latin-1 -*-\n', + b'print(something)\n', + b'do_something(else)\n' + ) + encoding, consumed_lines = detect_encoding(self.get_readline(lines)) + self.assertEqual(encoding, 'iso-8859-1') + self.assertEqual(consumed_lines, [b'# -*- coding: latin-1 -*-\n']) + + def test_matched_bom_and_cookie_first_line(self): + lines = ( + b'\xef\xbb\xbf# coding=utf-8\n', + b'print(something)\n', + b'do_something(else)\n' + ) + encoding, consumed_lines = detect_encoding(self.get_readline(lines)) + self.assertEqual(encoding, 'utf-8-sig') + self.assertEqual(consumed_lines, [b'# coding=utf-8\n']) + + def test_mismatched_bom_and_cookie_first_line_raises_syntaxerror(self): + lines = ( + b'\xef\xbb\xbf# vim: set fileencoding=ascii :\n', + b'print(something)\n', + b'do_something(else)\n' + ) + readline = self.get_readline(lines) + self.assertRaises(SyntaxError, detect_encoding, readline) + + def test_cookie_second_line_no_bom(self): + lines = ( + b'#! something\n', + b'# vim: set fileencoding=ascii :\n', + b'print(something)\n', + b'do_something(else)\n' + ) + encoding, consumed_lines = detect_encoding(self.get_readline(lines)) + self.assertEqual(encoding, 'ascii') + expected = [b'#! something\n', b'# vim: set fileencoding=ascii :\n'] + self.assertEqual(consumed_lines, expected) + + def test_matched_bom_and_cookie_second_line(self): + lines = ( + b'\xef\xbb\xbf#! something\n', + b'f# coding=utf-8\n', + b'print(something)\n', + b'do_something(else)\n' + ) + encoding, consumed_lines = detect_encoding(self.get_readline(lines)) + self.assertEqual(encoding, 'utf-8-sig') + self.assertEqual(consumed_lines, + [b'#! something\n', b'f# coding=utf-8\n']) + + def test_mismatched_bom_and_cookie_second_line_raises_syntaxerror(self): + lines = ( + b'\xef\xbb\xbf#! something\n', + b'# vim: set fileencoding=ascii :\n', + b'print(something)\n', + b'do_something(else)\n' + ) + readline = self.get_readline(lines) + self.assertRaises(SyntaxError, detect_encoding, readline) + + def test_latin1_normalization(self): + # See get_normal_name() in tokenizer.c. 
+ encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix", + "iso-8859-1-unix", "iso-latin-1-mac") + for encoding in encodings: + for rep in ("-", "_"): + enc = encoding.replace("-", rep) + lines = (b"#!/usr/bin/python\n", + b"# coding: " + enc.encode("ascii") + b"\n", + b"print(things)\n", + b"do_something += 4\n") + rl = self.get_readline(lines) + found, consumed_lines = detect_encoding(rl) + self.assertEqual(found, "iso-8859-1") + + def test_syntaxerror_latin1(self): + # Issue 14629: need to raise SyntaxError if the first + # line(s) have non-UTF-8 characters + lines = ( + b'print("\xdf")', # Latin-1: LATIN SMALL LETTER SHARP S + ) + readline = self.get_readline(lines) + self.assertRaises(SyntaxError, detect_encoding, readline) + + + def test_utf8_normalization(self): + # See get_normal_name() in tokenizer.c. + encodings = ("utf-8", "utf-8-mac", "utf-8-unix") + for encoding in encodings: + for rep in ("-", "_"): + enc = encoding.replace("-", rep) + lines = (b"#!/usr/bin/python\n", + b"# coding: " + enc.encode("ascii") + b"\n", + b"1 + 3\n") + rl = self.get_readline(lines) + found, consumed_lines = detect_encoding(rl) + self.assertEqual(found, "utf-8") + + def test_short_files(self): + readline = self.get_readline((b'print(something)\n',)) + encoding, consumed_lines = detect_encoding(readline) + self.assertEqual(encoding, 'utf-8') + self.assertEqual(consumed_lines, [b'print(something)\n']) + + encoding, consumed_lines = detect_encoding(self.get_readline(())) + self.assertEqual(encoding, 'utf-8') + self.assertEqual(consumed_lines, []) + + readline = self.get_readline((b'\xef\xbb\xbfprint(something)\n',)) + encoding, consumed_lines = detect_encoding(readline) + self.assertEqual(encoding, 'utf-8-sig') + self.assertEqual(consumed_lines, [b'print(something)\n']) + + readline = self.get_readline((b'\xef\xbb\xbf',)) + encoding, consumed_lines = detect_encoding(readline) + self.assertEqual(encoding, 'utf-8-sig') + self.assertEqual(consumed_lines, []) + + readline = self.get_readline((b'# coding: bad\n',)) + self.assertRaises(SyntaxError, detect_encoding, readline) + + def test_open(self): + filename = support.TESTFN + '.py' + self.addCleanup(support.unlink, filename) + + # test coding cookie + for encoding in ('iso-8859-15', 'utf-8'): + with open(filename, 'w', encoding=encoding) as fp: + print("# coding: %s" % encoding, file=fp) + print("print('euro:\u20ac')", file=fp) + with tokenize_open(filename) as fp: + self.assertEqual(fp.encoding, encoding) + self.assertEqual(fp.mode, 'r') + + # test BOM (no coding cookie) + with open(filename, 'w', encoding='utf-8-sig') as fp: + print("print('euro:\u20ac')", file=fp) + with tokenize_open(filename) as fp: + self.assertEqual(fp.encoding, 'utf-8-sig') + self.assertEqual(fp.mode, 'r') + +class TestTokenize(TestCase): + + def test_tokenize(self): + import tokenize as tokenize_module + encoding = object() + encoding_used = None + def mock_detect_encoding(readline): + return encoding, ['first', 'second'] + + def mock__tokenize(readline, encoding): + nonlocal encoding_used + encoding_used = encoding + out = [] + while True: + next_line = readline() + if next_line: + out.append(next_line) + continue + return out + + counter = 0 + def mock_readline(): + nonlocal counter + counter += 1 + if counter == 5: + return b'' + return counter + + orig_detect_encoding = tokenize_module.detect_encoding + orig__tokenize = tokenize_module._tokenize + tokenize_module.detect_encoding = mock_detect_encoding + tokenize_module._tokenize = mock__tokenize + try: + results 
= tokenize(mock_readline)
+            self.assertEqual(list(results), ['first', 'second', 1, 2, 3, 4])
+        finally:
+            tokenize_module.detect_encoding = orig_detect_encoding
+            tokenize_module._tokenize = orig__tokenize
+
+        self.assertTrue(encoding_used, encoding)
+
+
+__test__ = {"doctests" : doctests, 'decistmt': decistmt}

 def test_main():
     from test import test_tokenize
-    test_support.run_doctest(test_tokenize, True)
+    support.run_doctest(test_tokenize, True)
+    support.run_unittest(TestTokenizerAdheresToPep0263)
+    support.run_unittest(Test_Tokenize)
+    support.run_unittest(TestDetectEncoding)
+    support.run_unittest(TestTokenize)

 if __name__ == "__main__":
     test_main()
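
The pattern the updated doctests exercise throughout this diff is that Python 3's tokenize.tokenize() consumes a readline callable returning *bytes* and emits an ENCODING token before anything else, which is why every expected output gained an "ENCODING 'utf-8' (0, 0) (0, 0)" line and why dump_tokens() now wraps the source in a BytesIO. A minimal sketch of that pattern (the dump() helper name is illustrative, not part of the test file):

    import io
    import tokenize

    def dump(source_text):
        # Mirror of the dump_tokens() helper in the diff: encode the str to
        # UTF-8 bytes, because tokenize.tokenize() wants a bytes readline.
        readline = io.BytesIO(source_text.encode('utf-8')).readline
        for tok in tokenize.tokenize(readline):
            print(tokenize.tok_name[tok.type], repr(tok.string), tok.start, tok.end)

    dump("1 + 1")
    # The first token printed is ENCODING 'utf-8' at (0, 0)-(0, 0); the
    # NUMBER/OP/NUMBER tokens follow, and ENDMARKER comes last.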
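The roundtrip() helper added by the diff relies on another Python 3 change: untokenize() returns bytes when the token stream starts with the ENCODING token produced by tokenize(). A rough standalone equivalent, assuming UTF-8 source and comparing only the (type, string) pairs as the test does:

    import io
    from tokenize import tokenize, untokenize

    def roundtrip_ok(source_text):
        # Tokenize, regenerate source with untokenize(), tokenize again,
        # and compare the (type, string) pairs of both passes.
        buf = io.BytesIO(source_text.encode('utf-8'))
        tokens1 = [tok[:2] for tok in tokenize(buf.readline)]
        new_bytes = untokenize(tokens1)   # bytes: the ENCODING token sets the encoding
        readline = iter(new_bytes.splitlines(keepends=True)).__next__
        tokens2 = [tok[:2] for tok in tokenize(readline)]
        return tokens1 == tokens2

    print(roundtrip_ok("if x == 1:\n    print(x)\n"))   # expected: True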
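The new TestDetectEncoding and test_open cases are built on two APIs: tokenize.detect_encoding(), which reads at most the first two lines looking for a UTF-8 BOM and/or a PEP 263 coding cookie and returns (encoding, consumed_lines), and tokenize.open(), which opens a source file read-only using that detected encoding. A short sketch of both; the scratch filename is hypothetical:

    import io
    import tokenize

    source = b"# -*- coding: latin-1 -*-\nprint('hi')\n"
    encoding, consumed = tokenize.detect_encoding(io.BytesIO(source).readline)
    print(encoding)   # 'iso-8859-1' -- latin-1 is normalized, cf. test_latin1_normalization
    print(consumed)   # the raw line(s) read while deciding, here just the cookie line

    # A UTF-8 BOM combined with a non-UTF-8 cookie is rejected, matching
    # test_mismatched_bom_and_cookie_first_line_raises_syntaxerror above.
    bad = b"\xef\xbb\xbf# coding: ascii\nprint('hi')\n"
    try:
        tokenize.detect_encoding(io.BytesIO(bad).readline)
    except SyntaxError as exc:
        print("rejected:", exc)

    # tokenize.open() applies detect_encoding() and returns a text-mode file.
    with open("scratch_example.py", "wb") as fp:      # hypothetical scratch file
        fp.write(source)
    with tokenize.open("scratch_example.py") as fp:
        print(fp.encoding)   # 'iso-8859-1'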
