1 files changed, 305 insertions, 46 deletions
diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index 2938520..308158f 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -1,13 +1,14 @@
+# -*- coding: utf-8 -*-
+
 doctests = """
 Tests for the tokenize module.
 
-    >>> import glob, random, sys
-
 The tests can be really simple. Given a small fragment of source
 code, print out a table with tokens. The ENDMARK is omitted for
 brevity.
 
     >>> dump_tokens("1 + 1")
+    ENCODING   'utf-8'       (0, 0) (0, 0)
     NUMBER     '1'           (1, 0) (1, 1)
     OP         '+'           (1, 2) (1, 3)
     NUMBER     '1'           (1, 4) (1, 5)
@@ -15,6 +16,7 @@ brevity.
     >>> dump_tokens("if False:\\n"
     ...             "    # NL\\n"
     ...             "    True = False # NEWLINE\\n")
+    ENCODING   'utf-8'       (0, 0) (0, 0)
     NAME       'if'          (1, 0) (1, 2)
     NAME       'False'       (1, 3) (1, 8)
     OP         ':'           (1, 8) (1, 9)
@@ -34,27 +36,12 @@ brevity.
     ...     x += 2
     ...   x += 5
     ... \"""
-    >>> for tok in generate_tokens(StringIO(indent_error_file).readline): pass
+    >>> readline = BytesIO(indent_error_file.encode('utf-8')).readline
+    >>> for tok in tokenize(readline): pass
     Traceback (most recent call last):
         ...
     IndentationError: unindent does not match any outer indentation level
 
-Test roundtrip for `untokenize`. `f` is an open file or a string. The source
-code in f is tokenized, converted back to source code via tokenize.untokenize(),
-and tokenized again from the latter. The test fails if the second tokenization
-doesn't match the first.
-
-    >>> def roundtrip(f):
-    ...     if isinstance(f, str): f = StringIO(f)
-    ...     token_list = list(generate_tokens(f.readline))
-    ...     f.close()
-    ...     tokens1 = [tok[:2] for tok in token_list]
-    ...     new_text = untokenize(tokens1)
-    ...     readline = iter(new_text.splitlines(1)).__next__
-    ...     tokens2 = [tok[:2] for tok in generate_tokens(readline)]
-    ...     return tokens1 == tokens2
-    ...
-
 There are some standard formattig practises that are easy to get right.
 
     >>> roundtrip("if x == 1:\\n"
@@ -67,14 +54,14 @@ There are some standard formattig practises that are easy to get right.
 Some people use different formatting conventions, which makes
 untokenize a little trickier. Note that this test involves trailing
 whitespace after the colon. Note that we use hex escapes to make the
-two trailing blanks apperant in the expected output.
+two trailing blanks apparent in the expected output.
 
     >>> roundtrip("if x == 1 : \\n"
     ...           "  print(x)\\n")
     True
 
     >>> f = test_support.findfile("tokenize_tests.txt")
-    >>> roundtrip(open(f))
+    >>> roundtrip(open(f, 'rb'))
     True
 
     >>> roundtrip("if x == 1:\\n"
@@ -122,27 +109,33 @@ Balancing continuation
 Ordinary integers and binary operators
 
     >>> dump_tokens("0xff <= 255")
+    ENCODING   'utf-8'       (0, 0) (0, 0)
     NUMBER     '0xff'        (1, 0) (1, 4)
     OP         '<='          (1, 5) (1, 7)
     NUMBER     '255'         (1, 8) (1, 11)
     >>> dump_tokens("0b10 <= 255")
+    ENCODING   'utf-8'       (0, 0) (0, 0)
     NUMBER     '0b10'        (1, 0) (1, 4)
     OP         '<='          (1, 5) (1, 7)
     NUMBER     '255'         (1, 8) (1, 11)
     >>> dump_tokens("0o123 <= 0O123")
+    ENCODING   'utf-8'       (0, 0) (0, 0)
     NUMBER     '0o123'       (1, 0) (1, 5)
     OP         '<='          (1, 6) (1, 8)
     NUMBER     '0O123'       (1, 9) (1, 14)
     >>> dump_tokens("1234567 > ~0x15")
+    ENCODING   'utf-8'       (0, 0) (0, 0)
     NUMBER     '1234567'     (1, 0) (1, 7)
     OP         '>'           (1, 8) (1, 9)
     OP         '~'           (1, 10) (1, 11)
     NUMBER     '0x15'        (1, 11) (1, 15)
     >>> dump_tokens("2134568 != 1231515")
+    ENCODING   'utf-8'       (0, 0) (0, 0)
     NUMBER     '2134568'     (1, 0) (1, 7)
     OP         '!='          (1, 8) (1, 10)
     NUMBER     '1231515'     (1, 11) (1, 18)
     >>> dump_tokens("(-124561-1) & 200000000")
+    ENCODING   'utf-8'       (0, 0) (0, 0)
     OP         '('           (1, 0) (1, 1)
     OP         '-'           (1, 1) (1, 2)
     NUMBER     '124561'      (1, 2) (1, 8)
@@ -152,15 +145,18 @@ Ordinary integers and binary operators
     OP         '&'           (1, 12) (1, 13)
     NUMBER     '200000000'   (1, 14) (1, 23)
     >>> dump_tokens("0xdeadbeef != -1")
+    ENCODING   'utf-8'       (0, 0) (0, 0)
     NUMBER     '0xdeadbeef'  (1, 0) (1, 10)
     OP         '!='          (1, 11) (1, 13)
     OP         '-'           (1, 14) (1, 15)
     NUMBER     '1'           (1, 15) (1, 16)
     >>> dump_tokens("0xdeadc0de & 12345")
+    ENCODING   'utf-8'       (0, 0) (0, 0)
     NUMBER     '0xdeadc0de'  (1, 0) (1, 10)
     OP         '&'           (1, 11) (1, 12)
     NUMBER     '12345'       (1, 13) (1, 18)
     >>> dump_tokens("0xFF & 0x15 | 1234")
+    ENCODING   'utf-8'       (0, 0) (0, 0)
     NUMBER     '0xFF'        (1, 0) (1, 4)
     OP         '&'           (1, 5) (1, 6)
     NUMBER     '0x15'        (1, 7) (1, 11)
@@ -170,18 +166,22 @@ Ordinary integers and binary operators
 Long integers
 
     >>> dump_tokens("x = 0")
+    ENCODING   'utf-8'       (0, 0) (0, 0)
     NAME       'x'           (1, 0) (1, 1)
     OP         '='           (1, 2) (1, 3)
     NUMBER     '0'           (1, 4) (1, 5)
     >>> dump_tokens("x = 0xfffffffffff")
+    ENCODING   'utf-8'       (0, 0) (0, 0)
     NAME       'x'           (1, 0) (1, 1)
     OP         '='           (1, 2) (1, 3)
     NUMBER     '0xffffffffff (1, 4) (1, 17)
     >>> dump_tokens("x = 123141242151251616110")
+    ENCODING   'utf-8'       (0, 0) (0, 0)
     NAME       'x'           (1, 0) (1, 1)
     OP         '='           (1, 2) (1, 3)
     NUMBER     '123141242151 (1, 4) (1, 25)
     >>> dump_tokens("x = -15921590215012591")
+    ENCODING   'utf-8'       (0, 0) (0, 0)
     NAME       'x'           (1, 0) (1, 1)
     OP         '='           (1, 2) (1, 3)
     OP         '-'           (1, 4) (1, 5)
@@ -190,32 +190,39 @@ Long integers
 Floating point numbers
 
     >>> dump_tokens("x = 3.14159")
+    ENCODING   'utf-8'       (0, 0) (0, 0)
     NAME       'x'           (1, 0) (1, 1)
     OP         '='           (1, 2) (1, 3)
     NUMBER     '3.14159'     (1, 4) (1, 11)
     >>> dump_tokens("x = 314159.")
+    ENCODING   'utf-8'       (0, 0) (0, 0)
     NAME       'x'           (1, 0) (1, 1)
     OP         '='           (1, 2) (1, 3)
     NUMBER     '314159.'     (1, 4) (1, 11)
     >>> dump_tokens("x = .314159")
+    ENCODING   'utf-8'       (0, 0) (0, 0)
     NAME       'x'           (1, 0) (1, 1)
     OP         '='           (1, 2) (1, 3)
     NUMBER     '.314159'     (1, 4) (1, 11)
     >>> dump_tokens("x = 3e14159")
+    ENCODING   'utf-8'       (0, 0) (0, 0)
     NAME       'x'           (1, 0) (1, 1)
     OP         '='           (1, 2) (1, 3)
     NUMBER     '3e14159'     (1, 4) (1, 11)
     >>> dump_tokens("x = 3E123")
+    ENCODING   'utf-8'       (0, 0) (0, 0)
     NAME       'x'           (1, 0) (1, 1)
     OP         '='           (1, 2) (1, 3)
     NUMBER     '3E123'       (1, 4) (1, 9)
     >>> dump_tokens("x+y = 3e-1230")
+    ENCODING   'utf-8'       (0, 0) (0, 0)
     NAME       'x'           (1, 0) (1, 1)
     OP         '+'           (1, 1) (1, 2)
     NAME       'y'           (1, 2) (1, 3)
     OP         '='           (1, 4) (1, 5)
     NUMBER     '3e-1230'     (1, 6) (1, 13)
     >>> dump_tokens("x = 3.14e159")
+    ENCODING   'utf-8'       (0, 0) (0, 0)
     NAME       'x'           (1, 0) (1, 1)
     OP         '='           (1, 2) (1, 3)
     NUMBER     '3.14e159'    (1, 4) (1, 12)
@@ -223,6 +230,7 @@ Floating point numbers
 String literals
 
     >>> dump_tokens("x = ''; y = \\\"\\\"")
+    ENCODING   'utf-8'       (0, 0) (0, 0)
     NAME       'x'           (1, 0) (1, 1)
     OP         '='           (1, 2) (1, 3)
     STRING     "''"          (1, 4) (1, 6)
@@ -231,6 +239,7 @@ String literals
     OP         '='           (1, 10) (1, 11)
     STRING     '""'          (1, 12) (1, 14)
     >>> dump_tokens("x = '\\\"'; y = \\\"'\\\"")
+    ENCODING   'utf-8'       (0, 0) (0, 0)
     NAME       'x'           (1, 0) (1, 1)
     OP         '='           (1, 2) (1, 3)
     STRING     '\\'"\\''       (1, 4) (1, 7)
@@ -239,24 +248,28 @@ String literals
     OP         '='           (1, 11) (1, 12)
     STRING     '"\\'"'        (1, 13) (1, 16)
     >>> dump_tokens("x = \\\"doesn't \\\"shrink\\\", does it\\\"")
+    ENCODING   'utf-8'       (0, 0) (0, 0)
     NAME       'x'           (1, 0) (1, 1)
     OP         '='           (1, 2) (1, 3)
     STRING     '"doesn\\'t "' (1, 4) (1, 14)
     NAME       'shrink'      (1, 14) (1, 20)
     STRING     '", does it"' (1, 20) (1, 31)
     >>> dump_tokens("x = 'abc' + 'ABC'")
+    ENCODING   'utf-8'       (0, 0) (0, 0)
     NAME       'x'           (1, 0) (1, 1)
     OP         '='           (1, 2) (1, 3)
     STRING     "'abc'"       (1, 4) (1, 9)
     OP         '+'           (1, 10) (1, 11)
     STRING     "'ABC'"       (1, 12) (1, 17)
     >>> dump_tokens('y = "ABC" + "ABC"')
+    ENCODING   'utf-8'       (0, 0) (0, 0)
     NAME       'y'           (1, 0) (1, 1)
     OP         '='           (1, 2) (1, 3)
     STRING     '"ABC"'       (1, 4) (1, 9)
     OP         '+'           (1, 10) (1, 11)
     STRING     '"ABC"'       (1, 12) (1, 17)
     >>> dump_tokens("x = r'abc' + r'ABC' + R'ABC' + R'ABC'")
+    ENCODING   'utf-8'       (0, 0) (0, 0)
     NAME       'x'           (1, 0) (1, 1)
     OP         '='           (1, 2) (1, 3)
     STRING     "r'abc'"      (1, 4) (1, 10)
@@ -267,6 +280,7 @@ String literals
     OP         '+'           (1, 29) (1, 30)
     STRING     "R'ABC'"      (1, 31) (1, 37)
     >>> dump_tokens('y = r"abc" + r"ABC" + R"ABC" + R"ABC"')
+    ENCODING   'utf-8'       (0, 0) (0, 0)
     NAME       'y'           (1, 0) (1, 1)
     OP         '='           (1, 2) (1, 3)
     STRING     'r"abc"'      (1, 4) (1, 10)
@@ -280,6 +294,7 @@ String literals
 Operators
 
     >>> dump_tokens("def d22(a, b, c=2, d=2, *k): pass")
+    ENCODING   'utf-8'       (0, 0) (0, 0)
     NAME       'def'         (1, 0) (1, 3)
     NAME       'd22'         (1, 4) (1, 7)
     OP         '('           (1, 7) (1, 8)
@@ -301,6 +316,7 @@ Operators
     OP         ':'           (1, 27) (1, 28)
     NAME       'pass'        (1, 29) (1, 33)
     >>> dump_tokens("def d01v_(a=1, *k, **w): pass")
+    ENCODING   'utf-8'       (0, 0) (0, 0)
     NAME       'def'         (1, 0) (1, 3)
     NAME       'd01v_'       (1, 4) (1, 9)
     OP         '('           (1, 9) (1, 10)
@@ -321,6 +337,7 @@ Comparison
 
     >>> dump_tokens("if 1 < 1 > 1 == 1 >= 5 <= 0x15 <= 0x12 != " +
     ...             "1 and 5 in 1 not in 1 is 1 or 5 is not 1: pass")
+    ENCODING   'utf-8'       (0, 0) (0, 0)
     NAME       'if'          (1, 0) (1, 2)
     NUMBER     '1'           (1, 3) (1, 4)
     OP         '<'           (1, 5) (1, 6)
@@ -357,6 +374,7 @@ Comparison
 Shift
 
     >>> dump_tokens("x = 1 << 1 >> 5")
+    ENCODING   'utf-8'       (0, 0) (0, 0)
     NAME       'x'           (1, 0) (1, 1)
     OP         '='           (1, 2) (1, 3)
     NUMBER     '1'           (1, 4) (1, 5)
@@ -368,6 +386,7 @@ Shift
 Additive
 
     >>> dump_tokens("x = 1 - y + 15 - 1 + 0x124 + z + a[5]")
+    ENCODING   'utf-8'       (0, 0) (0, 0)
     NAME       'x'           (1, 0) (1, 1)
     OP         '='           (1, 2) (1, 3)
     NUMBER     '1'           (1, 4) (1, 5)
@@ -390,6 +409,7 @@ Additive
 Multiplicative
 
     >>> dump_tokens("x = 1//1*1/5*12%0x12")
+    ENCODING   'utf-8'       (0, 0) (0, 0)
     NAME       'x'           (1, 0) (1, 1)
     OP         '='           (1, 2) (1, 3)
     NUMBER     '1'           (1, 4) (1, 5)
@@ -407,6 +427,7 @@ Multiplicative
 Unary
 
     >>> dump_tokens("~1 ^ 1 & 1 |1 ^ -1")
+    ENCODING   'utf-8'       (0, 0) (0, 0)
     OP         '~'           (1, 0) (1, 1)
     NUMBER     '1'           (1, 1) (1, 2)
     OP         '^'           (1, 3) (1, 4)
@@ -419,6 +440,7 @@ Unary
     OP         '-'           (1, 16) (1, 17)
     NUMBER     '1'           (1, 17) (1, 18)
     >>> dump_tokens("-1*1/1+1*1//1 - ---1**1")
+    ENCODING   'utf-8'       (0, 0) (0, 0)
     OP         '-'           (1, 0) (1, 1)
     NUMBER     '1'           (1, 1) (1, 2)
     OP         '*'           (1, 2) (1, 3)
@@ -442,6 +464,7 @@ Unary
 Selector
 
     >>> dump_tokens("import sys, time\\nx = sys.modules['time'].time()")
+    ENCODING   'utf-8'       (0, 0) (0, 0)
     NAME       'import'      (1, 0) (1, 6)
     NAME       'sys'         (1, 7) (1, 10)
     OP         ','           (1, 10) (1, 11)
@@ -463,6 +486,7 @@ Selector
 Methods
 
     >>> dump_tokens("@staticmethod\\ndef foo(x,y): pass")
+    ENCODING   'utf-8'       (0, 0) (0, 0)
     OP         '@'           (1, 0) (1, 1)
     NAME       'staticmethod (1, 1) (1, 13)
     NEWLINE    '\\n'          (1, 13) (1, 14)
@@ -485,42 +509,43 @@ Backslash means line continuation, except for comments
     True
     >>> roundtrip("# Comment \\\\nx = 0")
     True
-
-    >>>
-    >>> tempdir = os.path.dirname(f) or os.curdir
-    >>> testfiles = glob.glob(os.path.join(tempdir, "test*.py"))
-    >>> if not test_support.is_resource_enabled("compiler"):
-    ...     testfiles = random.sample(testfiles, 10)
-    ...
-    >>> for testfile in testfiles:
-    ...     if not roundtrip(open(testfile)): break
-    ... else: True
-    True
 """
 
-
 from test import test_support
-from tokenize import (tokenize, untokenize, generate_tokens, NUMBER, NAME, OP,
-                     STRING, ENDMARKER, tok_name)
-from io import StringIO
-import os
+from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
+                     STRING, ENDMARKER, tok_name, detect_encoding)
+from io import BytesIO
+from unittest import TestCase
+import os, sys, glob
 
 def dump_tokens(s):
     """Print out the tokens in s in a table format.
 
     The ENDMARKER is omitted.
     """
-    f = StringIO(s)
-    for type, token, start, end, line in generate_tokens(f.readline):
+    f = BytesIO(s.encode('utf-8'))  # tokenize() is handed a bytes readline
+    for type, token, start, end, line in tokenize(f.readline):
         if type == ENDMARKER:
             break
         type = tok_name[type]
         print("%(type)-10.10s %(token)-13.13r %(start)s %(end)s" % locals())
 
-def roundtrip(s):
-    f = StringIO(s)
-    source = untokenize(generate_tokens(f.readline))
-    print(source, end="")
+def roundtrip(f):
+    """
+    Test roundtrip for `untokenize`. `f` is an open binary file or a string.
+    The source code in f is tokenized, converted back to source code via
+    tokenize.untokenize(), and tokenized again from the latter. Returns True
+    if both tokenizations match; `f` is closed even if tokenizing fails.
+    """
+    if isinstance(f, str):
+        f = BytesIO(f.encode('utf-8'))
+    try:
+        tokens1 = [tok[:2] for tok in tokenize(f.readline)]
+    finally:
+        f.close()  # don't leak the handle when tokenize() raises
+    new_bytes = untokenize(tokens1)
+    readline = iter(new_bytes.splitlines(True)).__next__
+    return tokens1 == [tok[:2] for tok in tokenize(readline)]
 
 # This is an example from the docs, set up as a doctest.
 def decistmt(s):
@@ -545,9 +570,8 @@ def decistmt(s):
     >>> exec(decistmt(s))
     -3.217160342717258261933904529E-7
     """
-
     result = []
-    g = generate_tokens(StringIO(s).readline)   # tokenize the string
+    g = tokenize(BytesIO(s.encode('utf-8')).readline)   # tokenize the string
     for toknum, tokval, _, _, _  in g:
         if toknum == NUMBER and '.' in tokval:  # replace NUMBER tokens
             result.extend([
@@ -558,7 +582,238 @@ def decistmt(s):
             ])
         else:
             result.append((toknum, tokval))
-    return untokenize(result)
+    return untokenize(result).decode('utf-8')
+
+
+class TestTokenizerAdheresToPep0263(TestCase):
+    """Test that tokenizer adheres to the coding behaviour of PEP 0263."""
+
+    def _testFile(self, filename):
+        """Round-trip the fixture `filename` located next to this module."""
+        path = os.path.join(os.path.dirname(__file__), filename)
+        return roundtrip(open(path, 'rb'))
+
+    def test_utf8_coding_cookie_and_no_utf8_bom(self):
+        # NOTE(review): BOM-less fixture name inferred -- confirm it exists.
+        f = 'tokenize_tests-utf8-coding-cookie-and-no-utf8-bom-sig.txt'
+        self.assertTrue(self._testFile(f))
+
+    def test_latin1_coding_cookie_and_utf8_bom(self):
+        """
+        As per PEP 0263, if a file starts with a utf-8 BOM signature, the only
+        allowed encoding for the comment is 'utf-8'.  The text file used in
+        this test starts with a BOM signature, but specifies latin1 as the
+        coding, so verify that a SyntaxError is raised, which matches the
+        behaviour of the interpreter when it encounters a similar condition.
+        """
+        f = 'tokenize_tests-latin1-coding-cookie-and-utf8-bom-sig.txt'
+        self.assertRaises(SyntaxError, self._testFile, f)
+
+    def test_no_coding_cookie_and_utf8_bom(self):
+        f = 'tokenize_tests-no-coding-cookie-and-utf8-bom-sig-only.txt'
+        self.assertTrue(self._testFile(f))
+
+    def test_utf8_coding_cookie_and_utf8_bom(self):
+        f = 'tokenize_tests-utf8-coding-cookie-and-utf8-bom-sig.txt'
+        self.assertTrue(self._testFile(f))
+
+
+class Test_Tokenize(TestCase):
+    """Tests for the private tokenize._tokenize helper."""
+
+    def test__tokenize_decodes_with_specified_encoding(self):
+        literal = '"ЉЊЈЁЂ"'
+        line = literal.encode('utf-8')
+        consumed = False
+        def readline():
+            nonlocal consumed
+            if consumed:
+                return b''
+            consumed = True
+            return line
+
+        # skip the initial encoding token and the end token
+        tokens = list(_tokenize(readline, encoding='utf-8'))[1:-1]
+        expected_tokens = [(STRING, literal, (1, 0), (1, 7), literal)]
+        self.assertEqual(tokens, expected_tokens,
+                         "bytes not decoded with encoding")
+
+    def test__tokenize_does_not_decode_with_encoding_none(self):
+        literal = '"ЉЊЈЁЂ"'
+        consumed = False
+        def readline():
+            nonlocal consumed
+            if consumed:
+                return b''
+            consumed = True
+            return literal  # already a str: nothing to decode
+
+        # skip the end token (the slice implies no encoding token is emitted
+        # when encoding is None)
+        tokens = list(_tokenize(readline, encoding=None))[:-1]
+        expected_tokens = [(STRING, literal, (1, 0), (1, 7), literal)]
+        self.assertEqual(tokens, expected_tokens,
+                         "string not tokenized when encoding is None")
+
+
+class TestDetectEncoding(TestCase):
+    """Tests for tokenize.detect_encoding (BOM and coding-cookie handling)."""
+
+    def get_readline(self, lines):
+        """
+        Return a readline-style callable that hands out `lines` one at a
+        time and raises StopIteration once they are exhausted.
+        """
+        return iter(lines).__next__
+
+    def test_no_bom_no_encoding_cookie(self):
+        # Neither BOM nor cookie: utf-8 is reported after reading two lines.
+        lines = (
+            b'# something\n',
+            b'print(something)\n',
+            b'do_something(else)\n'
+        )
+        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
+        self.assertEqual(encoding, 'utf-8')
+        self.assertEqual(consumed_lines, list(lines[:2]))
+
+    def test_bom_no_cookie(self):
+        lines = (
+            b'\xef\xbb\xbf# something\n',
+            b'print(something)\n',
+            b'do_something(else)\n'
+        )
+        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
+        self.assertEqual(encoding, 'utf-8')
+        self.assertEqual(consumed_lines,
+                         [b'# something\n', b'print(something)\n'])
+
+    def test_cookie_first_line_no_bom(self):
+        lines = (
+            b'# -*- coding: latin-1 -*-\n',
+            b'print(something)\n',
+            b'do_something(else)\n'
+        )
+        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
+        self.assertEqual(encoding, 'latin-1')
+        self.assertEqual(consumed_lines, [b'# -*- coding: latin-1 -*-\n'])
+
+    def test_matched_bom_and_cookie_first_line(self):
+        lines = (
+            b'\xef\xbb\xbf# coding=utf-8\n',
+            b'print(something)\n',
+            b'do_something(else)\n'
+        )
+        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
+        self.assertEqual(encoding, 'utf-8')
+        self.assertEqual(consumed_lines, [b'# coding=utf-8\n'])
+
+    def test_mismatched_bom_and_cookie_first_line_raises_syntaxerror(self):
+        lines = (
+            b'\xef\xbb\xbf# vim: set fileencoding=ascii :\n',
+            b'print(something)\n',
+            b'do_something(else)\n'
+        )
+        readline = self.get_readline(lines)
+        self.assertRaises(SyntaxError, detect_encoding, readline)
+
+    def test_cookie_second_line_no_bom(self):
+        lines = (
+            b'#! something\n',
+            b'# vim: set fileencoding=ascii :\n',
+            b'print(something)\n',
+            b'do_something(else)\n'
+        )
+        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
+        self.assertEqual(encoding, 'ascii')
+        expected = [b'#! something\n', b'# vim: set fileencoding=ascii :\n']
+        self.assertEqual(consumed_lines, expected)
+
+    def test_matched_bom_and_cookie_second_line(self):
+        # NOTE(review): leading 'f' on the cookie line looks like a typo; confirm.
+        lines = (
+            b'\xef\xbb\xbf#! something\n',
+            b'f# coding=utf-8\n',
+            b'print(something)\n',
+            b'do_something(else)\n'
+        )
+        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
+        self.assertEqual(encoding, 'utf-8')
+        self.assertEqual(consumed_lines,
+                         [b'#! something\n', b'f# coding=utf-8\n'])
+
+    def test_mismatched_bom_and_cookie_second_line_raises_syntaxerror(self):
+        lines = (
+            b'\xef\xbb\xbf#! something\n',
+            b'# vim: set fileencoding=ascii :\n',
+            b'print(something)\n',
+            b'do_something(else)\n'
+        )
+        readline = self.get_readline(lines)
+        self.assertRaises(SyntaxError, detect_encoding, readline)
+
+    def test_short_files(self):
+        # One-line, empty, BOM-prefixed one-line and BOM-only inputs, in order.
+        readline = self.get_readline((b'print(something)\n',))
+        encoding, consumed_lines = detect_encoding(readline)
+        self.assertEqual(encoding, 'utf-8')
+        self.assertEqual(consumed_lines, [b'print(something)\n'])
+
+        encoding, consumed_lines = detect_encoding(self.get_readline(()))
+        self.assertEqual(encoding, 'utf-8')
+        self.assertEqual(consumed_lines, [])
+
+        readline = self.get_readline((b'\xef\xbb\xbfprint(something)\n',))
+        encoding, consumed_lines = detect_encoding(readline)
+        self.assertEqual(encoding, 'utf-8')
+        self.assertEqual(consumed_lines, [b'print(something)\n'])
+
+        readline = self.get_readline((b'\xef\xbb\xbf',))
+        encoding, consumed_lines = detect_encoding(readline)
+        self.assertEqual(encoding, 'utf-8')
+        self.assertEqual(consumed_lines, [])
+
+
+class TestTokenize(TestCase):
+
+    def test_tokenize(self):
+        """tokenize() feeds detect_encoding()'s result to _tokenize()."""
+        import tokenize as tokenize_module
+        encoding = object()
+        encoding_used = None
+        def mock_detect_encoding(readline):
+            return encoding, ['first', 'second']
+
+        def mock__tokenize(readline, encoding):
+            nonlocal encoding_used
+            encoding_used = encoding
+            out = []
+            while True:
+                next_line = readline()
+                if not next_line:
+                    return out
+                out.append(next_line)
+
+        counter = 0
+        def mock_readline():
+            nonlocal counter
+            counter += 1
+            if counter == 5:
+                return b''
+            return counter
+
+        orig_detect_encoding = tokenize_module.detect_encoding
+        orig__tokenize = tokenize_module._tokenize
+        tokenize_module.detect_encoding = mock_detect_encoding
+        tokenize_module._tokenize = mock__tokenize
+        try:
+            results = tokenize(mock_readline)
+            self.assertEqual(list(results), ['first', 'second', 1, 2, 3, 4])
+        finally:
+            tokenize_module.detect_encoding = orig_detect_encoding
+            tokenize_module._tokenize = orig__tokenize
+
+        self.assertEqual(encoding_used, encoding)
 
 
 __test__ = {"doctests" : doctests, 'decistmt': decistmt}
@@ -566,6 +821,10 @@ __test__ = {"doctests" : doctests, 'decistmt': decistmt}
 def test_main():
     from test import test_tokenize
     test_support.run_doctest(test_tokenize, True)
+    # Run all classes in one call: run_unittest raises on failure, so
+    # separate calls would skip the remaining classes after the first failure.
+    test_support.run_unittest(TestTokenizerAdheresToPep0263, Test_Tokenize,
+                              TestDetectEncoding, TestTokenize)
 
 if __name__ == "__main__":
     test_main()