From ed4ffd74042f5ac34a92514fdef8b61669e309ea Mon Sep 17 00:00:00 2001
From: Victor Stinner
Date: Mon, 2 Oct 2023 17:11:24 +0200
Subject: [3.12] gh-108303: Move tokenize-related data to Lib/test/tokenizedata
 (GH-109265) (#109677)

* gh-108303: Move tokenize-related data to Lib/test/tokenizedata (GH-109265)

(cherry picked from commit 1110c5bc828218086f6397ec05a9312fb73ea30a)

* gh-108303: Add `Lib/test/tokenizedata` to `TESTSUBDIRS` (#109314)

(cherry picked from commit 42ab2cbd7b5e76e919b70883ae683e789dbd913d)

---------

Co-authored-by: Nikita Sobolev
---
 .gitattributes                                     |   2 +-
 .pre-commit-config.yaml                            |   2 +-
 Lib/test/bad_coding.py                             |   1 -
 Lib/test/bad_coding2.py                            |   2 -
 Lib/test/badsyntax_3131.py                         |   2 -
 Lib/test/coding20731.py                            |   4 -
 Lib/test/test_py_compile.py                        |  16 +-
 Lib/test/test_source_encoding.py                   |   5 +-
 Lib/test/test_tarfile.py                           |  29 ++--
 Lib/test/test_tokenize.py                          |   7 +-
 Lib/test/test_tools/test_reindent.py               |   2 +-
 Lib/test/test_unicode_identifiers.py               |   2 +-
 ...tests-latin1-coding-cookie-and-utf8-bom-sig.txt |  13 --
 ...ests-no-coding-cookie-and-utf8-bom-sig-only.txt |  11 --
 ...ests-utf8-coding-cookie-and-no-utf8-bom-sig.txt |  13 --
 ...e_tests-utf8-coding-cookie-and-utf8-bom-sig.txt |  12 --
 Lib/test/tokenize_tests.txt                        | 189 ---------------------
 Lib/test/tokenizedata/__init__.py                  |   0
 Lib/test/tokenizedata/bad_coding.py                |   1 +
 Lib/test/tokenizedata/bad_coding2.py               |   2 +
 Lib/test/tokenizedata/badsyntax_3131.py            |   2 +
 Lib/test/tokenizedata/coding20731.py               |   4 +
 ...tests-latin1-coding-cookie-and-utf8-bom-sig.txt |  13 ++
 ...ests-no-coding-cookie-and-utf8-bom-sig-only.txt |  11 ++
 ...ests-utf8-coding-cookie-and-no-utf8-bom-sig.txt |  13 ++
 ...e_tests-utf8-coding-cookie-and-utf8-bom-sig.txt |  12 ++
 Lib/test/tokenizedata/tokenize_tests.txt           | 189 +++++++++++++++++++++
 Makefile.pre.in                                    |   1 +
 28 files changed, 288 insertions(+), 272 deletions(-)
 delete mode 100644 Lib/test/bad_coding.py
 delete mode 100644 Lib/test/bad_coding2.py
 delete mode 100644 Lib/test/badsyntax_3131.py
 delete mode 100644 Lib/test/coding20731.py
 delete mode 100644 Lib/test/tokenize_tests-latin1-coding-cookie-and-utf8-bom-sig.txt
 delete mode 100644 Lib/test/tokenize_tests-no-coding-cookie-and-utf8-bom-sig-only.txt
 delete mode 100644 Lib/test/tokenize_tests-utf8-coding-cookie-and-no-utf8-bom-sig.txt
 delete mode 100644 Lib/test/tokenize_tests-utf8-coding-cookie-and-utf8-bom-sig.txt
 delete mode 100644 Lib/test/tokenize_tests.txt
 create mode 100644 Lib/test/tokenizedata/__init__.py
 create mode 100644 Lib/test/tokenizedata/bad_coding.py
 create mode 100644 Lib/test/tokenizedata/bad_coding2.py
 create mode 100644 Lib/test/tokenizedata/badsyntax_3131.py
 create mode 100644 Lib/test/tokenizedata/coding20731.py
 create mode 100644 Lib/test/tokenizedata/tokenize_tests-latin1-coding-cookie-and-utf8-bom-sig.txt
 create mode 100644 Lib/test/tokenizedata/tokenize_tests-no-coding-cookie-and-utf8-bom-sig-only.txt
 create mode 100644 Lib/test/tokenizedata/tokenize_tests-utf8-coding-cookie-and-no-utf8-bom-sig.txt
 create mode 100644 Lib/test/tokenizedata/tokenize_tests-utf8-coding-cookie-and-utf8-bom-sig.txt
 create mode 100644 Lib/test/tokenizedata/tokenize_tests.txt

diff --git a/.gitattributes b/.gitattributes
index 4ed9506..2bfd4bf 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -25,7 +25,7 @@ PC/classicAppCompat.* binary
 [attr]noeol -text
 
 Lib/test/cjkencodings/* noeol
-Lib/test/coding20731.py noeol
+Lib/test/tokenizedata/coding20731.py noeol
 Lib/test/decimaltestdata/*.decTest noeol
 Lib/test/test_email/data/*.txt noeol
 Lib/test/test_importlib/resources/data01/* noeol
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 19f6a03..4c1fd20 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -15,7 +15,7 @@ repos:
       - id: check-yaml
       - id: end-of-file-fixer
         types: [python]
-        exclude: Lib/test/coding20731.py
+        exclude: Lib/test/tokenizedata/coding20731.py
       - id: trailing-whitespace
         types_or: [c, python, rst]
 
diff --git a/Lib/test/bad_coding.py b/Lib/test/bad_coding.py
deleted file mode 100644
index 971b0a8..0000000
--- a/Lib/test/bad_coding.py
+++ /dev/null
@@ -1 +0,0 @@
-# -*- coding: uft-8 -*-
diff --git a/Lib/test/bad_coding2.py b/Lib/test/bad_coding2.py
deleted file mode 100644
index bb2bb7e..0000000
--- a/Lib/test/bad_coding2.py
+++ /dev/null
@@ -1,2 +0,0 @@
-#coding: utf8
-print('我')
diff --git a/Lib/test/badsyntax_3131.py b/Lib/test/badsyntax_3131.py
deleted file mode 100644
index 901d374..0000000
--- a/Lib/test/badsyntax_3131.py
+++ /dev/null
@@ -1,2 +0,0 @@
-# -*- coding: utf-8 -*-
-€ = 2
diff --git a/Lib/test/coding20731.py b/Lib/test/coding20731.py
deleted file mode 100644
index b0e227a..0000000
--- a/Lib/test/coding20731.py
+++ /dev/null
@@ -1,4 +0,0 @@
-#coding:latin1
-
-
-
diff --git a/Lib/test/test_py_compile.py b/Lib/test/test_py_compile.py
index 5e0a44a..c4e6551 100644
--- a/Lib/test/test_py_compile.py
+++ b/Lib/test/test_py_compile.py
@@ -132,7 +132,9 @@ class PyCompileTestsBase:
         os.chmod(self.directory, mode.st_mode)
 
     def test_bad_coding(self):
-        bad_coding = os.path.join(os.path.dirname(__file__), 'bad_coding2.py')
+        bad_coding = os.path.join(os.path.dirname(__file__),
+                                  'tokenizedata',
+                                  'bad_coding2.py')
         with support.captured_stderr():
             self.assertIsNone(py_compile.compile(bad_coding, doraise=False))
         self.assertFalse(os.path.exists(
@@ -195,7 +197,9 @@ class PyCompileTestsBase:
         self.assertEqual(flags, 0b1)
 
     def test_quiet(self):
-        bad_coding = os.path.join(os.path.dirname(__file__), 'bad_coding2.py')
+        bad_coding = os.path.join(os.path.dirname(__file__),
+                                  'tokenizedata',
+                                  'bad_coding2.py')
         with support.captured_stderr() as stderr:
             self.assertIsNone(py_compile.compile(bad_coding, doraise=False, quiet=2))
             self.assertIsNone(py_compile.compile(bad_coding, doraise=True, quiet=2))
@@ -260,14 +264,18 @@ class PyCompileCLITestCase(unittest.TestCase):
         self.assertTrue(os.path.exists(self.cache_path))
 
     def test_bad_syntax(self):
-        bad_syntax = os.path.join(os.path.dirname(__file__), 'badsyntax_3131.py')
+        bad_syntax = os.path.join(os.path.dirname(__file__),
+                                  'tokenizedata',
+                                  'badsyntax_3131.py')
         rc, stdout, stderr = self.pycompilecmd_failure(bad_syntax)
         self.assertEqual(rc, 1)
         self.assertEqual(stdout, b'')
         self.assertIn(b'SyntaxError', stderr)
 
     def test_bad_syntax_with_quiet(self):
-        bad_syntax = os.path.join(os.path.dirname(__file__), 'badsyntax_3131.py')
+        bad_syntax = os.path.join(os.path.dirname(__file__),
+                                  'tokenizedata',
+                                  'badsyntax_3131.py')
         rc, stdout, stderr = self.pycompilecmd_failure('-q', bad_syntax)
         self.assertEqual(rc, 1)
         self.assertEqual(stdout, b'')
diff --git a/Lib/test/test_source_encoding.py b/Lib/test/test_source_encoding.py
index 72c2b47..2787137 100644
--- a/Lib/test/test_source_encoding.py
+++ b/Lib/test/test_source_encoding.py
@@ -68,6 +68,7 @@ class MiscSourceEncodingTest(unittest.TestCase):
     def test_20731(self):
         sub = subprocess.Popen([sys.executable,
                         os.path.join(os.path.dirname(__file__),
+                                     'tokenizedata',
                                      'coding20731.py')],
                         stderr=subprocess.PIPE)
         err = sub.communicate()[1]
@@ -100,10 +101,10 @@ class MiscSourceEncodingTest(unittest.TestCase):
         self.verify_bad_module(module_name)
 
     def verify_bad_module(self, module_name):
-        self.assertRaises(SyntaxError, __import__, 'test.' + module_name)
+        self.assertRaises(SyntaxError, __import__, 'test.tokenizedata.' + module_name)
 
         path = os.path.dirname(__file__)
-        filename = os.path.join(path, module_name + '.py')
+        filename = os.path.join(path, 'tokenizedata', module_name + '.py')
         with open(filename, "rb") as fp:
             bytes = fp.read()
         self.assertRaises(SyntaxError, compile, bytes, filename, 'exec')
diff --git a/Lib/test/test_tarfile.py b/Lib/test/test_tarfile.py
index 013c626..5d9714e 100644
--- a/Lib/test/test_tarfile.py
+++ b/Lib/test/test_tarfile.py
@@ -2564,16 +2564,17 @@ class CommandLineTest(unittest.TestCase):
         return script_helper.assert_python_failure('-m', 'tarfile', *args)
 
     def make_simple_tarfile(self, tar_name):
-        files = [support.findfile('tokenize_tests.txt'),
+        files = [support.findfile('tokenize_tests.txt',
+                                  subdir='tokenizedata'),
                  support.findfile('tokenize_tests-no-coding-cookie-'
-                                  'and-utf8-bom-sig-only.txt')]
+                                  'and-utf8-bom-sig-only.txt',
+                                  subdir='tokenizedata')]
         self.addCleanup(os_helper.unlink, tar_name)
         with tarfile.open(tar_name, 'w') as tf:
             for tardata in files:
                 tf.add(tardata, arcname=os.path.basename(tardata))
 
     def make_evil_tarfile(self, tar_name):
-        files = [support.findfile('tokenize_tests.txt')]
         self.addCleanup(os_helper.unlink, tar_name)
         with tarfile.open(tar_name, 'w') as tf:
             benign = tarfile.TarInfo('benign')
@@ -2654,9 +2655,11 @@ class CommandLineTest(unittest.TestCase):
         self.assertEqual(rc, 1)
 
     def test_create_command(self):
-        files = [support.findfile('tokenize_tests.txt'),
+        files = [support.findfile('tokenize_tests.txt',
+                                  subdir='tokenizedata'),
                  support.findfile('tokenize_tests-no-coding-cookie-'
-                                  'and-utf8-bom-sig-only.txt')]
+                                  'and-utf8-bom-sig-only.txt',
+                                  subdir='tokenizedata')]
         for opt in '-c', '--create':
             try:
                 out = self.tarfilecmd(opt, tmpname, *files)
@@ -2667,9 +2670,11 @@ class CommandLineTest(unittest.TestCase):
                 self.assertEqual(out, b'')
             finally:
                 os_helper.unlink(tmpname)
 
     def test_create_command_verbose(self):
-        files = [support.findfile('tokenize_tests.txt'),
+        files = [support.findfile('tokenize_tests.txt',
+                                  subdir='tokenizedata'),
                  support.findfile('tokenize_tests-no-coding-cookie-'
-                                  'and-utf8-bom-sig-only.txt')]
+                                  'and-utf8-bom-sig-only.txt',
+                                  subdir='tokenizedata')]
         for opt in '-v', '--verbose':
             try:
                 out = self.tarfilecmd(opt, '-c', tmpname, *files,
@@ -2681,7 +2686,7 @@ class CommandLineTest(unittest.TestCase):
                 os_helper.unlink(tmpname)
 
     def test_create_command_dotless_filename(self):
-        files = [support.findfile('tokenize_tests.txt')]
+        files = [support.findfile('tokenize_tests.txt', subdir='tokenizedata')]
         try:
             out = self.tarfilecmd('-c', dotlessname, *files)
             self.assertEqual(out, b'')
@@ -2692,7 +2697,7 @@ class CommandLineTest(unittest.TestCase):
 
     def test_create_command_dot_started_filename(self):
         tar_name = os.path.join(TEMPDIR, ".testtar")
-        files = [support.findfile('tokenize_tests.txt')]
+        files = [support.findfile('tokenize_tests.txt', subdir='tokenizedata')]
         try:
             out = self.tarfilecmd('-c', tar_name, *files)
             self.assertEqual(out, b'')
@@ -2702,9 +2707,11 @@ class CommandLineTest(unittest.TestCase):
             os_helper.unlink(tar_name)
 
     def test_create_command_compressed(self):
-        files = [support.findfile('tokenize_tests.txt'),
+        files = [support.findfile('tokenize_tests.txt',
+                                  subdir='tokenizedata'),
                  support.findfile('tokenize_tests-no-coding-cookie-'
-                                  'and-utf8-bom-sig-only.txt')]
+                                  'and-utf8-bom-sig-only.txt',
+                                  subdir='tokenizedata')]
         for filetype in (GzipTest, Bz2Test, LzmaTest):
             if not filetype.open:
                 continue
diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index c320478..40680f0 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -1198,7 +1198,7 @@ class TestTokenizerAdheresToPep0263(TestCase):
     """
 
     def _testFile(self, filename):
-        path = os.path.join(os.path.dirname(__file__), filename)
+        path = os.path.join(os.path.dirname(__file__), 'tokenizedata', filename)
         with open(path, 'rb') as f:
             TestRoundtrip.check_roundtrip(self, f)
 
@@ -1791,7 +1791,7 @@ class TestRoundtrip(TestCase):
         self.check_roundtrip("if x == 1 : \n"
                              "  print(x)\n")
 
-        fn = support.findfile("tokenize_tests.txt")
+        fn = support.findfile("tokenize_tests.txt", subdir="tokenizedata")
         with open(fn, 'rb') as f:
             self.check_roundtrip(f)
         self.check_roundtrip("if x == 1:\n"
@@ -1846,8 +1846,7 @@ class TestRoundtrip(TestCase):
         # pass the '-ucpu' option to process the full directory.
 
         import glob, random
-        fn = support.findfile("tokenize_tests.txt")
-        tempdir = os.path.dirname(fn) or os.curdir
+        tempdir = os.path.dirname(__file__) or os.curdir
         testfiles = glob.glob(os.path.join(glob.escape(tempdir), "test*.py"))
 
         # Tokenize is broken on test_pep3131.py because regular expressions are
diff --git a/Lib/test/test_tools/test_reindent.py b/Lib/test/test_tools/test_reindent.py
index 3b0c793..64e31c2 100644
--- a/Lib/test/test_tools/test_reindent.py
+++ b/Lib/test/test_tools/test_reindent.py
@@ -25,7 +25,7 @@ class ReindentTests(unittest.TestCase):
         self.assertGreater(err, b'')
 
     def test_reindent_file_with_bad_encoding(self):
-        bad_coding_path = findfile('bad_coding.py')
+        bad_coding_path = findfile('bad_coding.py', subdir='tokenizedata')
         rc, out, err = assert_python_ok(self.script, '-r', bad_coding_path)
         self.assertEqual(out, b'')
         self.assertNotEqual(err, b'')
diff --git a/Lib/test/test_unicode_identifiers.py b/Lib/test/test_unicode_identifiers.py
index 5b9ced5..63c6c05 100644
--- a/Lib/test/test_unicode_identifiers.py
+++ b/Lib/test/test_unicode_identifiers.py
@@ -19,7 +19,7 @@ class PEP3131Test(unittest.TestCase):
 
     def test_invalid(self):
         try:
-            from test import badsyntax_3131
+            from test.tokenizedata import badsyntax_3131
         except SyntaxError as err:
             self.assertEqual(str(err),
                              "invalid character '€' (U+20AC) (badsyntax_3131.py, line 2)")
diff --git a/Lib/test/tokenize_tests-latin1-coding-cookie-and-utf8-bom-sig.txt b/Lib/test/tokenize_tests-latin1-coding-cookie-and-utf8-bom-sig.txt
deleted file mode 100644
index 1b5335b..0000000
--- a/Lib/test/tokenize_tests-latin1-coding-cookie-and-utf8-bom-sig.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-# -*- coding: latin1 -*-
-# IMPORTANT: this file has the utf-8 BOM signature '\xef\xbb\xbf'
-# at the start of it. Make sure this is preserved if any changes
-# are made! Also note that the coding cookie above conflicts with
-# the presence of a utf-8 BOM signature -- this is intended.
-
-# Arbitrary encoded utf-8 text (stolen from test_doctest2.py).
-x = 'ЉЊЈЁЂ'
-def y():
-    """
-    And again in a comment. ЉЊЈЁЂ
-    """
-    pass
diff --git a/Lib/test/tokenize_tests-no-coding-cookie-and-utf8-bom-sig-only.txt b/Lib/test/tokenize_tests-no-coding-cookie-and-utf8-bom-sig-only.txt
deleted file mode 100644
index 23fd216..0000000
--- a/Lib/test/tokenize_tests-no-coding-cookie-and-utf8-bom-sig-only.txt
+++ /dev/null
@@ -1,11 +0,0 @@
-# IMPORTANT: this file has the utf-8 BOM signature '\xef\xbb\xbf'
-# at the start of it. Make sure this is preserved if any changes
-# are made!
-
-# Arbitrary encoded utf-8 text (stolen from test_doctest2.py).
-x = 'ЉЊЈЁЂ'
-def y():
-    """
-    And again in a comment. ЉЊЈЁЂ
-    """
-    pass
diff --git a/Lib/test/tokenize_tests-utf8-coding-cookie-and-no-utf8-bom-sig.txt b/Lib/test/tokenize_tests-utf8-coding-cookie-and-no-utf8-bom-sig.txt
deleted file mode 100644
index 04561e4..0000000
--- a/Lib/test/tokenize_tests-utf8-coding-cookie-and-no-utf8-bom-sig.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-# -*- coding: utf-8 -*-
-# IMPORTANT: unlike the other test_tokenize-*.txt files, this file
-# does NOT have the utf-8 BOM signature '\xef\xbb\xbf' at the start
-# of it. Make sure this is not added inadvertently by your editor
-# if any changes are made to this file!
-
-# Arbitrary encoded utf-8 text (stolen from test_doctest2.py).
-x = 'ЉЊЈЁЂ'
-def y():
-    """
-    And again in a comment. ЉЊЈЁЂ
-    """
-    pass
diff --git a/Lib/test/tokenize_tests-utf8-coding-cookie-and-utf8-bom-sig.txt b/Lib/test/tokenize_tests-utf8-coding-cookie-and-utf8-bom-sig.txt
deleted file mode 100644
index 4b20ff6..0000000
--- a/Lib/test/tokenize_tests-utf8-coding-cookie-and-utf8-bom-sig.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-# -*- coding: utf-8 -*-
-# IMPORTANT: this file has the utf-8 BOM signature '\xef\xbb\xbf'
-# at the start of it. Make sure this is preserved if any changes
-# are made!
-
-# Arbitrary encoded utf-8 text (stolen from test_doctest2.py).
-x = 'ЉЊЈЁЂ'
-def y():
-    """
-    And again in a comment. ЉЊЈЁЂ
-    """
-    pass
diff --git a/Lib/test/tokenize_tests.txt b/Lib/test/tokenize_tests.txt
deleted file mode 100644
index c4f5a58..0000000
--- a/Lib/test/tokenize_tests.txt
+++ /dev/null
@@ -1,189 +0,0 @@
-# Tests for the 'tokenize' module.
-# Large bits stolen from test_grammar.py.
-
-# Comments
-"#"
-#'
-#"
-#\
-       #
-    # abc
-'''#
-#'''
-
-x = 1  #
-
-# Balancing continuation
-
-a = (3, 4,
-  5, 6)
-y = [3, 4,
-  5]
-z = {'a':5,
-  'b':6}
-x = (len(repr(y)) + 5*x - a[
-   3 ]
-   - x + len({
-   }
-    )
-  )
-
-# Backslash means line continuation:
-x = 1 \
-+ 1
-
-# Backslash does not means continuation in comments :\
-x = 0
-
-# Ordinary integers
-0xff != 255
-0o377 != 255
-2147483647 != 0o17777777777
--2147483647-1 != 0o20000000000
-0o37777777777 != -1
-0xffffffff != -1; 0o37777777777 != -1; -0o1234567 == 0O001234567; 0b10101 == 0B00010101
-
-# Long integers
-x = 0
-x = 0
-x = 0xffffffffffffffff
-x = 0xffffffffffffffff
-x = 0o77777777777777777
-x = 0B11101010111111111
-x = 123456789012345678901234567890
-x = 123456789012345678901234567890
-
-# Floating-point numbers
-x = 3.14
-x = 314.
-x = 0.314
-# XXX x = 000.314
-x = .314
-x = 3e14
-x = 3E14
-x = 3e-14
-x = 3e+14
-x = 3.e14
-x = .3e14
-x = 3.1e4
-
-# String literals
-x = ''; y = "";
-x = '\''; y = "'";
-x = '"'; y = "\"";
-x = "doesn't \"shrink\" does it"
-y = 'doesn\'t "shrink" does it'
-x = "does \"shrink\" doesn't it"
-y = 'does "shrink" doesn\'t it'
-x = """
-The "quick"
-brown fox
-jumps over
-the 'lazy' dog.
-"""
-y = '\nThe "quick"\nbrown fox\njumps over\nthe \'lazy\' dog.\n'
-y = '''
-The "quick"
-brown fox
-jumps over
-the 'lazy' dog.
-''';
-y = "\n\
-The \"quick\"\n\
-brown fox\n\
-jumps over\n\
-the 'lazy' dog.\n\
-";
-y = '\n\
-The \"quick\"\n\
-brown fox\n\
-jumps over\n\
-the \'lazy\' dog.\n\
-';
-x = r'\\' + R'\\'
-x = r'\'' + ''
-y = r'''
-foo bar \\
-baz''' + R'''
-foo'''
-y = r"""foo
-bar \\ baz
-""" + R'''spam
-'''
-x = b'abc' + B'ABC'
-y = b"abc" + B"ABC"
-x = br'abc' + Br'ABC' + bR'ABC' + BR'ABC'
-y = br"abc" + Br"ABC" + bR"ABC" + BR"ABC"
-x = rb'abc' + rB'ABC' + Rb'ABC' + RB'ABC'
-y = rb"abc" + rB"ABC" + Rb"ABC" + RB"ABC"
-x = br'\\' + BR'\\'
-x = rb'\\' + RB'\\'
-x = br'\'' + ''
-x = rb'\'' + ''
-y = br'''
-foo bar \\
-baz''' + BR'''
-foo'''
-y = Br"""foo
-bar \\ baz
-""" + bR'''spam
-'''
-y = rB"""foo
-bar \\ baz
-""" + Rb'''spam
-'''
-
-# Indentation
-if 1:
-    x = 2
-if 1:
-        x = 2
-if 1:
-    while 0:
-     if 0:
-           x = 2
-     x = 2
-if 0:
-  if 2:
-   while 0:
-        if 1:
-          x = 2
-
-# Operators
-
-def d22(a, b, c=1, d=2): pass
-def d01v(a=1, *restt, **restd): pass
-
-(x, y) != ({'a':1}, {'b':2})
-
-# comparison
-if 1 < 1 > 1 == 1 >= 1 <= 1 != 1 != 1 in 1 not in 1 is 1 is not 1: pass
-
-# binary
-x = 1 & 1
-x = 1 ^ 1
-x = 1 | 1
-
-# shift
-x = 1 << 1 >> 1
-
-# additive
-x = 1 - 1 + 1 - 1 + 1
-
-# multiplicative
-x = 1 / 1 * 1 % 1
-
-# unary
-x = ~1 ^ 1 & 1 | 1 & 1 ^ -1
-x = -1*1/1 + 1*1 - ---1*1
-
-# selector
-import sys, time
-x = sys.modules['time'].time()
-
-@staticmethod
-def foo(): pass
-
-@staticmethod
-def foo(x:1)->1: pass
-
diff --git a/Lib/test/tokenizedata/__init__.py b/Lib/test/tokenizedata/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/Lib/test/tokenizedata/bad_coding.py b/Lib/test/tokenizedata/bad_coding.py
new file mode 100644
index 0000000..971b0a8
--- /dev/null
+++ b/Lib/test/tokenizedata/bad_coding.py
@@ -0,0 +1 @@
+# -*- coding: uft-8 -*-
diff --git a/Lib/test/tokenizedata/bad_coding2.py b/Lib/test/tokenizedata/bad_coding2.py
new file mode 100644
index 0000000..bb2bb7e
--- /dev/null
+++ b/Lib/test/tokenizedata/bad_coding2.py
@@ -0,0 +1,2 @@
+#coding: utf8
+print('我')
diff --git a/Lib/test/tokenizedata/badsyntax_3131.py b/Lib/test/tokenizedata/badsyntax_3131.py
new file mode 100644
index 0000000..901d374
--- /dev/null
+++ b/Lib/test/tokenizedata/badsyntax_3131.py
@@ -0,0 +1,2 @@
+# -*- coding: utf-8 -*-
+€ = 2
diff --git a/Lib/test/tokenizedata/coding20731.py b/Lib/test/tokenizedata/coding20731.py
new file mode 100644
index 0000000..b0e227a
--- /dev/null
+++ b/Lib/test/tokenizedata/coding20731.py
@@ -0,0 +1,4 @@
+#coding:latin1
+
+
+
diff --git a/Lib/test/tokenizedata/tokenize_tests-latin1-coding-cookie-and-utf8-bom-sig.txt b/Lib/test/tokenizedata/tokenize_tests-latin1-coding-cookie-and-utf8-bom-sig.txt
new file mode 100644
index 0000000..1b5335b
--- /dev/null
+++ b/Lib/test/tokenizedata/tokenize_tests-latin1-coding-cookie-and-utf8-bom-sig.txt
@@ -0,0 +1,13 @@
+# -*- coding: latin1 -*-
+# IMPORTANT: this file has the utf-8 BOM signature '\xef\xbb\xbf'
+# at the start of it. Make sure this is preserved if any changes
+# are made! Also note that the coding cookie above conflicts with
+# the presence of a utf-8 BOM signature -- this is intended.
+
+# Arbitrary encoded utf-8 text (stolen from test_doctest2.py).
+x = 'ЉЊЈЁЂ'
+def y():
+    """
+    And again in a comment. ЉЊЈЁЂ
+    """
+    pass
diff --git a/Lib/test/tokenizedata/tokenize_tests-no-coding-cookie-and-utf8-bom-sig-only.txt b/Lib/test/tokenizedata/tokenize_tests-no-coding-cookie-and-utf8-bom-sig-only.txt
new file mode 100644
index 0000000..23fd216
--- /dev/null
+++ b/Lib/test/tokenizedata/tokenize_tests-no-coding-cookie-and-utf8-bom-sig-only.txt
@@ -0,0 +1,11 @@
+# IMPORTANT: this file has the utf-8 BOM signature '\xef\xbb\xbf'
+# at the start of it. Make sure this is preserved if any changes
+# are made!
+
+# Arbitrary encoded utf-8 text (stolen from test_doctest2.py).
+x = 'ЉЊЈЁЂ'
+def y():
+    """
+    And again in a comment. ЉЊЈЁЂ
+    """
+    pass
diff --git a/Lib/test/tokenizedata/tokenize_tests-utf8-coding-cookie-and-no-utf8-bom-sig.txt b/Lib/test/tokenizedata/tokenize_tests-utf8-coding-cookie-and-no-utf8-bom-sig.txt
new file mode 100644
index 0000000..04561e4
--- /dev/null
+++ b/Lib/test/tokenizedata/tokenize_tests-utf8-coding-cookie-and-no-utf8-bom-sig.txt
@@ -0,0 +1,13 @@
+# -*- coding: utf-8 -*-
+# IMPORTANT: unlike the other test_tokenize-*.txt files, this file
+# does NOT have the utf-8 BOM signature '\xef\xbb\xbf' at the start
+# of it. Make sure this is not added inadvertently by your editor
+# if any changes are made to this file!
+
+# Arbitrary encoded utf-8 text (stolen from test_doctest2.py).
+x = 'ЉЊЈЁЂ'
+def y():
+    """
+    And again in a comment. ЉЊЈЁЂ
+    """
+    pass
diff --git a/Lib/test/tokenizedata/tokenize_tests-utf8-coding-cookie-and-utf8-bom-sig.txt b/Lib/test/tokenizedata/tokenize_tests-utf8-coding-cookie-and-utf8-bom-sig.txt
new file mode 100644
index 0000000..4b20ff6
--- /dev/null
+++ b/Lib/test/tokenizedata/tokenize_tests-utf8-coding-cookie-and-utf8-bom-sig.txt
@@ -0,0 +1,12 @@
+# -*- coding: utf-8 -*-
+# IMPORTANT: this file has the utf-8 BOM signature '\xef\xbb\xbf'
+# at the start of it. Make sure this is preserved if any changes
+# are made!
+
+# Arbitrary encoded utf-8 text (stolen from test_doctest2.py).
+x = 'ЉЊЈЁЂ'
+def y():
+    """
+    And again in a comment. ЉЊЈЁЂ
+    """
+    pass
diff --git a/Lib/test/tokenizedata/tokenize_tests.txt b/Lib/test/tokenizedata/tokenize_tests.txt
new file mode 100644
index 0000000..c4f5a58
--- /dev/null
+++ b/Lib/test/tokenizedata/tokenize_tests.txt
@@ -0,0 +1,189 @@
+# Tests for the 'tokenize' module.
+# Large bits stolen from test_grammar.py.
+
+# Comments
+"#"
+#'
+#"
+#\
+       #
+    # abc
+'''#
+#'''
+
+x = 1  #
+
+# Balancing continuation
+
+a = (3, 4,
+  5, 6)
+y = [3, 4,
+  5]
+z = {'a':5,
+  'b':6}
+x = (len(repr(y)) + 5*x - a[
+   3 ]
+   - x + len({
+   }
+    )
+  )
+
+# Backslash means line continuation:
+x = 1 \
++ 1
+
+# Backslash does not means continuation in comments :\
+x = 0
+
+# Ordinary integers
+0xff != 255
+0o377 != 255
+2147483647 != 0o17777777777
+-2147483647-1 != 0o20000000000
+0o37777777777 != -1
+0xffffffff != -1; 0o37777777777 != -1; -0o1234567 == 0O001234567; 0b10101 == 0B00010101
+
+# Long integers
+x = 0
+x = 0
+x = 0xffffffffffffffff
+x = 0xffffffffffffffff
+x = 0o77777777777777777
+x = 0B11101010111111111
+x = 123456789012345678901234567890
+x = 123456789012345678901234567890
+
+# Floating-point numbers
+x = 3.14
+x = 314.
+x = 0.314
+# XXX x = 000.314
+x = .314
+x = 3e14
+x = 3E14
+x = 3e-14
+x = 3e+14
+x = 3.e14
+x = .3e14
+x = 3.1e4
+
+# String literals
+x = ''; y = "";
+x = '\''; y = "'";
+x = '"'; y = "\"";
+x = "doesn't \"shrink\" does it"
+y = 'doesn\'t "shrink" does it'
+x = "does \"shrink\" doesn't it"
+y = 'does "shrink" doesn\'t it'
+x = """
+The "quick"
+brown fox
+jumps over
+the 'lazy' dog.
+""" +y = '\nThe "quick"\nbrown fox\njumps over\nthe \'lazy\' dog.\n' +y = ''' +The "quick" +brown fox +jumps over +the 'lazy' dog. +'''; +y = "\n\ +The \"quick\"\n\ +brown fox\n\ +jumps over\n\ +the 'lazy' dog.\n\ +"; +y = '\n\ +The \"quick\"\n\ +brown fox\n\ +jumps over\n\ +the \'lazy\' dog.\n\ +'; +x = r'\\' + R'\\' +x = r'\'' + '' +y = r''' +foo bar \\ +baz''' + R''' +foo''' +y = r"""foo +bar \\ baz +""" + R'''spam +''' +x = b'abc' + B'ABC' +y = b"abc" + B"ABC" +x = br'abc' + Br'ABC' + bR'ABC' + BR'ABC' +y = br"abc" + Br"ABC" + bR"ABC" + BR"ABC" +x = rb'abc' + rB'ABC' + Rb'ABC' + RB'ABC' +y = rb"abc" + rB"ABC" + Rb"ABC" + RB"ABC" +x = br'\\' + BR'\\' +x = rb'\\' + RB'\\' +x = br'\'' + '' +x = rb'\'' + '' +y = br''' +foo bar \\ +baz''' + BR''' +foo''' +y = Br"""foo +bar \\ baz +""" + bR'''spam +''' +y = rB"""foo +bar \\ baz +""" + Rb'''spam +''' + +# Indentation +if 1: + x = 2 +if 1: + x = 2 +if 1: + while 0: + if 0: + x = 2 + x = 2 +if 0: + if 2: + while 0: + if 1: + x = 2 + +# Operators + +def d22(a, b, c=1, d=2): pass +def d01v(a=1, *restt, **restd): pass + +(x, y) != ({'a':1}, {'b':2}) + +# comparison +if 1 < 1 > 1 == 1 >= 1 <= 1 != 1 != 1 in 1 not in 1 is 1 is not 1: pass + +# binary +x = 1 & 1 +x = 1 ^ 1 +x = 1 | 1 + +# shift +x = 1 << 1 >> 1 + +# additive +x = 1 - 1 + 1 - 1 + 1 + +# multiplicative +x = 1 / 1 * 1 % 1 + +# unary +x = ~1 ^ 1 & 1 | 1 & 1 ^ -1 +x = -1*1/1 + 1*1 - ---1*1 + +# selector +import sys, time +x = sys.modules['time'].time() + +@staticmethod +def foo(): pass + +@staticmethod +def foo(x:1)->1: pass + diff --git a/Makefile.pre.in b/Makefile.pre.in index cf054c1..7418ddf 100644 --- a/Makefile.pre.in +++ b/Makefile.pre.in @@ -2226,6 +2226,7 @@ TESTSUBDIRS= idlelib/idle_test \ test/test_zipfile/_path \ test/test_zoneinfo \ test/test_zoneinfo/data \ + test/tokenizedata \ test/tracedmodules \ test/typinganndata \ test/xmltestdata \ -- cgit v0.12