Diffstat (limited to 'Lib/test/test_tokenize.py')
-rw-r--r--  Lib/test/test_tokenize.py  117
1 file changed, 113 insertions(+), 4 deletions(-)
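
The patch below rewrites the start of test_tokenize as a doctest: it adds two
helpers, dump_tokens() and roundtrip(), plus a periodic 'still working'
message for long runs. As background for the fixed-width token table the
doctests expect, here is a minimal standalone sketch of the same tabulation
idea (Python 2, matching the patched file; show_tokens is an illustrative
name, not code from this patch):

    # Sketch only: tabulate tokens the way the patch's dump_tokens() does.
    from cStringIO import StringIO
    from tokenize import generate_tokens, tok_name, ENDMARKER

    def show_tokens(source):                  # illustrative helper name
        f = StringIO(source)
        for type, token, start, end, line in generate_tokens(f.readline):
            if type == ENDMARKER:             # the doctests omit ENDMARKER
                break
            # %-10.10s pads/truncates the token name to 10 columns; %-13.13r
            # formats the token through repr(), so a newline shows as '\n'
            # and the columns stay aligned in doctest output.
            print "%-10.10s %-13.13r %s %s" % (tok_name[type], token, start, end)

    show_tokens("1 + 1")

Run on "1 + 1", this prints the same three aligned rows as the first doctest
in the new module docstring.
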
diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index b064967..be6c18d 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -1,9 +1,93 @@
-import os, glob, random
+"""Tests for the tokenize module.
+
+The tests were originally written in the old Python style, where the
+test output was compared to a golden file. This docstring represents
+the first steps towards rewriting the entire test as a doctest.
+
+The tests can be really simple. Given a small fragment of source
+code, print out a table with the tokens.  The ENDMARKER is omitted for
+brevity.
+
+>>> dump_tokens("1 + 1")
+NUMBER     '1'           (1, 0) (1, 1)
+OP         '+'           (1, 2) (1, 3)
+NUMBER     '1'           (1, 4) (1, 5)
+
+A comment generates a token here, unlike in the parser module. The
+comment token is followed by an NL or a NEWLINE token, depending on
+whether the line contains the completion of a statement.
+
+>>> dump_tokens("if False:\\n"
+...             "    # NL\\n"
+...             "    True = False # NEWLINE\\n")
+NAME       'if'          (1, 0) (1, 2)
+NAME       'False'       (1, 3) (1, 8)
+OP         ':'           (1, 8) (1, 9)
+NEWLINE    '\\n'          (1, 9) (1, 10)
+COMMENT    '# NL'        (2, 4) (2, 8)
+NL         '\\n'          (2, 8) (2, 9)
+INDENT     '    '        (3, 0) (3, 4)
+NAME       'True'        (3, 4) (3, 8)
+OP         '='           (3, 9) (3, 10)
+NAME       'False'       (3, 11) (3, 16)
+COMMENT    '# NEWLINE'   (3, 17) (3, 26)
+NEWLINE    '\\n'          (3, 26) (3, 27)
+DEDENT     ''            (4, 0) (4, 0)
+
+
+There will be a bunch more tests of specific source patterns.
+
+The tokenize module also defines an untokenize function that should
+regenerate the original program text from the tokens.
+
+There are some standard formatting practices that are easy to get right.
+
+>>> roundtrip("if x == 1:\\n"
+...           "    print x\\n")
+if x == 1:
+    print x
+
+Some people use different formatting conventions, which makes
+untokenize a little trickier.  This test involves trailing whitespace
+after the colon; we use hex escapes to make the two trailing blanks
+apparent in the expected output.
+
+>>> roundtrip("if x == 1 :  \\n"
+...           "    print x\\n")
+if x == 1 :\x20\x20
+    print x
+
+Comments need to go in the right place.
+
+>>> roundtrip("if x == 1:\\n"
+...           "    # A comment by itself.\\n"
+...           "    print x # Comment here, too.\\n"
+...           "    # Another comment.\\n"
+...           "after_if = True\\n")
+if x == 1:
+    # A comment by itself.
+    print x # Comment here, too.
+    # Another comment.
+after_if = True
+
+>>> roundtrip("if (x # The comments need to go in the right place\\n"
+...           "    == 1):\\n"
+...           "    print 'x == 1'\\n")
+if (x # The comments need to go in the right place
+    == 1):
+    print 'x == 1'
+
+"""
+
+import os, glob, random, time, sys
from cStringIO import StringIO
from test.test_support import (verbose, findfile, is_resource_enabled,
                               TestFailed)
-from tokenize import (tokenize, generate_tokens, untokenize,
-                      NUMBER, NAME, OP, STRING)
+from tokenize import (tokenize, generate_tokens, untokenize, tok_name,
+                      ENDMARKER, NUMBER, NAME, OP, STRING, COMMENT)
+
+# How much time in seconds can pass before we print a 'Still working' message.
+_PRINT_WORKING_MSG_INTERVAL = 5 * 60
# Test roundtrip for `untokenize`. `f` is a file path. The source code in f
# is tokenized, converted back to source code via tokenize.untokenize(),
@@ -24,6 +108,23 @@ def test_roundtrip(f):
    if t1 != t2:
        raise TestFailed("untokenize() roundtrip failed for %r" % f)
+def dump_tokens(s):
+    """Print out the tokens in s in a table format.
+
+    The ENDMARKER is omitted.
+    """
+    f = StringIO(s)
+    for type, token, start, end, line in generate_tokens(f.readline):
+        if type == ENDMARKER:
+            break
+        type = tok_name[type]
+        print "%(type)-10.10s %(token)-13.13r %(start)s %(end)s" % locals()
+
+def roundtrip(s):
+    f = StringIO(s)
+    source = untokenize(generate_tokens(f.readline))
+    print source,
+
# This is an example from the docs, set up as a doctest.
def decistmt(s):
"""Substitute Decimals for floats in a string of statements.
@@ -66,6 +167,8 @@ def test_main():
    if verbose:
        print 'starting...'
+    next_time = time.time() + _PRINT_WORKING_MSG_INTERVAL
+
    # This displays the tokenization of tokenize_tests.py to stdout, and
    # regrtest.py checks that this equals the expected output (in the
    # test/output/ directory).
@@ -85,6 +188,12 @@ def test_main():
        testfiles = random.sample(testfiles, 10)
    for f in testfiles:
+        # Print a 'still working' message, since this test can be really slow.
+        if next_time <= time.time():
+            next_time = time.time() + _PRINT_WORKING_MSG_INTERVAL
+            print >>sys.__stdout__, ' test_main still working, be patient...'
+            sys.__stdout__.flush()
+
        test_roundtrip(f)
    # Test detection of IndentationError.
@@ -105,7 +214,7 @@ def foo():
    # Run the doctests in this module.
    from test import test_tokenize  # i.e., this module
    from test.test_support import run_doctest
-    run_doctest(test_tokenize)
+    run_doctest(test_tokenize, verbose)
    if verbose:
        print 'finished'
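
For reference, the round-trip property that both the roundtrip() helper and
test_roundtrip() depend on: when untokenize() receives the complete 5-tuples
produced by generate_tokens(), it can reconstruct the source text exactly,
trailing blanks included, because the start/end positions tell it how much
whitespace to re-insert. A minimal sketch of that invariant (Python 2;
check_roundtrip is an illustrative name, and unlike the patch's
test_roundtrip() it compares text rather than token streams):

    # Sketch only: assert that untokenize() inverts generate_tokens().
    from cStringIO import StringIO
    from tokenize import generate_tokens, untokenize

    def check_roundtrip(source):              # illustrative helper name
        f = StringIO(source)
        tokens = list(generate_tokens(f.readline))
        # Full (type, string, start, end, line) tuples let untokenize()
        # restore the original layout, including trailing whitespace.
        assert untokenize(tokens) == source

    check_roundtrip("if x == 1 :  \n    print x\n")

This is the same invariant exercised by the \x20\x20 doctest above, where the
two trailing blanks after the colon survive the round trip.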