Diffstat (limited to 'Lib/tokenize.py')
-rw-r--r--  Lib/tokenize.py  225
1 file changed, 173 insertions(+), 52 deletions(-)
diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index 29c9e29..cbf91ef 100644
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -45,6 +45,51 @@ tok_name[NL] = 'NL'
 ENCODING = N_TOKENS + 2
 tok_name[ENCODING] = 'ENCODING'
 N_TOKENS += 3
+EXACT_TOKEN_TYPES = {
+    '(':   LPAR,
+    ')':   RPAR,
+    '[':   LSQB,
+    ']':   RSQB,
+    ':':   COLON,
+    ',':   COMMA,
+    ';':   SEMI,
+    '+':   PLUS,
+    '-':   MINUS,
+    '*':   STAR,
+    '/':   SLASH,
+    '|':   VBAR,
+    '&':   AMPER,
+    '<':   LESS,
+    '>':   GREATER,
+    '=':   EQUAL,
+    '.':   DOT,
+    '%':   PERCENT,
+    '{':   LBRACE,
+    '}':   RBRACE,
+    '==':  EQEQUAL,
+    '!=':  NOTEQUAL,
+    '<=':  LESSEQUAL,
+    '>=':  GREATEREQUAL,
+    '~':   TILDE,
+    '^':   CIRCUMFLEX,
+    '<<':  LEFTSHIFT,
+    '>>':  RIGHTSHIFT,
+    '**':  DOUBLESTAR,
+    '+=':  PLUSEQUAL,
+    '-=':  MINEQUAL,
+    '*=':  STAREQUAL,
+    '/=':  SLASHEQUAL,
+    '%=':  PERCENTEQUAL,
+    '&=':  AMPEREQUAL,
+    '|=':  VBAREQUAL,
+    '^=':  CIRCUMFLEXEQUAL,
+    '<<=': LEFTSHIFTEQUAL,
+    '>>=': RIGHTSHIFTEQUAL,
+    '**=': DOUBLESTAREQUAL,
+    '//':  DOUBLESLASH,
+    '//=': DOUBLESLASHEQUAL,
+    '@':   AT
+}
 
 class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
     def __repr__(self):
@@ -52,6 +97,13 @@ class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line'
         return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' %
                 self._replace(type=annotated_type))
 
+    @property
+    def exact_type(self):
+        if self.type == OP and self.string in EXACT_TOKEN_TYPES:
+            return EXACT_TOKEN_TYPES[self.string]
+        else:
+            return self.type
+
 def group(*choices): return '(' + '|'.join(choices) + ')'
 def any(*choices): return group(*choices) + '*'
 def maybe(*choices): return group(*choices) + '?'
@@ -75,6 +127,8 @@ Floatnumber = group(Pointfloat, Expfloat)
 Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]')
 Number = group(Imagnumber, Floatnumber, Intnumber)
 
+StringPrefix = r'(?:[bB][rR]?|[rR][bB]?|[uU])?'
+
 # Tail end of ' string.
 Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
 # Tail end of " string.
@@ -83,10 +137,10 @@ Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
 Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
 # Tail end of """ string.
 Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
-Triple = group("[bB]?[rR]?'''", '[bB]?[rR]?"""')
+Triple = group(StringPrefix + "'''", StringPrefix + '"""')
 # Single-line ' or " string.
-String = group(r"[bB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
-               r'[bB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
+String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
+               StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
 
 # Because of leftmost-then-longest match semantics, be sure to put the
 # longest operators first (e.g., if = came before ==, == would get
@@ -104,9 +158,9 @@ PlainToken = group(Number, Funny, String, Name)
 Token = Ignore + PlainToken
 
 # First (or only) line of ' or " string.
-ContStr = group(r"[bB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
+ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                 group("'", r'\\\r?\n'),
-                r'[bB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
+                StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                 group('"', r'\\\r?\n'))
 PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
 PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
@@ -114,37 +168,49 @@ PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
 def _compile(expr):
     return re.compile(expr, re.UNICODE)
 
-tokenprog, pseudoprog, single3prog, double3prog = map(
-    _compile, (Token, PseudoToken, Single3, Double3))
-endprogs = {"'": _compile(Single), '"': _compile(Double),
-            "'''": single3prog, '"""': double3prog,
-            "r'''": single3prog, 'r"""': double3prog,
-            "b'''": single3prog, 'b"""': double3prog,
-            "br'''": single3prog, 'br"""': double3prog,
-            "R'''": single3prog, 'R"""': double3prog,
-            "B'''": single3prog, 'B"""': double3prog,
-            "bR'''": single3prog, 'bR"""': double3prog,
-            "Br'''": single3prog, 'Br"""': double3prog,
-            "BR'''": single3prog, 'BR"""': double3prog,
-            'r': None, 'R': None, 'b': None, 'B': None}
+endpats = {"'": Single, '"': Double,
+           "'''": Single3, '"""': Double3,
+           "r'''": Single3, 'r"""': Double3,
+           "b'''": Single3, 'b"""': Double3,
+           "R'''": Single3, 'R"""': Double3,
+           "B'''": Single3, 'B"""': Double3,
+           "br'''": Single3, 'br"""': Double3,
+           "bR'''": Single3, 'bR"""': Double3,
+           "Br'''": Single3, 'Br"""': Double3,
+           "BR'''": Single3, 'BR"""': Double3,
+           "rb'''": Single3, 'rb"""': Double3,
+           "Rb'''": Single3, 'Rb"""': Double3,
+           "rB'''": Single3, 'rB"""': Double3,
+           "RB'''": Single3, 'RB"""': Double3,
+           "u'''": Single3, 'u"""': Double3,
+           "R'''": Single3, 'R"""': Double3,
+           "U'''": Single3, 'U"""': Double3,
+           'r': None, 'R': None, 'b': None, 'B': None,
+           'u': None, 'U': None}
 
 triple_quoted = {}
 for t in ("'''", '"""',
           "r'''", 'r"""', "R'''", 'R"""',
          "b'''", 'b"""', "B'''", 'B"""',
           "br'''", 'br"""', "Br'''", 'Br"""',
-          "bR'''", 'bR"""', "BR'''", 'BR"""'):
+          "bR'''", 'bR"""', "BR'''", 'BR"""',
+          "rb'''", 'rb"""', "rB'''", 'rB"""',
+          "Rb'''", 'Rb"""', "RB'''", 'RB"""',
+          "u'''", 'u"""', "U'''", 'U"""',
+          ):
     triple_quoted[t] = t
 single_quoted = {}
 for t in ("'", '"',
           "r'", 'r"', "R'", 'R"',
           "b'", 'b"', "B'", 'B"',
           "br'", 'br"', "Br'", 'Br"',
-          "bR'", 'bR"', "BR'", 'BR"' ):
+          "bR'", 'bR"', "BR'", 'BR"' ,
+          "rb'", 'rb"', "rB'", 'rB"',
+          "Rb'", 'Rb"', "RB'", 'RB"' ,
+          "u'", 'u"', "U'", 'U"',
+          ):
     single_quoted[t] = t
 
-del _compile
-
 tabsize = 8
 
 class TokenError(Exception): pass
@@ -281,6 +347,10 @@ def detect_encoding(readline):
 
     If no encoding is specified, then the default of 'utf-8' will be returned.
     """
+    try:
+        filename = readline.__self__.name
+    except AttributeError:
+        filename = None
     bom_found = False
     encoding = None
     default = 'utf-8'
@@ -297,7 +367,10 @@ def detect_encoding(readline):
             # per default encoding.
             line_string = line.decode('utf-8')
         except UnicodeDecodeError:
-            raise SyntaxError("invalid or missing encoding declaration")
+            msg = "invalid or missing encoding declaration"
+            if filename is not None:
+                msg = '{} for {!r}'.format(msg, filename)
+            raise SyntaxError(msg)
 
         matches = cookie_re.findall(line_string)
         if not matches:
@@ -307,12 +380,21 @@
             codec = lookup(encoding)
         except LookupError:
             # This behaviour mimics the Python interpreter
-            raise SyntaxError("unknown encoding: " + encoding)
+            if filename is None:
+                msg = "unknown encoding: " + encoding
+            else:
+                msg = "unknown encoding for {!r}: {}".format(filename,
+                        encoding)
+            raise SyntaxError(msg)
 
         if bom_found:
             if encoding != 'utf-8':
                 # This behaviour mimics the Python interpreter
-                raise SyntaxError('encoding problem: utf-8')
+                if filename is None:
+                    msg = 'encoding problem: utf-8'
+                else:
+                    msg = 'encoding problem for {!r}: utf-8'.format(filename)
+                raise SyntaxError(msg)
             encoding += '-sig'
         return encoding
 
@@ -469,7 +551,7 @@
             continued = 0
 
         while pos < max:
-            pseudomatch = pseudoprog.match(line, pos)
+            pseudomatch = _compile(PseudoToken).match(line, pos)
             if pseudomatch:                                # scan for tokens
                 start, end = pseudomatch.span(1)
                 spos, epos, pos = (lnum, start), (lnum, end), end
@@ -487,7 +569,7 @@
                     assert not token.endswith("\n")
                     yield TokenInfo(COMMENT, token, spos, epos, line)
                 elif token in triple_quoted:
-                    endprog = endprogs[token]
+                    endprog = _compile(endpats[token])
                     endmatch = endprog.match(line, pos)
                     if endmatch:                           # all on one line
                         pos = endmatch.end(0)
@@ -503,8 +585,9 @@
                     token[:3] in single_quoted:
                     if token[-1] == '\n':                  # continued string
                         strstart = (lnum, start)
-                        endprog = (endprogs[initial] or endprogs[token[1]] or
-                                   endprogs[token[2]])
+                        endprog = _compile(endpats[initial] or
+                                           endpats[token[1]] or
+                                           endpats[token[2]])
                         contstr, needcont = line[start:], 1
                         contline = line
                         break
@@ -535,27 +618,65 @@
 def generate_tokens(readline):
     return _tokenize(readline, None)
 
+def main():
+    import argparse
+
+    # Helper error handling routines
+    def perror(message):
+        print(message, file=sys.stderr)
+
+    def error(message, filename=None, location=None):
+        if location:
+            args = (filename,) + location + (message,)
+            perror("%s:%d:%d: error: %s" % args)
+        elif filename:
+            perror("%s: error: %s" % (filename, message))
+        else:
+            perror("error: %s" % message)
+        sys.exit(1)
+
+    # Parse the arguments and options
+    parser = argparse.ArgumentParser(prog='python -m tokenize')
+    parser.add_argument(dest='filename', nargs='?',
+                        metavar='filename.py',
+                        help='the file to tokenize; defaults to stdin')
+    parser.add_argument('-e', '--exact', dest='exact', action='store_true',
+                        help='display token names using the exact type')
+    args = parser.parse_args()
+
+    try:
+        # Tokenize the input
+        if args.filename:
+            filename = args.filename
+            with builtins.open(filename, 'rb') as f:
+                tokens = list(tokenize(f.readline))
+        else:
+            filename = "<stdin>"
+            tokens = _tokenize(sys.stdin.readline, None)
+
+        # Output the tokenization
+        for token in tokens:
+            token_type = token.type
+            if args.exact:
+                token_type = token.exact_type
+            token_range = "%d,%d-%d,%d:" % (token.start + token.end)
+            print("%-20s%-15s%-15r" %
+                  (token_range, tok_name[token_type], token.string))
+    except IndentationError as err:
+        line, column = err.args[1][1:3]
+        error(err.args[0], filename, (line, column))
+    except TokenError as err:
+        line, column = err.args[1]
+        error(err.args[0], filename, (line, column))
+    except SyntaxError as err:
+        error(err, filename)
+    except IOError as err:
+        error(err)
+    except KeyboardInterrupt:
+        print("interrupted\n")
+    except Exception as err:
+        perror("unexpected error: %s" % err)
+        raise
+
 if __name__ == "__main__":
-    # Quick sanity check
-    s = b'''def parseline(self, line):
-            """Parse the line into a command name and a string containing
-            the arguments.  Returns a tuple containing (command, args, line).
-            'command' and 'args' may be None if the line couldn't be parsed.
-            """
-            line = line.strip()
-            if not line:
-                return None, None, line
-            elif line[0] == '?':
-                line = 'help ' + line[1:]
-            elif line[0] == '!':
-                if hasattr(self, 'do_shell'):
-                    line = 'shell ' + line[1:]
-                else:
-                    return None, None, line
-            i, n = 0, len(line)
-            while i < n and line[i] in self.identchars: i = i+1
-            cmd, arg = line[:i], line[i:].strip()
-            return cmd, arg, line
-    '''
-    for tok in tokenize(iter(s.splitlines()).__next__):
-        print(tok)
+    main()
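
The practical effect of the EXACT_TOKEN_TYPES table and the exact_type property added above: operator tokens still come back with the generic OP type, but the specific operator can now be recovered per token. A minimal sketch of the difference, assuming only the tokenize module as patched here (Python 3.3+):

    import io
    from tokenize import tokenize, tok_name

    # Tokenize a small statement from an in-memory bytes buffer.
    for tok in tokenize(io.BytesIO(b"x += 1\n").readline):
        # tok.type lumps every operator under OP; tok.exact_type looks the
        # token string up in EXACT_TOKEN_TYPES, so '+=' reports PLUSEQUAL.
        print(tok_name[tok.type], tok_name[tok.exact_type], repr(tok.string))

For the '+=' token this prints OP alongside PLUSEQUAL; non-operator tokens report the same name twice.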

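Two smaller behavioural notes on the rest of the patch. The new main() makes the module runnable as a command-line tool (python -m tokenize [-e] [filename.py], reading stdin when no file is given), and detect_encoding() now includes the file name in its SyntaxErrors when it can discover one via readline.__self__.name. A short sketch of the latter; bad.py is a hypothetical file whose coding cookie names an unknown codec:

    from tokenize import detect_encoding

    # For a real file object, readline.__self__.name is the file's path,
    # so the error message now names the offending file.
    with open('bad.py', 'rb') as f:
        try:
            encoding, first_lines = detect_encoding(f.readline)
        except SyntaxError as err:
            print(err)   # e.g. "unknown encoding for 'bad.py': ..."

When readline has no __self__.name (a plain function, or a BytesIO readline), filename stays None and the old messages are raised unchanged.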