Initial revision

author: Guido van Rossum <guido@python.org> 1992-01-01 19:34:47 (GMT)
committer: Guido van Rossum <guido@python.org> 1992-01-01 19:34:47 (GMT)
commit: 4d8e859e8f0a209a7e999ce9cc0988156c795949 (patch)
tree: f29c2a574c3be20ff07886f7d82e78388189fa66 /Lib
parent: 42d1f63c54b8f5e3da04155c3369be1b98a7f3d3 (diff)
download: cpython-4d8e859e8f0a209a7e999ce9cc0988156c795949.zip
cpython-4d8e859e8f0a209a7e999ce9cc0988156c795949.tar.gz
cpython-4d8e859e8f0a209a7e999ce9cc0988156c795949.tar.bz2
1 files changed, 60 insertions, 0 deletions
diff --git a/Lib/tokenize.py b/Lib/tokenize.py
new file mode 100644
index 0000000..6b3d991
--- /dev/null
+++ b/Lib/tokenize.py
@@ -0,0 +1,60 @@
+# This module compiles a regular expression that recognizes Python tokens.
+# It is designed to match the working of the Python tokenizer exactly.
+# It takes care of everything except indentation;
+# note that un-escaped newlines are tokens, too.
+# tokenprog.regs[3] gives the (start, end) location of the token without whitespace.
+# It also defines various subexpressions, but doesn't compile them.
+# See the function test() below for an example of how to use it.
+
+import regex
+
+# Note: to get a quoted backslash in a regexp, it must be quadrupled.
+
+Ignore = '[ \t]*\(\\\\\n[ \t]*\)*\(#.*\)?' # Skipped before a token: blanks, backslash-newline continuations, a comment (groups 1 and 2)
+
+Name = '[a-zA-Z_][a-zA-Z0-9_]*' # Letter or underscore, then letters, digits, underscores
+
+Hexnumber = '0[xX][0-9a-fA-F]*[lL]?' # 0x or 0X prefix; the hex digits may be empty; optional l/L suffix
+Octnumber = '0[0-7]*[lL]?' # Leading 0, so a plain 0 matches here too; optional l/L suffix
+Decnumber = '[1-9][0-9]*[lL]?' # Nonzero leading digit; optional l/L suffix
+Intnumber = Hexnumber + '\|' + Octnumber + '\|' + Decnumber # Backslash-bar separates the alternatives
+Exponent = '[eE][-+]?[0-9]+' # e or E, optional sign, at least one digit
+Pointfloat = '\([0-9]+\.[0-9]*\|\.[0-9]+\)\(' + Exponent + '\)?' # Digits on at least one side of the point; optional exponent
+Expfloat = '[0-9]+' + Exponent # Digits with a mandatory exponent and no point
+Floatnumber = Pointfloat + '\|' + Expfloat
+Number = Intnumber + '\|' + Floatnumber
+
+String = '\'\(\\\\.\|[^\\\n\']\)*\'' # Single-quoted only: backslash plus any character, or anything but backslash, newline, quote
+
+Operator = '~\|\+\|-\|\*\|/\|%\|\^\|&\||\|<<\|>>\|==\|<=\|<>\|!=\|>=\|=\|<\|>' # The bare bar between two separators is the literal bar operator
+Bracket = '[][(){}]' # Any one of the six bracket characters
+Special = '[:;.,`\n]' # Includes newline: un-escaped newlines are tokens
+Funny = Operator + '\|' + Bracket + '\|' + Special
+
+PlainToken = Name + '\|' + Number + '\|' + String + '\|' + Funny # Every token form, not yet wrapped in a group
+
+Token = Ignore + '\(' + PlainToken + '\)' # The token proper is group 3, after the two groups inside Ignore
+
+try:
+	save_syntax = regex.set_syntax(0) # Use default syntax
+	tokenprog = regex.compile(Token)
+finally:
+	dummy = regex.set_syntax(save_syntax) # Restore original syntax
+
+
+def test(file): # Print each token found in the named file, scanning it line by line
+	f = open(file, 'r') # NOTE(review): the file is never closed explicitly
+	while 1:
+		line = f.readline()
+		if not line: break # Nothing more to read: stop
+		i, n = 0, len(line) # i is the scan position within line
+		while i < n:
+			j = tokenprog.match(line, i) # Negative means no token at i; otherwise presumably the matched length, added to i below
+			if j < 0:
+				print 'No token at', `line[i:i+20]` + '...'
+				i = i+1 # Skip one character and try again
+			else:
+				i = i+j # Advance past the ignored prefix and the token
+				a, b = tokenprog.regs[3] # Start and end of the token proper
+				if a < b: # Only report a non-empty token span
+					print 'Token:', `line[a:b]`
author	Guido van Rossum <guido@python.org>	1992-01-01 19:34:47 (GMT)
committer	Guido van Rossum <guido@python.org>	1992-01-01 19:34:47 (GMT)
commit	4d8e859e8f0a209a7e999ce9cc0988156c795949 (patch)
tree	f29c2a574c3be20ff07886f7d82e78388189fa66 /Lib
parent	42d1f63c54b8f5e3da04155c3369be1b98a7f3d3 (diff)
download	cpython-4d8e859e8f0a209a7e999ce9cc0988156c795949.zip cpython-4d8e859e8f0a209a7e999ce9cc0988156c795949.tar.gz cpython-4d8e859e8f0a209a7e999ce9cc0988156c795949.tar.bz2