summaryrefslogtreecommitdiffstats
path: root/Lib/re.py
diff options
context:
space:
mode:
authorGuido van Rossum <guido@python.org>1997-07-15 14:38:13 (GMT)
committerGuido van Rossum <guido@python.org>1997-07-15 14:38:13 (GMT)
commit04a1d74229058d204ce570e3727f438c31c1a176 (patch)
treed6f03b05ee10b228f6fe19d1df1304b0c0d33fb0 /Lib/re.py
parent70f107f63ddf8be78bc985ba2892e615396b25d6 (diff)
downloadcpython-04a1d74229058d204ce570e3727f438c31c1a176.zip
cpython-04a1d74229058d204ce570e3727f438c31c1a176.tar.gz
cpython-04a1d74229058d204ce570e3727f438c31c1a176.tar.bz2
Jeffrey's newest
Diffstat (limited to 'Lib/re.py')
-rw-r--r--Lib/re.py423
1 files changed, 271 insertions, 152 deletions
diff --git a/Lib/re.py b/Lib/re.py
index 2d24da5..b701bb6 100644
--- a/Lib/re.py
+++ b/Lib/re.py
@@ -153,9 +153,7 @@ class MatchObject:
g = self.re.groupindex[g]
except (KeyError, TypeError):
raise IndexError, ('group "' + g + '" is undefined')
- if g >= len(self.regs):
- result.append(None)
- elif (self.regs[g][0] == -1) or (self.regs[g][1] == -1):
+ if (self.regs[g][0] == -1) or (self.regs[g][1] == -1):
result.append(None)
else:
result.append(self.string[self.regs[g][0]:self.regs[g][1]])
@@ -525,6 +523,186 @@ def build_fastmap(code, pos=0):
#
#
+[NORMAL, CHARCLASS, REPLACEMENT] = range(3)
+[CHAR, MEMORY_REFERENCE, SYNTAX, SET, WORD_BOUNDARY, NOT_WORD_BOUNDARY,
+ BEGINNING_OF_BUFFER, END_OF_BUFFER] = range(8)
+
+def expand_escape(pattern, index, context=NORMAL):
+ if index >= len(pattern):
+ raise error, 'escape ends too soon'
+
+ elif pattern[index] == 't':
+ return CHAR, chr(9), index + 1
+
+ elif pattern[index] == 'n':
+ return CHAR, chr(10), index + 1
+
+ elif pattern[index] == 'r':
+ return CHAR, chr(13), index + 1
+
+ elif pattern[index] == 'f':
+ return CHAR, chr(12), index + 1
+
+ elif pattern[index] == 'a':
+ return CHAR, chr(7), index + 1
+
+ elif pattern[index] == 'e':
+ return CHAR, chr(27), index + 1
+
+ elif pattern[index] == 'c':
+ if index + 1 >= len(pattern):
+ raise error, '\\c must be followed by another character'
+ elif pattern[index + 1] in 'abcdefghijklmnopqrstuvwxyz':
+ return CHAR, chr(ord(pattern[index + 1]) - ord('a') + 1), index + 2
+ else:
+ return CHAR, chr(ord(pattern[index + 1]) ^ 64), index + 2
+
+ elif pattern[index] == 'x':
+ # CAUTION: this is the Python rule, not the Perl rule!
+ end = index
+ while (end < len(pattern)) and (pattern[end] in string.hexdigits):
+ end = end + 1
+ if end == index:
+ raise error, "\\x must be followed by hex digit(s)"
+ # let Python evaluate it, so we don't incorrectly 2nd-guess
+ # what it's doing (and Python in turn passes it on to sscanf,
+ # so that *it* doesn't incorrectly 2nd-guess what C does!)
+ char = eval ('"' + pattern[index-2:end] + '"')
+ assert len(char) == 1
+ return CHAR, char, end
+
+ elif pattern[index] == 'b':
+ if context != NORMAL:
+ return CHAR, chr(8), index + 1
+ else:
+ return WORD_BOUNDARY, '', index + 1
+
+ elif pattern[index] == 'B':
+ if context != NORMAL:
+ return CHAR, 'B', index + 1
+ else:
+ return NOT_WORD_BOUNDARY, '', index + 1
+
+ elif pattern[index] == 'A':
+ if context != NORMAL:
+ return CHAR, 'A', index + 1
+ else:
+ return BEGINNING_OF_BUFFER, '', index + 1
+
+ elif pattern[index] == 'Z':
+ if context != NORMAL:
+ return 'Z', index + 1
+ else:
+ return END_OF_BUFFER, '', index + 1
+
+ elif pattern[index] in 'GluLUQE':
+ raise error, ('\\' + ch + ' is not allowed')
+
+ elif pattern[index] == 'w':
+ if context == NORMAL:
+ return SYNTAX, 'word', index + 1
+ elif context == CHARCLASS:
+ set = []
+ for char in syntax_table.keys():
+ if 'word' in syntax_table[char]:
+ set.append(char)
+ return SET, set, index + 1
+ else:
+ return CHAR, 'w', index + 1
+
+ elif pattern[index] == 'W':
+ if context == NORMAL:
+ return NOT_SYNTAX, 'word', index + 1
+ elif context == CHARCLASS:
+ set = []
+ for char in syntax_table.keys():
+ if 'word' not in syntax_table[char]:
+ set.append(char)
+ return SET, set, index + 1
+ else:
+ return CHAR, 'W', index + 1
+
+ elif pattern[index] == 's':
+ if context == NORMAL:
+ return SYNTAX, 'whitespace', index + 1
+ elif context == CHARCLASS:
+ set = []
+ for char in syntax_table.keys():
+ if 'whitespace' in syntax_table[char]:
+ set.append(char)
+ return SET, set, index + 1
+ else:
+ return CHAR, 's', index + 1
+
+ elif pattern[index] == 'S':
+ if context == NORMAL:
+ return NOT_SYNTAX, 'whitespace', index + 1
+ elif context == CHARCLASS:
+ set = []
+ for char in syntax_table.keys():
+ if 'whitespace' not in syntax_table[char]:
+ set.append(char)
+ return SET, set, index + 1
+ else:
+ return CHAR, 'S', index + 1
+
+ elif pattern[index] == 'd':
+ if context == NORMAL:
+ return SYNTAX, 'digit', index + 1
+ elif context == CHARCLASS:
+ set = []
+ for char in syntax_table.keys():
+ if 'digit' in syntax_table[char]:
+ set.append(char)
+ return SET, set, index + 1
+ else:
+ return CHAR, 'd', index + 1
+
+ elif pattern[index] == 'D':
+ if context == NORMAL:
+ return NOT_SYNTAX, 'digit', index + 1
+ elif context == CHARCLASS:
+ set = []
+ for char in syntax_table.keys():
+ if 'digit' not in syntax_table[char]:
+ set.append(char)
+ return SET, set, index + 1
+ else:
+ return CHAR, 'D', index + 1
+
+ elif pattern[index] in '0123456789':
+ end = index
+ while (end < len(pattern)) and (pattern[end] in string.digits):
+ end = end + 1
+ value = pattern[index:end]
+
+ if (len(value) == 3) or ((len(value) == 2) and (value[0] == '0')):
+ # octal character value
+ value = string.atoi(value, 8)
+ if value > 255:
+ raise error, 'octal char out of range'
+ return CHAR, chr(value), end
+
+ elif value == '0':
+ return CHAR, chr(0), end
+
+ elif len(value) > 3:
+ raise error, ('\\' + value + ' has too many digits')
+
+ else:
+ # \1-\99 - reference a register
+ if context == CHARCLASS:
+ raise error, ('cannot reference a register from '
+ 'inside a character class')
+ value = string.atoi(value)
+ if value == 0:
+ raise error, ('register 0 cannot be used '
+ 'during match')
+ return MEMORY_REFERENCE, value, end
+
+ else:
+ return CHAR, pattern[index], index + 1
+
def compile(pattern, flags=0):
stack = []
index = 0
@@ -536,118 +714,50 @@ def compile(pattern, flags=0):
char = pattern[index]
index = index + 1
if char == '\\':
- if index < len(pattern):
- next = pattern[index]
- index = index + 1
- if next == 't':
- stack.append([Exact(chr(9))])
-
- elif next == 'n':
- stack.append([Exact(chr(10))])
-
- elif next == 'r':
- stack.append([Exact(chr(13))])
-
- elif next == 'f':
- stack.append([Exact(chr(12))])
-
- elif next == 'a':
- stack.append([Exact(chr(7))])
-
- elif next == 'e':
- stack.append([Exact(chr(27))])
+ escape_type, value, index = expand_escape(pattern, index)
- elif next in '0123456789':
- value = next
- while (index < len(pattern)) and \
- (pattern[index] in string.digits):
- value = value + pattern[index]
- index = index + 1
- if (len(value) == 3) or \
- ((len(value) == 2) and (value[0] == '0')):
- value = string.atoi(value, 8)
- if value > 255:
- raise error, 'octal char out of range'
- stack.append([Exact(chr(value))])
- elif value == '0':
- stack.append([Exact(chr(0))])
- elif len(value) > 3:
- raise error, 'too many digits'
- else:
- value = string.atoi(value)
- if value >= register:
- raise error, ('cannot reference a register '
- 'not yet used')
- elif value == 0:
- raise error, ('register 0 cannot be used '
- 'during match')
- stack.append([MatchMemory(value)])
-
- elif next == 'x':
- value = ''
- while (index < len(pattern)) and \
- (pattern[index] in string.hexdigits):
- value = value + pattern[index]
- index = index + 1
- value = string.atoi(value, 16)
- if value > 255:
- raise error, 'hex char out of range'
- stack.append([Exact(chr(value))])
-
- elif next == 'c':
- if index >= len(pattern):
- raise error, '\\c at end of re'
- elif pattern[index] in 'abcdefghijklmnopqrstuvwxyz':
- stack.append(Exact(chr(ord(pattern[index]) -
- ord('a') + 1)))
- else:
- stack.append(Exact(chr(ord(pattern[index]) ^ 64)))
- index = index + 1
-
- elif next == 'A':
- stack.append([BegBuf()])
-
- elif next == 'Z':
- stack.append([EndBuf()])
-
- elif next == 'b':
- stack.append([WordBound()])
-
- elif next == 'B':
- stack.append([NotWordBound()])
-
- elif next == 'w':
- stack.append([SyntaxSpec('word')])
-
- elif next == 'W':
- stack.append([NotSyntaxSpec('word')])
-
- elif next == 's':
- stack.append([SyntaxSpec('whitespace')])
-
- elif next == 'S':
- stack.append([NotSyntaxSpec('whitespace')])
-
- elif next == 'd':
- stack.append([SyntaxSpec('digit')])
-
- elif next == 'D':
- stack.append([NotSyntaxSpec('digit')])
-
- elif next in 'GluLUQE':
- # some perl-isms that we don't support
- raise error, '\\' + next + ' not supported'
+ if escape_type == CHAR:
+ stack.append([Exact(value)])
- else:
- stack.append([Exact(pattern[index])])
-
+ elif escape_type == MEMORY_REFERENCE:
+ if value >= register:
+ raise error, ('cannot reference a register '
+ 'not yet used')
+ stack.append([MatchMemory(value)])
+
+ elif escape_type == BEGINNING_OF_BUFFER:
+ stack.append([BegBuf()])
+
+ elif escape_type == END_OF_BUFFER:
+ stack.append([EndBuf()])
+
+ elif escape_type == WORD_BOUNDARY:
+ stack.append([WordBound()])
+
+ elif escape_type == NOT_WORD_BOUNDARY:
+ stack.append([NotWordBound()])
+
+ elif escape_type == SYNTAX:
+ stack.append([SyntaxSpec(value)])
+
+ elif escape_type == NOT_SYNTAX:
+ stack.append([NotSyntaxSpec(value)])
+
+ elif escape_type == SET:
+ raise error, 'cannot use set escape type here'
+
else:
- raise error, 'backslash at the end of a string'
+ raise error, 'unknown escape type'
elif char == '|':
if len(stack) == 0:
- raise error, 'nothing to alternate'
+ raise error, 'alternate with nothing on the left'
+ if stack[-1][0].name == '(':
+ raise error, 'alternate with nothing on the left in the group'
+ if stack[-1][0].name == '|':
+ raise error, 'alternates with nothing inbetween them'
expr = []
+
while (len(stack) != 0) and \
(stack[-1][0].name != '(') and \
(stack[-1][0].name != '|'):
@@ -775,12 +885,13 @@ def compile(pattern, flags=0):
if len(stack) == 0:
raise error, 'too many close parens'
+
if len(expr) == 0:
raise error, 'nothing inside parens'
# check to see if alternation used correctly
if (expr[-1].name == '|'):
- raise error, 'alternation with nothing on the right'
+ raise error, 'alternate with nothing on the right'
# remove markers left by alternation
expr = filter(lambda x: x.name != '|', expr)
@@ -789,7 +900,7 @@ def compile(pattern, flags=0):
need_label = 0
for i in range(len(expr)):
if (expr[i].name == 'jump') and (expr[i].label == -1):
- expr[i] = JumpOpcode(label)
+ expr[i] = Jump(label)
need_label = 1
if need_label:
expr.append(Label(label))
@@ -1033,7 +1144,7 @@ def compile(pattern, flags=0):
stack.append([Exact(char)])
elif char in string.whitespace:
- if flags & VERBOSE:
+ if not (flags & VERBOSE):
stack.append([Exact(char)])
elif char == '[':
@@ -1042,28 +1153,44 @@ def compile(pattern, flags=0):
negate = 0
last = ''
set = []
+
if pattern[index] == '^':
negate = 1
index = index + 1
if index >= len(pattern):
raise error, 'incomplete set'
- if pattern[index] in ']-':
- set.append(pattern[index])
- last = pattern[index]
- index = index + 1
+
while (index < len(pattern)) and (pattern[index] != ']'):
next = pattern[index]
index = index + 1
if next == '-':
+ if last == '':
+ raise error, 'improper use of range in character set'
+
+ start = last
+
if (index >= len(pattern)) or (pattern[index] == ']'):
raise error, 'incomplete range in set'
- if last > pattern[index]:
+
+ if pattern[index] == '\\':
+ escape_type, value, index = expand_escape(pattern,
+ index + 1,
+ CHARCLASS)
+
+ if escape_type == CHAR:
+ end = value
+ else:
+ raise error, ('illegal escape in character '
+ 'class range')
+ else:
+ end = pattern[index]
+
+ if start > end:
raise error, 'range arguments out of order in set'
- for next in map(chr, \
- range(ord(last), \
- ord(pattern[index]) + 1)):
- if next not in set:
- set.append(next)
+ for char in map(chr, range(ord(start), ord(end) + 1)):
+ if char not in set:
+ set.append(char)
+
last = ''
index = index + 1
@@ -1071,42 +1198,30 @@ def compile(pattern, flags=0):
# expand syntax meta-characters and add to set
if index >= len(pattern):
raise error, 'incomplete set'
- elif (pattern[index] == ']'):
- raise error, 'backslash at the end of a set'
- elif pattern[index] == 'w':
- for next in syntax_table.keys():
- if 'word' in syntax_table[next]:
- set.append(next)
- elif pattern[index] == 'W':
- for next in syntax_table.keys():
- if 'word' not in syntax_table[next]:
- set.append(next)
- elif pattern[index] == 'd':
- for next in syntax_table.keys():
- if 'digit' in syntax_table[next]:
- set.append(next)
- elif pattern[index] == 'D':
- for next in syntax_table.keys():
- if 'digit' not in syntax_table[next]:
- set.append(next)
- elif pattern[index] == 's':
- for next in syntax_table.keys():
- if 'whitespace' in syntax_table[next]:
- set.append(next)
- elif pattern[index] == 'S':
- for next in syntax_table.keys():
- if 'whitespace' not in syntax_table[next]:
- set.append(next)
+
+ escape_type, value, index = expand_escape(pattern,
+ index,
+ CHARCLASS)
+
+ if escape_type == CHAR:
+ set.append(value)
+ last = value
+
+ elif escape_type == SET:
+ for char in value:
+ if char not in set:
+ set.append(char)
+ last = ''
+
else:
- raise error, 'unknown meta in set'
- last = ''
- index = index + 1
+ raise error, 'illegal escape type in character class'
else:
if next not in set:
set.append(next)
last = next
- if pattern[index] != ']':
+
+ if (index >= len(pattern)) or ( pattern[index] != ']'):
raise error, 'incomplete set'
index = index + 1
@@ -1116,8 +1231,12 @@ def compile(pattern, flags=0):
for char in map(chr, range(256)):
if char not in set:
notset.append(char)
+ if len(notset) == 0:
+ raise error, 'empty negated set'
stack.append([Set(notset)])
else:
+ if len(set) == 0:
+ raise error, 'empty set'
stack.append([Set(set)])
else:
@@ -1132,7 +1251,7 @@ def compile(pattern, flags=0):
if len(code) == 0:
raise error, 'no code generated'
if (code[-1].name == '|'):
- raise error, 'alternation with nothing on the right'
+ raise error, 'alternate with nothing on the right'
code = filter(lambda x: x.name != '|', code)
need_label = 0
for i in range(len(code)):