diff options
author | Guido van Rossum <guido@python.org> | 1992-09-20 21:41:09 (GMT) |
---|---|---|
committer | Guido van Rossum <guido@python.org> | 1992-09-20 21:41:09 (GMT) |
commit | 7a461e5aaf011243d9ac2658e4172e316b031eb9 (patch) | |
tree | a333855142b647a2ca74f5ee84e1d66c11754542 /Lib/regsub.py | |
parent | 7066dd75c5ee8385135541d03fb8edd8939ad740 (diff) | |
download | cpython-7a461e5aaf011243d9ac2658e4172e316b031eb9.zip cpython-7a461e5aaf011243d9ac2658e4172e316b031eb9.tar.gz cpython-7a461e5aaf011243d9ac2658e4172e316b031eb9.tar.bz2 |
New module regsub contains sub(), gsub() and split() as in nawk.
string.splitfields(s, '') now returns [s] as split() in nawk.
Added _exit to exported functions of os.
Diffstat (limited to 'Lib/regsub.py')
-rw-r--r-- | Lib/regsub.py | 147 |
1 files changed, 147 insertions, 0 deletions
# Regular expression subroutines:
# sub(pat, repl, str): replace first occurrence of pattern in string
# gsub(pat, repl, str): replace all occurrences of pattern in string
# split(str, pat): split string using pattern as delimiter


import regex


# Substitute the first match of pattern pat in string str by the
# replacement repl and return the resulting string.  When nothing
# matches, str is returned as it was passed in.
# The replacement may use \digit to refer to subpatterns and \\ for a
# literal backslash (see expand()).  The pattern is either a string or
# an already compiled pattern (see compile()).

def sub(pat, repl, str):
    prog = compile(pat)
    if prog.search(str) < 0:
        # No match anywhere: nothing to substitute.
        return str
    groups = prog.regs
    lo, hi = groups[0]
    return str[:lo] + expand(repl, groups, str) + str[hi:]


# Substitute every (non-overlapping) match of pattern pat in string
# str by the replacement repl.  The rules for pat and repl are those
# of sub().
# An empty match is only substituted when it does not touch the
# previous match, so e.g. gsub('', '-', 'abc') returns '-a-b-c-'.

def gsub(pat, repl, str):
    prog = compile(pat)
    pieces = []         # Output fragments, joined at the end
    pos = 0             # Everything before pos has been copied
    matched_before = 0  # Becomes true after the first substitution
    while prog.search(str, pos) >= 0:
        lo, hi = prog.regs[0]
        if matched_before and lo == hi == pos:
            # Empty match glued to the previous match: skip it and
            # look again one character further on, giving up when
            # the end of the string has been reached.
            if pos >= len(str) or prog.search(str, pos + 1) < 0:
                break
            lo, hi = prog.regs[0]
        groups = prog.regs
        pieces.append(str[pos:lo])
        pieces.append(expand(repl, groups, str))
        pos = hi
        matched_before = 1
    pieces.append(str[pos:])
    return ''.join(pieces)


# Split string str in fields separated by delimiters matching pattern
# pat. Only non-empty matches for the pattern are considered, so e.g.
# split('abc', '') returns ['abc'].
def split(str, pat):
    # Return the list of fields of str that are delimited by
    # non-empty matches of pat (a string or a compiled pattern).
    prog = compile(pat)
    res = []
    # start: where the field being collected begins;
    # pos: where the next search begins.
    start = pos = 0
    while prog.search(str, pos) >= 0:
        a, b = prog.regs[0]
        if a == b:
            # An empty match is never a delimiter.  Resume just past
            # it; any search started between pos and a would only
            # find this same empty match again.
            pos = a + 1
            if pos >= len(str):
                break
        else:
            res.append(str[start:a])
            start = pos = b
    res.append(str[start:])
    return res


# Internal subroutines:
# compile(pat): compile a pattern, caching already compiled patterns
# expand(repl, regs, str): expand \digit escapes in replacement string


# Manage a cache of compiled regular expressions.
# If the pattern is a string a compiled version of it is returned.
# If the pattern has been used before we return an already compiled
# version from the cache; otherwise we compile it now and save the
# compiled version in the cache.
# Instead of a string, a compiled regular expression can also be
# passed; it is returned as is.
# WARNING: if the pattern syntax is changed, the cache should be
# flushed!

cache = {}

def compile(pat):
    if not isinstance(pat, str):
        return pat      # Assume it is a compiled regex
    if pat not in cache:
        cache[pat] = regex.compile(pat)
    return cache[pat]


# Expand \digit in the replacement.
# Each occurrence of \digit is replaced by the substring of str
# indicated by regs[digit], where regs is a sequence of (start, end)
# index pairs.  To include a literal \ in the replacement, double it;
# other \ escapes are left unchanged (i.e. the \ and the following
# character are both copied).  A single \ at the very end of the
# replacement is copied as well.

def expand(repl, regs, str):
    if '\\' not in repl:
        return repl     # Nothing to expand
    out = []            # Output fragments, joined at the end
    i = 0
    n = len(repl)
    while i < n:
        c = repl[i]
        i = i + 1
        if c != '\\' or i >= n:
            # Ordinary character, or a backslash with nothing after it
            out.append(c)
            continue
        c = repl[i]
        i = i + 1
        if '0' <= c <= '9':
            # c is known to be a single ASCII digit here
            a, b = regs[int(c)]
            out.append(str[a:b])
        elif c == '\\':
            out.append(c)
        else:
            out.append('\\' + c)
    return ''.join(out)


# Test program, reads sequences "pat repl str" from stdin.
# Optional argument specifies pattern used to split lines.
def test():
    # Interactive driver: each input line is split into the three
    # fields "pat repl str" and the results of sub() and gsub() are
    # printed.  The field delimiter pattern defaults to runs of
    # whitespace and can be overridden by the first command line
    # argument.
    import sys
    if sys.argv[1:]:
        delpat = sys.argv[1]
    else:
        delpat = '[ \t\n]+'
    while 1:
        # Only prompt when input comes from a terminal
        if sys.stdin.isatty():
            sys.stderr.write('--> ')
        line = sys.stdin.readline()
        if not line:
            break       # End of input
        if line[-1] == '\n':
            line = line[:-1]
        fields = split(line, delpat)
        if len(fields) != 3:
            print('Sorry, not three fields')
            print('split: ' + repr(fields))
            continue
        # Reuse the fields computed above instead of splitting again
        pat, repl, str = fields
        print('sub : ' + repr(sub(pat, repl, str)))
        print('gsub: ' + repr(gsub(pat, repl, str)))