summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorGuido van Rossum <guido@python.org>1997-10-23 22:43:50 (GMT)
committerGuido van Rossum <guido@python.org>1997-10-23 22:43:50 (GMT)
commitf81e5b9c780e47c8a312a69a7ff71169276a9720 (patch)
tree7ab590f66035617f50a77c7fe36a5d007f29b426
parent1fef18118339237de025ed15dc6df4c39315b55d (diff)
downloadcpython-f81e5b9c780e47c8a312a69a7ff71169276a9720.zip
cpython-f81e5b9c780e47c8a312a69a7ff71169276a9720.tar.gz
cpython-f81e5b9c780e47c8a312a69a7ff71169276a9720.tar.bz2
New module -- converts regex regular expressions to re style.
There are two ways to use this -- as a filter (e.g. using C-U M-| on a regex string literal in an Emacs buffer) or from a Python program which imports this as a module. Read the doc string for more info, and also some caveats (some cases aren't handled right).
-rwxr-xr-xLib/reconvert.py186
1 files changed, 186 insertions, 0 deletions
diff --git a/Lib/reconvert.py b/Lib/reconvert.py
new file mode 100755
index 0000000..f0b61fc
--- /dev/null
+++ b/Lib/reconvert.py
@@ -0,0 +1,186 @@
+#! /usr/bin/env python1.5
+
+"""Convert old ("regex") regular expressions to new syntax ("re").
+
+When imported as a module, there are two functions, with their own
+strings:
+
+ convert(s, syntax=None) -- convert a regex regular expression to re syntax
+
+ quote(s) -- return a quoted string literal
+
+When used as a script, read a Python string literal (or any other
+expression evaluating to a string) from stdin, and write the
+translated expression to stdout as a string literal. Unless stdout is
+a tty, no trailing \n is written to stdout. This is done so that it
+can be used with Emacs C-U M-| (shell-command-on-region with argument
+which filters the region through the shell command).
+
+No attempt has been made at coding for performance.
+
+Translation table...
+
+ \( ( (unless RE_NO_BK_PARENS set)
+ \) ) (unless RE_NO_BK_PARENS set)
+ \| | (unless RE_NO_BK_VBAR set)
+ \< \b (not quite the same, but alla...)
+ \> \b (not quite the same, but alla...)
+ \` \A
+ \' \Z
+
+Not translated...
+
+ .
+ ^
+ $
+ *
+ + (unless RE_BK_PLUS_QM set, then to \+)
+ ? (unless RE_BK_PLUS_QM set, then to \?)
+ \
+ \b
+ \B
+ \w
+ \W
+ \1 ... \9
+
+Special cases...
+
+ Non-printable characters are always replaced by their 3-digit
+ escape code (except \t, \n, \r, which use mnemonic escapes)
+
+ Newline is turned into | when RE_NEWLINE_OR is set
+
+XXX To be done...
+
+ [...] (different treatment of backslashed items?)
+ [^...] (different treatment of backslashed items?)
+ ^ $ * + ? (in some error contexts these are probably treated differently)
+ \vDD \DD (in the regex docs but only works when RE_ANSI_HEX set)
+
+"""
+
+
+import regex
+from regex_syntax import * # RE_*
+
+# Default translation table
+mastertable = {
+ r'\<': r'\b',
+ r'\>': r'\b',
+ r'\`': r'\A',
+ r'\'': r'\Z',
+ r'\(': '(',
+ r'\)': ')',
+ r'\|': '|',
+ '(': r'\(',
+ ')': r'\)',
+ '|': r'\|',
+ '\t': r'\t',
+ '\n': r'\n',
+ '\r': r'\r',
+}
+
+
+def convert(s, syntax=None):
+ """Convert a regex regular expression to re syntax.
+
+ The first argument is the regular expression, as a string object,
+ just like it would be passed to regex.compile(). (I.e., pass the
+ actual string object -- string quotes must already have been
+ removed and the standard escape processing has already been done,
+ e.g. by eval().)
+
+ The optional second argument is the regex syntax variant to be
+ used. This is an integer mask as passed to regex.set_syntax();
+ the flag bits are defined in regex_syntax. When not specified, or
+ when None is given, the current regex syntax mask (as retrieved by
+ regex.get_syntax()) is used -- which is 0 by default.
+
+ The return value is a regular expression, as a string object that
+ could be passed to re.compile(). (I.e., no string quotes have
+ been added -- use quote() below, or repr().)
+
+ The conversion is not always guaranteed to be correct. More
+ syntactical analysis should be performed to detect borderline
+ cases and decide what to do with them. For example, 'x*?' is not
+ translated correctly.
+
+ """
+ table = mastertable.copy()
+ if syntax is None:
+ syntax = regex.get_syntax()
+ if syntax & RE_NO_BK_PARENS:
+ del table[r'\('], table[r'\)']
+ del table['('], table[')']
+ if syntax & RE_NO_BK_VBAR:
+ del table[r'\|']
+ del table['|']
+ if syntax & RE_BK_PLUS_QM:
+ table['+'] = r'\+'
+ table['?'] = r'\?'
+ table[r'\+'] = '+'
+ table[r'\?'] = '?'
+ if syntax & RE_NEWLINE_OR:
+ table['\n'] = '|'
+ res = ""
+
+ i = 0
+ end = len(s)
+ while i < end:
+ c = s[i]
+ i = i+1
+ if c == '\\':
+ c = s[i]
+ i = i+1
+ key = '\\' + c
+ key = table.get(key, key)
+ res = res + key
+ else:
+ c = table.get(c, c)
+ res = res + c
+ return res
+
+
+def quote(s, quote=None):
+ """Convert a string object to a quoted string literal.
+
+ This is similar to repr() but will return a "raw" string (r'...'
+ or r"...") when the string contains backslashes, instead of
+ doubling all backslashes. The resulting string does *not* always
+ evaluate to the same string as the original; however it will do
+ just the right thing when passed into re.compile().
+
+ The optional second argument forces the string quote; it must be
+ a single character which is a valid Python string quote.
+
+ """
+ if quote is None:
+ q = "'"
+ altq = "'"
+ if q in s and altq not in s:
+ q = altq
+ else:
+ assert quote in ('"', "'")
+ q = quote
+ res = q
+ for c in s:
+ if c == q: c = '\\' + c
+ elif c < ' ' or c > '~': c = "\\%03o" % ord(c)
+ res = res + c
+ res = res + q
+ if '\\' in res:
+ res = 'r' + res
+ return res
+
+
+def main():
+ """Main program -- called when run as a script."""
+ import sys
+ s = eval(sys.stdin.read())
+ sys.stdout.write(quote(convert(s)))
+ if sys.stdout.isatty():
+ sys.stdout.write("\n")
+
+
+if __name__ == '__main__':
+ main()