New module -- converts regex regular expressions to re style.

There are two ways to use this -- as a filter (e.g. using C-U M-| on a regex string literal in an Emacs buffer) or from a Python program which imports this as a module. Read the doc string for more info, and also some caveats (some cases aren't handled right).
author: Guido van Rossum <guido@python.org> 1997-10-23 22:43:50 (GMT)
committer: Guido van Rossum <guido@python.org> 1997-10-23 22:43:50 (GMT)
commit: f81e5b9c780e47c8a312a69a7ff71169276a9720 (patch)
tree: 7ab590f66035617f50a77c7fe36a5d007f29b426 /Lib/reconvert.py
parent: 1fef18118339237de025ed15dc6df4c39315b55d (diff)
download: cpython-f81e5b9c780e47c8a312a69a7ff71169276a9720.zip
cpython-f81e5b9c780e47c8a312a69a7ff71169276a9720.tar.gz
cpython-f81e5b9c780e47c8a312a69a7ff71169276a9720.tar.bz2
1 files changed, 186 insertions, 0 deletions
diff --git a/Lib/reconvert.py b/Lib/reconvert.py
new file mode 100755
index 0000000..f0b61fc
--- /dev/null
+++ b/Lib/reconvert.py
@@ -0,0 +1,186 @@
+#! /usr/bin/env python1.5
+
+r"""Convert old ("regex") regular expressions to new syntax ("re").
+
+When imported as a module, there are two functions, with their own
+doc strings:
+
+  convert(s, syntax=None) -- convert a regex regular expression to re syntax
+
+  quote(s) -- return a quoted string literal
+
+When used as a script, read a Python string literal (or any other
+expression evaluating to a string) from stdin, and write the
+translated expression to stdout as a string literal.  Unless stdout is
+a tty, no trailing \n is written to stdout.  This is done so that it
+can be used with Emacs C-U M-| (shell-command-on-region with argument
+which filters the region through the shell command).
+
+No attempt has been made at coding for performance.
+
+Translation table...
+
+    \(    (     (unless RE_NO_BK_PARENS set)
+    \)    )     (unless RE_NO_BK_PARENS set)
+    \|    |     (unless RE_NO_BK_VBAR set)
+    \<    \b    (not quite the same, but alla...)
+    \>    \b    (not quite the same, but alla...)
+    \`    \A
+    \'    \Z
+
+Not translated...
+
+    .
+    ^
+    $
+    *
+    +           (unless RE_BK_PLUS_QM set, then to \+)
+    ?           (unless RE_BK_PLUS_QM set, then to \?)
+    \
+    \b
+    \B
+    \w
+    \W
+    \1 ... \9
+
+Special cases...
+
+    Non-printable characters are always replaced by their 3-digit
+    escape code (except \t, \n, \r, which use mnemonic escapes)
+
+    Newline is turned into | when RE_NEWLINE_OR is set
+
+XXX To be done...
+
+    [...]     (different treatment of backslashed items?)
+    [^...]    (different treatment of backslashed items?)
+    ^ $ * + ? (in some error contexts these are probably treated differently)
+    \vDD  \DD (in the regex docs but only works when RE_ANSI_HEX set)
+
+"""
+
+
+import regex
+from regex_syntax import * # RE_* 
+
+# Master translation table: maps a regex token to its re spelling.
+# Tokens missing from the table need no translation.  convert()
+# starts from a copy of it and adjusts that for the syntax bits.
+mastertable = {
+    # grouping and alternation: backslashed in regex, bare in re
+    r'\(': '(', r'\)': ')', r'\|': '|',
+    '(': r'\(', ')': r'\)', '|': r'\|',
+    # word edges (\b is not quite the same thing)
+    r'\<': r'\b',
+    r'\>': r'\b',
+    # anchors for the very start and end of the text
+    r'\`': r'\A',
+    r'\'': r'\Z',
+    # control characters, spelled with their mnemonic escapes
+    '\t': r'\t', '\n': r'\n', '\r': r'\r',
+}
+
+
+def convert(s, syntax=None):
+    """Translate a regex-module pattern into the equivalent re pattern.
+
+    s is the pattern exactly as it would be handed to regex.compile():
+    a plain string object, with any Python string quoting and escape
+    processing already undone (e.g. by eval()).
+
+    syntax is the regex syntax variant, an integer mask as passed to
+    regex.set_syntax(); the flag bits are defined in regex_syntax.
+    When omitted or None, the mask currently in effect (as reported
+    by regex.get_syntax()) is used.
+
+    The result is a pattern string suitable for re.compile().  No
+    string quotes are added -- use quote() or repr() for that.
+
+    A lone backslash at the very end of s has nothing to escape; it is
+    copied through unchanged (instead of raising IndexError) so that
+    re.compile() can report the malformed pattern.
+
+    The conversion is not guaranteed to be correct in every case: no
+    real syntactic analysis is done, so borderline constructs such as
+    'x*?' are mistranslated and [...] sets get no special treatment.
+
+    """
+    # Work on a private copy so the module-level table stays pristine
+    table = mastertable.copy()
+    if syntax is None:
+        syntax = regex.get_syntax()
+    if syntax & RE_NO_BK_PARENS:
+        # Parens need no translation here; drop all four entries
+        del table[r'\('], table[r'\)']
+        del table['('], table[')']
+    if syntax & RE_NO_BK_VBAR:
+        # Same for the vertical bar
+        del table[r'\|']
+        del table['|']
+    if syntax & RE_BK_PLUS_QM:
+        # Swap the bare and backslashed spellings of + and ?
+        table['+'] = r'\+'
+        table['?'] = r'\?'
+        table[r'\+'] = '+'
+        table[r'\?'] = '?'
+    if syntax & RE_NEWLINE_OR:
+        table['\n'] = '|'
+    res = ""
+
+    i = 0
+    end = len(s)
+    while i < end:
+        c = s[i]
+        i = i+1
+        if c == '\\' and i < end:
+            # A backslash and the character after it form one token
+            c = c + s[i]
+            i = i+1
+        # Tokens without a table entry are copied through as they are
+        res = res + table.get(c, c)
+    return res
+
+
+def quote(s, quote=None):
+    """Return s written out as a Python string literal.
+
+    Like repr(), except that a result containing backslashes becomes
+    a raw literal (r'...' or r"...") instead of having every backslash
+    doubled.  Such a literal does *not* always evaluate back to s, but
+    it does the right thing when passed to re.compile().  Characters
+    outside printable ASCII are written as 3-digit octal escapes.
+
+    quote, if given, forces the delimiter and must be ' or ".  By
+    default ' is used, or " when s contains ' but no ".
+    """
+    if quote is None:
+        q = "'"
+        if q in s and '"' not in s:
+            q = '"'
+    else:
+        assert quote in ('"', "'")
+        q = quote
+    res = q
+    for c in s:
+        if c == q:
+            c = '\\' + c
+        elif c < ' ' or c > '~':
+            c = "\\%03o" % ord(c)
+        res = res + c
+    res = res + q
+    if '\\' in res:
+        res = 'r' + res
+    return res
+
+
+def main():
+    """Script entry point.  NOTE: stdin is eval()ed -- trusted input only."""
+    import sys
+    pattern = eval(sys.stdin.read())
+    sys.stdout.write(quote(convert(pattern)))
+    if sys.stdout.isatty():
+        sys.stdout.write("\n")
+
+
+if __name__ == '__main__':  # executed as a script rather than imported
+    main()
author	Guido van Rossum <guido@python.org>	1997-10-23 22:43:50 (GMT)
committer	Guido van Rossum <guido@python.org>	1997-10-23 22:43:50 (GMT)
commit	f81e5b9c780e47c8a312a69a7ff71169276a9720 (patch)
tree	7ab590f66035617f50a77c7fe36a5d007f29b426 /Lib/reconvert.py
parent	1fef18118339237de025ed15dc6df4c39315b55d (diff)
download	cpython-f81e5b9c780e47c8a312a69a7ff71169276a9720.zip cpython-f81e5b9c780e47c8a312a69a7ff71169276a9720.tar.gz cpython-f81e5b9c780e47c8a312a69a7ff71169276a9720.tar.bz2