diff options
-rwxr-xr-x | Lib/reconvert.py | 186 |
1 files changed, 186 insertions, 0 deletions
#! /usr/bin/env python1.5

r"""Convert old ("regex") regular expressions to new syntax ("re").

When imported as a module, there are two functions, with their own
docstrings:

  convert(s, syntax=None) -- convert a regex regular expression to re syntax

  quote(s, quote=None) -- return a quoted string literal

When used as a script, read a Python string literal (or any other
expression evaluating to a string) from stdin, and write the
translated expression to stdout as a string literal.  Unless stdout is
a tty, no trailing \n is written to stdout.  This is done so that it
can be used with Emacs C-U M-| (shell-command-on-region with argument
which filters the region through the shell command).

No attempt has been made at coding for performance.

Translation table...

    \(    (     (unless RE_NO_BK_PARENS set)
    \)    )     (unless RE_NO_BK_PARENS set)
    \|    |     (unless RE_NO_BK_VBAR set)
    \<    \b    (not quite the same, but alla...)
    \>    \b    (not quite the same, but alla...)
    \`    \A
    \'    \Z

Not translated...

    .
    ^
    $
    *
    +           (unless RE_BK_PLUS_QM set, then to \+)
    ?           (unless RE_BK_PLUS_QM set, then to \?)
    \
    \b
    \B
    \w
    \W
    \1 ... \9

Special cases...

    Non-printable characters are always replaced by quote() with their
    3-digit escape code (except \t, \n, \r, which convert() turns into
    mnemonic escapes)

    Newline is turned into | when RE_NEWLINE_OR is set

XXX To be done...

    [...]     (different treatment of backslashed items?)
    [^...]    (different treatment of backslashed items?)
    ^ $ * + ? (in some error contexts these are probably treated differently)
    \vDD  \DD (in the regex docs but only works when RE_ANSI_HEX set)

"""
# NOTE: the module docstring above is a raw string on purpose -- it is
# full of backslash sequences that must be shown literally.


# The obsolete "regex" module is only consulted for the current default
# syntax mask; keep this module importable where it is not available.
try:
    import regex
except ImportError:
    regex = None

try:
    from regex_syntax import *  # RE_*
except ImportError:
    # Fallback flag bits; only the ones this module consults are defined.
    # Values follow the historical regex_syntax module.
    RE_NO_BK_PARENS = 1
    RE_NO_BK_VBAR = 2
    RE_BK_PLUS_QM = 4
    RE_NEWLINE_OR = 16

# Default translation table: maps a single character, or a backslash
# followed by one character, to its replacement in "re" syntax.
mastertable = {
    r'\<': r'\b',
    r'\>': r'\b',
    r'\`': r'\A',
    r'\'': r'\Z',
    r'\(': '(',
    r'\)': ')',
    r'\|': '|',
    '(': r'\(',
    ')': r'\)',
    '|': r'\|',
    '\t': r'\t',
    '\n': r'\n',
    '\r': r'\r',
}


def convert(s, syntax=None):
    """Convert a regex regular expression to re syntax.

    The first argument is the regular expression, as a string object,
    just like it would be passed to regex.compile().  (I.e., pass the
    actual string object -- string quotes must already have been
    removed and the standard escape processing has already been done,
    e.g. by eval().)

    The optional second argument is the regex syntax variant to be
    used.  This is an integer mask as passed to regex.set_syntax();
    the flag bits are defined in regex_syntax.  When not specified, or
    when None is given, the current regex syntax mask (as retrieved by
    regex.get_syntax()) is used -- which is 0 by default.  If the
    regex module is not available, 0 is used.

    The return value is a regular expression, as a string object that
    could be passed to re.compile().  (I.e., no string quotes have
    been added -- use quote() below, or repr().)

    A lone backslash at the very end of the pattern is copied through
    unchanged instead of raising IndexError.

    The conversion is not always guaranteed to be correct.  More
    syntactical analysis should be performed to detect borderline
    cases and decide what to do with them.  For example, 'x*?' is not
    translated correctly.

    """
    # Work on a copy so per-call syntax tweaks never leak into mastertable.
    table = mastertable.copy()
    if syntax is None:
        if regex is not None and hasattr(regex, 'get_syntax'):
            syntax = regex.get_syntax()
        else:
            syntax = 0
    if syntax & RE_NO_BK_PARENS:
        # Plain parentheses already group; nothing to translate.
        del table[r'\('], table[r'\)']
        del table['('], table[')']
    if syntax & RE_NO_BK_VBAR:
        # Plain | already means alternation.
        del table[r'\|']
        del table['|']
    if syntax & RE_BK_PLUS_QM:
        # \+ and \? are the operators; bare + and ? are literals.
        table['+'] = r'\+'
        table['?'] = r'\?'
        table[r'\+'] = '+'
        table[r'\?'] = '?'
    if syntax & RE_NEWLINE_OR:
        table['\n'] = '|'
    res = ""

    i = 0
    end = len(s)
    while i < end:
        c = s[i]
        i = i+1
        if c == '\\' and i < end:
            # Backslash plus the following character form one lookup key.
            key = c + s[i]
            i = i+1
            res = res + table.get(key, key)
        else:
            # Ordinary character, or a lone trailing backslash.
            res = res + table.get(c, c)
    return res


def quote(s, quote=None):
    """Convert a string object to a quoted string literal.

    This is similar to repr() but will return a "raw" string (r'...'
    or r"...") when the string contains backslashes, instead of
    doubling all backslashes.  The resulting string does *not* always
    evaluate to the same string as the original; however it will do
    just the right thing when passed into re.compile().

    When no quote is forced, a single quote is used unless the string
    contains single quotes and no double quotes, in which case a
    double quote is used.

    The optional second argument forces the string quote; it must be
    a single character which is a valid Python string quote.

    Known limitation: the r prefix is added whenever the result
    contains a backslash, so a string ending in a backslash yields a
    literal that Python cannot parse.

    """
    if quote is None:
        q = "'"
        altq = '"'
        if q in s and altq not in s:
            q = altq
    else:
        assert quote in ('"', "'")
        q = quote
    res = q
    for c in s:
        if c == q:
            c = '\\' + c
        elif c < ' ' or c > '~':
            # Outside printable ASCII: emit a 3-digit octal escape.
            c = "\\%03o" % ord(c)
        res = res + c
    res = res + q
    if '\\' in res:
        res = 'r' + res
    return res


def main():
    """Main program -- called when run as a script.

    Read an expression from stdin, evaluate it to obtain the regex
    pattern, and write the converted pattern to stdout as a string
    literal.  A trailing newline is written only when stdout is a tty.

    """
    import sys
    # SECURITY: eval() executes arbitrary code; only feed trusted input.
    s = eval(sys.stdin.read())
    sys.stdout.write(quote(convert(s)))
    if sys.stdout.isatty():
        sys.stdout.write("\n")


if __name__ == '__main__':
    main()