summaryrefslogtreecommitdiffstats
path: root/Lib/lib2to3/btm_utils.py
diff options
context:
space:
mode:
authorBenjamin Peterson <benjamin@python.org>2010-10-14 23:03:32 (GMT)
committerBenjamin Peterson <benjamin@python.org>2010-10-14 23:03:32 (GMT)
commitb0871cac113961499b170f3470428b25fe2a432d (patch)
treeececab0680cb705507850c7ef83b285dfa3bc436 /Lib/lib2to3/btm_utils.py
parentea5d827b729e425751428153318ecc348cc0be50 (diff)
downloadcpython-b0871cac113961499b170f3470428b25fe2a432d.zip
cpython-b0871cac113961499b170f3470428b25fe2a432d.tar.gz
cpython-b0871cac113961499b170f3470428b25fe2a432d.tar.bz2
Merged revisions 85510 via svnmerge from
svn+ssh://pythondev@svn.python.org/python/branches/py3k ................ r85510 | benjamin.peterson | 2010-10-14 18:00:04 -0500 (Thu, 14 Oct 2010) | 61 lines Merged revisions 83852-83853,83857,84042,84216,84274-84276,84375,85388,85478,85506-85508 via svnmerge from svn+ssh://pythondev@svn.python.org/sandbox/trunk/2to3/lib2to3 ........ r83852 | benjamin.peterson | 2010-08-08 15:45:44 -0500 (Sun, 08 Aug 2010) | 1 line wrap with parens ........ r83853 | benjamin.peterson | 2010-08-08 15:46:31 -0500 (Sun, 08 Aug 2010) | 1 line use parens ........ r83857 | benjamin.peterson | 2010-08-08 15:59:49 -0500 (Sun, 08 Aug 2010) | 1 line things which use touch_import should be pre order ........ r84042 | george.boutsioukis | 2010-08-14 16:10:19 -0500 (Sat, 14 Aug 2010) | 2 lines This revision incorporates into the 2to3 tool the new, faster, tree matching algorithm developed during a GSOC project. The algorithm resides in the two added modules, btm_matcher and btm_utils. New code has been added to drive the new matching process in refactor.py and a few minor changes were made in other modules. A BM_compatible flag(False by default) has been added in fixer_base and it is set to True in most of the current fixers. ........ r84216 | benjamin.peterson | 2010-08-19 16:44:05 -0500 (Thu, 19 Aug 2010) | 1 line allow star_expr in testlist_gexp ........ r84274 | benjamin.peterson | 2010-08-22 18:40:46 -0500 (Sun, 22 Aug 2010) | 1 line wrap long line ........ r84275 | benjamin.peterson | 2010-08-22 18:42:22 -0500 (Sun, 22 Aug 2010) | 1 line cleanup ........ r84276 | benjamin.peterson | 2010-08-22 18:51:01 -0500 (Sun, 22 Aug 2010) | 1 line when there's a None value and a traceback, don't call type with it #9661 ........ r84375 | george.boutsioukis | 2010-08-31 08:38:53 -0500 (Tue, 31 Aug 2010) | 3 lines Idiomatic code changes & stylistic issues fixed in the BottomMatcher module. Thanks to Benjamin Peterson for taking the time to review the code. ........ r85388 | benjamin.peterson | 2010-10-12 17:27:44 -0500 (Tue, 12 Oct 2010) | 1 line fix urllib fixer with multiple as imports on a line #10069 ........ r85478 | benjamin.peterson | 2010-10-14 08:09:56 -0500 (Thu, 14 Oct 2010) | 1 line stop abusing docstrings ........ r85506 | benjamin.peterson | 2010-10-14 17:45:19 -0500 (Thu, 14 Oct 2010) | 1 line kill sibling import ........ r85507 | benjamin.peterson | 2010-10-14 17:54:15 -0500 (Thu, 14 Oct 2010) | 1 line remove trailing whitespace ........ r85508 | benjamin.peterson | 2010-10-14 17:55:28 -0500 (Thu, 14 Oct 2010) | 1 line typo ........ ................
Diffstat (limited to 'Lib/lib2to3/btm_utils.py')
-rw-r--r--Lib/lib2to3/btm_utils.py283
1 files changed, 283 insertions, 0 deletions
diff --git a/Lib/lib2to3/btm_utils.py b/Lib/lib2to3/btm_utils.py
new file mode 100644
index 0000000..2276dc9
--- /dev/null
+++ b/Lib/lib2to3/btm_utils.py
@@ -0,0 +1,283 @@
+"Utility functions used by the btm_matcher module"
+
+from . import pytree
+from .pgen2 import grammar, token
+from .pygram import pattern_symbols, python_symbols
+
+syms = pattern_symbols
+pysyms = python_symbols
+tokens = grammar.opmap
+token_labels = token
+
+TYPE_ANY = -1
+TYPE_ALTERNATIVES = -2
+TYPE_GROUP = -3
+
+class MinNode(object):
+ """This class serves as an intermediate representation of the
+ pattern tree during the conversion to sets of leaf-to-root
+ subpatterns"""
+
+ def __init__(self, type=None, name=None):
+ self.type = type
+ self.name = name
+ self.children = []
+ self.leaf = False
+ self.parent = None
+ self.alternatives = []
+ self.group = []
+
+ def __repr__(self):
+ return str(self.type) + ' ' + str(self.name)
+
+ def leaf_to_root(self):
+ """Internal method. Returns a characteristic path of the
+ pattern tree. This method must be run for all leaves until the
+ linear subpatterns are merged into a single"""
+ node = self
+ subp = []
+ while node:
+ if node.type == TYPE_ALTERNATIVES:
+ node.alternatives.append(subp)
+ if len(node.alternatives) == len(node.children):
+ #last alternative
+ subp = [tuple(node.alternatives)]
+ node.alternatives = []
+ node = node.parent
+ continue
+ else:
+ node = node.parent
+ subp = None
+ break
+
+ if node.type == TYPE_GROUP:
+ node.group.append(subp)
+ #probably should check the number of leaves
+ if len(node.group) == len(node.children):
+ subp = get_characteristic_subpattern(node.group)
+ node.group = []
+ node = node.parent
+ continue
+ else:
+ node = node.parent
+ subp = None
+ break
+
+ if node.type == token_labels.NAME and node.name:
+ #in case of type=name, use the name instead
+ subp.append(node.name)
+ else:
+ subp.append(node.type)
+
+ node = node.parent
+ return subp
+
+ def get_linear_subpattern(self):
+ """Drives the leaf_to_root method. The reason that
+ leaf_to_root must be run multiple times is because we need to
+ reject 'group' matches; for example the alternative form
+ (a | b c) creates a group [b c] that needs to be matched. Since
+ matching multiple linear patterns overcomes the automaton's
+ capabilities, leaf_to_root merges each group into a single
+ choice based on 'characteristic'ity,
+
+ i.e. (a|b c) -> (a|b) if b more characteristic than c
+
+ Returns: The most 'characteristic'(as defined by
+ get_characteristic_subpattern) path for the compiled pattern
+ tree.
+ """
+
+ for l in self.leaves():
+ subp = l.leaf_to_root()
+ if subp:
+ return subp
+
+ def leaves(self):
+ "Generator that returns the leaves of the tree"
+ for child in self.children:
+ for x in child.leaves():
+ yield x
+ if not self.children:
+ yield self
+
+def reduce_tree(node, parent=None):
+ """
+ Internal function. Reduces a compiled pattern tree to an
+ intermediate representation suitable for feeding the
+ automaton. This also trims off any optional pattern elements(like
+ [a], a*).
+ """
+
+ new_node = None
+ #switch on the node type
+ if node.type == syms.Matcher:
+ #skip
+ node = node.children[0]
+
+ if node.type == syms.Alternatives :
+ #2 cases
+ if len(node.children) <= 2:
+ #just a single 'Alternative', skip this node
+ new_node = reduce_tree(node.children[0], parent)
+ else:
+ #real alternatives
+ new_node = MinNode(type=TYPE_ALTERNATIVES)
+ #skip odd children('|' tokens)
+ for child in node.children:
+ if node.children.index(child)%2:
+ continue
+ reduced = reduce_tree(child, new_node)
+ if reduced is not None:
+ new_node.children.append(reduced)
+ elif node.type == syms.Alternative:
+ if len(node.children) > 1:
+
+ new_node = MinNode(type=TYPE_GROUP)
+ for child in node.children:
+ reduced = reduce_tree(child, new_node)
+ if reduced:
+ new_node.children.append(reduced)
+ if not new_node.children:
+ # delete the group if all of the children were reduced to None
+ new_node = None
+
+ else:
+ new_node = reduce_tree(node.children[0], parent)
+
+ elif node.type == syms.Unit:
+ if (isinstance(node.children[0], pytree.Leaf) and
+ node.children[0].value == '('):
+ #skip parentheses
+ return reduce_tree(node.children[1], parent)
+ if ((isinstance(node.children[0], pytree.Leaf) and
+ node.children[0].value == '[')
+ or
+ (len(node.children)>1 and
+ hasattr(node.children[1], "value") and
+ node.children[1].value == '[')):
+ #skip whole unit if its optional
+ return None
+
+ leaf = True
+ details_node = None
+ alternatives_node = None
+ has_repeater = False
+ repeater_node = None
+ has_variable_name = False
+
+ for child in node.children:
+ if child.type == syms.Details:
+ leaf = False
+ details_node = child
+ elif child.type == syms.Repeater:
+ has_repeater = True
+ repeater_node = child
+ elif child.type == syms.Alternatives:
+ alternatives_node = child
+ if hasattr(child, 'value') and child.value == '=': # variable name
+ has_variable_name = True
+
+ #skip variable name
+ if has_variable_name:
+ #skip variable name, '='
+ name_leaf = node.children[2]
+ if hasattr(name_leaf, 'value') and name_leaf.value == '(':
+ # skip parenthesis
+ name_leaf = node.children[3]
+ else:
+ name_leaf = node.children[0]
+
+ #set node type
+ if name_leaf.type == token_labels.NAME:
+ #(python) non-name or wildcard
+ if name_leaf.value == 'any':
+ new_node = MinNode(type=TYPE_ANY)
+ else:
+ if hasattr(token_labels, name_leaf.value):
+ new_node = MinNode(type=getattr(token_labels, name_leaf.value))
+ else:
+ new_node = MinNode(type=getattr(pysyms, name_leaf.value))
+
+ elif name_leaf.type == token_labels.STRING:
+ #(python) name or character; remove the apostrophes from
+ #the string value
+ name = name_leaf.value.strip("'")
+ if name in tokens:
+ new_node = MinNode(type=tokens[name])
+ else:
+ new_node = MinNode(type=token_labels.NAME, name=name)
+ elif name_leaf.type == syms.Alternatives:
+ new_node = reduce_tree(alternatives_node, parent)
+
+ #handle repeaters
+ if has_repeater:
+ if repeater_node.children[0].value == '*':
+ #reduce to None
+ new_node = None
+ elif repeater_node.children[0].value == '+':
+ #reduce to a single occurence i.e. do nothing
+ pass
+ else:
+ #TODO: handle {min, max} repeaters
+ raise NotImplementedError
+ pass
+
+ #add children
+ if details_node and new_node is not None:
+ for child in details_node.children[1:-1]:
+ #skip '<', '>' markers
+ reduced = reduce_tree(child, new_node)
+ if reduced is not None:
+ new_node.children.append(reduced)
+ if new_node:
+ new_node.parent = parent
+ return new_node
+
+
+def get_characteristic_subpattern(subpatterns):
+ """Picks the most characteristic from a list of linear patterns
+ Current order used is:
+ names > common_names > common_chars
+ """
+ if not isinstance(subpatterns, list):
+ return subpatterns
+ if len(subpatterns)==1:
+ return subpatterns[0]
+
+ # first pick out the ones containing variable names
+ subpatterns_with_names = []
+ subpatterns_with_common_names = []
+ common_names = ['in', 'for', 'if' , 'not', 'None']
+ subpatterns_with_common_chars = []
+ common_chars = "[]().,:"
+ for subpattern in subpatterns:
+ if any(rec_test(subpattern, lambda x: type(x) is str)):
+ if any(rec_test(subpattern,
+ lambda x: isinstance(x, str) and x in common_chars)):
+ subpatterns_with_common_chars.append(subpattern)
+ elif any(rec_test(subpattern,
+ lambda x: isinstance(x, str) and x in common_names)):
+ subpatterns_with_common_names.append(subpattern)
+
+ else:
+ subpatterns_with_names.append(subpattern)
+
+ if subpatterns_with_names:
+ subpatterns = subpatterns_with_names
+ elif subpatterns_with_common_names:
+ subpatterns = subpatterns_with_common_names
+ elif subpatterns_with_common_chars:
+ subpatterns = subpatterns_with_common_chars
+ # of the remaining subpatterns pick out the longest one
+ return max(subpatterns, key=len)
+
+def rec_test(sequence, test_func):
+ """Tests test_func on all items of sequence and items of included
+ sub-iterables"""
+ for x in sequence:
+ if isinstance(x, (list, tuple)):
+ for y in rec_test(x, test_func):
+ yield y
+ else:
+ yield test_func(x)