diff options
Diffstat (limited to 'Lib/idlelib/HyperParser.py')
-rw-r--r-- | Lib/idlelib/HyperParser.py | 249 |
1 files changed, 158 insertions, 91 deletions
diff --git a/Lib/idlelib/HyperParser.py b/Lib/idlelib/HyperParser.py index 4af4b08..77cb057 100644 --- a/Lib/idlelib/HyperParser.py +++ b/Lib/idlelib/HyperParser.py @@ -1,23 +1,31 @@ -""" -HyperParser -=========== -This module defines the HyperParser class, which provides advanced parsing -abilities for the ParenMatch and other extensions. -The HyperParser uses PyParser. PyParser is intended mostly to give information -on the proper indentation of code. HyperParser gives some information on the -structure of code, used by extensions to help the user. +"""Provide advanced parsing abilities for ParenMatch and other extensions. + +HyperParser uses PyParser. PyParser mostly gives information on the +proper indentation of code. HyperParser gives additional information on +the structure of code. """ import string -import keyword +from keyword import iskeyword from idlelib import PyParse -class HyperParser: +# all ASCII chars that may be in an identifier +_ASCII_ID_CHARS = frozenset(string.ascii_letters + string.digits + "_") +# all ASCII chars that may be the first char of an identifier +_ASCII_ID_FIRST_CHARS = frozenset(string.ascii_letters + "_") + +# lookup table for whether 7-bit ASCII chars are valid in a Python identifier +_IS_ASCII_ID_CHAR = [(chr(x) in _ASCII_ID_CHARS) for x in range(128)] +# lookup table for whether 7-bit ASCII chars are valid as the first +# char in a Python identifier +_IS_ASCII_ID_FIRST_CHAR = \ + [(chr(x) in _ASCII_ID_FIRST_CHARS) for x in range(128)] + + +class HyperParser: def __init__(self, editwin, index): - """Initialize the HyperParser to analyze the surroundings of the given - index. - """ + "To initialize, analyze the surroundings of the given index." self.editwin = editwin self.text = text = editwin.text @@ -33,9 +41,10 @@ class HyperParser: startat = max(lno - context, 1) startatindex = repr(startat) + ".0" stopatindex = "%d.end" % lno - # We add the newline because PyParse requires a newline at end. 
- # We add a space so that index won't be at end of line, so that - # its status will be the same as the char before it, if should. + # We add the newline because PyParse requires a newline + # at end. We add a space so that index won't be at end + # of line, so that its status will be the same as the + # char before it, if should. parser.set_str(text.get(startatindex, stopatindex)+' \n') bod = parser.find_good_parse_start( editwin._build_char_in_string_func(startatindex)) @@ -49,122 +58,175 @@ class HyperParser: else: startatindex = "1.0" stopatindex = "%d.end" % lno - # We add the newline because PyParse requires a newline at end. - # We add a space so that index won't be at end of line, so that - # its status will be the same as the char before it, if should. + # We add the newline because PyParse requires it. We add a + # space so that index won't be at end of line, so that its + # status will be the same as the char before it, if should. parser.set_str(text.get(startatindex, stopatindex)+' \n') parser.set_lo(0) - # We want what the parser has, except for the last newline and space. + # We want what the parser has, minus the last newline and space. self.rawtext = parser.str[:-2] - # As far as I can see, parser.str preserves the statement we are in, - # so that stopatindex can be used to synchronize the string with the - # text box indices. + # Parser.str apparently preserves the statement we are in, so + # that stopatindex can be used to synchronize the string with + # the text box indices. self.stopatindex = stopatindex self.bracketing = parser.get_last_stmt_bracketing() - # find which pairs of bracketing are openers. These always correspond - # to a character of rawtext. - self.isopener = [i>0 and self.bracketing[i][1] > self.bracketing[i-1][1] + # find which pairs of bracketing are openers. These always + # correspond to a character of rawtext. 
+ self.isopener = [i>0 and self.bracketing[i][1] > + self.bracketing[i-1][1] for i in range(len(self.bracketing))] self.set_index(index) def set_index(self, index): - """Set the index to which the functions relate. Note that it must be - in the same statement. + """Set the index to which the functions relate. + + The index must be in the same statement. """ - indexinrawtext = \ - len(self.rawtext) - len(self.text.get(index, self.stopatindex)) + indexinrawtext = (len(self.rawtext) - + len(self.text.get(index, self.stopatindex))) if indexinrawtext < 0: - raise ValueError("The index given is before the analyzed statement") + raise ValueError("Index %s precedes the analyzed statement" + % index) self.indexinrawtext = indexinrawtext # find the rightmost bracket to which index belongs self.indexbracket = 0 - while self.indexbracket < len(self.bracketing)-1 and \ - self.bracketing[self.indexbracket+1][0] < self.indexinrawtext: + while (self.indexbracket < len(self.bracketing)-1 and + self.bracketing[self.indexbracket+1][0] < self.indexinrawtext): self.indexbracket += 1 - if self.indexbracket < len(self.bracketing)-1 and \ - self.bracketing[self.indexbracket+1][0] == self.indexinrawtext and \ - not self.isopener[self.indexbracket+1]: + if (self.indexbracket < len(self.bracketing)-1 and + self.bracketing[self.indexbracket+1][0] == self.indexinrawtext and + not self.isopener[self.indexbracket+1]): self.indexbracket += 1 def is_in_string(self): - """Is the index given to the HyperParser is in a string?""" + """Is the index given to the HyperParser in a string?""" # The bracket to which we belong should be an opener. # If it's an opener, it has to have a character. 
- return self.isopener[self.indexbracket] and \ - self.rawtext[self.bracketing[self.indexbracket][0]] in ('"', "'") + return (self.isopener[self.indexbracket] and + self.rawtext[self.bracketing[self.indexbracket][0]] + in ('"', "'")) def is_in_code(self): - """Is the index given to the HyperParser is in a normal code?""" - return not self.isopener[self.indexbracket] or \ - self.rawtext[self.bracketing[self.indexbracket][0]] not in \ - ('#', '"', "'") + """Is the index given to the HyperParser in normal code?""" + return (not self.isopener[self.indexbracket] or + self.rawtext[self.bracketing[self.indexbracket][0]] + not in ('#', '"', "'")) def get_surrounding_brackets(self, openers='([{', mustclose=False): - """If the index given to the HyperParser is surrounded by a bracket - defined in openers (or at least has one before it), return the - indices of the opening bracket and the closing bracket (or the - end of line, whichever comes first). - If it is not surrounded by brackets, or the end of line comes before - the closing bracket and mustclose is True, returns None. + """Return bracket indexes or None. + + If the index given to the HyperParser is surrounded by a + bracket defined in openers (or at least has one before it), + return the indices of the opening bracket and the closing + bracket (or the end of line, whichever comes first). + + If it is not surrounded by brackets, or the end of line comes + before the closing bracket and mustclose is True, returns None. 
""" + bracketinglevel = self.bracketing[self.indexbracket][1] before = self.indexbracket - while not self.isopener[before] or \ - self.rawtext[self.bracketing[before][0]] not in openers or \ - self.bracketing[before][1] > bracketinglevel: + while (not self.isopener[before] or + self.rawtext[self.bracketing[before][0]] not in openers or + self.bracketing[before][1] > bracketinglevel): before -= 1 if before < 0: return None bracketinglevel = min(bracketinglevel, self.bracketing[before][1]) after = self.indexbracket + 1 - while after < len(self.bracketing) and \ - self.bracketing[after][1] >= bracketinglevel: + while (after < len(self.bracketing) and + self.bracketing[after][1] >= bracketinglevel): after += 1 beforeindex = self.text.index("%s-%dc" % (self.stopatindex, len(self.rawtext)-self.bracketing[before][0])) - if after >= len(self.bracketing) or \ - self.bracketing[after][0] > len(self.rawtext): + if (after >= len(self.bracketing) or + self.bracketing[after][0] > len(self.rawtext)): if mustclose: return None afterindex = self.stopatindex else: - # We are after a real char, so it is a ')' and we give the index - # before it. - afterindex = self.text.index("%s-%dc" % - (self.stopatindex, + # We are after a real char, so it is a ')' and we give the + # index before it. + afterindex = self.text.index( + "%s-%dc" % (self.stopatindex, len(self.rawtext)-(self.bracketing[after][0]-1))) return beforeindex, afterindex - # This string includes all chars that may be in a white space - _whitespace_chars = " \t\n\\" - # This string includes all chars that may be in an identifier - _id_chars = string.ascii_letters + string.digits + "_" - # This string includes all chars that may be the first char of an identifier - _id_first_chars = string.ascii_letters + "_" - - # Given a string and pos, return the number of chars in the identifier - # which ends at pos, or 0 if there is no such one. Saved words are not - # identifiers. 
- def _eat_identifier(self, str, limit, pos): + # the set of built-in identifiers which are also keywords, + # i.e. keyword.iskeyword() returns True for them + _ID_KEYWORDS = frozenset({"True", "False", "None"}) + + @classmethod + def _eat_identifier(cls, str, limit, pos): + """Given a string and pos, return the number of chars in the + identifier which ends at pos, or 0 if there is no such one. + + Keywords are not considered identifiers here, except for True, + False and None. + """ + is_ascii_id_char = _IS_ASCII_ID_CHAR + + # Start at the end (pos) and work backwards. i = pos - while i > limit and str[i-1] in self._id_chars: + + # Go backwards as long as the characters are valid ASCII + # identifier characters. This is an optimization, since it + # is faster in the common case where most of the characters + # are ASCII. + while i > limit and ( + ord(str[i - 1]) < 128 and + is_ascii_id_char[ord(str[i - 1])] + ): i -= 1 - if i < pos and (str[i] not in self._id_first_chars or \ - keyword.iskeyword(str[i:pos])): - i = pos + + # If the above loop ended due to reaching a non-ASCII + # character, continue going backwards using the most generic + # test for whether a string contains only valid identifier + # characters. + if i > limit and ord(str[i - 1]) >= 128: + while i - 4 >= limit and ('a' + str[i - 4:pos]).isidentifier(): + i -= 4 + if i - 2 >= limit and ('a' + str[i - 2:pos]).isidentifier(): + i -= 2 + if i - 1 >= limit and ('a' + str[i - 1:pos]).isidentifier(): + i -= 1 + + # The identifier candidate starts here. If it isn't a valid + # identifier, don't eat anything. At this point that is only + # possible if the first character isn't a valid first + # character for an identifier. + if not str[i:pos].isidentifier(): + return 0 + elif i < pos: + # All characters in str[i:pos] are valid ASCII identifier + # characters, so it is enough to check that the first is + # valid as the first character of an identifier. 
+ if not _IS_ASCII_ID_FIRST_CHAR[ord(str[i])]: + return 0 + + # All keywords are valid identifiers, but should not be + # considered identifiers here, except for True, False and None. + if i < pos and ( + iskeyword(str[i:pos]) and + str[i:pos] not in cls._ID_KEYWORDS + ): + return 0 + return pos - i + # This string includes all chars that may be in a white space + _whitespace_chars = " \t\n\\" + def get_expression(self): - """Return a string with the Python expression which ends at the given - index, which is empty if there is no real one. + """Return a string with the Python expression which ends at the + given index, which is empty if there is no real one. """ if not self.is_in_code(): - raise ValueError("get_expression should only be called if index "\ - "is inside a code.") + raise ValueError("get_expression should only be called" + "if index is inside a code.") rawtext = self.rawtext bracketing = self.bracketing @@ -177,20 +239,20 @@ class HyperParser: postdot_phase = True while 1: - # Eat whitespaces, comments, and if postdot_phase is False - one dot + # Eat whitespaces, comments, and if postdot_phase is False - a dot while 1: if pos>brck_limit and rawtext[pos-1] in self._whitespace_chars: # Eat a whitespace pos -= 1 - elif not postdot_phase and \ - pos > brck_limit and rawtext[pos-1] == '.': + elif (not postdot_phase and + pos > brck_limit and rawtext[pos-1] == '.'): # Eat a dot pos -= 1 postdot_phase = True - # The next line will fail if we are *inside* a comment, but we - # shouldn't be. - elif pos == brck_limit and brck_index > 0 and \ - rawtext[bracketing[brck_index-1][0]] == '#': + # The next line will fail if we are *inside* a comment, + # but we shouldn't be. 
+ elif (pos == brck_limit and brck_index > 0 and + rawtext[bracketing[brck_index-1][0]] == '#'): # Eat a comment brck_index -= 2 brck_limit = bracketing[brck_index][0] @@ -200,8 +262,8 @@ class HyperParser: break if not postdot_phase: - # We didn't find a dot, so the expression end at the last - # identifier pos. + # We didn't find a dot, so the expression ends at the + # last identifier pos. break ret = self._eat_identifier(rawtext, brck_limit, pos) @@ -209,13 +271,13 @@ class HyperParser: # There is an identifier to eat pos = pos - ret last_identifier_pos = pos - # Now, in order to continue the search, we must find a dot. + # Now, to continue the search, we must find a dot. postdot_phase = False # (the loop continues now) elif pos == brck_limit: - # We are at a bracketing limit. If it is a closing bracket, - # eat the bracket, otherwise, stop the search. + # We are at a bracketing limit. If it is a closing + # bracket, eat the bracket, otherwise, stop the search. level = bracketing[brck_index][1] while brck_index > 0 and bracketing[brck_index-1][1] > level: brck_index -= 1 @@ -244,3 +306,8 @@ class HyperParser: break return rawtext[last_identifier_pos:self.indexinrawtext] + + +if __name__ == '__main__': + import unittest + unittest.main('idlelib.idle_test.test_hyperparser', verbosity=2) |