diff options
author | Tal Einat <taleinat@gmail.com> | 2014-07-16 13:33:36 (GMT) |
---|---|---|
committer | Tal Einat <taleinat@gmail.com> | 2014-07-16 13:33:36 (GMT) |
commit | 9b7f9e6c5a4f0d31333b7f10fac0c6c92b8e53bc (patch) | |
tree | 41d6a12dd85ebf2fc67310d78635737efdfb3aa5 /Lib/idlelib/HyperParser.py | |
parent | cfa8950aaba03b1b3f6dc03e700f69edb88b1d67 (diff) | |
download | cpython-9b7f9e6c5a4f0d31333b7f10fac0c6c92b8e53bc.zip cpython-9b7f9e6c5a4f0d31333b7f10fac0c6c92b8e53bc.tar.gz cpython-9b7f9e6c5a4f0d31333b7f10fac0c6c92b8e53bc.tar.bz2 |
Issue #21765: Add support for non-ascii identifiers to HyperParser
Diffstat (limited to 'Lib/idlelib/HyperParser.py')
-rw-r--r-- | Lib/idlelib/HyperParser.py | 93 |
1 files changed, 75 insertions, 18 deletions
diff --git a/Lib/idlelib/HyperParser.py b/Lib/idlelib/HyperParser.py index d376568..77cb057 100644 --- a/Lib/idlelib/HyperParser.py +++ b/Lib/idlelib/HyperParser.py @@ -6,11 +6,24 @@ the structure of code. """ import string -import keyword +from keyword import iskeyword from idlelib import PyParse -class HyperParser: +# all ASCII chars that may be in an identifier +_ASCII_ID_CHARS = frozenset(string.ascii_letters + string.digits + "_") +# all ASCII chars that may be the first char of an identifier +_ASCII_ID_FIRST_CHARS = frozenset(string.ascii_letters + "_") + +# lookup table for whether 7-bit ASCII chars are valid in a Python identifier +_IS_ASCII_ID_CHAR = [(chr(x) in _ASCII_ID_CHARS) for x in range(128)] +# lookup table for whether 7-bit ASCII chars are valid as the first +# char in a Python identifier +_IS_ASCII_ID_FIRST_CHAR = \ + [(chr(x) in _ASCII_ID_FIRST_CHARS) for x in range(128)] + + +class HyperParser: def __init__(self, editwin, index): "To initialize, analyze the surroundings of the given index." @@ -143,26 +156,70 @@ class HyperParser: return beforeindex, afterindex - # Ascii chars that may be in a white space - _whitespace_chars = " \t\n\\" - # Ascii chars that may be in an identifier - _id_chars = string.ascii_letters + string.digits + "_" - # Ascii chars that may be the first char of an identifier - _id_first_chars = string.ascii_letters + "_" - - # Given a string and pos, return the number of chars in the - # identifier which ends at pos, or 0 if there is no such one. Saved - # words are not identifiers. - def _eat_identifier(self, str, limit, pos): + # the set of built-in identifiers which are also keywords, + # i.e. keyword.iskeyword() returns True for them + _ID_KEYWORDS = frozenset({"True", "False", "None"}) + + @classmethod + def _eat_identifier(cls, str, limit, pos): + """Given a string and pos, return the number of chars in the + identifier which ends at pos, or 0 if there is no such one. + + This ignores non-identifier eywords are not identifiers. + """ + is_ascii_id_char = _IS_ASCII_ID_CHAR + + # Start at the end (pos) and work backwards. i = pos - while i > limit and str[i-1] in self._id_chars: + + # Go backwards as long as the characters are valid ASCII + # identifier characters. This is an optimization, since it + # is faster in the common case where most of the characters + # are ASCII. + while i > limit and ( + ord(str[i - 1]) < 128 and + is_ascii_id_char[ord(str[i - 1])] + ): i -= 1 - if (i < pos and (str[i] not in self._id_first_chars or - (keyword.iskeyword(str[i:pos]) and - str[i:pos] not in {'None', 'False', 'True'}))): - i = pos + + # If the above loop ended due to reaching a non-ASCII + # character, continue going backwards using the most generic + # test for whether a string contains only valid identifier + # characters. + if i > limit and ord(str[i - 1]) >= 128: + while i - 4 >= limit and ('a' + str[i - 4:pos]).isidentifier(): + i -= 4 + if i - 2 >= limit and ('a' + str[i - 2:pos]).isidentifier(): + i -= 2 + if i - 1 >= limit and ('a' + str[i - 1:pos]).isidentifier(): + i -= 1 + + # The identifier candidate starts here. If it isn't a valid + # identifier, don't eat anything. At this point that is only + # possible if the first character isn't a valid first + # character for an identifier. + if not str[i:pos].isidentifier(): + return 0 + elif i < pos: + # All characters in str[i:pos] are valid ASCII identifier + # characters, so it is enough to check that the first is + # valid as the first character of an identifier. + if not _IS_ASCII_ID_FIRST_CHAR[ord(str[i])]: + return 0 + + # All keywords are valid identifiers, but should not be + # considered identifiers here, except for True, False and None. + if i < pos and ( + iskeyword(str[i:pos]) and + str[i:pos] not in cls._ID_KEYWORDS + ): + return 0 + return pos - i + # This string includes all chars that may be in a white space + _whitespace_chars = " \t\n\\" + def get_expression(self): """Return a string with the Python expression which ends at the given index, which is empty if there is no real one. |