From 9b7f9e6c5a4f0d31333b7f10fac0c6c92b8e53bc Mon Sep 17 00:00:00 2001 From: Tal Einat Date: Wed, 16 Jul 2014 16:33:36 +0300 Subject: Issue #21765: Add support for non-ascii identifiers to HyperParser --- Lib/idlelib/HyperParser.py | 93 +++++++++++++++++++++++++------ Lib/idlelib/PyParse.py | 80 +++++++++++++++++--------- Lib/idlelib/idle_test/test_hyperparser.py | 82 +++++++++++++++++++++++++++ Misc/NEWS | 2 + 4 files changed, 213 insertions(+), 44 deletions(-) diff --git a/Lib/idlelib/HyperParser.py b/Lib/idlelib/HyperParser.py index d376568..77cb057 100644 --- a/Lib/idlelib/HyperParser.py +++ b/Lib/idlelib/HyperParser.py @@ -6,11 +6,24 @@ the structure of code. """ import string -import keyword +from keyword import iskeyword from idlelib import PyParse -class HyperParser: +# all ASCII chars that may be in an identifier +_ASCII_ID_CHARS = frozenset(string.ascii_letters + string.digits + "_") +# all ASCII chars that may be the first char of an identifier +_ASCII_ID_FIRST_CHARS = frozenset(string.ascii_letters + "_") + +# lookup table for whether 7-bit ASCII chars are valid in a Python identifier +_IS_ASCII_ID_CHAR = [(chr(x) in _ASCII_ID_CHARS) for x in range(128)] +# lookup table for whether 7-bit ASCII chars are valid as the first +# char in a Python identifier +_IS_ASCII_ID_FIRST_CHAR = \ + [(chr(x) in _ASCII_ID_FIRST_CHARS) for x in range(128)] + + +class HyperParser: def __init__(self, editwin, index): "To initialize, analyze the surroundings of the given index." @@ -143,26 +156,70 @@ class HyperParser: return beforeindex, afterindex - # Ascii chars that may be in a white space - _whitespace_chars = " \t\n\\" - # Ascii chars that may be in an identifier - _id_chars = string.ascii_letters + string.digits + "_" - # Ascii chars that may be the first char of an identifier - _id_first_chars = string.ascii_letters + "_" - - # Given a string and pos, return the number of chars in the - # identifier which ends at pos, or 0 if there is no such one. Saved - # words are not identifiers. - def _eat_identifier(self, str, limit, pos): + # the set of built-in identifiers which are also keywords, + # i.e. keyword.iskeyword() returns True for them + _ID_KEYWORDS = frozenset({"True", "False", "None"}) + + @classmethod + def _eat_identifier(cls, str, limit, pos): + """Given a string and pos, return the number of chars in the + identifier which ends at pos, or 0 if there is no such one. + + This ignores non-identifier eywords are not identifiers. + """ + is_ascii_id_char = _IS_ASCII_ID_CHAR + + # Start at the end (pos) and work backwards. i = pos - while i > limit and str[i-1] in self._id_chars: + + # Go backwards as long as the characters are valid ASCII + # identifier characters. This is an optimization, since it + # is faster in the common case where most of the characters + # are ASCII. + while i > limit and ( + ord(str[i - 1]) < 128 and + is_ascii_id_char[ord(str[i - 1])] + ): i -= 1 - if (i < pos and (str[i] not in self._id_first_chars or - (keyword.iskeyword(str[i:pos]) and - str[i:pos] not in {'None', 'False', 'True'}))): - i = pos + + # If the above loop ended due to reaching a non-ASCII + # character, continue going backwards using the most generic + # test for whether a string contains only valid identifier + # characters. + if i > limit and ord(str[i - 1]) >= 128: + while i - 4 >= limit and ('a' + str[i - 4:pos]).isidentifier(): + i -= 4 + if i - 2 >= limit and ('a' + str[i - 2:pos]).isidentifier(): + i -= 2 + if i - 1 >= limit and ('a' + str[i - 1:pos]).isidentifier(): + i -= 1 + + # The identifier candidate starts here. If it isn't a valid + # identifier, don't eat anything. At this point that is only + # possible if the first character isn't a valid first + # character for an identifier. + if not str[i:pos].isidentifier(): + return 0 + elif i < pos: + # All characters in str[i:pos] are valid ASCII identifier + # characters, so it is enough to check that the first is + # valid as the first character of an identifier. + if not _IS_ASCII_ID_FIRST_CHAR[ord(str[i])]: + return 0 + + # All keywords are valid identifiers, but should not be + # considered identifiers here, except for True, False and None. + if i < pos and ( + iskeyword(str[i:pos]) and + str[i:pos] not in cls._ID_KEYWORDS + ): + return 0 + return pos - i + # This string includes all chars that may be in a white space + _whitespace_chars = " \t\n\\" + def get_expression(self): """Return a string with the Python expression which ends at the given index, which is empty if there is no real one. diff --git a/Lib/idlelib/PyParse.py b/Lib/idlelib/PyParse.py index 61a0003..3e501ca 100644 --- a/Lib/idlelib/PyParse.py +++ b/Lib/idlelib/PyParse.py @@ -1,5 +1,7 @@ import re import sys +from collections import Mapping +from functools import partial # Reason last stmt is continued (or C_NONE if it's not). (C_NONE, C_BACKSLASH, C_STRING_FIRST_LINE, @@ -91,19 +93,48 @@ _chew_ordinaryre = re.compile(r""" [^[\](){}#'"\\]+ """, re.VERBOSE).match -# Build translation table to map uninteresting chars to "x", open -# brackets to "(", and close brackets to ")". -_tran = {} -for i in range(256): - _tran[i] = 'x' -for ch in "({[": - _tran[ord(ch)] = '(' -for ch in ")}]": - _tran[ord(ch)] = ')' -for ch in "\"'\\\n#": - _tran[ord(ch)] = ch -del i, ch +class StringTranslatePseudoMapping(Mapping): + r"""Utility class to be used with str.translate() + + This Mapping class wraps a given dict. When a value for a key is + requested via __getitem__() or get(), the key is looked up in the + given dict. If found there, the value from the dict is returned. + Otherwise, the default value given upon initialization is returned. + + This allows using str.translate() to make some replacements, and to + replace all characters for which no replacement was specified with + a given character instead of leaving them as-is. + + For example, to replace everything except whitespace with 'x': + + >>> whitespace_chars = ' \t\n\r' + >>> preserve_dict = {ord(c): ord(c) for c in whitespace_chars} + >>> mapping = StringTranslatePseudoMapping(preserve_dict, ord('x')) + >>> text = "a + b\tc\nd" + >>> text.translate(mapping) + 'x x x\tx\nx' + """ + def __init__(self, non_defaults, default_value): + self._non_defaults = non_defaults + self._default_value = default_value + + def _get(key, _get=non_defaults.get, _default=default_value): + return _get(key, _default) + self._get = _get + + def __getitem__(self, item): + return self._get(item) + + def __len__(self): + return len(self._non_defaults) + + def __iter__(self): + return iter(self._non_defaults) + + def get(self, key, default=None): + return self._get(key) + class Parser: @@ -113,19 +144,6 @@ class Parser: def set_str(self, s): assert len(s) == 0 or s[-1] == '\n' - if isinstance(s, str): - # The parse functions have no idea what to do with Unicode, so - # replace all Unicode characters with "x". This is "safe" - # so long as the only characters germane to parsing the structure - # of Python are 7-bit ASCII. It's *necessary* because Unicode - # strings don't have a .translate() method that supports - # deletechars. - uniphooey = s - s = [] - push = s.append - for raw in map(ord, uniphooey): - push(raw < 127 and chr(raw) or "x") - s = "".join(s) self.str = s self.study_level = 0 @@ -197,6 +215,16 @@ class Parser: if lo > 0: self.str = self.str[lo:] + # Build a translation table to map uninteresting chars to 'x', open + # brackets to '(', close brackets to ')' while preserving quotes, + # backslashes, newlines and hashes. This is to be passed to + # str.translate() in _study1(). + _tran = {} + _tran.update((ord(c), ord('(')) for c in "({[") + _tran.update((ord(c), ord(')')) for c in ")}]") + _tran.update((ord(c), ord(c)) for c in "\"'\\\n#") + _tran = StringTranslatePseudoMapping(_tran, default_value=ord('x')) + # As quickly as humanly possible , find the line numbers (0- # based) of the non-continuation lines. # Creates self.{goodlines, continuation}. @@ -211,7 +239,7 @@ class Parser: # uninteresting characters. This can cut the number of chars # by a factor of 10-40, and so greatly speed the following loop. str = self.str - str = str.translate(_tran) + str = str.translate(self._tran) str = str.replace('xxxxxxxx', 'x') str = str.replace('xxxx', 'x') str = str.replace('xx', 'x') diff --git a/Lib/idlelib/idle_test/test_hyperparser.py b/Lib/idlelib/idle_test/test_hyperparser.py index 68e97fc..edfc783 100644 --- a/Lib/idlelib/idle_test/test_hyperparser.py +++ b/Lib/idlelib/idle_test/test_hyperparser.py @@ -30,6 +30,7 @@ class HyperParserTest(unittest.TestCase): "z = ((r'asdf')+('a')))\n" '[x for x in\n' 'for = False\n' + 'cliché = "this is a string with unicode, what a cliché"' ) @classmethod @@ -93,6 +94,8 @@ class HyperParserTest(unittest.TestCase): self.assertTrue(p.is_in_string()) p = get('4.6') self.assertTrue(p.is_in_string()) + p = get('12.54') + self.assertTrue(p.is_in_string()) def test_is_in_code(self): get = self.get_parser @@ -180,12 +183,91 @@ class HyperParserTest(unittest.TestCase): p = get('10.0') self.assertEqual(p.get_expression(), '') + p = get('10.6') + self.assertEqual(p.get_expression(), '') + + p = get('10.11') + self.assertEqual(p.get_expression(), '') + p = get('11.3') self.assertEqual(p.get_expression(), '') p = get('11.11') self.assertEqual(p.get_expression(), 'False') + p = get('12.6') + self.assertEqual(p.get_expression(), 'cliché') + + def test_eat_identifier(self): + def is_valid_id(candidate): + result = HyperParser._eat_identifier(candidate, 0, len(candidate)) + if result == len(candidate): + return True + elif result == 0: + return False + else: + err_msg = "Unexpected result: {} (expected 0 or {}".format( + result, len(candidate) + ) + raise Exception(err_msg) + + # invalid first character which is valid elsewhere in an identifier + self.assertFalse(is_valid_id('2notid')) + + # ASCII-only valid identifiers + self.assertTrue(is_valid_id('valid_id')) + self.assertTrue(is_valid_id('_valid_id')) + self.assertTrue(is_valid_id('valid_id_')) + self.assertTrue(is_valid_id('_2valid_id')) + + # keywords which should be "eaten" + self.assertTrue(is_valid_id('True')) + self.assertTrue(is_valid_id('False')) + self.assertTrue(is_valid_id('None')) + + # keywords which should not be "eaten" + self.assertFalse(is_valid_id('for')) + self.assertFalse(is_valid_id('import')) + self.assertFalse(is_valid_id('return')) + + # valid unicode identifiers + self.assertTrue(is_valid_id('cliche')) + self.assertTrue(is_valid_id('cliché')) + self.assertTrue(is_valid_id('a٢')) + + # invalid unicode identifiers + self.assertFalse(is_valid_id('2a')) + self.assertFalse(is_valid_id('٢a')) + self.assertFalse(is_valid_id('a²')) + + # valid identifier after "punctuation" + self.assertEqual(HyperParser._eat_identifier('+ var', 0, 5), len('var')) + self.assertEqual(HyperParser._eat_identifier('+var', 0, 4), len('var')) + self.assertEqual(HyperParser._eat_identifier('.var', 0, 4), len('var')) + + # invalid identifiers + self.assertFalse(is_valid_id('+')) + self.assertFalse(is_valid_id(' ')) + self.assertFalse(is_valid_id(':')) + self.assertFalse(is_valid_id('?')) + self.assertFalse(is_valid_id('^')) + self.assertFalse(is_valid_id('\\')) + self.assertFalse(is_valid_id('"')) + self.assertFalse(is_valid_id('"a string"')) + + def test_eat_identifier_various_lengths(self): + eat_id = HyperParser._eat_identifier + + for length in range(1, 21): + self.assertEqual(eat_id('a' * length, 0, length), length) + self.assertEqual(eat_id('é' * length, 0, length), length) + self.assertEqual(eat_id('a' + '2' * (length - 1), 0, length), length) + self.assertEqual(eat_id('é' + '2' * (length - 1), 0, length), length) + self.assertEqual(eat_id('é' + 'a' * (length - 1), 0, length), length) + self.assertEqual(eat_id('é' * (length - 1) + 'a', 0, length), length) + self.assertEqual(eat_id('+' * length, 0, length), 0) + self.assertEqual(eat_id('2' + 'a' * (length - 1), 0, length), 0) + self.assertEqual(eat_id('2' + 'é' * (length - 1), 0, length), 0) if __name__ == '__main__': unittest.main(verbosity=2) diff --git a/Misc/NEWS b/Misc/NEWS index 9425dc9..1280134 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -171,6 +171,8 @@ Build IDLE ---- +- Issue #21765: Add support for non-ascii identifiers to HyperParser. + - Issue #21940: Add unittest for WidgetRedirector. Initial patch by Saimadhav Heblikar. -- cgit v0.12