diff options
-rw-r--r-- | Doc/library/re.rst | 9 | ||||
-rw-r--r-- | Doc/whatsnew/3.8.rst | 2 | ||||
-rw-r--r-- | Lib/sre_parse.py | 37 | ||||
-rw-r--r-- | Lib/test/test_re.py | 36 | ||||
-rw-r--r-- | Misc/ACKS | 1 | ||||
-rw-r--r-- | Misc/NEWS.d/next/Library/2018-02-08-18-59-11.bpo-30688.zBh4TH.rst | 2 |
6 files changed, 77 insertions, 10 deletions
diff --git a/Doc/library/re.rst b/Doc/library/re.rst index 83ebe7d..475a8d2 100644 --- a/Doc/library/re.rst +++ b/Doc/library/re.rst @@ -468,13 +468,13 @@ Most of the standard escapes supported by Python string literals are also accepted by the regular expression parser:: \a \b \f \n - \r \t \u \U - \v \x \\ + \N \r \t \u + \U \v \x \\ (Note that ``\b`` is used to represent word boundaries, and means "backspace" only inside character classes.) -``'\u'`` and ``'\U'`` escape sequences are only recognized in Unicode +``'\u'``, ``'\U'``, and ``'\N'`` escape sequences are only recognized in Unicode patterns. In bytes patterns they are errors. Octal escapes are included in a limited form. If the first digit is a 0, or if @@ -488,6 +488,9 @@ three digits in length. .. versionchanged:: 3.6 Unknown escapes consisting of ``'\'`` and an ASCII letter now are errors. +.. versionchanged:: 3.8 + The ``'\N{name}'`` escape sequence has been added. As in string literals, + it expands to the named Unicode character (e.g. ``'\N{EM DASH}'``). .. seealso:: diff --git a/Doc/whatsnew/3.8.rst b/Doc/whatsnew/3.8.rst index 60f54a0..4181981 100644 --- a/Doc/whatsnew/3.8.rst +++ b/Doc/whatsnew/3.8.rst @@ -75,6 +75,8 @@ New Features Other Language Changes ====================== +* Added support of ``\N{name}`` escapes in :mod:`regular expressions <re>`. + (Contributed by Jonathan Eunice and Serhiy Storchaka in :issue:`30688`.) 
New Modules diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py index a53735b..db01e84 100644 --- a/Lib/sre_parse.py +++ b/Lib/sre_parse.py @@ -13,6 +13,7 @@ # XXX: show string offset and offending character for all errors from sre_constants import * +import unicodedata SPECIAL_CHARS = ".\\[{()*+?^$|" REPEAT_CHARS = "*+?{" @@ -264,19 +265,19 @@ class Tokenizer: result += c self.__next() return result - def getuntil(self, terminator): + def getuntil(self, terminator, name): result = '' while True: c = self.next self.__next() if c is None: if not result: - raise self.error("missing group name") + raise self.error("missing " + name) raise self.error("missing %s, unterminated name" % terminator, len(result)) if c == terminator: if not result: - raise self.error("missing group name", 1) + raise self.error("missing " + name, 1) break result += c return result @@ -322,6 +323,17 @@ def _class_escape(source, escape): c = int(escape[2:], 16) chr(c) # raise ValueError for invalid code return LITERAL, c + elif c == "N" and source.istext: + # named unicode escape e.g. \N{EM DASH} + if not source.match('{'): + raise source.error("missing {") + charname = source.getuntil('}', 'character name') + try: + c = ord(unicodedata.lookup(charname)) + except KeyError: + raise source.error("undefined character name %r" % charname, + len(charname) + len(r'\N{}')) + return LITERAL, c elif c in OCTDIGITS: # octal escape (up to three digits) escape += source.getwhile(2, OCTDIGITS) @@ -370,6 +382,17 @@ def _escape(source, escape, state): c = int(escape[2:], 16) chr(c) # raise ValueError for invalid code return LITERAL, c + elif c == "N" and source.istext: + # named unicode escape e.g. 
\N{EM DASH} + if not source.match('{'): + raise source.error("missing {") + charname = source.getuntil('}', 'character name') + try: + c = ord(unicodedata.lookup(charname)) + except KeyError: + raise source.error("undefined character name %r" % charname, + len(charname) + len(r'\N{}')) + return LITERAL, c elif c == "0": # octal escape escape += source.getwhile(2, OCTDIGITS) @@ -679,13 +702,13 @@ def _parse(source, state, verbose, nested, first=False): # python extensions if sourcematch("<"): # named group: skip forward to end of name - name = source.getuntil(">") + name = source.getuntil(">", "group name") if not name.isidentifier(): msg = "bad character in group name %r" % name raise source.error(msg, len(name) + 1) elif sourcematch("="): # named backreference - name = source.getuntil(")") + name = source.getuntil(")", "group name") if not name.isidentifier(): msg = "bad character in group name %r" % name raise source.error(msg, len(name) + 1) @@ -748,7 +771,7 @@ def _parse(source, state, verbose, nested, first=False): elif char == "(": # conditional backreference group - condname = source.getuntil(")") + condname = source.getuntil(")", "group name") if condname.isidentifier(): condgroup = state.groupdict.get(condname) if condgroup is None: @@ -977,7 +1000,7 @@ def parse_template(source, pattern): name = "" if not s.match("<"): raise s.error("missing <") - name = s.getuntil(">") + name = s.getuntil(">", "group name") if name.isidentifier(): try: index = groupindex[name] diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 9fed4be..ab1d985 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -694,6 +694,42 @@ class ReTests(unittest.TestCase): with self.subTest(c): self.assertRaises(re.error, re.compile, '[\\%c]' % c) + def test_named_unicode_escapes(self): + # test individual Unicode named escapes + self.assertTrue(re.match(r'\N{LESS-THAN SIGN}', '<')) + self.assertTrue(re.match(r'\N{less-than sign}', '<')) + 
self.assertIsNone(re.match(r'\N{LESS-THAN SIGN}', '>')) + self.assertTrue(re.match(r'\N{SNAKE}', '\U0001f40d')) + self.assertTrue(re.match(r'\N{ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH ' + r'HAMZA ABOVE WITH ALEF MAKSURA ISOLATED FORM}', + '\ufbf9')) + self.assertTrue(re.match(r'[\N{LESS-THAN SIGN}-\N{GREATER-THAN SIGN}]', + '=')) + self.assertIsNone(re.match(r'[\N{LESS-THAN SIGN}-\N{GREATER-THAN SIGN}]', + ';')) + + # test errors in \N{name} handling - only valid names should pass + self.checkPatternError(r'\N', 'missing {', 2) + self.checkPatternError(r'[\N]', 'missing {', 3) + self.checkPatternError(r'\N{', 'missing character name', 3) + self.checkPatternError(r'[\N{', 'missing character name', 4) + self.checkPatternError(r'\N{}', 'missing character name', 3) + self.checkPatternError(r'[\N{}]', 'missing character name', 4) + self.checkPatternError(r'\NSNAKE}', 'missing {', 2) + self.checkPatternError(r'[\NSNAKE}]', 'missing {', 3) + self.checkPatternError(r'\N{SNAKE', + 'missing }, unterminated name', 3) + self.checkPatternError(r'[\N{SNAKE]', + 'missing }, unterminated name', 4) + self.checkPatternError(r'[\N{SNAKE]}', + "undefined character name 'SNAKE]'", 1) + self.checkPatternError(r'\N{SPAM}', + "undefined character name 'SPAM'", 0) + self.checkPatternError(r'[\N{SPAM}]', + "undefined character name 'SPAM'", 1) + self.checkPatternError(br'\N{LESS-THAN SIGN}', r'bad escape \N', 0) + self.checkPatternError(br'[\N{LESS-THAN SIGN}]', r'bad escape \N', 1) + def test_string_boundaries(self): # See http://bugs.python.org/issue10713 self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1), @@ -441,6 +441,7 @@ Andy Eskilsson André Espaze Stefan Esser Nicolas Estibals +Jonathan Eunice Carey Evans Stephen D Evans Tim Everett diff --git a/Misc/NEWS.d/next/Library/2018-02-08-18-59-11.bpo-30688.zBh4TH.rst b/Misc/NEWS.d/next/Library/2018-02-08-18-59-11.bpo-30688.zBh4TH.rst new file mode 100644 index 0000000..7d31680 --- /dev/null +++ 
b/Misc/NEWS.d/next/Library/2018-02-08-18-59-11.bpo-30688.zBh4TH.rst @@ -0,0 +1,2 @@ +Added support for ``\N{name}`` escapes in regular expressions. Based on +patch by Jonathan Eunice. |