bpo-30688: Support \N{name} escapes in re patterns. (GH-5588)

Co-authored-by: Jonathan Eunice <jonathan.eunice@gmail.com>
author: Serhiy Storchaka <storchaka@gmail.com> 2018-02-09 22:08:17 (GMT)
committer: GitHub <noreply@github.com> 2018-02-09 22:08:17 (GMT)
commit: a445feb72902e4a3c5ae712f0c289309e1580d52 (patch)
tree: 5a4bbd53ad0fa579f9672370d469f6da000647ff /Lib/sre_parse.py
parent: 2411292ba8155327125d8a1da8a4c9fa003d5909 (diff)
download: cpython-a445feb72902e4a3c5ae712f0c289309e1580d52.zip
cpython-a445feb72902e4a3c5ae712f0c289309e1580d52.tar.gz
cpython-a445feb72902e4a3c5ae712f0c289309e1580d52.tar.bz2
1 file changed, 30 insertions, 7 deletions
diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py
index a53735b..db01e84 100644
--- a/Lib/sre_parse.py
+++ b/Lib/sre_parse.py
@@ -13,6 +13,7 @@
 # XXX: show string offset and offending character for all errors
 
 from sre_constants import *
+import unicodedata
 
 SPECIAL_CHARS = ".\\[{()*+?^$|"
 REPEAT_CHARS = "*+?{"
@@ -264,19 +265,19 @@ class Tokenizer:
             result += c
             self.__next()
         return result
-    def getuntil(self, terminator):
+    def getuntil(self, terminator, name):
         result = ''
         while True:
             c = self.next
             self.__next()
             if c is None:
                 if not result:
-                    raise self.error("missing group name")
+                    raise self.error("missing " + name)
                 raise self.error("missing %s, unterminated name" % terminator,
                                  len(result))
             if c == terminator:
                 if not result:
-                    raise self.error("missing group name", 1)
+                    raise self.error("missing " + name, 1)
                 break
             result += c
         return result
@@ -322,6 +323,17 @@ def _class_escape(source, escape):
             c = int(escape[2:], 16)
             chr(c) # raise ValueError for invalid code
             return LITERAL, c
+        elif c == "N" and source.istext:
+            # named unicode escape e.g. \N{EM DASH}
+            if not source.match('{'):
+                raise source.error("missing {")
+            charname = source.getuntil('}', 'character name')
+            try:
+                c = ord(unicodedata.lookup(charname))
+            except KeyError:
+                raise source.error("undefined character name %r" % charname,
+                                   len(charname) + len(r'\N{}'))
+            return LITERAL, c
         elif c in OCTDIGITS:
             # octal escape (up to three digits)
             escape += source.getwhile(2, OCTDIGITS)
@@ -370,6 +382,17 @@ def _escape(source, escape, state):
             c = int(escape[2:], 16)
             chr(c) # raise ValueError for invalid code
             return LITERAL, c
+        elif c == "N" and source.istext:
+            # named unicode escape e.g. \N{EM DASH}
+            if not source.match('{'):
+                raise source.error("missing {")
+            charname = source.getuntil('}', 'character name')
+            try:
+                c = ord(unicodedata.lookup(charname))
+            except KeyError:
+                raise source.error("undefined character name %r" % charname,
+                                   len(charname) + len(r'\N{}'))
+            return LITERAL, c
         elif c == "0":
             # octal escape
             escape += source.getwhile(2, OCTDIGITS)
@@ -679,13 +702,13 @@ def _parse(source, state, verbose, nested, first=False):
                     # python extensions
                     if sourcematch("<"):
                         # named group: skip forward to end of name
-                        name = source.getuntil(">")
+                        name = source.getuntil(">", "group name")
                         if not name.isidentifier():
                             msg = "bad character in group name %r" % name
                             raise source.error(msg, len(name) + 1)
                     elif sourcematch("="):
                         # named backreference
-                        name = source.getuntil(")")
+                        name = source.getuntil(")", "group name")
                         if not name.isidentifier():
                             msg = "bad character in group name %r" % name
                             raise source.error(msg, len(name) + 1)
@@ -748,7 +771,7 @@ def _parse(source, state, verbose, nested, first=False):
 
                 elif char == "(":
                     # conditional backreference group
-                    condname = source.getuntil(")")
+                    condname = source.getuntil(")", "group name")
                     if condname.isidentifier():
                         condgroup = state.groupdict.get(condname)
                         if condgroup is None:
@@ -977,7 +1000,7 @@ def parse_template(source, pattern):
                 name = ""
                 if not s.match("<"):
                     raise s.error("missing <")
-                name = s.getuntil(">")
+                name = s.getuntil(">", "group name")
                 if name.isidentifier():
                     try:
                         index = groupindex[name]
author	Serhiy Storchaka <storchaka@gmail.com>	2018-02-09 22:08:17 (GMT)
committer	GitHub <noreply@github.com>	2018-02-09 22:08:17 (GMT)
commit	a445feb72902e4a3c5ae712f0c289309e1580d52 (patch)
tree	5a4bbd53ad0fa579f9672370d469f6da000647ff /Lib/sre_parse.py
parent	2411292ba8155327125d8a1da8a4c9fa003d5909 (diff)
download	cpython-a445feb72902e4a3c5ae712f0c289309e1580d52.zip cpython-a445feb72902e4a3c5ae712f0c289309e1580d52.tar.gz cpython-a445feb72902e4a3c5ae712f0c289309e1580d52.tar.bz2