summary | refs | log | tree | commit | diff | stats
path: root/Lib/sre_parse.py
diff options
context:
space:
mode:
author: Serhiy Storchaka <storchaka@gmail.com> 2018-02-09 22:08:17 (GMT)
committer: GitHub <noreply@github.com> 2018-02-09 22:08:17 (GMT)
commit a445feb72902e4a3c5ae712f0c289309e1580d52 (patch)
tree 5a4bbd53ad0fa579f9672370d469f6da000647ff /Lib/sre_parse.py
parent 2411292ba8155327125d8a1da8a4c9fa003d5909 (diff)
download cpython-a445feb72902e4a3c5ae712f0c289309e1580d52.zip
cpython-a445feb72902e4a3c5ae712f0c289309e1580d52.tar.gz
cpython-a445feb72902e4a3c5ae712f0c289309e1580d52.tar.bz2
bpo-30688: Support \N{name} escapes in re patterns. (GH-5588)
Co-authored-by: Jonathan Eunice <jonathan.eunice@gmail.com>
Diffstat (limited to 'Lib/sre_parse.py')
-rw-r--r-- Lib/sre_parse.py 37
1 files changed, 30 insertions, 7 deletions
diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py
index a53735b..db01e84 100644
--- a/Lib/sre_parse.py
+++ b/Lib/sre_parse.py
@@ -13,6 +13,7 @@
# XXX: show string offset and offending character for all errors
from sre_constants import *
+import unicodedata
SPECIAL_CHARS = ".\\[{()*+?^$|"
REPEAT_CHARS = "*+?{"
@@ -264,19 +265,19 @@ class Tokenizer:
result += c
self.__next()
return result
- def getuntil(self, terminator):
+ def getuntil(self, terminator, name):
result = ''
while True:
c = self.next
self.__next()
if c is None:
if not result:
- raise self.error("missing group name")
+ raise self.error("missing " + name)
raise self.error("missing %s, unterminated name" % terminator,
len(result))
if c == terminator:
if not result:
- raise self.error("missing group name", 1)
+ raise self.error("missing " + name, 1)
break
result += c
return result
@@ -322,6 +323,17 @@ def _class_escape(source, escape):
c = int(escape[2:], 16)
chr(c) # raise ValueError for invalid code
return LITERAL, c
+ elif c == "N" and source.istext:
+ # named unicode escape e.g. \N{EM DASH}
+ if not source.match('{'):
+ raise source.error("missing {")
+ charname = source.getuntil('}', 'character name')
+ try:
+ c = ord(unicodedata.lookup(charname))
+ except KeyError:
+ raise source.error("undefined character name %r" % charname,
+ len(charname) + len(r'\N{}'))
+ return LITERAL, c
elif c in OCTDIGITS:
# octal escape (up to three digits)
escape += source.getwhile(2, OCTDIGITS)
@@ -370,6 +382,17 @@ def _escape(source, escape, state):
c = int(escape[2:], 16)
chr(c) # raise ValueError for invalid code
return LITERAL, c
+ elif c == "N" and source.istext:
+ # named unicode escape e.g. \N{EM DASH}
+ if not source.match('{'):
+ raise source.error("missing {")
+ charname = source.getuntil('}', 'character name')
+ try:
+ c = ord(unicodedata.lookup(charname))
+ except KeyError:
+ raise source.error("undefined character name %r" % charname,
+ len(charname) + len(r'\N{}'))
+ return LITERAL, c
elif c == "0":
# octal escape
escape += source.getwhile(2, OCTDIGITS)
@@ -679,13 +702,13 @@ def _parse(source, state, verbose, nested, first=False):
# python extensions
if sourcematch("<"):
# named group: skip forward to end of name
- name = source.getuntil(">")
+ name = source.getuntil(">", "group name")
if not name.isidentifier():
msg = "bad character in group name %r" % name
raise source.error(msg, len(name) + 1)
elif sourcematch("="):
# named backreference
- name = source.getuntil(")")
+ name = source.getuntil(")", "group name")
if not name.isidentifier():
msg = "bad character in group name %r" % name
raise source.error(msg, len(name) + 1)
@@ -748,7 +771,7 @@ def _parse(source, state, verbose, nested, first=False):
elif char == "(":
# conditional backreference group
- condname = source.getuntil(")")
+ condname = source.getuntil(")", "group name")
if condname.isidentifier():
condgroup = state.groupdict.get(condname)
if condgroup is None:
@@ -977,7 +1000,7 @@ def parse_template(source, pattern):
name = ""
if not s.match("<"):
raise s.error("missing <")
- name = s.getuntil(">")
+ name = s.getuntil(">", "group name")
if name.isidentifier():
try:
index = groupindex[name]