diff options
author | Serhiy Storchaka <storchaka@gmail.com> | 2018-02-09 22:08:17 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2018-02-09 22:08:17 (GMT) |
commit | a445feb72902e4a3c5ae712f0c289309e1580d52 (patch) | |
tree | 5a4bbd53ad0fa579f9672370d469f6da000647ff /Lib/sre_parse.py | |
parent | 2411292ba8155327125d8a1da8a4c9fa003d5909 (diff) | |
download | cpython-a445feb72902e4a3c5ae712f0c289309e1580d52.zip cpython-a445feb72902e4a3c5ae712f0c289309e1580d52.tar.gz cpython-a445feb72902e4a3c5ae712f0c289309e1580d52.tar.bz2 |
bpo-30688: Support \N{name} escapes in re patterns. (GH-5588)
Co-authored-by: Jonathan Eunice <jonathan.eunice@gmail.com>
Diffstat (limited to 'Lib/sre_parse.py')
-rw-r--r-- | Lib/sre_parse.py | 37 |
1 files changed, 30 insertions, 7 deletions
```diff
diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py
index a53735b..db01e84 100644
--- a/Lib/sre_parse.py
+++ b/Lib/sre_parse.py
@@ -13,6 +13,7 @@
 # XXX: show string offset and offending character for all errors
 
 from sre_constants import *
+import unicodedata
 
 SPECIAL_CHARS = ".\\[{()*+?^$|"
 REPEAT_CHARS = "*+?{"
@@ -264,19 +265,19 @@ class Tokenizer:
             result += c
             self.__next()
         return result
-    def getuntil(self, terminator):
+    def getuntil(self, terminator, name):
         result = ''
         while True:
             c = self.next
             self.__next()
             if c is None:
                 if not result:
-                    raise self.error("missing group name")
+                    raise self.error("missing " + name)
                 raise self.error("missing %s, unterminated name" % terminator,
                                  len(result))
             if c == terminator:
                 if not result:
-                    raise self.error("missing group name", 1)
+                    raise self.error("missing " + name, 1)
                 break
             result += c
         return result
@@ -322,6 +323,17 @@ def _class_escape(source, escape):
             c = int(escape[2:], 16)
             chr(c) # raise ValueError for invalid code
             return LITERAL, c
+        elif c == "N" and source.istext:
+            # named unicode escape e.g. \N{EM DASH}
+            if not source.match('{'):
+                raise source.error("missing {")
+            charname = source.getuntil('}', 'character name')
+            try:
+                c = ord(unicodedata.lookup(charname))
+            except KeyError:
+                raise source.error("undefined character name %r" % charname,
+                                   len(charname) + len(r'\N{}'))
+            return LITERAL, c
         elif c in OCTDIGITS:
             # octal escape (up to three digits)
             escape += source.getwhile(2, OCTDIGITS)
@@ -370,6 +382,17 @@ def _escape(source, escape, state):
             c = int(escape[2:], 16)
             chr(c) # raise ValueError for invalid code
             return LITERAL, c
+        elif c == "N" and source.istext:
+            # named unicode escape e.g. \N{EM DASH}
+            if not source.match('{'):
+                raise source.error("missing {")
+            charname = source.getuntil('}', 'character name')
+            try:
+                c = ord(unicodedata.lookup(charname))
+            except KeyError:
+                raise source.error("undefined character name %r" % charname,
+                                   len(charname) + len(r'\N{}'))
+            return LITERAL, c
         elif c == "0":
             # octal escape
             escape += source.getwhile(2, OCTDIGITS)
@@ -679,13 +702,13 @@ def _parse(source, state, verbose, nested, first=False):
                     # python extensions
                     if sourcematch("<"):
                         # named group: skip forward to end of name
-                        name = source.getuntil(">")
+                        name = source.getuntil(">", "group name")
                         if not name.isidentifier():
                             msg = "bad character in group name %r" % name
                             raise source.error(msg, len(name) + 1)
                     elif sourcematch("="):
                         # named backreference
-                        name = source.getuntil(")")
+                        name = source.getuntil(")", "group name")
                         if not name.isidentifier():
                             msg = "bad character in group name %r" % name
                             raise source.error(msg, len(name) + 1)
@@ -748,7 +771,7 @@ def _parse(source, state, verbose, nested, first=False):
 
                 elif char == "(":
                     # conditional backreference group
-                    condname = source.getuntil(")")
+                    condname = source.getuntil(")", "group name")
                     if condname.isidentifier():
                         condgroup = state.groupdict.get(condname)
                         if condgroup is None:
@@ -977,7 +1000,7 @@ def parse_template(source, pattern):
                 name = ""
                 if not s.match("<"):
                     raise s.error("missing <")
-                name = s.getuntil(">")
+                name = s.getuntil(">", "group name")
                 if name.isidentifier():
                     try:
                         index = groupindex[name]
```