From fe67bd91685f89fbf95ee9727ce03d20dea3e9b8 Mon Sep 17 00:00:00 2001
From: Mark Dickinson
Date: Tue, 28 Jul 2009 20:35:03 +0000
Subject: Issue #6561: '\d' regular expression should not match characters of category [No]; only those of category [Nd]. (Backport of r74237 from py3k.)

---
 Doc/library/re.rst  |  3 ++-
 Lib/test/test_re.py | 21 +++++++++++++++++++++
 Misc/NEWS           |  4 ++++
 Modules/_sre.c      |  2 +-
 4 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/Doc/library/re.rst b/Doc/library/re.rst
index 2d5e195..df63f9b 100644
--- a/Doc/library/re.rst
+++ b/Doc/library/re.rst
@@ -332,7 +332,8 @@ the second character. For example, ``\$`` matches the character ``'$'``.
 ``\d``
    When the :const:`UNICODE` flag is not specified, matches any decimal digit; this
    is equivalent to the set ``[0-9]``. With :const:`UNICODE`, it will match
-   whatever is classified as a digit in the Unicode character properties database.
+   whatever is classified as a decimal digit in the Unicode character properties
+   database.
 
 ``\D``
    When the :const:`UNICODE` flag is not specified, matches any non-digit
diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py
index 4f543d9..c4cc820 100644
--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@@ -636,6 +636,27 @@ class ReTests(unittest.TestCase):
         self.assertEqual(iter.next().span(), (4, 4))
         self.assertRaises(StopIteration, iter.next)
 
+    def test_bug_6561(self):
+        # '\d' should match characters in Unicode category 'Nd'
+        # (Number, Decimal Digit), but not those in 'Nl' (Number,
+        # Letter) or 'No' (Number, Other).
+        decimal_digits = [
+            u'\u0037',  # '\N{DIGIT SEVEN}', category 'Nd'
+            u'\u0e58',  # '\N{THAI DIGIT SIX}', category 'Nd'
+            u'\uff10',  # '\N{FULLWIDTH DIGIT ZERO}', category 'Nd'
+        ]
+        for x in decimal_digits:
+            self.assertEqual(re.match(r'^\d$', x, re.UNICODE).group(0), x)
+
+        not_decimal_digits = [
+            u'\u2165',  # '\N{ROMAN NUMERAL SIX}', category 'Nl'
+            u'\u3039',  # '\N{HANGZHOU NUMERAL TWENTY}', category 'Nl'
+            u'\u2082',  # '\N{SUBSCRIPT TWO}', category 'No'
+            u'\u32b4',  # '\N{CIRCLED NUMBER THIRTY NINE}', category 'No'
+        ]
+        for x in not_decimal_digits:
+            self.assertIsNone(re.match(r'^\d$', x, re.UNICODE))
+
     def test_empty_array(self):
         # SF buf 1647541
         import array
diff --git a/Misc/NEWS b/Misc/NEWS
index 25bbabb..6731fb9 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -1205,6 +1205,10 @@ C-API
 Extension Modules
 -----------------
 
+- Issue #6561: '\d' in a regex now matches only characters with
+  Unicode category 'Nd' (Number, Decimal Digit). Previously it also
+  matched characters with category 'No'.
+
 - Issue #1523: Remove deprecated overflow wrapping for struct.pack
   with an integer format code ('bBhHiIlLqQ'). Packing an out-of-range
   integer now consistently raises struct.error.
diff --git a/Modules/_sre.c b/Modules/_sre.c
index 1aea53b..0d9ee24 100644
--- a/Modules/_sre.c
+++ b/Modules/_sre.c
@@ -172,7 +172,7 @@ static unsigned int sre_lower_locale(unsigned int ch)
 
 #if defined(HAVE_UNICODE)
 
-#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDIGIT((Py_UNICODE)(ch))
+#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDECIMAL((Py_UNICODE)(ch))
 #define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
 #define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
 #define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
-- 
cgit v0.12