From 1b08b30743ec823cc01a4efbe44b721986e5cb51 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Walter=20D=C3=B6rwald?= Date: Sat, 25 Apr 2009 14:13:56 +0000 Subject: Merged revisions 71894 via svnmerge from svn+ssh://pythondev@svn.python.org/python/trunk ........ r71894 | walter.doerwald | 2009-04-25 16:03:16 +0200 (Sa, 25 Apr 2009) | 4 lines Issue #5828 (Invalid behavior of unicode.lower): Fixed bogus logic in makeunicodedata.py and regenerated the Unicode database (This fixes u'\u1d79'.lower() == '\x00'). ........ --- Lib/test/test_unicodedata.py | 15 +++++++++++++- Misc/NEWS | 4 ++++ Objects/unicodetype_db.h | 4 ++-- Tools/unicode/makeunicodedata.py | 43 ++++++++++++++++++++-------------------- 4 files changed, 41 insertions(+), 25 deletions(-) diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py index 2dfa807..4f691b5 100644 --- a/Lib/test/test_unicodedata.py +++ b/Lib/test/test_unicodedata.py @@ -20,7 +20,7 @@ encoding = 'utf-8' class UnicodeMethodsTest(unittest.TestCase): # update this, if the database changes - expectedchecksum = 'aef99984a58c8e1e5363a3175f2ff9608599a93e' + expectedchecksum = 'b7db9b5f1d804976fa921d2009cbef6f025620c1' def test_method_checksum(self): h = hashlib.sha1() @@ -258,6 +258,19 @@ class UnicodeMiscTest(UnicodeDatabaseTest): # the upper-case mapping: as delta, or as absolute value self.assert_("a".upper()=='A') self.assert_("\u1d79".upper()=='\ua77d') + self.assert_(".".upper()=='.') + + def test_bug_5828(self): + self.assertEqual("\u1d79".lower(), "\u1d79") + # Only U+0000 should have U+0000 as its upper/lower/titlecase variant + self.assertEqual( + [ + c for c in range(sys.maxunicode+1) + if "\x00" in chr(c).lower()+chr(c).upper()+chr(c).title() + ], + [0] + ) + def test_main(): test.support.run_unittest( diff --git a/Misc/NEWS b/Misc/NEWS index 9f9ad37..9750f4b 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -104,6 +104,10 @@ Library - Issue #2703: SimpleXMLRPCDispatcher.__init__: Provide default values for new arguments introduced in 2.5. +- Issue #5828 (Invalid behavior of unicode.lower): Fixed bogus logic in + makeunicodedata.py and regenerated the Unicode database (This fixes + u'\u1d79'.lower() == '\x00'). + Extension Modules ----------------- diff --git a/Objects/unicodetype_db.h b/Objects/unicodetype_db.h index 640697c..f46f1fb 100644 --- a/Objects/unicodetype_db.h +++ b/Objects/unicodetype_db.h @@ -127,7 +127,7 @@ const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = { {0, 0, 0, 0, 8, 1540}, {0, 0, 0, 0, 9, 1540}, {0, 0, 0, 0, 0, 1792}, - {42877, 0, 42877, 0, 0, 3849}, + {42877, 7545, 42877, 0, 0, 3849}, {3814, 0, 3814, 0, 0, 1801}, {65477, 0, 65477, 0, 0, 1801}, {0, 57921, 0, 0, 0, 1921}, @@ -174,7 +174,7 @@ const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = { {0, 54787, 0, 0, 0, 1921}, {0, 54753, 0, 0, 0, 1921}, {58272, 0, 58272, 0, 0, 1801}, - {0, 7545, 0, 0, 0, 3969}, + {42877, 7545, 42877, 0, 0, 3969}, {0, 40, 0, 0, 0, 1921}, {65496, 0, 65496, 0, 0, 1801}, }; diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py index 10fd991..930a0df 100644 --- a/Tools/unicode/makeunicodedata.py +++ b/Tools/unicode/makeunicodedata.py @@ -383,33 +383,32 @@ def makeunicodetype(unicode, trace): flags |= XID_CONTINUE_MASK # use delta predictor for upper/lower/title if it fits if record[12]: - upper = int(record[12], 16) - char - if -32768 <= upper <= 32767 and delta: - upper = upper & 0xffff - else: - upper += char - delta = False + upper = int(record[12], 16) else: - upper = 0 + upper = char if record[13]: - lower = int(record[13], 16) - char - if -32768 <= lower <= 32767 and delta: - lower = lower & 0xffff - else: - lower += char - delta = False + lower = int(record[13], 16) else: - lower = 0 + lower = char if record[14]: - title = int(record[14], 16) - char - if -32768 <= lower <= 32767 and delta: - title = title & 0xffff - else: - title += char - delta = False + title = int(record[14], 16) + else: + # UCD.html says that a missing title char means that + # it defaults to the uppercase character, not to the + # character itself. Apparently, in the current UCD (5.x) + # this feature is never used + title = upper + upper_d = upper - char + lower_d = lower - char + title_d = title - char + if -32768 <= upper_d <= 32767 and \ + -32768 <= lower_d <= 32767 and \ + -32768 <= title_d <= 32767: + # use deltas + upper = upper_d & 0xffff + lower = lower_d & 0xffff + title = title_d & 0xffff else: - title = 0 - if not delta: flags |= NODELTA_MASK # decimal digit, integer digit decimal = 0 -- cgit v0.12