diff options
author | Walter Dörwald <walter@livinglogic.de> | 2009-04-25 14:03:16 (GMT) |
---|---|---|
committer | Walter Dörwald <walter@livinglogic.de> | 2009-04-25 14:03:16 (GMT) |
commit | 5d98ec76bb548ef65b6943c6eefd48eaabdbed90 (patch) | |
tree | 297888aa774109cfd45e15d0d12110491ef657c2 | |
parent | 140d9d673efdb33178fb4f93ece31bbc1d91ada0 (diff) | |
download | cpython-5d98ec76bb548ef65b6943c6eefd48eaabdbed90.zip cpython-5d98ec76bb548ef65b6943c6eefd48eaabdbed90.tar.gz cpython-5d98ec76bb548ef65b6943c6eefd48eaabdbed90.tar.bz2 |
Issue #5828 (Invalid behavior of unicode.lower): Fixed bogus logic in
makeunicodedata.py and regenerated the Unicode database. Previously
u'\u1d79'.lower() wrongly returned u'\x00'; it now returns u'\u1d79' unchanged.
-rw-r--r-- | Lib/test/test_unicodedata.py | 15 |
-rw-r--r-- | Misc/NEWS | 4 |
-rw-r--r-- | Objects/unicodetype_db.h | 4 |
-rw-r--r-- | Tools/unicode/makeunicodedata.py | 43 |
4 files changed, 41 insertions, 25 deletions
diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py index 84999e5..18f37f8 100644 --- a/Lib/test/test_unicodedata.py +++ b/Lib/test/test_unicodedata.py @@ -20,7 +20,7 @@ encoding = 'utf-8' class UnicodeMethodsTest(unittest.TestCase): # update this, if the database changes - expectedchecksum = 'aef99984a58c8e1e5363a3175f2ff9608599a93e' + expectedchecksum = 'b7db9b5f1d804976fa921d2009cbef6f025620c1' def test_method_checksum(self): h = hashlib.sha1() @@ -257,6 +257,19 @@ class UnicodeMiscTest(UnicodeDatabaseTest): # the upper-case mapping: as delta, or as absolute value self.assert_(u"a".upper()==u'A') self.assert_(u"\u1d79".upper()==u'\ua77d') + self.assert_(u".".upper()==u".") + + def test_bug_5828(self): + self.assertEqual(u"\u1d79".lower(), u"\u1d79") + # Only U+0000 should have U+0000 as its upper/lower/titlecase variant + self.assertEqual( + [ + c for c in range(sys.maxunicode+1) + if u"\x00" in unichr(c).lower()+unichr(c).upper()+unichr(c).title() + ], + [0] + ) + def test_main(): test.test_support.run_unittest( @@ -773,6 +773,10 @@ Library - Issue #2703: SimpleXMLRPCDispatcher.__init__: Provide default values for new arguments introduced in 2.5. +- Issue #5828 (Invalid behavior of unicode.lower): Fixed bogus logic in + makeunicodedata.py and regenerated the Unicode database (This fixes + u'\u1d79'.lower() == '\x00'). 
+ Tools/Demos ----------- diff --git a/Objects/unicodetype_db.h b/Objects/unicodetype_db.h index 4168f83..2e9c01b 100644 --- a/Objects/unicodetype_db.h +++ b/Objects/unicodetype_db.h @@ -118,7 +118,7 @@ const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = { {0, 0, 0, 0, 7, 4}, {0, 0, 0, 0, 8, 4}, {0, 0, 0, 0, 9, 4}, - {42877, 0, 42877, 0, 0, 265}, + {42877, 7545, 42877, 0, 0, 265}, {3814, 0, 3814, 0, 0, 9}, {65477, 0, 65477, 0, 0, 9}, {0, 57921, 0, 0, 0, 129}, @@ -159,7 +159,7 @@ const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = { {0, 54787, 0, 0, 0, 129}, {0, 54753, 0, 0, 0, 129}, {58272, 0, 58272, 0, 0, 9}, - {0, 7545, 0, 0, 0, 385}, + {42877, 7545, 42877, 0, 0, 385}, {0, 40, 0, 0, 0, 129}, {65496, 0, 65496, 0, 0, 9}, }; diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py index 8ede83c..3cd5a1f 100644 --- a/Tools/unicode/makeunicodedata.py +++ b/Tools/unicode/makeunicodedata.py @@ -371,33 +371,32 @@ def makeunicodetype(unicode, trace): flags |= UPPER_MASK # use delta predictor for upper/lower/title if it fits if record[12]: - upper = int(record[12], 16) - char - if -32768 <= upper <= 32767 and delta: - upper = upper & 0xffff - else: - upper += char - delta = False + upper = int(record[12], 16) else: - upper = 0 + upper = char if record[13]: - lower = int(record[13], 16) - char - if -32768 <= lower <= 32767 and delta: - lower = lower & 0xffff - else: - lower += char - delta = False + lower = int(record[13], 16) else: - lower = 0 + lower = char if record[14]: - title = int(record[14], 16) - char - if -32768 <= lower <= 32767 and delta: - title = title & 0xffff - else: - title += char - delta = False + title = int(record[14], 16) + else: + # UCD.html says that a missing title char means that + # it defaults to the uppercase character, not to the + # character itself. 
Apparently, in the current UCD (5.x) + # this feature is never used + title = upper + upper_d = upper - char + lower_d = lower - char + title_d = title - char + if -32768 <= upper_d <= 32767 and \ + -32768 <= lower_d <= 32767 and \ + -32768 <= title_d <= 32767: + # use deltas + upper = upper_d & 0xffff + lower = lower_d & 0xffff + title = title_d & 0xffff else: - title = 0 - if not delta: flags |= NODELTA_MASK # decimal digit, integer digit decimal = 0 |