diff options
author | Wonsup Yoon <pusnow@me.com> | 2018-06-15 12:03:14 (GMT) |
---|---|---|
committer | Xiang Zhang <angwerzx@126.com> | 2018-06-15 12:03:14 (GMT) |
commit | d134809cd3764c6a634eab7bb8995e3e2eff14d5 (patch) | |
tree | 6bcc3ec615c093c71b96ce1ce52594bacdc75466 | |
parent | ceeef10cdbc08561f9954e13bbed1cb2299a8c72 (diff) | |
download | cpython-d134809cd3764c6a634eab7bb8995e3e2eff14d5.zip cpython-d134809cd3764c6a634eab7bb8995e3e2eff14d5.tar.gz cpython-d134809cd3764c6a634eab7bb8995e3e2eff14d5.tar.bz2 |
bpo-29456: Fix bugs in unicodedata.normalize: u1176, u11a7 and u11c3 (GH-1958)
The Hangul composition range checks were off by one at the boundaries: the
second (vowel) character must lie in [0x1161, 0x1176), not [0x1161, 0x1176], and
the third (trailing consonant) character must lie in (0x11A7, 0x11C3), not [0x11A7, 0x11C3].
-rw-r--r-- | Lib/test/test_unicodedata.py | 13 | ||||
-rw-r--r-- | Misc/ACKS | 1 | ||||
-rw-r--r-- | Misc/NEWS.d/next/Library/2017-08-24-17-55-39.bpo-29456.XaB3MP.rst | 1 | ||||
-rw-r--r-- | Modules/unicodedata.c | 10 |
4 files changed, 22 insertions, 3 deletions
diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py index 99dd0de..170778f 100644 --- a/Lib/test/test_unicodedata.py +++ b/Lib/test/test_unicodedata.py @@ -208,6 +208,19 @@ class UnicodeFunctionsTest(UnicodeDatabaseTest): b = 'C\u0338' * 20 + '\xC7' self.assertEqual(self.db.normalize('NFC', a), b) + def test_issue29456(self): + # Fix #29456 + u1176_str_a = '\u1100\u1176\u11a8' + u1176_str_b = '\u1100\u1176\u11a8' + u11a7_str_a = '\u1100\u1175\u11a7' + u11a7_str_b = '\uae30\u11a7' + u11c3_str_a = '\u1100\u1175\u11c3' + u11c3_str_b = '\uae30\u11c3' + self.assertEqual(self.db.normalize('NFC', u1176_str_a), u1176_str_b) + self.assertEqual(self.db.normalize('NFC', u11a7_str_a), u11a7_str_b) + self.assertEqual(self.db.normalize('NFC', u11c3_str_a), u11c3_str_b) + + def test_east_asian_width(self): eaw = self.db.east_asian_width self.assertRaises(TypeError, eaw, b'a') @@ -1800,6 +1800,7 @@ Jason Yeo EungJun Yi Bob Yodlowski Danny Yoo +Wonsup Yoon Rory Yorke George Yoshida Kazuhiro Yoshida diff --git a/Misc/NEWS.d/next/Library/2017-08-24-17-55-39.bpo-29456.XaB3MP.rst b/Misc/NEWS.d/next/Library/2017-08-24-17-55-39.bpo-29456.XaB3MP.rst new file mode 100644 index 0000000..9b30bf6 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2017-08-24-17-55-39.bpo-29456.XaB3MP.rst @@ -0,0 +1 @@ +Fix bugs in hangul normalization: u1176, u11a7 and u11c3 diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c index 7a9a964..e8788f5 100644 --- a/Modules/unicodedata.c +++ b/Modules/unicodedata.c @@ -681,15 +681,19 @@ nfc_nfkc(PyObject *self, PyObject *input, int k) if (LBase <= code && code < (LBase+LCount) && i + 1 < len && VBase <= PyUnicode_READ(kind, data, i+1) && - PyUnicode_READ(kind, data, i+1) <= (VBase+VCount)) { + PyUnicode_READ(kind, data, i+1) < (VBase+VCount)) { + /* check L character is a modern leading consonant (0x1100 ~ 0x1112) + and V character is a modern vowel (0x1161 ~ 0x1175). 
*/ int LIndex, VIndex; LIndex = code - LBase; VIndex = PyUnicode_READ(kind, data, i+1) - VBase; code = SBase + (LIndex*VCount+VIndex)*TCount; i+=2; if (i < len && - TBase <= PyUnicode_READ(kind, data, i) && - PyUnicode_READ(kind, data, i) <= (TBase+TCount)) { + TBase < PyUnicode_READ(kind, data, i) && + PyUnicode_READ(kind, data, i) < (TBase+TCount)) { + /* check T character is a modern trailing consonant + (0x11A8 ~ 0x11C2). */ code += PyUnicode_READ(kind, data, i)-TBase; i++; } |