bpo-29456: Fix bugs in unicodedata.normalize: u1176, u11a7 and u11c3 (GH-1958) (GH-7704)

Hangul composition check boundaries are wrong for the second character (should be [0x1161, 0x1176) instead of [0x1161, 0x1176]) and third character (should be (0x11A7, 0x11C3) instead of [0x11A7, 0x11C3]). (cherry picked from commit d134809cd3764c6a634eab7bb8995e3e2eff14d5) Co-authored-by: Wonsup Yoon <pusnow@me.com>
author: Xiang Zhang <angwerzx@126.com> 2018-06-15 13:26:55 (GMT)
committer: GitHub <noreply@github.com> 2018-06-15 13:26:55 (GMT)
commit: 1889c4cbd62e200fa4cde3d6219e0aadf9bd8149 (patch)
tree: 884cddef4342e2ee2ed415b122e65eaea83f5789
parent: fc8ea20c6f8571de96791bc5f7f2d693406024c7 (diff)
download: cpython-1889c4cbd62e200fa4cde3d6219e0aadf9bd8149.zip
cpython-1889c4cbd62e200fa4cde3d6219e0aadf9bd8149.tar.gz
cpython-1889c4cbd62e200fa4cde3d6219e0aadf9bd8149.tar.bz2
4 files changed, 21 insertions, 2 deletions
diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py
index c30ecf4..11f2cda 100644
--- a/Lib/test/test_unicodedata.py
+++ b/Lib/test/test_unicodedata.py
@@ -204,6 +204,19 @@ class UnicodeFunctionsTest(UnicodeDatabaseTest):
         b = u'C\u0338' * 20  + u'\xC7'
         self.assertEqual(self.db.normalize('NFC', a), b)
 
+    def test_issue29456(self):
+        # Fix #29456
+        u1176_str_a = u'\u1100\u1176\u11a8'
+        u1176_str_b = u'\u1100\u1176\u11a8'
+        u11a7_str_a = u'\u1100\u1175\u11a7'
+        u11a7_str_b = u'\uae30\u11a7'
+        u11c3_str_a = u'\u1100\u1175\u11c3'
+        u11c3_str_b = u'\uae30\u11c3'
+        self.assertEqual(self.db.normalize('NFC', u1176_str_a), u1176_str_b)
+        self.assertEqual(self.db.normalize('NFC', u11a7_str_a), u11a7_str_b)
+        self.assertEqual(self.db.normalize('NFC', u11c3_str_a), u11c3_str_b)
+
+
     def test_east_asian_width(self):
         eaw = self.db.east_asian_width
         self.assertRaises(TypeError, eaw, 'a')
diff --git a/Misc/ACKS b/Misc/ACKS
index 295b933..7ec29fa 100644
--- a/Misc/ACKS
+++ b/Misc/ACKS
@@ -1578,6 +1578,7 @@ Jason Yeo
 EungJun Yi
 Bob Yodlowski
 Danny Yoo
+Wonsup Yoon
 Rory Yorke
 George Yoshida
 Kazuhiro Yoshida
diff --git a/Misc/NEWS.d/next/Library/2017-08-24-17-55-39.bpo-29456.XaB3MP.rst b/Misc/NEWS.d/next/Library/2017-08-24-17-55-39.bpo-29456.XaB3MP.rst
new file mode 100644
index 0000000..9b30bf6
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2017-08-24-17-55-39.bpo-29456.XaB3MP.rst
@@ -0,0 +1 @@
+Fix bugs in hangul normalization: u1176, u11a7 and u11c3
diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c
index 6b01fc76..df6ffe3 100644
--- a/Modules/unicodedata.c
+++ b/Modules/unicodedata.c
@@ -664,14 +664,18 @@ nfc_nfkc(PyObject *self, PyObject *input, int k)
          pairs, since we always have decomposed data. */
       if (LBase <= *i && *i < (LBase+LCount) &&
           i + 1 < end &&
-          VBase <= i[1] && i[1] <= (VBase+VCount)) {
+          VBase <= i[1] && i[1] < (VBase+VCount)) {
+          /* check L character is a modern leading consonant (0x1100 ~ 0x1112)
+             and V character is a modern vowel (0x1161 ~ 0x1175). */
           int LIndex, VIndex;
           LIndex = i[0] - LBase;
           VIndex = i[1] - VBase;
           code = SBase + (LIndex*VCount+VIndex)*TCount;
           i+=2;
           if (i < end &&
-              TBase <= *i && *i <= (TBase+TCount)) {
+              TBase < *i && *i < (TBase+TCount)) {
+              /* check T character is a modern trailing consonant
+                 (0x11A8 ~ 0x11C2). */
               code += *i-TBase;
               i++;
           }
author	Xiang Zhang <angwerzx@126.com>	2018-06-15 13:26:55 (GMT)
committer	GitHub <noreply@github.com>	2018-06-15 13:26:55 (GMT)
commit	1889c4cbd62e200fa4cde3d6219e0aadf9bd8149 (patch)
tree	884cddef4342e2ee2ed415b122e65eaea83f5789
parent	fc8ea20c6f8571de96791bc5f7f2d693406024c7 (diff)
download	cpython-1889c4cbd62e200fa4cde3d6219e0aadf9bd8149.zip cpython-1889c4cbd62e200fa4cde3d6219e0aadf9bd8149.tar.gz cpython-1889c4cbd62e200fa4cde3d6219e0aadf9bd8149.tar.bz2