From 36fe3c0a849a7f1c1fb72c198bdd31bcf15c06c2 Mon Sep 17 00:00:00 2001 From: Hye-Shik Chang Date: Sat, 4 Aug 2007 04:15:04 +0000 Subject: Backport from trunk r56727: Fix gb18030 codec's bug that doesn't map two-byte characters on GB18030 extension in encoding. (bug reported by Bjorn Stabell) --- Lib/test/test_codecmaps_cn.py | 8 ++++++++ Lib/test/test_multibytecodec_support.py | 16 +++++++++++++++- Misc/NEWS | 3 +++ Modules/cjkcodecs/_codecs_cn.c | 1 + 4 files changed, 27 insertions(+), 1 deletion(-) diff --git a/Lib/test/test_codecmaps_cn.py b/Lib/test/test_codecmaps_cn.py index 8cbee76..0c34fdc 100644 --- a/Lib/test/test_codecmaps_cn.py +++ b/Lib/test/test_codecmaps_cn.py @@ -19,10 +19,18 @@ class TestGBKMap(test_multibytecodec_support.TestBase_Mapping, mapfileurl = 'http://www.unicode.org/Public/MAPPINGS/VENDORS/' \ 'MICSFT/WINDOWS/CP936.TXT' +class TestGB18030Map(test_multibytecodec_support.TestBase_Mapping, + unittest.TestCase): + encoding = 'gb18030' + mapfileurl = 'http://source.icu-project.org/repos/icu/data/' \ + 'trunk/charset/data/xml/gb-18030-2000.xml' + + def test_main(): suite = unittest.TestSuite() suite.addTest(unittest.makeSuite(TestGB2312Map)) suite.addTest(unittest.makeSuite(TestGBKMap)) + suite.addTest(unittest.makeSuite(TestGB18030Map)) test_support.run_suite(suite) if __name__ == "__main__": diff --git a/Lib/test/test_multibytecodec_support.py b/Lib/test/test_multibytecodec_support.py index bec32de..197f777 100644 --- a/Lib/test/test_multibytecodec_support.py +++ b/Lib/test/test_multibytecodec_support.py @@ -5,7 +5,7 @@ # import sys, codecs, os.path -import unittest +import unittest, re from test import test_support from StringIO import StringIO @@ -272,6 +272,12 @@ class TestBase_Mapping(unittest.TestCase): return test_support.open_urlresource(self.mapfileurl) def test_mapping_file(self): + if self.mapfileurl.endswith('.xml'): + self._test_mapping_file_ucm() + else: + self._test_mapping_file_plain() + + def _test_mapping_file_plain(self): 
 unichrs = lambda s: u''.join(map(unichr, map(eval, s.split('+')))) urt_wa = {} @@ -303,6 +309,14 @@ class TestBase_Mapping(unittest.TestCase): self._testpoint(csetch, unich) + def _test_mapping_file_ucm(self): + ucmdata = self.open_mapping_file().read() + uc = re.findall('<a u="([A-F0-9]{4})" b="([0-9A-F ]+)"/>', ucmdata) + for uni, coded in uc: + unich = unichr(int(uni, 16)) + codech = ''.join(chr(int(c, 16)) for c in coded.split()) + self._testpoint(codech, unich) + def test_mapping_supplemental(self): for mapping in self.supmaps: self._testpoint(*mapping) diff --git a/Misc/NEWS b/Misc/NEWS index 0b01eaf..3d5221c 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -26,6 +26,9 @@ Core and builtins Library ------- +- GB18030 codec now can encode additional two-byte characters that + are missing in GBK. + - Bug #1704793: Raise KeyError if unicodedata.lookup cannot represent the result in a single character. diff --git a/Modules/cjkcodecs/_codecs_cn.c b/Modules/cjkcodecs/_codecs_cn.c index c811a67..4542ce6 100644 --- a/Modules/cjkcodecs/_codecs_cn.c +++ b/Modules/cjkcodecs/_codecs_cn.c @@ -197,6 +197,7 @@ ENCODER(gb18030) REQUIRE_OUTBUF(2) GBK_ENCODE(c, code) + else TRYMAP_ENC(gb18030ext, code, c); else { const struct _gb18030_to_unibmp_ranges *utrrange; -- cgit v0.12