summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- Lib/test/test_codecmaps_cn.py 7
-rw-r--r-- Lib/test/test_multibytecodec_support.py 16
-rw-r--r-- Misc/NEWS 3
-rw-r--r-- Modules/cjkcodecs/_codecs_cn.c 1
4 files changed, 26 insertions, 1 deletions
diff --git a/Lib/test/test_codecmaps_cn.py b/Lib/test/test_codecmaps_cn.py
index 75541ac..344fc56 100644
--- a/Lib/test/test_codecmaps_cn.py
+++ b/Lib/test/test_codecmaps_cn.py
@@ -19,6 +19,13 @@ class TestGBKMap(test_multibytecodec_support.TestBase_Mapping,
mapfileurl = 'http://www.unicode.org/Public/MAPPINGS/VENDORS/' \
'MICSFT/WINDOWS/CP936.TXT'
+class TestGB18030Map(test_multibytecodec_support.TestBase_Mapping,
+ unittest.TestCase):
+ encoding = 'gb18030'
+ mapfileurl = 'http://source.icu-project.org/repos/icu/data/' \
+ 'trunk/charset/data/xml/gb-18030-2000.xml'
+
+
def test_main():
test_support.run_unittest(__name__)
diff --git a/Lib/test/test_multibytecodec_support.py b/Lib/test/test_multibytecodec_support.py
index bec32de..197f777 100644
--- a/Lib/test/test_multibytecodec_support.py
+++ b/Lib/test/test_multibytecodec_support.py
@@ -5,7 +5,7 @@
#
import sys, codecs, os.path
-import unittest
+import unittest, re
from test import test_support
from StringIO import StringIO
@@ -272,6 +272,12 @@ class TestBase_Mapping(unittest.TestCase):
return test_support.open_urlresource(self.mapfileurl)
def test_mapping_file(self):
+ if self.mapfileurl.endswith('.xml'):
+ self._test_mapping_file_ucm()
+ else:
+ self._test_mapping_file_plain()
+
+ def _test_mapping_file_plain(self):
unichrs = lambda s: u''.join(map(unichr, map(eval, s.split('+'))))
urt_wa = {}
@@ -303,6 +309,14 @@ class TestBase_Mapping(unittest.TestCase):
self._testpoint(csetch, unich)
+ def _test_mapping_file_ucm(self):
+ ucmdata = self.open_mapping_file().read()
+ uc = re.findall('<a u="([A-F0-9]{4})" b="([0-9A-F ]+)"/>', ucmdata)
+ for uni, coded in uc:
+ unich = unichr(int(uni, 16))
+ codech = ''.join(chr(int(c, 16)) for c in coded.split())
+ self._testpoint(codech, unich)
+
def test_mapping_supplemental(self):
for mapping in self.supmaps:
self._testpoint(*mapping)
diff --git a/Misc/NEWS b/Misc/NEWS
index 123e911..30bb985 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -240,6 +240,9 @@ Core and builtins
Library
-------
+- GB18030 codec now can encode additional two-byte characters that
+ are missing in GBK.
+
- Bug #1704793: Return UTF-16 pair if unicodedata.lookup cannot
represent the result in a single character.
diff --git a/Modules/cjkcodecs/_codecs_cn.c b/Modules/cjkcodecs/_codecs_cn.c
index c811a67..4542ce6 100644
--- a/Modules/cjkcodecs/_codecs_cn.c
+++ b/Modules/cjkcodecs/_codecs_cn.c
@@ -197,6 +197,7 @@ ENCODER(gb18030)
REQUIRE_OUTBUF(2)
GBK_ENCODE(c, code)
+ else TRYMAP_ENC(gb18030ext, code, c);
else {
const struct _gb18030_to_unibmp_ranges *utrrange;