author     Dong-hee Na <donghee.na92@gmail.com>  2020-04-29 17:34:24 (GMT)
committer  GitHub <noreply@github.com>           2020-04-29 17:34:24 (GMT)
commit     113feb3ec2b08948a381175d33b6ff308d24fceb (patch)
tree       bab128ad30da815e0de1eb7cc72a2213a6d92700 /Tools/unicode/genmap_schinese.py
parent     2d8757758d0d75882fef0fe0e3c74c4756b3e81e (diff)
bpo-40328: Add tool for generating cjk mapping headers (GH-19602)
Diffstat (limited to 'Tools/unicode/genmap_schinese.py')
-rw-r--r--  Tools/unicode/genmap_schinese.py | 149
1 file changed, 149 insertions, 0 deletions
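For anyone reviewing the patch locally, here is a minimal sketch of how the new tool is presumably meant to be driven. Nothing below is specified by this commit: the copy step, its destination path, and the assumption that open_mapping_file() fetches the mapping files on first run are all inferred from the diff that follows.

# Hypothetical regeneration workflow (an assumption pieced together from the
# diff below, not documented in this commit).  genmap_schinese.py is expected
# to fetch GB2312.TXT, CP936.TXT and gb-18030-2000.xml into python-mappings/
# via genmap_support.open_mapping_file() if they are not already present, and
# it writes mappings_cn.h into its working directory.
import shutil
import subprocess

# Run the generator from Tools/unicode/ (assumes "python" is on PATH and the
# current directory is a CPython checkout).
subprocess.run(["python", "genmap_schinese.py"], cwd="Tools/unicode", check=True)

# Copy the refreshed header next to the C codec sources; the destination is an
# assumption based on where mappings_cn.h lives in the tree.
shutil.copy("Tools/unicode/mappings_cn.h", "Modules/cjkcodecs/mappings_cn.h")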
diff --git a/Tools/unicode/genmap_schinese.py b/Tools/unicode/genmap_schinese.py
new file mode 100644
index 0000000..647c033
--- /dev/null
+++ b/Tools/unicode/genmap_schinese.py
@@ -0,0 +1,149 @@
+#
+# genmap_schinese.py: Simplified Chinese Codecs Map Generator
+#
+# Original Author: Hye-Shik Chang <perky@FreeBSD.org>
+# Modified by: Dong-hee Na <donghee.na92@gmail.com>
+#
+import os
+import re
+
+from genmap_support import *
+
+
+GB2312_C1 = (0x21, 0x7e)
+GB2312_C2 = (0x21, 0x7e)
+GBKL1_C1 = (0x81, 0xa8)
+GBKL1_C2 = (0x40, 0xfe)
+GBKL2_C1 = (0xa9, 0xfe)
+GBKL2_C2 = (0x40, 0xa0)
+GB18030EXTP1_C1 = (0xa1, 0xa9)
+GB18030EXTP1_C2 = (0x40, 0xfe)
+GB18030EXTP2_C1 = (0xaa, 0xaf)
+GB18030EXTP2_C2 = (0xa1, 0xfe)
+GB18030EXTP3_C1 = (0xd7, 0xd7)
+GB18030EXTP3_C2 = (0xfa, 0xfe)
+GB18030EXTP4_C1 = (0xf8, 0xfd)
+GB18030EXTP4_C2 = (0xa1, 0xfe)
+GB18030EXTP5_C1 = (0xfe, 0xfe)
+GB18030EXTP5_C2 = (0x50, 0xfe)
+
+MAPPINGS_GB2312 = 'http://people.freebsd.org/~perky/i18n/GB2312.TXT'
+MAPPINGS_CP936 = 'http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP936.TXT'
+MAPPINGS_GB18030 = 'http://oss.software.ibm.com/cvs/icu/~checkout~/charset/data/xml/gb-18030-2000.xml'
+
+re_gb18030ass = re.compile('<a u="([A-F0-9]{4})" b="([0-9A-F ]+)"/>')
+
+
+def parse_gb18030map(fo):
+    m, gbuni = {}, {}
+    for i in range(65536):
+        if i < 0xd800 or i > 0xdfff:  # exclude unicode surrogate area
+            gbuni[i] = None
+    for uni, native in re_gb18030ass.findall(fo.read()):
+        uni = eval('0x'+uni)
+        native = [eval('0x'+u) for u in native.split()]
+        if len(native) <= 2:
+            del gbuni[uni]
+        if len(native) == 2:  # we can decode algorithmically for 1 or 4 bytes
+            m.setdefault(native[0], {})
+            m[native[0]][native[1]] = uni
+    gbuni = [k for k in gbuni.keys()]
+    gbuni.sort()
+    return m, gbuni
+
+def main():
+ print("Loading Mapping File...")
+ gb2312map = open_mapping_file('python-mappings/GB2312.TXT', MAPPINGS_GB2312)
+ cp936map = open_mapping_file('python-mappings/CP936.TXT', MAPPINGS_CP936)
+ gb18030map = open_mapping_file('python-mappings/gb-18030-2000.xml', MAPPINGS_GB18030)
+
+ gb18030decmap, gb18030unilinear = parse_gb18030map(gb18030map)
+ gbkdecmap = loadmap(cp936map)
+ gb2312decmap = loadmap(gb2312map)
+ difmap = {}
+ for c1, m in gbkdecmap.items():
+ for c2, code in m.items():
+ del gb18030decmap[c1][c2]
+ if not gb18030decmap[c1]:
+ del gb18030decmap[c1]
+ for c1, m in gb2312decmap.items():
+ for c2, code in m.items():
+ gbkc1, gbkc2 = c1 | 0x80, c2 | 0x80
+ if gbkdecmap[gbkc1][gbkc2] == code:
+ del gbkdecmap[gbkc1][gbkc2]
+ if not gbkdecmap[gbkc1]:
+ del gbkdecmap[gbkc1]
+
+    gb2312_gbkencmap, gb18030encmap = {}, {}
+    for c1, m in gbkdecmap.items():
+        for c2, code in m.items():
+            gb2312_gbkencmap.setdefault(code >> 8, {})
+            gb2312_gbkencmap[code >> 8][code & 0xff] = c1 << 8 | c2  # MSB set
+    for c1, m in gb2312decmap.items():
+        for c2, code in m.items():
+            gb2312_gbkencmap.setdefault(code >> 8, {})
+            gb2312_gbkencmap[code >> 8][code & 0xff] = c1 << 8 | c2  # MSB unset
+    for c1, m in gb18030decmap.items():
+        for c2, code in m.items():
+            gb18030encmap.setdefault(code >> 8, {})
+            gb18030encmap[code >> 8][code & 0xff] = c1 << 8 | c2
+
+    with open('mappings_cn.h', 'w') as fp:
+        print_autogen(fp, os.path.basename(__file__))
+
+        print("Generating GB2312 decode map...")
+        writer = DecodeMapWriter(fp, "gb2312", gb2312decmap)
+        writer.update_decode_map(GB2312_C1, GB2312_C2)
+        writer.generate()
+
+        print("Generating GBK decode map...")
+        writer = DecodeMapWriter(fp, "gbkext", gbkdecmap)
+        writer.update_decode_map(GBKL1_C1, GBKL1_C2)
+        writer.update_decode_map(GBKL2_C1, GBKL2_C2)
+        writer.generate()
+
+        print("Generating GB2312 && GBK encode map...")
+        writer = EncodeMapWriter(fp, "gbcommon", gb2312_gbkencmap)
+        writer.generate()
+
+        print("Generating GB18030 extension decode map...")
+        writer = DecodeMapWriter(fp, "gb18030ext", gb18030decmap)
+        for i in range(1, 6):
+            writer.update_decode_map(eval("GB18030EXTP%d_C1" % i), eval("GB18030EXTP%d_C2" % i))
+
+        writer.generate()
+
+        print("Generating GB18030 extension encode map...")
+        writer = EncodeMapWriter(fp, "gb18030ext", gb18030encmap)
+        writer.generate()
+
+ print("Generating GB18030 Unicode BMP Mapping Ranges...")
+ ranges = [[-1, -1, -1]]
+ gblinnum = 0
+ fp.write("""
+static const struct _gb18030_to_unibmp_ranges {
+ Py_UCS4 first, last;
+ DBCHAR base;
+} gb18030_to_unibmp_ranges[] = {
+""")
+
+        for uni in gb18030unilinear:
+            if uni == ranges[-1][1] + 1:
+                ranges[-1][1] = uni
+            else:
+                ranges.append([uni, uni, gblinnum])
+            gblinnum += 1
+
+        filler = BufferedFiller()
+        for first, last, base in ranges[1:]:
+            filler.write('{', str(first), ',', str(last), ',', str(base), '},')
+
+        filler.write('{', '0,', '0,', str(
+            ranges[-1][2] + ranges[-1][1] - ranges[-1][0] + 1), '}', '};')
+        filler.printout(fp)
+
+ print("Done!")
+
+
+if __name__ == '__main__':
+    main()
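To make the parsing and range logic above easier to follow, here is a small standalone sketch (not part of the patch). The two sample assignments are hand-written in the format re_gb18030ass expects rather than taken verbatim from gb-18030-2000.xml, and the final (first, last, base) triple uses made-up values purely to show the arithmetic.

# Standalone illustration of parse_gb18030map(): which entries end up in the
# two-byte decode map, and how the emitted {first, last, base} ranges are read.
import re

re_gb18030ass = re.compile('<a u="([A-F0-9]{4})" b="([0-9A-F ]+)"/>')

# Hand-written sample entries in the same shape as the gb-18030-2000.xml data.
sample = '<a u="4E02" b="81 40"/>\n<a u="00DE" b="81 30 89 37"/>'
for uni, native in re_gb18030ass.findall(sample):
    codepoint = int(uni, 16)
    seq = [int(b, 16) for b in native.split()]
    if len(seq) == 2:
        print(hex(codepoint), [hex(b) for b in seq], "-> stored in the decode map")
    else:
        print(hex(codepoint), [hex(b) for b in seq],
              "-> handled algorithmically, not stored in the decode map")

# gb18030_to_unibmp_ranges stores each run of BMP code points that lack a one-
# or two-byte form as {first, last, base}, where base counts how many such
# code points come before the run.  A code point inside the range therefore
# sits at linear position base + (uni - first).  Values below are made up.
first, last, base = 0x0452, 0x2000, 300
uni = 0x0500
assert first <= uni <= last
print("linear index:", base + (uni - first))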