src/caseconvert.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59

# Python 3 script to generate caseconvert.h.
# It uses the difference between lower() and upper() on a character to build a
# mapping from a given Unicode code point to its lower- or upper-case form,
# emitted as a UTF-8 byte sequence.
# This also includes multi-byte characters.

import codecs

# Maps a one-character string to the result of str.upper() on it; only
# characters whose upper-case form differs are stored (filled by the scan below).
toupper = {}
# Maps a one-character string to the result of str.lower() on it; only
# characters whose lower-case form differs are stored (filled by the scan below).
tolower = {}

def writeMapping(file,mapping):
    """Emit one C++ ``case`` line per entry of *mapping* to *file*.

    *mapping* maps a source character to its converted string. Entries are
    written in ascending key order. Each line has the form
    ``case <hex code point> /* src */: BSEQ(<utf-8 bytes of dst>) /* dst */;``
    where the code point is taken from the first character of the key.
    """
    for source in sorted(mapping):
        target = mapping[source]
        code_point = hex(ord(source[0]))
        # Comma-separated UTF-8 bytes of the converted string, e.g. "0xc3,0x89".
        byte_list = ",".join("0x%02x" % b for b in target.encode('utf-8'))
        line = f"    case {code_point} /* {source} */: BSEQ({byte_list}) /* {target} */;\n"
        file.write(line)

# Scan code points 0 .. 0x1FFFE and record every character whose lower- or
# upper-case form starts with a different character than the original.
for codeValue in range(0x1FFFF):
    s = chr(codeValue)
    sl, su = s.lower(), s.upper()
    # s is a single character, so comparing it with the first character of
    # the converted string is the same as comparing their code points.
    if sl[0] != s:
        tolower[s] = sl
    if su[0] != s:
        toupper[s] = su

# Write the generated C++ header. The context manager guarantees the file is
# flushed and closed even if writing fails part-way through (the original
# handle was never closed). newline="\n" disables newline translation so the
# output bytes are the same on every platform.
with open("caseconvert.h", "w", encoding="utf-8", newline="\n") as file:
    file.write(r'''/** This file is generated by python3 caseconvert.py. DO NOT EDIT! */

#ifndef CASECONVERT_H
#define CASECONVERT_H

#include <cstdint>
#include <string>

#define BSEQ(...) { static unsigned char s[] = { __VA_ARGS__, 0x00 }; \
                    return reinterpret_cast<const char *>(s); }

inline const char *convertUnicodeToUpper(uint32_t code)
{
  switch(code)
  {
''')
    # One `case` label per character that has a distinct upper-case form.
    writeMapping(file, toupper)
    file.write(r'''    default: return nullptr;
  }
}

inline const char *convertUnicodeToLower(uint32_t code)
{
  switch(code)
  {
''')
    # One `case` label per character that has a distinct lower-case form.
    writeMapping(file, tolower)
    file.write(r'''    default: return nullptr;
  }
}

#endif
''')