From 4910e7500060284d815092d8058bbc3e30b925c8 Mon Sep 17 00:00:00 2001
From: Dimitri van Heesch
Date: Tue, 27 Apr 2021 20:27:56 +0200
Subject: Optimize UTF8 lower/upper case conversion for ASCII

---
 src/utf8.cpp | 43 ++++++++++++++++++++++++++++++++++---------
 1 file changed, 34 insertions(+), 9 deletions(-)

diff --git a/src/utf8.cpp b/src/utf8.cpp
index a00f615..e7108f4 100644
--- a/src/utf8.cpp
+++ b/src/utf8.cpp
@@ -74,6 +74,11 @@ static inline uint32_t convertUTF8CharToUnicode(const char *s,size_t bytesLeft,i
     return 0;
   }
   unsigned char uc = static_cast<unsigned char>(*s);
+  if (uc<128) // ASCII case
+  {
+    len=1;
+    return uc;
+  }
   switch (bytesLeft)
   {
     default:
@@ -134,39 +139,59 @@ uint32_t getUnicodeForUTF8CharAt(const std::string &input,size_t pos)
   return convertUTF8CharToUnicode(charS.c_str(),charS.length(),len);
 }
 
+static inline char asciiToLower(uint32_t code)
+{
+  return code>='A' && code<='Z' ? (char)(code+'a'-'A') : (char)code;
+}
+
+static inline char asciiToUpper(uint32_t code)
+{
+  return code>='a' && code<='z' ? (char)(code+'A'-'a') : (char)code;
+}
+
 static inline std::string caseConvert(const std::string &input,
+                                      char (*asciiConversionFunc)(uint32_t code),
                                       const char *(*conversionFunc)(uint32_t code))
 {
   uint32_t code;
-  TextStream result;
+  std::string result;
+  result.reserve(input.length()); // assume all ASCII characters
   int len;
   size_t bytesLeft = input.length();
   const char *p = input.c_str();
   while ((code=convertUTF8CharToUnicode(p,bytesLeft,len)))
   {
-    const char *conv = conversionFunc(code);
-    if (conv==nullptr) // no difference between lower and upper case
+    if (code<128) // ASCII case
     {
-      result.write(p,len);
+      char c = asciiConversionFunc(code);
+      result+=c;
     }
-    else // replace the input character with the conversion result
+    else // generic case
     {
-      result << conv;
+      const char *conv = conversionFunc(code);
+      if (conv==nullptr) // no difference between lower and upper case
+      {
+        result.append(p,len);
+      }
+      else // replace the input character with the conversion result
+      {
+        result.append(conv);
+      }
     }
     p+=len;
     bytesLeft-=len;
   }
-  return result.str();
+  return result;
 }
 
 std::string convertUTF8ToLower(const std::string &input)
 {
-  return caseConvert(input,convertUnicodeToLower);
+  return caseConvert(input,asciiToLower,convertUnicodeToLower);
 }
 
 std::string convertUTF8ToUpper(const std::string &input)
 {
-  return caseConvert(input,convertUnicodeToUpper);
+  return caseConvert(input,asciiToUpper,convertUnicodeToUpper);
 }
 
 const char *writeUTF8Char(TextStream &t,const char *s)
-- 
cgit v0.12