//! Returns the number of bytes of the UTF-8 sequence that is introduced by
//! the lead byte \a c.
//!
//! Only the bit pattern of \a c itself is inspected:
//!  - 0xxx.xxxx -> 1 (single byte character)
//!  - 110x.xxxx -> 2
//!  - 1110.xxxx -> 3
//!  - 1111.0xxx -> 4
//!  - 1111.10xx -> 5
//!  - 1111.110x -> 6
//!
//! Any other byte value (a 10xx.xxxx continuation byte, or 0xFE/0xFF) matches
//! none of the lead byte patterns and yields 1.
int getUTF8CharNumBytes(char c)
{
  // inspect the byte as unsigned, so values >= 0x80 are not treated as
  // negative numbers on platforms where plain char is signed
  const unsigned char uc = static_cast<unsigned char>(c);

  if (uc<0x80u) return 1; // 0xxx.xxxx: single byte character

  // the lead byte patterns are mutually exclusive, so the first match decides
  if ((uc&0xE0u)==0xC0u) return 2; // 110x.xxxx: 2 byte character
  if ((uc&0xF0u)==0xE0u) return 3; // 1110.xxxx: 3 byte character
  if ((uc&0xF8u)==0xF0u) return 4; // 1111.0xxx: 4 byte character
  if ((uc&0xFCu)==0xF8u) return 5; // 1111.10xx: 5 byte character
  if ((uc&0xFEu)==0xFCu) return 6; // 1111.110x: 6 byte character

  return 1; // not a lead byte pattern: count it as a single byte
}
given the number of bytes it's made of static inline uint32_t decode_utf8( const char* data , int numBytes ) noexcept { uint32_t cp = (unsigned char)*data; if (numBytes>1) { cp &= 0x7F >> numBytes; // Mask out the header bits for (int i=1 ; i(*s); switch (bytesLeft) { default: if ((uc&0xFEu)==0xFCu)// 1111110X six bytes { len=6; return decode_utf8(s,len); } // fall through case 5: if ((uc&0xFCu)==0xF8u) // 111110XX five bytes { len=5; return decode_utf8(s,len); } // fall through case 4: if ((uc&0xF8u)==0xF0u) // 11110XXX four bytes { len=4; return decode_utf8(s,len); } // fall through case 3: if ((uc&0xF0u)==0xE0u) // 1110XXXX three bytes { len=3; return decode_utf8(s,len); } // fall through case 2: if ((uc&0xE0u)==0xC0u) // 110XXXXX two bytes { len=2; return decode_utf8(s,len); } // fall through case 1: { len=1; return uc; } } len=0; return 0; } std::string getUTF8CharAt(const std::string &input,size_t pos) { if (input.length()<=pos) return std::string(); int numBytes=getUTF8CharNumBytes(input[pos]); if (input.length()(input[0])==0xC2 && static_cast(input[1])==0xA0) ? 2 : 0; }