diff options
Diffstat (limited to 'src/utf8.h')
-rw-r--r-- | src/utf8.h | 71 |
1 files changed, 71 insertions, 0 deletions
diff --git a/src/utf8.h b/src/utf8.h new file mode 100644 index 0000000..c4c8aad --- /dev/null +++ b/src/utf8.h @@ -0,0 +1,71 @@ +/****************************************************************************** + * + * Copyright (C) 1997-2021 by Dimitri van Heesch. + * + * Permission to use, copy, modify, and distribute this software and its + * documentation under the terms of the GNU General Public License is hereby + * granted. No representations are made about the suitability of this software + * for any purpose. It is provided "as is" without express or implied warranty. + * See the GNU General Public License for more details. + * + * Documents produced by Doxygen are derivative works derived from the + * input used in their production; they are not affected by this license. + * + */ + +#ifndef UTF8_H +#define UTF8_H + +#include <cstdint> +#include <string> +#include <iostream> + +/** @file + * @brief Various UTF8 related helper functions. + * + * See https://en.wikipedia.org/wiki/UTF-8 for details on UTF8 encoding. + */ + + +/** Converts the input string into a lower case version, also taking into account + * non-ASCII characters that has a lower case variant. + */ +std::string convertUTF8ToLower(const std::string &input); + +/** Converts the input string into a upper case version, also taking into account + * non-ASCII characters that has a upper case variant. + */ +std::string convertUTF8ToUpper(const std::string &input); + +/** Returns the UTF8 character found at byte position pos in the input string. + * The resulting string can be a multi byte sequence. + */ +std::string getUTF8CharAt(const std::string &input,size_t pos); + +/** Returns the 32bit Unicode value matching character at byte position pos in + * the UTF8 encoded input. + */ +uint32_t getUnicodeForUTF8CharAt(const std::string &input,size_t pos); + +/** Returns the number of bytes making up a single UTF8 character given the first byte + * in the sequence. + */ +int getUTF8CharNumBytes(char firstByte); + +/** Writes the UTF8 character pointed to by s to stream t and returns a pointer + * to the next character. + */ +const char *writeUTF8Char(std::ostream &t,const char *s); + +/** Returns true iff the last character in input is a multibyte character. */ +bool lastUTF8CharIsMultibyte(const std::string &input); + +/** Returns true iff the input string at byte position pos holds an upper case character. */ +bool isUTF8CharUpperCase(const std::string &input,size_t pos); + +/** Check if the first character pointed at by input is a non-breakable whitespace character. + * Returns the byte size of the character if there is match or 0 if not. + */ +int isUTF8NonBreakableSpace(const char *input); + +#endif |