diff options
author | Dimitri van Heesch <doxygen@gmail.com> | 2021-03-22 19:02:06 (GMT) |
---|---|---|
committer | Dimitri van Heesch <doxygen@gmail.com> | 2021-03-22 19:02:06 (GMT) |
commit | a4ecbee86766b35d25d41d1a178806e1688485df (patch) | |
tree | 0cb08f45ced6dd4ed97188972c5a718c94219d46 /src/utf8.h | |
parent | fa1897b1889f7bf74de68f1ac99cf3be343a7551 (diff) | |
download | Doxygen-a4ecbee86766b35d25d41d1a178806e1688485df.zip Doxygen-a4ecbee86766b35d25d41d1a178806e1688485df.tar.gz Doxygen-a4ecbee86766b35d25d41d1a178806e1688485df.tar.bz2 |
issue #8375: Lowercase search does not find non-ASCII uppercase pages and vice versa
Diffstat (limited to 'src/utf8.h')
-rw-r--r-- | src/utf8.h | 71 |
1 files changed, 71 insertions, 0 deletions
diff --git a/src/utf8.h b/src/utf8.h new file mode 100644 index 0000000..c4c8aad --- /dev/null +++ b/src/utf8.h @@ -0,0 +1,71 @@ +/****************************************************************************** + * + * Copyright (C) 1997-2021 by Dimitri van Heesch. + * + * Permission to use, copy, modify, and distribute this software and its + * documentation under the terms of the GNU General Public License is hereby + * granted. No representations are made about the suitability of this software + * for any purpose. It is provided "as is" without express or implied warranty. + * See the GNU General Public License for more details. + * + * Documents produced by Doxygen are derivative works derived from the + * input used in their production; they are not affected by this license. + * + */ + +#ifndef UTF8_H +#define UTF8_H + +#include <cstdint> +#include <string> +#include <iostream> + +/** @file + * @brief Various UTF8 related helper functions. + * + * See https://en.wikipedia.org/wiki/UTF-8 for details on UTF8 encoding. + */ + + +/** Converts the input string into a lower case version, also taking into account + * non-ASCII characters that have a lower case variant. + */ +std::string convertUTF8ToLower(const std::string &input); + +/** Converts the input string into an upper case version, also taking into account + * non-ASCII characters that have an upper case variant. + */ +std::string convertUTF8ToUpper(const std::string &input); + +/** Returns the UTF8 character found at byte position pos in the input string. + * The resulting string can be a multi byte sequence. + */ +std::string getUTF8CharAt(const std::string &input,size_t pos); + +/** Returns the 32bit Unicode value matching the character at byte position pos in + * the UTF8 encoded input. + */ +uint32_t getUnicodeForUTF8CharAt(const std::string &input,size_t pos); + +/** Returns the number of bytes making up a single UTF8 character given the first byte + * in the sequence. 
+ */ +int getUTF8CharNumBytes(char firstByte); + +/** Writes the UTF8 character pointed to by s to stream t and returns a pointer + * to the next character. + */ +const char *writeUTF8Char(std::ostream &t,const char *s); + +/** Returns true iff the last character in input is a multibyte character. */ +bool lastUTF8CharIsMultibyte(const std::string &input); + +/** Returns true iff the input string at byte position pos holds an upper case character. */ +bool isUTF8CharUpperCase(const std::string &input,size_t pos); + +/** Checks if the first character pointed at by input is a non-breakable whitespace character. + * Returns the byte size of the character if there is a match, or 0 if not. + */ +int isUTF8NonBreakableSpace(const char *input); + +#endif |