summaryrefslogtreecommitdiffstats
path: root/src/utf8.h
diff options
context:
space:
mode:
authorDimitri van Heesch <doxygen@gmail.com>2021-03-22 19:02:06 (GMT)
committerDimitri van Heesch <doxygen@gmail.com>2021-03-22 19:02:06 (GMT)
commita4ecbee86766b35d25d41d1a178806e1688485df (patch)
tree0cb08f45ced6dd4ed97188972c5a718c94219d46 /src/utf8.h
parentfa1897b1889f7bf74de68f1ac99cf3be343a7551 (diff)
downloadDoxygen-a4ecbee86766b35d25d41d1a178806e1688485df.zip
Doxygen-a4ecbee86766b35d25d41d1a178806e1688485df.tar.gz
Doxygen-a4ecbee86766b35d25d41d1a178806e1688485df.tar.bz2
issue #8375: Lowercase search does not find non-ASCII uppercase pages and vice versa
Diffstat (limited to 'src/utf8.h')
-rw-r--r--src/utf8.h71
1 files changed, 71 insertions, 0 deletions
diff --git a/src/utf8.h b/src/utf8.h
new file mode 100644
index 0000000..c4c8aad
--- /dev/null
+++ b/src/utf8.h
@@ -0,0 +1,71 @@
+/******************************************************************************
+ *
+ * Copyright (C) 1997-2021 by Dimitri van Heesch.
+ *
+ * Permission to use, copy, modify, and distribute this software and its
+ * documentation under the terms of the GNU General Public License is hereby
+ * granted. No representations are made about the suitability of this software
+ * for any purpose. It is provided "as is" without express or implied warranty.
+ * See the GNU General Public License for more details.
+ *
+ * Documents produced by Doxygen are derivative works derived from the
+ * input used in their production; they are not affected by this license.
+ *
+ */
+
+#ifndef UTF8_H
+#define UTF8_H
+
+#include <cstdint>
+#include <string>
+#include <iostream>
+
+/** @file
+ * @brief Various UTF8 related helper functions.
+ *
+ * See https://en.wikipedia.org/wiki/UTF-8 for details on UTF8 encoding.
+ */
+
+
+/** Converts the input string into a lower case version, also taking into account
+ * non-ASCII characters that have a lower case variant.
+ */
+std::string convertUTF8ToLower(const std::string &input);
+
+/** Converts the input string into an upper case version, also taking into account
+ * non-ASCII characters that have an upper case variant.
+ */
+std::string convertUTF8ToUpper(const std::string &input);
+
+/** Returns the UTF8 character found at byte position pos in the input string.
+ * The resulting string can be a multi byte sequence.
+ */
+std::string getUTF8CharAt(const std::string &input,size_t pos);
+
+/** Returns the 32bit Unicode value matching the character at byte position pos in
+ * the UTF8 encoded input.
+ */
+uint32_t getUnicodeForUTF8CharAt(const std::string &input,size_t pos);
+
+/** Returns the number of bytes making up a single UTF8 character given the first byte
+ * in the sequence.
+ */
+int getUTF8CharNumBytes(char firstByte);
+
+/** Writes the UTF8 character pointed to by s to stream t and returns a pointer
+ * to the next character.
+ */
+const char *writeUTF8Char(std::ostream &t,const char *s);
+
+/** Returns true iff the last character in input is a multibyte character. */
+bool lastUTF8CharIsMultibyte(const std::string &input);
+
+/** Returns true iff the input string at byte position pos holds an upper case character. */
+bool isUTF8CharUpperCase(const std::string &input,size_t pos);
+
+/** Check if the first character pointed at by input is a non-breakable whitespace character.
+ * Returns the byte size of the character if there is a match or 0 if not.
+ */
+int isUTF8NonBreakableSpace(const char *input);
+
+#endif