issue #8375: Lowercase search does not find non-ASCII uppercase pages and vice versa

author: Dimitri van Heesch <doxygen@gmail.com> 2021-03-22 19:02:06 (GMT)
committer: Dimitri van Heesch <doxygen@gmail.com> 2021-03-22 19:02:06 (GMT)
commit: a4ecbee86766b35d25d41d1a178806e1688485df (patch)
tree: 0cb08f45ced6dd4ed97188972c5a718c94219d46 /src/utf8.h
parent: fa1897b1889f7bf74de68f1ac99cf3be343a7551 (diff)
download: Doxygen-a4ecbee86766b35d25d41d1a178806e1688485df.zip
Doxygen-a4ecbee86766b35d25d41d1a178806e1688485df.tar.gz
Doxygen-a4ecbee86766b35d25d41d1a178806e1688485df.tar.bz2
1 files changed, 71 insertions, 0 deletions
diff --git a/src/utf8.h b/src/utf8.h
new file mode 100644
index 0000000..c4c8aad
--- /dev/null
+++ b/src/utf8.h
@@ -0,0 +1,71 @@
+/******************************************************************************
+ *
+ * Copyright (C) 1997-2021 by Dimitri van Heesch.
+ *
+ * Permission to use, copy, modify, and distribute this software and its
+ * documentation under the terms of the GNU General Public License is hereby
+ * granted. No representations are made about the suitability of this software
+ * for any purpose. It is provided "as is" without express or implied warranty.
+ * See the GNU General Public License for more details.
+ *
+ * Documents produced by Doxygen are derivative works derived from the
+ * input used in their production; they are not affected by this license.
+ *
+ */
+
+#ifndef UTF8_H
+#define UTF8_H
+
+#include <cstdint>
+#include <string>
+#include <iostream>
+
+/** @file
+ *  @brief Various UTF8 related helper functions.
+ *
+ *  See https://en.wikipedia.org/wiki/UTF-8 for details on UTF8 encoding.
+ */
+
+
+/** Converts the input string into a lower case version, also taking into account
+ *  non-ASCII characters that has a lower case variant.
+ */
+std::string convertUTF8ToLower(const std::string &input);
+
+/** Converts the input string into a upper case version, also taking into account
+ *  non-ASCII characters that has a upper case variant.
+ */
+std::string convertUTF8ToUpper(const std::string &input);
+
+/** Returns the UTF8 character found at byte position pos in the input string.
+ *  The resulting string can be a multi byte sequence.
+ */
+std::string getUTF8CharAt(const std::string &input,size_t pos);
+
+/** Returns the 32bit Unicode value matching character at byte position pos in
+ *  the UTF8 encoded input.
+ */
+uint32_t getUnicodeForUTF8CharAt(const std::string &input,size_t pos);
+
+/** Returns the number of bytes making up a single UTF8 character given the first byte
+ *  in the sequence.
+ */
+int getUTF8CharNumBytes(char firstByte);
+
+/** Writes the UTF8 character pointed to by s to stream t and returns a pointer
+ *  to the next character.
+ */
+const char *writeUTF8Char(std::ostream &t,const char *s);
+
+/** Returns true iff the last character in input is a multibyte character. */
+bool lastUTF8CharIsMultibyte(const std::string &input);
+
+/** Returns true iff the input string at byte position pos holds an upper case character. */
+bool isUTF8CharUpperCase(const std::string &input,size_t pos);
+
+/** Check if the first character pointed at by input is a non-breakable whitespace character.
+ *  Returns the byte size of the character if there is match or 0 if not.
+ */
+int isUTF8NonBreakableSpace(const char *input);
+
+#endif
author	Dimitri van Heesch <doxygen@gmail.com>	2021-03-22 19:02:06 (GMT)
committer	Dimitri van Heesch <doxygen@gmail.com>	2021-03-22 19:02:06 (GMT)
commit	a4ecbee86766b35d25d41d1a178806e1688485df (patch)
tree	0cb08f45ced6dd4ed97188972c5a718c94219d46 /src/utf8.h
parent	fa1897b1889f7bf74de68f1ac99cf3be343a7551 (diff)
download	Doxygen-a4ecbee86766b35d25d41d1a178806e1688485df.zip Doxygen-a4ecbee86766b35d25d41d1a178806e1688485df.tar.gz Doxygen-a4ecbee86766b35d25d41d1a178806e1688485df.tar.bz2