Bug 705910 - Indexing and searching cannot treat non ASCII identifiers

author: Dimitri van Heesch <dimitri@stack.nl> 2013-09-14 15:11:20 (GMT)
committer: Dimitri van Heesch <dimitri@stack.nl> 2013-09-15 18:12:34 (GMT)
commit: bca6baee6685b489c36abec5a3b550921294e228 (patch)
tree: a9f2f6d2622e8aba8beed7477e87d7c9c7a468e1 /src/util.cpp
parent: f6bc941e73bd562b15705f7bc3c958267f75f842 (diff)
download: Doxygen-bca6baee6685b489c36abec5a3b550921294e228.zip
Doxygen-bca6baee6685b489c36abec5a3b550921294e228.tar.gz
Doxygen-bca6baee6685b489c36abec5a3b550921294e228.tar.bz2
1 files changed, 69 insertions, 0 deletions
diff --git a/src/util.cpp b/src/util.cpp
index 5abe4ed..60a0fe1 100644
--- a/src/util.cpp
+++ b/src/util.cpp
@@ -7919,3 +7919,72 @@ void addDocCrossReference(MemberDef *src,MemberDef *dst)
   }
 }
 
+//--------------------------------------------------------------------------------------
+
+/*! @brief Decode the UTF-8 sequence starting at byte \a idx of \a s into a code point.
+ *
+ * A byte that cannot start a multi-byte sequence (below 0xC2, or 0xF8 and above)
+ * is returned unchanged; trailing bytes are masked with 0x3f, not validated.
+ *
+ * @param s utf-8 encoded string
+ * @param idx byte position in \a s at which the character starts.
+ * @return the unicode codepoint, or 0 if \a idx lies outside \a s or the
+ *         sequence is cut short by the end of \a s.
+ * @see getUtf8CodeToLower()
+ * @see getUtf8CodeToUpper()
+ */
+uint getUtf8Code( const QCString& s, int idx )
+{
+  const int length = s.length();
+  if (idx < 0 || idx >= length) { return 0; } // reject offsets on either side of s
+  const uint c0 = (uchar)s.at(idx);
+  if ( c0 < 0xC2 || c0 >= 0xF8 ) { return c0; } // 1 byte character or invalid lead byte
+  if (idx+1 >= length) { return 0; }
+  const uint c1 = ((uchar)s.at(idx+1)) & 0x3f;
+  if ( c0 < 0xE0 ) // 2 byte character
+  {
+    return ((c0 & 0x1f) << 6) | c1;
+  }
+  if (idx+2 >= length) { return 0; }
+  const uint c2 = ((uchar)s.at(idx+2)) & 0x3f;
+  if ( c0 < 0xF0 ) // 3 byte character
+  {
+    return ((c0 & 0x0f) << 12) | (c1 << 6) | c2;
+  }
+  if (idx+3 >= length) { return 0; }
+  const uint c3 = ((uchar)s.at(idx+3)) & 0x3f; // 4 byte character
+  return ((c0 & 0x07) << 18) | (c1 << 12) | (c2 << 6) | c3;
+}
+
+
+/*! @brief Like getUtf8Code(), but a result below 0x7f is passed through tolower().
+ *
+ * @param s utf-8 encoded string
+ * @param idx byte position in \a s at which the character starts.
+ * @return the code from getUtf8Code(), lower-cased via tolower() if below 0x7f.
+ * @see getUtf8Code()
+ */
+uint getUtf8CodeToLower( const QCString& s, int idx )
+{
+  const uint code = getUtf8Code( s, idx );
+  if ( code >= 0x7f ) { return code; } // only codes below 0x7f are case-mapped
+  return tolower( code );
+}
+
+
+/*! @brief Like getUtf8Code(), but a result below 0x7f is passed through toupper().
+ *
+ * @param s utf-8 encoded string
+ * @param idx byte position in \a s at which the character starts.
+ * @return the code from getUtf8Code(), upper-cased via toupper() if below 0x7f.
+ * @see getUtf8Code()
+ */
+uint getUtf8CodeToUpper( const QCString& s, int idx )
+{
+  const uint code = getUtf8Code( s, idx );
+  if ( code >= 0x7f ) { return code; } // only codes below 0x7f are case-mapped
+  return toupper( code );
+}
+
+//--------------------------------------------------------------------------------------
+
author	Dimitri van Heesch <dimitri@stack.nl>	2013-09-14 15:11:20 (GMT)
committer	Dimitri van Heesch <dimitri@stack.nl>	2013-09-15 18:12:34 (GMT)
commit	bca6baee6685b489c36abec5a3b550921294e228 (patch)
tree	a9f2f6d2622e8aba8beed7477e87d7c9c7a468e1 /src/util.cpp
parent	f6bc941e73bd562b15705f7bc3c958267f75f842 (diff)
download	Doxygen-bca6baee6685b489c36abec5a3b550921294e228.zip Doxygen-bca6baee6685b489c36abec5a3b550921294e228.tar.gz Doxygen-bca6baee6685b489c36abec5a3b550921294e228.tar.bz2