Incorrect handling of UTF-8 characters in latex reference statement

If a URL contains a literal UTF-8 character such as the Unicode en dash (U+2013), the page is accessed with `\T1\textendash`, for example: ``` https://en.wikipedia.org/wiki/Damerau\T1\textendash_Levenshtein_distance ``` and this page cannot be found, so the UTF-8 character has to be handled properly here. This was found while solving issue #8241.
author: albert-github <albert.tests@gmail.com> 2020-12-11 17:45:19 (GMT)
committer: albert-github <albert.tests@gmail.com> 2020-12-11 17:45:19 (GMT)
commit: 2421a8bb57aad7bbd43dd992d385a59006004f42 (patch)
tree: 84524a85dda9a01db44efcce95ccd2ee58cb69e2
parent: 979ea243de83a693e0d9da545ee4cbe7db9521ee (diff)
download: Doxygen-2421a8bb57aad7bbd43dd992d385a59006004f42.zip
Doxygen-2421a8bb57aad7bbd43dd992d385a59006004f42.tar.gz
Doxygen-2421a8bb57aad7bbd43dd992d385a59006004f42.tar.bz2
1 files changed, 57 insertions, 1 deletions
diff --git a/src/util.cpp b/src/util.cpp
index 74e5226..c00f4aa 100644
--- a/src/util.cpp
+++ b/src/util.cpp
@@ -5309,7 +5309,63 @@ QCString latexFilterURL(const char *s)
       case '%':  t << "\\%"; break;
       case '\\':  t << "\\\\"; break;
       default:
-        t << c;
+        if (c<0)
+        {
+          char ids[5];
+          const unsigned char uc = (unsigned char)c;
+          bool doEscape = TRUE;
+          if (uc <= 0xf7)
+          {
+            const signed char* pt = (signed char *)p;
+            ids[ 0 ] = c;
+            int l = 0;
+            if ((uc&0xE0)==0xC0)
+            {
+              l=2; // 11xx.xxxx: >=2 byte character
+            }
+            if ((uc&0xF0)==0xE0)
+            {
+              l=3; // 111x.xxxx: >=3 byte character
+            }
+            if ((uc&0xF8)==0xF0)
+            {
+              l=4; // 1111.xxxx: >=4 byte character
+            }
+            doEscape = l==0;
+            for (int m=1; m<l && !doEscape; ++m)
+            {
+              unsigned char ct = (unsigned char)*pt;
+              if (ct==0 || (ct&0xC0)!=0x80) // invalid unicode character
+              {
+                doEscape=TRUE;
+              }
+              else
+              {
+                ids[ m ] = *pt++;
+              }
+            }
+            if ( !doEscape ) // got a valid unicode character
+            {
+              static char map[] = "0123456789ABCDEF";
+              for (int m = 0; m < l; ++m)
+              {
+                unsigned char id = (unsigned char)ids[m];
+                t << "\\%" << map[id>>4] << map[id&0xF];
+              }
+              p += l - 1;
+            }
+          }
+          if (doEscape) // not a valid unicode char or escaping needed
+          {
+            static char map[] = "0123456789ABCDEF";
+            unsigned char id = (unsigned char)c;
+            t << c;
+          }
+        }
+        else
+        {
+          t << c;
+        }
         break;
     }
   }
author	albert-github <albert.tests@gmail.com>	2020-12-11 17:45:19 (GMT)
committer	albert-github <albert.tests@gmail.com>	2020-12-11 17:45:19 (GMT)
commit	2421a8bb57aad7bbd43dd992d385a59006004f42 (patch)
tree	84524a85dda9a01db44efcce95ccd2ee58cb69e2
parent	979ea243de83a693e0d9da545ee4cbe7db9521ee (diff)
download	Doxygen-2421a8bb57aad7bbd43dd992d385a59006004f42.zip Doxygen-2421a8bb57aad7bbd43dd992d385a59006004f42.tar.gz Doxygen-2421a8bb57aad7bbd43dd992d385a59006004f42.tar.bz2