summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authoralbert-github <albert.tests@gmail.com>2020-12-11 17:45:19 (GMT)
committeralbert-github <albert.tests@gmail.com>2020-12-11 17:45:19 (GMT)
commit2421a8bb57aad7bbd43dd992d385a59006004f42 (patch)
tree84524a85dda9a01db44efcce95ccd2ee58cb69e2
parent979ea243de83a693e0d9da545ee4cbe7db9521ee (diff)
downloadDoxygen-2421a8bb57aad7bbd43dd992d385a59006004f42.zip
Doxygen-2421a8bb57aad7bbd43dd992d385a59006004f42.tar.gz
Doxygen-2421a8bb57aad7bbd43dd992d385a59006004f42.tar.bz2
Incorrect handling of UTF-8 characters in latex reference statement
If there is a literal UTF-8 character such as the Unicode En Dash (U+2013) in a URL, the page is accessed with `\T1\textendash`, like: ``` https://en.wikipedia.org/wiki/Damerau\T1\textendash_Levenshtein_distance ``` and this page cannot be found, so the UTF-8 character has to be handled properly here. This was found when solving issue #8241.
-rw-r--r--src/util.cpp58
1 files changed, 57 insertions, 1 deletions
diff --git a/src/util.cpp b/src/util.cpp
index 74e5226..c00f4aa 100644
--- a/src/util.cpp
+++ b/src/util.cpp
@@ -5309,7 +5309,63 @@ QCString latexFilterURL(const char *s)
case '%': t << "\\%"; break;
case '\\': t << "\\\\"; break;
default:
- t << c;
+ if (c<0)
+ {
+ char ids[5];
+ const unsigned char uc = (unsigned char)c;
+ bool doEscape = TRUE;
+ if (uc <= 0xf7)
+ {
+ const signed char* pt = (signed char *)p;
+ ids[ 0 ] = c;
+ int l = 0;
+ if ((uc&0xE0)==0xC0)
+ {
+ l=2; // 11xx.xxxx: >=2 byte character
+ }
+ if ((uc&0xF0)==0xE0)
+ {
+ l=3; // 111x.xxxx: >=3 byte character
+ }
+ if ((uc&0xF8)==0xF0)
+ {
+ l=4; // 1111.xxxx: >=4 byte character
+ }
+ doEscape = l==0;
+ for (int m=1; m<l && !doEscape; ++m)
+ {
+ unsigned char ct = (unsigned char)*pt;
+ if (ct==0 || (ct&0xC0)!=0x80) // invalid unicode character
+ {
+ doEscape=TRUE;
+ }
+ else
+ {
+ ids[ m ] = *pt++;
+ }
+ }
+ if ( !doEscape ) // got a valid unicode character
+ {
+ static char map[] = "0123456789ABCDEF";
+ for (int m = 0; m < l; ++m)
+ {
+ unsigned char id = (unsigned char)ids[m];
+ t << "\\%" << map[id>>4] << map[id&0xF];
+ }
+ p += l - 1;
+ }
+ }
+ if (doEscape) // not a valid unicode char or escaping needed
+ {
+ static char map[] = "0123456789ABCDEF";
+ unsigned char id = (unsigned char)c;
+ t << c;
+ }
+ }
+ else
+ {
+ t << c;
+ }
break;
}
}