diff options
author | albert-github <albert.tests@gmail.com> | 2020-12-11 17:45:19 (GMT) |
---|---|---|
committer | albert-github <albert.tests@gmail.com> | 2020-12-11 17:45:19 (GMT) |
commit | 2421a8bb57aad7bbd43dd992d385a59006004f42 (patch) | |
tree | 84524a85dda9a01db44efcce95ccd2ee58cb69e2 /src | |
parent | 979ea243de83a693e0d9da545ee4cbe7db9521ee (diff) | |
download | Doxygen-2421a8bb57aad7bbd43dd992d385a59006004f42.zip Doxygen-2421a8bb57aad7bbd43dd992d385a59006004f42.tar.gz Doxygen-2421a8bb57aad7bbd43dd992d385a59006004f42.tar.bz2 |
Incorrect handling of UTF-8 characters in latex reference statement
If there is a literal UTF-8 character such as the Unicode En Dash (U+2013) in a URL, the page is accessed with `\T1\textendash` in its address, like:
```
https://en.wikipedia.org/wiki/Damerau\T1\textendash_Levenshtein_distance
```
and this page cannot be found, so the UTF-8 character has to be handled properly here.
This was found while solving issue #8241.
Diffstat (limited to 'src')
-rw-r--r-- | src/util.cpp | 58 |
1 files changed, 57 insertions, 1 deletions
diff --git a/src/util.cpp b/src/util.cpp index 74e5226..c00f4aa 100644 --- a/src/util.cpp +++ b/src/util.cpp @@ -5309,7 +5309,63 @@ QCString latexFilterURL(const char *s) case '%': t << "\\%"; break; case '\\': t << "\\\\"; break; default: - t << c; + if (c<0) + { + char ids[5]; + const unsigned char uc = (unsigned char)c; + bool doEscape = TRUE; + if (uc <= 0xf7) + { + const signed char* pt = (signed char *)p; + ids[ 0 ] = c; + int l = 0; + if ((uc&0xE0)==0xC0) + { + l=2; // 11xx.xxxx: >=2 byte character + } + if ((uc&0xF0)==0xE0) + { + l=3; // 111x.xxxx: >=3 byte character + } + if ((uc&0xF8)==0xF0) + { + l=4; // 1111.xxxx: >=4 byte character + } + doEscape = l==0; + for (int m=1; m<l && !doEscape; ++m) + { + unsigned char ct = (unsigned char)*pt; + if (ct==0 || (ct&0xC0)!=0x80) // invalid unicode character + { + doEscape=TRUE; + } + else + { + ids[ m ] = *pt++; + } + } + if ( !doEscape ) // got a valid unicode character + { + static char map[] = "0123456789ABCDEF"; + for (int m = 0; m < l; ++m) + { + unsigned char id = (unsigned char)ids[m]; + t << "\\%" << map[id>>4] << map[id&0xF]; + } + p += l - 1; + } + } + if (doEscape) // not a valid unicode char or escaping needed + { + static char map[] = "0123456789ABCDEF"; + unsigned char id = (unsigned char)c; + t << c; + } + } + else + { + t << c; + } break; } } |