diff options
-rw-r--r-- | Source/cm_utf8.c | 10 | ||||
-rw-r--r-- | Tests/CMakeLib/testUTF8.cxx | 26 |
2 files changed, 29 insertions, 7 deletions
diff --git a/Source/cm_utf8.c b/Source/cm_utf8.c index 52af4a6..d41d097 100644 --- a/Source/cm_utf8.c +++ b/Source/cm_utf8.c @@ -71,6 +71,16 @@ const char* cm_utf8_decode_character(const char* first, const char* last, return 0; } + /* UTF-16 surrogate halves. */ + if (0xD800 <= uc && uc <= 0xDFFF) { + return 0; + } + + /* Invalid codepoints. */ + if (0x10FFFF < uc) { + return 0; + } + *pc = uc; return first; } diff --git a/Tests/CMakeLib/testUTF8.cxx b/Tests/CMakeLib/testUTF8.cxx index c99c46d..7f52c82 100644 --- a/Tests/CMakeLib/testUTF8.cxx +++ b/Tests/CMakeLib/testUTF8.cxx @@ -21,17 +21,29 @@ struct test_utf8_entry }; static test_utf8_entry const good_entry[] = { - { 1, "\x20\x00\x00\x00", 0x0020 }, /* Space. */ - { 2, "\xC2\xA9\x00\x00", 0x00A9 }, /* Copyright. */ - { 3, "\xE2\x80\x98\x00", 0x2018 }, /* Open-single-quote. */ - { 3, "\xE2\x80\x99\x00", 0x2019 }, /* Close-single-quote. */ - { 4, "\xF0\xA3\x8E\xB4", 0x233B4 }, /* Example from RFC 3629. */ + { 1, "\x20\x00\x00\x00", 0x0020 }, /* Space. */ + { 2, "\xC2\xA9\x00\x00", 0x00A9 }, /* Copyright. */ + { 3, "\xE2\x80\x98\x00", 0x2018 }, /* Open-single-quote. */ + { 3, "\xE2\x80\x99\x00", 0x2019 }, /* Close-single-quote. */ + { 4, "\xF0\xA3\x8E\xB4", 0x233B4 }, /* Example from RFC 3629. */ + { 3, "\xED\x80\x80\x00", 0xD000 }, /* Valid 0xED prefixed codepoint. */ + { 4, "\xF4\x8F\xBF\xBF", 0x10FFFF }, /* Highest valid RFC codepoint. */ { 0, { 0, 0, 0, 0, 0 }, 0 } }; static test_utf8_char const bad_chars[] = { - "\x80\x00\x00\x00", "\xC0\x00\x00\x00", "\xE0\x00\x00\x00", - "\xE0\x80\x80\x00", "\xF0\x80\x80\x80", { 0, 0, 0, 0, 0 } + "\x80\x00\x00\x00", /* Leading continuation byte. */ + "\xC0\x80\x00\x00", /* Overlong encoding. */ + "\xC1\x80\x00\x00", /* Overlong encoding. */ + "\xC2\x00\x00\x00", /* Missing continuation byte. */ + "\xE0\x00\x00\x00", /* Missing continuation bytes. */ + "\xE0\x80\x80\x00", /* Overlong encoding. */ + "\xF0\x80\x80\x80", /* Overlong encoding. */ + "\xED\xA0\x80\x00", /* UTF-16 surrogate half. */ + "\xED\xBF\xBF\x00", /* UTF-16 surrogate half. */ + "\xF4\x90\x80\x80", /* Lowest out-of-range codepoint. */ + "\xF5\x80\x80\x80", /* Prefix forces out-of-range codepoints. */ + { 0, 0, 0, 0, 0 } }; static void report_good(bool passed, test_utf8_char const c) |