From fb5de060bc99635a5b18b3389cc15e9937b19a0e Mon Sep 17 00:00:00 2001 From: Ben Boeckel Date: Thu, 14 Mar 2019 13:26:21 -0400 Subject: cm_utf8: reject codepoints above 0x10FFFF Codepoints above 0x10FFFF are invalid: the Unicode standard caps the code space there because UTF-16, as specified today, cannot encode anything higher. --- Source/cm_utf8.c | 5 +++++ Tests/CMakeLib/testUTF8.cxx | 6 ++---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/Source/cm_utf8.c b/Source/cm_utf8.c index 2459c02..d41d097 100644 --- a/Source/cm_utf8.c +++ b/Source/cm_utf8.c @@ -76,6 +76,11 @@ const char* cm_utf8_decode_character(const char* first, const char* last, return 0; } + /* Invalid codepoints. */ + if (0x10FFFF < uc) { + return 0; + } + *pc = uc; return first; } diff --git a/Tests/CMakeLib/testUTF8.cxx b/Tests/CMakeLib/testUTF8.cxx index f1da6df..7f52c82 100644 --- a/Tests/CMakeLib/testUTF8.cxx +++ b/Tests/CMakeLib/testUTF8.cxx @@ -28,10 +28,6 @@ static test_utf8_entry const good_entry[] = { { 4, "\xF0\xA3\x8E\xB4", 0x233B4 }, /* Example from RFC 3629. */ { 3, "\xED\x80\x80\x00", 0xD000 }, /* Valid 0xED prefixed codepoint. */ { 4, "\xF4\x8F\xBF\xBF", 0x10FFFF }, /* Highest valid RFC codepoint. */ - /* These are invalid according to the RFC, but accepted here. */ - { 4, "\xF4\x90\x80\x80", 0x110000 }, /* Lowest out-of-range codepoint. */ - { 4, "\xF5\x80\x80\x80", - 0x140000 }, /* Prefix forces out-of-range codepoints. */ { 0, { 0, 0, 0, 0, 0 }, 0 } }; @@ -45,6 +41,8 @@ static test_utf8_char const bad_chars[] = { "\xF0\x80\x80\x80", /* Overlong encoding. */ "\xED\xA0\x80\x00", /* UTF-16 surrogate half. */ "\xED\xBF\xBF\x00", /* UTF-16 surrogate half. */ + "\xF4\x90\x80\x80", /* Lowest out-of-range codepoint. */ + "\xF5\x80\x80\x80", /* Prefix forces out-of-range codepoints. */ { 0, 0, 0, 0, 0 } }; -- cgit v0.12