diff options
Diffstat (limited to 'Utilities/cmxmlrpc/xmlrpc_utf8.c')
-rw-r--r-- | Utilities/cmxmlrpc/xmlrpc_utf8.c | 376 |
1 files changed, 376 insertions, 0 deletions
diff --git a/Utilities/cmxmlrpc/xmlrpc_utf8.c b/Utilities/cmxmlrpc/xmlrpc_utf8.c new file mode 100644 index 0000000..4e31362 --- /dev/null +++ b/Utilities/cmxmlrpc/xmlrpc_utf8.c @@ -0,0 +1,376 @@ +/* Copyright (C) 2001 by Eric Kidd. All rights reserved. +** +** Redistribution and use in source and binary forms, with or without +** modification, are permitted provided that the following conditions +** are met: +** 1. Redistributions of source code must retain the above copyright +** notice, this list of conditions and the following disclaimer. +** 2. Redistributions in binary form must reproduce the above copyright +** notice, this list of conditions and the following disclaimer in the +** documentation and/or other materials provided with the distribution. +** 3. The name of the author may not be used to endorse or promote products +** derived from this software without specific prior written permission. +** +** THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +** ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +** IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +** ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +** FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +** DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +** OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +** HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +** LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +** OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +** SUCH DAMAGE. */ + + +/*========================================================================= +** XML-RPC UTF-8 Utilities +**========================================================================= +** Routines for validating, encoding and decoding UTF-8 data. We try to +** be very, very strict about invalid UTF-8 data. +** +** All of the code in this file assumes that your machine represents +** wchar_t as a 16-bit (or wider) character containing UCS-2 data. If this +** assumption is incorrect, you may need to replace this file. +** +** For lots of information on Unicode and UTF-8 decoding, see: +** http://www.cl.cam.ac.uk/~mgk25/unicode.html +*/ + +#include "xmlrpc_config.h" + +#include "xmlrpc.h" + +#ifdef HAVE_UNICODE_WCHAR + +/*========================================================================= +** Tables and Constants +**========================================================================= +** We use a variety of tables and constants to help decode and validate +** UTF-8 data. +*/ + +/* The number of bytes in a UTF-8 sequence starting with the character used +** as the array index. A zero entry indicates an illegal initial byte. +** This table was generated using a Perl script and information from the +** UTF-8 standard. +** +** Fredrik Lundh's UTF-8 decoder Python 2.0 uses a similar table. But +** since Python 2.0 has the icky CNRI license, I regenerated this +** table from scratch and wrote my own decoder. */ +static unsigned char utf8_seq_length[256] = { + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 +}; + +/* The minimum legal character value for a UTF-8 sequence of the given +** length. We have to check this to avoid accepting "overlong" UTF-8 +** sequences, which use more bytes than necessary to encode a given +** character. Such sequences are commonly used by evil people to bypass +** filters and security checks. This table is based on the UTF-8-test.txt +** file by Markus Kuhn <mkuhn@acm.org>. */ +static wchar_t utf8_min_char_for_length[4] = { + 0, /* Length 0: Not used (meaningless) */ + 0x0000, /* Length 1: Not used (special-cased) */ + 0x0080, /* Length 2 */ + 0x0800 /* Length 3 */ + +#if 0 + /* These are only useful on systems where wchar_t is 32-bits wide + ** and supports full UCS-4. */ + 0x00010000, /* Length 4 */ + 0x00200000, /* Length 5 */ + 0x04000000 /* Length 6 */ +#endif +}; + +/* This is the maximum legal 16-byte (UCS-2) character. Again, this +** information is based on UTF-8-test.txt. */ +#define UCS2_MAX_LEGAL_CHARACTER (0xFFFD) + +/* First and last UTF-16 surrogate characters. These are *not* legal UCS-2 +** characters--they're used to code for UCS-4 characters when using +** UTF-16. They should never appear in decoded UTF-8 data! Again, these +** could hypothetically be used to bypass security measures on some machines. +** Based on UTF-8-test.txt. */ +#define UTF16_FIRST_SURROGATE (0xD800) +#define UTF16_LAST_SURROGATE (0xDFFF) + +/* Is the character 'c' a UTF-8 continuation character? */ +#define IS_CONTINUATION(c) (((c) & 0xC0) == 0x80) + +/* Maximum number of bytes needed to encode a supported character. */ +#define MAX_ENCODED_BYTES (3) + + +/*========================================================================= +** decode_utf8 +**========================================================================= +** Internal routine which decodes (or validates) a UTF-8 string. +** To validate, set io_buff and out_buff_len to NULL. To decode, allocate +** a sufficiently large buffer, pass it as io_buff, and pass a pointer as +** as out_buff_len. The data will be written to the buffer, and the +** length to out_buff_len. +** +** We assume that wchar_t holds a single UCS-2 character in native-endian +** byte ordering. +*/ + +static void +decode_utf8(xmlrpc_env * const env, + const char * const utf8_data, + size_t const utf8_len, + wchar_t * const io_buff, + size_t * const out_buff_len) { + + size_t i, length, out_pos; + char init, con1, con2; + wchar_t wc; + + XMLRPC_ASSERT_ENV_OK(env); + XMLRPC_ASSERT_PTR_OK(utf8_data); + XMLRPC_ASSERT((!io_buff && !out_buff_len) || + (io_buff && out_buff_len)); + + /* Suppress GCC warning about possibly undefined variable. */ + wc = 0; + + i = 0; + out_pos = 0; + while (i < utf8_len) { + init = utf8_data[i]; + if ((init & 0x80) == 0x00) { + /* Convert ASCII character to wide character. */ + wc = init; + i++; + } else { + /* Look up the length of this UTF-8 sequence. */ + length = utf8_seq_length[(unsigned char) init]; + + /* Check to make sure we have enough bytes to convert. */ + if (i + length > utf8_len) + XMLRPC_FAIL(env, XMLRPC_INVALID_UTF8_ERROR, + "Truncated UTF-8 sequence"); + + /* Decode a multibyte UTF-8 sequence. */ + switch (length) { + case 0: + XMLRPC_FAIL(env, XMLRPC_INVALID_UTF8_ERROR, + "Invalid UTF-8 initial byte"); + + case 2: + /* 110xxxxx 10xxxxxx */ + con1 = utf8_data[i+1]; + if (!IS_CONTINUATION(con1)) + XMLRPC_FAIL(env, XMLRPC_INVALID_UTF8_ERROR, + "UTF-8 sequence too short"); + wc = ((((wchar_t) (init & 0x1F)) << 6) | + (((wchar_t) (con1 & 0x3F)))); + break; + + case 3: + /* 1110xxxx 10xxxxxx 10xxxxxx */ + con1 = utf8_data[i+1]; + con2 = utf8_data[i+2]; + if (!IS_CONTINUATION(con1) || !IS_CONTINUATION(con2)) + XMLRPC_FAIL(env, XMLRPC_INVALID_UTF8_ERROR, + "UTF-8 sequence too short"); + wc = ((((wchar_t) (init & 0x0F)) << 12) | + (((wchar_t) (con1 & 0x3F)) << 6) | + (((wchar_t) (con2 & 0x3F)))); + break; + + case 4: + /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ + case 5: + /* 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */ + case 6: + /* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */ + XMLRPC_FAIL(env, XMLRPC_INVALID_UTF8_ERROR, + "UCS-4 characters not supported"); + + default: + XMLRPC_ASSERT("Error in UTF-8 decoder tables"); + } + + /* Advance to the end of the sequence. */ + i += length; + + /* Check for illegal UCS-2 characters. */ + if (wc > UCS2_MAX_LEGAL_CHARACTER) + XMLRPC_FAIL(env, XMLRPC_INVALID_UTF8_ERROR, + "UCS-2 characters > U+FFFD are illegal"); + + /* Check for UTF-16 surrogates. */ + if (UTF16_FIRST_SURROGATE <= wc && wc <= UTF16_LAST_SURROGATE) + XMLRPC_FAIL(env, XMLRPC_INVALID_UTF8_ERROR, + "UTF-16 surrogates may not appear in UTF-8 data"); + + /* Check for overlong sequences. */ + if (wc < utf8_min_char_for_length[length]) + XMLRPC_FAIL(env, XMLRPC_INVALID_UTF8_ERROR, + "Overlong UTF-8 sequence not allowed"); + } + + /* If we have a buffer, write our character to it. */ + if (io_buff) { + io_buff[out_pos++] = wc; + } + } + + /* Record the number of characters we found. */ + if (out_buff_len) + *out_buff_len = out_pos; + + cleanup: + if (env->fault_occurred) { + if (out_buff_len) + *out_buff_len = 0; + } +} + + + +/*========================================================================= +** xmlrpc_validate_utf8 +**========================================================================= +** Make sure that a UTF-8 string is valid. +*/ + +void +xmlrpc_validate_utf8 (xmlrpc_env * const env, + const char * const utf8_data, + size_t const utf8_len) { + + decode_utf8(env, utf8_data, utf8_len, NULL, NULL); +} + + +/*========================================================================= +** xmlrpc_utf8_to_wcs +**========================================================================= +** Decode UTF-8 string to a "wide character string". This function +** returns an xmlrpc_mem_block with an element type of wchar_t. Don't +** try to intepret the block in a bytewise fashion--it won't work in +** any useful or portable fashion. +*/ + +xmlrpc_mem_block *xmlrpc_utf8_to_wcs (xmlrpc_env *env, + char *utf8_data, + size_t utf8_len) +{ + xmlrpc_mem_block *output; + size_t wcs_length; + + /* Error-handling preconditions. */ + output = NULL; + + /* Allocate a memory block large enough to hold any possible output. + ** We assume that each byte of the input may decode to a whcar_t. */ + output = XMLRPC_TYPED_MEM_BLOCK_NEW(wchar_t, env, utf8_len); + XMLRPC_FAIL_IF_FAULT(env); + + /* Decode the UTF-8 data. */ + decode_utf8(env, utf8_data, utf8_len, + XMLRPC_TYPED_MEM_BLOCK_CONTENTS(wchar_t, output), + &wcs_length); + XMLRPC_FAIL_IF_FAULT(env); + + /* Make sure we didn't overrun our buffer. */ + XMLRPC_ASSERT(wcs_length <= utf8_len); + + /* Correct the length of the memory block. */ + XMLRPC_TYPED_MEM_BLOCK_RESIZE(wchar_t, env, output, wcs_length); + XMLRPC_FAIL_IF_FAULT(env); + + cleanup: + if (env->fault_occurred) { + if (output) + xmlrpc_mem_block_free(output); + return NULL; + } + return output; +} + + +/*========================================================================= +** xmlrpc_utf8_to_wcs +**========================================================================= +** Encode a "wide character string" as UTF-8. +*/ + +xmlrpc_mem_block *xmlrpc_wcs_to_utf8 (xmlrpc_env *env, + wchar_t *wcs_data, + size_t wcs_len) +{ + size_t estimate, bytes_used, i; + xmlrpc_mem_block *output; + unsigned char *buffer; + wchar_t wc; + + XMLRPC_ASSERT_ENV_OK(env); + XMLRPC_ASSERT_PTR_OK(wcs_data); + + /* Error-handling preconditions. */ + output = NULL; + + /* Allocate a memory block large enough to hold any possible output. + ** We assume that every wchar might encode to the maximum length. */ + estimate = wcs_len * MAX_ENCODED_BYTES; + output = XMLRPC_TYPED_MEM_BLOCK_NEW(char, env, estimate); + XMLRPC_FAIL_IF_FAULT(env); + + /* Output our characters. */ + buffer = (unsigned char*) XMLRPC_TYPED_MEM_BLOCK_CONTENTS(char, output); + bytes_used = 0; + for (i = 0; i < wcs_len; i++) { + wc = wcs_data[i]; + if (wc <= 0x007F) { + buffer[bytes_used++] = wc & 0x7F; + } else if (wc <= 0x07FF) { + /* 110xxxxx 10xxxxxx */ + buffer[bytes_used++] = 0xC0 | (wc >> 6); + buffer[bytes_used++] = 0x80 | (wc & 0x3F); + } else if (wc <= 0xFFFF) { + /* 1110xxxx 10xxxxxx 10xxxxxx */ + buffer[bytes_used++] = 0xE0 | (wc >> 12); + buffer[bytes_used++] = 0x80 | ((wc >> 6) & 0x3F); + buffer[bytes_used++] = 0x80 | (wc & 0x3F); + } else { + XMLRPC_FAIL(env, XMLRPC_INTERNAL_ERROR, + "Don't know how to encode UCS-4 characters yet"); + } + } + + /* Make sure we didn't overrun our buffer. */ + XMLRPC_ASSERT(bytes_used <= estimate); + + /* Correct the length of the memory block. */ + XMLRPC_TYPED_MEM_BLOCK_RESIZE(char, env, output, bytes_used); + XMLRPC_FAIL_IF_FAULT(env); + + cleanup: + if (env->fault_occurred) { + if (output) + xmlrpc_mem_block_free(output); + return NULL; + } + return output; +} + +#endif /* HAVE_UNICODE_WCHAR */ |