diff options
Diffstat (limited to 'libxml2/testchar.c')
-rw-r--r-- | libxml2/testchar.c | 617 |
1 files changed, 617 insertions, 0 deletions
diff --git a/libxml2/testchar.c b/libxml2/testchar.c new file mode 100644 index 0000000..0d08792 --- /dev/null +++ b/libxml2/testchar.c @@ -0,0 +1,617 @@ +/** + * Test the UTF-8 decoding routines + * + * author: Daniel Veillard + * copy: see Copyright for the status of this software. + */ + +#include <stdio.h> +#include <string.h> +#include <libxml/parser.h> +#include <libxml/parserInternals.h> + +#include "buf.h" + +int lastError; + +static void errorHandler(void *unused, xmlErrorPtr err) { + if ((unused == NULL) && (err != NULL) && (lastError == 0)) { + lastError = err->code; + } +} + +char document1[100] = "<doc>XXXX</doc>"; +char document2[100] = "<doc foo='XXXX'/>"; + +static void testDocumentRangeByte1(xmlParserCtxtPtr ctxt, char *document, + int len, char *data, int forbid1, int forbid2) { + int i; + xmlDocPtr res; + + for (i = 0;i <= 0xFF;i++) { + lastError = 0; + xmlCtxtReset(ctxt); + + data[0] = i; + + res = xmlReadMemory(document, len, "test", NULL, 0); + + if ((i == forbid1) || (i == forbid2)) { + if ((lastError == 0) || (res != NULL)) + fprintf(stderr, + "Failed to detect invalid char for Byte 0x%02X: %c\n", + i, i); + } + + else if ((i == '<') || (i == '&')) { + if ((lastError == 0) || (res != NULL)) + fprintf(stderr, + "Failed to detect illegal char %c for Byte 0x%02X\n", i, i); + } + else if (((i < 0x20) || (i >= 0x80)) && + (i != 0x9) && (i != 0xA) && (i != 0xD)) { + if ((lastError != XML_ERR_INVALID_CHAR) && (res != NULL)) + fprintf(stderr, + "Failed to detect invalid char for Byte 0x%02X\n", i); + } + else if (res == NULL) { + fprintf(stderr, + "Failed to parse valid char for Byte 0x%02X : %c\n", i, i); + } + if (res != NULL) + xmlFreeDoc(res); + } +} + +static void testDocumentRangeByte2(xmlParserCtxtPtr ctxt, char *document, + int len, char *data) { + int i, j; + xmlDocPtr res; + + for (i = 0x80;i <= 0xFF;i++) { + for (j = 0;j <= 0xFF;j++) { + lastError = 0; + xmlCtxtReset(ctxt); + + data[0] = i; + data[1] = j; + + res = xmlReadMemory(document, len, "test", NULL, 0); + + /* if first bit of first char is set, then second bit must too */ + if ((i & 0x80) && ((i & 0x40) == 0)) { + if ((lastError == 0) || (res != NULL)) + fprintf(stderr, + "Failed to detect invalid char for Bytes 0x%02X 0x%02X\n", + i, j); + } + + /* + * if first bit of first char is set, then second char first + * bits must be 10 + */ + else if ((i & 0x80) && ((j & 0xC0) != 0x80)) { + if ((lastError == 0) || (res != NULL)) + fprintf(stderr, + "Failed to detect invalid char for Bytes 0x%02X 0x%02X\n", + i, j); + } + + /* + * if using a 2 byte encoding then the value must be greater + * than 0x80, i.e. one of bits 5 to 1 of i must be set + */ + else if ((i & 0x80) && ((i & 0x1E) == 0)) { + if ((lastError == 0) || (res != NULL)) + fprintf(stderr, + "Failed to detect invalid char for Bytes 0x%02X 0x%02X\n", + i, j); + } + + /* + * if third bit of first char is set, then the sequence would need + * at least 3 bytes, but we give only 2 ! + */ + else if ((i & 0xE0) == 0xE0) { + if ((lastError == 0) || (res != NULL)) + fprintf(stderr, + "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x00\n", + i, j); + } + + /* + * We should see no error in remaning cases + */ + else if ((lastError != 0) || (res == NULL)) { + fprintf(stderr, + "Failed to parse document for Bytes 0x%02X 0x%02X\n", i, j); + } + if (res != NULL) + xmlFreeDoc(res); + } + } +} + +/** + * testDocumentRanges: + * + * Test the correct UTF8 character parsing in context of XML documents + * Those are in-context injection tests checking the parser behaviour on + * edge case values at different point in content, beginning and end of + * CDATA in text or in attribute values. + */ + +static void testDocumentRanges(void) { + xmlParserCtxtPtr ctxt; + char *data; + + /* + * Set up a parsing context using the first document as + * the current input source. + */ + ctxt = xmlNewParserCtxt(); + if (ctxt == NULL) { + fprintf(stderr, "Failed to allocate parser context\n"); + return; + } + + printf("testing 1 byte char in document: 1"); + fflush(stdout); + data = &document1[5]; + data[0] = ' '; + data[1] = ' '; + data[2] = ' '; + data[3] = ' '; + /* test 1 byte injection at beginning of area */ + testDocumentRangeByte1(ctxt, &document1[0], strlen(document1), + data, -1, -1); + printf(" 2"); + fflush(stdout); + data[0] = ' '; + data[1] = ' '; + data[2] = ' '; + data[3] = ' '; + /* test 1 byte injection at end of area */ + testDocumentRangeByte1(ctxt, &document1[0], strlen(document1), + data + 3, -1, -1); + + printf(" 3"); + fflush(stdout); + data = &document2[10]; + data[0] = ' '; + data[1] = ' '; + data[2] = ' '; + data[3] = ' '; + /* test 1 byte injection at beginning of area */ + testDocumentRangeByte1(ctxt, &document2[0], strlen(document2), + data, '\'', -1); + printf(" 4"); + fflush(stdout); + data[0] = ' '; + data[1] = ' '; + data[2] = ' '; + data[3] = ' '; + /* test 1 byte injection at end of area */ + testDocumentRangeByte1(ctxt, &document2[0], strlen(document2), + data + 3, '\'', -1); + printf(" done\n"); + + printf("testing 2 byte char in document: 1"); + fflush(stdout); + data = &document1[5]; + data[0] = ' '; + data[1] = ' '; + data[2] = ' '; + data[3] = ' '; + /* test 2 byte injection at beginning of area */ + testDocumentRangeByte2(ctxt, &document1[0], strlen(document1), + data); + printf(" 2"); + fflush(stdout); + data[0] = ' '; + data[1] = ' '; + data[2] = ' '; + data[3] = ' '; + /* test 2 byte injection at end of area */ + testDocumentRangeByte2(ctxt, &document1[0], strlen(document1), + data + 2); + + printf(" 3"); + fflush(stdout); + data = &document2[10]; + data[0] = ' '; + data[1] = ' '; + data[2] = ' '; + data[3] = ' '; + /* test 2 byte injection at beginning of area */ + testDocumentRangeByte2(ctxt, &document2[0], strlen(document2), + data); + printf(" 4"); + fflush(stdout); + data[0] = ' '; + data[1] = ' '; + data[2] = ' '; + data[3] = ' '; + /* test 2 byte injection at end of area */ + testDocumentRangeByte2(ctxt, &document2[0], strlen(document2), + data + 2); + printf(" done\n"); + + xmlFreeParserCtxt(ctxt); +} + +static void testCharRangeByte1(xmlParserCtxtPtr ctxt, char *data) { + int i = 0; + int len, c; + + data[1] = 0; + data[2] = 0; + data[3] = 0; + for (i = 0;i <= 0xFF;i++) { + data[0] = i; + ctxt->charset = XML_CHAR_ENCODING_UTF8; + + lastError = 0; + c = xmlCurrentChar(ctxt, &len); + if ((i == 0) || (i >= 0x80)) { + /* we must see an error there */ + if (lastError != XML_ERR_INVALID_CHAR) + fprintf(stderr, + "Failed to detect invalid char for Byte 0x%02X\n", i); + } else if (i == 0xD) { + if ((c != 0xA) || (len != 1)) + fprintf(stderr, "Failed to convert char for Byte 0x%02X\n", i); + } else if ((c != i) || (len != 1)) { + fprintf(stderr, "Failed to parse char for Byte 0x%02X\n", i); + } + } +} + +static void testCharRangeByte2(xmlParserCtxtPtr ctxt, char *data) { + int i, j; + int len, c; + + data[2] = 0; + data[3] = 0; + for (i = 0x80;i <= 0xFF;i++) { + for (j = 0;j <= 0xFF;j++) { + data[0] = i; + data[1] = j; + ctxt->charset = XML_CHAR_ENCODING_UTF8; + + lastError = 0; + c = xmlCurrentChar(ctxt, &len); + + /* if first bit of first char is set, then second bit must too */ + if ((i & 0x80) && ((i & 0x40) == 0)) { + if (lastError != XML_ERR_INVALID_CHAR) + fprintf(stderr, + "Failed to detect invalid char for Bytes 0x%02X 0x%02X\n", + i, j); + } + + /* + * if first bit of first char is set, then second char first + * bits must be 10 + */ + else if ((i & 0x80) && ((j & 0xC0) != 0x80)) { + if (lastError != XML_ERR_INVALID_CHAR) + fprintf(stderr, + "Failed to detect invalid char for Bytes 0x%02X 0x%02X: %d\n", + i, j, c); + } + + /* + * if using a 2 byte encoding then the value must be greater + * than 0x80, i.e. one of bits 5 to 1 of i must be set + */ + else if ((i & 0x80) && ((i & 0x1E) == 0)) { + if (lastError != XML_ERR_INVALID_CHAR) + fprintf(stderr, + "Failed to detect invalid char for Bytes 0x%02X 0x%02X: %d\n", + i, j, c); + } + + /* + * if third bit of first char is set, then the sequence would need + * at least 3 bytes, but we give only 2 ! + */ + else if ((i & 0xE0) == 0xE0) { + if (lastError != XML_ERR_INVALID_CHAR) + fprintf(stderr, + "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x00\n", + i, j); + } + + /* + * We should see no error in remaning cases + */ + else if ((lastError != 0) || (len != 2)) { + fprintf(stderr, + "Failed to parse char for Bytes 0x%02X 0x%02X\n", i, j); + } + + /* + * Finally check the value is right + */ + else if (c != (j & 0x3F) + ((i & 0x1F) << 6)) { + fprintf(stderr, + "Failed to parse char for Bytes 0x%02X 0x%02X: expect %d got %d\n", + i, j, ((j & 0x3F) + ((i & 0x1F) << 6)), c); + } + } + } +} + +static void testCharRangeByte3(xmlParserCtxtPtr ctxt, char *data) { + int i, j, k, K; + int len, c; + unsigned char lows[6] = {0, 0x80, 0x81, 0xC1, 0xFF, 0xBF}; + int value; + + data[3] = 0; + for (i = 0xE0;i <= 0xFF;i++) { + for (j = 0;j <= 0xFF;j++) { + for (k = 0;k < 6;k++) { + data[0] = i; + data[1] = j; + K = lows[k]; + data[2] = (char) K; + value = (K & 0x3F) + ((j & 0x3F) << 6) + ((i & 0xF) << 12); + ctxt->charset = XML_CHAR_ENCODING_UTF8; + + lastError = 0; + c = xmlCurrentChar(ctxt, &len); + + /* + * if fourth bit of first char is set, then the sequence would need + * at least 4 bytes, but we give only 3 ! + */ + if ((i & 0xF0) == 0xF0) { + if (lastError != XML_ERR_INVALID_CHAR) + fprintf(stderr, + "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n", + i, j, K, data[3]); + } + + /* + * The second and the third bytes must start with 10 + */ + else if (((j & 0xC0) != 0x80) || ((K & 0xC0) != 0x80)) { + if (lastError != XML_ERR_INVALID_CHAR) + fprintf(stderr, + "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X\n", + i, j, K); + } + + /* + * if using a 3 byte encoding then the value must be greater + * than 0x800, i.e. one of bits 4 to 0 of i must be set or + * the 6th byte of data[1] must be set + */ + else if (((i & 0xF) == 0) && ((j & 0x20) == 0)) { + if (lastError != XML_ERR_INVALID_CHAR) + fprintf(stderr, + "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X\n", + i, j, K); + } + + /* + * There are values in that range that are not allowed in XML-1.0 + */ + else if (((value > 0xD7FF) && (value <0xE000)) || + ((value > 0xFFFD) && (value <0x10000))) { + if (lastError != XML_ERR_INVALID_CHAR) + fprintf(stderr, + "Failed to detect invalid char 0x%04X for Bytes 0x%02X 0x%02X 0x%02X\n", + value, i, j, K); + } + + /* + * We should see no error in remaining cases + */ + else if ((lastError != 0) || (len != 3)) { + fprintf(stderr, + "Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X\n", + i, j, K); + } + + /* + * Finally check the value is right + */ + else if (c != value) { + fprintf(stderr, + "Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X: expect %d got %d\n", + i, j, data[2], value, c); + } + } + } + } +} + +static void testCharRangeByte4(xmlParserCtxtPtr ctxt, char *data) { + int i, j, k, K, l, L; + int len, c; + unsigned char lows[6] = {0, 0x80, 0x81, 0xC1, 0xFF, 0xBF}; + int value; + + data[4] = 0; + for (i = 0xF0;i <= 0xFF;i++) { + for (j = 0;j <= 0xFF;j++) { + for (k = 0;k < 6;k++) { + for (l = 0;l < 6;l++) { + data[0] = i; + data[1] = j; + K = lows[k]; + data[2] = (char) K; + L = lows[l]; + data[3] = (char) L; + value = (L & 0x3F) + ((K & 0x3F) << 6) + ((j & 0x3F) << 12) + + ((i & 0x7) << 18); + ctxt->charset = XML_CHAR_ENCODING_UTF8; + + lastError = 0; + c = xmlCurrentChar(ctxt, &len); + + /* + * if fifth bit of first char is set, then the sequence would need + * at least 5 bytes, but we give only 4 ! + */ + if ((i & 0xF8) == 0xF8) { + if (lastError != XML_ERR_INVALID_CHAR) + fprintf(stderr, + "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n", + i, j, K, data[3]); + } + + /* + * The second, third and fourth bytes must start with 10 + */ + else if (((j & 0xC0) != 0x80) || ((K & 0xC0) != 0x80) || + ((L & 0xC0) != 0x80)) { + if (lastError != XML_ERR_INVALID_CHAR) + fprintf(stderr, + "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n", + i, j, K, L); + } + + /* + * if using a 3 byte encoding then the value must be greater + * than 0x10000, i.e. one of bits 3 to 0 of i must be set or + * the 6 or 5th byte of j must be set + */ + else if (((i & 0x7) == 0) && ((j & 0x30) == 0)) { + if (lastError != XML_ERR_INVALID_CHAR) + fprintf(stderr, + "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n", + i, j, K, L); + } + + /* + * There are values in that range that are not allowed in XML-1.0 + */ + else if (((value > 0xD7FF) && (value <0xE000)) || + ((value > 0xFFFD) && (value <0x10000)) || + (value > 0x10FFFF)) { + if (lastError != XML_ERR_INVALID_CHAR) + fprintf(stderr, +"Failed to detect invalid char 0x%04X for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n", + value, i, j, K, L); + } + + /* + * We should see no error in remaining cases + */ + else if ((lastError != 0) || (len != 4)) { + fprintf(stderr, + "Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X\n", + i, j, K); + } + + /* + * Finally check the value is right + */ + else if (c != value) { + fprintf(stderr, + "Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X: expect %d got %d\n", + i, j, data[2], value, c); + } + } + } + } + } +} + +/** + * testCharRanges: + * + * Test the correct UTF8 character parsing in isolation i.e. + * not when parsing a full document, this is less expensive and we can + * cover the full range of UTF-8 chars accepted by XML-1.0 + */ + +static void testCharRanges(void) { + char data[5]; + xmlParserCtxtPtr ctxt; + xmlParserInputBufferPtr buf; + xmlParserInputPtr input; + + memset(data, 0, 5); + + /* + * Set up a parsing context using the above data buffer as + * the current input source. + */ + ctxt = xmlNewParserCtxt(); + if (ctxt == NULL) { + fprintf(stderr, "Failed to allocate parser context\n"); + return; + } + buf = xmlParserInputBufferCreateStatic(data, sizeof(data), + XML_CHAR_ENCODING_NONE); + if (buf == NULL) { + fprintf(stderr, "Failed to allocate input buffer\n"); + goto error; + } + input = xmlNewInputStream(ctxt); + if (input == NULL) { + xmlFreeParserInputBuffer(buf); + goto error; + } + input->filename = NULL; + input->buf = buf; + input->cur = + input->base = xmlBufContent(input->buf->buffer); + input->end = input->base + 4; + inputPush(ctxt, input); + + printf("testing char range: 1"); + fflush(stdout); + testCharRangeByte1(ctxt, data); + printf(" 2"); + fflush(stdout); + testCharRangeByte2(ctxt, data); + printf(" 3"); + fflush(stdout); + testCharRangeByte3(ctxt, data); + printf(" 4"); + fflush(stdout); + testCharRangeByte4(ctxt, data); + printf(" done\n"); + fflush(stdout); + +error: + xmlFreeParserCtxt(ctxt); +} + +int main(void) { + + /* + * this initialize the library and check potential ABI mismatches + * between the version it was compiled for and the actual shared + * library used. + */ + LIBXML_TEST_VERSION + + /* + * Catch errors separately + */ + + xmlSetStructuredErrorFunc(NULL, errorHandler); + + /* + * Run the tests + */ + testCharRanges(); + testDocumentRanges(); + + /* + * Cleanup function for the XML library. + */ + xmlCleanupParser(); + /* + * this is to debug memory for regression tests + */ + xmlMemoryDump(); + return(0); +} |