From 23ec4b57e1359f9c539b8defc317542173ae087e Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Thu, 15 Jun 2017 00:54:36 +0200 Subject: bpo-29591: Upgrade Modules/expat to libexpat 2.2 (#2164) * bpo-29591: Upgrade Modules/expat to libexpat 2.2 * bpo-29591: Restore Python changes on expat * bpo-29591: Remove expat config of unsupported platforms Remove the configuration (Modules/expat/*config.h) of unsupported platforms: * Amiga * MacOS Classic on PPC32 * Open Watcom * bpo-29591: Remove useless XML_HAS_SET_HASH_SALT The XML_HAS_SET_HASH_SALT define of Modules/expat/expat.h became useless since our local expat copy was upgrade to expat 2.1 (it's now expat 2.2.0). --- Misc/NEWS | 4 + Modules/expat/COPYING | 5 +- Modules/expat/amigaconfig.h | 32 ------ Modules/expat/expat.h | 17 ++- Modules/expat/expat_external.h | 14 +++ Modules/expat/internal.h | 22 ++++ Modules/expat/macconfig.h | 53 ---------- Modules/expat/watcomconfig.h | 47 --------- Modules/expat/xmlparse.c | 113 ++++++++++++++------ Modules/expat/xmlrole.c | 224 +++++++++++++++++++-------------------- Modules/expat/xmltok.c | 230 +++++++++++++++++++++++++++++------------ Modules/expat/xmltok.h | 10 +- Modules/expat/xmltok_impl.c | 226 ++++++++++++++++++++-------------------- Modules/pyexpat.c | 6 +- 14 files changed, 527 insertions(+), 476 deletions(-) delete mode 100644 Modules/expat/amigaconfig.h delete mode 100644 Modules/expat/macconfig.h delete mode 100644 Modules/expat/watcomconfig.h diff --git a/Misc/NEWS b/Misc/NEWS index 7452b25..985d6c0 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -362,6 +362,10 @@ Extension Modules Library ------- +- [Security] bpo-29591: Update expat copy from 2.1.1 to 2.2.0 to get fixes + of CVE-2016-0718 and CVE-2016-4472. See + https://sourceforge.net/p/expat/bugs/537/ for more information. + - bpo-24744: pkgutil.walk_packages function now raises ValueError if *path* is a string. Patch by Sanyam Khurana. diff --git a/Modules/expat/COPYING b/Modules/expat/COPYING index dcb4506..092c83b 100644 --- a/Modules/expat/COPYING +++ b/Modules/expat/COPYING @@ -1,6 +1,5 @@ -Copyright (c) 1998, 1999, 2000 Thai Open Source Software Center Ltd - and Clark Cooper -Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Expat maintainers. +Copyright (c) 1998-2000 Thai Open Source Software Center Ltd and Clark Cooper +Copyright (c) 2001-2016 Expat maintainers Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the diff --git a/Modules/expat/amigaconfig.h b/Modules/expat/amigaconfig.h deleted file mode 100644 index 86c6115..0000000 --- a/Modules/expat/amigaconfig.h +++ /dev/null @@ -1,32 +0,0 @@ -#ifndef AMIGACONFIG_H -#define AMIGACONFIG_H - -/* 1234 = LIL_ENDIAN, 4321 = BIGENDIAN */ -#define BYTEORDER 4321 - -/* Define to 1 if you have the `bcopy' function. */ -#define HAVE_BCOPY 1 - -/* Define to 1 if you have the header file. */ -#undef HAVE_CHECK_H - -/* Define to 1 if you have the `memmove' function. */ -#define HAVE_MEMMOVE 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_UNISTD_H 1 - -/* whether byteorder is bigendian */ -#define WORDS_BIGENDIAN - -/* Define to specify how much context to retain around the current parse - point. */ -#define XML_CONTEXT_BYTES 1024 - -/* Define to make parameter entity parsing functionality available. */ -#define XML_DTD - -/* Define to make XML Namespaces functionality available. */ -#define XML_NS - -#endif /* AMIGACONFIG_H */ diff --git a/Modules/expat/expat.h b/Modules/expat/expat.h index e8eefdd..086e24b 100644 --- a/Modules/expat/expat.h +++ b/Modules/expat/expat.h @@ -342,7 +342,7 @@ XML_SetEntityDeclHandler(XML_Parser parser, XML_EntityDeclHandler handler); /* OBSOLETE -- OBSOLETE -- OBSOLETE - This handler has been superceded by the EntityDeclHandler above. + This handler has been superseded by the EntityDeclHandler above. It is provided here for backward compatibility. This is called for a declaration of an unparsed (NDATA) entity. @@ -915,8 +915,6 @@ XMLPARSEAPI(int) XML_SetHashSalt(XML_Parser parser, unsigned long hash_salt); -#define XML_HAS_SET_HASH_SALT /* Python Only: Defined for pyexpat.c. */ - /* If XML_Parse or XML_ParseBuffer have returned XML_STATUS_ERROR, then XML_GetErrorCode returns information about the error. */ @@ -975,9 +973,12 @@ XML_FreeContentModel(XML_Parser parser, XML_Content *model); /* Exposing the memory handling functions used in Expat */ XMLPARSEAPI(void *) +XML_ATTR_MALLOC +XML_ATTR_ALLOC_SIZE(2) XML_MemMalloc(XML_Parser parser, size_t size); XMLPARSEAPI(void *) +XML_ATTR_ALLOC_SIZE(3) XML_MemRealloc(XML_Parser parser, void *ptr, size_t size); XMLPARSEAPI(void) @@ -1033,14 +1034,12 @@ XMLPARSEAPI(const XML_Feature *) XML_GetFeatureList(void); -/* Expat follows the GNU/Linux convention of odd number minor version for - beta/development releases and even number minor version for stable - releases. Micro is bumped with each release, and set to 0 with each - change to major or minor version. +/* Expat follows the semantic versioning convention. + See http://semver.org. */ #define XML_MAJOR_VERSION 2 -#define XML_MINOR_VERSION 1 -#define XML_MICRO_VERSION 1 +#define XML_MINOR_VERSION 2 +#define XML_MICRO_VERSION 0 #ifdef __cplusplus } diff --git a/Modules/expat/expat_external.h b/Modules/expat/expat_external.h index f337e1c..4860c57 100644 --- a/Modules/expat/expat_external.h +++ b/Modules/expat/expat_external.h @@ -69,12 +69,26 @@ #endif #endif /* not defined XML_STATIC */ +#if !defined(XMLIMPORT) && defined(__GNUC__) && (__GNUC__ >= 4) +#define XMLIMPORT __attribute__ ((visibility ("default"))) +#endif /* If we didn't define it above, define it away: */ #ifndef XMLIMPORT #define XMLIMPORT #endif +#if defined(__GNUC__) && (__GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 96)) +#define XML_ATTR_MALLOC __attribute__((__malloc__)) +#else +#define XML_ATTR_MALLOC +#endif + +#if defined(__GNUC__) && ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)) +#define XML_ATTR_ALLOC_SIZE(x) __attribute__((__alloc_size__(x))) +#else +#define XML_ATTR_ALLOC_SIZE(x) +#endif #define XMLPARSEAPI(type) XMLIMPORT type XMLCALL diff --git a/Modules/expat/internal.h b/Modules/expat/internal.h index dd54548..94cb98e 100644 --- a/Modules/expat/internal.h +++ b/Modules/expat/internal.h @@ -71,3 +71,25 @@ #define inline #endif #endif + +#ifndef UNUSED_P +# ifdef __GNUC__ +# define UNUSED_P(p) UNUSED_ ## p __attribute__((__unused__)) +# else +# define UNUSED_P(p) UNUSED_ ## p +# endif +#endif + + +#ifdef __cplusplus +extern "C" { +#endif + + +void +align_limit_to_full_utf8_characters(const char * from, const char ** fromLimRef); + + +#ifdef __cplusplus +} +#endif diff --git a/Modules/expat/macconfig.h b/Modules/expat/macconfig.h deleted file mode 100644 index 2725caa..0000000 --- a/Modules/expat/macconfig.h +++ /dev/null @@ -1,53 +0,0 @@ -/*================================================================ -** Copyright 2000, Clark Cooper -** All rights reserved. -** -** This is free software. You are permitted to copy, distribute, or modify -** it under the terms of the MIT/X license (contained in the COPYING file -** with this distribution.) -** -*/ - -#ifndef MACCONFIG_H -#define MACCONFIG_H - - -/* 1234 = LIL_ENDIAN, 4321 = BIGENDIAN */ -#define BYTEORDER 4321 - -/* Define to 1 if you have the `bcopy' function. */ -#undef HAVE_BCOPY - -/* Define to 1 if you have the `memmove' function. */ -#define HAVE_MEMMOVE - -/* Define to 1 if you have a working `mmap' system call. */ -#undef HAVE_MMAP - -/* Define to 1 if you have the header file. */ -#undef HAVE_UNISTD_H - -/* whether byteorder is bigendian */ -#define WORDS_BIGENDIAN - -/* Define to specify how much context to retain around the current parse - point. */ -#undef XML_CONTEXT_BYTES - -/* Define to make parameter entity parsing functionality available. */ -#define XML_DTD - -/* Define to make XML Namespaces functionality available. */ -#define XML_NS - -/* Define to empty if `const' does not conform to ANSI C. */ -#undef const - -/* Define to `long' if does not define. */ -#define off_t long - -/* Define to `unsigned' if does not define. */ -#undef size_t - - -#endif /* ifndef MACCONFIG_H */ diff --git a/Modules/expat/watcomconfig.h b/Modules/expat/watcomconfig.h deleted file mode 100644 index 2f05e3f..0000000 --- a/Modules/expat/watcomconfig.h +++ /dev/null @@ -1,47 +0,0 @@ -/* expat_config.h for use with Open Watcom 1.5 and above. */ - -#ifndef WATCOMCONFIG_H -#define WATCOMCONFIG_H - -#ifdef __NT__ -#define WIN32_LEAN_AND_MEAN -#include -#undef WIN32_LEAN_AND_MEAN -#endif - -/* 1234 = LIL_ENDIAN, 4321 = BIGENDIAN */ -#define BYTEORDER 1234 - -/* Define to 1 if you have the `memmove' function. */ -#define HAVE_MEMMOVE 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_UNISTD_H 1 - -/* Define to the address where bug reports for this package should be sent. */ -#define PACKAGE_BUGREPORT "expat-bugs@mail.libexpat.org" - -/* Define to the full name of this package. */ -#define PACKAGE_NAME "expat" - -/* Define to the full name and version of this package. */ -#define PACKAGE_STRING "expat 2.0.0" - -/* Define to the one symbol short name of this package. */ -#undef PACKAGE_TARNAME - -/* Define to the version of this package. */ -#define PACKAGE_VERSION "2.0.0" - -/* Define to specify how much context to retain around the current parse - point. */ -#define XML_CONTEXT_BYTES 1024 - -/* Define to make parameter entity parsing functionality available. */ -#define XML_DTD 1 - -/* Define to make XML Namespaces functionality available. */ -#define XML_NS 1 - -#endif - diff --git a/Modules/expat/xmlparse.c b/Modules/expat/xmlparse.c index 4128387..b308e67 100644 --- a/Modules/expat/xmlparse.c +++ b/Modules/expat/xmlparse.c @@ -2,9 +2,22 @@ See the file COPYING for copying permission. */ +#include +#include /* memset(), memcpy() */ +#include +#include /* UINT_MAX */ + +#ifdef WIN32 +#define getpid GetCurrentProcessId +#else +#include /* gettimeofday() */ +#include /* getpid() */ +#include /* getpid() */ +#endif + #define XML_BUILDING_EXPAT 1 -#ifdef COMPILED_FROM_DSP +#ifdef WIN32 #include "winconfig.h" #elif defined(MACOS_CLASSIC) #include "macconfig.h" @@ -14,13 +27,7 @@ #include "watcomconfig.h" #elif defined(HAVE_EXPAT_CONFIG_H) #include -#endif /* ndef COMPILED_FROM_DSP */ - -#include -#include /* memset(), memcpy() */ -#include -#include /* UINT_MAX */ -#include /* time() */ +#endif /* ndef WIN32 */ #include "ascii.h" #include "expat.h" @@ -432,7 +439,7 @@ static ELEMENT_TYPE * getElementType(XML_Parser parser, const ENCODING *enc, const char *ptr, const char *end); -static unsigned long generate_hash_secret_salt(void); +static unsigned long generate_hash_secret_salt(XML_Parser parser); static XML_Bool startParsing(XML_Parser parser); static XML_Parser @@ -691,11 +698,38 @@ static const XML_Char implicitContext[] = { }; static unsigned long -generate_hash_secret_salt(void) +gather_time_entropy(void) { - unsigned int seed = time(NULL) % UINT_MAX; - srand(seed); - return rand(); +#ifdef WIN32 + FILETIME ft; + GetSystemTimeAsFileTime(&ft); /* never fails */ + return ft.dwHighDateTime ^ ft.dwLowDateTime; +#else + struct timeval tv; + int gettimeofday_res; + + gettimeofday_res = gettimeofday(&tv, NULL); + assert (gettimeofday_res == 0); + + /* Microseconds time is <20 bits entropy */ + return tv.tv_usec; +#endif +} + +static unsigned long +generate_hash_secret_salt(XML_Parser parser) +{ + /* Process ID is 0 bits entropy if attacker has local access + * XML_Parser address is few bits of entropy if attacker has local access */ + const unsigned long entropy = + gather_time_entropy() ^ getpid() ^ (unsigned long)parser; + + /* Factors are 2^31-1 and 2^61-1 (Mersenne primes M31 and M61) */ + if (sizeof(unsigned long) == 4) { + return entropy * 2147483647; + } else { + return entropy * (unsigned long)2305843009213693951; + } } static XML_Bool /* only valid for root parser */ @@ -703,7 +737,7 @@ startParsing(XML_Parser parser) { /* hash functions must be initialized before setContext() is called */ if (hash_secret_salt == 0) - hash_secret_salt = generate_hash_secret_salt(); + hash_secret_salt = generate_hash_secret_salt(parser); if (ns) { /* implicit context only set for root parser, since child parsers (i.e. external entity parsers) will inherit it @@ -1695,15 +1729,15 @@ XML_GetBuffer(XML_Parser parser, int len) if (len > bufferLim - bufferEnd) { #ifdef XML_CONTEXT_BYTES int keep; -#endif - int neededSize = len + (int)(bufferEnd - bufferPtr); +#endif /* defined XML_CONTEXT_BYTES */ + /* Do not invoke signed arithmetic overflow: */ + int neededSize = (int) ((unsigned)len + (unsigned)(bufferEnd - bufferPtr)); if (neededSize < 0) { errorCode = XML_ERROR_NO_MEMORY; return NULL; } #ifdef XML_CONTEXT_BYTES keep = (int)(bufferPtr - buffer); - if (keep > XML_CONTEXT_BYTES) keep = XML_CONTEXT_BYTES; neededSize += keep; @@ -1728,7 +1762,8 @@ XML_GetBuffer(XML_Parser parser, int len) if (bufferSize == 0) bufferSize = INIT_BUFFER_SIZE; do { - bufferSize *= 2; + /* Do not invoke signed arithmetic overflow: */ + bufferSize = (int) (2U * (unsigned) bufferSize); } while (bufferSize < neededSize && bufferSize > 0); if (bufferSize <= 0) { errorCode = XML_ERROR_NO_MEMORY; @@ -1855,7 +1890,7 @@ XML_Index XMLCALL XML_GetCurrentByteIndex(XML_Parser parser) { if (eventPtr) - return parseEndByteIndex - (parseEndPtr - eventPtr); + return (XML_Index)(parseEndByteIndex - (parseEndPtr - eventPtr)); return -1; } @@ -2429,11 +2464,11 @@ doContent(XML_Parser parser, for (;;) { int bufSize; int convLen; - XmlConvert(enc, + const enum XML_Convert_Result convert_res = XmlConvert(enc, &fromPtr, rawNameEnd, (ICHAR **)&toPtr, (ICHAR *)tag->bufEnd - 1); convLen = (int)(toPtr - (XML_Char *)tag->buf); - if (fromPtr == rawNameEnd) { + if ((convert_res == XML_CONVERT_COMPLETED) || (convert_res == XML_CONVERT_INPUT_INCOMPLETE)) { tag->name.strLen = convLen; break; } @@ -2654,11 +2689,11 @@ doContent(XML_Parser parser, if (MUST_CONVERT(enc, s)) { for (;;) { ICHAR *dataPtr = (ICHAR *)dataBuf; - XmlConvert(enc, &s, next, &dataPtr, (ICHAR *)dataBufEnd); + const enum XML_Convert_Result convert_res = XmlConvert(enc, &s, next, &dataPtr, (ICHAR *)dataBufEnd); *eventEndPP = s; charDataHandler(handlerArg, dataBuf, (int)(dataPtr - (ICHAR *)dataBuf)); - if (s == next) + if ((convert_res == XML_CONVERT_COMPLETED) || (convert_res == XML_CONVERT_INPUT_INCOMPLETE)) break; *eventPP = s; } @@ -3264,11 +3299,11 @@ doCdataSection(XML_Parser parser, if (MUST_CONVERT(enc, s)) { for (;;) { ICHAR *dataPtr = (ICHAR *)dataBuf; - XmlConvert(enc, &s, next, &dataPtr, (ICHAR *)dataBufEnd); + const enum XML_Convert_Result convert_res = XmlConvert(enc, &s, next, &dataPtr, (ICHAR *)dataBufEnd); *eventEndPP = next; charDataHandler(handlerArg, dataBuf, (int)(dataPtr - (ICHAR *)dataBuf)); - if (s == next) + if ((convert_res == XML_CONVERT_COMPLETED) || (convert_res == XML_CONVERT_INPUT_INCOMPLETE)) break; *eventPP = s; } @@ -4927,9 +4962,9 @@ internalEntityProcessor(XML_Parser parser, static enum XML_Error PTRCALL errorProcessor(XML_Parser parser, - const char *s, - const char *end, - const char **nextPtr) + const char *UNUSED_P(s), + const char *UNUSED_P(end), + const char **UNUSED_P(nextPtr)) { return errorCode; } @@ -5345,6 +5380,7 @@ reportDefault(XML_Parser parser, const ENCODING *enc, const char *s, const char *end) { if (MUST_CONVERT(enc, s)) { + enum XML_Convert_Result convert_res; const char **eventPP; const char **eventEndPP; if (enc == encoding) { @@ -5357,11 +5393,11 @@ reportDefault(XML_Parser parser, const ENCODING *enc, } do { ICHAR *dataPtr = (ICHAR *)dataBuf; - XmlConvert(enc, &s, end, &dataPtr, (ICHAR *)dataBufEnd); + convert_res = XmlConvert(enc, &s, end, &dataPtr, (ICHAR *)dataBufEnd); *eventEndPP = s; defaultHandler(handlerArg, dataBuf, (int)(dataPtr - (ICHAR *)dataBuf)); *eventPP = s; - } while (s != end); + } while ((convert_res != XML_CONVERT_COMPLETED) && (convert_res != XML_CONVERT_INPUT_INCOMPLETE)); } else defaultHandler(handlerArg, (XML_Char *)s, (int)((XML_Char *)end - (XML_Char *)s)); @@ -6166,8 +6202,8 @@ poolAppend(STRING_POOL *pool, const ENCODING *enc, if (!pool->ptr && !poolGrow(pool)) return NULL; for (;;) { - XmlConvert(enc, &ptr, end, (ICHAR **)&(pool->ptr), (ICHAR *)pool->end); - if (ptr == end) + const enum XML_Convert_Result convert_res = XmlConvert(enc, &ptr, end, (ICHAR **)&(pool->ptr), (ICHAR *)pool->end); + if ((convert_res == XML_CONVERT_COMPLETED) || (convert_res == XML_CONVERT_INPUT_INCOMPLETE)) break; if (!poolGrow(pool)) return NULL; @@ -6251,8 +6287,13 @@ poolGrow(STRING_POOL *pool) } } if (pool->blocks && pool->start == pool->blocks->s) { - int blockSize = (int)(pool->end - pool->start)*2; - BLOCK *temp = (BLOCK *) + BLOCK *temp; + int blockSize = (int)((unsigned)(pool->end - pool->start)*2U); + + if (blockSize < 0) + return XML_FALSE; + + temp = (BLOCK *) pool->mem->realloc_fcn(pool->blocks, (offsetof(BLOCK, s) + blockSize * sizeof(XML_Char))); @@ -6267,6 +6308,10 @@ poolGrow(STRING_POOL *pool) else { BLOCK *tem; int blockSize = (int)(pool->end - pool->start); + + if (blockSize < 0) + return XML_FALSE; + if (blockSize < INIT_BLOCK_SIZE) blockSize = INIT_BLOCK_SIZE; else diff --git a/Modules/expat/xmlrole.c b/Modules/expat/xmlrole.c index 44772e2..fcd0dc6 100644 --- a/Modules/expat/xmlrole.c +++ b/Modules/expat/xmlrole.c @@ -4,7 +4,7 @@ #include -#ifdef COMPILED_FROM_DSP +#ifdef WIN32 #include "winconfig.h" #elif defined(MACOS_CLASSIC) #include "macconfig.h" @@ -16,7 +16,7 @@ #ifdef HAVE_EXPAT_CONFIG_H #include #endif -#endif /* ndef COMPILED_FROM_DSP */ +#endif /* ndef WIN32 */ #include "expat_external.h" #include "internal.h" @@ -195,9 +195,9 @@ prolog1(PROLOG_STATE *state, static int PTRCALL prolog2(PROLOG_STATE *state, int tok, - const char *ptr, - const char *end, - const ENCODING *enc) + const char *UNUSED_P(ptr), + const char *UNUSED_P(end), + const ENCODING *UNUSED_P(enc)) { switch (tok) { case XML_TOK_PROLOG_S: @@ -216,9 +216,9 @@ prolog2(PROLOG_STATE *state, static int PTRCALL doctype0(PROLOG_STATE *state, int tok, - const char *ptr, - const char *end, - const ENCODING *enc) + const char *UNUSED_P(ptr), + const char *UNUSED_P(end), + const ENCODING *UNUSED_P(enc)) { switch (tok) { case XML_TOK_PROLOG_S: @@ -264,9 +264,9 @@ doctype1(PROLOG_STATE *state, static int PTRCALL doctype2(PROLOG_STATE *state, int tok, - const char *ptr, - const char *end, - const ENCODING *enc) + const char *UNUSED_P(ptr), + const char *UNUSED_P(end), + const ENCODING *UNUSED_P(enc)) { switch (tok) { case XML_TOK_PROLOG_S: @@ -281,9 +281,9 @@ doctype2(PROLOG_STATE *state, static int PTRCALL doctype3(PROLOG_STATE *state, int tok, - const char *ptr, - const char *end, - const ENCODING *enc) + const char *UNUSED_P(ptr), + const char *UNUSED_P(end), + const ENCODING *UNUSED_P(enc)) { switch (tok) { case XML_TOK_PROLOG_S: @@ -298,9 +298,9 @@ doctype3(PROLOG_STATE *state, static int PTRCALL doctype4(PROLOG_STATE *state, int tok, - const char *ptr, - const char *end, - const ENCODING *enc) + const char *UNUSED_P(ptr), + const char *UNUSED_P(end), + const ENCODING *UNUSED_P(enc)) { switch (tok) { case XML_TOK_PROLOG_S: @@ -318,9 +318,9 @@ doctype4(PROLOG_STATE *state, static int PTRCALL doctype5(PROLOG_STATE *state, int tok, - const char *ptr, - const char *end, - const ENCODING *enc) + const char *UNUSED_P(ptr), + const char *UNUSED_P(end), + const ENCODING *UNUSED_P(enc)) { switch (tok) { case XML_TOK_PROLOG_S: @@ -437,9 +437,9 @@ externalSubset1(PROLOG_STATE *state, static int PTRCALL entity0(PROLOG_STATE *state, int tok, - const char *ptr, - const char *end, - const ENCODING *enc) + const char *UNUSED_P(ptr), + const char *UNUSED_P(end), + const ENCODING *UNUSED_P(enc)) { switch (tok) { case XML_TOK_PROLOG_S: @@ -457,9 +457,9 @@ entity0(PROLOG_STATE *state, static int PTRCALL entity1(PROLOG_STATE *state, int tok, - const char *ptr, - const char *end, - const ENCODING *enc) + const char *UNUSED_P(ptr), + const char *UNUSED_P(end), + const ENCODING *UNUSED_P(enc)) { switch (tok) { case XML_TOK_PROLOG_S: @@ -502,9 +502,9 @@ entity2(PROLOG_STATE *state, static int PTRCALL entity3(PROLOG_STATE *state, int tok, - const char *ptr, - const char *end, - const ENCODING *enc) + const char *UNUSED_P(ptr), + const char *UNUSED_P(end), + const ENCODING *UNUSED_P(enc)) { switch (tok) { case XML_TOK_PROLOG_S: @@ -519,9 +519,9 @@ entity3(PROLOG_STATE *state, static int PTRCALL entity4(PROLOG_STATE *state, int tok, - const char *ptr, - const char *end, - const ENCODING *enc) + const char *UNUSED_P(ptr), + const char *UNUSED_P(end), + const ENCODING *UNUSED_P(enc)) { switch (tok) { case XML_TOK_PROLOG_S: @@ -559,9 +559,9 @@ entity5(PROLOG_STATE *state, static int PTRCALL entity6(PROLOG_STATE *state, int tok, - const char *ptr, - const char *end, - const ENCODING *enc) + const char *UNUSED_P(ptr), + const char *UNUSED_P(end), + const ENCODING *UNUSED_P(enc)) { switch (tok) { case XML_TOK_PROLOG_S: @@ -605,9 +605,9 @@ entity7(PROLOG_STATE *state, static int PTRCALL entity8(PROLOG_STATE *state, int tok, - const char *ptr, - const char *end, - const ENCODING *enc) + const char *UNUSED_P(ptr), + const char *UNUSED_P(end), + const ENCODING *UNUSED_P(enc)) { switch (tok) { case XML_TOK_PROLOG_S: @@ -622,9 +622,9 @@ entity8(PROLOG_STATE *state, static int PTRCALL entity9(PROLOG_STATE *state, int tok, - const char *ptr, - const char *end, - const ENCODING *enc) + const char *UNUSED_P(ptr), + const char *UNUSED_P(end), + const ENCODING *UNUSED_P(enc)) { switch (tok) { case XML_TOK_PROLOG_S: @@ -639,9 +639,9 @@ entity9(PROLOG_STATE *state, static int PTRCALL entity10(PROLOG_STATE *state, int tok, - const char *ptr, - const char *end, - const ENCODING *enc) + const char *UNUSED_P(ptr), + const char *UNUSED_P(end), + const ENCODING *UNUSED_P(enc)) { switch (tok) { case XML_TOK_PROLOG_S: @@ -656,9 +656,9 @@ entity10(PROLOG_STATE *state, static int PTRCALL notation0(PROLOG_STATE *state, int tok, - const char *ptr, - const char *end, - const ENCODING *enc) + const char *UNUSED_P(ptr), + const char *UNUSED_P(end), + const ENCODING *UNUSED_P(enc)) { switch (tok) { case XML_TOK_PROLOG_S: @@ -697,9 +697,9 @@ notation1(PROLOG_STATE *state, static int PTRCALL notation2(PROLOG_STATE *state, int tok, - const char *ptr, - const char *end, - const ENCODING *enc) + const char *UNUSED_P(ptr), + const char *UNUSED_P(end), + const ENCODING *UNUSED_P(enc)) { switch (tok) { case XML_TOK_PROLOG_S: @@ -714,9 +714,9 @@ notation2(PROLOG_STATE *state, static int PTRCALL notation3(PROLOG_STATE *state, int tok, - const char *ptr, - const char *end, - const ENCODING *enc) + const char *UNUSED_P(ptr), + const char *UNUSED_P(end), + const ENCODING *UNUSED_P(enc)) { switch (tok) { case XML_TOK_PROLOG_S: @@ -732,9 +732,9 @@ notation3(PROLOG_STATE *state, static int PTRCALL notation4(PROLOG_STATE *state, int tok, - const char *ptr, - const char *end, - const ENCODING *enc) + const char *UNUSED_P(ptr), + const char *UNUSED_P(end), + const ENCODING *UNUSED_P(enc)) { switch (tok) { case XML_TOK_PROLOG_S: @@ -753,9 +753,9 @@ notation4(PROLOG_STATE *state, static int PTRCALL attlist0(PROLOG_STATE *state, int tok, - const char *ptr, - const char *end, - const ENCODING *enc) + const char *UNUSED_P(ptr), + const char *UNUSED_P(end), + const ENCODING *UNUSED_P(enc)) { switch (tok) { case XML_TOK_PROLOG_S: @@ -771,9 +771,9 @@ attlist0(PROLOG_STATE *state, static int PTRCALL attlist1(PROLOG_STATE *state, int tok, - const char *ptr, - const char *end, - const ENCODING *enc) + const char *UNUSED_P(ptr), + const char *UNUSED_P(end), + const ENCODING *UNUSED_P(enc)) { switch (tok) { case XML_TOK_PROLOG_S: @@ -833,9 +833,9 @@ attlist2(PROLOG_STATE *state, static int PTRCALL attlist3(PROLOG_STATE *state, int tok, - const char *ptr, - const char *end, - const ENCODING *enc) + const char *UNUSED_P(ptr), + const char *UNUSED_P(end), + const ENCODING *UNUSED_P(enc)) { switch (tok) { case XML_TOK_PROLOG_S: @@ -852,9 +852,9 @@ attlist3(PROLOG_STATE *state, static int PTRCALL attlist4(PROLOG_STATE *state, int tok, - const char *ptr, - const char *end, - const ENCODING *enc) + const char *UNUSED_P(ptr), + const char *UNUSED_P(end), + const ENCODING *UNUSED_P(enc)) { switch (tok) { case XML_TOK_PROLOG_S: @@ -872,9 +872,9 @@ attlist4(PROLOG_STATE *state, static int PTRCALL attlist5(PROLOG_STATE *state, int tok, - const char *ptr, - const char *end, - const ENCODING *enc) + const char *UNUSED_P(ptr), + const char *UNUSED_P(end), + const ENCODING *UNUSED_P(enc)) { switch (tok) { case XML_TOK_PROLOG_S: @@ -889,9 +889,9 @@ attlist5(PROLOG_STATE *state, static int PTRCALL attlist6(PROLOG_STATE *state, int tok, - const char *ptr, - const char *end, - const ENCODING *enc) + const char *UNUSED_P(ptr), + const char *UNUSED_P(end), + const ENCODING *UNUSED_P(enc)) { switch (tok) { case XML_TOK_PROLOG_S: @@ -906,9 +906,9 @@ attlist6(PROLOG_STATE *state, static int PTRCALL attlist7(PROLOG_STATE *state, int tok, - const char *ptr, - const char *end, - const ENCODING *enc) + const char *UNUSED_P(ptr), + const char *UNUSED_P(end), + const ENCODING *UNUSED_P(enc)) { switch (tok) { case XML_TOK_PROLOG_S: @@ -967,9 +967,9 @@ attlist8(PROLOG_STATE *state, static int PTRCALL attlist9(PROLOG_STATE *state, int tok, - const char *ptr, - const char *end, - const ENCODING *enc) + const char *UNUSED_P(ptr), + const char *UNUSED_P(end), + const ENCODING *UNUSED_P(enc)) { switch (tok) { case XML_TOK_PROLOG_S: @@ -984,9 +984,9 @@ attlist9(PROLOG_STATE *state, static int PTRCALL element0(PROLOG_STATE *state, int tok, - const char *ptr, - const char *end, - const ENCODING *enc) + const char *UNUSED_P(ptr), + const char *UNUSED_P(end), + const ENCODING *UNUSED_P(enc)) { switch (tok) { case XML_TOK_PROLOG_S: @@ -1072,9 +1072,9 @@ element2(PROLOG_STATE *state, static int PTRCALL element3(PROLOG_STATE *state, int tok, - const char *ptr, - const char *end, - const ENCODING *enc) + const char *UNUSED_P(ptr), + const char *UNUSED_P(end), + const ENCODING *UNUSED_P(enc)) { switch (tok) { case XML_TOK_PROLOG_S: @@ -1097,9 +1097,9 @@ element3(PROLOG_STATE *state, static int PTRCALL element4(PROLOG_STATE *state, int tok, - const char *ptr, - const char *end, - const ENCODING *enc) + const char *UNUSED_P(ptr), + const char *UNUSED_P(end), + const ENCODING *UNUSED_P(enc)) { switch (tok) { case XML_TOK_PROLOG_S: @@ -1115,9 +1115,9 @@ element4(PROLOG_STATE *state, static int PTRCALL element5(PROLOG_STATE *state, int tok, - const char *ptr, - const char *end, - const ENCODING *enc) + const char *UNUSED_P(ptr), + const char *UNUSED_P(end), + const ENCODING *UNUSED_P(enc)) { switch (tok) { case XML_TOK_PROLOG_S: @@ -1136,9 +1136,9 @@ element5(PROLOG_STATE *state, static int PTRCALL element6(PROLOG_STATE *state, int tok, - const char *ptr, - const char *end, - const ENCODING *enc) + const char *UNUSED_P(ptr), + const char *UNUSED_P(end), + const ENCODING *UNUSED_P(enc)) { switch (tok) { case XML_TOK_PROLOG_S: @@ -1166,9 +1166,9 @@ element6(PROLOG_STATE *state, static int PTRCALL element7(PROLOG_STATE *state, int tok, - const char *ptr, - const char *end, - const ENCODING *enc) + const char *UNUSED_P(ptr), + const char *UNUSED_P(end), + const ENCODING *UNUSED_P(enc)) { switch (tok) { case XML_TOK_PROLOG_S: @@ -1240,9 +1240,9 @@ condSect0(PROLOG_STATE *state, static int PTRCALL condSect1(PROLOG_STATE *state, int tok, - const char *ptr, - const char *end, - const ENCODING *enc) + const char *UNUSED_P(ptr), + const char *UNUSED_P(end), + const ENCODING *UNUSED_P(enc)) { switch (tok) { case XML_TOK_PROLOG_S: @@ -1258,9 +1258,9 @@ condSect1(PROLOG_STATE *state, static int PTRCALL condSect2(PROLOG_STATE *state, int tok, - const char *ptr, - const char *end, - const ENCODING *enc) + const char *UNUSED_P(ptr), + const char *UNUSED_P(end), + const ENCODING *UNUSED_P(enc)) { switch (tok) { case XML_TOK_PROLOG_S: @@ -1277,9 +1277,9 @@ condSect2(PROLOG_STATE *state, static int PTRCALL declClose(PROLOG_STATE *state, int tok, - const char *ptr, - const char *end, - const ENCODING *enc) + const char *UNUSED_P(ptr), + const char *UNUSED_P(end), + const ENCODING *UNUSED_P(enc)) { switch (tok) { case XML_TOK_PROLOG_S: @@ -1292,11 +1292,11 @@ declClose(PROLOG_STATE *state, } static int PTRCALL -error(PROLOG_STATE *state, - int tok, - const char *ptr, - const char *end, - const ENCODING *enc) +error(PROLOG_STATE *UNUSED_P(state), + int UNUSED_P(tok), + const char *UNUSED_P(ptr), + const char *UNUSED_P(end), + const ENCODING *UNUSED_P(enc)) { return XML_ROLE_NONE; } diff --git a/Modules/expat/xmltok.c b/Modules/expat/xmltok.c index bf09dfc..a29d9e2 100644 --- a/Modules/expat/xmltok.c +++ b/Modules/expat/xmltok.c @@ -4,7 +4,7 @@ #include -#ifdef COMPILED_FROM_DSP +#ifdef WIN32 #include "winconfig.h" #elif defined(MACOS_CLASSIC) #include "macconfig.h" @@ -16,7 +16,7 @@ #ifdef HAVE_EXPAT_CONFIG_H #include #endif -#endif /* ndef COMPILED_FROM_DSP */ +#endif /* ndef WIN32 */ #include "expat_external.h" #include "internal.h" @@ -46,7 +46,7 @@ #define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16) #define UCS2_GET_NAMING(pages, hi, lo) \ - (namingBitmap[(pages[hi] << 3) + ((lo) >> 5)] & (1 << ((lo) & 0x1F))) + (namingBitmap[(pages[hi] << 3) + ((lo) >> 5)] & (1u << ((lo) & 0x1F))) /* A 2 byte UTF-8 representation splits the characters 11 bits between the bottom 5 and 6 bits of the bytes. We need 8 bits to index into @@ -56,7 +56,7 @@ (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \ + ((((byte)[0]) & 3) << 1) \ + ((((byte)[1]) >> 5) & 1)] \ - & (1 << (((byte)[1]) & 0x1F))) + & (1u << (((byte)[1]) & 0x1F))) /* A 3 byte UTF-8 representation splits the characters 16 bits between the bottom 4, 6 and 6 bits of the bytes. We need 8 bits to index @@ -69,7 +69,7 @@ << 3) \ + ((((byte)[1]) & 3) << 1) \ + ((((byte)[2]) >> 5) & 1)] \ - & (1 << (((byte)[2]) & 0x1F))) + & (1u << (((byte)[2]) & 0x1F))) #define UTF8_GET_NAMING(pages, p, n) \ ((n) == 2 \ @@ -122,19 +122,19 @@ ((*p) == 0xF4 ? (p)[1] > 0x8F : ((p)[1] & 0xC0) == 0xC0))) static int PTRFASTCALL -isNever(const ENCODING *enc, const char *p) +isNever(const ENCODING *UNUSED_P(enc), const char *UNUSED_P(p)) { return 0; } static int PTRFASTCALL -utf8_isName2(const ENCODING *enc, const char *p) +utf8_isName2(const ENCODING *UNUSED_P(enc), const char *p) { return UTF8_GET_NAMING2(namePages, (const unsigned char *)p); } static int PTRFASTCALL -utf8_isName3(const ENCODING *enc, const char *p) +utf8_isName3(const ENCODING *UNUSED_P(enc), const char *p) { return UTF8_GET_NAMING3(namePages, (const unsigned char *)p); } @@ -142,13 +142,13 @@ utf8_isName3(const ENCODING *enc, const char *p) #define utf8_isName4 isNever static int PTRFASTCALL -utf8_isNmstrt2(const ENCODING *enc, const char *p) +utf8_isNmstrt2(const ENCODING *UNUSED_P(enc), const char *p) { return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p); } static int PTRFASTCALL -utf8_isNmstrt3(const ENCODING *enc, const char *p) +utf8_isNmstrt3(const ENCODING *UNUSED_P(enc), const char *p) { return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p); } @@ -156,19 +156,19 @@ utf8_isNmstrt3(const ENCODING *enc, const char *p) #define utf8_isNmstrt4 isNever static int PTRFASTCALL -utf8_isInvalid2(const ENCODING *enc, const char *p) +utf8_isInvalid2(const ENCODING *UNUSED_P(enc), const char *p) { return UTF8_INVALID2((const unsigned char *)p); } static int PTRFASTCALL -utf8_isInvalid3(const ENCODING *enc, const char *p) +utf8_isInvalid3(const ENCODING *UNUSED_P(enc), const char *p) { return UTF8_INVALID3((const unsigned char *)p); } static int PTRFASTCALL -utf8_isInvalid4(const ENCODING *enc, const char *p) +utf8_isInvalid4(const ENCODING *UNUSED_P(enc), const char *p) { return UTF8_INVALID4((const unsigned char *)p); } @@ -222,6 +222,17 @@ struct normal_encoding { E ## isInvalid3, \ E ## isInvalid4 +#define NULL_VTABLE \ + /* isName2 */ NULL, \ + /* isName3 */ NULL, \ + /* isName4 */ NULL, \ + /* isNmstrt2 */ NULL, \ + /* isNmstrt3 */ NULL, \ + /* isNmstrt4 */ NULL, \ + /* isInvalid2 */ NULL, \ + /* isInvalid3 */ NULL, \ + /* isInvalid4 */ NULL + static int FASTCALL checkCharRefNumber(int); #include "xmltok_impl.h" @@ -318,39 +329,89 @@ enum { /* UTF8_cvalN is value of masked first byte of N byte sequence */ UTF8_cval4 = 0xf0 }; -static void PTRCALL -utf8_toUtf8(const ENCODING *enc, +void +align_limit_to_full_utf8_characters(const char * from, const char ** fromLimRef) +{ + const char * fromLim = *fromLimRef; + size_t walked = 0; + for (; fromLim > from; fromLim--, walked++) { + const unsigned char prev = (unsigned char)fromLim[-1]; + if ((prev & 0xf8u) == 0xf0u) { /* 4-byte character, lead by 0b11110xxx byte */ + if (walked + 1 >= 4) { + fromLim += 4 - 1; + break; + } else { + walked = 0; + } + } else if ((prev & 0xf0u) == 0xe0u) { /* 3-byte character, lead by 0b1110xxxx byte */ + if (walked + 1 >= 3) { + fromLim += 3 - 1; + break; + } else { + walked = 0; + } + } else if ((prev & 0xe0u) == 0xc0u) { /* 2-byte character, lead by 0b110xxxxx byte */ + if (walked + 1 >= 2) { + fromLim += 2 - 1; + break; + } else { + walked = 0; + } + } else if ((prev & 0x80u) == 0x00u) { /* 1-byte character, matching 0b0xxxxxxx */ + break; + } + } + *fromLimRef = fromLim; +} + +static enum XML_Convert_Result PTRCALL +utf8_toUtf8(const ENCODING *UNUSED_P(enc), const char **fromP, const char *fromLim, char **toP, const char *toLim) { + enum XML_Convert_Result res = XML_CONVERT_COMPLETED; char *to; const char *from; if (fromLim - *fromP > toLim - *toP) { /* Avoid copying partial characters. */ - for (fromLim = *fromP + (toLim - *toP); fromLim > *fromP; fromLim--) - if (((unsigned char)fromLim[-1] & 0xc0) != 0x80) - break; + res = XML_CONVERT_OUTPUT_EXHAUSTED; + fromLim = *fromP + (toLim - *toP); + align_limit_to_full_utf8_characters(*fromP, &fromLim); } - for (to = *toP, from = *fromP; from != fromLim; from++, to++) + for (to = *toP, from = *fromP; (from < fromLim) && (to < toLim); from++, to++) *to = *from; *fromP = from; *toP = to; + + if ((to == toLim) && (from < fromLim)) + return XML_CONVERT_OUTPUT_EXHAUSTED; + else + return res; } -static void PTRCALL +static enum XML_Convert_Result PTRCALL utf8_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim, unsigned short **toP, const unsigned short *toLim) { + enum XML_Convert_Result res = XML_CONVERT_COMPLETED; unsigned short *to = *toP; const char *from = *fromP; - while (from != fromLim && to != toLim) { + while (from < fromLim && to < toLim) { switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) { case BT_LEAD2: + if (fromLim - from < 2) { + res = XML_CONVERT_INPUT_INCOMPLETE; + break; + } *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f)); from += 2; break; case BT_LEAD3: + if (fromLim - from < 3) { + res = XML_CONVERT_INPUT_INCOMPLETE; + break; + } *to++ = (unsigned short)(((from[0] & 0xf) << 12) | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f)); from += 3; @@ -358,8 +419,14 @@ utf8_toUtf16(const ENCODING *enc, case BT_LEAD4: { unsigned long n; - if (to + 1 == toLim) + if (toLim - to < 2) { + res = XML_CONVERT_OUTPUT_EXHAUSTED; goto after; + } + if (fromLim - from < 4) { + res = XML_CONVERT_INPUT_INCOMPLETE; + goto after; + } n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12) | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f); n -= 0x10000; @@ -377,6 +444,7 @@ utf8_toUtf16(const ENCODING *enc, after: *fromP = from; *toP = to; + return res; } #ifdef XML_NS @@ -425,38 +493,43 @@ static const struct normal_encoding internal_utf8_encoding = { STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_) }; -static void PTRCALL -latin1_toUtf8(const ENCODING *enc, +static enum XML_Convert_Result PTRCALL +latin1_toUtf8(const ENCODING *UNUSED_P(enc), const char **fromP, const char *fromLim, char **toP, const char *toLim) { for (;;) { unsigned char c; if (*fromP == fromLim) - break; + return XML_CONVERT_COMPLETED; c = (unsigned char)**fromP; if (c & 0x80) { if (toLim - *toP < 2) - break; + return XML_CONVERT_OUTPUT_EXHAUSTED; *(*toP)++ = (char)((c >> 6) | UTF8_cval2); *(*toP)++ = (char)((c & 0x3f) | 0x80); (*fromP)++; } else { if (*toP == toLim) - break; + return XML_CONVERT_OUTPUT_EXHAUSTED; *(*toP)++ = *(*fromP)++; } } } -static void PTRCALL -latin1_toUtf16(const ENCODING *enc, +static enum XML_Convert_Result PTRCALL +latin1_toUtf16(const ENCODING *UNUSED_P(enc), const char **fromP, const char *fromLim, unsigned short **toP, const unsigned short *toLim) { - while (*fromP != fromLim && *toP != toLim) + while (*fromP < fromLim && *toP < toLim) *(*toP)++ = (unsigned char)*(*fromP)++; + + if ((*toP == toLim) && (*fromP < fromLim)) + return XML_CONVERT_OUTPUT_EXHAUSTED; + else + return XML_CONVERT_COMPLETED; } #ifdef XML_NS @@ -467,7 +540,7 @@ static const struct normal_encoding latin1_encoding_ns = { #include "asciitab.h" #include "latin1tab.h" }, - STANDARD_VTABLE(sb_) + STANDARD_VTABLE(sb_) NULL_VTABLE }; #endif @@ -480,16 +553,21 @@ static const struct normal_encoding latin1_encoding = { #undef BT_COLON #include "latin1tab.h" }, - STANDARD_VTABLE(sb_) + STANDARD_VTABLE(sb_) NULL_VTABLE }; -static void PTRCALL -ascii_toUtf8(const ENCODING *enc, +static enum XML_Convert_Result PTRCALL +ascii_toUtf8(const ENCODING *UNUSED_P(enc), const char **fromP, const char *fromLim, char **toP, const char *toLim) { - while (*fromP != fromLim && *toP != toLim) + while (*fromP < fromLim && *toP < toLim) *(*toP)++ = *(*fromP)++; + + if ((*toP == toLim) && (*fromP < fromLim)) + return XML_CONVERT_OUTPUT_EXHAUSTED; + else + return XML_CONVERT_COMPLETED; } #ifdef XML_NS @@ -500,7 +578,7 @@ static const struct normal_encoding ascii_encoding_ns = { #include "asciitab.h" /* BT_NONXML == 0 */ }, - STANDARD_VTABLE(sb_) + STANDARD_VTABLE(sb_) NULL_VTABLE }; #endif @@ -513,7 +591,7 @@ static const struct normal_encoding ascii_encoding = { #undef BT_COLON /* BT_NONXML == 0 */ }, - STANDARD_VTABLE(sb_) + STANDARD_VTABLE(sb_) NULL_VTABLE }; static int PTRFASTCALL @@ -536,13 +614,14 @@ unicode_byte_type(char hi, char lo) } #define DEFINE_UTF16_TO_UTF8(E) \ -static void PTRCALL \ -E ## toUtf8(const ENCODING *enc, \ +static enum XML_Convert_Result PTRCALL \ +E ## toUtf8(const ENCODING *UNUSED_P(enc), \ const char **fromP, const char *fromLim, \ char **toP, const char *toLim) \ { \ - const char *from; \ - for (from = *fromP; from != fromLim; from += 2) { \ + const char *from = *fromP; \ + fromLim = from + (((fromLim - from) >> 1) << 1); /* shrink to even */ \ + for (; from < fromLim; from += 2) { \ int plane; \ unsigned char lo2; \ unsigned char lo = GET_LO(from); \ @@ -552,7 +631,7 @@ E ## toUtf8(const ENCODING *enc, \ if (lo < 0x80) { \ if (*toP == toLim) { \ *fromP = from; \ - return; \ + return XML_CONVERT_OUTPUT_EXHAUSTED; \ } \ *(*toP)++ = lo; \ break; \ @@ -562,7 +641,7 @@ E ## toUtf8(const ENCODING *enc, \ case 0x4: case 0x5: case 0x6: case 0x7: \ if (toLim - *toP < 2) { \ *fromP = from; \ - return; \ + return XML_CONVERT_OUTPUT_EXHAUSTED; \ } \ *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2); \ *(*toP)++ = ((lo & 0x3f) | 0x80); \ @@ -570,7 +649,7 @@ E ## toUtf8(const ENCODING *enc, \ default: \ if (toLim - *toP < 3) { \ *fromP = from; \ - return; \ + return XML_CONVERT_OUTPUT_EXHAUSTED; \ } \ /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \ *(*toP)++ = ((hi >> 4) | UTF8_cval3); \ @@ -580,7 +659,11 @@ E ## toUtf8(const ENCODING *enc, \ case 0xD8: case 0xD9: case 0xDA: case 0xDB: \ if (toLim - *toP < 4) { \ *fromP = from; \ - return; \ + return XML_CONVERT_OUTPUT_EXHAUSTED; \ + } \ + if (fromLim - from < 4) { \ + *fromP = from; \ + return XML_CONVERT_INPUT_INCOMPLETE; \ } \ plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \ *(*toP)++ = ((plane >> 2) | UTF8_cval4); \ @@ -596,20 +679,32 @@ E ## toUtf8(const ENCODING *enc, \ } \ } \ *fromP = from; \ + if (from < fromLim) \ + return XML_CONVERT_INPUT_INCOMPLETE; \ + else \ + return XML_CONVERT_COMPLETED; \ } #define DEFINE_UTF16_TO_UTF16(E) \ -static void PTRCALL \ -E ## toUtf16(const ENCODING *enc, \ +static enum XML_Convert_Result PTRCALL \ +E ## toUtf16(const ENCODING *UNUSED_P(enc), \ const char **fromP, const char *fromLim, \ unsigned short **toP, const unsigned short *toLim) \ { \ + enum XML_Convert_Result res = XML_CONVERT_COMPLETED; \ + fromLim = *fromP + (((fromLim - *fromP) >> 1) << 1); /* shrink to even */ \ /* Avoid copying first half only of surrogate */ \ if (fromLim - *fromP > ((toLim - *toP) << 1) \ - && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \ + && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) { \ fromLim -= 2; \ - for (; *fromP != fromLim && *toP != toLim; *fromP += 2) \ + res = XML_CONVERT_INPUT_INCOMPLETE; \ + } \ + for (; *fromP < fromLim && *toP < toLim; *fromP += 2) \ *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \ + if ((*toP == toLim) && (*fromP < fromLim)) \ + return XML_CONVERT_OUTPUT_EXHAUSTED; \ + else \ + return res; \ } #define SET2(ptr, ch) \ @@ -726,7 +821,7 @@ static const struct normal_encoding little2_encoding_ns = { #include "asciitab.h" #include "latin1tab.h" }, - STANDARD_VTABLE(little2_) + STANDARD_VTABLE(little2_) NULL_VTABLE }; #endif @@ -745,7 +840,7 @@ static const struct normal_encoding little2_encoding = { #undef BT_COLON #include "latin1tab.h" }, - STANDARD_VTABLE(little2_) + STANDARD_VTABLE(little2_) NULL_VTABLE }; #if BYTEORDER != 4321 @@ -758,7 +853,7 @@ static const struct normal_encoding internal_little2_encoding_ns = { #include "iasciitab.h" #include "latin1tab.h" }, - STANDARD_VTABLE(little2_) + STANDARD_VTABLE(little2_) NULL_VTABLE }; #endif @@ -771,7 +866,7 @@ static const struct normal_encoding internal_little2_encoding = { #undef BT_COLON #include "latin1tab.h" }, - STANDARD_VTABLE(little2_) + STANDARD_VTABLE(little2_) NULL_VTABLE }; #endif @@ -867,7 +962,7 @@ static const struct normal_encoding big2_encoding_ns = { #include "asciitab.h" #include "latin1tab.h" }, - STANDARD_VTABLE(big2_) + STANDARD_VTABLE(big2_) NULL_VTABLE }; #endif @@ -886,7 +981,7 @@ static const struct normal_encoding big2_encoding = { #undef BT_COLON #include "latin1tab.h" }, - STANDARD_VTABLE(big2_) + STANDARD_VTABLE(big2_) NULL_VTABLE }; #if BYTEORDER != 1234 @@ -899,7 +994,7 @@ static const struct normal_encoding internal_big2_encoding_ns = { #include "iasciitab.h" #include "latin1tab.h" }, - STANDARD_VTABLE(big2_) + STANDARD_VTABLE(big2_) NULL_VTABLE }; #endif @@ -912,7 +1007,7 @@ static const struct normal_encoding internal_big2_encoding = { #undef BT_COLON #include "latin1tab.h" }, - STANDARD_VTABLE(big2_) + STANDARD_VTABLE(big2_) NULL_VTABLE }; #endif @@ -938,7 +1033,7 @@ streqci(const char *s1, const char *s2) } static void PTRCALL -initUpdatePosition(const ENCODING *enc, const char *ptr, +initUpdatePosition(const ENCODING *UNUSED_P(enc), const char *ptr, const char *end, POSITION *pos) { normal_updatePosition(&utf8_encoding.enc, ptr, end, pos); @@ -1288,7 +1383,7 @@ unknown_isInvalid(const ENCODING *enc, const char *p) return (c & ~0xFFFF) || checkCharRefNumber(c) < 0; } -static void PTRCALL +static enum XML_Convert_Result PTRCALL unknown_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim, char **toP, const char *toLim) @@ -1299,21 +1394,21 @@ unknown_toUtf8(const ENCODING *enc, const char *utf8; int n; if (*fromP == fromLim) - break; + return XML_CONVERT_COMPLETED; utf8 = uenc->utf8[(unsigned char)**fromP]; n = *utf8++; if (n == 0) { int c = uenc->convert(uenc->userData, *fromP); n = XmlUtf8Encode(c, buf); if (n > toLim - *toP) - break; + return XML_CONVERT_OUTPUT_EXHAUSTED; utf8 = buf; *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP] - (BT_LEAD2 - 2)); } else { if (n > toLim - *toP) - break; + return XML_CONVERT_OUTPUT_EXHAUSTED; (*fromP)++; } do { @@ -1322,13 +1417,13 @@ unknown_toUtf8(const ENCODING *enc, } } -static void PTRCALL +static enum XML_Convert_Result PTRCALL unknown_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim, unsigned short **toP, const unsigned short *toLim) { const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); - while (*fromP != fromLim && *toP != toLim) { + while (*fromP < fromLim && *toP < toLim) { unsigned short c = uenc->utf16[(unsigned char)**fromP]; if (c == 0) { c = (unsigned short) @@ -1340,6 +1435,11 @@ unknown_toUtf16(const ENCODING *enc, (*fromP)++; *(*toP)++ = c; } + + if ((*toP == toLim) && (*fromP < fromLim)) + return XML_CONVERT_OUTPUT_EXHAUSTED; + else + return XML_CONVERT_COMPLETED; } ENCODING * @@ -1503,7 +1603,7 @@ initScan(const ENCODING * const *encodingTable, { const ENCODING **encPtr; - if (ptr == end) + if (ptr >= end) return XML_TOK_NONE; encPtr = enc->encPtr; if (ptr + 1 == end) { diff --git a/Modules/expat/xmltok.h b/Modules/expat/xmltok.h index ca867aa..752007e 100644 --- a/Modules/expat/xmltok.h +++ b/Modules/expat/xmltok.h @@ -130,6 +130,12 @@ typedef int (PTRCALL *SCANNER)(const ENCODING *, const char *, const char **); +enum XML_Convert_Result { + XML_CONVERT_COMPLETED = 0, + XML_CONVERT_INPUT_INCOMPLETE = 1, + XML_CONVERT_OUTPUT_EXHAUSTED = 2 /* and therefore potentially input remaining as well */ +}; + struct encoding { SCANNER scanners[XML_N_STATES]; SCANNER literalScanners[XML_N_LITERAL_TYPES]; @@ -158,12 +164,12 @@ struct encoding { const char *ptr, const char *end, const char **badPtr); - void (PTRCALL *utf8Convert)(const ENCODING *enc, + enum XML_Convert_Result (PTRCALL *utf8Convert)(const ENCODING *enc, const char **fromP, const char *fromLim, char **toP, const char *toLim); - void (PTRCALL *utf16Convert)(const ENCODING *enc, + enum XML_Convert_Result (PTRCALL *utf16Convert)(const ENCODING *enc, const char **fromP, const char *fromLim, unsigned short **toP, diff --git a/Modules/expat/xmltok_impl.c b/Modules/expat/xmltok_impl.c index 9c2895b..5f779c0 100644 --- a/Modules/expat/xmltok_impl.c +++ b/Modules/expat/xmltok_impl.c @@ -87,27 +87,45 @@ #define PREFIX(ident) ident #endif + +#define HAS_CHARS(enc, ptr, end, count) \ + (end - ptr >= count * MINBPC(enc)) + +#define HAS_CHAR(enc, ptr, end) \ + HAS_CHARS(enc, ptr, end, 1) + +#define REQUIRE_CHARS(enc, ptr, end, count) \ + { \ + if (! HAS_CHARS(enc, ptr, end, count)) { \ + return XML_TOK_PARTIAL; \ + } \ + } + +#define REQUIRE_CHAR(enc, ptr, end) \ + REQUIRE_CHARS(enc, ptr, end, 1) + + /* ptr points to character following " */ switch (BYTE_TYPE(enc, ptr + MINBPC(enc))) { case BT_S: case BT_CR: case BT_LF: case BT_PERCNT: @@ -175,7 +191,7 @@ PREFIX(scanDecl)(const ENCODING *enc, const char *ptr, } static int PTRCALL -PREFIX(checkPiTarget)(const ENCODING *enc, const char *ptr, +PREFIX(checkPiTarget)(const ENCODING *UNUSED_P(enc), const char *ptr, const char *end, int *tokPtr) { int upper = 0; @@ -225,15 +241,14 @@ PREFIX(scanPi)(const ENCODING *enc, const char *ptr, { int tok; const char *target = ptr; - if (ptr == end) - return XML_TOK_PARTIAL; + REQUIRE_CHAR(enc, ptr, end); switch (BYTE_TYPE(enc, ptr)) { CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) default: *nextTokPtr = ptr; return XML_TOK_INVALID; } - while (ptr != end) { + while (HAS_CHAR(enc, ptr, end)) { switch (BYTE_TYPE(enc, ptr)) { CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) case BT_S: case BT_CR: case BT_LF: @@ -242,13 +257,12 @@ PREFIX(scanPi)(const ENCODING *enc, const char *ptr, return XML_TOK_INVALID; } ptr += MINBPC(enc); - while (ptr != end) { + while (HAS_CHAR(enc, ptr, end)) { switch (BYTE_TYPE(enc, ptr)) { INVALID_CASES(ptr, nextTokPtr) case BT_QUEST: ptr += MINBPC(enc); - if (ptr == end) - return XML_TOK_PARTIAL; + REQUIRE_CHAR(enc, ptr, end); if (CHAR_MATCHES(enc, ptr, ASCII_GT)) { *nextTokPtr = ptr + MINBPC(enc); return tok; @@ -266,8 +280,7 @@ PREFIX(scanPi)(const ENCODING *enc, const char *ptr, return XML_TOK_INVALID; } ptr += MINBPC(enc); - if (ptr == end) - return XML_TOK_PARTIAL; + REQUIRE_CHAR(enc, ptr, end); if (CHAR_MATCHES(enc, ptr, ASCII_GT)) { *nextTokPtr = ptr + MINBPC(enc); return tok; @@ -282,15 +295,14 @@ PREFIX(scanPi)(const ENCODING *enc, const char *ptr, } static int PTRCALL -PREFIX(scanCdataSection)(const ENCODING *enc, const char *ptr, +PREFIX(scanCdataSection)(const ENCODING *UNUSED_P(enc), const char *ptr, const char *end, const char **nextTokPtr) { static const char CDATA_LSQB[] = { ASCII_C, ASCII_D, ASCII_A, ASCII_T, ASCII_A, ASCII_LSQB }; int i; /* CDATA[ */ - if (end - ptr < 6 * MINBPC(enc)) - return XML_TOK_PARTIAL; + REQUIRE_CHARS(enc, ptr, end, 6); for (i = 0; i < 6; i++, ptr += MINBPC(enc)) { if (!CHAR_MATCHES(enc, ptr, CDATA_LSQB[i])) { *nextTokPtr = ptr; @@ -305,7 +317,7 @@ static int PTRCALL PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr, const char *end, const char **nextTokPtr) { - if (ptr == end) + if (ptr >= end) return XML_TOK_NONE; if (MINBPC(enc) > 1) { size_t n = end - ptr; @@ -319,13 +331,11 @@ PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr, switch (BYTE_TYPE(enc, ptr)) { case BT_RSQB: ptr += MINBPC(enc); - if (ptr == end) - return XML_TOK_PARTIAL; + REQUIRE_CHAR(enc, ptr, end); if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB)) break; ptr += MINBPC(enc); - if (ptr == end) - return XML_TOK_PARTIAL; + REQUIRE_CHAR(enc, ptr, end); if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) { ptr -= MINBPC(enc); break; @@ -334,8 +344,7 @@ PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr, return XML_TOK_CDATA_SECT_CLOSE; case BT_CR: ptr += MINBPC(enc); - if (ptr == end) - return XML_TOK_PARTIAL; + REQUIRE_CHAR(enc, ptr, end); if (BYTE_TYPE(enc, ptr) == BT_LF) ptr += MINBPC(enc); *nextTokPtr = ptr; @@ -348,7 +357,7 @@ PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr, ptr += MINBPC(enc); break; } - while (ptr != end) { + while (HAS_CHAR(enc, ptr, end)) { switch (BYTE_TYPE(enc, ptr)) { #define LEAD_CASE(n) \ case BT_LEAD ## n: \ @@ -383,19 +392,18 @@ static int PTRCALL PREFIX(scanEndTag)(const ENCODING *enc, const char *ptr, const char *end, const char **nextTokPtr) { - if (ptr == end) - return XML_TOK_PARTIAL; + REQUIRE_CHAR(enc, ptr, end); switch (BYTE_TYPE(enc, ptr)) { CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) default: *nextTokPtr = ptr; return XML_TOK_INVALID; } - while (ptr != end) { + while (HAS_CHAR(enc, ptr, end)) { switch (BYTE_TYPE(enc, ptr)) { CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) case BT_S: case BT_CR: case BT_LF: - for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) { + for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) { switch (BYTE_TYPE(enc, ptr)) { case BT_S: case BT_CR: case BT_LF: break; @@ -432,7 +440,7 @@ static int PTRCALL PREFIX(scanHexCharRef)(const ENCODING *enc, const char *ptr, const char *end, const char **nextTokPtr) { - if (ptr != end) { + if (HAS_CHAR(enc, ptr, end)) { switch (BYTE_TYPE(enc, ptr)) { case BT_DIGIT: case BT_HEX: @@ -441,7 +449,7 @@ PREFIX(scanHexCharRef)(const ENCODING *enc, const char *ptr, *nextTokPtr = ptr; return XML_TOK_INVALID; } - for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) { + for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) { switch (BYTE_TYPE(enc, ptr)) { case BT_DIGIT: case BT_HEX: @@ -464,7 +472,7 @@ static int PTRCALL PREFIX(scanCharRef)(const ENCODING *enc, const char *ptr, const char *end, const char **nextTokPtr) { - if (ptr != end) { + if (HAS_CHAR(enc, ptr, end)) { if (CHAR_MATCHES(enc, ptr, ASCII_x)) return PREFIX(scanHexCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr); switch (BYTE_TYPE(enc, ptr)) { @@ -474,7 +482,7 @@ PREFIX(scanCharRef)(const ENCODING *enc, const char *ptr, *nextTokPtr = ptr; return XML_TOK_INVALID; } - for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) { + for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) { switch (BYTE_TYPE(enc, ptr)) { case BT_DIGIT: break; @@ -496,8 +504,7 @@ static int PTRCALL PREFIX(scanRef)(const ENCODING *enc, const char *ptr, const char *end, const char **nextTokPtr) { - if (ptr == end) - return XML_TOK_PARTIAL; + REQUIRE_CHAR(enc, ptr, end); switch (BYTE_TYPE(enc, ptr)) { CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) case BT_NUM: @@ -506,7 +513,7 @@ PREFIX(scanRef)(const ENCODING *enc, const char *ptr, const char *end, *nextTokPtr = ptr; return XML_TOK_INVALID; } - while (ptr != end) { + while (HAS_CHAR(enc, ptr, end)) { switch (BYTE_TYPE(enc, ptr)) { CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) case BT_SEMI: @@ -529,7 +536,7 @@ PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end, #ifdef XML_NS int hadColon = 0; #endif - while (ptr != end) { + while (HAS_CHAR(enc, ptr, end)) { switch (BYTE_TYPE(enc, ptr)) { CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) #ifdef XML_NS @@ -540,8 +547,7 @@ PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end, } hadColon = 1; ptr += MINBPC(enc); - if (ptr == end) - return XML_TOK_PARTIAL; + REQUIRE_CHAR(enc, ptr, end); switch (BYTE_TYPE(enc, ptr)) { CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) default: @@ -555,8 +561,7 @@ PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end, int t; ptr += MINBPC(enc); - if (ptr == end) - return XML_TOK_PARTIAL; + REQUIRE_CHAR(enc, ptr, end); t = BYTE_TYPE(enc, ptr); if (t == BT_EQUALS) break; @@ -579,8 +584,7 @@ PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end, #endif for (;;) { ptr += MINBPC(enc); - if (ptr == end) - return XML_TOK_PARTIAL; + REQUIRE_CHAR(enc, ptr, end); open = BYTE_TYPE(enc, ptr); if (open == BT_QUOT || open == BT_APOS) break; @@ -598,8 +602,7 @@ PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end, /* in attribute value */ for (;;) { int t; - if (ptr == end) - return XML_TOK_PARTIAL; + REQUIRE_CHAR(enc, ptr, end); t = BYTE_TYPE(enc, ptr); if (t == open) break; @@ -624,8 +627,7 @@ PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end, } } ptr += MINBPC(enc); - if (ptr == end) - return XML_TOK_PARTIAL; + REQUIRE_CHAR(enc, ptr, end); switch (BYTE_TYPE(enc, ptr)) { case BT_S: case BT_CR: @@ -642,8 +644,7 @@ PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end, /* ptr points to closing quote */ for (;;) { ptr += MINBPC(enc); - if (ptr == end) - return XML_TOK_PARTIAL; + REQUIRE_CHAR(enc, ptr, end); switch (BYTE_TYPE(enc, ptr)) { CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) case BT_S: case BT_CR: case BT_LF: @@ -655,8 +656,7 @@ PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end, case BT_SOL: sol: ptr += MINBPC(enc); - if (ptr == end) - return XML_TOK_PARTIAL; + REQUIRE_CHAR(enc, ptr, end); if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) { *nextTokPtr = ptr; return XML_TOK_INVALID; @@ -688,13 +688,12 @@ PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end, #ifdef XML_NS int hadColon; #endif - if (ptr == end) - return XML_TOK_PARTIAL; + REQUIRE_CHAR(enc, ptr, end); switch (BYTE_TYPE(enc, ptr)) { CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) case BT_EXCL: - if ((ptr += MINBPC(enc)) == end) - return XML_TOK_PARTIAL; + ptr += MINBPC(enc); + REQUIRE_CHAR(enc, ptr, end); switch (BYTE_TYPE(enc, ptr)) { case BT_MINUS: return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr); @@ -716,7 +715,7 @@ PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end, hadColon = 0; #endif /* we have a start-tag */ - while (ptr != end) { + while (HAS_CHAR(enc, ptr, end)) { switch (BYTE_TYPE(enc, ptr)) { CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) #ifdef XML_NS @@ -727,8 +726,7 @@ PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end, } hadColon = 1; ptr += MINBPC(enc); - if (ptr == end) - return XML_TOK_PARTIAL; + REQUIRE_CHAR(enc, ptr, end); switch (BYTE_TYPE(enc, ptr)) { CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) default: @@ -740,7 +738,7 @@ PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end, case BT_S: case BT_CR: case BT_LF: { ptr += MINBPC(enc); - while (ptr != end) { + while (HAS_CHAR(enc, ptr, end)) { switch (BYTE_TYPE(enc, ptr)) { CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) case BT_GT: @@ -765,8 +763,7 @@ PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end, case BT_SOL: sol: ptr += MINBPC(enc); - if (ptr == end) - return XML_TOK_PARTIAL; + REQUIRE_CHAR(enc, ptr, end); if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) { *nextTokPtr = ptr; return XML_TOK_INVALID; @@ -785,7 +782,7 @@ static int PTRCALL PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end, const char **nextTokPtr) { - if (ptr == end) + if (ptr >= end) return XML_TOK_NONE; if (MINBPC(enc) > 1) { size_t n = end - ptr; @@ -803,7 +800,7 @@ PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end, return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr); case BT_CR: ptr += MINBPC(enc); - if (ptr == end) + if (! HAS_CHAR(enc, ptr, end)) return XML_TOK_TRAILING_CR; if (BYTE_TYPE(enc, ptr) == BT_LF) ptr += MINBPC(enc); @@ -814,12 +811,12 @@ PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end, return XML_TOK_DATA_NEWLINE; case BT_RSQB: ptr += MINBPC(enc); - if (ptr == end) + if (! HAS_CHAR(enc, ptr, end)) return XML_TOK_TRAILING_RSQB; if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB)) break; ptr += MINBPC(enc); - if (ptr == end) + if (! HAS_CHAR(enc, ptr, end)) return XML_TOK_TRAILING_RSQB; if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) { ptr -= MINBPC(enc); @@ -832,7 +829,7 @@ PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end, ptr += MINBPC(enc); break; } - while (ptr != end) { + while (HAS_CHAR(enc, ptr, end)) { switch (BYTE_TYPE(enc, ptr)) { #define LEAD_CASE(n) \ case BT_LEAD ## n: \ @@ -845,12 +842,12 @@ PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end, LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) #undef LEAD_CASE case BT_RSQB: - if (ptr + MINBPC(enc) != end) { + if (HAS_CHARS(enc, ptr, end, 2)) { if (!CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_RSQB)) { ptr += MINBPC(enc); break; } - if (ptr + 2*MINBPC(enc) != end) { + if (HAS_CHARS(enc, ptr, end, 3)) { if (!CHAR_MATCHES(enc, ptr + 2*MINBPC(enc), ASCII_GT)) { ptr += MINBPC(enc); break; @@ -884,8 +881,7 @@ static int PTRCALL PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end, const char **nextTokPtr) { - if (ptr == end) - return XML_TOK_PARTIAL; + REQUIRE_CHAR(enc, ptr, end); switch (BYTE_TYPE(enc, ptr)) { CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) case BT_S: case BT_LF: case BT_CR: case BT_PERCNT: @@ -895,7 +891,7 @@ PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end, *nextTokPtr = ptr; return XML_TOK_INVALID; } - while (ptr != end) { + while (HAS_CHAR(enc, ptr, end)) { switch (BYTE_TYPE(enc, ptr)) { CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) case BT_SEMI: @@ -913,15 +909,14 @@ static int PTRCALL PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end, const char **nextTokPtr) { - if (ptr == end) - return XML_TOK_PARTIAL; + REQUIRE_CHAR(enc, ptr, end); switch (BYTE_TYPE(enc, ptr)) { CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) default: *nextTokPtr = ptr; return XML_TOK_INVALID; } - while (ptr != end) { + while (HAS_CHAR(enc, ptr, end)) { switch (BYTE_TYPE(enc, ptr)) { CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) case BT_CR: case BT_LF: case BT_S: @@ -941,7 +936,7 @@ PREFIX(scanLit)(int open, const ENCODING *enc, const char *ptr, const char *end, const char **nextTokPtr) { - while (ptr != end) { + while (HAS_CHAR(enc, ptr, end)) { int t = BYTE_TYPE(enc, ptr); switch (t) { INVALID_CASES(ptr, nextTokPtr) @@ -950,7 +945,7 @@ PREFIX(scanLit)(int open, const ENCODING *enc, ptr += MINBPC(enc); if (t != open) break; - if (ptr == end) + if (! HAS_CHAR(enc, ptr, end)) return -XML_TOK_LITERAL; *nextTokPtr = ptr; switch (BYTE_TYPE(enc, ptr)) { @@ -973,7 +968,7 @@ PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end, const char **nextTokPtr) { int tok; - if (ptr == end) + if (ptr >= end) return XML_TOK_NONE; if (MINBPC(enc) > 1) { size_t n = end - ptr; @@ -992,8 +987,7 @@ PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end, case BT_LT: { ptr += MINBPC(enc); - if (ptr == end) - return XML_TOK_PARTIAL; + REQUIRE_CHAR(enc, ptr, end); switch (BYTE_TYPE(enc, ptr)) { case BT_EXCL: return PREFIX(scanDecl)(enc, ptr + MINBPC(enc), end, nextTokPtr); @@ -1021,7 +1015,7 @@ PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end, case BT_S: case BT_LF: for (;;) { ptr += MINBPC(enc); - if (ptr == end) + if (! HAS_CHAR(enc, ptr, end)) break; switch (BYTE_TYPE(enc, ptr)) { case BT_S: case BT_LF: @@ -1048,11 +1042,10 @@ PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end, return XML_TOK_OPEN_BRACKET; case BT_RSQB: ptr += MINBPC(enc); - if (ptr == end) + if (! HAS_CHAR(enc, ptr, end)) return -XML_TOK_CLOSE_BRACKET; if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) { - if (ptr + MINBPC(enc) == end) - return XML_TOK_PARTIAL; + REQUIRE_CHARS(enc, ptr, end, 2); if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_GT)) { *nextTokPtr = ptr + 2*MINBPC(enc); return XML_TOK_COND_SECT_CLOSE; @@ -1065,7 +1058,7 @@ PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end, return XML_TOK_OPEN_PAREN; case BT_RPAR: ptr += MINBPC(enc); - if (ptr == end) + if (! HAS_CHAR(enc, ptr, end)) return -XML_TOK_CLOSE_PAREN; switch (BYTE_TYPE(enc, ptr)) { case BT_AST: @@ -1141,7 +1134,7 @@ PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end, *nextTokPtr = ptr; return XML_TOK_INVALID; } - while (ptr != end) { + while (HAS_CHAR(enc, ptr, end)) { switch (BYTE_TYPE(enc, ptr)) { CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) case BT_GT: case BT_RPAR: case BT_COMMA: @@ -1154,8 +1147,7 @@ PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end, ptr += MINBPC(enc); switch (tok) { case XML_TOK_NAME: - if (ptr == end) - return XML_TOK_PARTIAL; + REQUIRE_CHAR(enc, ptr, end); tok = XML_TOK_PREFIXED_NAME; switch (BYTE_TYPE(enc, ptr)) { CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) @@ -1204,10 +1196,12 @@ PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr, const char *end, const char **nextTokPtr) { const char *start; - if (ptr == end) + if (ptr >= end) return XML_TOK_NONE; + else if (! HAS_CHAR(enc, ptr, end)) + return XML_TOK_PARTIAL; start = ptr; - while (ptr != end) { + while (HAS_CHAR(enc, ptr, end)) { switch (BYTE_TYPE(enc, ptr)) { #define LEAD_CASE(n) \ case BT_LEAD ## n: ptr += n; break; @@ -1232,7 +1226,7 @@ PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr, case BT_CR: if (ptr == start) { ptr += MINBPC(enc); - if (ptr == end) + if (! HAS_CHAR(enc, ptr, end)) return XML_TOK_TRAILING_CR; if (BYTE_TYPE(enc, ptr) == BT_LF) ptr += MINBPC(enc); @@ -1262,10 +1256,12 @@ PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr, const char *end, const char **nextTokPtr) { const char *start; - if (ptr == end) + if (ptr >= end) return XML_TOK_NONE; + else if (! HAS_CHAR(enc, ptr, end)) + return XML_TOK_PARTIAL; start = ptr; - while (ptr != end) { + while (HAS_CHAR(enc, ptr, end)) { switch (BYTE_TYPE(enc, ptr)) { #define LEAD_CASE(n) \ case BT_LEAD ## n: ptr += n; break; @@ -1294,7 +1290,7 @@ PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr, case BT_CR: if (ptr == start) { ptr += MINBPC(enc); - if (ptr == end) + if (! HAS_CHAR(enc, ptr, end)) return XML_TOK_TRAILING_CR; if (BYTE_TYPE(enc, ptr) == BT_LF) ptr += MINBPC(enc); @@ -1326,15 +1322,15 @@ PREFIX(ignoreSectionTok)(const ENCODING *enc, const char *ptr, end = ptr + n; } } - while (ptr != end) { + while (HAS_CHAR(enc, ptr, end)) { switch (BYTE_TYPE(enc, ptr)) { INVALID_CASES(ptr, nextTokPtr) case BT_LT: - if ((ptr += MINBPC(enc)) == end) - return XML_TOK_PARTIAL; + ptr += MINBPC(enc); + REQUIRE_CHAR(enc, ptr, end); if (CHAR_MATCHES(enc, ptr, ASCII_EXCL)) { - if ((ptr += MINBPC(enc)) == end) - return XML_TOK_PARTIAL; + ptr += MINBPC(enc); + REQUIRE_CHAR(enc, ptr, end); if (CHAR_MATCHES(enc, ptr, ASCII_LSQB)) { ++level; ptr += MINBPC(enc); @@ -1342,11 +1338,11 @@ PREFIX(ignoreSectionTok)(const ENCODING *enc, const char *ptr, } break; case BT_RSQB: - if ((ptr += MINBPC(enc)) == end) - return XML_TOK_PARTIAL; + ptr += MINBPC(enc); + REQUIRE_CHAR(enc, ptr, end); if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) { - if ((ptr += MINBPC(enc)) == end) - return XML_TOK_PARTIAL; + ptr += MINBPC(enc); + REQUIRE_CHAR(enc, ptr, end); if (CHAR_MATCHES(enc, ptr, ASCII_GT)) { ptr += MINBPC(enc); if (level == 0) { @@ -1373,7 +1369,7 @@ PREFIX(isPublicId)(const ENCODING *enc, const char *ptr, const char *end, { ptr += MINBPC(enc); end -= MINBPC(enc); - for (; ptr != end; ptr += MINBPC(enc)) { + for (; HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) { switch (BYTE_TYPE(enc, ptr)) { case BT_DIGIT: case BT_HEX: @@ -1521,7 +1517,7 @@ PREFIX(getAtts)(const ENCODING *enc, const char *ptr, } static int PTRFASTCALL -PREFIX(charRefNumber)(const ENCODING *enc, const char *ptr) +PREFIX(charRefNumber)(const ENCODING *UNUSED_P(enc), const char *ptr) { int result = 0; /* skip &# */ @@ -1565,7 +1561,7 @@ PREFIX(charRefNumber)(const ENCODING *enc, const char *ptr) } static int PTRCALL -PREFIX(predefinedEntityName)(const ENCODING *enc, const char *ptr, +PREFIX(predefinedEntityName)(const ENCODING *UNUSED_P(enc), const char *ptr, const char *end) { switch ((end - ptr)/MINBPC(enc)) { @@ -1683,11 +1679,11 @@ PREFIX(sameName)(const ENCODING *enc, const char *ptr1, const char *ptr2) } static int PTRCALL -PREFIX(nameMatchesAscii)(const ENCODING *enc, const char *ptr1, +PREFIX(nameMatchesAscii)(const ENCODING *UNUSED_P(enc), const char *ptr1, const char *end1, const char *ptr2) { for (; *ptr2; ptr1 += MINBPC(enc), ptr2++) { - if (ptr1 == end1) + if (end1 - ptr1 < MINBPC(enc)) return 0; if (!CHAR_MATCHES(enc, ptr1, *ptr2)) return 0; @@ -1744,7 +1740,7 @@ PREFIX(updatePosition)(const ENCODING *enc, const char *end, POSITION *pos) { - while (ptr < end) { + while (HAS_CHAR(enc, ptr, end)) { switch (BYTE_TYPE(enc, ptr)) { #define LEAD_CASE(n) \ case BT_LEAD ## n: \ @@ -1760,7 +1756,7 @@ PREFIX(updatePosition)(const ENCODING *enc, case BT_CR: pos->lineNumber++; ptr += MINBPC(enc); - if (ptr != end && BYTE_TYPE(enc, ptr) == BT_LF) + if (HAS_CHAR(enc, ptr, end) && BYTE_TYPE(enc, ptr) == BT_LF) ptr += MINBPC(enc); pos->columnNumber = (XML_Size)-1; break; diff --git a/Modules/pyexpat.c b/Modules/pyexpat.c index ec7a60b..d9cfa3e 100644 --- a/Modules/pyexpat.c +++ b/Modules/pyexpat.c @@ -1188,10 +1188,8 @@ newxmlparseobject(const char *encoding, const char *namespace_separator, PyObjec Py_DECREF(self); return NULL; } -#if XML_COMBINED_VERSION >= 20100 || defined(XML_HAS_SET_HASH_SALT) - /* This feature was added upstream in libexpat 2.1.0. Our expat copy - * has a backport of this feature where we also define XML_HAS_SET_HASH_SALT - * to indicate that we can still use it. */ +#if XML_COMBINED_VERSION >= 20100 + /* This feature was added upstream in libexpat 2.1.0. */ XML_SetHashSalt(self->itself, (unsigned long)_Py_HashSecret.expat.hashsalt); #endif -- cgit v0.12