diff options
Diffstat (limited to 'Objects/unicodeobject.c')
| -rw-r--r-- | Objects/unicodeobject.c | 1144 |
1 files changed, 472 insertions, 672 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 1df38e8..4fb1aed 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -115,9 +115,9 @@ static char unicode_default_encoding[100]; /* Fast detection of the most frequent whitespace characters */ const unsigned char _Py_ascii_whitespace[] = { 0, 0, 0, 0, 0, 0, 0, 0, -/* case 0x0009: * HORIZONTAL TABULATION */ +/* case 0x0009: * CHARACTER TABULATION */ /* case 0x000A: * LINE FEED */ -/* case 0x000B: * VERTICAL TABULATION */ +/* case 0x000B: * LINE TABULATION */ /* case 0x000C: * FORM FEED */ /* case 0x000D: * CARRIAGE RETURN */ 0, 1, 1, 1, 1, 1, 0, 0, @@ -147,8 +147,10 @@ const unsigned char _Py_ascii_whitespace[] = { static unsigned char ascii_linebreak[] = { 0, 0, 0, 0, 0, 0, 0, 0, /* 0x000A, * LINE FEED */ +/* 0x000B, * LINE TABULATION */ +/* 0x000C, * FORM FEED */ /* 0x000D, * CARRIAGE RETURN */ - 0, 0, 1, 0, 0, 1, 0, 0, + 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x001C, * FILE SEPARATOR */ /* 0x001D, * GROUP SEPARATOR */ @@ -190,11 +192,22 @@ PyUnicode_GetMax(void) /* the linebreak mask is set up by Unicode_Init below */ +#if LONG_BIT >= 128 +#define BLOOM_WIDTH 128 +#elif LONG_BIT >= 64 +#define BLOOM_WIDTH 64 +#elif LONG_BIT >= 32 +#define BLOOM_WIDTH 32 +#else +#error "LONG_BIT is smaller than 32" +#endif + #define BLOOM_MASK unsigned long static BLOOM_MASK bloom_linebreak; -#define BLOOM(mask, ch) ((mask & (1 << ((ch) & 0x1F)))) +#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))) +#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1))))) #define BLOOM_LINEBREAK(ch) \ ((ch) < 128U ? ascii_linebreak[(ch)] : \ @@ -204,12 +217,12 @@ Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len) { /* calculate simple bloom-style bitmask for a given unicode string */ - long mask; + BLOOM_MASK mask; Py_ssize_t i; mask = 0; for (i = 0; i < len; i++) - mask |= (1 << (ptr[i] & 0x1F)); + BLOOM_ADD(mask, ptr[i]); return mask; } @@ -280,7 +293,7 @@ int unicode_resize(register PyUnicodeObject *unicode, } /* We allocate one more byte to make sure the string is - Ux0000 terminated -- XXX is this needed ? + Ux0000 terminated; some code relies on that. XXX This allocator could further be enhanced by assuring that the free list never reduces its size below 1. @@ -527,6 +540,60 @@ PyObject *PyUnicode_FromString(const char *u) #ifdef HAVE_WCHAR_H +#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4) +# define CONVERT_WCHAR_TO_SURROGATES +#endif + +#ifdef CONVERT_WCHAR_TO_SURROGATES + +/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need + to convert from UTF32 to UTF16. */ + +PyObject *PyUnicode_FromWideChar(register const wchar_t *w, + Py_ssize_t size) +{ + PyUnicodeObject *unicode; + register Py_ssize_t i; + Py_ssize_t alloc; + const wchar_t *orig_w; + + if (w == NULL) { + PyErr_BadInternalCall(); + return NULL; + } + + alloc = size; + orig_w = w; + for (i = size; i > 0; i--) { + if (*w > 0xFFFF) + alloc++; + w++; + } + w = orig_w; + unicode = _PyUnicode_New(alloc); + if (!unicode) + return NULL; + + /* Copy the wchar_t data into the new object */ + { + register Py_UNICODE *u; + u = PyUnicode_AS_UNICODE(unicode); + for (i = size; i > 0; i--) { + if (*w > 0xFFFF) { + wchar_t ordinal = *w++; + ordinal -= 0x10000; + *u++ = 0xD800 | (ordinal >> 10); + *u++ = 0xDC00 | (ordinal & 0x3FF); + } + else + *u++ = *w++; + } + } + return (PyObject *)unicode; +} + +#else + PyObject *PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size) { @@ -557,6 +624,10 @@ PyObject *PyUnicode_FromWideChar(register const wchar_t *w, return (PyObject *)unicode; } +#endif /* CONVERT_WCHAR_TO_SURROGATES */ + +#undef CONVERT_WCHAR_TO_SURROGATES + static void makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c) { @@ -681,7 +752,7 @@ PyUnicode_FromFormatV(const char *format, va_list vargs) case 's': { /* UTF-8 */ - unsigned char *s = va_arg(count, unsigned char*); + const char *s = va_arg(count, const char*); PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace"); if (!str) goto fail; @@ -1408,69 +1479,81 @@ int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler /* --- UTF-7 Codec -------------------------------------------------------- */ -/* see RFC2152 for details */ +/* See RFC2152 for details. We encode conservatively and decode liberally. */ -static -char utf7_special[128] = { - /* indicate whether a UTF-7 character is special i.e. cannot be directly - encoded: - 0 - not special - 1 - special - 2 - whitespace (optional) - 3 - RFC2152 Set O (optional) */ - 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0, - 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3, - 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1, +/* Three simple macros defining base-64. */ -}; +/* Is c a base-64 character? */ + +#define IS_BASE64(c) \ + (isalnum(c) || (c) == '+' || (c) == '/') -/* Note: The comparison (c) <= 0 is a trick to work-around gcc - warnings about the comparison always being false; since - utf7_special[0] is 1, we can safely make that one comparison - true */ +/* given that c is a base-64 character, what is its base-64 value? */ -#define SPECIAL(c, encodeO, encodeWS) \ - ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \ - (encodeWS && (utf7_special[(c)] == 2)) || \ - (encodeO && (utf7_special[(c)] == 3))) +#define FROM_BASE64(c) \ + (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \ + ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \ + ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \ + (c) == '+' ? 62 : 63) -#define B64(n) \ +/* What is the base-64 character of the bottom 6 bits of n? */ + +#define TO_BASE64(n) \ ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f]) -#define B64CHAR(c) \ - (isalnum(c) || (c) == '+' || (c) == '/') -#define UB64(c) \ - ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \ - (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 ) - -#define ENCODE(out, ch, bits) \ - while (bits >= 6) { \ - *out++ = B64(ch >> (bits-6)); \ - bits -= 6; \ - } - -#define DECODE(out, ch, bits, surrogate) \ - while (bits >= 16) { \ - Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \ - bits -= 16; \ - if (surrogate) { \ - /* We have already generated an error for the high surrogate \ - so let's not bother seeing if the low surrogate is correct or not */ \ - surrogate = 0; \ - } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \ - /* This is a surrogate pair. Unfortunately we can't represent \ - it in a 16-bit character */ \ - surrogate = 1; \ - errmsg = "code pairs are not supported"; \ - goto utf7Error; \ - } else { \ - *out++ = outCh; \ - } \ - } + +/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be + * decoded as itself. We are permissive on decoding; the only ASCII + * byte not decoding to itself is the + which begins a base64 + * string. */ + +#define DECODE_DIRECT(c) \ + ((c) <= 127 && (c) != '+') + +/* The UTF-7 encoder treats ASCII characters differently according to + * whether they are Set D, Set O, Whitespace, or special (i.e. none of + * the above). See RFC2152. This array identifies these different + * sets: + * 0 : "Set D" + * alphanumeric and '(),-./:? + * 1 : "Set O" + * !"#$%&*;<=>@[]^_`{|} + * 2 : "whitespace" + * ht nl cr sp + * 3 : special (must be base64 encoded) + * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127) + */ + +static +char utf7_category[128] = { +/* nul soh stx etx eot enq ack bel bs ht nl vt np cr so si */ + 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, +/* dle dc1 dc2 dc3 dc4 nak syn etb can em sub esc fs gs rs us */ + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, +/* sp ! " # $ % & ' ( ) * + , - . / */ + 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0, +/* 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, +/* @ A B C D E F G H I J K L M N O */ + 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +/* P Q R S T U V W X Y Z [ \ ] ^ _ */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1, +/* ` a b c d e f g h i j k l m n o */ + 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +/* p q r s t u v w x y z { | } ~ del */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3, +}; + +/* ENCODE_DIRECT: this character should be encoded as itself. The + * answer depends on whether we are encoding set O as itself, and also + * on whether we are encoding whitespace as itself. RFC2152 makes it + * clear that the answers to these questions vary between + * applications, so this code needs to be flexible. */ + +#define ENCODE_DIRECT(c, directO, directWS) \ + ((c) < 128 && (c) > 0 && \ + ((utf7_category[(c)] == 0) || \ + (directWS && (utf7_category[(c)] == 2)) || \ + (directO && (utf7_category[(c)] == 1)))) PyObject *PyUnicode_DecodeUTF7(const char *s, Py_ssize_t size, @@ -1479,6 +1562,13 @@ PyObject *PyUnicode_DecodeUTF7(const char *s, return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL); } +/* The decoder. The only state we preserve is our read position, + * i.e. how many characters we have consumed. So if we end in the + * middle of a shift sequence we have to back off the read position + * and the output to the beginning of the sequence, otherwise we lose + * all the shift state (seen bits, number of bits seen, high + * surrogate). */ + PyObject *PyUnicode_DecodeUTF7Stateful(const char *s, Py_ssize_t size, const char *errors, @@ -1493,9 +1583,10 @@ PyObject *PyUnicode_DecodeUTF7Stateful(const char *s, Py_UNICODE *p; const char *errmsg = ""; int inShift = 0; - unsigned int bitsleft = 0; - unsigned long charsleft = 0; - int surrogate = 0; + Py_UNICODE *shiftOutStart; + unsigned int base64bits = 0; + unsigned long base64buffer = 0; + Py_UNICODE surrogate = 0; PyObject *errorHandler = NULL; PyObject *exc = NULL; @@ -1509,79 +1600,103 @@ PyObject *PyUnicode_DecodeUTF7Stateful(const char *s, } p = unicode->str; + shiftOutStart = p; e = s + size; while (s < e) { - Py_UNICODE ch; - restart: - ch = (unsigned char) *s; + Py_UNICODE ch = (unsigned char) *s; - if (inShift) { - if ((ch == '-') || !B64CHAR(ch)) { - inShift = 0; + if (inShift) { /* in a base-64 section */ + if (IS_BASE64(ch)) { /* consume a base-64 character */ + base64buffer = (base64buffer << 6) | FROM_BASE64(ch); + base64bits += 6; s++; - - /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate); - if (bitsleft >= 6) { - /* The shift sequence has a partial character in it. If - bitsleft < 6 then we could just classify it as padding - but that is not the case here */ - - errmsg = "partial character in shift sequence"; - goto utf7Error; + if (base64bits >= 16) { + /* we have enough bits for a UTF-16 value */ + Py_UNICODE outCh = (Py_UNICODE) + (base64buffer >> (base64bits-16)); + base64bits -= 16; + base64buffer &= (1 << base64bits) - 1; /* clear high bits */ + if (surrogate) { + /* expecting a second surrogate */ + if (outCh >= 0xDC00 && outCh <= 0xDFFF) { +#ifdef Py_UNICODE_WIDE + *p++ = (((surrogate & 0x3FF)<<10) + | (outCh & 0x3FF)) + 0x10000; +#else + *p++ = surrogate; + *p++ = outCh; +#endif + surrogate = 0; + continue; + } + else { + *p++ = surrogate; + surrogate = 0; + } + } + if (outCh >= 0xD800 && outCh <= 0xDBFF) { + /* first surrogate */ + surrogate = outCh; + } + else { + *p++ = outCh; + } } - /* According to RFC2152 the remaining bits should be zero. We - choose to signal an error/insert a replacement character - here so indicate the potential of a misencoded character. */ - - /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */ - if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) { - errmsg = "non-zero padding bits in shift sequence"; - goto utf7Error; + } + else { /* now leaving a base-64 section */ + inShift = 0; + s++; + if (surrogate) { + *p++ = surrogate; + surrogate = 0; } - - if (ch == '-') { - if ((s < e) && (*(s) == '-')) { - *p++ = '-'; - inShift = 1; + if (base64bits > 0) { /* left-over bits */ + if (base64bits >= 6) { + /* We've seen at least one base-64 character */ + errmsg = "partial character in shift sequence"; + goto utf7Error; } - } else if (SPECIAL(ch,0,0)) { - errmsg = "unexpected special character"; - goto utf7Error; - } else { + else { + /* Some bits remain; they should be zero */ + if (base64buffer != 0) { + errmsg = "non-zero padding bits in shift sequence"; + goto utf7Error; + } + } + } + if (ch != '-') { + /* '-' is absorbed; other terminating + characters are preserved */ *p++ = ch; } - } else { - charsleft = (charsleft << 6) | UB64(ch); - bitsleft += 6; - s++; - /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate); } } else if ( ch == '+' ) { startinpos = s-starts; - s++; - if (s < e && *s == '-') { + s++; /* consume '+' */ + if (s < e && *s == '-') { /* '+-' encodes '+' */ s++; *p++ = '+'; - } else - { + } + else { /* begin base64-encoded section */ inShift = 1; - bitsleft = 0; + shiftOutStart = p; + base64bits = 0; } } - else if (SPECIAL(ch,0,0)) { - startinpos = s-starts; - errmsg = "unexpected special character"; + else if (DECODE_DIRECT(ch)) { /* character decodes as itself */ + *p++ = ch; s++; - goto utf7Error; } else { - *p++ = ch; + startinpos = s-starts; s++; + errmsg = "unexpected special character"; + goto utf7Error; } continue; - utf7Error: +utf7Error: outpos = p-PyUnicode_AS_UNICODE(unicode); endinpos = s-starts; if (unicode_decode_call_errorhandler( @@ -1592,23 +1707,33 @@ PyObject *PyUnicode_DecodeUTF7Stateful(const char *s, goto onError; } - if (inShift && !consumed) { - outpos = p-PyUnicode_AS_UNICODE(unicode); - endinpos = size; - if (unicode_decode_call_errorhandler( - errors, &errorHandler, - "utf7", "unterminated shift sequence", - starts, size, &startinpos, &endinpos, &exc, &s, - &unicode, &outpos, &p)) - goto onError; - if (s < e) - goto restart; + /* end of string */ + + if (inShift && !consumed) { /* in shift sequence, no more to follow */ + /* if we're in an inconsistent state, that's an error */ + if (surrogate || + (base64bits >= 6) || + (base64bits > 0 && base64buffer != 0)) { + outpos = p-PyUnicode_AS_UNICODE(unicode); + endinpos = size; + if (unicode_decode_call_errorhandler( + errors, &errorHandler, + "utf7", "unterminated shift sequence", + starts, size, &startinpos, &endinpos, &exc, &s, + &unicode, &outpos, &p)) + goto onError; + } } + + /* return state */ if (consumed) { - if(inShift) + if (inShift) { + p = shiftOutStart; /* back off output */ *consumed = startinpos; - else + } + else { *consumed = s-starts; + } } if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0) @@ -1628,27 +1753,27 @@ PyObject *PyUnicode_DecodeUTF7Stateful(const char *s, PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s, Py_ssize_t size, - int encodeSetO, - int encodeWhiteSpace, + int base64SetO, + int base64WhiteSpace, const char *errors) { PyObject *v; /* It might be possible to tighten this worst case */ - Py_ssize_t cbAllocated = 5 * size; + Py_ssize_t allocated = 8 * size; int inShift = 0; Py_ssize_t i = 0; - unsigned int bitsleft = 0; - unsigned long charsleft = 0; + unsigned int base64bits = 0; + unsigned long base64buffer = 0; char * out; char * start; - if (cbAllocated / 5 != size) + if (allocated / 8 != size) return PyErr_NoMemory(); if (size == 0) return PyString_FromStringAndSize(NULL, 0); - v = PyString_FromStringAndSize(NULL, cbAllocated); + v = PyString_FromStringAndSize(NULL, allocated); if (v == NULL) return NULL; @@ -1656,78 +1781,77 @@ PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s, for (;i < size; ++i) { Py_UNICODE ch = s[i]; - if (!inShift) { - if (ch == '+') { - *out++ = '+'; - *out++ = '-'; - } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) { - charsleft = ch; - bitsleft = 16; - *out++ = '+'; - /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft); - inShift = bitsleft > 0; - } else { - *out++ = (char) ch; - } - } else { - if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) { - *out++ = B64(charsleft << (6-bitsleft)); - charsleft = 0; - bitsleft = 0; + if (inShift) { + if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) { + /* shifting out */ + if (base64bits) { /* output remaining bits */ + *out++ = TO_BASE64(base64buffer << (6-base64bits)); + base64buffer = 0; + base64bits = 0; + } + inShift = 0; /* Characters not in the BASE64 set implicitly unshift the sequence so no '-' is required, except if the character is itself a '-' */ - if (B64CHAR(ch) || ch == '-') { + if (IS_BASE64(ch) || ch == '-') { *out++ = '-'; } - inShift = 0; *out++ = (char) ch; - } else { - bitsleft += 16; - charsleft = (charsleft << 16) | ch; - /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft); - - /* If the next character is special then we don't need to terminate - the shift sequence. If the next character is not a BASE64 character - or '-' then the shift sequence will be terminated implicitly and we - don't have to insert a '-'. */ - - if (bitsleft == 0) { - if (i + 1 < size) { - Py_UNICODE ch2 = s[i+1]; - - if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) { - - } else if (B64CHAR(ch2) || ch2 == '-') { - *out++ = '-'; - inShift = 0; - } else { - inShift = 0; - } - - } - else { + } + else { + goto encode_char; + } + } + else { /* not in a shift sequence */ + if (ch == '+') { + *out++ = '+'; *out++ = '-'; - inShift = 0; - } - } } + else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) { + *out++ = (char) ch; + } + else { + *out++ = '+'; + inShift = 1; + goto encode_char; + } + } + continue; +encode_char: +#ifdef Py_UNICODE_WIDE + if (ch >= 0x10000) { + /* code first surrogate */ + base64bits += 16; + base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10); + while (base64bits >= 6) { + *out++ = TO_BASE64(base64buffer >> (base64bits-6)); + base64bits -= 6; + } + /* prepare second surrogate */ + ch = 0xDC00 | ((ch-0x10000) & 0x3FF); + } +#endif + base64bits += 16; + base64buffer = (base64buffer << 16) | ch; + while (base64bits >= 6) { + *out++ = TO_BASE64(base64buffer >> (base64bits-6)); + base64bits -= 6; } } - if (bitsleft) { - *out++= B64(charsleft << (6-bitsleft) ); + if (base64bits) + *out++= TO_BASE64(base64buffer << (6-base64bits) ); + if (inShift) *out++ = '-'; - } - _PyString_Resize(&v, out - start); + if (_PyString_Resize(&v, out - start)) + return NULL; return v; } -#undef SPECIAL -#undef B64 -#undef B64CHAR -#undef UB64 -#undef ENCODE -#undef DECODE +#undef IS_BASE64 +#undef FROM_BASE64 +#undef TO_BASE64 +#undef DECODE_DIRECT +#undef ENCODE_DIRECT /* --- UTF-8 Codec -------------------------------------------------------- */ @@ -2033,7 +2157,8 @@ PyUnicode_EncodeUTF8(const Py_UNICODE *s, /* Cut back to size actually needed. */ nneeded = p - PyString_AS_STRING(v); assert(nneeded <= nallocated); - _PyString_Resize(&v, nneeded); + if (_PyString_Resize(&v, nneeded)) + return NULL; } return v; @@ -2077,10 +2202,11 @@ PyUnicode_DecodeUTF32Stateful(const char *s, Py_UNICODE *p; #ifndef Py_UNICODE_WIDE int pairs = 0; + const unsigned char *qq; #else const int pairs = 0; #endif - const unsigned char *q, *e, *qq; + const unsigned char *q, *e; int bo = 0; /* assume native ordering by default */ const char *errmsg = ""; /* Offsets from q for retrieving bytes in the right order. */ @@ -2757,16 +2883,7 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s, message = "malformed \\N character escape"; if (ucnhash_CAPI == NULL) { /* load the unicode data module */ - PyObject *m, *api; - m = PyImport_ImportModuleNoBlock("unicodedata"); - if (m == NULL) - goto ucnhashError; - api = PyObject_GetAttrString(m, "ucnhash_CAPI"); - Py_DECREF(m); - if (api == NULL) - goto ucnhashError; - ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api); - Py_DECREF(api); + ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1); if (ucnhash_CAPI == NULL) goto ucnhashError; } @@ -3004,7 +3121,8 @@ PyObject *unicodeescape_string(const Py_UNICODE *s, *p++ = PyString_AS_STRING(repr)[1]; *p = '\0'; - _PyString_Resize(&repr, p - PyString_AS_STRING(repr)); + if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr))) + return NULL; return repr; } @@ -3225,7 +3343,8 @@ PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, *p++ = (char) ch; } *p = '\0'; - _PyString_Resize(&repr, p - q); + if (_PyString_Resize(&repr, p - q)) + return NULL; return repr; } @@ -3564,9 +3683,9 @@ static PyObject *unicode_encode_ucs1(const Py_UNICODE *p, collstart-startp, collend-startp, &newpos); if (repunicode == NULL) goto onError; - /* need more space? (at least enough for what we - have+the replacement+the rest of the string, so - we won't have to check space for encodable characters) */ + /* need more space? (at least enough for what we have+the + replacement+the rest of the string, so we won't have to + check space for encodable characters) */ respos = str-PyString_AS_STRING(res); repsize = PyUnicode_GET_SIZE(repunicode); requiredsize = respos+repsize+(endp-collend); @@ -4217,7 +4336,7 @@ PyUnicode_BuildEncodingMap(PyObject* string) if (!result) return NULL; for (i = 0; i < 256; i++) { - key = value = NULL; + value = NULL; key = PyInt_FromLong(decode[i]); value = PyInt_FromLong(i); if (!key || !value) @@ -5041,11 +5160,10 @@ int PyUnicode_EncodeDecimal(Py_UNICODE *s, } /* All other characters are considered unencodable */ collstart = p; - collend = p+1; - while (collend < end) { + for (collend = p+1; collend < end; collend++) { if ((0 < *collend && *collend < 256) || - !Py_UNICODE_ISSPACE(*collend) || - Py_UNICODE_TODECIMAL(*collend)) + Py_UNICODE_ISSPACE(*collend) || + 0 <= Py_UNICODE_TODECIMAL(*collend)) break; } /* cache callback name lookup @@ -5124,27 +5242,27 @@ int PyUnicode_EncodeDecimal(Py_UNICODE *s, /* --- Helpers ------------------------------------------------------------ */ #include "stringlib/unicodedefs.h" - -#define FROM_UNICODE - #include "stringlib/fastsearch.h" #include "stringlib/count.h" #include "stringlib/find.h" #include "stringlib/partition.h" +#include "stringlib/split.h" /* helper macro to fixup start/end slice values */ -#define FIX_START_END(obj) \ - if (start < 0) \ - start += (obj)->length; \ - if (start < 0) \ - start = 0; \ - if (end > (obj)->length) \ - end = (obj)->length; \ - if (end < 0) \ - end += (obj)->length; \ - if (end < 0) \ - end = 0; +#define ADJUST_INDICES(start, end, len) \ + if (end > len) \ + end = len; \ + else if (end < 0) { \ + end += len; \ + if (end < 0) \ + end = 0; \ + } \ + if (start < 0) { \ + start += len; \ + if (start < 0) \ + start = 0; \ + } Py_ssize_t PyUnicode_Count(PyObject *str, PyObject *substr, @@ -5164,10 +5282,10 @@ Py_ssize_t PyUnicode_Count(PyObject *str, return -1; } - FIX_START_END(str_obj); - + ADJUST_INDICES(start, end, str_obj->length); result = stringlib_count( - str_obj->str + start, end - start, sub_obj->str, sub_obj->length + str_obj->str + start, end - start, sub_obj->str, sub_obj->length, + PY_SSIZE_T_MAX ); Py_DECREF(sub_obj); @@ -5222,8 +5340,7 @@ int tailmatch(PyUnicodeObject *self, if (substring->length == 0) return 1; - FIX_START_END(self); - + ADJUST_INDICES(start, end, self->length); end -= substring->length; if (end < start) return 0; @@ -5363,13 +5480,13 @@ int fixcapitalize(PyUnicodeObject *self) if (len == 0) return 0; - if (Py_UNICODE_ISLOWER(*s)) { + if (!Py_UNICODE_ISUPPER(*s)) { *s = Py_UNICODE_TOUPPER(*s); status = 1; } s++; while (--len > 0) { - if (Py_UNICODE_ISUPPER(*s)) { + if (!Py_UNICODE_ISLOWER(*s)) { *s = Py_UNICODE_TOLOWER(*s); status = 1; } @@ -5600,305 +5717,40 @@ PyUnicodeObject *pad(PyUnicodeObject *self, return u; } -#define SPLIT_APPEND(data, left, right) \ - str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \ - if (!str) \ - goto onError; \ - if (PyList_Append(list, str)) { \ - Py_DECREF(str); \ - goto onError; \ - } \ - else \ - Py_DECREF(str); - -static -PyObject *split_whitespace(PyUnicodeObject *self, - PyObject *list, - Py_ssize_t maxcount) +PyObject *PyUnicode_Splitlines(PyObject *string, int keepends) { - register Py_ssize_t i; - register Py_ssize_t j; - Py_ssize_t len = self->length; - PyObject *str; - register const Py_UNICODE *buf = self->str; - - for (i = j = 0; i < len; ) { - /* find a token */ - while (i < len && Py_UNICODE_ISSPACE(buf[i])) - i++; - j = i; - while (i < len && !Py_UNICODE_ISSPACE(buf[i])) - i++; - if (j < i) { - if (maxcount-- <= 0) - break; - SPLIT_APPEND(buf, j, i); - while (i < len && Py_UNICODE_ISSPACE(buf[i])) - i++; - j = i; - } - } - if (j < len) { - SPLIT_APPEND(buf, j, len); - } - return list; - - onError: - Py_DECREF(list); - return NULL; -} - -PyObject *PyUnicode_Splitlines(PyObject *string, - int keepends) -{ - register Py_ssize_t i; - register Py_ssize_t j; - Py_ssize_t len; PyObject *list; - PyObject *str; - Py_UNICODE *data; string = PyUnicode_FromObject(string); if (string == NULL) return NULL; - data = PyUnicode_AS_UNICODE(string); - len = PyUnicode_GET_SIZE(string); - - list = PyList_New(0); - if (!list) - goto onError; - for (i = j = 0; i < len; ) { - Py_ssize_t eol; + list = stringlib_splitlines( + (PyObject*) string, PyUnicode_AS_UNICODE(string), + PyUnicode_GET_SIZE(string), keepends); - /* Find a line and append it */ - while (i < len && !BLOOM_LINEBREAK(data[i])) - i++; - - /* Skip the line break reading CRLF as one line break */ - eol = i; - if (i < len) { - if (data[i] == '\r' && i + 1 < len && - data[i+1] == '\n') - i += 2; - else - i++; - if (keepends) - eol = i; - } - SPLIT_APPEND(data, j, eol); - j = i; - } - if (j < len) { - SPLIT_APPEND(data, j, len); - } - - Py_DECREF(string); - return list; - - onError: - Py_XDECREF(list); Py_DECREF(string); - return NULL; -} - -static -PyObject *split_char(PyUnicodeObject *self, - PyObject *list, - Py_UNICODE ch, - Py_ssize_t maxcount) -{ - register Py_ssize_t i; - register Py_ssize_t j; - Py_ssize_t len = self->length; - PyObject *str; - register const Py_UNICODE *buf = self->str; - - for (i = j = 0; i < len; ) { - if (buf[i] == ch) { - if (maxcount-- <= 0) - break; - SPLIT_APPEND(buf, j, i); - i = j = i + 1; - } else - i++; - } - if (j <= len) { - SPLIT_APPEND(buf, j, len); - } - return list; - - onError: - Py_DECREF(list); - return NULL; -} - -static -PyObject *split_substring(PyUnicodeObject *self, - PyObject *list, - PyUnicodeObject *substring, - Py_ssize_t maxcount) -{ - register Py_ssize_t i; - register Py_ssize_t j; - Py_ssize_t len = self->length; - Py_ssize_t sublen = substring->length; - PyObject *str; - - for (i = j = 0; i <= len - sublen; ) { - if (Py_UNICODE_MATCH(self, i, substring)) { - if (maxcount-- <= 0) - break; - SPLIT_APPEND(self->str, j, i); - i = j = i + sublen; - } else - i++; - } - if (j <= len) { - SPLIT_APPEND(self->str, j, len); - } return list; - - onError: - Py_DECREF(list); - return NULL; -} - -static -PyObject *rsplit_whitespace(PyUnicodeObject *self, - PyObject *list, - Py_ssize_t maxcount) -{ - register Py_ssize_t i; - register Py_ssize_t j; - Py_ssize_t len = self->length; - PyObject *str; - register const Py_UNICODE *buf = self->str; - - for (i = j = len - 1; i >= 0; ) { - /* find a token */ - while (i >= 0 && Py_UNICODE_ISSPACE(buf[i])) - i--; - j = i; - while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i])) - i--; - if (j > i) { - if (maxcount-- <= 0) - break; - SPLIT_APPEND(buf, i + 1, j + 1); - while (i >= 0 && Py_UNICODE_ISSPACE(buf[i])) - i--; - j = i; - } - } - if (j >= 0) { - SPLIT_APPEND(buf, 0, j + 1); - } - if (PyList_Reverse(list) < 0) - goto onError; - return list; - - onError: - Py_DECREF(list); - return NULL; -} - -static -PyObject *rsplit_char(PyUnicodeObject *self, - PyObject *list, - Py_UNICODE ch, - Py_ssize_t maxcount) -{ - register Py_ssize_t i; - register Py_ssize_t j; - Py_ssize_t len = self->length; - PyObject *str; - register const Py_UNICODE *buf = self->str; - - for (i = j = len - 1; i >= 0; ) { - if (buf[i] == ch) { - if (maxcount-- <= 0) - break; - SPLIT_APPEND(buf, i + 1, j + 1); - j = i = i - 1; - } else - i--; - } - if (j >= -1) { - SPLIT_APPEND(buf, 0, j + 1); - } - if (PyList_Reverse(list) < 0) - goto onError; - return list; - - onError: - Py_DECREF(list); - return NULL; -} - -static -PyObject *rsplit_substring(PyUnicodeObject *self, - PyObject *list, - PyUnicodeObject *substring, - Py_ssize_t maxcount) -{ - register Py_ssize_t i; - register Py_ssize_t j; - Py_ssize_t len = self->length; - Py_ssize_t sublen = substring->length; - PyObject *str; - - for (i = len - sublen, j = len; i >= 0; ) { - if (Py_UNICODE_MATCH(self, i, substring)) { - if (maxcount-- <= 0) - break; - SPLIT_APPEND(self->str, i + sublen, j); - j = i; - i -= sublen; - } else - i--; - } - if (j >= 0) { - SPLIT_APPEND(self->str, 0, j); - } - if (PyList_Reverse(list) < 0) - goto onError; - return list; - - onError: - Py_DECREF(list); - return NULL; } -#undef SPLIT_APPEND - static PyObject *split(PyUnicodeObject *self, PyUnicodeObject *substring, Py_ssize_t maxcount) { - PyObject *list; - if (maxcount < 0) maxcount = PY_SSIZE_T_MAX; - list = PyList_New(0); - if (!list) - return NULL; - if (substring == NULL) - return split_whitespace(self,list,maxcount); - - else if (substring->length == 1) - return split_char(self,list,substring->str[0],maxcount); + return stringlib_split_whitespace( + (PyObject*) self, self->str, self->length, maxcount + ); - else if (substring->length == 0) { - Py_DECREF(list); - PyErr_SetString(PyExc_ValueError, "empty separator"); - return NULL; - } - else - return split_substring(self,list,substring,maxcount); + return stringlib_split( + (PyObject*) self, self->str, self->length, + substring->str, substring->length, + maxcount + ); } static @@ -5906,28 +5758,19 @@ PyObject *rsplit(PyUnicodeObject *self, PyUnicodeObject *substring, Py_ssize_t maxcount) { - PyObject *list; - if (maxcount < 0) maxcount = PY_SSIZE_T_MAX; - list = PyList_New(0); - if (!list) - return NULL; - if (substring == NULL) - return rsplit_whitespace(self,list,maxcount); - - else if (substring->length == 1) - return rsplit_char(self,list,substring->str[0],maxcount); + return stringlib_rsplit_whitespace( + (PyObject*) self, self->str, self->length, maxcount + ); - else if (substring->length == 0) { - Py_DECREF(list); - PyErr_SetString(PyExc_ValueError, "empty separator"); - return NULL; - } - else - return rsplit_substring(self,list,substring,maxcount); + return stringlib_rsplit( + (PyObject*) self, self->str, self->length, + substring->str, substring->length, + maxcount + ); } static @@ -5940,10 +5783,14 @@ PyObject *replace(PyUnicodeObject *self, if (maxcount < 0) maxcount = PY_SSIZE_T_MAX; + else if (maxcount == 0 || self->length == 0) + goto nothing; if (str1->length == str2->length) { - /* same length */ Py_ssize_t i; + /* same length */ + if (str1->length == 0) + goto nothing; if (str1->length == 1) { /* replace characters */ Py_UNICODE u1, u2; @@ -5962,8 +5809,8 @@ PyObject *replace(PyUnicodeObject *self, u->str[i] = u2; } } else { - i = fastsearch( - self->str, self->length, str1->str, str1->length, FAST_SEARCH + i = stringlib_find( + self->str, self->length, str1->str, str1->length, 0 ); if (i < 0) goto nothing; @@ -5971,25 +5818,30 @@ PyObject *replace(PyUnicodeObject *self, if (!u) return NULL; Py_UNICODE_COPY(u->str, self->str, self->length); - while (i <= self->length - str1->length) - if (Py_UNICODE_MATCH(self, i, str1)) { - if (--maxcount < 0) - break; - Py_UNICODE_COPY(u->str+i, str2->str, str2->length); - i += str1->length; - } else - i++; + + /* change everything in-place, starting with this one */ + Py_UNICODE_COPY(u->str+i, str2->str, str2->length); + i += str1->length; + + while ( --maxcount > 0) { + i = stringlib_find(self->str+i, self->length-i, + str1->str, str1->length, + i); + if (i == -1) + break; + Py_UNICODE_COPY(u->str+i, str2->str, str2->length); + i += str1->length; + } } } else { - Py_ssize_t n, i, j, e; + Py_ssize_t n, i, j; Py_ssize_t product, new_size, delta; Py_UNICODE *p; /* replace strings */ - n = stringlib_count(self->str, self->length, str1->str, str1->length); - if (n > maxcount) - n = maxcount; + n = stringlib_count(self->str, self->length, str1->str, str1->length, + maxcount); if (n == 0) goto nothing; /* new_size = self->length + n * (str2->length - str1->length)); */ @@ -6015,19 +5867,15 @@ PyObject *replace(PyUnicodeObject *self, return NULL; i = 0; p = u->str; - e = self->length - str1->length; if (str1->length > 0) { while (n-- > 0) { /* look for next match */ - j = i; - while (j <= e) { - if (Py_UNICODE_MATCH(self, j, str1)) - break; - j++; - } - if (j > i) { - if (j > e) - break; + j = stringlib_find(self->str+i, self->length-i, + str1->str, str1->length, + i); + if (j == -1) + break; + else if (j > i) { /* copy unchanged part [i:j] */ Py_UNICODE_COPY(p, self->str+i, j-i); p += j - i; @@ -6083,7 +5931,7 @@ PyDoc_STRVAR(capitalize__doc__, "S.capitalize() -> unicode\n\ \n\ Return a capitalized version of S, i.e. make the first character\n\ -have upper case."); +have upper case and the rest lower case."); static PyObject* unicode_capitalize(PyUnicodeObject *self) @@ -6381,8 +6229,6 @@ int PyUnicode_Contains(PyObject *container, /* Coerce the two arguments */ sub = PyUnicode_FromObject(element); if (!sub) { - PyErr_SetString(PyExc_TypeError, - "'in <string>' requires string as left operand"); return -1; } @@ -6457,20 +6303,15 @@ unicode_count(PyUnicodeObject *self, PyObject *args) Py_ssize_t end = PY_SSIZE_T_MAX; PyObject *result; - if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring, - _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) - return NULL; - - substring = (PyUnicodeObject *)PyUnicode_FromObject( - (PyObject *)substring); - if (substring == NULL) + if (!stringlib_parse_args_finds_unicode("count", args, &substring, + &start, &end)) return NULL; - FIX_START_END(self); - + ADJUST_INDICES(start, end, self->length); result = PyInt_FromSsize_t( stringlib_count(self->str + start, end - start, - substring->str, substring->length) + substring->str, substring->length, + PY_SSIZE_T_MAX) ); Py_DECREF(substring); @@ -6489,13 +6330,15 @@ a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\ codecs.register_error that can handle UnicodeEncodeErrors."); static PyObject * -unicode_encode(PyUnicodeObject *self, PyObject *args) +unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs) { + static char *kwlist[] = {"encoding", "errors", 0}; char *encoding = NULL; char *errors = NULL; PyObject *v; - if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors)) + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode", + kwlist, &encoding, &errors)) return NULL; v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors); if (v == NULL) @@ -6521,17 +6364,19 @@ Decodes S using the codec registered for encoding. encoding defaults\n\ to the default encoding. errors may be given to set a different error\n\ handling scheme. Default is 'strict' meaning that encoding errors raise\n\ a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\ -as well as any other name registerd with codecs.register_error that is\n\ +as well as any other name registered with codecs.register_error that is\n\ able to handle UnicodeDecodeErrors."); static PyObject * -unicode_decode(PyUnicodeObject *self, PyObject *args) +unicode_decode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs) { + static char *kwlist[] = {"encoding", "errors", 0}; char *encoding = NULL; char *errors = NULL; PyObject *v; - if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors)) + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:decode", + kwlist, &encoding, &errors)) return NULL; v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors); if (v == NULL) @@ -6641,7 +6486,7 @@ PyDoc_STRVAR(find__doc__, "S.find(sub [,start [,end]]) -> int\n\ \n\ Return the lowest index in S where substring sub is found,\n\ -such that sub is contained within s[start:end]. Optional\n\ +such that sub is contained within S[start:end]. Optional\n\ arguments start and end are interpreted as in slice notation.\n\ \n\ Return -1 on failure."); @@ -6649,12 +6494,13 @@ Return -1 on failure."); static PyObject * unicode_find(PyUnicodeObject *self, PyObject *args) { - PyObject *substring; + PyUnicodeObject *substring; Py_ssize_t start; Py_ssize_t end; Py_ssize_t result; - if (!_ParseTupleFinds(args, &substring, &start, &end)) + if (!stringlib_parse_args_finds_unicode("find", args, &substring, + &start, &end)) return NULL; result = stringlib_find_slice( @@ -6726,11 +6572,12 @@ static PyObject * unicode_index(PyUnicodeObject *self, PyObject *args) { Py_ssize_t result; - PyObject *substring; + PyUnicodeObject *substring; Py_ssize_t start; Py_ssize_t end; - if (!_ParseTupleFinds(args, &substring, &start, &end)) + if (!stringlib_parse_args_finds_unicode("index", args, &substring, + &start, &end)) return NULL; result = stringlib_find_slice( @@ -7385,7 +7232,7 @@ PyDoc_STRVAR(rfind__doc__, "S.rfind(sub [,start [,end]]) -> int\n\ \n\ Return the highest index in S where substring sub is found,\n\ -such that sub is contained within s[start:end]. Optional\n\ +such that sub is contained within S[start:end]. Optional\n\ arguments start and end are interpreted as in slice notation.\n\ \n\ Return -1 on failure."); @@ -7393,12 +7240,13 @@ Return -1 on failure."); static PyObject * unicode_rfind(PyUnicodeObject *self, PyObject *args) { - PyObject *substring; + PyUnicodeObject *substring; Py_ssize_t start; Py_ssize_t end; Py_ssize_t result; - if (!_ParseTupleFinds(args, &substring, &start, &end)) + if (!stringlib_parse_args_finds_unicode("rfind", args, &substring, + &start, &end)) return NULL; result = stringlib_rfind_slice( @@ -7420,12 +7268,13 @@ Like S.rfind() but raise ValueError when the substring is not found."); static PyObject * unicode_rindex(PyUnicodeObject *self, PyObject *args) { - PyObject *substring; + PyUnicodeObject *substring; Py_ssize_t start; Py_ssize_t end; Py_ssize_t result; - if (!_ParseTupleFinds(args, &substring, &start, &end)) + if (!stringlib_parse_args_finds_unicode("rindex", args, &substring, + &start, &end)) return NULL; result = stringlib_rfind_slice( @@ -7804,8 +7653,7 @@ unicode_startswith(PyUnicodeObject *self, Py_ssize_t end = PY_SSIZE_T_MAX; int result; - if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj, - _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) + if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end)) return NULL; if (PyTuple_Check(subobj)) { Py_ssize_t i; @@ -7824,8 +7672,12 @@ unicode_startswith(PyUnicodeObject *self, Py_RETURN_FALSE; } substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj); - if (substring == NULL) + if (substring == NULL) { + if (PyErr_ExceptionMatches(PyExc_TypeError)) + PyErr_Format(PyExc_TypeError, "startswith first arg must be str, " + "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name); return NULL; + } result = tailmatch(self, substring, start, end, -1); Py_DECREF(substring); return PyBool_FromLong(result); @@ -7850,8 +7702,7 @@ unicode_endswith(PyUnicodeObject *self, Py_ssize_t end = PY_SSIZE_T_MAX; int result; - if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj, - _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) + if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end)) return NULL; if (PyTuple_Check(subobj)) { Py_ssize_t i; @@ -7869,9 +7720,12 @@ unicode_endswith(PyUnicodeObject *self, Py_RETURN_FALSE; } substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj); - if (substring == NULL) + if (substring == NULL) { + if (PyErr_ExceptionMatches(PyExc_TypeError)) + PyErr_Format(PyExc_TypeError, "endswith first arg must be str, " + "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name); return NULL; - + } result = tailmatch(self, substring, start, end, +1); Py_DECREF(substring); return PyBool_FromLong(result); @@ -7884,7 +7738,8 @@ unicode_endswith(PyUnicodeObject *self, PyDoc_STRVAR(format__doc__, "S.format(*args, **kwargs) -> unicode\n\ \n\ -"); +Return a formatted version of S, using substitutions from args and kwargs.\n\ +The substitutions are identified by braces ('{' and '}')."); static PyObject * unicode__format__(PyObject *self, PyObject *args) @@ -7918,7 +7773,7 @@ unicode__format__(PyObject *self, PyObject *args) PyDoc_STRVAR(p_format__doc__, "S.__format__(format_spec) -> unicode\n\ \n\ -"); +Return a formatted version of S as described by format_spec."); static PyObject * unicode__sizeof__(PyUnicodeObject *v) @@ -7944,7 +7799,7 @@ static PyMethodDef unicode_methods[] = { /* Order is according to common usage: often used methods should appear first, since lookup is done sequentially. */ - {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__}, + {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__}, {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__}, {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__}, {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__}, @@ -7960,7 +7815,7 @@ static PyMethodDef unicode_methods[] = { {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__}, {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__}, {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__}, - {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__}, + {"decode", (PyCFunction) unicode_decode, METH_VARARGS | METH_KEYWORDS, decode__doc__}, /* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */ {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__}, {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__}, @@ -8175,16 +8030,6 @@ strtounicode(Py_UNICODE *buffer, const char *charbuffer) } static int -doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x) -{ - Py_ssize_t result; - - PyOS_ascii_formatd((char *)buffer, len, format, x); - result = strtounicode(buffer, (char *)buffer); - return Py_SAFE_DOWNCAST(result, Py_ssize_t, int); -} - -static int longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x) { Py_ssize_t result; @@ -8198,64 +8043,29 @@ longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x) shared with stringobject.c, converting from 8-bit to Unicode after the formatting is done. */ -static int -formatfloat(Py_UNICODE *buf, - size_t buflen, - int flags, - int prec, - int type, - PyObject *v) -{ - /* fmt = '%#.' + `prec` + `type` - worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/ - char fmt[20]; +/* Returns a new reference to a PyUnicode object, or NULL on failure. */ + +static PyObject * +formatfloat(PyObject *v, int flags, int prec, int type) +{ + char *p; + PyObject *result; double x; x = PyFloat_AsDouble(v); if (x == -1.0 && PyErr_Occurred()) - return -1; + return NULL; + if (prec < 0) prec = 6; -#if SIZEOF_INT > 4 - /* make sure that the decimal representation of precision really does - need at most 10 digits: platforms with sizeof(int) == 8 exist! */ - if (prec > 0x7fffffff) { - PyErr_SetString(PyExc_OverflowError, - "outrageously large precision " - "for formatted float"); - return -1; - } -#endif - - if (type == 'f' && fabs(x) >= 1e50) - type = 'g'; - /* Worst case length calc to ensure no buffer overrun: - 'g' formats: - fmt = %#.<prec>g - buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp - for any double rep.) - len = 1 + prec + 1 + 2 + 5 = 9 + prec - - 'f' formats: - buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50) - len = 1 + 50 + 1 + prec = 52 + prec - - If prec=0 the effective precision is 1 (the leading digit is - always given), therefore increase the length by one. - - */ - if (((type == 'g' || type == 'G') && - buflen <= (size_t)10 + (size_t)prec) || - (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) { - PyErr_SetString(PyExc_OverflowError, - "formatted float is too long (precision too large?)"); - return -1; - } - PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c", - (flags&F_ALT) ? "#" : "", - prec, type); - return doubletounicode(buf, buflen, fmt, x); + p = PyOS_double_to_string(x, type, prec, + (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL); + if (p == NULL) + return NULL; + result = PyUnicode_FromStringAndSize(p, strlen(p)); + PyMem_Free(p); + return result; } static PyObject* @@ -8425,7 +8235,7 @@ formatchar(Py_UNICODE *buf, /* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...) - FORMATBUFLEN is the length of the buffer in which the floats, ints, & + FORMATBUFLEN is the length of the buffer in which the ints & chars are formatted. XXX This is a magic number. Each formatting routine does bounds checking to ensure no overflow, but a better solution may be to malloc a buffer of appropriate size for each @@ -8496,7 +8306,7 @@ PyObject *PyUnicode_Format(PyObject *format, Py_UNICODE *pbuf; Py_UNICODE sign; Py_ssize_t len; - Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */ + Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{int,char}() */ fmt++; if (*fmt == '(') { @@ -8757,13 +8567,11 @@ PyObject *PyUnicode_Format(PyObject *format, case 'F': case 'g': case 'G': - if (c == 'F') - c = 'f'; - pbuf = formatbuf; - len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), - flags, prec, c, v); - if (len < 0) + temp = formatfloat(v, flags, prec, c); + if (temp == NULL) goto onError; + pbuf = PyUnicode_AS_UNICODE(temp); + len = PyUnicode_GET_SIZE(temp); sign = 1; if (flags & F_ZERO) fill = '0'; @@ -9084,11 +8892,3 @@ _PyUnicode_Fini(void) #ifdef __cplusplus } #endif - - -/* - Local variables: - c-basic-offset: 4 - indent-tabs-mode: nil - End: -*/ |
