summaryrefslogtreecommitdiffstats
path: root/Objects/unicodeobject.c
diff options
context:
space:
mode:
Diffstat (limited to 'Objects/unicodeobject.c')
-rw-r--r--Objects/unicodeobject.c1144
1 files changed, 472 insertions, 672 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 1df38e8..4fb1aed 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -115,9 +115,9 @@ static char unicode_default_encoding[100];
/* Fast detection of the most frequent whitespace characters */
const unsigned char _Py_ascii_whitespace[] = {
0, 0, 0, 0, 0, 0, 0, 0,
-/* case 0x0009: * HORIZONTAL TABULATION */
+/* case 0x0009: * CHARACTER TABULATION */
/* case 0x000A: * LINE FEED */
-/* case 0x000B: * VERTICAL TABULATION */
+/* case 0x000B: * LINE TABULATION */
/* case 0x000C: * FORM FEED */
/* case 0x000D: * CARRIAGE RETURN */
0, 1, 1, 1, 1, 1, 0, 0,
@@ -147,8 +147,10 @@ const unsigned char _Py_ascii_whitespace[] = {
static unsigned char ascii_linebreak[] = {
0, 0, 0, 0, 0, 0, 0, 0,
/* 0x000A, * LINE FEED */
+/* 0x000B, * LINE TABULATION */
+/* 0x000C, * FORM FEED */
/* 0x000D, * CARRIAGE RETURN */
- 0, 0, 1, 0, 0, 1, 0, 0,
+ 0, 0, 1, 1, 1, 1, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
/* 0x001C, * FILE SEPARATOR */
/* 0x001D, * GROUP SEPARATOR */
@@ -190,11 +192,22 @@ PyUnicode_GetMax(void)
/* the linebreak mask is set up by Unicode_Init below */
+#if LONG_BIT >= 128
+#define BLOOM_WIDTH 128
+#elif LONG_BIT >= 64
+#define BLOOM_WIDTH 64
+#elif LONG_BIT >= 32
+#define BLOOM_WIDTH 32
+#else
+#error "LONG_BIT is smaller than 32"
+#endif
+
#define BLOOM_MASK unsigned long
static BLOOM_MASK bloom_linebreak;
-#define BLOOM(mask, ch) ((mask & (1 << ((ch) & 0x1F))))
+#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
+#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
#define BLOOM_LINEBREAK(ch) \
((ch) < 128U ? ascii_linebreak[(ch)] : \
@@ -204,12 +217,12 @@ Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
{
/* calculate simple bloom-style bitmask for a given unicode string */
- long mask;
+ BLOOM_MASK mask;
Py_ssize_t i;
mask = 0;
for (i = 0; i < len; i++)
- mask |= (1 << (ptr[i] & 0x1F));
+ BLOOM_ADD(mask, ptr[i]);
return mask;
}
@@ -280,7 +293,7 @@ int unicode_resize(register PyUnicodeObject *unicode,
}
/* We allocate one more byte to make sure the string is
- Ux0000 terminated -- XXX is this needed ?
+ Ux0000 terminated; some code relies on that.
XXX This allocator could further be enhanced by assuring that the
free list never reduces its size below 1.
@@ -527,6 +540,60 @@ PyObject *PyUnicode_FromString(const char *u)
#ifdef HAVE_WCHAR_H
+#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
+# define CONVERT_WCHAR_TO_SURROGATES
+#endif
+
+#ifdef CONVERT_WCHAR_TO_SURROGATES
+
+/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
+ to convert from UTF32 to UTF16. */
+
+PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
+ Py_ssize_t size)
+{
+ PyUnicodeObject *unicode;
+ register Py_ssize_t i;
+ Py_ssize_t alloc;
+ const wchar_t *orig_w;
+
+ if (w == NULL) {
+ PyErr_BadInternalCall();
+ return NULL;
+ }
+
+ alloc = size;
+ orig_w = w;
+ for (i = size; i > 0; i--) {
+ if (*w > 0xFFFF)
+ alloc++;
+ w++;
+ }
+ w = orig_w;
+ unicode = _PyUnicode_New(alloc);
+ if (!unicode)
+ return NULL;
+
+ /* Copy the wchar_t data into the new object */
+ {
+ register Py_UNICODE *u;
+ u = PyUnicode_AS_UNICODE(unicode);
+ for (i = size; i > 0; i--) {
+ if (*w > 0xFFFF) {
+ wchar_t ordinal = *w++;
+ ordinal -= 0x10000;
+ *u++ = 0xD800 | (ordinal >> 10);
+ *u++ = 0xDC00 | (ordinal & 0x3FF);
+ }
+ else
+ *u++ = *w++;
+ }
+ }
+ return (PyObject *)unicode;
+}
+
+#else
+
PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Py_ssize_t size)
{
@@ -557,6 +624,10 @@ PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
return (PyObject *)unicode;
}
+#endif /* CONVERT_WCHAR_TO_SURROGATES */
+
+#undef CONVERT_WCHAR_TO_SURROGATES
+
static void
makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
{
@@ -681,7 +752,7 @@ PyUnicode_FromFormatV(const char *format, va_list vargs)
case 's':
{
/* UTF-8 */
- unsigned char *s = va_arg(count, unsigned char*);
+ const char *s = va_arg(count, const char*);
PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
if (!str)
goto fail;
@@ -1408,69 +1479,81 @@ int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler
/* --- UTF-7 Codec -------------------------------------------------------- */
-/* see RFC2152 for details */
+/* See RFC2152 for details. We encode conservatively and decode liberally. */
-static
-char utf7_special[128] = {
- /* indicate whether a UTF-7 character is special i.e. cannot be directly
- encoded:
- 0 - not special
- 1 - special
- 2 - whitespace (optional)
- 3 - RFC2152 Set O (optional) */
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
- 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
- 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
+/* Three simple macros defining base-64. */
-};
+/* Is c a base-64 character? */
+
+#define IS_BASE64(c) \
+ (isalnum(c) || (c) == '+' || (c) == '/')
-/* Note: The comparison (c) <= 0 is a trick to work-around gcc
- warnings about the comparison always being false; since
- utf7_special[0] is 1, we can safely make that one comparison
- true */
+/* given that c is a base-64 character, what is its base-64 value? */
-#define SPECIAL(c, encodeO, encodeWS) \
- ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
- (encodeWS && (utf7_special[(c)] == 2)) || \
- (encodeO && (utf7_special[(c)] == 3)))
+#define FROM_BASE64(c) \
+ (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \
+ ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \
+ ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \
+ (c) == '+' ? 62 : 63)
-#define B64(n) \
+/* What is the base-64 character of the bottom 6 bits of n? */
+
+#define TO_BASE64(n) \
("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
-#define B64CHAR(c) \
- (isalnum(c) || (c) == '+' || (c) == '/')
-#define UB64(c) \
- ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
- (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )
-
-#define ENCODE(out, ch, bits) \
- while (bits >= 6) { \
- *out++ = B64(ch >> (bits-6)); \
- bits -= 6; \
- }
-
-#define DECODE(out, ch, bits, surrogate) \
- while (bits >= 16) { \
- Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
- bits -= 16; \
- if (surrogate) { \
- /* We have already generated an error for the high surrogate \
- so let's not bother seeing if the low surrogate is correct or not */ \
- surrogate = 0; \
- } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
- /* This is a surrogate pair. Unfortunately we can't represent \
- it in a 16-bit character */ \
- surrogate = 1; \
- errmsg = "code pairs are not supported"; \
- goto utf7Error; \
- } else { \
- *out++ = outCh; \
- } \
- }
+
+/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
+ * decoded as itself. We are permissive on decoding; the only ASCII
+ * byte not decoding to itself is the + which begins a base64
+ * string. */
+
+#define DECODE_DIRECT(c) \
+ ((c) <= 127 && (c) != '+')
+
+/* The UTF-7 encoder treats ASCII characters differently according to
+ * whether they are Set D, Set O, Whitespace, or special (i.e. none of
+ * the above). See RFC2152. This array identifies these different
+ * sets:
+ * 0 : "Set D"
+ * alphanumeric and '(),-./:?
+ * 1 : "Set O"
+ * !"#$%&*;<=>@[]^_`{|}
+ * 2 : "whitespace"
+ * ht nl cr sp
+ * 3 : special (must be base64 encoded)
+ * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
+ */
+
+static
+char utf7_category[128] = {
+/* nul soh stx etx eot enq ack bel bs ht nl vt np cr so si */
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
+/* dle dc1 dc2 dc3 dc4 nak syn etb can em sub esc fs gs rs us */
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+/* sp ! " # $ % & ' ( ) * + , - . / */
+ 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0,
+/* 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
+/* @ A B C D E F G H I J K L M N O */
+ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+/* P Q R S T U V W X Y Z [ \ ] ^ _ */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
+/* ` a b c d e f g h i j k l m n o */
+ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+/* p q r s t u v w x y z { | } ~ del */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3,
+};
+
+/* ENCODE_DIRECT: this character should be encoded as itself. The
+ * answer depends on whether we are encoding set O as itself, and also
+ * on whether we are encoding whitespace as itself. RFC2152 makes it
+ * clear that the answers to these questions vary between
+ * applications, so this code needs to be flexible. */
+
+#define ENCODE_DIRECT(c, directO, directWS) \
+ ((c) < 128 && (c) > 0 && \
+ ((utf7_category[(c)] == 0) || \
+ (directWS && (utf7_category[(c)] == 2)) || \
+ (directO && (utf7_category[(c)] == 1))))
PyObject *PyUnicode_DecodeUTF7(const char *s,
Py_ssize_t size,
@@ -1479,6 +1562,13 @@ PyObject *PyUnicode_DecodeUTF7(const char *s,
return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
}
+/* The decoder. The only state we preserve is our read position,
+ * i.e. how many characters we have consumed. So if we end in the
+ * middle of a shift sequence we have to back off the read position
+ * and the output to the beginning of the sequence, otherwise we lose
+ * all the shift state (seen bits, number of bits seen, high
+ * surrogate). */
+
PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Py_ssize_t size,
const char *errors,
@@ -1493,9 +1583,10 @@ PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Py_UNICODE *p;
const char *errmsg = "";
int inShift = 0;
- unsigned int bitsleft = 0;
- unsigned long charsleft = 0;
- int surrogate = 0;
+ Py_UNICODE *shiftOutStart;
+ unsigned int base64bits = 0;
+ unsigned long base64buffer = 0;
+ Py_UNICODE surrogate = 0;
PyObject *errorHandler = NULL;
PyObject *exc = NULL;
@@ -1509,79 +1600,103 @@ PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
}
p = unicode->str;
+ shiftOutStart = p;
e = s + size;
while (s < e) {
- Py_UNICODE ch;
- restart:
- ch = (unsigned char) *s;
+ Py_UNICODE ch = (unsigned char) *s;
- if (inShift) {
- if ((ch == '-') || !B64CHAR(ch)) {
- inShift = 0;
+ if (inShift) { /* in a base-64 section */
+ if (IS_BASE64(ch)) { /* consume a base-64 character */
+ base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
+ base64bits += 6;
s++;
-
- /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
- if (bitsleft >= 6) {
- /* The shift sequence has a partial character in it. If
- bitsleft < 6 then we could just classify it as padding
- but that is not the case here */
-
- errmsg = "partial character in shift sequence";
- goto utf7Error;
+ if (base64bits >= 16) {
+ /* we have enough bits for a UTF-16 value */
+ Py_UNICODE outCh = (Py_UNICODE)
+ (base64buffer >> (base64bits-16));
+ base64bits -= 16;
+ base64buffer &= (1 << base64bits) - 1; /* clear high bits */
+ if (surrogate) {
+ /* expecting a second surrogate */
+ if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
+#ifdef Py_UNICODE_WIDE
+ *p++ = (((surrogate & 0x3FF)<<10)
+ | (outCh & 0x3FF)) + 0x10000;
+#else
+ *p++ = surrogate;
+ *p++ = outCh;
+#endif
+ surrogate = 0;
+ continue;
+ }
+ else {
+ *p++ = surrogate;
+ surrogate = 0;
+ }
+ }
+ if (outCh >= 0xD800 && outCh <= 0xDBFF) {
+ /* first surrogate */
+ surrogate = outCh;
+ }
+ else {
+ *p++ = outCh;
+ }
}
- /* According to RFC2152 the remaining bits should be zero. We
- choose to signal an error/insert a replacement character
- here so indicate the potential of a misencoded character. */
-
- /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
- if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
- errmsg = "non-zero padding bits in shift sequence";
- goto utf7Error;
+ }
+ else { /* now leaving a base-64 section */
+ inShift = 0;
+ s++;
+ if (surrogate) {
+ *p++ = surrogate;
+ surrogate = 0;
}
-
- if (ch == '-') {
- if ((s < e) && (*(s) == '-')) {
- *p++ = '-';
- inShift = 1;
+ if (base64bits > 0) { /* left-over bits */
+ if (base64bits >= 6) {
+ /* We've seen at least one base-64 character */
+ errmsg = "partial character in shift sequence";
+ goto utf7Error;
}
- } else if (SPECIAL(ch,0,0)) {
- errmsg = "unexpected special character";
- goto utf7Error;
- } else {
+ else {
+ /* Some bits remain; they should be zero */
+ if (base64buffer != 0) {
+ errmsg = "non-zero padding bits in shift sequence";
+ goto utf7Error;
+ }
+ }
+ }
+ if (ch != '-') {
+ /* '-' is absorbed; other terminating
+ characters are preserved */
*p++ = ch;
}
- } else {
- charsleft = (charsleft << 6) | UB64(ch);
- bitsleft += 6;
- s++;
- /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
}
}
else if ( ch == '+' ) {
startinpos = s-starts;
- s++;
- if (s < e && *s == '-') {
+ s++; /* consume '+' */
+ if (s < e && *s == '-') { /* '+-' encodes '+' */
s++;
*p++ = '+';
- } else
- {
+ }
+ else { /* begin base64-encoded section */
inShift = 1;
- bitsleft = 0;
+ shiftOutStart = p;
+ base64bits = 0;
}
}
- else if (SPECIAL(ch,0,0)) {
- startinpos = s-starts;
- errmsg = "unexpected special character";
+ else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
+ *p++ = ch;
s++;
- goto utf7Error;
}
else {
- *p++ = ch;
+ startinpos = s-starts;
s++;
+ errmsg = "unexpected special character";
+ goto utf7Error;
}
continue;
- utf7Error:
+utf7Error:
outpos = p-PyUnicode_AS_UNICODE(unicode);
endinpos = s-starts;
if (unicode_decode_call_errorhandler(
@@ -1592,23 +1707,33 @@ PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
goto onError;
}
- if (inShift && !consumed) {
- outpos = p-PyUnicode_AS_UNICODE(unicode);
- endinpos = size;
- if (unicode_decode_call_errorhandler(
- errors, &errorHandler,
- "utf7", "unterminated shift sequence",
- starts, size, &startinpos, &endinpos, &exc, &s,
- &unicode, &outpos, &p))
- goto onError;
- if (s < e)
- goto restart;
+ /* end of string */
+
+ if (inShift && !consumed) { /* in shift sequence, no more to follow */
+ /* if we're in an inconsistent state, that's an error */
+ if (surrogate ||
+ (base64bits >= 6) ||
+ (base64bits > 0 && base64buffer != 0)) {
+ outpos = p-PyUnicode_AS_UNICODE(unicode);
+ endinpos = size;
+ if (unicode_decode_call_errorhandler(
+ errors, &errorHandler,
+ "utf7", "unterminated shift sequence",
+ starts, size, &startinpos, &endinpos, &exc, &s,
+ &unicode, &outpos, &p))
+ goto onError;
+ }
}
+
+ /* return state */
if (consumed) {
- if(inShift)
+ if (inShift) {
+ p = shiftOutStart; /* back off output */
*consumed = startinpos;
- else
+ }
+ else {
*consumed = s-starts;
+ }
}
if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
@@ -1628,27 +1753,27 @@ PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Py_ssize_t size,
- int encodeSetO,
- int encodeWhiteSpace,
+ int base64SetO,
+ int base64WhiteSpace,
const char *errors)
{
PyObject *v;
/* It might be possible to tighten this worst case */
- Py_ssize_t cbAllocated = 5 * size;
+ Py_ssize_t allocated = 8 * size;
int inShift = 0;
Py_ssize_t i = 0;
- unsigned int bitsleft = 0;
- unsigned long charsleft = 0;
+ unsigned int base64bits = 0;
+ unsigned long base64buffer = 0;
char * out;
char * start;
- if (cbAllocated / 5 != size)
+ if (allocated / 8 != size)
return PyErr_NoMemory();
if (size == 0)
return PyString_FromStringAndSize(NULL, 0);
- v = PyString_FromStringAndSize(NULL, cbAllocated);
+ v = PyString_FromStringAndSize(NULL, allocated);
if (v == NULL)
return NULL;
@@ -1656,78 +1781,77 @@ PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
for (;i < size; ++i) {
Py_UNICODE ch = s[i];
- if (!inShift) {
- if (ch == '+') {
- *out++ = '+';
- *out++ = '-';
- } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
- charsleft = ch;
- bitsleft = 16;
- *out++ = '+';
- /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
- inShift = bitsleft > 0;
- } else {
- *out++ = (char) ch;
- }
- } else {
- if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
- *out++ = B64(charsleft << (6-bitsleft));
- charsleft = 0;
- bitsleft = 0;
+ if (inShift) {
+ if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
+ /* shifting out */
+ if (base64bits) { /* output remaining bits */
+ *out++ = TO_BASE64(base64buffer << (6-base64bits));
+ base64buffer = 0;
+ base64bits = 0;
+ }
+ inShift = 0;
/* Characters not in the BASE64 set implicitly unshift the sequence
so no '-' is required, except if the character is itself a '-' */
- if (B64CHAR(ch) || ch == '-') {
+ if (IS_BASE64(ch) || ch == '-') {
*out++ = '-';
}
- inShift = 0;
*out++ = (char) ch;
- } else {
- bitsleft += 16;
- charsleft = (charsleft << 16) | ch;
- /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
-
- /* If the next character is special then we don't need to terminate
- the shift sequence. If the next character is not a BASE64 character
- or '-' then the shift sequence will be terminated implicitly and we
- don't have to insert a '-'. */
-
- if (bitsleft == 0) {
- if (i + 1 < size) {
- Py_UNICODE ch2 = s[i+1];
-
- if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
-
- } else if (B64CHAR(ch2) || ch2 == '-') {
- *out++ = '-';
- inShift = 0;
- } else {
- inShift = 0;
- }
-
- }
- else {
+ }
+ else {
+ goto encode_char;
+ }
+ }
+ else { /* not in a shift sequence */
+ if (ch == '+') {
+ *out++ = '+';
*out++ = '-';
- inShift = 0;
- }
- }
}
+ else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
+ *out++ = (char) ch;
+ }
+ else {
+ *out++ = '+';
+ inShift = 1;
+ goto encode_char;
+ }
+ }
+ continue;
+encode_char:
+#ifdef Py_UNICODE_WIDE
+ if (ch >= 0x10000) {
+ /* code first surrogate */
+ base64bits += 16;
+ base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
+ while (base64bits >= 6) {
+ *out++ = TO_BASE64(base64buffer >> (base64bits-6));
+ base64bits -= 6;
+ }
+ /* prepare second surrogate */
+ ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
+ }
+#endif
+ base64bits += 16;
+ base64buffer = (base64buffer << 16) | ch;
+ while (base64bits >= 6) {
+ *out++ = TO_BASE64(base64buffer >> (base64bits-6));
+ base64bits -= 6;
}
}
- if (bitsleft) {
- *out++= B64(charsleft << (6-bitsleft) );
+ if (base64bits)
+ *out++= TO_BASE64(base64buffer << (6-base64bits) );
+ if (inShift)
*out++ = '-';
- }
- _PyString_Resize(&v, out - start);
+ if (_PyString_Resize(&v, out - start))
+ return NULL;
return v;
}
-#undef SPECIAL
-#undef B64
-#undef B64CHAR
-#undef UB64
-#undef ENCODE
-#undef DECODE
+#undef IS_BASE64
+#undef FROM_BASE64
+#undef TO_BASE64
+#undef DECODE_DIRECT
+#undef ENCODE_DIRECT
/* --- UTF-8 Codec -------------------------------------------------------- */
@@ -2033,7 +2157,8 @@ PyUnicode_EncodeUTF8(const Py_UNICODE *s,
/* Cut back to size actually needed. */
nneeded = p - PyString_AS_STRING(v);
assert(nneeded <= nallocated);
- _PyString_Resize(&v, nneeded);
+ if (_PyString_Resize(&v, nneeded))
+ return NULL;
}
return v;
@@ -2077,10 +2202,11 @@ PyUnicode_DecodeUTF32Stateful(const char *s,
Py_UNICODE *p;
#ifndef Py_UNICODE_WIDE
int pairs = 0;
+ const unsigned char *qq;
#else
const int pairs = 0;
#endif
- const unsigned char *q, *e, *qq;
+ const unsigned char *q, *e;
int bo = 0; /* assume native ordering by default */
const char *errmsg = "";
/* Offsets from q for retrieving bytes in the right order. */
@@ -2757,16 +2883,7 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
message = "malformed \\N character escape";
if (ucnhash_CAPI == NULL) {
/* load the unicode data module */
- PyObject *m, *api;
- m = PyImport_ImportModuleNoBlock("unicodedata");
- if (m == NULL)
- goto ucnhashError;
- api = PyObject_GetAttrString(m, "ucnhash_CAPI");
- Py_DECREF(m);
- if (api == NULL)
- goto ucnhashError;
- ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
- Py_DECREF(api);
+ ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
if (ucnhash_CAPI == NULL)
goto ucnhashError;
}
@@ -3004,7 +3121,8 @@ PyObject *unicodeescape_string(const Py_UNICODE *s,
*p++ = PyString_AS_STRING(repr)[1];
*p = '\0';
- _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
+ if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
+ return NULL;
return repr;
}
@@ -3225,7 +3343,8 @@ PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
*p++ = (char) ch;
}
*p = '\0';
- _PyString_Resize(&repr, p - q);
+ if (_PyString_Resize(&repr, p - q))
+ return NULL;
return repr;
}
@@ -3564,9 +3683,9 @@ static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
collstart-startp, collend-startp, &newpos);
if (repunicode == NULL)
goto onError;
- /* need more space? (at least enough for what we
- have+the replacement+the rest of the string, so
- we won't have to check space for encodable characters) */
+ /* need more space? (at least enough for what we have+the
+ replacement+the rest of the string, so we won't have to
+ check space for encodable characters) */
respos = str-PyString_AS_STRING(res);
repsize = PyUnicode_GET_SIZE(repunicode);
requiredsize = respos+repsize+(endp-collend);
@@ -4217,7 +4336,7 @@ PyUnicode_BuildEncodingMap(PyObject* string)
if (!result)
return NULL;
for (i = 0; i < 256; i++) {
- key = value = NULL;
+ value = NULL;
key = PyInt_FromLong(decode[i]);
value = PyInt_FromLong(i);
if (!key || !value)
@@ -5041,11 +5160,10 @@ int PyUnicode_EncodeDecimal(Py_UNICODE *s,
}
/* All other characters are considered unencodable */
collstart = p;
- collend = p+1;
- while (collend < end) {
+ for (collend = p+1; collend < end; collend++) {
if ((0 < *collend && *collend < 256) ||
- !Py_UNICODE_ISSPACE(*collend) ||
- Py_UNICODE_TODECIMAL(*collend))
+ Py_UNICODE_ISSPACE(*collend) ||
+ 0 <= Py_UNICODE_TODECIMAL(*collend))
break;
}
/* cache callback name lookup
@@ -5124,27 +5242,27 @@ int PyUnicode_EncodeDecimal(Py_UNICODE *s,
/* --- Helpers ------------------------------------------------------------ */
#include "stringlib/unicodedefs.h"
-
-#define FROM_UNICODE
-
#include "stringlib/fastsearch.h"
#include "stringlib/count.h"
#include "stringlib/find.h"
#include "stringlib/partition.h"
+#include "stringlib/split.h"
/* helper macro to fixup start/end slice values */
-#define FIX_START_END(obj) \
- if (start < 0) \
- start += (obj)->length; \
- if (start < 0) \
- start = 0; \
- if (end > (obj)->length) \
- end = (obj)->length; \
- if (end < 0) \
- end += (obj)->length; \
- if (end < 0) \
- end = 0;
+#define ADJUST_INDICES(start, end, len) \
+ if (end > len) \
+ end = len; \
+ else if (end < 0) { \
+ end += len; \
+ if (end < 0) \
+ end = 0; \
+ } \
+ if (start < 0) { \
+ start += len; \
+ if (start < 0) \
+ start = 0; \
+ }
Py_ssize_t PyUnicode_Count(PyObject *str,
PyObject *substr,
@@ -5164,10 +5282,10 @@ Py_ssize_t PyUnicode_Count(PyObject *str,
return -1;
}
- FIX_START_END(str_obj);
-
+ ADJUST_INDICES(start, end, str_obj->length);
result = stringlib_count(
- str_obj->str + start, end - start, sub_obj->str, sub_obj->length
+ str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
+ PY_SSIZE_T_MAX
);
Py_DECREF(sub_obj);
@@ -5222,8 +5340,7 @@ int tailmatch(PyUnicodeObject *self,
if (substring->length == 0)
return 1;
- FIX_START_END(self);
-
+ ADJUST_INDICES(start, end, self->length);
end -= substring->length;
if (end < start)
return 0;
@@ -5363,13 +5480,13 @@ int fixcapitalize(PyUnicodeObject *self)
if (len == 0)
return 0;
- if (Py_UNICODE_ISLOWER(*s)) {
+ if (!Py_UNICODE_ISUPPER(*s)) {
*s = Py_UNICODE_TOUPPER(*s);
status = 1;
}
s++;
while (--len > 0) {
- if (Py_UNICODE_ISUPPER(*s)) {
+ if (!Py_UNICODE_ISLOWER(*s)) {
*s = Py_UNICODE_TOLOWER(*s);
status = 1;
}
@@ -5600,305 +5717,40 @@ PyUnicodeObject *pad(PyUnicodeObject *self,
return u;
}
-#define SPLIT_APPEND(data, left, right) \
- str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
- if (!str) \
- goto onError; \
- if (PyList_Append(list, str)) { \
- Py_DECREF(str); \
- goto onError; \
- } \
- else \
- Py_DECREF(str);
-
-static
-PyObject *split_whitespace(PyUnicodeObject *self,
- PyObject *list,
- Py_ssize_t maxcount)
+PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
{
- register Py_ssize_t i;
- register Py_ssize_t j;
- Py_ssize_t len = self->length;
- PyObject *str;
- register const Py_UNICODE *buf = self->str;
-
- for (i = j = 0; i < len; ) {
- /* find a token */
- while (i < len && Py_UNICODE_ISSPACE(buf[i]))
- i++;
- j = i;
- while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
- i++;
- if (j < i) {
- if (maxcount-- <= 0)
- break;
- SPLIT_APPEND(buf, j, i);
- while (i < len && Py_UNICODE_ISSPACE(buf[i]))
- i++;
- j = i;
- }
- }
- if (j < len) {
- SPLIT_APPEND(buf, j, len);
- }
- return list;
-
- onError:
- Py_DECREF(list);
- return NULL;
-}
-
-PyObject *PyUnicode_Splitlines(PyObject *string,
- int keepends)
-{
- register Py_ssize_t i;
- register Py_ssize_t j;
- Py_ssize_t len;
PyObject *list;
- PyObject *str;
- Py_UNICODE *data;
string = PyUnicode_FromObject(string);
if (string == NULL)
return NULL;
- data = PyUnicode_AS_UNICODE(string);
- len = PyUnicode_GET_SIZE(string);
-
- list = PyList_New(0);
- if (!list)
- goto onError;
- for (i = j = 0; i < len; ) {
- Py_ssize_t eol;
+ list = stringlib_splitlines(
+ (PyObject*) string, PyUnicode_AS_UNICODE(string),
+ PyUnicode_GET_SIZE(string), keepends);
- /* Find a line and append it */
- while (i < len && !BLOOM_LINEBREAK(data[i]))
- i++;
-
- /* Skip the line break reading CRLF as one line break */
- eol = i;
- if (i < len) {
- if (data[i] == '\r' && i + 1 < len &&
- data[i+1] == '\n')
- i += 2;
- else
- i++;
- if (keepends)
- eol = i;
- }
- SPLIT_APPEND(data, j, eol);
- j = i;
- }
- if (j < len) {
- SPLIT_APPEND(data, j, len);
- }
-
- Py_DECREF(string);
- return list;
-
- onError:
- Py_XDECREF(list);
Py_DECREF(string);
- return NULL;
-}
-
-static
-PyObject *split_char(PyUnicodeObject *self,
- PyObject *list,
- Py_UNICODE ch,
- Py_ssize_t maxcount)
-{
- register Py_ssize_t i;
- register Py_ssize_t j;
- Py_ssize_t len = self->length;
- PyObject *str;
- register const Py_UNICODE *buf = self->str;
-
- for (i = j = 0; i < len; ) {
- if (buf[i] == ch) {
- if (maxcount-- <= 0)
- break;
- SPLIT_APPEND(buf, j, i);
- i = j = i + 1;
- } else
- i++;
- }
- if (j <= len) {
- SPLIT_APPEND(buf, j, len);
- }
- return list;
-
- onError:
- Py_DECREF(list);
- return NULL;
-}
-
-static
-PyObject *split_substring(PyUnicodeObject *self,
- PyObject *list,
- PyUnicodeObject *substring,
- Py_ssize_t maxcount)
-{
- register Py_ssize_t i;
- register Py_ssize_t j;
- Py_ssize_t len = self->length;
- Py_ssize_t sublen = substring->length;
- PyObject *str;
-
- for (i = j = 0; i <= len - sublen; ) {
- if (Py_UNICODE_MATCH(self, i, substring)) {
- if (maxcount-- <= 0)
- break;
- SPLIT_APPEND(self->str, j, i);
- i = j = i + sublen;
- } else
- i++;
- }
- if (j <= len) {
- SPLIT_APPEND(self->str, j, len);
- }
return list;
-
- onError:
- Py_DECREF(list);
- return NULL;
-}
-
-static
-PyObject *rsplit_whitespace(PyUnicodeObject *self,
- PyObject *list,
- Py_ssize_t maxcount)
-{
- register Py_ssize_t i;
- register Py_ssize_t j;
- Py_ssize_t len = self->length;
- PyObject *str;
- register const Py_UNICODE *buf = self->str;
-
- for (i = j = len - 1; i >= 0; ) {
- /* find a token */
- while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
- i--;
- j = i;
- while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
- i--;
- if (j > i) {
- if (maxcount-- <= 0)
- break;
- SPLIT_APPEND(buf, i + 1, j + 1);
- while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
- i--;
- j = i;
- }
- }
- if (j >= 0) {
- SPLIT_APPEND(buf, 0, j + 1);
- }
- if (PyList_Reverse(list) < 0)
- goto onError;
- return list;
-
- onError:
- Py_DECREF(list);
- return NULL;
-}
-
-static
-PyObject *rsplit_char(PyUnicodeObject *self,
- PyObject *list,
- Py_UNICODE ch,
- Py_ssize_t maxcount)
-{
- register Py_ssize_t i;
- register Py_ssize_t j;
- Py_ssize_t len = self->length;
- PyObject *str;
- register const Py_UNICODE *buf = self->str;
-
- for (i = j = len - 1; i >= 0; ) {
- if (buf[i] == ch) {
- if (maxcount-- <= 0)
- break;
- SPLIT_APPEND(buf, i + 1, j + 1);
- j = i = i - 1;
- } else
- i--;
- }
- if (j >= -1) {
- SPLIT_APPEND(buf, 0, j + 1);
- }
- if (PyList_Reverse(list) < 0)
- goto onError;
- return list;
-
- onError:
- Py_DECREF(list);
- return NULL;
-}
-
-static
-PyObject *rsplit_substring(PyUnicodeObject *self,
- PyObject *list,
- PyUnicodeObject *substring,
- Py_ssize_t maxcount)
-{
- register Py_ssize_t i;
- register Py_ssize_t j;
- Py_ssize_t len = self->length;
- Py_ssize_t sublen = substring->length;
- PyObject *str;
-
- for (i = len - sublen, j = len; i >= 0; ) {
- if (Py_UNICODE_MATCH(self, i, substring)) {
- if (maxcount-- <= 0)
- break;
- SPLIT_APPEND(self->str, i + sublen, j);
- j = i;
- i -= sublen;
- } else
- i--;
- }
- if (j >= 0) {
- SPLIT_APPEND(self->str, 0, j);
- }
- if (PyList_Reverse(list) < 0)
- goto onError;
- return list;
-
- onError:
- Py_DECREF(list);
- return NULL;
}
-#undef SPLIT_APPEND
-
static
PyObject *split(PyUnicodeObject *self,
PyUnicodeObject *substring,
Py_ssize_t maxcount)
{
- PyObject *list;
-
if (maxcount < 0)
maxcount = PY_SSIZE_T_MAX;
- list = PyList_New(0);
- if (!list)
- return NULL;
-
if (substring == NULL)
- return split_whitespace(self,list,maxcount);
-
- else if (substring->length == 1)
- return split_char(self,list,substring->str[0],maxcount);
+ return stringlib_split_whitespace(
+ (PyObject*) self, self->str, self->length, maxcount
+ );
- else if (substring->length == 0) {
- Py_DECREF(list);
- PyErr_SetString(PyExc_ValueError, "empty separator");
- return NULL;
- }
- else
- return split_substring(self,list,substring,maxcount);
+ return stringlib_split(
+ (PyObject*) self, self->str, self->length,
+ substring->str, substring->length,
+ maxcount
+ );
}
static
@@ -5906,28 +5758,19 @@ PyObject *rsplit(PyUnicodeObject *self,
PyUnicodeObject *substring,
Py_ssize_t maxcount)
{
- PyObject *list;
-
if (maxcount < 0)
maxcount = PY_SSIZE_T_MAX;
- list = PyList_New(0);
- if (!list)
- return NULL;
-
if (substring == NULL)
- return rsplit_whitespace(self,list,maxcount);
-
- else if (substring->length == 1)
- return rsplit_char(self,list,substring->str[0],maxcount);
+ return stringlib_rsplit_whitespace(
+ (PyObject*) self, self->str, self->length, maxcount
+ );
- else if (substring->length == 0) {
- Py_DECREF(list);
- PyErr_SetString(PyExc_ValueError, "empty separator");
- return NULL;
- }
- else
- return rsplit_substring(self,list,substring,maxcount);
+ return stringlib_rsplit(
+ (PyObject*) self, self->str, self->length,
+ substring->str, substring->length,
+ maxcount
+ );
}
static
@@ -5940,10 +5783,14 @@ PyObject *replace(PyUnicodeObject *self,
if (maxcount < 0)
maxcount = PY_SSIZE_T_MAX;
+ else if (maxcount == 0 || self->length == 0)
+ goto nothing;
if (str1->length == str2->length) {
- /* same length */
Py_ssize_t i;
+ /* same length */
+ if (str1->length == 0)
+ goto nothing;
if (str1->length == 1) {
/* replace characters */
Py_UNICODE u1, u2;
@@ -5962,8 +5809,8 @@ PyObject *replace(PyUnicodeObject *self,
u->str[i] = u2;
}
} else {
- i = fastsearch(
- self->str, self->length, str1->str, str1->length, FAST_SEARCH
+ i = stringlib_find(
+ self->str, self->length, str1->str, str1->length, 0
);
if (i < 0)
goto nothing;
@@ -5971,25 +5818,30 @@ PyObject *replace(PyUnicodeObject *self,
if (!u)
return NULL;
Py_UNICODE_COPY(u->str, self->str, self->length);
- while (i <= self->length - str1->length)
- if (Py_UNICODE_MATCH(self, i, str1)) {
- if (--maxcount < 0)
- break;
- Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
- i += str1->length;
- } else
- i++;
+
+ /* change everything in-place, starting with this one */
+ Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
+ i += str1->length;
+
+ while ( --maxcount > 0) {
+ i = stringlib_find(self->str+i, self->length-i,
+ str1->str, str1->length,
+ i);
+ if (i == -1)
+ break;
+ Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
+ i += str1->length;
+ }
}
} else {
- Py_ssize_t n, i, j, e;
+ Py_ssize_t n, i, j;
Py_ssize_t product, new_size, delta;
Py_UNICODE *p;
/* replace strings */
- n = stringlib_count(self->str, self->length, str1->str, str1->length);
- if (n > maxcount)
- n = maxcount;
+ n = stringlib_count(self->str, self->length, str1->str, str1->length,
+ maxcount);
if (n == 0)
goto nothing;
/* new_size = self->length + n * (str2->length - str1->length)); */
@@ -6015,19 +5867,15 @@ PyObject *replace(PyUnicodeObject *self,
return NULL;
i = 0;
p = u->str;
- e = self->length - str1->length;
if (str1->length > 0) {
while (n-- > 0) {
/* look for next match */
- j = i;
- while (j <= e) {
- if (Py_UNICODE_MATCH(self, j, str1))
- break;
- j++;
- }
- if (j > i) {
- if (j > e)
- break;
+ j = stringlib_find(self->str+i, self->length-i,
+ str1->str, str1->length,
+ i);
+ if (j == -1)
+ break;
+ else if (j > i) {
/* copy unchanged part [i:j] */
Py_UNICODE_COPY(p, self->str+i, j-i);
p += j - i;
@@ -6083,7 +5931,7 @@ PyDoc_STRVAR(capitalize__doc__,
"S.capitalize() -> unicode\n\
\n\
Return a capitalized version of S, i.e. make the first character\n\
-have upper case.");
+have upper case and the rest lower case.");
static PyObject*
unicode_capitalize(PyUnicodeObject *self)
@@ -6381,8 +6229,6 @@ int PyUnicode_Contains(PyObject *container,
/* Coerce the two arguments */
sub = PyUnicode_FromObject(element);
if (!sub) {
- PyErr_SetString(PyExc_TypeError,
- "'in <string>' requires string as left operand");
return -1;
}
@@ -6457,20 +6303,15 @@ unicode_count(PyUnicodeObject *self, PyObject *args)
Py_ssize_t end = PY_SSIZE_T_MAX;
PyObject *result;
- if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
- _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
- return NULL;
-
- substring = (PyUnicodeObject *)PyUnicode_FromObject(
- (PyObject *)substring);
- if (substring == NULL)
+ if (!stringlib_parse_args_finds_unicode("count", args, &substring,
+ &start, &end))
return NULL;
- FIX_START_END(self);
-
+ ADJUST_INDICES(start, end, self->length);
result = PyInt_FromSsize_t(
stringlib_count(self->str + start, end - start,
- substring->str, substring->length)
+ substring->str, substring->length,
+ PY_SSIZE_T_MAX)
);
Py_DECREF(substring);
@@ -6489,13 +6330,15 @@ a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
codecs.register_error that can handle UnicodeEncodeErrors.");
static PyObject *
-unicode_encode(PyUnicodeObject *self, PyObject *args)
+unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
{
+ static char *kwlist[] = {"encoding", "errors", 0};
char *encoding = NULL;
char *errors = NULL;
PyObject *v;
- if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
+ if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
+ kwlist, &encoding, &errors))
return NULL;
v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
if (v == NULL)
@@ -6521,17 +6364,19 @@ Decodes S using the codec registered for encoding. encoding defaults\n\
to the default encoding. errors may be given to set a different error\n\
handling scheme. Default is 'strict' meaning that encoding errors raise\n\
a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
-as well as any other name registerd with codecs.register_error that is\n\
+as well as any other name registered with codecs.register_error that is\n\
able to handle UnicodeDecodeErrors.");
static PyObject *
-unicode_decode(PyUnicodeObject *self, PyObject *args)
+unicode_decode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
{
+ static char *kwlist[] = {"encoding", "errors", 0};
char *encoding = NULL;
char *errors = NULL;
PyObject *v;
- if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
+ if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:decode",
+ kwlist, &encoding, &errors))
return NULL;
v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
if (v == NULL)
@@ -6641,7 +6486,7 @@ PyDoc_STRVAR(find__doc__,
"S.find(sub [,start [,end]]) -> int\n\
\n\
Return the lowest index in S where substring sub is found,\n\
-such that sub is contained within s[start:end]. Optional\n\
+such that sub is contained within S[start:end]. Optional\n\
arguments start and end are interpreted as in slice notation.\n\
\n\
Return -1 on failure.");
@@ -6649,12 +6494,13 @@ Return -1 on failure.");
static PyObject *
unicode_find(PyUnicodeObject *self, PyObject *args)
{
- PyObject *substring;
+ PyUnicodeObject *substring;
Py_ssize_t start;
Py_ssize_t end;
Py_ssize_t result;
- if (!_ParseTupleFinds(args, &substring, &start, &end))
+ if (!stringlib_parse_args_finds_unicode("find", args, &substring,
+ &start, &end))
return NULL;
result = stringlib_find_slice(
@@ -6726,11 +6572,12 @@ static PyObject *
unicode_index(PyUnicodeObject *self, PyObject *args)
{
Py_ssize_t result;
- PyObject *substring;
+ PyUnicodeObject *substring;
Py_ssize_t start;
Py_ssize_t end;
- if (!_ParseTupleFinds(args, &substring, &start, &end))
+ if (!stringlib_parse_args_finds_unicode("index", args, &substring,
+ &start, &end))
return NULL;
result = stringlib_find_slice(
@@ -7385,7 +7232,7 @@ PyDoc_STRVAR(rfind__doc__,
"S.rfind(sub [,start [,end]]) -> int\n\
\n\
Return the highest index in S where substring sub is found,\n\
-such that sub is contained within s[start:end]. Optional\n\
+such that sub is contained within S[start:end]. Optional\n\
arguments start and end are interpreted as in slice notation.\n\
\n\
Return -1 on failure.");
@@ -7393,12 +7240,13 @@ Return -1 on failure.");
static PyObject *
unicode_rfind(PyUnicodeObject *self, PyObject *args)
{
- PyObject *substring;
+ PyUnicodeObject *substring;
Py_ssize_t start;
Py_ssize_t end;
Py_ssize_t result;
- if (!_ParseTupleFinds(args, &substring, &start, &end))
+ if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
+ &start, &end))
return NULL;
result = stringlib_rfind_slice(
@@ -7420,12 +7268,13 @@ Like S.rfind() but raise ValueError when the substring is not found.");
static PyObject *
unicode_rindex(PyUnicodeObject *self, PyObject *args)
{
- PyObject *substring;
+ PyUnicodeObject *substring;
Py_ssize_t start;
Py_ssize_t end;
Py_ssize_t result;
- if (!_ParseTupleFinds(args, &substring, &start, &end))
+ if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
+ &start, &end))
return NULL;
result = stringlib_rfind_slice(
@@ -7804,8 +7653,7 @@ unicode_startswith(PyUnicodeObject *self,
Py_ssize_t end = PY_SSIZE_T_MAX;
int result;
- if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
- _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
+ if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
return NULL;
if (PyTuple_Check(subobj)) {
Py_ssize_t i;
@@ -7824,8 +7672,12 @@ unicode_startswith(PyUnicodeObject *self,
Py_RETURN_FALSE;
}
substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
- if (substring == NULL)
+ if (substring == NULL) {
+ if (PyErr_ExceptionMatches(PyExc_TypeError))
+ PyErr_Format(PyExc_TypeError, "startswith first arg must be str, "
+ "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);
return NULL;
+ }
result = tailmatch(self, substring, start, end, -1);
Py_DECREF(substring);
return PyBool_FromLong(result);
@@ -7850,8 +7702,7 @@ unicode_endswith(PyUnicodeObject *self,
Py_ssize_t end = PY_SSIZE_T_MAX;
int result;
- if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
- _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
+ if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
return NULL;
if (PyTuple_Check(subobj)) {
Py_ssize_t i;
@@ -7869,9 +7720,12 @@ unicode_endswith(PyUnicodeObject *self,
Py_RETURN_FALSE;
}
substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
- if (substring == NULL)
+ if (substring == NULL) {
+ if (PyErr_ExceptionMatches(PyExc_TypeError))
+ PyErr_Format(PyExc_TypeError, "endswith first arg must be str, "
+ "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);
return NULL;
-
+ }
result = tailmatch(self, substring, start, end, +1);
Py_DECREF(substring);
return PyBool_FromLong(result);
@@ -7884,7 +7738,8 @@ unicode_endswith(PyUnicodeObject *self,
PyDoc_STRVAR(format__doc__,
"S.format(*args, **kwargs) -> unicode\n\
\n\
-");
+Return a formatted version of S, using substitutions from args and kwargs.\n\
+The substitutions are identified by braces ('{' and '}').");
static PyObject *
unicode__format__(PyObject *self, PyObject *args)
@@ -7918,7 +7773,7 @@ unicode__format__(PyObject *self, PyObject *args)
PyDoc_STRVAR(p_format__doc__,
"S.__format__(format_spec) -> unicode\n\
\n\
-");
+Return a formatted version of S as described by format_spec.");
static PyObject *
unicode__sizeof__(PyUnicodeObject *v)
@@ -7944,7 +7799,7 @@ static PyMethodDef unicode_methods[] = {
/* Order is according to common usage: often used methods should
appear first, since lookup is done sequentially. */
- {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
+ {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
{"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
{"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
{"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
@@ -7960,7 +7815,7 @@ static PyMethodDef unicode_methods[] = {
{"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
{"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
{"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
- {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
+ {"decode", (PyCFunction) unicode_decode, METH_VARARGS | METH_KEYWORDS, decode__doc__},
/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
{"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
{"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
@@ -8175,16 +8030,6 @@ strtounicode(Py_UNICODE *buffer, const char *charbuffer)
}
static int
-doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
-{
- Py_ssize_t result;
-
- PyOS_ascii_formatd((char *)buffer, len, format, x);
- result = strtounicode(buffer, (char *)buffer);
- return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
-}
-
-static int
longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
{
Py_ssize_t result;
@@ -8198,64 +8043,29 @@ longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
shared with stringobject.c, converting from 8-bit to Unicode after the
formatting is done. */
-static int
-formatfloat(Py_UNICODE *buf,
- size_t buflen,
- int flags,
- int prec,
- int type,
- PyObject *v)
-{
- /* fmt = '%#.' + `prec` + `type`
- worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
- char fmt[20];
+/* Returns a new reference to a PyUnicode object, or NULL on failure. */
+
+static PyObject *
+formatfloat(PyObject *v, int flags, int prec, int type)
+{
+ char *p;
+ PyObject *result;
double x;
x = PyFloat_AsDouble(v);
if (x == -1.0 && PyErr_Occurred())
- return -1;
+ return NULL;
+
if (prec < 0)
prec = 6;
-#if SIZEOF_INT > 4
- /* make sure that the decimal representation of precision really does
- need at most 10 digits: platforms with sizeof(int) == 8 exist! */
- if (prec > 0x7fffffff) {
- PyErr_SetString(PyExc_OverflowError,
- "outrageously large precision "
- "for formatted float");
- return -1;
- }
-#endif
-
- if (type == 'f' && fabs(x) >= 1e50)
- type = 'g';
- /* Worst case length calc to ensure no buffer overrun:
- 'g' formats:
- fmt = %#.<prec>g
- buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
- for any double rep.)
- len = 1 + prec + 1 + 2 + 5 = 9 + prec
-
- 'f' formats:
- buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
- len = 1 + 50 + 1 + prec = 52 + prec
-
- If prec=0 the effective precision is 1 (the leading digit is
- always given), therefore increase the length by one.
-
- */
- if (((type == 'g' || type == 'G') &&
- buflen <= (size_t)10 + (size_t)prec) ||
- (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
- PyErr_SetString(PyExc_OverflowError,
- "formatted float is too long (precision too large?)");
- return -1;
- }
- PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
- (flags&F_ALT) ? "#" : "",
- prec, type);
- return doubletounicode(buf, buflen, fmt, x);
+ p = PyOS_double_to_string(x, type, prec,
+ (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
+ if (p == NULL)
+ return NULL;
+ result = PyUnicode_FromStringAndSize(p, strlen(p));
+ PyMem_Free(p);
+ return result;
}
static PyObject*
@@ -8425,7 +8235,7 @@ formatchar(Py_UNICODE *buf,
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
- FORMATBUFLEN is the length of the buffer in which the floats, ints, &
+ FORMATBUFLEN is the length of the buffer in which the ints &
chars are formatted. XXX This is a magic number. Each formatting
routine does bounds checking to ensure no overflow, but a better
solution may be to malloc a buffer of appropriate size for each
@@ -8496,7 +8306,7 @@ PyObject *PyUnicode_Format(PyObject *format,
Py_UNICODE *pbuf;
Py_UNICODE sign;
Py_ssize_t len;
- Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
+ Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{int,char}() */
fmt++;
if (*fmt == '(') {
@@ -8757,13 +8567,11 @@ PyObject *PyUnicode_Format(PyObject *format,
case 'F':
case 'g':
case 'G':
- if (c == 'F')
- c = 'f';
- pbuf = formatbuf;
- len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
- flags, prec, c, v);
- if (len < 0)
+ temp = formatfloat(v, flags, prec, c);
+ if (temp == NULL)
goto onError;
+ pbuf = PyUnicode_AS_UNICODE(temp);
+ len = PyUnicode_GET_SIZE(temp);
sign = 1;
if (flags & F_ZERO)
fill = '0';
@@ -9084,11 +8892,3 @@ _PyUnicode_Fini(void)
#ifdef __cplusplus
}
#endif
-
-
-/*
- Local variables:
- c-basic-offset: 4
- indent-tabs-mode: nil
- End:
-*/