summaryrefslogtreecommitdiffstats
path: root/Objects
diff options
context:
space:
mode:
authorChristian Heimes <christian@cheimes.de>2008-01-30 11:58:22 (GMT)
committerChristian Heimes <christian@cheimes.de>2008-01-30 11:58:22 (GMT)
commit190d79e5c648174b550de2bef75d1b4addf0d625 (patch)
tree6af69a8af50c6d9f95ec5eb3372c17d36bb17fc5 /Objects
parent510711d598f1432afb021a01c2457b14334c6157 (diff)
downloadcpython-190d79e5c648174b550de2bef75d1b4addf0d625.zip
cpython-190d79e5c648174b550de2bef75d1b4addf0d625.tar.gz
cpython-190d79e5c648174b550de2bef75d1b4addf0d625.tar.bz2
Merged revisions 60408-60440 via svnmerge from
svn+ssh://pythondev@svn.python.org/python/trunk ........ r60425 | raymond.hettinger | 2008-01-29 20:52:09 +0100 (Tue, 29 Jan 2008) | 1 line CallMethod is faster with a NULL third-argument than with an empty format string. ........ r60431 | raymond.hettinger | 2008-01-30 01:01:07 +0100 (Wed, 30 Jan 2008) | 1 line Add isdisjoint() to the Set/MutableSet ABCs. ........ r60432 | raymond.hettinger | 2008-01-30 01:08:31 +0100 (Wed, 30 Jan 2008) | 1 line MutableSets support a remove() method. ........ r60433 | raymond.hettinger | 2008-01-30 01:51:58 +0100 (Wed, 30 Jan 2008) | 1 line Demonstrate new except/as syntax. ........ r60440 | christian.heimes | 2008-01-30 12:32:37 +0100 (Wed, 30 Jan 2008) | 1 line Patch #1970 by Antoine Pitrou: Speed up unicode whitespace and linebreak detection. The speedup is about 25% for split() (571 / 457 usec) and 35% (175 / 127 usec) for splitlines() ........
Diffstat (limited to 'Objects')
-rw-r--r--Objects/unicodeobject.c99
1 files changed, 81 insertions, 18 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 694f3b0..1b35d4e 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -125,6 +125,64 @@ static PyUnicodeObject *unicode_latin1[256];
*/
static const char unicode_default_encoding[] = "utf-8";
+/* Fast detection of the most frequent whitespace characters: 128-entry table indexed by code point, 1 = whitespace */
+const unsigned char _Py_ascii_whitespace[] = {
+ 0, 0, 0, 0, 0, 0, 0, 0,
+/* 0x0009: HORIZONTAL TABULATION */
+/* 0x000A: LINE FEED */
+/* 0x000B: VERTICAL TABULATION */
+/* 0x000C: FORM FEED */
+/* 0x000D: CARRIAGE RETURN */
+ 0, 1, 1, 1, 1, 1, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+/* 0x001C: FILE SEPARATOR */
+/* 0x001D: GROUP SEPARATOR */
+/* 0x001E: RECORD SEPARATOR */
+/* 0x001F: UNIT SEPARATOR */
+ 0, 0, 0, 0, 1, 1, 1, 1,
+/* 0x0020: SPACE */
+ 1, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+/* Same for linebreaks: read-only 128-entry table indexed by code point, 1 = line break */
+static const unsigned char ascii_linebreak[] = {
+ 0, 0, 0, 0, 0, 0, 0, 0,
+/* 0x000A: LINE FEED */
+/* 0x000D: CARRIAGE RETURN */
+ 0, 0, 1, 0, 0, 1, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+/* 0x001C: FILE SEPARATOR */
+/* 0x001D: GROUP SEPARATOR */
+/* 0x001E: RECORD SEPARATOR (0x001F UNIT SEPARATOR is not a line break) */
+ 0, 0, 0, 0, 1, 1, 1, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+
Py_UNICODE
PyUnicode_GetMax(void)
{
@@ -151,8 +209,9 @@ static BLOOM_MASK bloom_linebreak;
#define BLOOM(mask, ch) ((mask & (1 << ((ch) & 0x1F))))
-#define BLOOM_LINEBREAK(ch)\
- (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
+#define BLOOM_LINEBREAK(ch) /* nonzero if ch is a line break; note: evaluates ch more than once */ \
+ ((ch) < 128U ? ascii_linebreak[(ch)] : /* below 128: direct table lookup */ \
+ (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch))) /* otherwise: bloom mask check first, then Py_UNICODE_ISLINEBREAK */
Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
{
@@ -5602,25 +5661,26 @@ PyObject *split_whitespace(PyUnicodeObject *self,
register Py_ssize_t j;
Py_ssize_t len = self->length;
PyObject *str;
+ register const Py_UNICODE *buf = self->str;
for (i = j = 0; i < len; ) {
/* find a token */
- while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
+ while (i < len && Py_UNICODE_ISSPACE(buf[i]))
i++;
j = i;
- while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
+ while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
i++;
if (j < i) {
if (maxcount-- <= 0)
break;
- SPLIT_APPEND(self->str, j, i);
- while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
+ SPLIT_APPEND(buf, j, i);
+ while (i < len && Py_UNICODE_ISSPACE(buf[i]))
i++;
j = i;
}
}
if (j < len) {
- SPLIT_APPEND(self->str, j, len);
+ SPLIT_APPEND(buf, j, len);
}
return list;
@@ -5693,18 +5753,19 @@ PyObject *split_char(PyUnicodeObject *self,
register Py_ssize_t j;
Py_ssize_t len = self->length;
PyObject *str;
+ register const Py_UNICODE *buf = self->str;
for (i = j = 0; i < len; ) {
- if (self->str[i] == ch) {
+ if (buf[i] == ch) {
if (maxcount-- <= 0)
break;
- SPLIT_APPEND(self->str, j, i);
+ SPLIT_APPEND(buf, j, i);
i = j = i + 1;
} else
i++;
}
if (j <= len) {
- SPLIT_APPEND(self->str, j, len);
+ SPLIT_APPEND(buf, j, len);
}
return list;
@@ -5753,25 +5814,26 @@ PyObject *rsplit_whitespace(PyUnicodeObject *self,
register Py_ssize_t j;
Py_ssize_t len = self->length;
PyObject *str;
+ register const Py_UNICODE *buf = self->str;
for (i = j = len - 1; i >= 0; ) {
/* find a token */
- while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
+ while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
i--;
j = i;
- while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
+ while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
i--;
if (j > i) {
if (maxcount-- <= 0)
break;
- SPLIT_APPEND(self->str, i + 1, j + 1);
- while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
+ SPLIT_APPEND(buf, i + 1, j + 1);
+ while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
i--;
j = i;
}
}
if (j >= 0) {
- SPLIT_APPEND(self->str, 0, j + 1);
+ SPLIT_APPEND(buf, 0, j + 1);
}
if (PyList_Reverse(list) < 0)
goto onError;
@@ -5792,18 +5854,19 @@ PyObject *rsplit_char(PyUnicodeObject *self,
register Py_ssize_t j;
Py_ssize_t len = self->length;
PyObject *str;
+ register const Py_UNICODE *buf = self->str;
for (i = j = len - 1; i >= 0; ) {
- if (self->str[i] == ch) {
+ if (buf[i] == ch) {
if (maxcount-- <= 0)
break;
- SPLIT_APPEND(self->str, i + 1, j + 1);
+ SPLIT_APPEND(buf, i + 1, j + 1);
j = i = i - 1;
} else
i--;
}
if (j >= -1) {
- SPLIT_APPEND(self->str, 0, j + 1);
+ SPLIT_APPEND(buf, 0, j + 1);
}
if (PyList_Reverse(list) < 0)
goto onError;