Optimize _PyUnicode_FastCopyCharacters() when maxchar(from) > maxchar(to)

author: Victor Stinner <victor.stinner@gmail.com> 2012-06-16 00:22:37 (GMT)
committer: Victor Stinner <victor.stinner@gmail.com> 2012-06-16 00:22:37 (GMT)
commit: c9d369f1bf78c48083679b2afa5f31d8378ea94d (patch)
tree: 610388d08918fbb6fdafa6e5f7fa7123d0ec8a19
parent: f05e17ece9ee4cf4d04e0657e6c7c9283a233968 (diff)
download: cpython-c9d369f1bf78c48083679b2afa5f31d8378ea94d.zip
cpython-c9d369f1bf78c48083679b2afa5f31d8378ea94d.tar.gz
cpython-c9d369f1bf78c48083679b2afa5f31d8378ea94d.tar.bz2
2 files changed, 79 insertions, 57 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 9c1a5f0..80a583c 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -1128,7 +1128,6 @@ _copy_characters(PyObject *to, Py_ssize_t to_start,
 {
     unsigned int from_kind, to_kind;
     void *from_data, *to_data;
-    int fast;
 
     assert(0 <= how_many);
     assert(0 <= from_start);
@@ -1137,41 +1136,40 @@ _copy_characters(PyObject *to, Py_ssize_t to_start,
     assert(PyUnicode_IS_READY(from));
     assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
 
-    if (how_many == 0)
-        return 0;
-
     assert(PyUnicode_Check(to));
     assert(PyUnicode_IS_READY(to));
     assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
 
+    if (how_many == 0)
+        return 0;
+
     from_kind = PyUnicode_KIND(from);
     from_data = PyUnicode_DATA(from);
     to_kind = PyUnicode_KIND(to);
     to_data = PyUnicode_DATA(to);
 
-#ifdef Py_DEBUG
-    if (!check_maxchar
-        && (from_kind > to_kind
-            || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
-    {
-        const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
-        Py_UCS4 ch;
-        Py_ssize_t i;
-        for (i=0; i < how_many; i++) {
-            ch = PyUnicode_READ(from_kind, from_data, from_start + i);
-            assert(ch <= to_maxchar);
-        }
-    }
+    if (from_kind == to_kind) {
+        if (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)) {
+            /* Writing Latin-1 characters into an ASCII string requires
+               checking that all written characters are pure ASCII */
+#ifndef Py_DEBUG
+            if (check_maxchar) {
+                Py_UCS4 max_char;
+                max_char = ucs1lib_find_max_char(from_data,
+                                                 (char*)from_data + how_many);
+                if (max_char >= 128)
+                    return -1;
+            }
+#else
+            const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
+            Py_UCS4 ch;
+            Py_ssize_t i;
+            for (i=0; i < how_many; i++) {
+                ch = PyUnicode_READ(from_kind, from_data, from_start + i);
+                assert(ch <= to_maxchar);
+            }
 #endif
-    fast = (from_kind == to_kind);
-    if (check_maxchar
-        && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
-    {
-        /* deny latin1 => ascii */
-        fast = 0;
-    }
-
-    if (fast) {
+        }
         Py_MEMCPY((char*)to_data + to_kind * to_start,
                   (char*)from_data + from_kind * from_start,
                   to_kind * how_many);
@@ -1207,42 +1205,62 @@ _copy_characters(PyObject *to, Py_ssize_t to_start,
             );
     }
     else {
-        /* check if max_char(from substring) <= max_char(to) */
-        if (from_kind > to_kind
-                /* latin1 => ascii */
-            || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
+        assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
+
+#ifndef Py_DEBUG
+        if (!check_maxchar) {
+            if (from_kind == PyUnicode_2BYTE_KIND
+                && to_kind == PyUnicode_1BYTE_KIND)
+            {
+                _PyUnicode_CONVERT_BYTES(
+                    Py_UCS2, Py_UCS1,
+                    PyUnicode_2BYTE_DATA(from) + from_start,
+                    PyUnicode_2BYTE_DATA(from) + from_start + how_many,
+                    PyUnicode_1BYTE_DATA(to) + to_start
+                    );
+            }
+            else if (from_kind == PyUnicode_4BYTE_KIND
+                     && to_kind == PyUnicode_1BYTE_KIND)
+            {
+                _PyUnicode_CONVERT_BYTES(
+                    Py_UCS4, Py_UCS1,
+                    PyUnicode_4BYTE_DATA(from) + from_start,
+                    PyUnicode_4BYTE_DATA(from) + from_start + how_many,
+                    PyUnicode_1BYTE_DATA(to) + to_start
+                    );
+            }
+            else if (from_kind == PyUnicode_4BYTE_KIND
+                     && to_kind == PyUnicode_2BYTE_KIND)
+            {
+                _PyUnicode_CONVERT_BYTES(
+                    Py_UCS4, Py_UCS2,
+                    PyUnicode_4BYTE_DATA(from) + from_start,
+                    PyUnicode_4BYTE_DATA(from) + from_start + how_many,
+                    PyUnicode_2BYTE_DATA(to) + to_start
+                    );
+            }
+            else {
+                assert(0);
+                return -1;
+            }
+        }
+        else
+#endif
         {
-            /* slow path to check for character overflow */
             const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
             Py_UCS4 ch;
             Py_ssize_t i;
 
-#ifdef Py_DEBUG
             for (i=0; i < how_many; i++) {
                 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
+#ifndef Py_DEBUG
                 assert(ch <= to_maxchar);
-                PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
-            }
 #else
-            if (!check_maxchar) {
-                for (i=0; i < how_many; i++) {
-                    ch = PyUnicode_READ(from_kind, from_data, from_start + i);
-                    PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
-                }
-            }
-            else {
-                for (i=0; i < how_many; i++) {
-                    ch = PyUnicode_READ(from_kind, from_data, from_start + i);
-                    if (ch > to_maxchar)
-                        return 1;
-                    PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
-                }
-            }
+                if (ch > to_maxchar)
+                    return -1;
 #endif
-        }
-        else {
-            assert(0 && "inconsistent state");
-            return 1;
+                PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
+            }
         }
     }
     return 0;
@@ -13876,9 +13894,11 @@ PyUnicode_Format(PyObject *format, PyObject *args)
                 }
             }
 
-            _PyUnicode_FastCopyCharacters(writer.buffer, writer.pos,
-                                          temp, pindex, len);
-            writer.pos += len;
+            if (len) {
+                _PyUnicode_FastCopyCharacters(writer.buffer, writer.pos,
+                                              temp, pindex, len);
+                writer.pos += len;
+            }
             if (width > len) {
                 sublen = width - len;
                 FILL(writer.kind, writer.data, ' ', writer.pos, sublen);
diff --git a/Python/formatter_unicode.c b/Python/formatter_unicode.c
index fdb587d..cd66670 100644
--- a/Python/formatter_unicode.c
+++ b/Python/formatter_unicode.c
@@ -786,8 +786,10 @@ format_string_internal(PyObject *value, const InternalFormatSpec *format,
         goto done;
 
     /* Then the source string. */
-    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
-                                  value, 0, len);
+    if (len) {
+        _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
+                                      value, 0, len);
+    }
     writer->pos += (len + rpad);
     result = 0;
author	Victor Stinner <victor.stinner@gmail.com>	2012-06-16 00:22:37 (GMT)
committer	Victor Stinner <victor.stinner@gmail.com>	2012-06-16 00:22:37 (GMT)
commit	c9d369f1bf78c48083679b2afa5f31d8378ea94d (patch)
tree	610388d08918fbb6fdafa6e5f7fa7123d0ec8a19
parent	f05e17ece9ee4cf4d04e0657e6c7c9283a233968 (diff)
download	cpython-c9d369f1bf78c48083679b2afa5f31d8378ea94d.zip cpython-c9d369f1bf78c48083679b2afa5f31d8378ea94d.tar.gz cpython-c9d369f1bf78c48083679b2afa5f31d8378ea94d.tar.bz2