From c3cec7868bf1019c0987f1e9aadb56d73fa93d61 Mon Sep 17 00:00:00 2001
From: Victor Stinner <victor.stinner@haypocalc.com>
Date: Wed, 5 Oct 2011 21:24:08 +0200
Subject: Add asciilib: similar to ucs1, ucs2 and ucs4 library, but specialized
 to ASCII

ucs1, ucs2 and ucs4 libraries have to scan created substring to find the
maximum character, whereas it is not need to ASCII strings. Because ASCII
strings are common, it is useful to optimize ASCII.
---
 Include/unicodeobject.h      |   1 +
 Objects/stringlib/asciilib.h |  34 +++++++++
 Objects/unicodeobject.c      | 163 ++++++++++++++++++++++++++++++-------------
 Python/formatter_unicode.c   |   4 +-
 4 files changed, 153 insertions(+), 49 deletions(-)
 create mode 100644 Objects/stringlib/asciilib.h

diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h
index 0fd752f..6fdcd7b 100644
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@@ -1851,6 +1851,7 @@ PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGroupingLocale(Py_UNICODE *buff
    see Objects/stringlib/localeutil.h */
 #ifndef Py_LIMITED_API
 PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping(
+    PyObject *unicode,
     int kind,
     void *buffer,
     Py_ssize_t n_buffer,
diff --git a/Objects/stringlib/asciilib.h b/Objects/stringlib/asciilib.h
new file mode 100644
index 0000000..935a9c7
--- /dev/null
+++ b/Objects/stringlib/asciilib.h
@@ -0,0 +1,34 @@
+/* this is sort of a hack.  there's at least one place (formatting
+   floats) where some stringlib code takes a different path if it's
+   compiled as unicode. */
+#define STRINGLIB_IS_UNICODE     1
+
+#define FASTSEARCH               asciilib_fastsearch
+#define STRINGLIB(F)             asciilib_##F
+#define STRINGLIB_OBJECT         PyUnicodeObject
+#define STRINGLIB_CHAR           Py_UCS1
+#define STRINGLIB_TYPE_NAME      "unicode"
+#define STRINGLIB_PARSE_CODE     "U"
+#define STRINGLIB_EMPTY          unicode_empty
+#define STRINGLIB_ISSPACE        Py_UNICODE_ISSPACE
+#define STRINGLIB_ISLINEBREAK    BLOOM_LINEBREAK
+#define STRINGLIB_ISDECIMAL      Py_UNICODE_ISDECIMAL
+#define STRINGLIB_TODECIMAL      Py_UNICODE_TODECIMAL
+#define STRINGLIB_TOUPPER        Py_UNICODE_TOUPPER
+#define STRINGLIB_TOLOWER        Py_UNICODE_TOLOWER
+#define STRINGLIB_FILL           Py_UNICODE_FILL
+#define STRINGLIB_STR            PyUnicode_1BYTE_DATA
+#define STRINGLIB_LEN            PyUnicode_GET_LENGTH
+#define STRINGLIB_NEW            unicode_fromascii
+#define STRINGLIB_RESIZE         not_supported
+#define STRINGLIB_CHECK          PyUnicode_Check
+#define STRINGLIB_CHECK_EXACT    PyUnicode_CheckExact
+#define STRINGLIB_GROUPING       _PyUnicode_InsertThousandsGrouping
+#define STRINGLIB_GROUPING_LOCALE _PyUnicode_InsertThousandsGroupingLocale
+
+#define STRINGLIB_TOSTR          PyObject_Str
+#define STRINGLIB_TOASCII        PyObject_ASCII
+
+#define _Py_InsertThousandsGrouping _PyUnicode_ascii_InsertThousandsGrouping
+#define _Py_InsertThousandsGroupingLocale _PyUnicode_ascii_InsertThousandsGroupingLocale
+
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index f89a5f9..134ae29 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -8331,6 +8331,15 @@ PyUnicode_EncodeDecimal(Py_UNICODE *s,
 
 /* --- Helpers ------------------------------------------------------------ */
 
+#include "stringlib/asciilib.h"
+#include "stringlib/fastsearch.h"
+#include "stringlib/partition.h"
+#include "stringlib/split.h"
+#include "stringlib/count.h"
+#include "stringlib/find.h"
+#include "stringlib/localeutil.h"
+#include "stringlib/undef.h"
+
 #include "stringlib/ucs1lib.h"
 #include "stringlib/fastsearch.h"
 #include "stringlib/partition.h"
@@ -8359,7 +8368,10 @@ PyUnicode_EncodeDecimal(Py_UNICODE *s,
 #include "stringlib/undef.h"
 
 static Py_ssize_t
-any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
+any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ascii)(const Py_UCS1*, Py_ssize_t,
+                                  const Py_UCS1*, Py_ssize_t,
+                                  Py_ssize_t, Py_ssize_t),
+               Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
                                   const Py_UCS1*, Py_ssize_t,
                                   Py_ssize_t, Py_ssize_t),
                Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t,
@@ -8396,7 +8408,10 @@ any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
 
     switch(kind) {
     case PyUnicode_1BYTE_KIND:
-        result = ucs1(buf1, len1, buf2, len2, start, end);
+        if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
+            result = ascii(buf1, len1, buf2, len2, start, end);
+        else
+            result = ucs1(buf1, len1, buf2, len2, start, end);
         break;
     case PyUnicode_2BYTE_KIND:
         result = ucs2(buf1, len1, buf2, len2, start, end);
@@ -8417,7 +8432,7 @@ any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
 }
 
 Py_ssize_t
-_PyUnicode_InsertThousandsGrouping(int kind, void *data,
+_PyUnicode_InsertThousandsGrouping(PyObject *unicode, int kind, void *data,
                                    Py_ssize_t n_buffer,
                                    void *digits, Py_ssize_t n_digits,
                                    Py_ssize_t min_width,
@@ -8426,9 +8441,14 @@ _PyUnicode_InsertThousandsGrouping(int kind, void *data,
 {
     switch(kind) {
     case PyUnicode_1BYTE_KIND:
-        return _PyUnicode_ucs1_InsertThousandsGrouping(
-            (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
-            min_width, grouping, thousands_sep);
+        if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
+            return _PyUnicode_ascii_InsertThousandsGrouping(
+                (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
+                min_width, grouping, thousands_sep);
+        else
+            return _PyUnicode_ucs1_InsertThousandsGrouping(
+                (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
+                min_width, grouping, thousands_sep);
     case PyUnicode_2BYTE_KIND:
         return _PyUnicode_ucs2_InsertThousandsGrouping(
             (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
@@ -8505,10 +8525,16 @@ PyUnicode_Count(PyObject *str,
     ADJUST_INDICES(start, end, len1);
     switch(kind) {
     case PyUnicode_1BYTE_KIND:
-        result = ucs1lib_count(
-            ((Py_UCS1*)buf1) + start, end - start,
-            buf2, len2, PY_SSIZE_T_MAX
-            );
+        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
+            result = asciilib_count(
+                ((Py_UCS1*)buf1) + start, end - start,
+                buf2, len2, PY_SSIZE_T_MAX
+                );
+        else
+            result = ucs1lib_count(
+                ((Py_UCS1*)buf1) + start, end - start,
+                buf2, len2, PY_SSIZE_T_MAX
+                );
         break;
     case PyUnicode_2BYTE_KIND:
         result = ucs2lib_count(
@@ -8565,12 +8591,14 @@ PyUnicode_Find(PyObject *str,
 
     if (direction > 0)
         result = any_find_slice(
-            ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
+            asciilib_find_slice, ucs1lib_find_slice,
+            ucs2lib_find_slice, ucs4lib_find_slice,
             str, sub, start, end
             );
     else
         result = any_find_slice(
-            ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
+            asciilib_find_slice, ucs1lib_rfind_slice,
+            ucs2lib_rfind_slice, ucs4lib_rfind_slice,
             str, sub, start, end
             );
 
@@ -9200,9 +9228,14 @@ PyUnicode_Splitlines(PyObject *string, int keepends)
 
     switch(PyUnicode_KIND(string)) {
     case PyUnicode_1BYTE_KIND:
-        list = ucs1lib_splitlines(
-            (PyObject*) string, PyUnicode_1BYTE_DATA(string),
-            PyUnicode_GET_LENGTH(string), keepends);
+        if (PyUnicode_IS_ASCII(string))
+            list = asciilib_splitlines(
+                (PyObject*) string, PyUnicode_1BYTE_DATA(string),
+                PyUnicode_GET_LENGTH(string), keepends);
+        else
+            list = ucs1lib_splitlines(
+                (PyObject*) string, PyUnicode_1BYTE_DATA(string),
+                PyUnicode_GET_LENGTH(string), keepends);
         break;
     case PyUnicode_2BYTE_KIND:
         list = ucs2lib_splitlines(
@@ -9241,10 +9274,16 @@ split(PyObject *self,
     if (substring == NULL)
         switch(PyUnicode_KIND(self)) {
         case PyUnicode_1BYTE_KIND:
-            return ucs1lib_split_whitespace(
-                (PyObject*) self,  PyUnicode_1BYTE_DATA(self),
-                PyUnicode_GET_LENGTH(self), maxcount
-                );
+            if (PyUnicode_IS_ASCII(self))
+                return asciilib_split_whitespace(
+                    (PyObject*) self,  PyUnicode_1BYTE_DATA(self),
+                    PyUnicode_GET_LENGTH(self), maxcount
+                    );
+            else
+                return ucs1lib_split_whitespace(
+                    (PyObject*) self,  PyUnicode_1BYTE_DATA(self),
+                    PyUnicode_GET_LENGTH(self), maxcount
+                    );
         case PyUnicode_2BYTE_KIND:
             return ucs2lib_split_whitespace(
                 (PyObject*) self,  PyUnicode_2BYTE_DATA(self),
@@ -9283,8 +9322,12 @@ split(PyObject *self,
 
     switch(kind) {
     case PyUnicode_1BYTE_KIND:
-        out = ucs1lib_split(
-            (PyObject*) self,  buf1, len1, buf2, len2, maxcount);
+        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
+            out = asciilib_split(
+                (PyObject*) self,  buf1, len1, buf2, len2, maxcount);
+        else
+            out = ucs1lib_split(
+                (PyObject*) self,  buf1, len1, buf2, len2, maxcount);
         break;
     case PyUnicode_2BYTE_KIND:
         out = ucs2lib_split(
@@ -9323,10 +9366,16 @@ rsplit(PyObject *self,
     if (substring == NULL)
         switch(PyUnicode_KIND(self)) {
         case PyUnicode_1BYTE_KIND:
-            return ucs1lib_rsplit_whitespace(
-                (PyObject*) self,  PyUnicode_1BYTE_DATA(self),
-                PyUnicode_GET_LENGTH(self), maxcount
-                );
+            if (PyUnicode_IS_ASCII(self))
+                return asciilib_rsplit_whitespace(
+                    (PyObject*) self,  PyUnicode_1BYTE_DATA(self),
+                    PyUnicode_GET_LENGTH(self), maxcount
+                    );
+            else
+                return ucs1lib_rsplit_whitespace(
+                    (PyObject*) self,  PyUnicode_1BYTE_DATA(self),
+                    PyUnicode_GET_LENGTH(self), maxcount
+                    );
         case PyUnicode_2BYTE_KIND:
             return ucs2lib_rsplit_whitespace(
                 (PyObject*) self,  PyUnicode_2BYTE_DATA(self),
@@ -9365,8 +9414,12 @@ rsplit(PyObject *self,
 
     switch(kind) {
     case PyUnicode_1BYTE_KIND:
-        out = ucs1lib_rsplit(
-            (PyObject*) self,  buf1, len1, buf2, len2, maxcount);
+        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
+            out = asciilib_rsplit(
+                (PyObject*) self,  buf1, len1, buf2, len2, maxcount);
+        else
+            out = ucs1lib_rsplit(
+                (PyObject*) self,  buf1, len1, buf2, len2, maxcount);
         break;
     case PyUnicode_2BYTE_KIND:
         out = ucs2lib_rsplit(
@@ -9387,12 +9440,15 @@ rsplit(PyObject *self,
 }
 
 static Py_ssize_t
-anylib_find(int kind, void *buf1, Py_ssize_t len1,
-            void *buf2, Py_ssize_t len2, Py_ssize_t offset)
+anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
+            PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
 {
     switch(kind) {
     case PyUnicode_1BYTE_KIND:
-        return ucs1lib_find(buf1, len1, buf2, len2, offset);
+        if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
+            return asciilib_find(buf1, len1, buf2, len2, offset);
+        else
+            return ucs1lib_find(buf1, len1, buf2, len2, offset);
     case PyUnicode_2BYTE_KIND:
         return ucs2lib_find(buf1, len1, buf2, len2, offset);
     case PyUnicode_4BYTE_KIND:
@@ -9403,12 +9459,15 @@ anylib_find(int kind, void *buf1, Py_ssize_t len1,
 }
 
 static Py_ssize_t
-anylib_count(int kind, void* sbuf, Py_ssize_t slen,
-             void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
+anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
+             PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
 {
         switch(kind) {
         case PyUnicode_1BYTE_KIND:
-            return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
+            if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
+                return asciilib_count(sbuf, slen, buf1, len1, maxcount);
+            else
+                return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
         case PyUnicode_2BYTE_KIND:
             return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
         case PyUnicode_4BYTE_KIND:
@@ -9497,7 +9556,7 @@ replace(PyObject *self, PyObject *str1,
                 if (!buf1) goto error;
                 release1 = 1;
             }
-            i = anylib_find(rkind, sbuf, slen, buf1, len1, 0);
+            i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
             if (i < 0)
                 goto nothing;
             if (rkind > kind2) {
@@ -9530,9 +9589,9 @@ replace(PyObject *self, PyObject *str1,
             i += len1;
 
             while ( --maxcount > 0) {
-                i = anylib_find(rkind, sbuf+PyUnicode_KIND_SIZE(rkind, i),
-                                slen-i,
-                                buf1, len1, i);
+                i = anylib_find(rkind, self,
+                                sbuf+PyUnicode_KIND_SIZE(rkind, i), slen-i,
+                                str1, buf1, len1, i);
                 if (i == -1)
                     break;
                 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
@@ -9557,7 +9616,7 @@ replace(PyObject *self, PyObject *str1,
             if (!buf1) goto error;
             release1 = 1;
         }
-        n = anylib_count(rkind, sbuf, slen, buf1, len1, maxcount);
+        n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
         if (n == 0)
             goto nothing;
         if (kind2 < rkind) {
@@ -9596,9 +9655,9 @@ replace(PyObject *self, PyObject *str1,
         if (len1 > 0) {
             while (n-- > 0) {
                 /* look for next match */
-                j = anylib_find(rkind,
-                                sbuf + PyUnicode_KIND_SIZE(rkind, i),
-                                slen-i, buf1, len1, i);
+                j = anylib_find(rkind, self,
+                                sbuf + PyUnicode_KIND_SIZE(rkind, i), slen-i,
+                                str1, buf1, len1, i);
                 if (j == -1)
                     break;
                 else if (j > i) {
@@ -10443,7 +10502,8 @@ unicode_find(PyObject *self, PyObject *args)
         return NULL;
 
     result = any_find_slice(
-        ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
+        asciilib_find_slice, ucs1lib_find_slice,
+        ucs2lib_find_slice, ucs4lib_find_slice,
         self, (PyObject*)substring, start, end
         );
 
@@ -10536,7 +10596,8 @@ unicode_index(PyObject *self, PyObject *args)
         return NULL;
 
     result = any_find_slice(
-        ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
+        asciilib_find_slice, ucs1lib_find_slice,
+        ucs2lib_find_slice, ucs4lib_find_slice,
         self, (PyObject*)substring, start, end
         );
 
@@ -11548,7 +11609,8 @@ unicode_rfind(PyObject *self, PyObject *args)
         return NULL;
 
     result = any_find_slice(
-        ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
+        asciilib_rfind_slice, ucs1lib_rfind_slice,
+        ucs2lib_rfind_slice, ucs4lib_rfind_slice,
         self, (PyObject*)substring, start, end
         );
 
@@ -11583,7 +11645,8 @@ unicode_rindex(PyObject *self, PyObject *args)
         return NULL;
 
     result = any_find_slice(
-        ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
+        asciilib_rfind_slice, ucs1lib_rfind_slice,
+        ucs2lib_rfind_slice, ucs4lib_rfind_slice,
         self, (PyObject*)substring, start, end
         );
 
@@ -11712,7 +11775,10 @@ PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
 
     switch(PyUnicode_KIND(str_obj)) {
     case PyUnicode_1BYTE_KIND:
-        out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
+        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
+            out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
+        else
+            out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
         break;
     case PyUnicode_2BYTE_KIND:
         out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
@@ -11781,7 +11847,10 @@ PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
 
     switch(PyUnicode_KIND(str_in)) {
     case PyUnicode_1BYTE_KIND:
-        out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
+        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
+            out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
+        else
+            out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
         break;
     case PyUnicode_2BYTE_KIND:
         out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
diff --git a/Python/formatter_unicode.c b/Python/formatter_unicode.c
index 609df64..c989d83 100644
--- a/Python/formatter_unicode.c
+++ b/Python/formatter_unicode.c
@@ -501,7 +501,7 @@ calc_number_widths(NumberFieldWidths *spec, Py_ssize_t n_prefix,
         spec->n_grouped_digits = 0;
     else
         spec->n_grouped_digits = _PyUnicode_InsertThousandsGrouping(
-            PyUnicode_1BYTE_KIND, NULL, 0, NULL,
+            NULL, PyUnicode_1BYTE_KIND, NULL, 0, NULL,
             spec->n_digits, spec->n_min_width,
             locale->grouping, locale->thousands_sep);
 
@@ -603,7 +603,7 @@ fill_number(PyObject *out, Py_ssize_t pos, const NumberFieldWidths *spec,
         r =
 #endif
             _PyUnicode_InsertThousandsGrouping(
-                kind,
+                out, kind,
                 (char*)data + PyUnicode_KIND_SIZE(kind, pos),
                 spec->n_grouped_digits,
                 pdigits + PyUnicode_KIND_SIZE(kind, d_pos),
-- 
cgit v0.12