diff options
Diffstat (limited to 'Objects/stringlib/split.h')
| -rw-r--r-- | Objects/stringlib/split.h | 394 | 
1 files changed, 394 insertions, 0 deletions
diff --git a/Objects/stringlib/split.h b/Objects/stringlib/split.h new file mode 100644 index 0000000..60e7767 --- /dev/null +++ b/Objects/stringlib/split.h @@ -0,0 +1,394 @@ +/* stringlib: split implementation */ + +#ifndef STRINGLIB_SPLIT_H +#define STRINGLIB_SPLIT_H + +#ifndef STRINGLIB_FASTSEARCH_H +#error must include "stringlib/fastsearch.h" before including this module +#endif + +/* Overallocate the initial list to reduce the number of reallocs for small +   split sizes.  Eg, "A A A A A A A A A A".split() (10 elements) has three +   resizes, to sizes 4, 8, then 16.  Most observed string splits are for human +   text (roughly 11 words per line) and field delimited data (usually 1-10 +   fields).  For large strings the split algorithms are bandwidth limited +   so increasing the preallocation likely will not improve things.*/ + +#define MAX_PREALLOC 12 + +/* 5 splits gives 6 elements */ +#define PREALLOC_SIZE(maxsplit) \ +    (maxsplit >= MAX_PREALLOC ? MAX_PREALLOC : maxsplit+1) + +#define SPLIT_APPEND(data, left, right)         \ +    sub = STRINGLIB_NEW((data) + (left),        \ +                        (right) - (left));      \ +    if (sub == NULL)                            \ +        goto onError;                           \ +    if (PyList_Append(list, sub)) {             \ +        Py_DECREF(sub);                         \ +        goto onError;                           \ +    }                                           \ +    else                                        \ +        Py_DECREF(sub); + +#define SPLIT_ADD(data, left, right) {          \ +    sub = STRINGLIB_NEW((data) + (left),        \ +                        (right) - (left));      \ +    if (sub == NULL)                            \ +        goto onError;                           \ +    if (count < MAX_PREALLOC) {                 \ +        PyList_SET_ITEM(list, count, sub);      \ +    } else {                                    \ +        if (PyList_Append(list, sub)) {         \ +            Py_DECREF(sub);                     \ +            goto onError;                       \ +        }                                       \ +        else                                    \ +            Py_DECREF(sub);                     \ +    }                                           \ +    count++; } + + +/* Always force the list to the expected size. */ +#define FIX_PREALLOC_SIZE(list) Py_SIZE(list) = count + +Py_LOCAL_INLINE(PyObject *) +stringlib_split_whitespace(PyObject* str_obj, +                           const STRINGLIB_CHAR* str, Py_ssize_t str_len, +                           Py_ssize_t maxcount) +{ +    Py_ssize_t i, j, count=0; +    PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); +    PyObject *sub; + +    if (list == NULL) +        return NULL; + +    i = j = 0; +    while (maxcount-- > 0) { +        while (i < str_len && STRINGLIB_ISSPACE(str[i])) +            i++; +        if (i == str_len) break; +        j = i; i++; +        while (i < str_len && !STRINGLIB_ISSPACE(str[i])) +            i++; +#ifndef STRINGLIB_MUTABLE +        if (j == 0 && i == str_len && STRINGLIB_CHECK_EXACT(str_obj)) { +            /* No whitespace in str_obj, so just use it as list[0] */ +            Py_INCREF(str_obj); +            PyList_SET_ITEM(list, 0, (PyObject *)str_obj); +            count++; +            break; +        } +#endif +        SPLIT_ADD(str, j, i); +    } + +    if (i < str_len) { +        /* Only occurs when maxcount was reached */ +        /* Skip any remaining whitespace and copy to end of string */ +        while (i < str_len && STRINGLIB_ISSPACE(str[i])) +            i++; +        if (i != str_len) +            SPLIT_ADD(str, i, str_len); +    } +    FIX_PREALLOC_SIZE(list); +    return list; + +  onError: +    Py_DECREF(list); +    return NULL; +} + +Py_LOCAL_INLINE(PyObject *) +stringlib_split_char(PyObject* str_obj, +                     const STRINGLIB_CHAR* str, Py_ssize_t str_len, +                     const STRINGLIB_CHAR ch, +                     Py_ssize_t maxcount) +{ +    Py_ssize_t i, j, count=0; +    PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); +    PyObject *sub; + +    if (list == NULL) +        return NULL; + +    i = j = 0; +    while ((j < str_len) && (maxcount-- > 0)) { +        for(; j < str_len; j++) { +            /* I found that using memchr makes no difference */ +            if (str[j] == ch) { +                SPLIT_ADD(str, i, j); +                i = j = j + 1; +                break; +            } +        } +    } +#ifndef STRINGLIB_MUTABLE +    if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { +        /* ch not in str_obj, so just use str_obj as list[0] */ +        Py_INCREF(str_obj); +        PyList_SET_ITEM(list, 0, (PyObject *)str_obj); +        count++; +    } else +#endif +    if (i <= str_len) { +        SPLIT_ADD(str, i, str_len); +    } +    FIX_PREALLOC_SIZE(list); +    return list; + +  onError: +    Py_DECREF(list); +    return NULL; +} + +Py_LOCAL_INLINE(PyObject *) +stringlib_split(PyObject* str_obj, +                const STRINGLIB_CHAR* str, Py_ssize_t str_len, +                const STRINGLIB_CHAR* sep, Py_ssize_t sep_len, +                Py_ssize_t maxcount) +{ +    Py_ssize_t i, j, pos, count=0; +    PyObject *list, *sub; + +    if (sep_len == 0) { +        PyErr_SetString(PyExc_ValueError, "empty separator"); +        return NULL; +    } +    else if (sep_len == 1) +        return stringlib_split_char(str_obj, str, str_len, sep[0], maxcount); + +    list = PyList_New(PREALLOC_SIZE(maxcount)); +    if (list == NULL) +        return NULL; + +    i = j = 0; +    while (maxcount-- > 0) { +        pos = fastsearch(str+i, str_len-i, sep, sep_len, -1, FAST_SEARCH); +        if (pos < 0) +            break; +        j = i + pos; +        SPLIT_ADD(str, i, j); +        i = j + sep_len; +    } +#ifndef STRINGLIB_MUTABLE +    if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { +        /* No match in str_obj, so just use it as list[0] */ +        Py_INCREF(str_obj); +        PyList_SET_ITEM(list, 0, (PyObject *)str_obj); +        count++; +    } else +#endif +    { +        SPLIT_ADD(str, i, str_len); +    } +    FIX_PREALLOC_SIZE(list); +    return list; + +  onError: +    Py_DECREF(list); +    return NULL; +} + +Py_LOCAL_INLINE(PyObject *) +stringlib_rsplit_whitespace(PyObject* str_obj, +                            const STRINGLIB_CHAR* str, Py_ssize_t str_len, +                            Py_ssize_t maxcount) +{ +    Py_ssize_t i, j, count=0; +    PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); +    PyObject *sub; + +    if (list == NULL) +        return NULL; + +    i = j = str_len - 1; +    while (maxcount-- > 0) { +        while (i >= 0 && STRINGLIB_ISSPACE(str[i])) +            i--; +        if (i < 0) break; +        j = i; i--; +        while (i >= 0 && !STRINGLIB_ISSPACE(str[i])) +            i--; +#ifndef STRINGLIB_MUTABLE +        if (j == str_len - 1 && i < 0 && STRINGLIB_CHECK_EXACT(str_obj)) { +            /* No whitespace in str_obj, so just use it as list[0] */ +            Py_INCREF(str_obj); +            PyList_SET_ITEM(list, 0, (PyObject *)str_obj); +            count++; +            break; +        } +#endif +        SPLIT_ADD(str, i + 1, j + 1); +    } + +    if (i >= 0) { +        /* Only occurs when maxcount was reached */ +        /* Skip any remaining whitespace and copy to beginning of string */ +        while (i >= 0 && STRINGLIB_ISSPACE(str[i])) +            i--; +        if (i >= 0) +            SPLIT_ADD(str, 0, i + 1); +    } +    FIX_PREALLOC_SIZE(list); +    if (PyList_Reverse(list) < 0) +        goto onError; +    return list; + +  onError: +    Py_DECREF(list); +    return NULL; +} + +Py_LOCAL_INLINE(PyObject *) +stringlib_rsplit_char(PyObject* str_obj, +                      const STRINGLIB_CHAR* str, Py_ssize_t str_len, +                      const STRINGLIB_CHAR ch, +                      Py_ssize_t maxcount) +{ +    Py_ssize_t i, j, count=0; +    PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); +    PyObject *sub; + +    if (list == NULL) +        return NULL; + +    i = j = str_len - 1; +    while ((i >= 0) && (maxcount-- > 0)) { +        for(; i >= 0; i--) { +            if (str[i] == ch) { +                SPLIT_ADD(str, i + 1, j + 1); +                j = i = i - 1; +                break; +            } +        } +    } +#ifndef STRINGLIB_MUTABLE +    if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { +        /* ch not in str_obj, so just use str_obj as list[0] */ +        Py_INCREF(str_obj); +        PyList_SET_ITEM(list, 0, (PyObject *)str_obj); +        count++; +    } else +#endif +    if (j >= -1) { +        SPLIT_ADD(str, 0, j + 1); +    } +    FIX_PREALLOC_SIZE(list); +    if (PyList_Reverse(list) < 0) +        goto onError; +    return list; + +  onError: +    Py_DECREF(list); +    return NULL; +} + +Py_LOCAL_INLINE(PyObject *) +stringlib_rsplit(PyObject* str_obj, +                 const STRINGLIB_CHAR* str, Py_ssize_t str_len, +                 const STRINGLIB_CHAR* sep, Py_ssize_t sep_len, +                 Py_ssize_t maxcount) +{ +    Py_ssize_t j, pos, count=0; +    PyObject *list, *sub; + +    if (sep_len == 0) { +        PyErr_SetString(PyExc_ValueError, "empty separator"); +        return NULL; +    } +    else if (sep_len == 1) +        return stringlib_rsplit_char(str_obj, str, str_len, sep[0], maxcount); + +    list = PyList_New(PREALLOC_SIZE(maxcount)); +    if (list == NULL) +        return NULL; + +    j = str_len; +    while (maxcount-- > 0) { +        pos = fastsearch(str, j, sep, sep_len, -1, FAST_RSEARCH); +        if (pos < 0) +            break; +        SPLIT_ADD(str, pos + sep_len, j); +        j = pos; +    } +#ifndef STRINGLIB_MUTABLE +    if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { +        /* No match in str_obj, so just use it as list[0] */ +        Py_INCREF(str_obj); +        PyList_SET_ITEM(list, 0, (PyObject *)str_obj); +        count++; +    } else +#endif +    { +        SPLIT_ADD(str, 0, j); +    } +    FIX_PREALLOC_SIZE(list); +    if (PyList_Reverse(list) < 0) +        goto onError; +    return list; + +  onError: +    Py_DECREF(list); +    return NULL; +} + +Py_LOCAL_INLINE(PyObject *) +stringlib_splitlines(PyObject* str_obj, +                     const STRINGLIB_CHAR* str, Py_ssize_t str_len, +                     int keepends) +{ +    /* This does not use the preallocated list because splitlines is +       usually run with hundreds of newlines.  The overhead of +       switching between PyList_SET_ITEM and append causes about a +       2-3% slowdown for that common case.  A smarter implementation +       could move the if check out, so the SET_ITEMs are done first +       and the appends only done when the prealloc buffer is full. +       That's too much work for little gain.*/ + +    register Py_ssize_t i; +    register Py_ssize_t j; +    PyObject *list = PyList_New(0); +    PyObject *sub; + +    if (list == NULL) +        return NULL; + +    for (i = j = 0; i < str_len; ) { +        Py_ssize_t eol; + +        /* Find a line and append it */ +        while (i < str_len && !STRINGLIB_ISLINEBREAK(str[i])) +            i++; + +        /* Skip the line break reading CRLF as one line break */ +        eol = i; +        if (i < str_len) { +            if (str[i] == '\r' && i + 1 < str_len && str[i+1] == '\n') +                i += 2; +            else +                i++; +            if (keepends) +                eol = i; +        } +#ifndef STRINGLIB_MUTABLE +        if (j == 0 && eol == str_len && STRINGLIB_CHECK_EXACT(str_obj)) { +            /* No linebreak in str_obj, so just use it as list[0] */ +            if (PyList_Append(list, str_obj)) +                goto onError; +            break; +        } +#endif +        SPLIT_APPEND(str, j, eol); +        j = i; +    } +    return list; + +  onError: +    Py_DECREF(list); +    return NULL; +} + +#endif  | 
