1/* stringlib: split implementation */
2
3#ifndef STRINGLIB_SPLIT_H
4#define STRINGLIB_SPLIT_H
5
6#ifndef STRINGLIB_FASTSEARCH_H
7#error must include "stringlib/fastsearch.h" before including this module
8#endif
9
10/* Overallocate the initial list to reduce the number of reallocs for small
11   split sizes.  Eg, "A A A A A A A A A A".split() (10 elements) has three
12   resizes, to sizes 4, 8, then 16.  Most observed string splits are for human
13   text (roughly 11 words per line) and field delimited data (usually 1-10
14   fields).  For large strings the split algorithms are bandwidth limited
15   so increasing the preallocation likely will not improve things.*/
16
17#define MAX_PREALLOC 12
18
19/* 5 splits gives 6 elements */
20#define PREALLOC_SIZE(maxsplit) \
21    (maxsplit >= MAX_PREALLOC ? MAX_PREALLOC : maxsplit+1)
22
23#define SPLIT_APPEND(data, left, right)         \
24    sub = STRINGLIB_NEW((data) + (left),        \
25                        (right) - (left));      \
26    if (sub == NULL)                            \
27        goto onError;                           \
28    if (PyList_Append(list, sub)) {             \
29        Py_DECREF(sub);                         \
30        goto onError;                           \
31    }                                           \
32    else                                        \
33        Py_DECREF(sub);
34
35#define SPLIT_ADD(data, left, right) {          \
36    sub = STRINGLIB_NEW((data) + (left),        \
37                        (right) - (left));      \
38    if (sub == NULL)                            \
39        goto onError;                           \
40    if (count < MAX_PREALLOC) {                 \
41        PyList_SET_ITEM(list, count, sub);      \
42    } else {                                    \
43        if (PyList_Append(list, sub)) {         \
44            Py_DECREF(sub);                     \
45            goto onError;                       \
46        }                                       \
47        else                                    \
48            Py_DECREF(sub);                     \
49    }                                           \
50    count++; }
51
52
53/* Always force the list to the expected size. */
54#define FIX_PREALLOC_SIZE(list) Py_SIZE(list) = count
55
56Py_LOCAL_INLINE(PyObject *)
57stringlib_split_whitespace(PyObject* str_obj,
58                           const STRINGLIB_CHAR* str, Py_ssize_t str_len,
59                           Py_ssize_t maxcount)
60{
61    Py_ssize_t i, j, count=0;
62    PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
63    PyObject *sub;
64
65    if (list == NULL)
66        return NULL;
67
68    i = j = 0;
69    while (maxcount-- > 0) {
70        while (i < str_len && STRINGLIB_ISSPACE(str[i]))
71            i++;
72        if (i == str_len) break;
73        j = i; i++;
74        while (i < str_len && !STRINGLIB_ISSPACE(str[i]))
75            i++;
76#ifndef STRINGLIB_MUTABLE
77        if (j == 0 && i == str_len && STRINGLIB_CHECK_EXACT(str_obj)) {
78            /* No whitespace in str_obj, so just use it as list[0] */
79            Py_INCREF(str_obj);
80            PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
81            count++;
82            break;
83        }
84#endif
85        SPLIT_ADD(str, j, i);
86    }
87
88    if (i < str_len) {
89        /* Only occurs when maxcount was reached */
90        /* Skip any remaining whitespace and copy to end of string */
91        while (i < str_len && STRINGLIB_ISSPACE(str[i]))
92            i++;
93        if (i != str_len)
94            SPLIT_ADD(str, i, str_len);
95    }
96    FIX_PREALLOC_SIZE(list);
97    return list;
98
99  onError:
100    Py_DECREF(list);
101    return NULL;
102}
103
104Py_LOCAL_INLINE(PyObject *)
105stringlib_split_char(PyObject* str_obj,
106                     const STRINGLIB_CHAR* str, Py_ssize_t str_len,
107                     const STRINGLIB_CHAR ch,
108                     Py_ssize_t maxcount)
109{
110    Py_ssize_t i, j, count=0;
111    PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
112    PyObject *sub;
113
114    if (list == NULL)
115        return NULL;
116
117    i = j = 0;
118    while ((j < str_len) && (maxcount-- > 0)) {
119        for(; j < str_len; j++) {
120            /* I found that using memchr makes no difference */
121            if (str[j] == ch) {
122                SPLIT_ADD(str, i, j);
123                i = j = j + 1;
124                break;
125            }
126        }
127    }
128#ifndef STRINGLIB_MUTABLE
129    if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
130        /* ch not in str_obj, so just use str_obj as list[0] */
131        Py_INCREF(str_obj);
132        PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
133        count++;
134    } else
135#endif
136    if (i <= str_len) {
137        SPLIT_ADD(str, i, str_len);
138    }
139    FIX_PREALLOC_SIZE(list);
140    return list;
141
142  onError:
143    Py_DECREF(list);
144    return NULL;
145}
146
147Py_LOCAL_INLINE(PyObject *)
148stringlib_split(PyObject* str_obj,
149                const STRINGLIB_CHAR* str, Py_ssize_t str_len,
150                const STRINGLIB_CHAR* sep, Py_ssize_t sep_len,
151                Py_ssize_t maxcount)
152{
153    Py_ssize_t i, j, pos, count=0;
154    PyObject *list, *sub;
155
156    if (sep_len == 0) {
157        PyErr_SetString(PyExc_ValueError, "empty separator");
158        return NULL;
159    }
160    else if (sep_len == 1)
161        return stringlib_split_char(str_obj, str, str_len, sep[0], maxcount);
162
163    list = PyList_New(PREALLOC_SIZE(maxcount));
164    if (list == NULL)
165        return NULL;
166
167    i = j = 0;
168    while (maxcount-- > 0) {
169        pos = fastsearch(str+i, str_len-i, sep, sep_len, -1, FAST_SEARCH);
170        if (pos < 0)
171            break;
172        j = i + pos;
173        SPLIT_ADD(str, i, j);
174        i = j + sep_len;
175    }
176#ifndef STRINGLIB_MUTABLE
177    if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
178        /* No match in str_obj, so just use it as list[0] */
179        Py_INCREF(str_obj);
180        PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
181        count++;
182    } else
183#endif
184    {
185        SPLIT_ADD(str, i, str_len);
186    }
187    FIX_PREALLOC_SIZE(list);
188    return list;
189
190  onError:
191    Py_DECREF(list);
192    return NULL;
193}
194
195Py_LOCAL_INLINE(PyObject *)
196stringlib_rsplit_whitespace(PyObject* str_obj,
197                            const STRINGLIB_CHAR* str, Py_ssize_t str_len,
198                            Py_ssize_t maxcount)
199{
200    Py_ssize_t i, j, count=0;
201    PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
202    PyObject *sub;
203
204    if (list == NULL)
205        return NULL;
206
207    i = j = str_len - 1;
208    while (maxcount-- > 0) {
209        while (i >= 0 && STRINGLIB_ISSPACE(str[i]))
210            i--;
211        if (i < 0) break;
212        j = i; i--;
213        while (i >= 0 && !STRINGLIB_ISSPACE(str[i]))
214            i--;
215#ifndef STRINGLIB_MUTABLE
216        if (j == str_len - 1 && i < 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
217            /* No whitespace in str_obj, so just use it as list[0] */
218            Py_INCREF(str_obj);
219            PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
220            count++;
221            break;
222        }
223#endif
224        SPLIT_ADD(str, i + 1, j + 1);
225    }
226
227    if (i >= 0) {
228        /* Only occurs when maxcount was reached */
229        /* Skip any remaining whitespace and copy to beginning of string */
230        while (i >= 0 && STRINGLIB_ISSPACE(str[i]))
231            i--;
232        if (i >= 0)
233            SPLIT_ADD(str, 0, i + 1);
234    }
235    FIX_PREALLOC_SIZE(list);
236    if (PyList_Reverse(list) < 0)
237        goto onError;
238    return list;
239
240  onError:
241    Py_DECREF(list);
242    return NULL;
243}
244
245Py_LOCAL_INLINE(PyObject *)
246stringlib_rsplit_char(PyObject* str_obj,
247                      const STRINGLIB_CHAR* str, Py_ssize_t str_len,
248                      const STRINGLIB_CHAR ch,
249                      Py_ssize_t maxcount)
250{
251    Py_ssize_t i, j, count=0;
252    PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
253    PyObject *sub;
254
255    if (list == NULL)
256        return NULL;
257
258    i = j = str_len - 1;
259    while ((i >= 0) && (maxcount-- > 0)) {
260        for(; i >= 0; i--) {
261            if (str[i] == ch) {
262                SPLIT_ADD(str, i + 1, j + 1);
263                j = i = i - 1;
264                break;
265            }
266        }
267    }
268#ifndef STRINGLIB_MUTABLE
269    if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
270        /* ch not in str_obj, so just use str_obj as list[0] */
271        Py_INCREF(str_obj);
272        PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
273        count++;
274    } else
275#endif
276    if (j >= -1) {
277        SPLIT_ADD(str, 0, j + 1);
278    }
279    FIX_PREALLOC_SIZE(list);
280    if (PyList_Reverse(list) < 0)
281        goto onError;
282    return list;
283
284  onError:
285    Py_DECREF(list);
286    return NULL;
287}
288
289Py_LOCAL_INLINE(PyObject *)
290stringlib_rsplit(PyObject* str_obj,
291                 const STRINGLIB_CHAR* str, Py_ssize_t str_len,
292                 const STRINGLIB_CHAR* sep, Py_ssize_t sep_len,
293                 Py_ssize_t maxcount)
294{
295    Py_ssize_t j, pos, count=0;
296    PyObject *list, *sub;
297
298    if (sep_len == 0) {
299        PyErr_SetString(PyExc_ValueError, "empty separator");
300        return NULL;
301    }
302    else if (sep_len == 1)
303        return stringlib_rsplit_char(str_obj, str, str_len, sep[0], maxcount);
304
305    list = PyList_New(PREALLOC_SIZE(maxcount));
306    if (list == NULL)
307        return NULL;
308
309    j = str_len;
310    while (maxcount-- > 0) {
311        pos = fastsearch(str, j, sep, sep_len, -1, FAST_RSEARCH);
312        if (pos < 0)
313            break;
314        SPLIT_ADD(str, pos + sep_len, j);
315        j = pos;
316    }
317#ifndef STRINGLIB_MUTABLE
318    if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
319        /* No match in str_obj, so just use it as list[0] */
320        Py_INCREF(str_obj);
321        PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
322        count++;
323    } else
324#endif
325    {
326        SPLIT_ADD(str, 0, j);
327    }
328    FIX_PREALLOC_SIZE(list);
329    if (PyList_Reverse(list) < 0)
330        goto onError;
331    return list;
332
333  onError:
334    Py_DECREF(list);
335    return NULL;
336}
337
338Py_LOCAL_INLINE(PyObject *)
339stringlib_splitlines(PyObject* str_obj,
340                     const STRINGLIB_CHAR* str, Py_ssize_t str_len,
341                     int keepends)
342{
343    /* This does not use the preallocated list because splitlines is
344       usually run with hundreds of newlines.  The overhead of
345       switching between PyList_SET_ITEM and append causes about a
346       2-3% slowdown for that common case.  A smarter implementation
347       could move the if check out, so the SET_ITEMs are done first
348       and the appends only done when the prealloc buffer is full.
349       That's too much work for little gain.*/
350
351    register Py_ssize_t i;
352    register Py_ssize_t j;
353    PyObject *list = PyList_New(0);
354    PyObject *sub;
355
356    if (list == NULL)
357        return NULL;
358
359    for (i = j = 0; i < str_len; ) {
360        Py_ssize_t eol;
361
362        /* Find a line and append it */
363        while (i < str_len && !STRINGLIB_ISLINEBREAK(str[i]))
364            i++;
365
366        /* Skip the line break reading CRLF as one line break */
367        eol = i;
368        if (i < str_len) {
369            if (str[i] == '\r' && i + 1 < str_len && str[i+1] == '\n')
370                i += 2;
371            else
372                i++;
373            if (keepends)
374                eol = i;
375        }
376#ifndef STRINGLIB_MUTABLE
377        if (j == 0 && eol == str_len && STRINGLIB_CHECK_EXACT(str_obj)) {
378            /* No linebreak in str_obj, so just use it as list[0] */
379            if (PyList_Append(list, str_obj))
380                goto onError;
381            break;
382        }
383#endif
384        SPLIT_APPEND(str, j, eol);
385        j = i;
386    }
387    return list;
388
389  onError:
390    Py_DECREF(list);
391    return NULL;
392}
393
394#endif
395