1/*
2**************************************************************************
3*    Copyright (C) 2016 and later: Unicode, Inc. and others.
4*    License & terms of use: http://www.unicode.org/copyright.html#License
5**************************************************************************
6**************************************************************************
7*   Copyright (C) 2014, International Business Machines
8*   Corporation and others.  All Rights Reserved.
9**************************************************************************
10*   file name:  unisetperf.cpp
11*   encoding:   US-ASCII
12*   tab size:   8 (not used)
13*   indentation:4
14*
15*   created on: 2007jan31
16*   created by: Markus Scherer
17*/
18
19#include <stdio.h>
20#include <stdlib.h>
21#include <string.h>
22#include "unicode/uperf.h"
23#include "unicode/uniset.h"
24#include "unicode/unistr.h"
25#include "uoptions.h"
26#include "cmemory.h" // for UPRV_LENGTHOF
27
// Command-line options specific to unisetperf.
// Options do not have abbreviations: Force readable command lines.
// (Using U+0001 for abbreviation characters.)
//
// The enumerators are used as indexes into the options[] array that follows;
// the last enumerator is the number of options and sizes that array.
enum {
    SET_PATTERN,                // index of the "pattern" option
    FAST_TYPE,                  // index of the "type" option
    UNISETPERF_OPTIONS_COUNT    // number of unisetperf-specific options
};
36
// Option table handed to the UPerfTest constructor.
// Entries are in the order of the option-index enum, and both options
// require an argument. main() stores the default values in the .value fields
// before it constructs the test object.
static UOption options[UNISETPERF_OPTIONS_COUNT]={
    UOPTION_DEF("pattern", '\x01', UOPT_REQUIRES_ARG),
    UOPTION_DEF("type",    '\x01', UOPT_REQUIRES_ARG)
};
41
// Help text describing the unisetperf-specific options and their defaults;
// it is passed to the UPerfTest constructor together with the option table.
static const char *const unisetperf_usage =
    "\t--pattern   UnicodeSet pattern for instantiation.\n"
    "\t            Default: [:ID_Continue:]\n"
    "\t--type      Type of UnicodeSet: slow fast\n"
    "\t            Default: slow\n";
47
48// Test object with setup data.
49class UnicodeSetPerformanceTest : public UPerfTest {
50public:
51    UnicodeSetPerformanceTest(int32_t argc, const char *argv[], UErrorCode &status)
52            : UPerfTest(argc, argv, options, UPRV_LENGTHOF(options), unisetperf_usage, status),
53              utf8(NULL), utf8Length(0), countInputCodePoints(0), spanCount(0) {
54        if (U_SUCCESS(status)) {
55            UnicodeString pattern=UnicodeString(options[SET_PATTERN].value, -1, US_INV).unescape();
56            set.applyPattern(pattern, status);
57            prefrozen=set;
58            if(0==strcmp(options[FAST_TYPE].value, "fast")) {
59                set.freeze();
60            }
61
62            int32_t inputLength;
63            UPerfTest::getBuffer(inputLength, status);
64            if(U_SUCCESS(status) && inputLength>0) {
65                countInputCodePoints = u_countChar32(buffer, bufferLen);
66
67                countSpans();
68
69                // Preflight the UTF-8 length and allocate utf8.
70                u_strToUTF8(NULL, 0, &utf8Length, buffer, bufferLen, &status);
71                if(status==U_BUFFER_OVERFLOW_ERROR) {
72                    utf8=(char *)malloc(utf8Length);
73                    if(utf8!=NULL) {
74                        status=U_ZERO_ERROR;
75                        u_strToUTF8(utf8, utf8Length, NULL, buffer, bufferLen, &status);
76                    } else {
77                        status=U_MEMORY_ALLOCATION_ERROR;
78                    }
79                }
80
81                if(verbose) {
82                    printf("code points:%ld  len16:%ld  len8:%ld  spans:%ld  "
83                           "cp/span:%.3g  UChar/span:%.3g  B/span:%.3g  B/cp:%.3g\n",
84                           (long)countInputCodePoints, (long)bufferLen, (long)utf8Length, (long)spanCount,
85                           (double)countInputCodePoints/spanCount, (double)bufferLen/spanCount, (double)utf8Length/spanCount,
86                           (double)utf8Length/countInputCodePoints);
87                }
88            }
89        }
90    }
91
92    virtual UPerfFunction* runIndexedTest(int32_t index, UBool exec, const char* &name, char* par = NULL);
93
94    // Count spans of characters that are in the set,
95    // and spans of characters that are not in the set.
96    // If the very first character is in the set, then one additional
97    // not-span is counted.
98    void countSpans() {
99        const UChar *s=getBuffer();
100        int32_t length=getBufferLen();
101        int32_t i=0;
102        UBool tf=FALSE;
103        while(i<length) {
104            i=span(s, length, i, tf);
105            tf=(UBool)(!tf);
106            ++spanCount;
107        }
108    }
109    int32_t span(const UChar *s, int32_t length, int32_t start, UBool tf) const {
110        UChar32 c;
111        int32_t prev;
112        while((prev=start)<length) {
113            U16_NEXT(s, start, length, c);
114            if(tf!=set.contains(c)) {
115                break;
116            }
117        }
118        return prev;
119    }
120
121    const UChar *getBuffer() const { return buffer; }
122    int32_t getBufferLen() const { return bufferLen; }
123
124    char *utf8;
125    int32_t utf8Length;
126
127    // Number of code points in the input text.
128    int32_t countInputCodePoints;
129    int32_t spanCount;
130
131    UnicodeSet set;
132    UnicodeSet prefrozen;
133};
134
135// Performance test function object.
136class Command : public UPerfFunction {
137protected:
138    Command(const UnicodeSetPerformanceTest &testcase) : testcase(testcase) {}
139
140public:
141    virtual ~Command() {}
142
143    // virtual void call(UErrorCode* pErrorCode) { ... }
144
145    virtual long getOperationsPerIteration() {
146        // Number of code points tested:
147        // Input code points, plus one for the end of each span except the last span.
148        return testcase.countInputCodePoints+testcase.spanCount-1;
149    }
150
151    virtual long getEventsPerIteration() {
152        return testcase.spanCount;
153    }
154
155    const UnicodeSetPerformanceTest &testcase;
156};
157
158class Contains : public Command {
159protected:
160    Contains(const UnicodeSetPerformanceTest &testcase) : Command(testcase) {
161        // Verify that the frozen set is equal to the unfrozen one.
162        UnicodeSet set;
163        UChar32 c;
164
165        for(c=0; c<=0x10ffff; ++c) {
166            if(testcase.set.contains(c)) {
167                set.add(c);
168            }
169        }
170        if(set!=testcase.set) {
171            fprintf(stderr, "error: frozen set != original!\n");
172        }
173    }
174public:
175    static UPerfFunction* get(const UnicodeSetPerformanceTest &testcase) {
176        return new Contains(testcase);
177    }
178    virtual void call(UErrorCode* pErrorCode) {
179        const UnicodeSet &set=testcase.set;
180        const UChar *s=testcase.getBuffer();
181        int32_t length=testcase.getBufferLen();
182        int32_t count=0;
183        int32_t i=0;
184        UBool tf=FALSE;
185        while(i<length) {
186            i+=span(set, s+i, length-i, tf);
187            tf=(UBool)(!tf);
188            ++count;
189        }
190        if(count!=testcase.spanCount) {
191            fprintf(stderr, "error: Contains() count=%ld != %ld=UnicodeSetPerformanceTest.spanCount\n",
192                    (long)count, (long)testcase.spanCount);
193        }
194    }
195    static int32_t span(const UnicodeSet &set, const UChar *s, int32_t length, UBool tf) {
196        UChar32 c;
197        int32_t start=0, prev;
198        while((prev=start)<length) {
199            U16_NEXT(s, start, length, c);
200            if(tf!=set.contains(c)) {
201                break;
202            }
203        }
204        return prev;
205    }
206};
207
208class SpanUTF16 : public Command {
209protected:
210    SpanUTF16(const UnicodeSetPerformanceTest &testcase) : Command(testcase) {
211        // Verify that the frozen set is equal to the unfrozen one.
212        UnicodeSet set;
213        UChar utf16[2];
214        UChar32 c, c2;
215
216        for(c=0; c<=0xffff; ++c) {
217            utf16[0]=(UChar)c;
218            if(testcase.set.span(utf16, 1, USET_SPAN_CONTAINED)>0) {
219                set.add(c);
220            }
221        }
222        for(c=0xd800; c<=0xdbff; ++c) {
223            utf16[0]=(UChar)c;
224            for(c2=0xdc00; c2<=0xdfff; ++c2) {
225                utf16[1]=(UChar)c2;
226                if(testcase.set.span(utf16, 2, USET_SPAN_CONTAINED)>0) {
227                    set.add(U16_GET_SUPPLEMENTARY(c, c2));
228                }
229            }
230        }
231
232        if(set!=testcase.set) {
233            fprintf(stderr, "error: frozen set != original!\n");
234        }
235    }
236public:
237    static UPerfFunction* get(const UnicodeSetPerformanceTest &testcase) {
238        return new SpanUTF16(testcase);
239    }
240    virtual void call(UErrorCode* pErrorCode) {
241        const UnicodeSet &set=testcase.set;
242        const UChar *s=testcase.getBuffer();
243        int32_t length=testcase.getBufferLen();
244        int32_t count=0;
245        int32_t i=0;
246        UBool tf=FALSE;
247        while(i<length) {
248            i+=set.span(s+i, length-i, (USetSpanCondition)tf);
249            tf=(UBool)(!tf);
250            ++count;
251        }
252        if(count!=testcase.spanCount) {
253            fprintf(stderr, "error: SpanUTF16() count=%ld != %ld=UnicodeSetPerformanceTest.spanCount\n",
254                    (long)count, (long)testcase.spanCount);
255        }
256    }
257};
258
259class SpanBackUTF16 : public Command {
260protected:
261    SpanBackUTF16(const UnicodeSetPerformanceTest &testcase) : Command(testcase) {
262        // Verify that the frozen set is equal to the unfrozen one.
263        UnicodeSet set;
264        UChar utf16[2];
265        UChar32 c, c2;
266
267        for(c=0; c<=0xffff; ++c) {
268            utf16[0]=(UChar)c;
269            if(testcase.set.spanBack(utf16, 1, USET_SPAN_CONTAINED)==0) {
270                set.add(c);
271            }
272        }
273        for(c=0xd800; c<=0xdbff; ++c) {
274            utf16[0]=(UChar)c;
275            for(c2=0xdc00; c2<=0xdfff; ++c2) {
276                utf16[1]=(UChar)c2;
277                if(testcase.set.spanBack(utf16, 2, USET_SPAN_CONTAINED)==0) {
278                    set.add(U16_GET_SUPPLEMENTARY(c, c2));
279                }
280            }
281        }
282
283        if(set!=testcase.set) {
284            fprintf(stderr, "error: frozen set != original!\n");
285        }
286    }
287public:
288    static UPerfFunction* get(const UnicodeSetPerformanceTest &testcase) {
289        return new SpanBackUTF16(testcase);
290    }
291    virtual void call(UErrorCode* pErrorCode) {
292        const UnicodeSet &set=testcase.set;
293        const UChar *s=testcase.getBuffer();
294        int32_t length=testcase.getBufferLen();
295        int32_t count=0;
296        /*
297         * Get the same spans as with span() where we always start with a not-contained span.
298         * If testcase.spanCount is an odd number, then the last span() was not-contained.
299         * The last spanBack() must be not-contained to match the first span().
300         */
301        UBool tf=(UBool)((testcase.spanCount&1)==0);
302        while(length>0 || !tf) {
303            length=set.spanBack(s, length, (USetSpanCondition)tf);
304            tf=(UBool)(!tf);
305            ++count;
306        }
307        if(count!=testcase.spanCount) {
308            fprintf(stderr, "error: SpanBackUTF16() count=%ld != %ld=UnicodeSetPerformanceTest.spanCount\n",
309                    (long)count, (long)testcase.spanCount);
310        }
311    }
312};
313
314class SpanUTF8 : public Command {
315protected:
316    SpanUTF8(const UnicodeSetPerformanceTest &testcase) : Command(testcase) {
317        // Verify that the frozen set is equal to the unfrozen one.
318        UnicodeSet set;
319        char utf8[4];
320        UChar32 c;
321        int32_t length;
322
323        for(c=0; c<=0x10ffff; ++c) {
324            if(c==0xd800) {
325                c=0xe000;
326            }
327            length=0;
328            U8_APPEND_UNSAFE(utf8, length, c);
329            if(testcase.set.spanUTF8(utf8, length, USET_SPAN_CONTAINED)>0) {
330                set.add(c);
331            }
332        }
333        if(set!=testcase.set) {
334            fprintf(stderr, "error: frozen set != original!\n");
335        }
336    }
337public:
338    static UPerfFunction* get(const UnicodeSetPerformanceTest &testcase) {
339        return new SpanUTF8(testcase);
340    }
341    virtual void call(UErrorCode* pErrorCode) {
342        const UnicodeSet &set=testcase.set;
343        const char *s=testcase.utf8;
344        int32_t length=testcase.utf8Length;
345        int32_t count=0;
346        int32_t i=0;
347        UBool tf=FALSE;
348        while(i<length) {
349            i+=set.spanUTF8(s+i, length-i, (USetSpanCondition)tf);
350            tf=(UBool)(!tf);
351            ++count;
352        }
353        if(count!=testcase.spanCount) {
354            fprintf(stderr, "error: SpanUTF8() count=%ld != %ld=UnicodeSetPerformanceTest.spanCount\n",
355                    (long)count, (long)testcase.spanCount);
356        }
357    }
358};
359
360class SpanBackUTF8 : public Command {
361protected:
362    SpanBackUTF8(const UnicodeSetPerformanceTest &testcase) : Command(testcase) {
363        // Verify that the frozen set is equal to the unfrozen one.
364        UnicodeSet set;
365        char utf8[4];
366        UChar32 c;
367        int32_t length;
368
369        for(c=0; c<=0x10ffff; ++c) {
370            if(c==0xd800) {
371                c=0xe000;
372            }
373            length=0;
374            U8_APPEND_UNSAFE(utf8, length, c);
375            if(testcase.set.spanBackUTF8(utf8, length, USET_SPAN_CONTAINED)==0) {
376                set.add(c);
377            }
378        }
379        if(set!=testcase.set) {
380            fprintf(stderr, "error: frozen set != original!\n");
381        }
382    }
383public:
384    static UPerfFunction* get(const UnicodeSetPerformanceTest &testcase) {
385        return new SpanBackUTF8(testcase);
386    }
387    virtual void call(UErrorCode* pErrorCode) {
388        const UnicodeSet &set=testcase.set;
389        const char *s=testcase.utf8;
390        int32_t length=testcase.utf8Length;
391        int32_t count=0;
392        /*
393         * Get the same spans as with span() where we always start with a not-contained span.
394         * If testcase.spanCount is an odd number, then the last span() was not-contained.
395         * The last spanBack() must be not-contained to match the first span().
396         */
397        UBool tf=(UBool)((testcase.spanCount&1)==0);
398        while(length>0 || !tf) {
399            length=set.spanBackUTF8(s, length, (USetSpanCondition)tf);
400            tf=(UBool)(!tf);
401            ++count;
402        }
403        if(count!=testcase.spanCount) {
404            fprintf(stderr, "error: SpanBackUTF8() count=%ld != %ld=UnicodeSetPerformanceTest.spanCount\n",
405                    (long)count, (long)testcase.spanCount);
406        }
407    }
408};
409
410UPerfFunction* UnicodeSetPerformanceTest::runIndexedTest(int32_t index, UBool exec, const char* &name, char* par) {
411    switch (index) {
412        case 0: name = "Contains";     if (exec) return Contains::get(*this); break;
413        case 1: name = "SpanUTF16";    if (exec) return SpanUTF16::get(*this); break;
414        case 2: name = "SpanBackUTF16";if (exec) return SpanBackUTF16::get(*this); break;
415        case 3: name = "SpanUTF8";     if (exec) return SpanUTF8::get(*this); break;
416        case 4: name = "SpanBackUTF8"; if (exec) return SpanBackUTF8::get(*this); break;
417        default: name = ""; break;
418    }
419    return NULL;
420}
421
422int main(int argc, const char *argv[])
423{
424    // Default values for command-line options.
425    options[SET_PATTERN].value = "[:ID_Continue:]";
426    options[FAST_TYPE].value = "slow";
427
428    UErrorCode status = U_ZERO_ERROR;
429    UnicodeSetPerformanceTest test(argc, argv, status);
430
431	if (U_FAILURE(status)){
432        printf("The error is %s\n", u_errorName(status));
433        test.usage();
434        return status;
435    }
436
437    if (test.run() == FALSE){
438        fprintf(stderr, "FAILED: Tests could not be run, please check the "
439			            "arguments.\n");
440        return 1;
441    }
442
443    return 0;
444}
445