1/* 2************************************************************************** 3* Copyright (C) 2016 and later: Unicode, Inc. and others. 4* License & terms of use: http://www.unicode.org/copyright.html#License 5************************************************************************** 6************************************************************************** 7* Copyright (C) 2014, International Business Machines 8* Corporation and others. All Rights Reserved. 9************************************************************************** 10* file name: unisetperf.cpp 11* encoding: US-ASCII 12* tab size: 8 (not used) 13* indentation:4 14* 15* created on: 2007jan31 16* created by: Markus Scherer 17*/ 18 19#include <stdio.h> 20#include <stdlib.h> 21#include <string.h> 22#include "unicode/uperf.h" 23#include "unicode/uniset.h" 24#include "unicode/unistr.h" 25#include "uoptions.h" 26#include "cmemory.h" // for UPRV_LENGTHOF 27 28// Command-line options specific to unisetperf. 29// Options do not have abbreviations: Force readable command lines. 30// (Using U+0001 for abbreviation characters.) 31enum { 32 SET_PATTERN, 33 FAST_TYPE, 34 UNISETPERF_OPTIONS_COUNT 35}; 36 37static UOption options[UNISETPERF_OPTIONS_COUNT]={ 38 UOPTION_DEF("pattern", '\x01', UOPT_REQUIRES_ARG), 39 UOPTION_DEF("type", '\x01', UOPT_REQUIRES_ARG) 40}; 41 42static const char *const unisetperf_usage = 43 "\t--pattern UnicodeSet pattern for instantiation.\n" 44 "\t Default: [:ID_Continue:]\n" 45 "\t--type Type of UnicodeSet: slow fast\n" 46 "\t Default: slow\n"; 47 48// Test object with setup data. 49class UnicodeSetPerformanceTest : public UPerfTest { 50public: 51 UnicodeSetPerformanceTest(int32_t argc, const char *argv[], UErrorCode &status) 52 : UPerfTest(argc, argv, options, UPRV_LENGTHOF(options), unisetperf_usage, status), 53 utf8(NULL), utf8Length(0), countInputCodePoints(0), spanCount(0) { 54 if (U_SUCCESS(status)) { 55 UnicodeString pattern=UnicodeString(options[SET_PATTERN].value, -1, US_INV).unescape(); 56 set.applyPattern(pattern, status); 57 prefrozen=set; 58 if(0==strcmp(options[FAST_TYPE].value, "fast")) { 59 set.freeze(); 60 } 61 62 int32_t inputLength; 63 UPerfTest::getBuffer(inputLength, status); 64 if(U_SUCCESS(status) && inputLength>0) { 65 countInputCodePoints = u_countChar32(buffer, bufferLen); 66 67 countSpans(); 68 69 // Preflight the UTF-8 length and allocate utf8. 70 u_strToUTF8(NULL, 0, &utf8Length, buffer, bufferLen, &status); 71 if(status==U_BUFFER_OVERFLOW_ERROR) { 72 utf8=(char *)malloc(utf8Length); 73 if(utf8!=NULL) { 74 status=U_ZERO_ERROR; 75 u_strToUTF8(utf8, utf8Length, NULL, buffer, bufferLen, &status); 76 } else { 77 status=U_MEMORY_ALLOCATION_ERROR; 78 } 79 } 80 81 if(verbose) { 82 printf("code points:%ld len16:%ld len8:%ld spans:%ld " 83 "cp/span:%.3g UChar/span:%.3g B/span:%.3g B/cp:%.3g\n", 84 (long)countInputCodePoints, (long)bufferLen, (long)utf8Length, (long)spanCount, 85 (double)countInputCodePoints/spanCount, (double)bufferLen/spanCount, (double)utf8Length/spanCount, 86 (double)utf8Length/countInputCodePoints); 87 } 88 } 89 } 90 } 91 92 virtual UPerfFunction* runIndexedTest(int32_t index, UBool exec, const char* &name, char* par = NULL); 93 94 // Count spans of characters that are in the set, 95 // and spans of characters that are not in the set. 96 // If the very first character is in the set, then one additional 97 // not-span is counted. 98 void countSpans() { 99 const UChar *s=getBuffer(); 100 int32_t length=getBufferLen(); 101 int32_t i=0; 102 UBool tf=FALSE; 103 while(i<length) { 104 i=span(s, length, i, tf); 105 tf=(UBool)(!tf); 106 ++spanCount; 107 } 108 } 109 int32_t span(const UChar *s, int32_t length, int32_t start, UBool tf) const { 110 UChar32 c; 111 int32_t prev; 112 while((prev=start)<length) { 113 U16_NEXT(s, start, length, c); 114 if(tf!=set.contains(c)) { 115 break; 116 } 117 } 118 return prev; 119 } 120 121 const UChar *getBuffer() const { return buffer; } 122 int32_t getBufferLen() const { return bufferLen; } 123 124 char *utf8; 125 int32_t utf8Length; 126 127 // Number of code points in the input text. 128 int32_t countInputCodePoints; 129 int32_t spanCount; 130 131 UnicodeSet set; 132 UnicodeSet prefrozen; 133}; 134 135// Performance test function object. 136class Command : public UPerfFunction { 137protected: 138 Command(const UnicodeSetPerformanceTest &testcase) : testcase(testcase) {} 139 140public: 141 virtual ~Command() {} 142 143 // virtual void call(UErrorCode* pErrorCode) { ... } 144 145 virtual long getOperationsPerIteration() { 146 // Number of code points tested: 147 // Input code points, plus one for the end of each span except the last span. 148 return testcase.countInputCodePoints+testcase.spanCount-1; 149 } 150 151 virtual long getEventsPerIteration() { 152 return testcase.spanCount; 153 } 154 155 const UnicodeSetPerformanceTest &testcase; 156}; 157 158class Contains : public Command { 159protected: 160 Contains(const UnicodeSetPerformanceTest &testcase) : Command(testcase) { 161 // Verify that the frozen set is equal to the unfrozen one. 162 UnicodeSet set; 163 UChar32 c; 164 165 for(c=0; c<=0x10ffff; ++c) { 166 if(testcase.set.contains(c)) { 167 set.add(c); 168 } 169 } 170 if(set!=testcase.set) { 171 fprintf(stderr, "error: frozen set != original!\n"); 172 } 173 } 174public: 175 static UPerfFunction* get(const UnicodeSetPerformanceTest &testcase) { 176 return new Contains(testcase); 177 } 178 virtual void call(UErrorCode* pErrorCode) { 179 const UnicodeSet &set=testcase.set; 180 const UChar *s=testcase.getBuffer(); 181 int32_t length=testcase.getBufferLen(); 182 int32_t count=0; 183 int32_t i=0; 184 UBool tf=FALSE; 185 while(i<length) { 186 i+=span(set, s+i, length-i, tf); 187 tf=(UBool)(!tf); 188 ++count; 189 } 190 if(count!=testcase.spanCount) { 191 fprintf(stderr, "error: Contains() count=%ld != %ld=UnicodeSetPerformanceTest.spanCount\n", 192 (long)count, (long)testcase.spanCount); 193 } 194 } 195 static int32_t span(const UnicodeSet &set, const UChar *s, int32_t length, UBool tf) { 196 UChar32 c; 197 int32_t start=0, prev; 198 while((prev=start)<length) { 199 U16_NEXT(s, start, length, c); 200 if(tf!=set.contains(c)) { 201 break; 202 } 203 } 204 return prev; 205 } 206}; 207 208class SpanUTF16 : public Command { 209protected: 210 SpanUTF16(const UnicodeSetPerformanceTest &testcase) : Command(testcase) { 211 // Verify that the frozen set is equal to the unfrozen one. 212 UnicodeSet set; 213 UChar utf16[2]; 214 UChar32 c, c2; 215 216 for(c=0; c<=0xffff; ++c) { 217 utf16[0]=(UChar)c; 218 if(testcase.set.span(utf16, 1, USET_SPAN_CONTAINED)>0) { 219 set.add(c); 220 } 221 } 222 for(c=0xd800; c<=0xdbff; ++c) { 223 utf16[0]=(UChar)c; 224 for(c2=0xdc00; c2<=0xdfff; ++c2) { 225 utf16[1]=(UChar)c2; 226 if(testcase.set.span(utf16, 2, USET_SPAN_CONTAINED)>0) { 227 set.add(U16_GET_SUPPLEMENTARY(c, c2)); 228 } 229 } 230 } 231 232 if(set!=testcase.set) { 233 fprintf(stderr, "error: frozen set != original!\n"); 234 } 235 } 236public: 237 static UPerfFunction* get(const UnicodeSetPerformanceTest &testcase) { 238 return new SpanUTF16(testcase); 239 } 240 virtual void call(UErrorCode* pErrorCode) { 241 const UnicodeSet &set=testcase.set; 242 const UChar *s=testcase.getBuffer(); 243 int32_t length=testcase.getBufferLen(); 244 int32_t count=0; 245 int32_t i=0; 246 UBool tf=FALSE; 247 while(i<length) { 248 i+=set.span(s+i, length-i, (USetSpanCondition)tf); 249 tf=(UBool)(!tf); 250 ++count; 251 } 252 if(count!=testcase.spanCount) { 253 fprintf(stderr, "error: SpanUTF16() count=%ld != %ld=UnicodeSetPerformanceTest.spanCount\n", 254 (long)count, (long)testcase.spanCount); 255 } 256 } 257}; 258 259class SpanBackUTF16 : public Command { 260protected: 261 SpanBackUTF16(const UnicodeSetPerformanceTest &testcase) : Command(testcase) { 262 // Verify that the frozen set is equal to the unfrozen one. 263 UnicodeSet set; 264 UChar utf16[2]; 265 UChar32 c, c2; 266 267 for(c=0; c<=0xffff; ++c) { 268 utf16[0]=(UChar)c; 269 if(testcase.set.spanBack(utf16, 1, USET_SPAN_CONTAINED)==0) { 270 set.add(c); 271 } 272 } 273 for(c=0xd800; c<=0xdbff; ++c) { 274 utf16[0]=(UChar)c; 275 for(c2=0xdc00; c2<=0xdfff; ++c2) { 276 utf16[1]=(UChar)c2; 277 if(testcase.set.spanBack(utf16, 2, USET_SPAN_CONTAINED)==0) { 278 set.add(U16_GET_SUPPLEMENTARY(c, c2)); 279 } 280 } 281 } 282 283 if(set!=testcase.set) { 284 fprintf(stderr, "error: frozen set != original!\n"); 285 } 286 } 287public: 288 static UPerfFunction* get(const UnicodeSetPerformanceTest &testcase) { 289 return new SpanBackUTF16(testcase); 290 } 291 virtual void call(UErrorCode* pErrorCode) { 292 const UnicodeSet &set=testcase.set; 293 const UChar *s=testcase.getBuffer(); 294 int32_t length=testcase.getBufferLen(); 295 int32_t count=0; 296 /* 297 * Get the same spans as with span() where we always start with a not-contained span. 298 * If testcase.spanCount is an odd number, then the last span() was not-contained. 299 * The last spanBack() must be not-contained to match the first span(). 300 */ 301 UBool tf=(UBool)((testcase.spanCount&1)==0); 302 while(length>0 || !tf) { 303 length=set.spanBack(s, length, (USetSpanCondition)tf); 304 tf=(UBool)(!tf); 305 ++count; 306 } 307 if(count!=testcase.spanCount) { 308 fprintf(stderr, "error: SpanBackUTF16() count=%ld != %ld=UnicodeSetPerformanceTest.spanCount\n", 309 (long)count, (long)testcase.spanCount); 310 } 311 } 312}; 313 314class SpanUTF8 : public Command { 315protected: 316 SpanUTF8(const UnicodeSetPerformanceTest &testcase) : Command(testcase) { 317 // Verify that the frozen set is equal to the unfrozen one. 318 UnicodeSet set; 319 char utf8[4]; 320 UChar32 c; 321 int32_t length; 322 323 for(c=0; c<=0x10ffff; ++c) { 324 if(c==0xd800) { 325 c=0xe000; 326 } 327 length=0; 328 U8_APPEND_UNSAFE(utf8, length, c); 329 if(testcase.set.spanUTF8(utf8, length, USET_SPAN_CONTAINED)>0) { 330 set.add(c); 331 } 332 } 333 if(set!=testcase.set) { 334 fprintf(stderr, "error: frozen set != original!\n"); 335 } 336 } 337public: 338 static UPerfFunction* get(const UnicodeSetPerformanceTest &testcase) { 339 return new SpanUTF8(testcase); 340 } 341 virtual void call(UErrorCode* pErrorCode) { 342 const UnicodeSet &set=testcase.set; 343 const char *s=testcase.utf8; 344 int32_t length=testcase.utf8Length; 345 int32_t count=0; 346 int32_t i=0; 347 UBool tf=FALSE; 348 while(i<length) { 349 i+=set.spanUTF8(s+i, length-i, (USetSpanCondition)tf); 350 tf=(UBool)(!tf); 351 ++count; 352 } 353 if(count!=testcase.spanCount) { 354 fprintf(stderr, "error: SpanUTF8() count=%ld != %ld=UnicodeSetPerformanceTest.spanCount\n", 355 (long)count, (long)testcase.spanCount); 356 } 357 } 358}; 359 360class SpanBackUTF8 : public Command { 361protected: 362 SpanBackUTF8(const UnicodeSetPerformanceTest &testcase) : Command(testcase) { 363 // Verify that the frozen set is equal to the unfrozen one. 364 UnicodeSet set; 365 char utf8[4]; 366 UChar32 c; 367 int32_t length; 368 369 for(c=0; c<=0x10ffff; ++c) { 370 if(c==0xd800) { 371 c=0xe000; 372 } 373 length=0; 374 U8_APPEND_UNSAFE(utf8, length, c); 375 if(testcase.set.spanBackUTF8(utf8, length, USET_SPAN_CONTAINED)==0) { 376 set.add(c); 377 } 378 } 379 if(set!=testcase.set) { 380 fprintf(stderr, "error: frozen set != original!\n"); 381 } 382 } 383public: 384 static UPerfFunction* get(const UnicodeSetPerformanceTest &testcase) { 385 return new SpanBackUTF8(testcase); 386 } 387 virtual void call(UErrorCode* pErrorCode) { 388 const UnicodeSet &set=testcase.set; 389 const char *s=testcase.utf8; 390 int32_t length=testcase.utf8Length; 391 int32_t count=0; 392 /* 393 * Get the same spans as with span() where we always start with a not-contained span. 394 * If testcase.spanCount is an odd number, then the last span() was not-contained. 395 * The last spanBack() must be not-contained to match the first span(). 396 */ 397 UBool tf=(UBool)((testcase.spanCount&1)==0); 398 while(length>0 || !tf) { 399 length=set.spanBackUTF8(s, length, (USetSpanCondition)tf); 400 tf=(UBool)(!tf); 401 ++count; 402 } 403 if(count!=testcase.spanCount) { 404 fprintf(stderr, "error: SpanBackUTF8() count=%ld != %ld=UnicodeSetPerformanceTest.spanCount\n", 405 (long)count, (long)testcase.spanCount); 406 } 407 } 408}; 409 410UPerfFunction* UnicodeSetPerformanceTest::runIndexedTest(int32_t index, UBool exec, const char* &name, char* par) { 411 switch (index) { 412 case 0: name = "Contains"; if (exec) return Contains::get(*this); break; 413 case 1: name = "SpanUTF16"; if (exec) return SpanUTF16::get(*this); break; 414 case 2: name = "SpanBackUTF16";if (exec) return SpanBackUTF16::get(*this); break; 415 case 3: name = "SpanUTF8"; if (exec) return SpanUTF8::get(*this); break; 416 case 4: name = "SpanBackUTF8"; if (exec) return SpanBackUTF8::get(*this); break; 417 default: name = ""; break; 418 } 419 return NULL; 420} 421 422int main(int argc, const char *argv[]) 423{ 424 // Default values for command-line options. 425 options[SET_PATTERN].value = "[:ID_Continue:]"; 426 options[FAST_TYPE].value = "slow"; 427 428 UErrorCode status = U_ZERO_ERROR; 429 UnicodeSetPerformanceTest test(argc, argv, status); 430 431 if (U_FAILURE(status)){ 432 printf("The error is %s\n", u_errorName(status)); 433 test.usage(); 434 return status; 435 } 436 437 if (test.run() == FALSE){ 438 fprintf(stderr, "FAILED: Tests could not be run, please check the " 439 "arguments.\n"); 440 return 1; 441 } 442 443 return 0; 444} 445