1/********************************************************************
2 * COPYRIGHT:
3 * Copyright (C) 2001-2012 IBM, Inc.   All Rights Reserved.
4 *
5 ********************************************************************/
6/********************************************************************************
7*
8* File CALLCOLL.C
9*
10* Modification History:
11*        Name                     Description
12*     Andy Heninger             First Version
13*
14*********************************************************************************
15*/
16
17//
18//  This program tests string collation and sort key generation performance.
//      Three APIs can be tested: ICU C, Unix strcoll/strxfrm, and Windows LCMapString
20//      A file of names is required as input, one per line.  It must be in utf-8 or utf-16 format,
21//      and include a byte order mark.  Either LE or BE format is OK.
22//
23
//  Help text listing every command line option, one option per line.
const char gUsageString[] =
    "usage:  collperf options...\n"
    "-help                      Display this message.\n"
    "-file file_name            utf-16 format file of names.\n"
    "-locale name               ICU locale to use.  Default is en_US\n"
    "-rules file_name           Collation rules file (overrides locale)\n"
    "-langid 0x1234             Windows Language ID number.  Default to value for -locale option\n"
    "                              see http://msdn.microsoft.com/library/psdk/winbase/nls_8xo3.htm\n"
    "-win                       Run test using Windows native services.  (ICU is default)\n"
    "-unix                      Run test using Unix strxfrm, strcoll services.\n"
    "-uselen                    Use API with string lengths.  Default is null-terminated strings\n"
    "-usekeys                   Run tests using sortkeys rather than strcoll\n"
    "-strcmp                    Run tests using u_strcmp rather than strcoll\n"
    "-strcmpCPO                 Run tests using u_strcmpCodePointOrder rather than strcoll\n"
    "-loop nnnn                 Loopcount for test.  Adjust for reasonable total running time.\n"
    "-iloop n                   Inner Loop Count.  Default = 1.  Number of calls to function\n"
    "                               under test at each call point.  For measuring test overhead.\n"
    "-terse                     Terse numbers-only output.  Intended for use by scripts.\n"
    "-french                    French accent ordering\n"
    "-frenchoff                 No French accent ordering (for use with French locales.)\n"
    "-norm                      Normalizing mode on\n"
    "-shifted                   Shifted mode\n"
    "-lower                     Lower case first\n"
    "-upper                     Upper case first\n"
    "-case                      Enable separate case level\n"
    "-level n                   Sort level, 1 to 5, for Primary, Secondary, Tertiary, Quaternary, Identical\n"
    "-keyhist                   Produce a table sort key size vs. string length\n"
    "-binsearch                 Binary Search timing test\n"
    "-keygen                    Sort Key Generation timing test\n"
    "-qsort                     Quicksort timing test\n"
    "-iter                      Iteration Performance Test\n"
    "-dump                      Display strings, sort keys and CEs.\n"
    ;
57
58
59
60#include <stdio.h>
61#include <string.h>
62#include <stdlib.h>
63#include <math.h>
64#include <locale.h>
65#include <errno.h>
66
67#include <unicode/utypes.h>
68#include <unicode/ucol.h>
69#include <unicode/ucoleitr.h>
70#include <unicode/uloc.h>
71#include <unicode/ustring.h>
72#include <unicode/ures.h>
73#include <unicode/uchar.h>
74#include <unicode/ucnv.h>
75#include <unicode/utf8.h>
76
77#ifdef WIN32
78#include <windows.h>
79#else
80//
81//  Stubs for Windows API functions when building on UNIXes.
82//
83typedef int DWORD;
84inline int CompareStringW(DWORD, DWORD, UChar *, int, UChar *, int) {return 0;}
85#include <sys/time.h>
//
//  timeGetTime()   Non-Windows stand-in for the Windows timer call.
//     Returns the current gettimeofday() reading converted to milliseconds.
//     The callers visible in this file only subtract two readings, so
//     wrap-around is harmless.  The arithmetic is done in unsigned long so
//     that the wrap is well defined; overflowing the signed seconds value
//     would be undefined behavior.
//
unsigned long timeGetTime() {
    struct timeval t;
    gettimeofday(&t, 0);
    unsigned long val = (unsigned long)t.tv_sec * 1000UL;
    val += (unsigned long)t.tv_usec / 1000UL;
    return val;
}
93inline int LCMapStringW(DWORD, DWORD, UChar *, int, UChar *, int) {return 0;}
94const int LCMAP_SORTKEY = 0;
95#define MAKELCID(a,b) 0
96const int SORT_DEFAULT = 0;
97#endif
98
99
100
101//
102//  Command line option variables
103//     These global variables are set according to the options specified
104//     on the command line by the user.
105char * opt_fName      = 0;
106const char * opt_locale     = "en_US";
107int    opt_langid     = 0;         // Defaults to value corresponding to opt_locale.
108char * opt_rules      = 0;
109UBool  opt_help       = FALSE;
110int    opt_loopCount  = 1;
111int    opt_iLoopCount = 1;
112UBool  opt_terse      = FALSE;
113UBool  opt_qsort      = FALSE;
114UBool  opt_binsearch  = FALSE;
115UBool  opt_icu        = TRUE;
116UBool  opt_win        = FALSE;      // Run with Windows native functions.
117UBool  opt_unix       = FALSE;      // Run with UNIX strcoll, strxfrm functions.
118UBool  opt_uselen     = FALSE;
119UBool  opt_usekeys    = FALSE;
120UBool  opt_strcmp     = FALSE;
121UBool  opt_strcmpCPO  = FALSE;
122UBool  opt_norm       = FALSE;
123UBool  opt_keygen     = FALSE;
124UBool  opt_french     = FALSE;
125UBool  opt_frenchoff  = FALSE;
126UBool  opt_shifted    = FALSE;
127UBool  opt_lower      = FALSE;
128UBool  opt_upper      = FALSE;
129UBool  opt_case       = FALSE;
130int    opt_level      = 0;
131UBool  opt_keyhist    = FALSE;
132UBool  opt_itertest   = FALSE;
133UBool  opt_dump       = FALSE;
134
135
136
137//
138//   Definitions for the command line options
139//
140struct OptSpec {
141    const char *name;
142    enum {FLAG, NUM, STRING} type;
143    void *pVar;
144};
145
146OptSpec opts[] = {
147    {"-file",        OptSpec::STRING, &opt_fName},
148    {"-locale",      OptSpec::STRING, &opt_locale},
149    {"-langid",      OptSpec::NUM,    &opt_langid},
150    {"-rules",       OptSpec::STRING, &opt_rules},
151    {"-qsort",       OptSpec::FLAG,   &opt_qsort},
152    {"-binsearch",   OptSpec::FLAG,   &opt_binsearch},
153    {"-iter",        OptSpec::FLAG,   &opt_itertest},
154    {"-win",         OptSpec::FLAG,   &opt_win},
155    {"-unix",        OptSpec::FLAG,   &opt_unix},
156    {"-uselen",      OptSpec::FLAG,   &opt_uselen},
157    {"-usekeys",     OptSpec::FLAG,   &opt_usekeys},
158    {"-strcmp",      OptSpec::FLAG,   &opt_strcmp},
159    {"-strcmpCPO",   OptSpec::FLAG,   &opt_strcmpCPO},
160    {"-norm",        OptSpec::FLAG,   &opt_norm},
161    {"-french",      OptSpec::FLAG,   &opt_french},
162    {"-frenchoff",   OptSpec::FLAG,   &opt_frenchoff},
163    {"-shifted",     OptSpec::FLAG,   &opt_shifted},
164    {"-lower",       OptSpec::FLAG,   &opt_lower},
165    {"-upper",       OptSpec::FLAG,   &opt_upper},
166    {"-case",        OptSpec::FLAG,   &opt_case},
167    {"-level",       OptSpec::NUM,    &opt_level},
168    {"-keyhist",     OptSpec::FLAG,   &opt_keyhist},
169    {"-keygen",      OptSpec::FLAG,   &opt_keygen},
170    {"-loop",        OptSpec::NUM,    &opt_loopCount},
171    {"-iloop",       OptSpec::NUM,    &opt_iLoopCount},
172    {"-terse",       OptSpec::FLAG,   &opt_terse},
173    {"-dump",        OptSpec::FLAG,   &opt_dump},
174    {"-help",        OptSpec::FLAG,   &opt_help},
175    {"-?",           OptSpec::FLAG,   &opt_help},
176    {0, OptSpec::FLAG, 0}
177};
178
179
180//---------------------------------------------------------------------------
181//
182//  Global variables pointing to and describing the test file
183//
184//---------------------------------------------------------------------------
185
186//
187//   struct Line
188//
189//      Each line from the source file (containing a name, presumably) gets
190//      one of these structs.
191//
192struct  Line {
193    UChar     *name;
194    int        len;
195    char      *winSortKey;
196    char      *icuSortKey;
197    char      *unixSortKey;
198    char      *unixName;
199};
200
201
202
203Line          *gFileLines;           // Ptr to array of Line structs, one per line in the file.
204int            gNumFileLines;
205UCollator     *gCol;
206DWORD          gWinLCID;
207
208Line          **gSortedLines;
209Line          **gRandomLines;
210int            gCount;
211
212
213
214//---------------------------------------------------------------------------
215//
216//  ProcessOptions()    Function to read the command line options.
217//
218//---------------------------------------------------------------------------
219UBool ProcessOptions(int argc, const char **argv, OptSpec opts[])
220{
221    int         i;
222    int         argNum;
223    const char  *pArgName;
224    OptSpec    *pOpt;
225
226    for (argNum=1; argNum<argc; argNum++) {
227        pArgName = argv[argNum];
228        for (pOpt = opts;  pOpt->name != 0; pOpt++) {
229            if (strcmp(pOpt->name, pArgName) == 0) {
230                switch (pOpt->type) {
231                case OptSpec::FLAG:
232                    *(UBool *)(pOpt->pVar) = TRUE;
233                    break;
234                case OptSpec::STRING:
235                    argNum ++;
236                    if (argNum >= argc) {
237                        fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name);
238                        return FALSE;
239                    }
240                    *(const char **)(pOpt->pVar)  = argv[argNum];
241                    break;
242                case OptSpec::NUM:
243                    argNum ++;
244                    if (argNum >= argc) {
245                        fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name);
246                        return FALSE;
247                    }
248                    char *endp;
249                    i = strtol(argv[argNum], &endp, 0);
250                    if (endp == argv[argNum]) {
251                        fprintf(stderr, "integer value expected for \"%s\" option.\n", pOpt->name);
252                        return FALSE;
253                    }
254                    *(int *)(pOpt->pVar) = i;
255                }
256                break;
257            }
258        }
259        if (pOpt->name == 0)
260        {
261            fprintf(stderr, "Unrecognized option \"%s\"\n", pArgName);
262            return FALSE;
263        }
264    }
265return TRUE;
266}
267
268//---------------------------------------------------------------------------------------
269//
270//   Comparison functions for use by qsort.
271//
//       Seven flavors: ICU or Windows, each with SortKey or String Compare and
//           strings with length or null terminated; plus Unix strcoll.
274//
275//---------------------------------------------------------------------------------------
276int ICUstrcmpK(const void *a, const void *b) {
277    gCount++;
278    int t = strcmp((*(Line **)a)->icuSortKey, (*(Line **)b)->icuSortKey);
279    return t;
280}
281
282
283int ICUstrcmpL(const void *a, const void *b) {
284    gCount++;
285    UCollationResult t;
286    t = ucol_strcoll(gCol, (*(Line **)a)->name, (*(Line **)a)->len, (*(Line **)b)->name, (*(Line **)b)->len);
287    if (t == UCOL_LESS) return -1;
288    if (t == UCOL_GREATER) return +1;
289    return 0;
290}
291
292
293int ICUstrcmp(const void *a, const void *b) {
294    gCount++;
295    UCollationResult t;
296    t = ucol_strcoll(gCol, (*(Line **)a)->name, -1, (*(Line **)b)->name, -1);
297    if (t == UCOL_LESS) return -1;
298    if (t == UCOL_GREATER) return +1;
299    return 0;
300}
301
302
303int Winstrcmp(const void *a, const void *b) {
304    gCount++;
305    int t;
306    t = CompareStringW(gWinLCID, 0, (*(Line **)a)->name, -1, (*(Line **)b)->name, -1);
307    return t-2;
308}
309
310
311int UNIXstrcmp(const void *a, const void *b) {
312    gCount++;
313    int t;
314    t = strcoll((*(Line **)a)->unixName, (*(Line **)b)->unixName);
315    return t;
316}
317
318
319int WinstrcmpL(const void *a, const void *b) {
320    gCount++;
321    int t;
322    t = CompareStringW(gWinLCID, 0, (*(Line **)a)->name, (*(Line **)a)->len, (*(Line **)b)->name, (*(Line **)b)->len);
323    return t-2;
324}
325
326
327int WinstrcmpK(const void *a, const void *b) {
328    gCount++;
329    int t = strcmp((*(Line **)a)->winSortKey, (*(Line **)b)->winSortKey);
330    return t;
331}
332
333
334//---------------------------------------------------------------------------------------
335//
336//   Function for sorting the names (lines) into a random order.
337//      Order is based on a hash of the  ICU Sort key for the lines
338//      The randomized order is used as input for the sorting timing tests.
339//
340//---------------------------------------------------------------------------------------
341int ICURandomCmp(const void *a, const void *b) {
342    char  *ask = (*(Line **)a)->icuSortKey;
343    char  *bsk = (*(Line **)b)->icuSortKey;
344    int   aVal = 0;
345    int   bVal = 0;
346    int   retVal;
347    while (*ask != 0) {
348        aVal += aVal*37 + *ask++;
349    }
350    while (*bsk != 0) {
351        bVal += bVal*37 + *bsk++;
352    }
353    retVal = -1;
354    if (aVal == bVal) {
355        retVal = 0;
356    }
357    else if (aVal > bVal) {
358        retVal = 1;
359    }
360    return retVal;
361}
362
363//---------------------------------------------------------------------------------------
364//
365//   doKeyGen()     Key Generation Timing Test
366//
367//---------------------------------------------------------------------------------------
368void doKeyGen()
369{
370    int  line;
371    int  loops = 0;
372    int  iLoop;
373    int  t;
374    int  len=-1;
375
376    // Adjust loop count to compensate for file size.   Should be order n
377    double dLoopCount = double(opt_loopCount) * (1000. /  double(gNumFileLines));
378    int adj_loopCount = int(dLoopCount);
379    if (adj_loopCount < 1) adj_loopCount = 1;
380
381
382    unsigned long startTime = timeGetTime();
383
384    if (opt_win) {
385        for (loops=0; loops<adj_loopCount; loops++) {
386            for (line=0; line < gNumFileLines; line++) {
387                if (opt_uselen) {
388                    len = gFileLines[line].len;
389                }
390                for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
391                    t=LCMapStringW(gWinLCID, LCMAP_SORTKEY,
392                        gFileLines[line].name, len,
393                        (unsigned short *)gFileLines[line].winSortKey, 5000);    // TODO  something with length.
394                }
395            }
396        }
397    }
398    else if (opt_icu)
399    {
400        for (loops=0; loops<adj_loopCount; loops++) {
401            for (line=0; line < gNumFileLines; line++) {
402                if (opt_uselen) {
403                    len = gFileLines[line].len;
404                }
405                for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
406                    t = ucol_getSortKey(gCol, gFileLines[line].name, len, (unsigned char *)gFileLines[line].icuSortKey, 5000);
407                }
408            }
409        }
410    }
411    else if (opt_unix)
412    {
413        for (loops=0; loops<adj_loopCount; loops++) {
414            for (line=0; line < gNumFileLines; line++) {
415                for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
416                t = strxfrm(gFileLines[line].unixSortKey, gFileLines[line].unixName, 5000);
417                }
418            }
419        }
420    }
421
422    unsigned long elapsedTime = timeGetTime() - startTime;
423    int ns = (int)(float(1000000) * (float)elapsedTime / (float)(adj_loopCount*gNumFileLines));
424
425    if (opt_terse == FALSE) {
426        printf("Sort Key Generation:  total # of keys = %d\n", loops*gNumFileLines);
427        printf("Sort Key Generation:  time per key = %d ns\n", ns);
428    }
429    else {
430        printf("%d,  ", ns);
431    }
432
433    int   totalKeyLen = 0;
434    int   totalChars  = 0;
435    for (line=0; line<gNumFileLines; line++) {
436        totalChars += u_strlen(gFileLines[line].name);
437        if (opt_win) {
438            totalKeyLen += strlen(gFileLines[line].winSortKey);
439        }
440        else if (opt_icu) {
441            totalKeyLen += strlen(gFileLines[line].icuSortKey);
442        }
443        else if (opt_unix) {
444            totalKeyLen += strlen(gFileLines[line].unixSortKey);
445        }
446
447    }
448    if (opt_terse == FALSE) {
449        printf("Key Length / character = %f\n", (float)totalKeyLen / (float)totalChars);
450    } else {
451        printf("%f, ", (float)totalKeyLen / (float)totalChars);
452    }
453}
454
455
456
457//---------------------------------------------------------------------------------------
458//
459//    doBinarySearch()    Binary Search timing test.  Each name from the list
460//                        is looked up in the full sorted list of names.
461//
462//---------------------------------------------------------------------------------------
463void doBinarySearch()
464{
465
466    gCount = 0;
467    int  line;
468    int  loops = 0;
469    int  iLoop = 0;
470    unsigned long elapsedTime = 0;
471
472    // Adjust loop count to compensate for file size.   Should be order n (lookups) * log n  (compares/lookup)
473    // Accurate timings do not depend on this being perfect.  The correction is just to try to
474    //   get total running times of about the right order, so the that user doesn't need to
475    //   manually adjust the loop count for every different file size.
476    double dLoopCount = double(opt_loopCount) * 3000. / (log10((double)gNumFileLines) * double(gNumFileLines));
477    if (opt_usekeys) dLoopCount *= 5;
478    int adj_loopCount = int(dLoopCount);
479    if (adj_loopCount < 1) adj_loopCount = 1;
480
481
482    for (;;) {  // not really a loop, just allows "break" to work, to simplify
483                //   inadvertantly running more than one test through here.
484        if (opt_strcmp || opt_strcmpCPO)
485        {
486            unsigned long startTime = timeGetTime();
487            typedef int32_t (U_EXPORT2 *PF)(const UChar *, const UChar *);
488            PF pf = u_strcmp;
489            if (opt_strcmpCPO) {pf = u_strcmpCodePointOrder;}
490            //if (opt_strcmp && opt_win) {pf = (PF)wcscmp;}   // Damn the difference between int32_t and int
491                                                            //   which forces the use of a cast here.
492
493            int r = 0;
494            for (loops=0; loops<adj_loopCount; loops++) {
495
496                for (line=0; line < gNumFileLines; line++) {
497                    int hi      = gNumFileLines-1;
498                    int lo      = 0;
499                    int  guess = -1;
500                    for (;;) {
501                        int newGuess = (hi + lo) / 2;
502                        if (newGuess == guess)
503                            break;
504                        guess = newGuess;
505                        for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
506                            r = (*pf)((gSortedLines[line])->name, (gSortedLines[guess])->name);
507                        }
508                        gCount++;
509                        if (r== 0)
510                            break;
511                        if (r < 0)
512                            hi = guess;
513                        else
514                            lo   = guess;
515                    }
516                }
517            }
518            elapsedTime = timeGetTime() - startTime;
519            break;
520        }
521
522
523        if (opt_icu)
524        {
525            unsigned long startTime = timeGetTime();
526            UCollationResult  r = UCOL_EQUAL;
527            for (loops=0; loops<adj_loopCount; loops++) {
528
529                for (line=0; line < gNumFileLines; line++) {
530                    int lineLen  = -1;
531                    int guessLen = -1;
532                    if (opt_uselen) {
533                        lineLen = (gSortedLines[line])->len;
534                    }
535                    int hi      = gNumFileLines-1;
536                    int lo      = 0;
537                    int  guess = -1;
538                    for (;;) {
539                        int newGuess = (hi + lo) / 2;
540                        if (newGuess == guess)
541                            break;
542                        guess = newGuess;
543                        int ri = 0;
544                        if (opt_usekeys) {
545                            for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
546                                ri = strcmp((gSortedLines[line])->icuSortKey, (gSortedLines[guess])->icuSortKey);
547                            }
548                            gCount++;
549                            r=UCOL_GREATER; if(ri<0) {r=UCOL_LESS;} else if (ri==0) {r=UCOL_EQUAL;}
550                        }
551                        else
552                        {
553                            if (opt_uselen) {
554                                guessLen = (gSortedLines[guess])->len;
555                            }
556                            for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
557                                r = ucol_strcoll(gCol, (gSortedLines[line])->name, lineLen, (gSortedLines[guess])->name, guessLen);
558                            }
559                            gCount++;
560                        }
561                        if (r== UCOL_EQUAL)
562                            break;
563                        if (r == UCOL_LESS)
564                            hi = guess;
565                        else
566                            lo   = guess;
567                    }
568                }
569            }
570            elapsedTime = timeGetTime() - startTime;
571            break;
572        }
573
574        if (opt_win)
575        {
576            unsigned long startTime = timeGetTime();
577            int r = 0;
578            for (loops=0; loops<adj_loopCount; loops++) {
579
580                for (line=0; line < gNumFileLines; line++) {
581                    int lineLen  = -1;
582                    int guessLen = -1;
583                    if (opt_uselen) {
584                        lineLen = (gSortedLines[line])->len;
585                    }
586                    int hi   = gNumFileLines-1;
587                    int lo   = 0;
588                    int  guess = -1;
589                    for (;;) {
590                        int newGuess = (hi + lo) / 2;
591                        if (newGuess == guess)
592                            break;
593                        guess = newGuess;
594                        if (opt_usekeys) {
595                            for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
596                                r = strcmp((gSortedLines[line])->winSortKey, (gSortedLines[guess])->winSortKey);
597                            }
598                            gCount++;
599                            r+=2;
600                        }
601                        else
602                        {
603                            if (opt_uselen) {
604                                guessLen = (gSortedLines[guess])->len;
605                            }
606                            for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
607                                r = CompareStringW(gWinLCID, 0, (gSortedLines[line])->name, lineLen, (gSortedLines[guess])->name, guessLen);
608                            }
609                            if (r == 0) {
610                                if (opt_terse == FALSE) {
611                                    fprintf(stderr, "Error returned from Windows CompareStringW.\n");
612                                }
613                                exit(-1);
614                            }
615                            gCount++;
616                        }
617                        if (r== 2)   //  strings ==
618                            break;
619                        if (r == 1)  //  line < guess
620                            hi = guess;
621                        else         //  line > guess
622                            lo   = guess;
623                    }
624                }
625            }
626            elapsedTime = timeGetTime() - startTime;
627            break;
628        }
629
630        if (opt_unix)
631        {
632            unsigned long startTime = timeGetTime();
633            int r = 0;
634            for (loops=0; loops<adj_loopCount; loops++) {
635
636                for (line=0; line < gNumFileLines; line++) {
637                    int hi   = gNumFileLines-1;
638                    int lo   = 0;
639                    int  guess = -1;
640                    for (;;) {
641                        int newGuess = (hi + lo) / 2;
642                        if (newGuess == guess)
643                            break;
644                        guess = newGuess;
645                        if (opt_usekeys) {
646                            for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
647                                 r = strcmp((gSortedLines[line])->unixSortKey, (gSortedLines[guess])->unixSortKey);
648                            }
649                            gCount++;
650                        }
651                        else
652                        {
653                            for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
654                                r = strcoll((gSortedLines[line])->unixName, (gSortedLines[guess])->unixName);
655                            }
656                            errno = 0;
657                            if (errno != 0) {
658                                fprintf(stderr, "Error %d returned from strcoll.\n", errno);
659                                exit(-1);
660                            }
661                            gCount++;
662                        }
663                        if (r == 0)   //  strings ==
664                            break;
665                        if (r < 0)  //  line < guess
666                            hi = guess;
667                        else         //  line > guess
668                            lo   = guess;
669                    }
670                }
671            }
672            elapsedTime = timeGetTime() - startTime;
673            break;
674        }
675        break;
676    }
677
678    int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
679    if (opt_terse == FALSE) {
680        printf("binary search:  total # of string compares = %d\n", gCount);
681        printf("binary search:  compares per loop = %d\n", gCount / loops);
682        printf("binary search:  time per compare = %d ns\n", ns);
683    } else {
684        printf("%d, ", ns);
685    }
686
687}
688
689
690
691
692//---------------------------------------------------------------------------------------
693//
694//   doQSort()    The quick sort timing test.  Uses the C library qsort function.
695//
696//---------------------------------------------------------------------------------------
697void doQSort() {
698    int i;
699    Line **sortBuf = new Line *[gNumFileLines];
700
701    // Adjust loop count to compensate for file size.   QSort should be n log(n)
702    double dLoopCount = double(opt_loopCount) * 3000. / (log10((double)gNumFileLines) * double(gNumFileLines));
703    if (opt_usekeys) dLoopCount *= 5;
704    int adj_loopCount = int(dLoopCount);
705    if (adj_loopCount < 1) adj_loopCount = 1;
706
707
708    gCount = 0;
709    unsigned long startTime = timeGetTime();
710    if (opt_win && opt_usekeys) {
711        for (i=0; i<opt_loopCount; i++) {
712            memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
713            qsort(sortBuf, gNumFileLines, sizeof(Line *), WinstrcmpK);
714        }
715    }
716
717    else if (opt_win && opt_uselen) {
718        for (i=0; i<adj_loopCount; i++) {
719            memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
720            qsort(sortBuf, gNumFileLines, sizeof(Line *), WinstrcmpL);
721        }
722    }
723
724
725    else if (opt_win && !opt_uselen) {
726        for (i=0; i<adj_loopCount; i++) {
727            memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
728            qsort(sortBuf, gNumFileLines, sizeof(Line *), Winstrcmp);
729        }
730    }
731
732    else if (opt_icu && opt_usekeys) {
733        for (i=0; i<adj_loopCount; i++) {
734            memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
735            qsort(sortBuf, gNumFileLines, sizeof(Line *), ICUstrcmpK);
736        }
737    }
738
739    else if (opt_icu && opt_uselen) {
740        for (i=0; i<adj_loopCount; i++) {
741            memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
742            qsort(sortBuf, gNumFileLines, sizeof(Line *), ICUstrcmpL);
743        }
744    }
745
746
747    else if (opt_icu && !opt_uselen) {
748        for (i=0; i<adj_loopCount; i++) {
749            memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
750            qsort(sortBuf, gNumFileLines, sizeof(Line *), ICUstrcmp);
751        }
752    }
753
754    else if (opt_unix && !opt_usekeys) {
755        for (i=0; i<adj_loopCount; i++) {
756            memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
757            qsort(sortBuf, gNumFileLines, sizeof(Line *), UNIXstrcmp);
758        }
759    }
760
761    unsigned long elapsedTime = timeGetTime() - startTime;
762    int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
763    if (opt_terse == FALSE) {
764        printf("qsort:  total # of string compares = %d\n", gCount);
765        printf("qsort:  time per compare = %d ns\n", ns);
766    } else {
767        printf("%d, ", ns);
768    }
769}
770
771
772
773//---------------------------------------------------------------------------------------
774//
775//    doKeyHist()       Output a table of data for
776//                        average sort key size vs. string length.
777//
778//---------------------------------------------------------------------------------------
779void doKeyHist() {
780    int     i;
781    int     maxLen = 0;
782
783    // Find the maximum string length
784    for (i=0; i<gNumFileLines; i++) {
785        if (gFileLines[i].len > maxLen) maxLen = gFileLines[i].len;
786    }
787
788    // Allocate arrays to hold the histogram data
789    int *accumulatedLen  = new int[maxLen+1];
790    int *numKeysOfSize   = new int[maxLen+1];
791    for (i=0; i<=maxLen; i++) {
792        accumulatedLen[i] = 0;
793        numKeysOfSize[i] = 0;
794    }
795
796    // Fill the arrays...
797    for (i=0; i<gNumFileLines; i++) {
798        int len = gFileLines[i].len;
799        accumulatedLen[len] += strlen(gFileLines[i].icuSortKey);
800        numKeysOfSize[len] += 1;
801    }
802
803    // And write out averages
804    printf("String Length,  Avg Key Length,  Avg Key Len per char\n");
805    for (i=1; i<=maxLen; i++) {
806        if (numKeysOfSize[i] > 0) {
807            printf("%d, %f, %f\n", i, (float)accumulatedLen[i] / (float)numKeysOfSize[i],
808                (float)accumulatedLen[i] / (float)(numKeysOfSize[i] * i));
809        }
810    }
811    delete []accumulatedLen;
812    delete []numKeysOfSize ;
813}
814
815//---------------------------------------------------------------------------------------
816//
//    doForwardIterTest(UBool)       Forward iteration test
//                                   argument: TRUE to pass explicit string lengths,
//                                   FALSE to treat the strings as null-terminated
819//
820//---------------------------------------------------------------------------------------
821void doForwardIterTest(UBool haslen) {
822    int count = 0;
823
824    UErrorCode error = U_ZERO_ERROR;
825    printf("\n\nPerforming forward iteration performance test with ");
826
827    if (haslen) {
828        printf("non-null terminated data -----------\n");
829    }
830    else {
831        printf("null terminated data -----------\n");
832    }
833    printf("performance test on strings from file -----------\n");
834
835    UChar dummytext[] = {0, 0};
836    UCollationElements *iter = ucol_openElements(gCol, NULL, 0, &error);
837    ucol_setText(iter, dummytext, 1, &error);
838
839    gCount = 0;
840    unsigned long startTime = timeGetTime();
841    while (count < opt_loopCount) {
842        int linecount = 0;
843        while (linecount < gNumFileLines) {
844            UChar *str = gFileLines[linecount].name;
845            int strlen = haslen?gFileLines[linecount].len:-1;
846            ucol_setText(iter, str, strlen, &error);
847            while (ucol_next(iter, &error) != UCOL_NULLORDER) {
848                gCount++;
849            }
850
851            linecount ++;
852        }
853        count ++;
854    }
855    unsigned long elapsedTime = timeGetTime() - startTime;
856    printf("elapsedTime %ld\n", elapsedTime);
857
858    // empty loop recalculation
859    count = 0;
860    startTime = timeGetTime();
861    while (count < opt_loopCount) {
862        int linecount = 0;
863        while (linecount < gNumFileLines) {
864            UChar *str = gFileLines[linecount].name;
865            int strlen = haslen?gFileLines[linecount].len:-1;
866            ucol_setText(iter, str, strlen, &error);
867            linecount ++;
868        }
869        count ++;
870    }
871    elapsedTime -= (timeGetTime() - startTime);
872    printf("elapsedTime %ld\n", elapsedTime);
873
874    ucol_closeElements(iter);
875
876    int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
877    printf("Total number of strings compared %d in %d loops\n", gNumFileLines,
878                                                                opt_loopCount);
879    printf("Average time per ucol_next() nano seconds %d\n", ns);
880
881    printf("performance test on skipped-5 concatenated strings from file -----------\n");
882
883    UChar *str;
884    int    strlen = 0;
885    // appending all the strings
886    int linecount = 0;
887    while (linecount < gNumFileLines) {
888        strlen += haslen?gFileLines[linecount].len:
889                                      u_strlen(gFileLines[linecount].name);
890        linecount ++;
891    }
892    str = (UChar *)malloc(sizeof(UChar) * strlen);
893    int strindex = 0;
894    linecount = 0;
895    while (strindex < strlen) {
896        int len = 0;
897        len += haslen?gFileLines[linecount].len:
898                                      u_strlen(gFileLines[linecount].name);
899        memcpy(str + strindex, gFileLines[linecount].name,
900               sizeof(UChar) * len);
901        strindex += len;
902        linecount ++;
903    }
904
905    printf("Total size of strings %d\n", strlen);
906
907    gCount = 0;
908    count  = 0;
909
910    if (!haslen) {
911        strlen = -1;
912    }
913    iter = ucol_openElements(gCol, str, strlen, &error);
914    if (!haslen) {
915        strlen = u_strlen(str);
916    }
917    strlen -= 5; // any left over characters are not iterated,
918                 // this is to ensure the backwards and forwards iterators
919                 // gets the same position
920    startTime = timeGetTime();
921    while (count < opt_loopCount) {
922        int count5 = 5;
923        strindex = 0;
924        ucol_setOffset(iter, strindex, &error);
925        while (TRUE) {
926            if (ucol_next(iter, &error) == UCOL_NULLORDER) {
927                break;
928            }
929            gCount++;
930            count5 --;
931            if (count5 == 0) {
932                strindex += 10;
933                if (strindex > strlen) {
934                    break;
935                }
936                ucol_setOffset(iter, strindex, &error);
937                count5 = 5;
938            }
939        }
940        count ++;
941    }
942
943    elapsedTime = timeGetTime() - startTime;
944    printf("elapsedTime %ld\n", elapsedTime);
945
946    // empty loop recalculation
947    int tempgCount = 0;
948    count = 0;
949    startTime = timeGetTime();
950    while (count < opt_loopCount) {
951        int count5 = 5;
952        strindex = 0;
953        ucol_setOffset(iter, strindex, &error);
954        while (TRUE) {
955            tempgCount ++;
956            count5 --;
957            if (count5 == 0) {
958                strindex += 10;
959                if (strindex > strlen) {
960                    break;
961                }
962                ucol_setOffset(iter, strindex, &error);
963                count5 = 5;
964            }
965        }
966        count ++;
967    }
968    elapsedTime -= (timeGetTime() - startTime);
969    printf("elapsedTime %ld\n", elapsedTime);
970
971    ucol_closeElements(iter);
972
973    printf("gCount %d\n", gCount);
974    ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
975    printf("Average time per ucol_next() nano seconds %d\n", ns);
976}
977
978//---------------------------------------------------------------------------------------
979//
980//    doBackwardIterTest(UBool)      Backwards iteration test
981//                                   argument null-terminated string used
982//
983//---------------------------------------------------------------------------------------
984void doBackwardIterTest(UBool haslen) {
985    int count = 0;
986    UErrorCode error = U_ZERO_ERROR;
987    printf("\n\nPerforming backward iteration performance test with ");
988
989    if (haslen) {
990        printf("non-null terminated data -----------\n");
991    }
992    else {
993        printf("null terminated data -----------\n");
994    }
995
996    printf("performance test on strings from file -----------\n");
997
998    UCollationElements *iter = ucol_openElements(gCol, NULL, 0, &error);
999    UChar dummytext[] = {0, 0};
1000    ucol_setText(iter, dummytext, 1, &error);
1001
1002    gCount = 0;
1003    unsigned long startTime = timeGetTime();
1004    while (count < opt_loopCount) {
1005        int linecount = 0;
1006        while (linecount < gNumFileLines) {
1007            UChar *str = gFileLines[linecount].name;
1008            int strlen = haslen?gFileLines[linecount].len:-1;
1009            ucol_setText(iter, str, strlen, &error);
1010            while (ucol_previous(iter, &error) != UCOL_NULLORDER) {
1011                gCount ++;
1012            }
1013
1014            linecount ++;
1015        }
1016        count ++;
1017    }
1018    unsigned long elapsedTime = timeGetTime() - startTime;
1019
1020    printf("elapsedTime %ld\n", elapsedTime);
1021
1022    // empty loop recalculation
1023    count = 0;
1024    startTime = timeGetTime();
1025    while (count < opt_loopCount) {
1026        int linecount = 0;
1027        while (linecount < gNumFileLines) {
1028            UChar *str = gFileLines[linecount].name;
1029            int strlen = haslen?gFileLines[linecount].len:-1;
1030            ucol_setText(iter, str, strlen, &error);
1031            linecount ++;
1032        }
1033        count ++;
1034    }
1035    elapsedTime -= (timeGetTime() - startTime);
1036
1037    printf("elapsedTime %ld\n", elapsedTime);
1038    ucol_closeElements(iter);
1039
1040    int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
1041    printf("Total number of strings compared %d in %d loops\n", gNumFileLines,
1042                                                                opt_loopCount);
1043    printf("Average time per ucol_previous() nano seconds %d\n", ns);
1044
1045    printf("performance test on skipped-5 concatenated strings from file -----------\n");
1046
1047    UChar *str;
1048    int    strlen = 0;
1049    // appending all the strings
1050    int linecount = 0;
1051    while (linecount < gNumFileLines) {
1052        strlen += haslen?gFileLines[linecount].len:
1053                                      u_strlen(gFileLines[linecount].name);
1054        linecount ++;
1055    }
1056    str = (UChar *)malloc(sizeof(UChar) * strlen);
1057    int strindex = 0;
1058    linecount = 0;
1059    while (strindex < strlen) {
1060        int len = 0;
1061        len += haslen?gFileLines[linecount].len:
1062                                      u_strlen(gFileLines[linecount].name);
1063        memcpy(str + strindex, gFileLines[linecount].name,
1064               sizeof(UChar) * len);
1065        strindex += len;
1066        linecount ++;
1067    }
1068
1069    printf("Total size of strings %d\n", strlen);
1070
1071    gCount = 0;
1072    count  = 0;
1073
1074    if (!haslen) {
1075        strlen = -1;
1076    }
1077
1078    iter = ucol_openElements(gCol, str, strlen, &error);
1079    if (!haslen) {
1080        strlen = u_strlen(str);
1081    }
1082
1083    startTime = timeGetTime();
1084    while (count < opt_loopCount) {
1085        int count5 = 5;
1086        strindex = 5;
1087        ucol_setOffset(iter, strindex, &error);
1088        while (TRUE) {
1089            if (ucol_previous(iter, &error) == UCOL_NULLORDER) {
1090                break;
1091            }
1092             gCount ++;
1093             count5 --;
1094             if (count5 == 0) {
1095                 strindex += 10;
1096                 if (strindex > strlen) {
1097                    break;
1098                 }
1099                 ucol_setOffset(iter, strindex, &error);
1100                 count5 = 5;
1101             }
1102        }
1103        count ++;
1104    }
1105
1106    elapsedTime = timeGetTime() - startTime;
1107    printf("elapsedTime %ld\n", elapsedTime);
1108
1109    // empty loop recalculation
1110    count = 0;
1111    int tempgCount = 0;
1112    startTime = timeGetTime();
1113    while (count < opt_loopCount) {
1114        int count5 = 5;
1115        strindex = 5;
1116        ucol_setOffset(iter, strindex, &error);
1117        while (TRUE) {
1118             tempgCount ++;
1119             count5 --;
1120             if (count5 == 0) {
1121                 strindex += 10;
1122                 if (strindex > strlen) {
1123                    break;
1124                 }
1125                 ucol_setOffset(iter, strindex, &error);
1126                 count5 = 5;
1127             }
1128        }
1129        count ++;
1130    }
1131    elapsedTime -= (timeGetTime() - startTime);
1132    printf("elapsedTime %ld\n", elapsedTime);
1133    ucol_closeElements(iter);
1134
1135    printf("gCount %d\n", gCount);
1136    ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
1137    printf("Average time per ucol_previous() nano seconds %d\n", ns);
1138}
1139
1140//---------------------------------------------------------------------------------------
1141//
1142//    doIterTest()       Iteration test
1143//
1144//---------------------------------------------------------------------------------------
1145void doIterTest() {
1146    doForwardIterTest(opt_uselen);
1147    doBackwardIterTest(opt_uselen);
1148}
1149
1150
1151//----------------------------------------------------------------------------------------
1152//
1153//   UnixConvert   -- Convert the lines of the file to the encoding for UNIX
1154//                    Since it appears that Unicode support is going in the general
1155//                    direction of the use of UTF-8 locales, that is the approach
1156//                    that is used here.
1157//
1158//----------------------------------------------------------------------------------------
1159void  UnixConvert() {
1160    int    line;
1161
1162    UConverter   *cvrtr;    // An ICU code page converter.
1163    UErrorCode    status = U_ZERO_ERROR;
1164
1165
1166    cvrtr = ucnv_open("utf-8", &status);    // we are just doing UTF-8 locales for now.
1167    if (U_FAILURE(status)) {
1168        fprintf(stderr, "ICU Converter open failed.: %s\n", u_errorName(status));
1169        exit(-1);
1170    }
1171
1172    for (line=0; line < gNumFileLines; line++) {
1173        int sizeNeeded = ucnv_fromUChars(cvrtr,
1174                                         0,            // ptr to target buffer.
1175                                         0,            // length of target buffer.
1176                                         gFileLines[line].name,
1177                                         -1,           //  source is null terminated
1178                                         &status);
1179        if (status != U_BUFFER_OVERFLOW_ERROR && status != U_ZERO_ERROR) {
1180            //fprintf(stderr, "Conversion from Unicode, something is wrong.\n");
1181            //exit(-1);
1182        }
1183        status = U_ZERO_ERROR;
1184        gFileLines[line].unixName = new char[sizeNeeded+1];
1185        sizeNeeded = ucnv_fromUChars(cvrtr,
1186                                         gFileLines[line].unixName, // ptr to target buffer.
1187                                         sizeNeeded+1, // length of target buffer.
1188                                         gFileLines[line].name,
1189                                         -1,           //  source is null terminated
1190                                         &status);
1191        if (U_FAILURE(status)) {
1192            fprintf(stderr, "ICU Conversion Failed.: %d\n", status);
1193            exit(-1);
1194        }
1195        gFileLines[line].unixName[sizeNeeded] = 0;
1196    };
1197    ucnv_close(cvrtr);
1198}
1199
1200
1201//----------------------------------------------------------------------------------------
1202//
1203//  class UCharFile   Class to hide all the gorp to read a file in
1204//                    and produce a stream of UChars.
1205//
1206//----------------------------------------------------------------------------------------
1207class UCharFile {
1208public:
1209    UCharFile(const char *fileName);
1210    ~UCharFile();
1211    UChar   get();
1212    UBool   eof() {return fEof;};
1213    UBool   error() {return fError;};
1214
1215private:
1216    UCharFile (const UCharFile & /*other*/) {};                         // No copy constructor.
1217    UCharFile & operator = (const UCharFile &/*other*/) {return *this;};   // No assignment op
1218
1219    FILE         *fFile;
1220    const char   *fName;
1221    UBool        fEof;
1222    UBool        fError;
1223    UChar        fPending2ndSurrogate;
1224
1225    enum {UTF16LE, UTF16BE, UTF8} fEncoding;
1226};
1227
1228UCharFile::UCharFile(const char * fileName) {
1229    fEof                 = FALSE;
1230    fError               = FALSE;
1231    fName                = fileName;
1232    fFile                = fopen(fName, "rb");
1233    fPending2ndSurrogate = 0;
1234    if (fFile == NULL) {
1235        fprintf(stderr, "Can not open file \"%s\"\n", opt_fName);
1236        fError = TRUE;
1237        return;
1238    }
1239    //
1240    //  Look for the byte order mark at the start of the file.
1241    //
1242    int BOMC1, BOMC2, BOMC3;
1243    BOMC1 = fgetc(fFile);
1244    BOMC2 = fgetc(fFile);
1245
1246    if (BOMC1 == 0xff && BOMC2 == 0xfe) {
1247        fEncoding = UTF16LE; }
1248    else if (BOMC1 == 0xfe && BOMC2 == 0xff) {
1249        fEncoding = UTF16BE; }
1250    else if (BOMC1 == 0xEF && BOMC2 == 0xBB && (BOMC3 = fgetc(fFile)) == 0xBF ) {
1251        fEncoding = UTF8; }
1252    else
1253    {
1254        fprintf(stderr, "collperf:  file \"%s\" encoding must be UTF-8 or UTF-16, and "
1255            "must include a BOM.\n", fileName);
1256        fError = true;
1257        return;
1258    }
1259}
1260
1261
1262UCharFile::~UCharFile() {
1263    fclose(fFile);
1264}
1265
1266
1267
1268UChar UCharFile::get() {
1269    UChar   c;
1270    switch (fEncoding) {
1271    case UTF16LE:
1272        {
1273            int  cL, cH;
1274            cL = fgetc(fFile);
1275            cH = fgetc(fFile);
1276            c  = cL  | (cH << 8);
1277            if (cH == EOF) {
1278                c   = 0;
1279                fEof = TRUE;
1280            }
1281            break;
1282        }
1283    case UTF16BE:
1284        {
1285            int  cL, cH;
1286            cH = fgetc(fFile);
1287            cL = fgetc(fFile);
1288            c  = cL  | (cH << 8);
1289            if (cL == EOF) {
1290                c   = 0;
1291                fEof = TRUE;
1292            }
1293            break;
1294        }
1295    case UTF8:
1296        {
1297            if (fPending2ndSurrogate != 0) {
1298                c = fPending2ndSurrogate;
1299                fPending2ndSurrogate = 0;
1300                break;
1301            }
1302
1303            int ch = fgetc(fFile);   // Note:  c and ch are separate cause eof test doesn't work on UChar type.
1304            if (ch == EOF) {
1305                c = 0;
1306                fEof = TRUE;
1307                break;
1308            }
1309
1310            if (ch <= 0x7f) {
1311                // It's ascii.  No further utf-8 conversion.
1312                c = ch;
1313                break;
1314            }
1315
1316            // Figure out the lenght of the char and read the rest of the bytes
1317            //   into a temp array.
1318            int nBytes;
1319            if (ch >= 0xF0) {nBytes=4;}
1320            else if (ch >= 0xE0) {nBytes=3;}
1321            else if (ch >= 0xC0) {nBytes=2;}
1322            else {
1323                fprintf(stderr, "utf-8 encoded file contains corrupt data.\n");
1324                fError = TRUE;
1325                return 0;
1326            }
1327
1328            unsigned char  bytes[10];
1329            bytes[0] = (unsigned char)ch;
1330            int i;
1331            for (i=1; i<nBytes; i++) {
1332                bytes[i] = fgetc(fFile);
1333                if (bytes[i] < 0x80 || bytes[i] >= 0xc0) {
1334                    fprintf(stderr, "utf-8 encoded file contains corrupt data.\n");
1335                    fError = TRUE;
1336                    return 0;
1337                }
1338            }
1339
1340            // Convert the bytes from the temp array to a Unicode char.
1341            i = 0;
1342            uint32_t  cp;
1343            U8_NEXT_UNSAFE(bytes, i, cp);
1344            c = (UChar)cp;
1345
1346            if (cp >= 0x10000) {
1347                // The code point needs to be broken up into a utf-16 surrogate pair.
1348                //  Process first half this time through the main loop, and
1349                //   remember the other half for the next time through.
1350                UChar utf16Buf[3];
1351                i = 0;
1352                UTF16_APPEND_CHAR_UNSAFE(utf16Buf, i, cp);
1353                fPending2ndSurrogate = utf16Buf[1];
1354                c = utf16Buf[0];
1355            }
1356            break;
1357        };
1358    default:
1359        c = 0xFFFD; /* Error, unspecified codepage*/
1360        fprintf(stderr, "UCharFile: Error: unknown fEncoding\n");
1361        exit(1);
1362    }
1363    return c;
1364}
1365
1366//----------------------------------------------------------------------------------------
1367//
1368//   openRulesCollator  - Command line specified a rules file.  Read it in
1369//                        and open a collator with it.
1370//
1371//----------------------------------------------------------------------------------------
1372UCollator *openRulesCollator() {
1373    UCharFile f(opt_rules);
1374    if (f.error()) {
1375        return 0;
1376    }
1377
1378    int  bufLen = 10000;
1379    UChar *buf = (UChar *)malloc(bufLen * sizeof(UChar));
1380    UChar *tmp;
1381    int i = 0;
1382
1383    for(;;) {
1384        buf[i] = f.get();
1385        if (f.eof()) {
1386            break;
1387        }
1388        if (f.error()) {
1389            return 0;
1390        }
1391        i++;
1392        if (i >= bufLen) {
1393            tmp = buf;
1394            bufLen += 10000;
1395            buf = (UChar *)realloc(buf, bufLen);
1396            if (buf == NULL) {
1397                free(tmp);
1398                return 0;
1399            }
1400        }
1401    }
1402    buf[i] = 0;
1403
1404    UErrorCode    status = U_ZERO_ERROR;
1405    UCollator *coll = ucol_openRules(buf, u_strlen(buf), UCOL_OFF,
1406                                         UCOL_DEFAULT_STRENGTH, NULL, &status);
1407    if (U_FAILURE(status)) {
1408        fprintf(stderr, "ICU ucol_openRules() open failed.: %d\n", status);
1409        return 0;
1410    }
1411    free(buf);
1412    return coll;
1413}
1414
1415
1416
1417
1418
1419//----------------------------------------------------------------------------------------
1420//
1421//    Main   --  process command line, read in and pre-process the test file,
1422//                 call other functions to do the actual tests.
1423//
1424//----------------------------------------------------------------------------------------
1425int main(int argc, const char** argv) {
1426    if (ProcessOptions(argc, argv, opts) != TRUE || opt_help || opt_fName == 0) {
1427        printf(gUsageString);
1428        exit (1);
1429    }
1430
1431    // Make sure that we've only got one API selected.
1432    if (opt_unix || opt_win) opt_icu = FALSE;
1433    if (opt_unix) opt_win = FALSE;
1434
1435    //
1436    //  Set up an ICU collator
1437    //
1438    UErrorCode          status = U_ZERO_ERROR;
1439
1440    if (opt_rules != 0) {
1441        gCol = openRulesCollator();
1442        if (gCol == 0) {return -1;}
1443    }
1444    else {
1445        gCol = ucol_open(opt_locale, &status);
1446        if (U_FAILURE(status)) {
1447            fprintf(stderr, "Collator creation failed.: %d\n", status);
1448            return -1;
1449        }
1450    }
1451    if (status==U_USING_DEFAULT_WARNING && opt_terse==FALSE) {
1452        fprintf(stderr, "Warning, U_USING_DEFAULT_WARNING for %s\n", opt_locale);
1453    }
1454    if (status==U_USING_FALLBACK_WARNING && opt_terse==FALSE) {
1455        fprintf(stderr, "Warning, U_USING_FALLBACK_ERROR for %s\n", opt_locale);
1456    }
1457
1458    if (opt_norm) {
1459        ucol_setAttribute(gCol, UCOL_NORMALIZATION_MODE, UCOL_ON, &status);
1460    }
1461    if (opt_french && opt_frenchoff) {
1462        fprintf(stderr, "collperf:  Error, specified both -french and -frenchoff options.");
1463        exit(-1);
1464    }
1465    if (opt_french) {
1466        ucol_setAttribute(gCol, UCOL_FRENCH_COLLATION, UCOL_ON, &status);
1467    }
1468    if (opt_frenchoff) {
1469        ucol_setAttribute(gCol, UCOL_FRENCH_COLLATION, UCOL_OFF, &status);
1470    }
1471    if (opt_lower) {
1472        ucol_setAttribute(gCol, UCOL_CASE_FIRST, UCOL_LOWER_FIRST, &status);
1473    }
1474    if (opt_upper) {
1475        ucol_setAttribute(gCol, UCOL_CASE_FIRST, UCOL_UPPER_FIRST, &status);
1476    }
1477    if (opt_case) {
1478        ucol_setAttribute(gCol, UCOL_CASE_LEVEL, UCOL_ON, &status);
1479    }
1480    if (opt_shifted) {
1481        ucol_setAttribute(gCol, UCOL_ALTERNATE_HANDLING, UCOL_SHIFTED, &status);
1482    }
1483    if (opt_level != 0) {
1484        switch (opt_level) {
1485        case 1:
1486            ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_PRIMARY, &status);
1487            break;
1488        case 2:
1489            ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_SECONDARY, &status);
1490            break;
1491        case 3:
1492            ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_TERTIARY, &status);
1493            break;
1494        case 4:
1495            ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_QUATERNARY, &status);
1496            break;
1497        case 5:
1498            ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_IDENTICAL, &status);
1499            break;
1500        default:
1501            fprintf(stderr, "-level param must be between 1 and 5\n");
1502            exit(-1);
1503        }
1504    }
1505
1506    if (U_FAILURE(status)) {
1507        fprintf(stderr, "Collator attribute setting failed.: %d\n", status);
1508        return -1;
1509    }
1510
1511
1512    //
1513    //  Set up a Windows LCID
1514    //
1515    if (opt_langid != 0) {
1516        gWinLCID = MAKELCID(opt_langid, SORT_DEFAULT);
1517    }
1518    else {
1519        gWinLCID = uloc_getLCID(opt_locale);
1520    }
1521
1522
1523    //
1524    //  Set the UNIX locale
1525    //
1526    if (opt_unix) {
1527        if (setlocale(LC_ALL, opt_locale) == 0) {
1528            fprintf(stderr, "setlocale(LC_ALL, %s) failed.\n", opt_locale);
1529            exit(-1);
1530        }
1531    }
1532
1533    // Read in  the input file.
1534    //   File assumed to be utf-16.
1535    //   Lines go onto heap buffers.  Global index array to line starts is created.
1536    //   Lines themselves are null terminated.
1537    //
1538
1539    UCharFile f(opt_fName);
1540    if (f.error()) {
1541        exit(-1);
1542    }
1543
1544    const int MAXLINES = 100000;
1545    gFileLines = new Line[MAXLINES];
1546    UChar buf[1024];
1547    int   column = 0;
1548
1549    //  Read the file, split into lines, and save in memory.
1550    //  Loop runs once per utf-16 value from the input file,
1551    //    (The number of bytes read from file per loop iteration depends on external encoding.)
1552    for (;;) {
1553
1554        UChar c = f.get();
1555        if (f.error()){
1556            exit(-1);
1557        }
1558
1559
1560        // We now have a good UTF-16 value in c.
1561
1562        // Watch for CR, LF, EOF; these finish off a line.
1563        if (c == 0xd) {
1564            continue;
1565        }
1566
1567        if (f.eof() || c == 0x0a || c==0x2028) {  // Unipad inserts 2028 line separators!
1568            buf[column++] = 0;
1569            if (column > 1) {
1570                gFileLines[gNumFileLines].name  = new UChar[column];
1571                gFileLines[gNumFileLines].len   = column-1;
1572                memcpy(gFileLines[gNumFileLines].name, buf, column * sizeof(UChar));
1573                gNumFileLines++;
1574                column = 0;
1575                if (gNumFileLines >= MAXLINES) {
1576                    fprintf(stderr, "File too big.  Max number of lines is %d\n", MAXLINES);
1577                    exit(-1);
1578                }
1579
1580            }
1581            if (c == 0xa || c == 0x2028)
1582                continue;
1583            else
1584                break;  // EOF
1585        }
1586        buf[column++] = c;
1587        if (column >= 1023)
1588        {
1589            static UBool warnFlag = TRUE;
1590            if (warnFlag) {
1591                fprintf(stderr, "Warning - file line longer than 1023 chars truncated.\n");
1592                warnFlag = FALSE;
1593            }
1594            column--;
1595        }
1596    }
1597
1598    if (opt_terse == FALSE) {
1599        printf("file \"%s\", %d lines.\n", opt_fName, gNumFileLines);
1600    }
1601
1602
1603    // Convert the lines to the UNIX encoding.
1604    if (opt_unix) {
1605        UnixConvert();
1606    }
1607
1608    //
1609    //  Pre-compute ICU sort keys for the lines of the file.
1610    //
1611    int line;
1612    int32_t t;
1613
1614    for (line=0; line<gNumFileLines; line++) {
1615         t = ucol_getSortKey(gCol, gFileLines[line].name, -1, (unsigned char *)buf, sizeof(buf));
1616         gFileLines[line].icuSortKey  = new char[t];
1617
1618         if (t > (int32_t)sizeof(buf)) {
1619             t = ucol_getSortKey(gCol, gFileLines[line].name, -1, (unsigned char *)gFileLines[line].icuSortKey , t);
1620         }
1621         else
1622         {
1623             memcpy(gFileLines[line].icuSortKey, buf, t);
1624         }
1625    }
1626
1627
1628
1629    //
1630    //  Pre-compute Windows sort keys for the lines of the file.
1631    //
1632    for (line=0; line<gNumFileLines; line++) {
1633         t=LCMapStringW(gWinLCID, LCMAP_SORTKEY, gFileLines[line].name, -1, buf, sizeof(buf));
1634         gFileLines[line].winSortKey  = new char[t];
1635         if (t > (int32_t)sizeof(buf)) {
1636             t = LCMapStringW(gWinLCID, LCMAP_SORTKEY, gFileLines[line].name, -1, (unsigned short *)(gFileLines[line].winSortKey), t);
1637         }
1638         else
1639         {
1640             memcpy(gFileLines[line].winSortKey, buf, t);
1641         }
1642    }
1643
1644    //
1645    //  Pre-compute UNIX sort keys for the lines of the file.
1646    //
1647    if (opt_unix) {
1648        for (line=0; line<gNumFileLines; line++) {
1649            t=strxfrm((char *)buf,  gFileLines[line].unixName,  sizeof(buf));
1650            gFileLines[line].unixSortKey  = new char[t];
1651            if (t > (int32_t)sizeof(buf)) {
1652                t = strxfrm(gFileLines[line].unixSortKey,  gFileLines[line].unixName,  sizeof(buf));
1653            }
1654            else
1655            {
1656                memcpy(gFileLines[line].unixSortKey, buf, t);
1657            }
1658        }
1659    }
1660
1661
1662    //
1663    //  Dump file lines, CEs, Sort Keys if requested.
1664    //
1665    if (opt_dump) {
1666        int  i;
1667        for (line=0; line<gNumFileLines; line++) {
1668            for (i=0;;i++) {
1669                UChar  c = gFileLines[line].name[i];
1670                if (c == 0)
1671                    break;
1672                if (c < 0x20 || c > 0x7e) {
1673                    printf("\\u%.4x", c);
1674                }
1675                else {
1676                    printf("%c", c);
1677                }
1678            }
1679            printf("\n");
1680
1681            printf("   CEs: ");
1682            UCollationElements *CEiter = ucol_openElements(gCol, gFileLines[line].name, -1, &status);
1683            int32_t ce;
1684            i = 0;
1685            for (;;) {
1686                ce = ucol_next(CEiter, &status);
1687                if (ce == UCOL_NULLORDER) {
1688                    break;
1689                }
1690                printf(" %.8x", ce);
1691                if (++i > 8) {
1692                    printf("\n        ");
1693                    i = 0;
1694                }
1695            }
1696            printf("\n");
1697            ucol_closeElements(CEiter);
1698
1699
1700            printf("   ICU Sort Key: ");
1701            for (i=0; ; i++) {
1702                unsigned char c = gFileLines[line].icuSortKey[i];
1703                printf("%02x ", c);
1704                if (c == 0) {
1705                    break;
1706                }
1707                if (i > 0 && i % 20 == 0) {
1708                    printf("\n                 ");
1709                }
1710           }
1711            printf("\n");
1712        }
1713    }
1714
1715
1716    //
1717    //  Pre-sort the lines.
1718    //
1719    int i;
1720    gSortedLines = new Line *[gNumFileLines];
1721    for (i=0; i<gNumFileLines; i++) {
1722        gSortedLines[i] = &gFileLines[i];
1723    }
1724
1725    if (opt_win) {
1726        qsort(gSortedLines, gNumFileLines, sizeof(Line *), Winstrcmp);
1727    }
1728    else if (opt_unix) {
1729        qsort(gSortedLines, gNumFileLines, sizeof(Line *), UNIXstrcmp);
1730    }
1731    else   /* ICU */
1732    {
1733        qsort(gSortedLines, gNumFileLines, sizeof(Line *), ICUstrcmp);
1734    }
1735
1736
1737    //
1738    //  Make up a randomized order, will be used for sorting tests.
1739    //
1740    gRandomLines = new Line *[gNumFileLines];
1741    for (i=0; i<gNumFileLines; i++) {
1742        gRandomLines[i] = &gFileLines[i];
1743    }
1744    qsort(gRandomLines, gNumFileLines, sizeof(Line *), ICURandomCmp);
1745
1746
1747
1748
1749    //
1750    //  We've got the file read into memory.  Go do something with it.
1751    //
1752
1753    if (opt_qsort)     doQSort();
1754    if (opt_binsearch) doBinarySearch();
1755    if (opt_keygen)    doKeyGen();
1756    if (opt_keyhist)   doKeyHist();
1757    if (opt_itertest)  doIterTest();
1758
1759    return 0;
1760
1761}
1762