164339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert/***********************************************************************
20596faeddefbf198de137d5e893708495ab1584cFredrik Roubert * © 2016 and later: Unicode, Inc. and others.
364339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert * License & terms of use: http://www.unicode.org/copyright.html#License
464339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert *
564339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert ***********************************************************************
664339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert ***********************************************************************
7ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * COPYRIGHT:
854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * Copyright (C) 2001-2012 IBM, Inc.   All Rights Reserved.
9ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *
1064339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert ***********************************************************************/
11ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/********************************************************************************
12ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*
13ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* File ubrkperf.cpp
14ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*
15ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* Modification History:
16ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*        Name                     Description
17ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*     Vladimir Weinstein          First Version, based on collperf
18ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*
19ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*********************************************************************************
20ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*/
21ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
22ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//
23ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//  This program tests break iterator performance
24ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//      Currently we test only ICU APIs with the future possibility of testing *nix & win32 APIs
25ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//      (if any)
26ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//      A text file is required as input.  It must be in utf-8 or utf-16 format,
27ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//      and include a byte order mark.  Either LE or BE format is OK.
28ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//
29ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
// Help text describing the command line options.
// NOTE(review): this list is not in sync with the opts[] option table in this
// file: "-iloop" is described here but has no table entry, while "-time",
// "-passes" and "-?" have table entries but are not described here.
const char gUsageString[] =
 "usage:  ubrkperf options...\n"
    "-help                      Display this message.\n"
    "-file file_name            utf-16/utf-8 format file.\n"
    "-locale name               ICU locale to use.  Default is en_US\n"
    "-langid 0x1234             Windows Language ID number.  Default to value for -locale option\n"
    "                              see http://msdn.microsoft.com/library/psdk/winbase/nls_8xo3.htm\n"
    "-win                       Run test using Windows native services. (currently not working) (ICU is default)\n"
    "-unix                      Run test using Unix word breaking services. (currently not working) \n"
    "-mac                       Run test using MacOSX word breaking services.\n"
    "-uselen                    Use API with string lengths.  Default is null-terminated strings\n"
    "-char                      Use character break iterator\n"
    "-word                      Use word break iterator\n"
    "-line                      Use line break iterator\n"
    "-sentence                  Use sentence break iterator\n"
    "-loop nnnn                 Loopcount for test.  Adjust for reasonable total running time.\n"
    "-iloop n                   Inner Loop Count.  Default = 1.  Number of calls to function\n"
    "                               under test at each call point.  For measuring test overhead.\n"
    "-terse                     Terse numbers-only output.  Intended for use by scripts.\n"
    "-dump                      Display stuff.\n"
    "-capi                      Use C APIs instead of C++ APIs (currently not working)\n"
    "-next                      Do the next test\n"
    "-isBound                   Do the isBound test\n"
    ;
54ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
55ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
56ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include <stdio.h>
57ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include <string.h>
58ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include <stdlib.h>
59ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include <math.h>
60ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include <locale.h>
61ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include <errno.h>
62ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include <sys/stat.h>
63ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
64ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include <unicode/utypes.h>
65ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include <unicode/ucol.h>
66ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include <unicode/ucoleitr.h>
67ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include <unicode/uloc.h>
68ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include <unicode/ustring.h>
69ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include <unicode/ures.h>
70ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include <unicode/uchar.h>
71ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include <unicode/ucnv.h>
72ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include <unicode/utf8.h>
73ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
74ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include <unicode/brkiter.h>
75ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
76ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
77103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius#if U_PLATFORM_HAS_WIN32_API
78ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include <windows.h>
79ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#else
80ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//
81ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//  Stubs for Windows API functions when building on UNIXes.
82ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//
83ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include <sys/time.h>
// Stand-in for the Win32 timeGetTime(): returns the wall-clock time reported
// by gettimeofday(), expressed in milliseconds.  The result is only meaningful
// as the difference between two calls; it is allowed to wrap around.
unsigned long timeGetTime() {
    struct timeval now;
    gettimeofday(&now, NULL);
    // Convert to unsigned *before* scaling: wrap-around is then well-defined
    // modular arithmetic instead of signed overflow of time_t.
    unsigned long millis = (unsigned long)now.tv_sec * 1000UL;
    millis += (unsigned long)now.tv_usec / 1000UL;
    return millis;
}
91ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#define MAKELCID(a,b) 0
92ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#endif
93ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
94ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
95ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//
96ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//  Command line option variables
97ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//     These global variables are set according to the options specified
98ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//     on the command line by the user.
99ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruchar * opt_fName      = 0;
100ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruchar * opt_locale     = "en_US";
101ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint    opt_langid     = 0;         // Defaults to value corresponding to opt_locale.
102ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruchar * opt_rules      = 0;
103ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUBool  opt_help       = FALSE;
104ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint    opt_time       = 0;
105ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint    opt_loopCount  = 0;
106ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint    opt_passesCount= 1;
107ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUBool  opt_terse      = FALSE;
108ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUBool  opt_icu        = TRUE;
109ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUBool  opt_win        = FALSE;      // Run with Windows native functions.
110ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUBool  opt_unix       = FALSE;      // Run with UNIX strcoll, strxfrm functions.
111ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUBool  opt_mac        = FALSE;      // Run with MacOSX word break services.
112ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUBool  opt_uselen     = FALSE;
113ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUBool  opt_dump       = FALSE;
114ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUBool  opt_char       = FALSE;
115ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUBool  opt_word       = FALSE;
116ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUBool  opt_line       = FALSE;
117ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUBool  opt_sentence   = FALSE;
118ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUBool  opt_capi       = FALSE;
119ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
120ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUBool  opt_next       = FALSE;
121ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUBool  opt_isBound    = FALSE;
122ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
123ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
124ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
125ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//
126ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//   Definitions for the command line options
127ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//
// One row of the command line option table: the option's spelling, the kind
// of value it takes, and the address of the variable that receives it.
struct OptSpec {
    // FLAG options take no value; NUM and STRING options take their value
    // from the argument that follows them on the command line.
    enum Type {FLAG, NUM, STRING};

    const char *name;   // option text as typed (e.g. "-file"); 0 marks the end of a table
    Type        type;   // how the option's value is parsed
    void       *pVar;   // destination: UBool for FLAG, int for NUM, const char * for STRING
};
133ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
// Table of accepted command line options: option text, kind of value, and the
// address of the global variable that receives the value.  Presumably this is
// the table handed to ProcessOptions() (the call is not in this part of the
// file).  The final entry, whose name is 0, terminates the table.
OptSpec opts[] = {
    {"-file",        OptSpec::STRING, &opt_fName},
    {"-locale",      OptSpec::STRING, &opt_locale},
    {"-langid",      OptSpec::NUM,    &opt_langid},
    {"-win",         OptSpec::FLAG,   &opt_win},
    {"-unix",        OptSpec::FLAG,   &opt_unix},
    {"-mac",         OptSpec::FLAG,   &opt_mac},
    {"-uselen",      OptSpec::FLAG,   &opt_uselen},
    {"-loop",        OptSpec::NUM,    &opt_loopCount},
    {"-time",        OptSpec::NUM,    &opt_time},
    {"-passes",      OptSpec::NUM,    &opt_passesCount},
    {"-char",        OptSpec::FLAG,   &opt_char},
    {"-word",        OptSpec::FLAG,   &opt_word},
    {"-line",        OptSpec::FLAG,   &opt_line},
    {"-sentence",    OptSpec::FLAG,   &opt_sentence},
    {"-terse",       OptSpec::FLAG,   &opt_terse},
    {"-dump",        OptSpec::FLAG,   &opt_dump},
    {"-capi",        OptSpec::FLAG,   &opt_capi},
    {"-next",        OptSpec::FLAG,   &opt_next},
    {"-isBound",     OptSpec::FLAG,   &opt_isBound},
    {"-help",        OptSpec::FLAG,   &opt_help},
    // "-?" is an alias: it sets the same variable as "-help".
    {"-?",           OptSpec::FLAG,   &opt_help},
    // End-of-table marker.
    {0, OptSpec::FLAG, 0}
};
158ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
159ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
160ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//---------------------------------------------------------------------------
161ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//
162ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//  Global variables pointing to and describing the test file
163ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//
164ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//---------------------------------------------------------------------------
165ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
//DWORD          gWinLCID;
// ICU break iterator under test; assigned by createICUBrkIt().
BreakIterator *brkit = NULL;
// Buffer of UChars holding the text the break iterators are run over.
UChar *text = NULL;
// Length that accompanies text wherever it is handed to an iterator.
int32_t textSize = 0;
170ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
171ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
172ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
173103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius#if U_PLATFORM_IS_DARWIN_BASED
174ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include <ApplicationServices/ApplicationServices.h>
// Union of the cluster, word and line break masks; passed to
// UCCreateTextBreakLocator() in createMACBrkIt().
enum{
  kUCTextBreakAllMask = (kUCTextBreakClusterMask | kUCTextBreakWordMask | kUCTextBreakLineMask)
    };
// The individual break type masks.
// NOTE(review): not referenced in the visible part of this file.
UCTextBreakType breakTypes[4] = {kUCTextBreakCharMask, kUCTextBreakClusterMask, kUCTextBreakWordMask, kUCTextBreakLineMask};
// Text break locator created by createMACBrkIt().
TextBreakLocatorRef breakRef;
// Break type chosen by createMACBrkIt() from the -char/-word/-line options.
UCTextBreakType macBreakType;
181ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
182ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruvoid createMACBrkIt() {
183ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  OSStatus status = noErr;
184ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  LocaleRef lref;
185ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  status = LocaleRefFromLocaleString(opt_locale, &lref);
186ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  status = UCCreateTextBreakLocator(lref, 0, kUCTextBreakAllMask, (TextBreakLocatorRef*)&breakRef);
187ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  if(opt_char == TRUE) {
188ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    macBreakType = kUCTextBreakClusterMask;
189ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  } else if(opt_word == TRUE) {
190ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    macBreakType = kUCTextBreakWordMask;
191ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  } else if(opt_line == TRUE) {
192ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    macBreakType = kUCTextBreakLineMask;
193ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  } else if(opt_sentence == TRUE) {
194ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // error
195ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // brkit = BreakIterator::createSentenceInstance(opt_locale, status);
196ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  } else {
197ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // default is character iterator
198ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    macBreakType = kUCTextBreakClusterMask;
199ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru      }
200ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
201ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#endif
202ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
203ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruvoid createICUBrkIt() {
204ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  //
205ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  //  Set up an ICU break iterator
206ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  //
207ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  UErrorCode          status = U_ZERO_ERROR;
208ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  if(opt_char == TRUE) {
209ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    brkit = BreakIterator::createCharacterInstance(opt_locale, status);
210ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  } else if(opt_word == TRUE) {
211ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    brkit = BreakIterator::createWordInstance(opt_locale, status);
212ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  } else if(opt_line == TRUE) {
213ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    brkit = BreakIterator::createLineInstance(opt_locale, status);
214ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  } else if(opt_sentence == TRUE) {
215ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    brkit = BreakIterator::createSentenceInstance(opt_locale, status);
216ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  } else {
217ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // default is character iterator
218ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    brkit = BreakIterator::createCharacterInstance(opt_locale, status);
219ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  }
220ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  if (status==U_USING_DEFAULT_WARNING && opt_terse==FALSE) {
221ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fprintf(stderr, "Warning, U_USING_DEFAULT_WARNING for %s\n", opt_locale);
222ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  }
223ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  if (status==U_USING_FALLBACK_WARNING && opt_terse==FALSE) {
224ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fprintf(stderr, "Warning, U_USING_FALLBACK_ERROR for %s\n", opt_locale);
225ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  }
226ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
227ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
228ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
229ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//---------------------------------------------------------------------------
230ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//
231ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//  ProcessOptions()    Function to read the command line options.
232ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//
233ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//---------------------------------------------------------------------------
234ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUBool ProcessOptions(int argc, const char **argv, OptSpec opts[])
235ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
236ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int         i;
237ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int         argNum;
238ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    const char  *pArgName;
239ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    OptSpec    *pOpt;
240ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
241ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    for (argNum=1; argNum<argc; argNum++) {
242ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        pArgName = argv[argNum];
243ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        for (pOpt = opts;  pOpt->name != 0; pOpt++) {
244ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if (strcmp(pOpt->name, pArgName) == 0) {
245ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                switch (pOpt->type) {
246ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                case OptSpec::FLAG:
247ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    *(UBool *)(pOpt->pVar) = TRUE;
248ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    break;
249ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                case OptSpec::STRING:
250ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    argNum ++;
251ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if (argNum >= argc) {
252ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name);
253ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        return FALSE;
254ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
255ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    *(const char **)(pOpt->pVar)  = argv[argNum];
256ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    break;
257ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                case OptSpec::NUM:
258ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    argNum ++;
259ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if (argNum >= argc) {
260ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name);
261ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        return FALSE;
262ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
263ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    char *endp;
264ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    i = strtol(argv[argNum], &endp, 0);
265ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if (endp == argv[argNum]) {
266ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        fprintf(stderr, "integer value expected for \"%s\" option.\n", pOpt->name);
267ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        return FALSE;
268ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
269ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    *(int *)(pOpt->pVar) = i;
270ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
271ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                break;
272ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
273ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
274ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if (pOpt->name == 0)
275ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        {
276ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            fprintf(stderr, "Unrecognized option \"%s\"\n", pArgName);
277ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            return FALSE;
278ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
279ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
280ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querureturn TRUE;
281ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
282ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
283ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
284ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruvoid doForwardTest() {
285ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  if (opt_terse == FALSE) {
286ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    printf("Doing the forward test\n");
287ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  }
288ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  int32_t noBreaks = 0;
289ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  int32_t i = 0;
290ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  unsigned long startTime = timeGetTime();
291ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  unsigned long elapsedTime = 0;
292ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  if(opt_icu) {
293ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    createICUBrkIt();
294ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    brkit->setText(UnicodeString(text, textSize));
295ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    brkit->first();
296ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (opt_terse == FALSE) {
297ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru      printf("Warmup\n");
298ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
299ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int j;
300ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    while((j = brkit->next()) != BreakIterator::DONE) {
301ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru      noBreaks++;
302ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru      //fprintf(stderr, "%d ", j);
303ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
304ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
305ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (opt_terse == FALSE) {
306ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru      printf("Measure\n");
307ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
308ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    startTime = timeGetTime();
309ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    for(i = 0; i < opt_loopCount; i++) {
310ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru      brkit->first();
311ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru      while(brkit->next() != BreakIterator::DONE) {
312ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru      }
313ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
314ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
315ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    elapsedTime = timeGetTime()-startTime;
316ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  } else if(opt_mac) {
317103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius#if U_PLATFORM_IS_DARWIN_BASED
318ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    createMACBrkIt();
319ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UniChar* filePtr = text;
320ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    OSStatus status = noErr;
321ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UniCharCount startOffset = 0, breakOffset = 0, numUniChars = textSize;
322ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    startOffset = 0;
323ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    //printf("\t---Search forward--\n");
324ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
325ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    while (startOffset < numUniChars)
326ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    {
327ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru	status = UCFindTextBreak(breakRef, macBreakType, kUCTextBreakLeadingEdgeMask, filePtr, numUniChars,
328ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                               startOffset, &breakOffset);
329ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru      //require_action(status == noErr, EXIT, printf( "**UCFindTextBreak failed: startOffset %d, status %d\n", (int)startOffset, (int)status));
330ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru      //require_action((breakOffset <= numUniChars),EXIT, printf("**UCFindTextBreak breakOffset too big: startOffset %d, breakOffset %d\n", (int)startOffset, (int)breakOffset));
331ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
332ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru      // Output break
333ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru      //printf("\t%d\n", (int)breakOffset);
334ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
335ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru      // Increment counters
336ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru	noBreaks++;
337ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru      startOffset = breakOffset;
338ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
339ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    startTime = timeGetTime();
340ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    for(i = 0; i < opt_loopCount; i++) {
341ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru      startOffset = 0;
342ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
343ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru      while (startOffset < numUniChars)
344ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru	{
345ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru	  status = UCFindTextBreak(breakRef, macBreakType, kUCTextBreakLeadingEdgeMask, filePtr, numUniChars,
346ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru				   startOffset, &breakOffset);
347ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru	  // Increment counters
348ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru	  startOffset = breakOffset;
349ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru	}
350ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
351ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    elapsedTime = timeGetTime()-startTime;
352ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UCDisposeTextBreakLocator(&breakRef);
353ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#endif
354ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
355ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
356ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  }
357ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
358ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
359ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  if (opt_terse == FALSE) {
360ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  int32_t loopTime = (int)(float(1000) * ((float)elapsedTime/(float)opt_loopCount));
361ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru      int32_t timePerCU = (int)(float(1000) * ((float)loopTime/(float)textSize));
362ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru      int32_t timePerBreak = (int)(float(1000) * ((float)loopTime/(float)noBreaks));
363ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru      printf("forward break iteration average loop time %d\n", loopTime);
364ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru      printf("number of code units %d average time per code unit %d\n", textSize, timePerCU);
365ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru      printf("number of breaks %d average time per break %d\n", noBreaks, timePerBreak);
366ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  } else {
367ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru      printf("time=%d\nevents=%d\nsize=%d\n", elapsedTime, noBreaks, textSize);
368ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  }
369ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
370ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
371ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
372ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
373ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruvoid doIsBoundTest() {
374ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  int32_t noBreaks = 0, hit = 0;
375ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  int32_t i = 0, j = 0;
376ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  unsigned long startTime = timeGetTime();
377ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  unsigned long elapsedTime = 0;
378ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  createICUBrkIt();
379ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  brkit->setText(UnicodeString(text, textSize));
380ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  brkit->first();
381ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  for(j = 0; j < textSize; j++) {
382ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(brkit->isBoundary(j)) {
383ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru      noBreaks++;
384ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru      //fprintf(stderr, "%d ", j);
385ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
386ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  }
387ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  /*
388ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  while(brkit->next() != BreakIterator::DONE) {
389ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    noBreaks++;
390ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  }
391ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  */
392ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
393ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  startTime = timeGetTime();
394ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  for(i = 0; i < opt_loopCount; i++) {
395ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    for(j = 0; j < textSize; j++) {
396ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru      if(brkit->isBoundary(j)) {
397ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        hit++;
398ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru      }
399ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
400ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  }
401ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
402ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  elapsedTime = timeGetTime()-startTime;
403ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  int32_t loopTime = (int)(float(1000) * ((float)elapsedTime/(float)opt_loopCount));
404ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  if (opt_terse == FALSE) {
405ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru      int32_t timePerCU = (int)(float(1000) * ((float)loopTime/(float)textSize));
406ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru      int32_t timePerBreak = (int)(float(1000) * ((float)loopTime/(float)noBreaks));
407ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru      printf("forward break iteration average loop time %d\n", loopTime);
408ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru      printf("number of code units %d average time per code unit %d\n", textSize, timePerCU);
409ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru      printf("number of breaks %d average time per break %d\n", noBreaks, timePerBreak);
410ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  } else {
411ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru      printf("time=%d\nevents=%d\nsize=%d\n", elapsedTime, noBreaks, textSize);
412ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  }
413ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
414ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
//----------------------------------------------------------------------------------------
//
//   UnixConvert   -- Convert the lines of the file to the encoding for UNIX.
//                    Since it appears that Unicode support is going in the general
//                    direction of the use of UTF-8 locales, that is the approach
//                    that is used here.
//
//                    NOTE: the whole body is currently compiled out with "#if 0",
//                    so calling this function does nothing.  The disabled code
//                    refers to gFileLines / gNumFileLines, which this part of the
//                    file does not define.
//
//----------------------------------------------------------------------------------------
void  UnixConvert() {
#if 0
    int    line;

    UConverter   *cvrtr;    // An ICU code page converter.
    UErrorCode    status = U_ZERO_ERROR;


    cvrtr = ucnv_open("utf-8", &status);    // we are just doing UTF-8 locales for now.
    if (U_FAILURE(status)) {
        // Print the error code itself, not the address of the variable.
        fprintf(stderr, "ICU Converter open failed.: %d\n", status);
        exit(-1);
    }
    // redo for unix
    for (line=0; line < gNumFileLines; line++) {
        // First call, with no target buffer, only measures the converted length.
        int sizeNeeded = ucnv_fromUChars(cvrtr,
                                         0,            // ptr to target buffer.
                                         0,            // length of target buffer.
                                         gFileLines[line].name,
                                         -1,           //  source is null terminated
                                         &status);
        if (status != U_BUFFER_OVERFLOW_ERROR && status != U_ZERO_ERROR) {
            fprintf(stderr, "Conversion from Unicode, something is wrong.\n");
            exit(-1);
        }
        status = U_ZERO_ERROR;
        // Second call does the real conversion into a buffer of the measured size.
        gFileLines[line].unixName = new char[sizeNeeded+1];
        sizeNeeded = ucnv_fromUChars(cvrtr,
                                         gFileLines[line].unixName, // ptr to target buffer.
                                         sizeNeeded+1, // length of target buffer.
                                         gFileLines[line].name,
                                         -1,           //  source is null terminated
                                         &status);
        if (U_FAILURE(status)) {
            fprintf(stderr, "ICU Conversion Failed.: %d\n", status);
            exit(-1);
        }
        gFileLines[line].unixName[sizeNeeded] = 0;
    }
    ucnv_close(cvrtr);
#endif
}
465ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
466ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
467ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//----------------------------------------------------------------------------------------
468ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//
469ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//  class UCharFile   Class to hide all the gorp to read a file in
470ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//                    and produce a stream of UChars.
471ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//
472ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//----------------------------------------------------------------------------------------
473ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruclass UCharFile {
474ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querupublic:
475ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UCharFile(const char *fileName);
476ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    ~UCharFile();
477ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UChar   get();
478ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UBool   eof() {return fEof;};
479ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UBool   error() {return fError;};
480ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t size() { return fFileSize; };
481ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
482ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruprivate:
483ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UCharFile (const UCharFile &other) {};                         // No copy constructor.
484ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UCharFile & operator = (const UCharFile &other) {return *this;};   // No assignment op
485ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
486ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    FILE         *fFile;
487ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    const char   *fName;
488ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UBool        fEof;
489ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UBool        fError;
490ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UChar        fPending2ndSurrogate;
491ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t      fFileSize;
492ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
493ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    enum {UTF16LE, UTF16BE, UTF8} fEncoding;
494ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru};
495ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
496ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUCharFile::UCharFile(const char * fileName) {
497ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fEof                 = FALSE;
498ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fError               = FALSE;
499ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fName                = fileName;
500ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    struct stat buf;
501ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t result = stat(fileName, &buf);
502ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(result != 0) {
503ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru      fprintf(stderr, "Error getting info\n");
504ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru      fFileSize = -1;
505ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    } else {
506ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru      fFileSize = buf.st_size;
507ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
508ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fFile                = fopen(fName, "rb");
509ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fPending2ndSurrogate = 0;
510ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (fFile == NULL) {
511ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        fprintf(stderr, "Can not open file \"%s\"\n", opt_fName);
512ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        fError = TRUE;
513ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return;
514ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
515ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    //
516ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    //  Look for the byte order mark at the start of the file.
517ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    //
518ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int BOMC1, BOMC2, BOMC3;
519ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    BOMC1 = fgetc(fFile);
520ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    BOMC2 = fgetc(fFile);
521ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
522ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (BOMC1 == 0xff && BOMC2 == 0xfe) {
523ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        fEncoding = UTF16LE; }
524ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    else if (BOMC1 == 0xfe && BOMC2 == 0xff) {
525ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        fEncoding = UTF16BE; }
526ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    else if (BOMC1 == 0xEF && BOMC2 == 0xBB && (BOMC3 = fgetc(fFile)) == 0xBF ) {
527ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        fEncoding = UTF8; }
528ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    else
529ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    {
530ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        fprintf(stderr, "collperf:  file \"%s\" encoding must be UTF-8 or UTF-16, and "
531ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            "must include a BOM.\n", fileName);
532ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        fError = true;
533ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return;
534ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
535ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
536ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
537ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
538ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUCharFile::~UCharFile() {
539ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fclose(fFile);
540ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
541ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
542ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
543ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
544ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUChar UCharFile::get() {
545ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UChar   c;
546ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    switch (fEncoding) {
547ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    case UTF16LE:
548ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        {
549ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            int  cL, cH;
550ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            cL = fgetc(fFile);
551ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            cH = fgetc(fFile);
552ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            c  = cL  | (cH << 8);
553ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if (cH == EOF) {
554ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                c   = 0;
555ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                fEof = TRUE;
556ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
557ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            break;
558ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
559ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    case UTF16BE:
560ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        {
561ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            int  cL, cH;
562ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            cH = fgetc(fFile);
563ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            cL = fgetc(fFile);
564ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            c  = cL  | (cH << 8);
565ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if (cL == EOF) {
566ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                c   = 0;
567ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                fEof = TRUE;
568ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
569ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            break;
570ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
571ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    case UTF8:
572ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        {
573ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if (fPending2ndSurrogate != 0) {
574ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                c = fPending2ndSurrogate;
575ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                fPending2ndSurrogate = 0;
576ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                break;
577ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
578ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
579ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            int ch = fgetc(fFile);   // Note:  c and ch are separate cause eof test doesn't work on UChar type.
580ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if (ch == EOF) {
581ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                c = 0;
582ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                fEof = TRUE;
583ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                break;
584ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
585ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
586ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if (ch <= 0x7f) {
587ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // It's ascii.  No further utf-8 conversion.
588ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                c = ch;
589ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                break;
590ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
591ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
592ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // Figure out the lenght of the char and read the rest of the bytes
593ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            //   into a temp array.
594ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            int nBytes;
595ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if (ch >= 0xF0) {nBytes=4;}
596ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            else if (ch >= 0xE0) {nBytes=3;}
597ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            else if (ch >= 0xC0) {nBytes=2;}
598ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            else {
599ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                fprintf(stderr, "not likely utf-8 encoded file %s contains corrupt data at offset %d.\n", fName, ftell(fFile));
600ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                fError = TRUE;
601ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                return 0;
602ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
603ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
604ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            unsigned char  bytes[10];
605ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            bytes[0] = (unsigned char)ch;
606ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            int i;
607ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            for (i=1; i<nBytes; i++) {
608ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                bytes[i] = fgetc(fFile);
609ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if (bytes[i] < 0x80 || bytes[i] >= 0xc0) {
610ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    fprintf(stderr, "utf-8 encoded file %s contains corrupt data at offset %d. Expected %d bytes, byte %d is invalid. First byte is %02X\n", fName, ftell(fFile), nBytes, i, ch);
611ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    fError = TRUE;
612ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    return 0;
613ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
614ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
615ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
616ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // Convert the bytes from the temp array to a Unicode char.
617ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            i = 0;
618ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            uint32_t  cp;
61954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius            U8_NEXT_UNSAFE(bytes, i, cp);
620ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            c = (UChar)cp;
621ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
622ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if (cp >= 0x10000) {
623ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // The code point needs to be broken up into a utf-16 surrogate pair.
624ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                //  Process first half this time through the main loop, and
625ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                //   remember the other half for the next time through.
626ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                UChar utf16Buf[3];
627ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                i = 0;
628ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                UTF16_APPEND_CHAR_UNSAFE(utf16Buf, i, cp);
629ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                fPending2ndSurrogate = utf16Buf[1];
630ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                c = utf16Buf[0];
631ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
632ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            break;
633ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        };
634ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
635ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return c;
636ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
637ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
638ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
639ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//----------------------------------------------------------------------------------------
640ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//
641ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//    Main   --  process command line, read in and pre-process the test file,
642ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//                 call other functions to do the actual tests.
643ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//
644ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//----------------------------------------------------------------------------------------
645ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint main(int argc, const char** argv) {
646ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (ProcessOptions(argc, argv, opts) != TRUE || opt_help || opt_fName == 0) {
647ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        printf(gUsageString);
648ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        exit (1);
649ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
650ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // Make sure that we've only got one API selected.
651ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (opt_mac || opt_unix || opt_win) opt_icu = FALSE;
652ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (opt_mac || opt_unix) opt_win = FALSE;
653ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (opt_mac) opt_unix = FALSE;
654ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
655ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UErrorCode          status = U_ZERO_ERROR;
656ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
657ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
658ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
659ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    //
660ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    //  Set up a Windows LCID
661ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    //
662ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  /*
663ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (opt_langid != 0) {
664ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        gWinLCID = MAKELCID(opt_langid, SORT_DEFAULT);
665ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
666ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    else {
667ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        gWinLCID = uloc_getLCID(opt_locale);
668ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
669ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  */
670ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
671ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    //
672ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    //  Set the UNIX locale
673ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    //
674ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (opt_unix) {
675ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if (setlocale(LC_ALL, opt_locale) == 0) {
676ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            fprintf(stderr, "setlocale(LC_ALL, %s) failed.\n", opt_locale);
677ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            exit(-1);
678ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
679ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
680ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
681ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // Read in  the input file.
682ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    //   File assumed to be utf-16.
683ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    //   Lines go onto heap buffers.  Global index array to line starts is created.
684ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    //   Lines themselves are null terminated.
685ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    //
686ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
687ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UCharFile f(opt_fName);
688ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (f.error()) {
689ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        exit(-1);
690ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
691ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t fileSize = f.size();
692ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    const int STARTSIZE = 70000;
693ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t bufSize = 0;
694ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t charCount = 0;
695ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(fileSize != -1) {
696ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru      text = (UChar *)malloc(fileSize*sizeof(UChar));
697ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru      bufSize = fileSize;
698ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    } else {
699ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru      text = (UChar *)malloc(STARTSIZE*sizeof(UChar));
700ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru      bufSize = STARTSIZE;
701ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
702ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(text == NULL) {
703ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru      fprintf(stderr, "Allocating buffer failed\n");
704ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru      exit(-1);
705ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
706ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
707ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
708ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    //  Read the file, split into lines, and save in memory.
709ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    //  Loop runs once per utf-16 value from the input file,
710ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    //    (The number of bytes read from file per loop iteration depends on external encoding.)
711ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    for (;;) {
712ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
713ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        UChar c = f.get();
714ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(f.eof()) {
715ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru          break;
716ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
717ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if (f.error()){
718ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru          exit(-1);
719ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
720ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // We now have a good UTF-16 value in c.
721ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        text[charCount++] = c;
722ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(charCount == bufSize) {
723ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru          text = (UChar *)realloc(text, 2*bufSize*sizeof(UChar));
724ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru          if(text == NULL) {
725ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            fprintf(stderr, "Reallocating buffer failed\n");
726ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            exit(-1);
727ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru          }
728ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru          bufSize *= 2;
729ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
730ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
731ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
732ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
733ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (opt_terse == FALSE) {
734ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        printf("file \"%s\", %d charCount code units.\n", opt_fName, charCount);
735ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
736ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
737ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    textSize = charCount;
738ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
739ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
740ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
741ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
742ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    //
743ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    //  Dump file contents if requested.
744ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    //
745ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (opt_dump) {
746ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru      // dump file, etc... possibly
747ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
748ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
749ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
750ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    //
751ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    //  We've got the file read into memory.  Go do something with it.
752ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    //
753ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t i = 0;
754ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    for(i = 0; i < opt_passesCount; i++) {
755ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru      if(opt_loopCount != 0) {
756ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(opt_next) {
757ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru          doForwardTest();
758ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        } else if(opt_isBound) {
759ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru          doIsBoundTest();
760ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        } else {
761ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru          doForwardTest();
762ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
763ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru      } else if(opt_time != 0) {
764ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
765ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru      }
766ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
767ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
768ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  if(text != NULL) {
769ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    free(text);
770ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  }
771ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(brkit != NULL) {
772ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru      delete brkit;
773ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
774ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
775ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return 0;
776ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
777