ubrkperfold.cpp revision 51cfa1a9a96cad34675a6415fe86dfdf3f525bb6
1/********************************************************************
2 * COPYRIGHT:
3 * Copyright (C) 2001-2005 IBM, Inc.   All Rights Reserved.
4 *
5 ********************************************************************/
6/********************************************************************************
7*
8* File ubrkperf.cpp
9*
10* Modification History:
11*        Name                     Description
12*     Vladimir Weinstein          First Version, based on collperf
13*
14*********************************************************************************
15*/
16
17//
18//  This program tests break iterator performance
19//      Currently we test only ICU APIs with the future possibility of testing *nix & win32 APIs
20//      (if any)
21//      A text file is required as input.  It must be in utf-8 or utf-16 format,
22//      and include a byte order mark.  Either LE or BE format is OK.
23//
24
// Usage text printed by main() when -help / -? is given, when no -file is
// supplied, or when the command line cannot be parsed.
// Every option listed here must have a matching entry in the opts[] table.
const char gUsageString[] =
    "usage:  ubrkperf options...\n"
    "-help                      Display this message.\n"
    "-file file_name            utf-16/utf-8 format file.\n"
    "-locale name               ICU locale to use.  Default is en_US\n"
    "-langid 0x1234             Windows Language ID number.  Default to value for -locale option\n"
    "                              see http://msdn.microsoft.com/library/psdk/winbase/nls_8xo3.htm\n"
    "-win                       Run test using Windows native services. (currently not working) (ICU is default)\n"
    "-unix                      Run test using Unix word breaking services. (currently not working) \n"
    "-mac                       Run test using MacOSX word breaking services.\n"
    "-uselen                    Use API with string lengths.  Default is null-terminated strings\n"
    "-char                      Use character break iterator\n"
    "-word                      Use word break iterator\n"
    "-line                      Use line break iterator\n"
    "-sentence                  Use sentence break iterator\n"
    "-loop nnnn                 Loopcount for test.  Adjust for reasonable total running time.\n"
    "-passes n                  Number of times the selected test is repeated.  Default = 1.\n"
    "-time nnnn                 Accepted, but currently ignored.\n"
    "-terse                     Terse numbers-only output.  Intended for use by scripts.\n"
    "-dump                      Display stuff.\n"
    "-capi                      Use C APIs instead of C++ APIs (currently not working)\n"
    "-next                      Do the next test\n"
    "-isBound                   Do the isBound test\n"
    ;
49
50
51#include <stdio.h>
52#include <string.h>
53#include <stdlib.h>
54#include <math.h>
55#include <locale.h>
56#include <errno.h>
57#include <sys/stat.h>
58
59#include <unicode/utypes.h>
60#include <unicode/ucol.h>
61#include <unicode/ucoleitr.h>
62#include <unicode/uloc.h>
63#include <unicode/ustring.h>
64#include <unicode/ures.h>
65#include <unicode/uchar.h>
66#include <unicode/ucnv.h>
67#include <unicode/utf8.h>
68
69#include <unicode/brkiter.h>
70
71
72#ifdef U_WINDOWS
73#include <windows.h>
74#else
75//
76//  Stubs for Windows API functions when building on UNIXes.
77//
78#include <sys/time.h>
// Stand-in for the Win32 timeGetTime(): returns the wall-clock time in
// milliseconds.  Only the difference between two return values is meaningful;
// the value is allowed to wrap around when it no longer fits in unsigned long.
unsigned long timeGetTime() {
    struct timeval now;
    gettimeofday(&now, 0);
    // Convert to unsigned before scaling so that wrap-around is well defined
    // (a signed tv_sec * 1000 overflow would be undefined behaviour).
    unsigned long millis = (unsigned long)now.tv_sec * 1000UL;
    millis += (unsigned long)now.tv_usec / 1000UL;
    return millis;
}
86#define MAKELCID(a,b) 0
87#endif
88
89
90//
91//  Command line option variables
92//     These global variables are set according to the options specified
93//     on the command line by the user.
94char * opt_fName      = 0;
95char * opt_locale     = "en_US";
96int    opt_langid     = 0;         // Defaults to value corresponding to opt_locale.
97char * opt_rules      = 0;
98UBool  opt_help       = FALSE;
99int    opt_time       = 0;
100int    opt_loopCount  = 0;
101int    opt_passesCount= 1;
102UBool  opt_terse      = FALSE;
103UBool  opt_icu        = TRUE;
104UBool  opt_win        = FALSE;      // Run with Windows native functions.
105UBool  opt_unix       = FALSE;      // Run with UNIX strcoll, strxfrm functions.
106UBool  opt_mac        = FALSE;      // Run with MacOSX word break services.
107UBool  opt_uselen     = FALSE;
108UBool  opt_dump       = FALSE;
109UBool  opt_char       = FALSE;
110UBool  opt_word       = FALSE;
111UBool  opt_line       = FALSE;
112UBool  opt_sentence   = FALSE;
113UBool  opt_capi       = FALSE;
114
115UBool  opt_next       = FALSE;
116UBool  opt_isBound    = FALSE;
117
118
119
120//
121//   Definitions for the command line options
122//
// One row of the command line option table consumed by ProcessOptions().
struct OptSpec {
    // How an option is parsed:
    //   FLAG   - takes no argument; the target UBool is set to TRUE.
    //   NUM    - followed by an integer that is stored into the target int.
    //   STRING - followed by a string whose address is stored into the target pointer.
    enum Kind { FLAG, NUM, STRING };

    const char *name;   // Option text exactly as typed, e.g. "-file"; 0 terminates the table.
    Kind        type;   // Selects how pVar is interpreted.
    void       *pVar;   // Address of the variable that receives the option's value.
};
128
129OptSpec opts[] = {
130    {"-file",        OptSpec::STRING, &opt_fName},
131    {"-locale",      OptSpec::STRING, &opt_locale},
132    {"-langid",      OptSpec::NUM,    &opt_langid},
133    {"-win",         OptSpec::FLAG,   &opt_win},
134    {"-unix",        OptSpec::FLAG,   &opt_unix},
135    {"-mac",         OptSpec::FLAG,   &opt_mac},
136    {"-uselen",      OptSpec::FLAG,   &opt_uselen},
137    {"-loop",        OptSpec::NUM,    &opt_loopCount},
138    {"-time",        OptSpec::NUM,    &opt_time},
139    {"-passes",      OptSpec::NUM,    &opt_passesCount},
140    {"-char",        OptSpec::FLAG,   &opt_char},
141    {"-word",        OptSpec::FLAG,   &opt_word},
142    {"-line",        OptSpec::FLAG,   &opt_line},
143    {"-sentence",    OptSpec::FLAG,   &opt_sentence},
144    {"-terse",       OptSpec::FLAG,   &opt_terse},
145    {"-dump",        OptSpec::FLAG,   &opt_dump},
146    {"-capi",        OptSpec::FLAG,   &opt_capi},
147    {"-next",        OptSpec::FLAG,   &opt_next},
148    {"-isBound",     OptSpec::FLAG,   &opt_isBound},
149    {"-help",        OptSpec::FLAG,   &opt_help},
150    {"-?",           OptSpec::FLAG,   &opt_help},
151    {0, OptSpec::FLAG, 0}
152};
153
154
155//---------------------------------------------------------------------------
156//
157//  Global variables pointing to and describing the test file
158//
159//---------------------------------------------------------------------------
160
161//DWORD          gWinLCID;
162BreakIterator *brkit = NULL;
163UChar *text = NULL;
164int32_t textSize = 0;
165
166
167
168#ifdef U_DARWIN
169#include <ApplicationServices/ApplicationServices.h>
170enum{
171  kUCTextBreakAllMask = (kUCTextBreakClusterMask | kUCTextBreakWordMask | kUCTextBreakLineMask)
172    };
173UCTextBreakType breakTypes[4] = {kUCTextBreakCharMask, kUCTextBreakClusterMask, kUCTextBreakWordMask, kUCTextBreakLineMask};
174TextBreakLocatorRef breakRef;
175UCTextBreakType macBreakType;
176
177void createMACBrkIt() {
178  OSStatus status = noErr;
179  LocaleRef lref;
180  status = LocaleRefFromLocaleString(opt_locale, &lref);
181  status = UCCreateTextBreakLocator(lref, 0, kUCTextBreakAllMask, (TextBreakLocatorRef*)&breakRef);
182  if(opt_char == TRUE) {
183    macBreakType = kUCTextBreakClusterMask;
184  } else if(opt_word == TRUE) {
185    macBreakType = kUCTextBreakWordMask;
186  } else if(opt_line == TRUE) {
187    macBreakType = kUCTextBreakLineMask;
188  } else if(opt_sentence == TRUE) {
189    // error
190    // brkit = BreakIterator::createSentenceInstance(opt_locale, status);
191  } else {
192    // default is character iterator
193    macBreakType = kUCTextBreakClusterMask;
194      }
195}
196#endif
197
198void createICUBrkIt() {
199  //
200  //  Set up an ICU break iterator
201  //
202  UErrorCode          status = U_ZERO_ERROR;
203  if(opt_char == TRUE) {
204    brkit = BreakIterator::createCharacterInstance(opt_locale, status);
205  } else if(opt_word == TRUE) {
206    brkit = BreakIterator::createWordInstance(opt_locale, status);
207  } else if(opt_line == TRUE) {
208    brkit = BreakIterator::createLineInstance(opt_locale, status);
209  } else if(opt_sentence == TRUE) {
210    brkit = BreakIterator::createSentenceInstance(opt_locale, status);
211  } else {
212    // default is character iterator
213    brkit = BreakIterator::createCharacterInstance(opt_locale, status);
214  }
215  if (status==U_USING_DEFAULT_WARNING && opt_terse==FALSE) {
216    fprintf(stderr, "Warning, U_USING_DEFAULT_WARNING for %s\n", opt_locale);
217  }
218  if (status==U_USING_FALLBACK_WARNING && opt_terse==FALSE) {
219    fprintf(stderr, "Warning, U_USING_FALLBACK_ERROR for %s\n", opt_locale);
220  }
221
222}
223
224//---------------------------------------------------------------------------
225//
226//  ProcessOptions()    Function to read the command line options.
227//
228//---------------------------------------------------------------------------
229UBool ProcessOptions(int argc, const char **argv, OptSpec opts[])
230{
231    int         i;
232    int         argNum;
233    const char  *pArgName;
234    OptSpec    *pOpt;
235
236    for (argNum=1; argNum<argc; argNum++) {
237        pArgName = argv[argNum];
238        for (pOpt = opts;  pOpt->name != 0; pOpt++) {
239            if (strcmp(pOpt->name, pArgName) == 0) {
240                switch (pOpt->type) {
241                case OptSpec::FLAG:
242                    *(UBool *)(pOpt->pVar) = TRUE;
243                    break;
244                case OptSpec::STRING:
245                    argNum ++;
246                    if (argNum >= argc) {
247                        fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name);
248                        return FALSE;
249                    }
250                    *(const char **)(pOpt->pVar)  = argv[argNum];
251                    break;
252                case OptSpec::NUM:
253                    argNum ++;
254                    if (argNum >= argc) {
255                        fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name);
256                        return FALSE;
257                    }
258                    char *endp;
259                    i = strtol(argv[argNum], &endp, 0);
260                    if (endp == argv[argNum]) {
261                        fprintf(stderr, "integer value expected for \"%s\" option.\n", pOpt->name);
262                        return FALSE;
263                    }
264                    *(int *)(pOpt->pVar) = i;
265                }
266                break;
267            }
268        }
269        if (pOpt->name == 0)
270        {
271            fprintf(stderr, "Unrecognized option \"%s\"\n", pArgName);
272            return FALSE;
273        }
274    }
275return TRUE;
276}
277
278
279void doForwardTest() {
280  if (opt_terse == FALSE) {
281    printf("Doing the forward test\n");
282  }
283  int32_t noBreaks = 0;
284  int32_t i = 0;
285  unsigned long startTime = timeGetTime();
286  unsigned long elapsedTime = 0;
287  if(opt_icu) {
288    createICUBrkIt();
289    brkit->setText(UnicodeString(text, textSize));
290    brkit->first();
291    if (opt_terse == FALSE) {
292      printf("Warmup\n");
293    }
294    int j;
295    while((j = brkit->next()) != BreakIterator::DONE) {
296      noBreaks++;
297      //fprintf(stderr, "%d ", j);
298    }
299
300    if (opt_terse == FALSE) {
301      printf("Measure\n");
302    }
303    startTime = timeGetTime();
304    for(i = 0; i < opt_loopCount; i++) {
305      brkit->first();
306      while(brkit->next() != BreakIterator::DONE) {
307      }
308    }
309
310    elapsedTime = timeGetTime()-startTime;
311  } else if(opt_mac) {
312#ifdef U_DARWIN
313    createMACBrkIt();
314    UniChar* filePtr = text;
315    OSStatus status = noErr;
316    UniCharCount startOffset = 0, breakOffset = 0, numUniChars = textSize;
317    startOffset = 0;
318    //printf("\t---Search forward--\n");
319
320    while (startOffset < numUniChars)
321    {
322	status = UCFindTextBreak(breakRef, macBreakType, kUCTextBreakLeadingEdgeMask, filePtr, numUniChars,
323                               startOffset, &breakOffset);
324      //require_action(status == noErr, EXIT, printf( "**UCFindTextBreak failed: startOffset %d, status %d\n", (int)startOffset, (int)status));
325      //require_action((breakOffset <= numUniChars),EXIT, printf("**UCFindTextBreak breakOffset too big: startOffset %d, breakOffset %d\n", (int)startOffset, (int)breakOffset));
326
327      // Output break
328      //printf("\t%d\n", (int)breakOffset);
329
330      // Increment counters
331	noBreaks++;
332      startOffset = breakOffset;
333    }
334    startTime = timeGetTime();
335    for(i = 0; i < opt_loopCount; i++) {
336      startOffset = 0;
337
338      while (startOffset < numUniChars)
339	{
340	  status = UCFindTextBreak(breakRef, macBreakType, kUCTextBreakLeadingEdgeMask, filePtr, numUniChars,
341				   startOffset, &breakOffset);
342	  // Increment counters
343	  startOffset = breakOffset;
344	}
345    }
346    elapsedTime = timeGetTime()-startTime;
347    UCDisposeTextBreakLocator(&breakRef);
348#endif
349
350
351  }
352
353
354  if (opt_terse == FALSE) {
355  int32_t loopTime = (int)(float(1000) * ((float)elapsedTime/(float)opt_loopCount));
356      int32_t timePerCU = (int)(float(1000) * ((float)loopTime/(float)textSize));
357      int32_t timePerBreak = (int)(float(1000) * ((float)loopTime/(float)noBreaks));
358      printf("forward break iteration average loop time %d\n", loopTime);
359      printf("number of code units %d average time per code unit %d\n", textSize, timePerCU);
360      printf("number of breaks %d average time per break %d\n", noBreaks, timePerBreak);
361  } else {
362      printf("time=%d\nevents=%d\nsize=%d\n", elapsedTime, noBreaks, textSize);
363  }
364
365
366}
367
368void doIsBoundTest() {
369  int32_t noBreaks = 0, hit = 0;
370  int32_t i = 0, j = 0;
371  unsigned long startTime = timeGetTime();
372  unsigned long elapsedTime = 0;
373  createICUBrkIt();
374  brkit->setText(UnicodeString(text, textSize));
375  brkit->first();
376  for(j = 0; j < textSize; j++) {
377    if(brkit->isBoundary(j)) {
378      noBreaks++;
379      //fprintf(stderr, "%d ", j);
380    }
381  }
382  /*
383  while(brkit->next() != BreakIterator::DONE) {
384    noBreaks++;
385  }
386  */
387
388  startTime = timeGetTime();
389  for(i = 0; i < opt_loopCount; i++) {
390    for(j = 0; j < textSize; j++) {
391      if(brkit->isBoundary(j)) {
392        hit++;
393      }
394    }
395  }
396
397  elapsedTime = timeGetTime()-startTime;
398  int32_t loopTime = (int)(float(1000) * ((float)elapsedTime/(float)opt_loopCount));
399  if (opt_terse == FALSE) {
400      int32_t timePerCU = (int)(float(1000) * ((float)loopTime/(float)textSize));
401      int32_t timePerBreak = (int)(float(1000) * ((float)loopTime/(float)noBreaks));
402      printf("forward break iteration average loop time %d\n", loopTime);
403      printf("number of code units %d average time per code unit %d\n", textSize, timePerCU);
404      printf("number of breaks %d average time per break %d\n", noBreaks, timePerBreak);
405  } else {
406      printf("time=%d\nevents=%d\nsize=%d\n", elapsedTime, noBreaks, textSize);
407  }
408}
409
410//----------------------------------------------------------------------------------------
411//
412//   UnixConvert   -- Convert the lines of the file to the encoding for UNIX
413//                    Since it appears that Unicode support is going in the general
414//                    direction of the use of UTF-8 locales, that is the approach
415//                    that is used here.
416//
417//----------------------------------------------------------------------------------------
// Currently a no-op: the whole body is compiled out with #if 0.
// The disabled code converts each entry of gFileLines to UTF-8 with an ICU
// converter; gFileLines and gNumFileLines are not declared in this file.
void  UnixConvert() {
#if 0
    UErrorCode    status = U_ZERO_ERROR;

    // we are just doing UTF-8 locales for now.
    UConverter   *cvrtr = ucnv_open("utf-8", &status);
    if (U_FAILURE(status)) {
        // NOTE(review): this passes &status where %d expects the value.
        fprintf(stderr, "ICU Converter open failed.: %d\n", &status);
        exit(-1);
    }

    // redo for unix
    int    line;
    for (line=0; line < gNumFileLines; line++) {
        // First call: no target buffer, only asks for the size needed.
        int sizeNeeded = ucnv_fromUChars(cvrtr, 0, 0, gFileLines[line].name, -1, &status);
        if (status != U_BUFFER_OVERFLOW_ERROR && status != U_ZERO_ERROR) {
            fprintf(stderr, "Conversion from Unicode, something is wrong.\n");
            exit(-1);
        }
        status = U_ZERO_ERROR;

        // Second call: convert the null-terminated source into the new buffer.
        gFileLines[line].unixName = new char[sizeNeeded+1];
        sizeNeeded = ucnv_fromUChars(cvrtr,
                                     gFileLines[line].unixName,
                                     sizeNeeded+1,
                                     gFileLines[line].name,
                                     -1,
                                     &status);
        if (U_FAILURE(status)) {
            fprintf(stderr, "ICU Conversion Failed.: %d\n", status);
            exit(-1);
        }
        gFileLines[line].unixName[sizeNeeded] = 0;
    }
    ucnv_close(cvrtr);
#endif
}
460
461
462//----------------------------------------------------------------------------------------
463//
464//  class UCharFile   Class to hide all the gorp to read a file in
465//                    and produce a stream of UChars.
466//
467//----------------------------------------------------------------------------------------
468class UCharFile {
469public:
470    UCharFile(const char *fileName);
471    ~UCharFile();
472    UChar   get();
473    UBool   eof() {return fEof;};
474    UBool   error() {return fError;};
475    int32_t size() { return fFileSize; };
476
477private:
478    UCharFile (const UCharFile &other) {};                         // No copy constructor.
479    UCharFile & operator = (const UCharFile &other) {return *this;};   // No assignment op
480
481    FILE         *fFile;
482    const char   *fName;
483    UBool        fEof;
484    UBool        fError;
485    UChar        fPending2ndSurrogate;
486    int32_t      fFileSize;
487
488    enum {UTF16LE, UTF16BE, UTF8} fEncoding;
489};
490
491UCharFile::UCharFile(const char * fileName) {
492    fEof                 = FALSE;
493    fError               = FALSE;
494    fName                = fileName;
495    struct stat buf;
496    int32_t result = stat(fileName, &buf);
497    if(result != 0) {
498      fprintf(stderr, "Error getting info\n");
499      fFileSize = -1;
500    } else {
501      fFileSize = buf.st_size;
502    }
503    fFile                = fopen(fName, "rb");
504    fPending2ndSurrogate = 0;
505    if (fFile == NULL) {
506        fprintf(stderr, "Can not open file \"%s\"\n", opt_fName);
507        fError = TRUE;
508        return;
509    }
510    //
511    //  Look for the byte order mark at the start of the file.
512    //
513    int BOMC1, BOMC2, BOMC3;
514    BOMC1 = fgetc(fFile);
515    BOMC2 = fgetc(fFile);
516
517    if (BOMC1 == 0xff && BOMC2 == 0xfe) {
518        fEncoding = UTF16LE; }
519    else if (BOMC1 == 0xfe && BOMC2 == 0xff) {
520        fEncoding = UTF16BE; }
521    else if (BOMC1 == 0xEF && BOMC2 == 0xBB && (BOMC3 = fgetc(fFile)) == 0xBF ) {
522        fEncoding = UTF8; }
523    else
524    {
525        fprintf(stderr, "collperf:  file \"%s\" encoding must be UTF-8 or UTF-16, and "
526            "must include a BOM.\n", fileName);
527        fError = true;
528        return;
529    }
530}
531
532
533UCharFile::~UCharFile() {
534    fclose(fFile);
535}
536
537
538
539UChar UCharFile::get() {
540    UChar   c;
541    switch (fEncoding) {
542    case UTF16LE:
543        {
544            int  cL, cH;
545            cL = fgetc(fFile);
546            cH = fgetc(fFile);
547            c  = cL  | (cH << 8);
548            if (cH == EOF) {
549                c   = 0;
550                fEof = TRUE;
551            }
552            break;
553        }
554    case UTF16BE:
555        {
556            int  cL, cH;
557            cH = fgetc(fFile);
558            cL = fgetc(fFile);
559            c  = cL  | (cH << 8);
560            if (cL == EOF) {
561                c   = 0;
562                fEof = TRUE;
563            }
564            break;
565        }
566    case UTF8:
567        {
568            if (fPending2ndSurrogate != 0) {
569                c = fPending2ndSurrogate;
570                fPending2ndSurrogate = 0;
571                break;
572            }
573
574            int ch = fgetc(fFile);   // Note:  c and ch are separate cause eof test doesn't work on UChar type.
575            if (ch == EOF) {
576                c = 0;
577                fEof = TRUE;
578                break;
579            }
580
581            if (ch <= 0x7f) {
582                // It's ascii.  No further utf-8 conversion.
583                c = ch;
584                break;
585            }
586
587            // Figure out the lenght of the char and read the rest of the bytes
588            //   into a temp array.
589            int nBytes;
590            if (ch >= 0xF0) {nBytes=4;}
591            else if (ch >= 0xE0) {nBytes=3;}
592            else if (ch >= 0xC0) {nBytes=2;}
593            else {
594                fprintf(stderr, "not likely utf-8 encoded file %s contains corrupt data at offset %d.\n", fName, ftell(fFile));
595                fError = TRUE;
596                return 0;
597            }
598
599            unsigned char  bytes[10];
600            bytes[0] = (unsigned char)ch;
601            int i;
602            for (i=1; i<nBytes; i++) {
603                bytes[i] = fgetc(fFile);
604                if (bytes[i] < 0x80 || bytes[i] >= 0xc0) {
605                    fprintf(stderr, "utf-8 encoded file %s contains corrupt data at offset %d. Expected %d bytes, byte %d is invalid. First byte is %02X\n", fName, ftell(fFile), nBytes, i, ch);
606                    fError = TRUE;
607                    return 0;
608                }
609            }
610
611            // Convert the bytes from the temp array to a Unicode char.
612            i = 0;
613            uint32_t  cp;
614            UTF8_NEXT_CHAR_UNSAFE(bytes, i, cp);
615            c = (UChar)cp;
616
617            if (cp >= 0x10000) {
618                // The code point needs to be broken up into a utf-16 surrogate pair.
619                //  Process first half this time through the main loop, and
620                //   remember the other half for the next time through.
621                UChar utf16Buf[3];
622                i = 0;
623                UTF16_APPEND_CHAR_UNSAFE(utf16Buf, i, cp);
624                fPending2ndSurrogate = utf16Buf[1];
625                c = utf16Buf[0];
626            }
627            break;
628        };
629    }
630    return c;
631}
632
633
634//----------------------------------------------------------------------------------------
635//
636//    Main   --  process command line, read in and pre-process the test file,
637//                 call other functions to do the actual tests.
638//
639//----------------------------------------------------------------------------------------
640int main(int argc, const char** argv) {
641    if (ProcessOptions(argc, argv, opts) != TRUE || opt_help || opt_fName == 0) {
642        printf(gUsageString);
643        exit (1);
644    }
645    // Make sure that we've only got one API selected.
646    if (opt_mac || opt_unix || opt_win) opt_icu = FALSE;
647    if (opt_mac || opt_unix) opt_win = FALSE;
648    if (opt_mac) opt_unix = FALSE;
649
650    UErrorCode          status = U_ZERO_ERROR;
651
652
653
654    //
655    //  Set up a Windows LCID
656    //
657  /*
658    if (opt_langid != 0) {
659        gWinLCID = MAKELCID(opt_langid, SORT_DEFAULT);
660    }
661    else {
662        gWinLCID = uloc_getLCID(opt_locale);
663    }
664  */
665
666    //
667    //  Set the UNIX locale
668    //
669    if (opt_unix) {
670        if (setlocale(LC_ALL, opt_locale) == 0) {
671            fprintf(stderr, "setlocale(LC_ALL, %s) failed.\n", opt_locale);
672            exit(-1);
673        }
674    }
675
676    // Read in  the input file.
677    //   File assumed to be utf-16.
678    //   Lines go onto heap buffers.  Global index array to line starts is created.
679    //   Lines themselves are null terminated.
680    //
681
682    UCharFile f(opt_fName);
683    if (f.error()) {
684        exit(-1);
685    }
686    int32_t fileSize = f.size();
687    const int STARTSIZE = 70000;
688    int32_t bufSize = 0;
689    int32_t charCount = 0;
690    if(fileSize != -1) {
691      text = (UChar *)malloc(fileSize*sizeof(UChar));
692      bufSize = fileSize;
693    } else {
694      text = (UChar *)malloc(STARTSIZE*sizeof(UChar));
695      bufSize = STARTSIZE;
696    }
697    if(text == NULL) {
698      fprintf(stderr, "Allocating buffer failed\n");
699      exit(-1);
700    }
701
702
703    //  Read the file, split into lines, and save in memory.
704    //  Loop runs once per utf-16 value from the input file,
705    //    (The number of bytes read from file per loop iteration depends on external encoding.)
706    for (;;) {
707
708        UChar c = f.get();
709        if(f.eof()) {
710          break;
711        }
712        if (f.error()){
713          exit(-1);
714        }
715        // We now have a good UTF-16 value in c.
716        text[charCount++] = c;
717        if(charCount == bufSize) {
718          text = (UChar *)realloc(text, 2*bufSize*sizeof(UChar));
719          if(text == NULL) {
720            fprintf(stderr, "Reallocating buffer failed\n");
721            exit(-1);
722          }
723          bufSize *= 2;
724        }
725    }
726
727
728    if (opt_terse == FALSE) {
729        printf("file \"%s\", %d charCount code units.\n", opt_fName, charCount);
730    }
731
732    textSize = charCount;
733
734
735
736
737    //
738    //  Dump file contents if requested.
739    //
740    if (opt_dump) {
741      // dump file, etc... possibly
742    }
743
744
745    //
746    //  We've got the file read into memory.  Go do something with it.
747    //
748    int32_t i = 0;
749    for(i = 0; i < opt_passesCount; i++) {
750      if(opt_loopCount != 0) {
751        if(opt_next) {
752          doForwardTest();
753        } else if(opt_isBound) {
754          doIsBoundTest();
755        } else {
756          doForwardTest();
757        }
758      } else if(opt_time != 0) {
759
760      }
761    }
762
763  if(text != NULL) {
764    free(text);
765  }
766    if(brkit != NULL) {
767      delete brkit;
768    }
769
770    return 0;
771}
772