164339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert/*********************************************************************** 20596faeddefbf198de137d5e893708495ab1584cFredrik Roubert * © 2016 and later: Unicode, Inc. and others. 364339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert * License & terms of use: http://www.unicode.org/copyright.html#License 464339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert * 564339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert *********************************************************************** 664339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert *********************************************************************** 7ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * COPYRIGHT: 854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * Copyright (C) 2001-2012 IBM, Inc. All Rights Reserved. 9ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 1064339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert ***********************************************************************/ 11ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/******************************************************************************** 12ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* 13ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* File ubrkperf.cpp 14ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* 15ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* Modification History: 16ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* Name Description 17ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* Vladimir Weinstein First Version, based on collperf 18ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* 19ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru********************************************************************************* 20ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*/ 21ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 22ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// 23ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// This program tests break iterator performance 24ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Currently we test only ICU APIs with the future possibility of testing *nix & win32 APIs 25ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// (if any) 26ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// A text file is required as input. It must be in utf-8 or utf-16 format, 27ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// and include a byte order mark. Either LE or BE format is OK. 28ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// 29ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 30ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruconst char gUsageString[] = 31ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru "usage: ubrkperf options...\n" 32ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru "-help Display this message.\n" 33ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru "-file file_name utf-16/utf-8 format file.\n" 34ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru "-locale name ICU locale to use. Default is en_US\n" 35ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru "-langid 0x1234 Windows Language ID number. Default to value for -locale option\n" 36ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru " see http://msdn.microsoft.com/library/psdk/winbase/nls_8xo3.htm\n" 37ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru "-win Run test using Windows native services. (currently not working) (ICU is default)\n" 38ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru "-unix Run test using Unix word breaking services. (currently not working) \n" 39ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru "-mac Run test using MacOSX word breaking services.\n" 40ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru "-uselen Use API with string lengths. Default is null-terminated strings\n" 41ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru "-char Use character break iterator\n" 42ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru "-word Use word break iterator\n" 43ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru "-line Use line break iterator\n" 44ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru "-sentence Use sentence break iterator\n" 45ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru "-loop nnnn Loopcount for test. Adjust for reasonable total running time.\n" 46ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru "-iloop n Inner Loop Count. Default = 1. Number of calls to function\n" 47ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru " under test at each call point. For measuring test overhead.\n" 48ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru "-terse Terse numbers-only output. Intended for use by scripts.\n" 49ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru "-dump Display stuff.\n" 50ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru "-capi Use C APIs instead of C++ APIs (currently not working)\n" 51ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru "-next Do the next test\n" 52ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru "-isBound Do the isBound test\n" 53ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ; 54ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 55ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 56ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include <stdio.h> 57ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include <string.h> 58ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include <stdlib.h> 59ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include <math.h> 60ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include <locale.h> 61ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include <errno.h> 62ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include <sys/stat.h> 63ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 64ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include <unicode/utypes.h> 65ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include <unicode/ucol.h> 66ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include <unicode/ucoleitr.h> 67ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include <unicode/uloc.h> 68ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include <unicode/ustring.h> 69ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include <unicode/ures.h> 70ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include <unicode/uchar.h> 71ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include <unicode/ucnv.h> 72ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include <unicode/utf8.h> 73ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 74ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include <unicode/brkiter.h> 75ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 76ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 77103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius#if U_PLATFORM_HAS_WIN32_API 78ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include <windows.h> 79ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#else 80ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// 81ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Stubs for Windows API functions when building on UNIXes. 82ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// 83ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include <sys/time.h> 84ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruunsigned long timeGetTime() { 85ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru struct timeval t; 86ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru gettimeofday(&t, 0); 87ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru unsigned long val = t.tv_sec * 1000; // Let it overflow. Who cares. 88ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru val += t.tv_usec / 1000; 89ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return val; 90ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}; 91ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#define MAKELCID(a,b) 0 92ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#endif 93ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 94ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 95ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// 96ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Command line option variables 97ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// These global variables are set according to the options specified 98ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// on the command line by the user. 99ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruchar * opt_fName = 0; 100ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruchar * opt_locale = "en_US"; 101ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint opt_langid = 0; // Defaults to value corresponding to opt_locale. 102ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruchar * opt_rules = 0; 103ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUBool opt_help = FALSE; 104ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint opt_time = 0; 105ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint opt_loopCount = 0; 106ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint opt_passesCount= 1; 107ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUBool opt_terse = FALSE; 108ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUBool opt_icu = TRUE; 109ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUBool opt_win = FALSE; // Run with Windows native functions. 110ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUBool opt_unix = FALSE; // Run with UNIX strcoll, strxfrm functions. 111ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUBool opt_mac = FALSE; // Run with MacOSX word break services. 112ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUBool opt_uselen = FALSE; 113ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUBool opt_dump = FALSE; 114ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUBool opt_char = FALSE; 115ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUBool opt_word = FALSE; 116ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUBool opt_line = FALSE; 117ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUBool opt_sentence = FALSE; 118ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUBool opt_capi = FALSE; 119ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 120ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUBool opt_next = FALSE; 121ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUBool opt_isBound = FALSE; 122ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 123ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 124ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 125ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// 126ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Definitions for the command line options 127ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// 128ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustruct OptSpec { 129ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const char *name; 130ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru enum {FLAG, NUM, STRING} type; 131ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru void *pVar; 132ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}; 133ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 134ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruOptSpec opts[] = { 135ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru {"-file", OptSpec::STRING, &opt_fName}, 136ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru {"-locale", OptSpec::STRING, &opt_locale}, 137ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru {"-langid", OptSpec::NUM, &opt_langid}, 138ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru {"-win", OptSpec::FLAG, &opt_win}, 139ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru {"-unix", OptSpec::FLAG, &opt_unix}, 140ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru {"-mac", OptSpec::FLAG, &opt_mac}, 141ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru {"-uselen", OptSpec::FLAG, &opt_uselen}, 142ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru {"-loop", OptSpec::NUM, &opt_loopCount}, 143ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru {"-time", OptSpec::NUM, &opt_time}, 144ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru {"-passes", OptSpec::NUM, &opt_passesCount}, 145ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru {"-char", OptSpec::FLAG, &opt_char}, 146ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru {"-word", OptSpec::FLAG, &opt_word}, 147ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru {"-line", OptSpec::FLAG, &opt_line}, 148ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru {"-sentence", OptSpec::FLAG, &opt_sentence}, 149ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru {"-terse", OptSpec::FLAG, &opt_terse}, 150ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru {"-dump", OptSpec::FLAG, &opt_dump}, 151ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru {"-capi", OptSpec::FLAG, &opt_capi}, 152ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru {"-next", OptSpec::FLAG, &opt_next}, 153ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru {"-isBound", OptSpec::FLAG, &opt_isBound}, 154ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru {"-help", OptSpec::FLAG, &opt_help}, 155ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru {"-?", OptSpec::FLAG, &opt_help}, 156ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru {0, OptSpec::FLAG, 0} 157ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}; 158ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 159ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 160ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//--------------------------------------------------------------------------- 161ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// 162ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Global variables pointing to and describing the test file 163ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// 164ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//--------------------------------------------------------------------------- 165ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 166ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//DWORD gWinLCID; 167ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruBreakIterator *brkit = NULL; 168ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUChar *text = NULL; 169ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint32_t textSize = 0; 170ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 171ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 172ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 173103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius#if U_PLATFORM_IS_DARWIN_BASED 174ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include <ApplicationServices/ApplicationServices.h> 175ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruenum{ 176ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru kUCTextBreakAllMask = (kUCTextBreakClusterMask | kUCTextBreakWordMask | kUCTextBreakLineMask) 177ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru }; 178ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUCTextBreakType breakTypes[4] = {kUCTextBreakCharMask, kUCTextBreakClusterMask, kUCTextBreakWordMask, kUCTextBreakLineMask}; 179ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruTextBreakLocatorRef breakRef; 180ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUCTextBreakType macBreakType; 181ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 182ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruvoid createMACBrkIt() { 183ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru OSStatus status = noErr; 184ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru LocaleRef lref; 185ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru status = LocaleRefFromLocaleString(opt_locale, &lref); 186ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru status = UCCreateTextBreakLocator(lref, 0, kUCTextBreakAllMask, (TextBreakLocatorRef*)&breakRef); 187ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(opt_char == TRUE) { 188ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru macBreakType = kUCTextBreakClusterMask; 189ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else if(opt_word == TRUE) { 190ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru macBreakType = kUCTextBreakWordMask; 191ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else if(opt_line == TRUE) { 192ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru macBreakType = kUCTextBreakLineMask; 193ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else if(opt_sentence == TRUE) { 194ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // error 195ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // brkit = BreakIterator::createSentenceInstance(opt_locale, status); 196ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 197ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // default is character iterator 198ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru macBreakType = kUCTextBreakClusterMask; 199ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 200ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 201ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#endif 202ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 203ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruvoid createICUBrkIt() { 204ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // 205ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Set up an ICU break iterator 206ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // 207ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 208ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(opt_char == TRUE) { 209ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru brkit = BreakIterator::createCharacterInstance(opt_locale, status); 210ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else if(opt_word == TRUE) { 211ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru brkit = BreakIterator::createWordInstance(opt_locale, status); 212ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else if(opt_line == TRUE) { 213ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru brkit = BreakIterator::createLineInstance(opt_locale, status); 214ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else if(opt_sentence == TRUE) { 215ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru brkit = BreakIterator::createSentenceInstance(opt_locale, status); 216ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 217ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // default is character iterator 218ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru brkit = BreakIterator::createCharacterInstance(opt_locale, status); 219ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 220ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (status==U_USING_DEFAULT_WARNING && opt_terse==FALSE) { 221ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fprintf(stderr, "Warning, U_USING_DEFAULT_WARNING for %s\n", opt_locale); 222ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 223ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (status==U_USING_FALLBACK_WARNING && opt_terse==FALSE) { 224ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fprintf(stderr, "Warning, U_USING_FALLBACK_ERROR for %s\n", opt_locale); 225ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 226ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 227ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 228ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 229ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//--------------------------------------------------------------------------- 230ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// 231ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// ProcessOptions() Function to read the command line options. 232ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// 233ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//--------------------------------------------------------------------------- 234ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUBool ProcessOptions(int argc, const char **argv, OptSpec opts[]) 235ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 236ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int i; 237ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int argNum; 238ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const char *pArgName; 239ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru OptSpec *pOpt; 240ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 241ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for (argNum=1; argNum<argc; argNum++) { 242ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru pArgName = argv[argNum]; 243ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for (pOpt = opts; pOpt->name != 0; pOpt++) { 244ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (strcmp(pOpt->name, pArgName) == 0) { 245ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru switch (pOpt->type) { 246ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru case OptSpec::FLAG: 247ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *(UBool *)(pOpt->pVar) = TRUE; 248ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 249ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru case OptSpec::STRING: 250ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru argNum ++; 251ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (argNum >= argc) { 252ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name); 253ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return FALSE; 254ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 255ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *(const char **)(pOpt->pVar) = argv[argNum]; 256ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 257ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru case OptSpec::NUM: 258ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru argNum ++; 259ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (argNum >= argc) { 260ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name); 261ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return FALSE; 262ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 263ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru char *endp; 264ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru i = strtol(argv[argNum], &endp, 0); 265ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (endp == argv[argNum]) { 266ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fprintf(stderr, "integer value expected for \"%s\" option.\n", pOpt->name); 267ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return FALSE; 268ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 269ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *(int *)(pOpt->pVar) = i; 270ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 271ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 272ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 273ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 274ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (pOpt->name == 0) 275ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru { 276ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fprintf(stderr, "Unrecognized option \"%s\"\n", pArgName); 277ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return FALSE; 278ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 279ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 280ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querureturn TRUE; 281ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 282ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 283ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 284ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruvoid doForwardTest() { 285ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (opt_terse == FALSE) { 286ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru printf("Doing the forward test\n"); 287ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 288ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t noBreaks = 0; 289ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t i = 0; 290ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru unsigned long startTime = timeGetTime(); 291ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru unsigned long elapsedTime = 0; 292ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(opt_icu) { 293ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru createICUBrkIt(); 294ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru brkit->setText(UnicodeString(text, textSize)); 295ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru brkit->first(); 296ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (opt_terse == FALSE) { 297ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru printf("Warmup\n"); 298ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 299ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int j; 300ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru while((j = brkit->next()) != BreakIterator::DONE) { 301ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru noBreaks++; 302ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru //fprintf(stderr, "%d ", j); 303ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 304ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 305ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (opt_terse == FALSE) { 306ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru printf("Measure\n"); 307ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 308ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru startTime = timeGetTime(); 309ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(i = 0; i < opt_loopCount; i++) { 310ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru brkit->first(); 311ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru while(brkit->next() != BreakIterator::DONE) { 312ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 313ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 314ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 315ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru elapsedTime = timeGetTime()-startTime; 316ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else if(opt_mac) { 317103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius#if U_PLATFORM_IS_DARWIN_BASED 318ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru createMACBrkIt(); 319ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UniChar* filePtr = text; 320ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru OSStatus status = noErr; 321ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UniCharCount startOffset = 0, breakOffset = 0, numUniChars = textSize; 322ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru startOffset = 0; 323ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru //printf("\t---Search forward--\n"); 324ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 325ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru while (startOffset < numUniChars) 326ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru { 327ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru status = UCFindTextBreak(breakRef, macBreakType, kUCTextBreakLeadingEdgeMask, filePtr, numUniChars, 328ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru startOffset, &breakOffset); 329ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru //require_action(status == noErr, EXIT, printf( "**UCFindTextBreak failed: startOffset %d, status %d\n", (int)startOffset, (int)status)); 330ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru //require_action((breakOffset <= numUniChars),EXIT, printf("**UCFindTextBreak breakOffset too big: startOffset %d, breakOffset %d\n", (int)startOffset, (int)breakOffset)); 331ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 332ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Output break 333ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru //printf("\t%d\n", (int)breakOffset); 334ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 335ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Increment counters 336ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru noBreaks++; 337ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru startOffset = breakOffset; 338ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 339ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru startTime = timeGetTime(); 340ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(i = 0; i < opt_loopCount; i++) { 341ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru startOffset = 0; 342ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 343ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru while (startOffset < numUniChars) 344ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru { 345ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru status = UCFindTextBreak(breakRef, macBreakType, kUCTextBreakLeadingEdgeMask, filePtr, numUniChars, 346ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru startOffset, &breakOffset); 347ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Increment counters 348ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru startOffset = breakOffset; 349ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 350ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 351ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru elapsedTime = timeGetTime()-startTime; 352ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UCDisposeTextBreakLocator(&breakRef); 353ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#endif 354ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 355ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 356ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 357ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 358ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 359ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (opt_terse == FALSE) { 360ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t loopTime = (int)(float(1000) * ((float)elapsedTime/(float)opt_loopCount)); 361ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t timePerCU = (int)(float(1000) * ((float)loopTime/(float)textSize)); 362ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t timePerBreak = (int)(float(1000) * ((float)loopTime/(float)noBreaks)); 363ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru printf("forward break iteration average loop time %d\n", loopTime); 364ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru printf("number of code units %d average time per code unit %d\n", textSize, timePerCU); 365ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru printf("number of breaks %d average time per break %d\n", noBreaks, timePerBreak); 366ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 367ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru printf("time=%d\nevents=%d\nsize=%d\n", elapsedTime, noBreaks, textSize); 368ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 369ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 370ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 371ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 372ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 373ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruvoid doIsBoundTest() { 374ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t noBreaks = 0, hit = 0; 375ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t i = 0, j = 0; 376ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru unsigned long startTime = timeGetTime(); 377ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru unsigned long elapsedTime = 0; 378ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru createICUBrkIt(); 379ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru brkit->setText(UnicodeString(text, textSize)); 380ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru brkit->first(); 381ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(j = 0; j < textSize; j++) { 382ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(brkit->isBoundary(j)) { 383ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru noBreaks++; 384ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru //fprintf(stderr, "%d ", j); 385ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 386ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 387ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /* 388ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru while(brkit->next() != BreakIterator::DONE) { 389ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru noBreaks++; 390ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 391ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 392ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 393ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru startTime = timeGetTime(); 394ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(i = 0; i < opt_loopCount; i++) { 395ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(j = 0; j < textSize; j++) { 396ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(brkit->isBoundary(j)) { 397ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru hit++; 398ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 399ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 400ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 401ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 402ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru elapsedTime = timeGetTime()-startTime; 403ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t loopTime = (int)(float(1000) * ((float)elapsedTime/(float)opt_loopCount)); 404ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (opt_terse == FALSE) { 405ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t timePerCU = (int)(float(1000) * ((float)loopTime/(float)textSize)); 406ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t timePerBreak = (int)(float(1000) * ((float)loopTime/(float)noBreaks)); 407ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru printf("forward break iteration average loop time %d\n", loopTime); 408ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru printf("number of code units %d average time per code unit %d\n", textSize, timePerCU); 409ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru printf("number of breaks %d average time per break %d\n", noBreaks, timePerBreak); 410ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 411ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru printf("time=%d\nevents=%d\nsize=%d\n", elapsedTime, noBreaks, textSize); 412ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 413ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 414ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 415ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//---------------------------------------------------------------------------------------- 416ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// 417ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// UnixConvert -- Convert the lines of the file to the encoding for UNIX 418ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Since it appears that Unicode support is going in the general 419ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// direction of the use of UTF-8 locales, that is the approach 420ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// that is used here. 421ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// 422ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//---------------------------------------------------------------------------------------- 423ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruvoid UnixConvert() { 424ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#if 0 425ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int line; 426ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 427ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UConverter *cvrtr; // An ICU code page converter. 428ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 429ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 430ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 431ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru cvrtr = ucnv_open("utf-8", &status); // we are just doing UTF-8 locales for now. 432ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (U_FAILURE(status)) { 433ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fprintf(stderr, "ICU Converter open failed.: %d\n", &status); 434ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru exit(-1); 435ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 436ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // redo for unix 437ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for (line=0; line < gNumFileLines; line++) { 438ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int sizeNeeded = ucnv_fromUChars(cvrtr, 439ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 0, // ptr to target buffer. 440ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 0, // length of target buffer. 441ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru gFileLines[line].name, 442ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru -1, // source is null terminated 443ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru &status); 444ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (status != U_BUFFER_OVERFLOW_ERROR && status != U_ZERO_ERROR) { 445ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fprintf(stderr, "Conversion from Unicode, something is wrong.\n"); 446ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru exit(-1); 447ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 448ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru status = U_ZERO_ERROR; 449ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru gFileLines[line].unixName = new char[sizeNeeded+1]; 450ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru sizeNeeded = ucnv_fromUChars(cvrtr, 451ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru gFileLines[line].unixName, // ptr to target buffer. 452ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru sizeNeeded+1, // length of target buffer. 453ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru gFileLines[line].name, 454ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru -1, // source is null terminated 455ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru &status); 456ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (U_FAILURE(status)) { 457ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fprintf(stderr, "ICU Conversion Failed.: %d\n", status); 458ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru exit(-1); 459ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 460ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru gFileLines[line].unixName[sizeNeeded] = 0; 461ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru }; 462ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ucnv_close(cvrtr); 463ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#endif 464ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 465ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 466ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 467ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//---------------------------------------------------------------------------------------- 468ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// 469ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// class UCharFile Class to hide all the gorp to read a file in 470ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// and produce a stream of UChars. 471ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// 472ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//---------------------------------------------------------------------------------------- 473ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruclass UCharFile { 474ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querupublic: 475ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UCharFile(const char *fileName); 476ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ~UCharFile(); 477ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UChar get(); 478ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UBool eof() {return fEof;}; 479ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UBool error() {return fError;}; 480ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t size() { return fFileSize; }; 481ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 482ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruprivate: 483ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UCharFile (const UCharFile &other) {}; // No copy constructor. 484ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UCharFile & operator = (const UCharFile &other) {return *this;}; // No assignment op 485ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 486ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru FILE *fFile; 487ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const char *fName; 488ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UBool fEof; 489ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UBool fError; 490ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UChar fPending2ndSurrogate; 491ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t fFileSize; 492ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 493ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru enum {UTF16LE, UTF16BE, UTF8} fEncoding; 494ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}; 495ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 496ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUCharFile::UCharFile(const char * fileName) { 497ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fEof = FALSE; 498ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fError = FALSE; 499ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fName = fileName; 500ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru struct stat buf; 501ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t result = stat(fileName, &buf); 502ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(result != 0) { 503ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fprintf(stderr, "Error getting info\n"); 504ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fFileSize = -1; 505ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 506ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fFileSize = buf.st_size; 507ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 508ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fFile = fopen(fName, "rb"); 509ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fPending2ndSurrogate = 0; 510ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (fFile == NULL) { 511ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fprintf(stderr, "Can not open file \"%s\"\n", opt_fName); 512ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fError = TRUE; 513ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return; 514ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 515ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // 516ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Look for the byte order mark at the start of the file. 517ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // 518ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int BOMC1, BOMC2, BOMC3; 519ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru BOMC1 = fgetc(fFile); 520ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru BOMC2 = fgetc(fFile); 521ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 522ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (BOMC1 == 0xff && BOMC2 == 0xfe) { 523ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fEncoding = UTF16LE; } 524ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru else if (BOMC1 == 0xfe && BOMC2 == 0xff) { 525ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fEncoding = UTF16BE; } 526ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru else if (BOMC1 == 0xEF && BOMC2 == 0xBB && (BOMC3 = fgetc(fFile)) == 0xBF ) { 527ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fEncoding = UTF8; } 528ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru else 529ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru { 530ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fprintf(stderr, "collperf: file \"%s\" encoding must be UTF-8 or UTF-16, and " 531ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru "must include a BOM.\n", fileName); 532ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fError = true; 533ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return; 534ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 535ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 536ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 537ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 538ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUCharFile::~UCharFile() { 539ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fclose(fFile); 540ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 541ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 542ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 543ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 544ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUChar UCharFile::get() { 545ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UChar c; 546ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru switch (fEncoding) { 547ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru case UTF16LE: 548ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru { 549ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int cL, cH; 550ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru cL = fgetc(fFile); 551ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru cH = fgetc(fFile); 552ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru c = cL | (cH << 8); 553ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (cH == EOF) { 554ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru c = 0; 555ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fEof = TRUE; 556ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 557ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 558ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 559ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru case UTF16BE: 560ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru { 561ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int cL, cH; 562ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru cH = fgetc(fFile); 563ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru cL = fgetc(fFile); 564ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru c = cL | (cH << 8); 565ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (cL == EOF) { 566ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru c = 0; 567ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fEof = TRUE; 568ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 569ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 570ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 571ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru case UTF8: 572ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru { 573ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (fPending2ndSurrogate != 0) { 574ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru c = fPending2ndSurrogate; 575ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fPending2ndSurrogate = 0; 576ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 577ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 578ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 579ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int ch = fgetc(fFile); // Note: c and ch are separate cause eof test doesn't work on UChar type. 580ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (ch == EOF) { 581ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru c = 0; 582ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fEof = TRUE; 583ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 584ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 585ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 586ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (ch <= 0x7f) { 587ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // It's ascii. No further utf-8 conversion. 588ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru c = ch; 589ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 590ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 591ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 592ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Figure out the lenght of the char and read the rest of the bytes 593ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // into a temp array. 594ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int nBytes; 595ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (ch >= 0xF0) {nBytes=4;} 596ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru else if (ch >= 0xE0) {nBytes=3;} 597ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru else if (ch >= 0xC0) {nBytes=2;} 598ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru else { 599ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fprintf(stderr, "not likely utf-8 encoded file %s contains corrupt data at offset %d.\n", fName, ftell(fFile)); 600ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fError = TRUE; 601ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return 0; 602ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 603ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 604ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru unsigned char bytes[10]; 605ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru bytes[0] = (unsigned char)ch; 606ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int i; 607ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for (i=1; i<nBytes; i++) { 608ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru bytes[i] = fgetc(fFile); 609ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (bytes[i] < 0x80 || bytes[i] >= 0xc0) { 610ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fprintf(stderr, "utf-8 encoded file %s contains corrupt data at offset %d. Expected %d bytes, byte %d is invalid. First byte is %02X\n", fName, ftell(fFile), nBytes, i, ch); 611ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fError = TRUE; 612ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return 0; 613ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 614ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 615ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 616ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Convert the bytes from the temp array to a Unicode char. 617ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru i = 0; 618ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uint32_t cp; 61954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius U8_NEXT_UNSAFE(bytes, i, cp); 620ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru c = (UChar)cp; 621ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 622ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (cp >= 0x10000) { 623ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // The code point needs to be broken up into a utf-16 surrogate pair. 624ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Process first half this time through the main loop, and 625ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // remember the other half for the next time through. 626ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UChar utf16Buf[3]; 627ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru i = 0; 628ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UTF16_APPEND_CHAR_UNSAFE(utf16Buf, i, cp); 629ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fPending2ndSurrogate = utf16Buf[1]; 630ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru c = utf16Buf[0]; 631ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 632ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 633ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru }; 634ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 635ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return c; 636ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 637ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 638ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 639ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//---------------------------------------------------------------------------------------- 640ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// 641ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Main -- process command line, read in and pre-process the test file, 642ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// call other functions to do the actual tests. 643ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// 644ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//---------------------------------------------------------------------------------------- 645ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint main(int argc, const char** argv) { 646ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (ProcessOptions(argc, argv, opts) != TRUE || opt_help || opt_fName == 0) { 647ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru printf(gUsageString); 648ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru exit (1); 649ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 650ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Make sure that we've only got one API selected. 651ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (opt_mac || opt_unix || opt_win) opt_icu = FALSE; 652ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (opt_mac || opt_unix) opt_win = FALSE; 653ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (opt_mac) opt_unix = FALSE; 654ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 655ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 656ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 657ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 658ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 659ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // 660ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Set up a Windows LCID 661ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // 662ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /* 663ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (opt_langid != 0) { 664ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru gWinLCID = MAKELCID(opt_langid, SORT_DEFAULT); 665ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 666ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru else { 667ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru gWinLCID = uloc_getLCID(opt_locale); 668ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 669ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 670ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 671ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // 672ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Set the UNIX locale 673ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // 674ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (opt_unix) { 675ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (setlocale(LC_ALL, opt_locale) == 0) { 676ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fprintf(stderr, "setlocale(LC_ALL, %s) failed.\n", opt_locale); 677ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru exit(-1); 678ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 679ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 680ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 681ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Read in the input file. 682ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // File assumed to be utf-16. 683ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Lines go onto heap buffers. Global index array to line starts is created. 684ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Lines themselves are null terminated. 685ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // 686ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 687ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UCharFile f(opt_fName); 688ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (f.error()) { 689ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru exit(-1); 690ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 691ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t fileSize = f.size(); 692ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const int STARTSIZE = 70000; 693ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t bufSize = 0; 694ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t charCount = 0; 695ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(fileSize != -1) { 696ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru text = (UChar *)malloc(fileSize*sizeof(UChar)); 697ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru bufSize = fileSize; 698ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 699ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru text = (UChar *)malloc(STARTSIZE*sizeof(UChar)); 700ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru bufSize = STARTSIZE; 701ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 702ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(text == NULL) { 703ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fprintf(stderr, "Allocating buffer failed\n"); 704ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru exit(-1); 705ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 706ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 707ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 708ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Read the file, split into lines, and save in memory. 709ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Loop runs once per utf-16 value from the input file, 710ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // (The number of bytes read from file per loop iteration depends on external encoding.) 711ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for (;;) { 712ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 713ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UChar c = f.get(); 714ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(f.eof()) { 715ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 716ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 717ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (f.error()){ 718ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru exit(-1); 719ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 720ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // We now have a good UTF-16 value in c. 721ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru text[charCount++] = c; 722ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(charCount == bufSize) { 723ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru text = (UChar *)realloc(text, 2*bufSize*sizeof(UChar)); 724ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(text == NULL) { 725ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fprintf(stderr, "Reallocating buffer failed\n"); 726ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru exit(-1); 727ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 728ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru bufSize *= 2; 729ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 730ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 731ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 732ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 733ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (opt_terse == FALSE) { 734ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru printf("file \"%s\", %d charCount code units.\n", opt_fName, charCount); 735ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 736ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 737ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru textSize = charCount; 738ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 739ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 740ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 741ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 742ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // 743ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Dump file contents if requested. 744ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // 745ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (opt_dump) { 746ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // dump file, etc... possibly 747ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 748ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 749ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 750ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // 751ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // We've got the file read into memory. Go do something with it. 752ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // 753ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t i = 0; 754ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(i = 0; i < opt_passesCount; i++) { 755ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(opt_loopCount != 0) { 756ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(opt_next) { 757ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru doForwardTest(); 758ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else if(opt_isBound) { 759ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru doIsBoundTest(); 760ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 761ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru doForwardTest(); 762ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 763ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else if(opt_time != 0) { 764ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 765ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 766ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 767ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 768ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(text != NULL) { 769ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru free(text); 770ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 771ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(brkit != NULL) { 772ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru delete brkit; 773ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 774ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 775ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return 0; 776ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 777