1b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho/* 2b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho ********************************************************************** 354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * Copyright (C) 2010-2012, International Business Machines 4b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * Corporation and others. All Rights Reserved. 5b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho ********************************************************************** 6b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * file name: dicttrieperf.cpp 7b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * encoding: US-ASCII 8b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * tab size: 8 (not used) 9b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * indentation:4 10b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * 11b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * created on: 2010dec09 12b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * created by: Markus W. Scherer 13b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * 14b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * Performance test program for dictionary-type tries. 15b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * 16b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * Usage from within <ICU build tree>/test/perf/dicttrieperf/ : 17b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * (Linux) 18b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * make 19b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * export LD_LIBRARY_PATH=../../../lib:../../../stubdata:../../../tools/ctestfw 20b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * ./dicttrieperf --sourcedir <ICU build tree>/data/out/tmp --passes 3 --iterations 1000 21b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * or 22b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * ./dicttrieperf -f <ICU source tree>/source/data/brkitr/thaidict.txt --passes 3 --iterations 250 23b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho */ 24b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 25b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#include <stdio.h> 26b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#include <stdlib.h> 27b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#include "unicode/bytestrie.h" 28b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#include "unicode/bytestriebuilder.h" 29b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#include "unicode/localpointer.h" 30b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#include "unicode/ucharstrie.h" 31b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#include "unicode/ucharstriebuilder.h" 32b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#include "unicode/uperf.h" 33b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#include "unicode/utext.h" 34b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#include "charstr.h" 35b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#include "package.h" 36b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#include "toolutil.h" 37b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#include "ucbuf.h" // struct ULine 38b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#include "uoptions.h" 39b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#include "uvectr32.h" 40b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 41b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 42b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 43b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// Test object. 44b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoclass DictionaryTriePerfTest : public UPerfTest { 45b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehopublic: 46b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho DictionaryTriePerfTest(int32_t argc, const char *argv[], UErrorCode &status) 47b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho : UPerfTest(argc, argv, NULL, 0, "", status), numTextLines(0) { 48b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(hasFile()) { 49b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho getLines(status); 50b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho for(int32_t i=0; i<numLines; ++i) { 51b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Skip comment lines (start with a character below 'A'). 52b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(lines[i].name[0]>=0x41) { 53b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho ++numTextLines; 54b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Remove trailing CR LF. 55b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t len=lines[i].len; 56b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UChar c; 57b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho while(len>0 && ((c=lines[i].name[len-1])==0xa || c==0xd)) { 58b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho --len; 59b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 60b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho lines[i].len=len; 61b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 62b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 63b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 64b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 65b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 66b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho virtual UPerfFunction *runIndexedTest(int32_t index, UBool exec, const char *&name, char *par=NULL); 67b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 68b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho const char *getSourceDir() const { return sourceDir; } 69b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 70b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UBool hasFile() const { return ucharBuf!=NULL; } 71b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho const ULine *getCachedLines() const { return lines; } 72b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t getNumLines() const { return numLines; } 73b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t numTextLines; // excluding comment lines 74b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho}; 75b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 76b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// Performance test function object. 77b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// Loads icudt46l.dat (or whatever its current versioned filename) 78b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// from the -s or --sourcedir path. 79b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoclass PackageLookup : public UPerfFunction { 80b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoprotected: 81b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho PackageLookup(const DictionaryTriePerfTest &perf) { 82b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho IcuToolErrorCode errorCode("PackageLookup()"); 83b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho CharString filename(perf.getSourceDir(), errorCode); 84b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t filenameLength=filename.length(); 85b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(filenameLength>0 && filename[filenameLength-1]!=U_FILE_SEP_CHAR && 86b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho filename[filenameLength-1]!=U_FILE_ALT_SEP_CHAR) { 87b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho filename.append(U_FILE_SEP_CHAR, errorCode); 88b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 89b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho filename.append(U_ICUDATA_NAME, errorCode); 90b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho filename.append(".dat", errorCode); 91b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho pkg.readPackage(filename.data()); 92b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 93b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 94b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehopublic: 95b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho virtual ~PackageLookup() {} 96b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 97b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // virtual void call(UErrorCode* pErrorCode) { ... } 98b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 99b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho virtual long getOperationsPerIteration() { 100b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return pkg.getItemCount(); 101b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 102b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 103b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // virtual long getEventsPerIteration(); 104b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 105b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoprotected: 106b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho Package pkg; 107b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho}; 108b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 109b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehostruct TOCEntry { 110b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t nameOffset, dataOffset; 111b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho}; 112b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 113b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// Similar to ICU 4.6 offsetTOCLookupFn() (in ucmndata.c). 114b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehostatic int32_t simpleBinarySearch(const char *s, const char *names, const TOCEntry *toc, int32_t count) { 115b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t start=0; 116b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t limit=count; 117b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t lastNumber=limit; 118b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho for(;;) { 119b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t number=(start+limit)/2; 120b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(lastNumber==number) { // have we moved? 121b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return -1; // not found 122b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 123b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho lastNumber=number; 124b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t cmp=strcmp(s, names+toc[number].nameOffset); 125b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(cmp<0) { 126b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho limit=number; 127b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } else if(cmp>0) { 128b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho start=number; 129b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } else { // found s 130b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return number; 131b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 132b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 133b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho} 134b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 135b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoclass BinarySearchPackageLookup : public PackageLookup { 136b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehopublic: 137b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho BinarySearchPackageLookup(const DictionaryTriePerfTest &perf) 138b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho : PackageLookup(perf) { 139b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho IcuToolErrorCode errorCode("BinarySearchPackageLookup()"); 140b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t count=pkg.getItemCount(); 141b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho toc=new TOCEntry[count]; 142b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho for(int32_t i=0; i<count; ++i) { 143b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho toc[i].nameOffset=itemNames.length(); 144b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho toc[i].dataOffset=i; // arbitrary value, see toc comment below 145b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // The Package class removes the "icudt46l/" prefix. 146b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // We restore that here for a fair performance test. 147b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho const char *name=pkg.getItem(i)->name; 148b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho itemNames.append("icudt46l/", errorCode); 149b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho itemNames.append(name, strlen(name)+1, errorCode); 150b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 151b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho printf("size of item names: %6ld\n", (long)itemNames.length()); 152b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho printf("size of TOC: %6ld\n", (long)(count*8)); 153b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho printf("total index size: %6ld\n", (long)(itemNames.length()+count*8)); 154b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 155b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho virtual ~BinarySearchPackageLookup() { 156b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho delete[] toc; 157b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 158b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 159b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho virtual void call(UErrorCode * /*pErrorCode*/) { 160b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t count=pkg.getItemCount(); 161b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho const char *itemNameChars=itemNames.data(); 162b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho const char *name=itemNameChars; 163b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho for(int32_t i=0; i<count; ++i) { 164b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(simpleBinarySearch(name, itemNameChars, toc, count)<0) { 165b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho fprintf(stderr, "item not found: %s\n", name); 166b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 167b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho name=strchr(name, 0)+1; 168b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 169b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 170b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 171b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoprotected: 172b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho CharString itemNames; 173b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // toc imitates a .dat file's array of UDataOffsetTOCEntry 174b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // with nameOffset and dataOffset. 175b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // We don't need the dataOffsets, but we want to imitate the real 176b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // memory density, to measure equivalent CPU cache usage. 177b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho TOCEntry *toc; 178b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho}; 179b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 180b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#ifndef MIN 181b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#define MIN(a,b) (((a)<(b)) ? (a) : (b)) 182b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#endif 183b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 184b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// Compare strings where we know the shared prefix length, 185b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// and advance the prefix length as we find that the strings share even more characters. 186b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehostatic int32_t strcmpAfterPrefix(const char *s1, const char *s2, int32_t &prefixLength) { 187b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t pl=prefixLength; 188b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho s1+=pl; 189b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho s2+=pl; 190b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t cmp=0; 191b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho for(;;) { 192b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t c1=(uint8_t)*s1++; 193b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t c2=(uint8_t)*s2++; 194b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho cmp=c1-c2; 195b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(cmp!=0 || c1==0) { // different or done 196b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho break; 197b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 198b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho ++pl; // increment shared same-prefix length 199b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 200b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho prefixLength=pl; 201b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return cmp; 202b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho} 203b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 204b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehostatic int32_t prefixBinarySearch(const char *s, const char *names, const TOCEntry *toc, int32_t count) { 205b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(count==0) { 206b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return -1; 207b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 208b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t start=0; 209b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t limit=count; 210b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Remember the shared prefix between s, start and limit, 211b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // and don't compare that shared prefix again. 212b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // The shared prefix should get longer as we narrow the [start, limit[ range. 213b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t startPrefixLength=0; 214b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t limitPrefixLength=0; 215b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Prime the prefix lengths so that we don't keep prefixLength at 0 until 216b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // both the start and limit indexes have moved. 217b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // At the same time, we find if s is one of the start and (limit-1) names, 218b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // and if not, exclude them from the actual binary search. 219b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(0==strcmpAfterPrefix(s, names+toc[0].nameOffset, startPrefixLength)) { 220b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return 0; 221b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 222b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho ++start; 223b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho --limit; 224b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(0==strcmpAfterPrefix(s, names+toc[limit].nameOffset, limitPrefixLength)) { 225b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return limit; 226b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 227b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho while(start<limit) { 228b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t i=(start+limit)/2; 229b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t prefixLength=MIN(startPrefixLength, limitPrefixLength); 230b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t cmp=strcmpAfterPrefix(s, names+toc[i].nameOffset, prefixLength); 231b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(cmp<0) { 232b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho limit=i; 233b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho limitPrefixLength=prefixLength; 234b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } else if(cmp==0) { 235b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return i; 236b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } else { 237b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho start=i+1; 238b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho startPrefixLength=prefixLength; 239b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 240b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 241b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return -1; 242b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho} 243b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 244b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoclass PrefixBinarySearchPackageLookup : public BinarySearchPackageLookup { 245b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehopublic: 246b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho PrefixBinarySearchPackageLookup(const DictionaryTriePerfTest &perf) 247b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho : BinarySearchPackageLookup(perf) {} 248b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 249b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho virtual void call(UErrorCode * /*pErrorCode*/) { 250b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t count=pkg.getItemCount(); 251b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho const char *itemNameChars=itemNames.data(); 252b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho const char *name=itemNameChars; 253b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho for(int32_t i=0; i<count; ++i) { 254b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(prefixBinarySearch(name, itemNameChars, toc, count)<0) { 255b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho fprintf(stderr, "item not found: %s\n", name); 256b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 257b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho name=strchr(name, 0)+1; 258b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 259b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 260b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho}; 261b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 262b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehostatic int32_t bytesTrieLookup(const char *s, const char *nameTrieBytes) { 263b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho BytesTrie trie(nameTrieBytes); 264b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(USTRINGTRIE_HAS_VALUE(trie.next(s, -1))) { 265b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return trie.getValue(); 266b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } else { 267b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return -1; 268b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 269b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho} 270b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 271b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoclass BytesTriePackageLookup : public PackageLookup { 272b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehopublic: 273b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho BytesTriePackageLookup(const DictionaryTriePerfTest &perf) 274b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho : PackageLookup(perf) { 275b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho IcuToolErrorCode errorCode("BinarySearchPackageLookup()"); 276b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho builder=new BytesTrieBuilder(errorCode); 277b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t count=pkg.getItemCount(); 278b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho for(int32_t i=0; i<count; ++i) { 279b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // The Package class removes the "icudt46l/" prefix. 280b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // We restore that here for a fair performance test. 281b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // We store all full names so that we do not have to reconstruct them 282b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // in the call() function. 283b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho const char *name=pkg.getItem(i)->name; 284b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t offset=itemNames.length(); 285b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho itemNames.append("icudt46l/", errorCode); 286b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho itemNames.append(name, -1, errorCode); 287b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // As value, set the data item index. 288b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // In a real implementation, we would use that to get the 289b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // start and limit offset of the data item. 290b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho StringPiece fullName(itemNames.toStringPiece()); 291b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho fullName.remove_prefix(offset); 292b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho builder->add(fullName, i, errorCode); 293b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // NUL-terminate the name for call() to find the next one. 294b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho itemNames.append(0, errorCode); 295b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 296b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t length=builder->buildStringPiece(USTRINGTRIE_BUILD_SMALL, errorCode).length(); 297b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho printf("size of BytesTrie: %6ld\n", (long)length); 298b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // count+1: +1 for the last-item limit offset which we should have always had 299b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho printf("size of dataOffsets:%6ld\n", (long)((count+1)*4)); 300b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho printf("total index size: %6ld\n", (long)(length+(count+1)*4)); 301b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 302b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho virtual ~BytesTriePackageLookup() { 303b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho delete builder; 304b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 305b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 306b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho virtual void call(UErrorCode *pErrorCode) { 307b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t count=pkg.getItemCount(); 308b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho const char *nameTrieBytes=builder->buildStringPiece(USTRINGTRIE_BUILD_SMALL, *pErrorCode).data(); 309b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho const char *name=itemNames.data(); 310b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho for(int32_t i=0; i<count; ++i) { 311b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(bytesTrieLookup(name, nameTrieBytes)<0) { 312b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho fprintf(stderr, "item not found: %s\n", name); 313b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 314b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho name=strchr(name, 0)+1; 315b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 316b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 317b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 318b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoprotected: 319b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho BytesTrieBuilder *builder; 320b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho CharString itemNames; 321b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho}; 322b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 323b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// Performance test function object. 324b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// Each subclass loads a dictionary text file 325b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// from the -s or --sourcedir path plus -f or --file-name. 326b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// For example, <ICU source dir>/source/data/brkitr/thaidict.txt. 327b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoclass DictLookup : public UPerfFunction { 328b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehopublic: 329b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho DictLookup(const DictionaryTriePerfTest &perfTest) : perf(perfTest) {} 330b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 331b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho virtual long getOperationsPerIteration() { 332b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return perf.numTextLines; 333b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 334b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 335b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoprotected: 336b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho const DictionaryTriePerfTest &perf; 337b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho}; 338b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 339b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// Closely imitate CompactTrieDictionary::matches(). 340b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// Note: CompactTrieDictionary::matches() is part of its trie implementation, 341b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// and while it loops over the text, it knows the current state. 342b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// By contrast, this implementation uses UCharsTrie API functions that have to 343b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// check the trie state each time and load/store state in the object. 344b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// (Whether it hasNext() and whether it is in the middle of a linear-match node.) 345b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehostatic int32_t 346b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoucharsTrieMatches(UCharsTrie &trie, 347b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UText *text, int32_t textLimit, 348b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t *lengths, int &count, int limit ) { 349b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UChar32 c=utext_next32(text); 350b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Notes: 351b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // a) CompactTrieDictionary::matches() does not check for U_SENTINEL. 352b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // b) It also ignores non-BMP code points by casting to UChar! 353b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(c<0) { 354b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return 0; 355b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 356b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Should be firstForCodePoint() but CompactTrieDictionary 357b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // handles only code units. 358b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UStringTrieResult result=trie.first(c); 359b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t numChars=1; 360b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho count=0; 361b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho for(;;) { 362b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(USTRINGTRIE_HAS_VALUE(result)) { 363b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(count<limit) { 364b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // lengths[count++]=(int32_t)utext_getNativeIndex(text); 365b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho lengths[count++]=numChars; // CompactTrieDictionary just counts chars too. 366b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 367b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(result==USTRINGTRIE_FINAL_VALUE) { 368b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho break; 369b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 370b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } else if(result==USTRINGTRIE_NO_MATCH) { 371b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho break; 372b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 373b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(numChars>=textLimit) { 374b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Note: Why do we have both a text limit and a UText that knows its length? 375b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho break; 376b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 377b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UChar32 c=utext_next32(text); 378b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Notes: 379b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // a) CompactTrieDictionary::matches() does not check for U_SENTINEL. 380b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // b) It also ignores non-BMP code points by casting to UChar! 381b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(c<0) { 382b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho break; 383b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 384b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho ++numChars; 385b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Should be nextForCodePoint() but CompactTrieDictionary 386b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // handles only code units. 387b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho result=trie.next(c); 388b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 389b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#if 0 390b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Note: CompactTrieDictionary::matches() comments say that it leaves the UText 391b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // after the longest prefix match and returns the number of characters 392b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // that were matched. 393b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(index!=lastMatch) { 394b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho utext_setNativeIndex(text, lastMatch); 395b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 396b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return lastMatch-start; 397b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // However, it does not do either of these, so I am not trying to 398b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // imitate it (or its docs) 100%. 399b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#endif 400b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return numChars; 401b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho} 402b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 403b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoclass UCharsTrieDictLookup : public DictLookup { 404b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehopublic: 405b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UCharsTrieDictLookup(const DictionaryTriePerfTest &perfTest) 406b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho : DictLookup(perfTest), trie(NULL) { 407b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho IcuToolErrorCode errorCode("UCharsTrieDictLookup()"); 408b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho builder=new UCharsTrieBuilder(errorCode); 409b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho const ULine *lines=perf.getCachedLines(); 410b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t numLines=perf.getNumLines(); 411b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho for(int32_t i=0; i<numLines; ++i) { 412b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Skip comment lines (start with a character below 'A'). 413b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(lines[i].name[0]<0x41) { 414b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho continue; 415b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 416b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho builder->add(UnicodeString(FALSE, lines[i].name, lines[i].len), 0, errorCode); 417b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 418b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UnicodeString trieUChars; 419b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t length=builder->buildUnicodeString(USTRINGTRIE_BUILD_SMALL, trieUChars, errorCode).length(); 420b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho printf("size of UCharsTrie: %6ld bytes\n", (long)length*2); 421b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho trie=builder->build(USTRINGTRIE_BUILD_SMALL, errorCode); 422b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 423b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 424b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho virtual ~UCharsTrieDictLookup() { 425b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho delete builder; 426b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho delete trie; 427b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 428b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 429b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoprotected: 430b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UCharsTrieBuilder *builder; 431b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UCharsTrie *trie; 432b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho}; 433b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 434b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoclass UCharsTrieDictMatches : public UCharsTrieDictLookup { 435b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehopublic: 436b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UCharsTrieDictMatches(const DictionaryTriePerfTest &perfTest) 437b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho : UCharsTrieDictLookup(perfTest) {} 438b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 439b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho virtual void call(UErrorCode *pErrorCode) { 440b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UText text=UTEXT_INITIALIZER; 441b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t lengths[20]; 442b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho const ULine *lines=perf.getCachedLines(); 443b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t numLines=perf.getNumLines(); 444b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho for(int32_t i=0; i<numLines; ++i) { 445b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Skip comment lines (start with a character below 'A'). 446b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(lines[i].name[0]<0x41) { 447b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho continue; 448b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 449b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho utext_openUChars(&text, lines[i].name, lines[i].len, pErrorCode); 450b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t count=0; 451b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho ucharsTrieMatches(*trie, &text, lines[i].len, 452b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho lengths, count, LENGTHOF(lengths)); 453b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(count==0 || lengths[count-1]!=lines[i].len) { 454b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho fprintf(stderr, "word %ld (0-based) not found\n", (long)i); 455b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 456b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 457b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 458b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho}; 459b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 460b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoclass UCharsTrieDictContains : public UCharsTrieDictLookup { 461b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehopublic: 462b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UCharsTrieDictContains(const DictionaryTriePerfTest &perfTest) 463b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho : UCharsTrieDictLookup(perfTest) {} 464b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 465b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho virtual void call(UErrorCode * /*pErrorCode*/) { 466b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho const ULine *lines=perf.getCachedLines(); 467b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t numLines=perf.getNumLines(); 468b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho for(int32_t i=0; i<numLines; ++i) { 469b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Skip comment lines (which start with a character below 'A'). 470b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(lines[i].name[0]<0x41) { 471b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho continue; 472b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 473b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(!USTRINGTRIE_HAS_VALUE(trie->reset().next(lines[i].name, lines[i].len))) { 474b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho fprintf(stderr, "word %ld (0-based) not found\n", (long)i); 475b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 476b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 477b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 478b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho}; 479b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 480b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehostatic inline int32_t thaiCharToByte(UChar32 c) { 481b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(0xe00<=c && c<=0xefe) { 482b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return c&0xff; 483b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } else if(c==0x2e) { 484b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return 0xff; 485b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } else { 486b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return -1; 487b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 488b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho} 489b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 490b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehostatic UBool thaiWordToBytes(const UChar *s, int32_t length, 491b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho CharString &str, UErrorCode &errorCode) { 492b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho for(int32_t i=0; i<length; ++i) { 493b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UChar c=s[i]; 494b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t b=thaiCharToByte(c); 495b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(b>=0) { 496b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho str.append((char)b, errorCode); 497b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } else { 498b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho fprintf(stderr, "thaiWordToBytes(): unable to encode U+%04X as a byte\n", c); 499b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return FALSE; 500b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 501b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 502b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return TRUE; 503b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho} 504b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 505b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoclass BytesTrieDictLookup : public DictLookup { 506b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehopublic: 507b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho BytesTrieDictLookup(const DictionaryTriePerfTest &perfTest) 508b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho : DictLookup(perfTest), trie(NULL), noDict(FALSE) { 509b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho IcuToolErrorCode errorCode("BytesTrieDictLookup()"); 510b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho builder=new BytesTrieBuilder(errorCode); 511b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho CharString str; 512b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho const ULine *lines=perf.getCachedLines(); 513b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t numLines=perf.getNumLines(); 514b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho for(int32_t i=0; i<numLines; ++i) { 515b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Skip comment lines (start with a character below 'A'). 516b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(lines[i].name[0]<0x41) { 517b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho continue; 518b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 519b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(!thaiWordToBytes(lines[i].name, lines[i].len, str.clear(), errorCode)) { 520b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho fprintf(stderr, "thaiWordToBytes(): failed for word %ld (0-based)\n", (long)i); 521b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho noDict=TRUE; 522b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho break; 523b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 524b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho builder->add(str.toStringPiece(), 0, errorCode); 525b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 526b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(!noDict) { 527b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t length=builder->buildStringPiece(USTRINGTRIE_BUILD_SMALL, errorCode).length(); 528b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho printf("size of BytesTrie: %6ld bytes\n", (long)length); 529b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho trie=builder->build(USTRINGTRIE_BUILD_SMALL, errorCode); 530b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 531b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 532b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 533b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho virtual ~BytesTrieDictLookup() { 534b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho delete builder; 535b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho delete trie; 536b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 537b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 538b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoprotected: 539b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho BytesTrieBuilder *builder; 540b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho BytesTrie *trie; 541b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UBool noDict; 542b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho}; 543b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 544b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehostatic int32_t 545b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehobytesTrieMatches(BytesTrie &trie, 546b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UText *text, int32_t textLimit, 547b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t *lengths, int &count, int limit ) { 548b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UChar32 c=utext_next32(text); 549b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(c<0) { 550b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return 0; 551b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 552b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UStringTrieResult result=trie.first(thaiCharToByte(c)); 553b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t numChars=1; 554b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho count=0; 555b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho for(;;) { 556b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(USTRINGTRIE_HAS_VALUE(result)) { 557b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(count<limit) { 558b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // lengths[count++]=(int32_t)utext_getNativeIndex(text); 559b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho lengths[count++]=numChars; // CompactTrieDictionary just counts chars too. 560b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 561b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(result==USTRINGTRIE_FINAL_VALUE) { 562b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho break; 563b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 564b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } else if(result==USTRINGTRIE_NO_MATCH) { 565b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho break; 566b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 567b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(numChars>=textLimit) { 568b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho break; 569b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 570b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UChar32 c=utext_next32(text); 571b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(c<0) { 572b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho break; 573b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 574b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho ++numChars; 575b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho result=trie.next(thaiCharToByte(c)); 576b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 577b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return numChars; 578b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho} 579b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 580b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoclass BytesTrieDictMatches : public BytesTrieDictLookup { 581b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehopublic: 582b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho BytesTrieDictMatches(const DictionaryTriePerfTest &perfTest) 583b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho : BytesTrieDictLookup(perfTest) {} 584b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 585b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho virtual void call(UErrorCode *pErrorCode) { 586b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(noDict) { 587b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return; 588b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 589b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UText text=UTEXT_INITIALIZER; 590b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t lengths[20]; 591b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho const ULine *lines=perf.getCachedLines(); 592b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t numLines=perf.getNumLines(); 593b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho for(int32_t i=0; i<numLines; ++i) { 594b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Skip comment lines (start with a character below 'A'). 595b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(lines[i].name[0]<0x41) { 596b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho continue; 597b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 598b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho utext_openUChars(&text, lines[i].name, lines[i].len, pErrorCode); 599b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t count=0; 600b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho bytesTrieMatches(*trie, &text, lines[i].len, 601b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho lengths, count, LENGTHOF(lengths)); 602b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(count==0 || lengths[count-1]!=lines[i].len) { 603b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho fprintf(stderr, "word %ld (0-based) not found\n", (long)i); 604b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 605b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 606b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 607b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho}; 608b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 609b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoclass BytesTrieDictContains : public BytesTrieDictLookup { 610b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehopublic: 611b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho BytesTrieDictContains(const DictionaryTriePerfTest &perfTest) 612b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho : BytesTrieDictLookup(perfTest) {} 613b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 614b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho virtual void call(UErrorCode * /*pErrorCode*/) { 615b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(noDict) { 616b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return; 617b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 618b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho const ULine *lines=perf.getCachedLines(); 619b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t numLines=perf.getNumLines(); 620b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho for(int32_t i=0; i<numLines; ++i) { 621b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho const UChar *line=lines[i].name; 622b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Skip comment lines (start with a character below 'A'). 623b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(line[0]<0x41) { 624b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho continue; 625b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 626b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UStringTrieResult result=trie->first(thaiCharToByte(line[0])); 627b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t lineLength=lines[i].len; 628b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho for(int32_t j=1; j<lineLength; ++j) { 629b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(!USTRINGTRIE_HAS_NEXT(result)) { 630b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho fprintf(stderr, "word %ld (0-based) not found\n", (long)i); 631b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho break; 632b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 633b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho result=trie->next(thaiCharToByte(line[j])); 634b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 635b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(!USTRINGTRIE_HAS_VALUE(result)) { 636b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho fprintf(stderr, "word %ld (0-based) not found\n", (long)i); 637b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 638b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 639b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 640b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho}; 641b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 642b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoUPerfFunction *DictionaryTriePerfTest::runIndexedTest(int32_t index, UBool exec, 643b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho const char *&name, char * /*par*/) { 644b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(hasFile()) { 645b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho switch(index) { 646b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho case 0: 647b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho name="ucharstriematches"; 648b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(exec) { 649b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return new UCharsTrieDictMatches(*this); 650b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 651b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho break; 65254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius case 1: 653b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho name="ucharstriecontains"; 654b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(exec) { 655b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return new UCharsTrieDictContains(*this); 656b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 657b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho break; 65854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius case 2: 659b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho name="bytestriematches"; 660b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(exec) { 661b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return new BytesTrieDictMatches(*this); 662b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 663b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho break; 66454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius case 3: 665b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho name="bytestriecontains"; 666b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(exec) { 667b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return new BytesTrieDictContains(*this); 668b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 669b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho break; 670b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho default: 671b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho name=""; 672b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho break; 673b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 674b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } else { 675b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(index==0 && exec) { 676b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho puts("Running BytesTrie perf tests on the .dat package file from the --sourcedir.\n" 677b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho "For UCharsTrie perf tests on a dictionary text file, specify the -f or --file-name.\n"); 678b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 679b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho switch(index) { 680b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho case 0: 681b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho name="simplebinarysearch"; 682b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(exec) { 683b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return new BinarySearchPackageLookup(*this); 684b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 685b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho break; 686b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho case 1: 687b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho name="prefixbinarysearch"; 688b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(exec) { 689b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return new PrefixBinarySearchPackageLookup(*this); 690b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 691b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho break; 692b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho case 2: 693b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho name="bytestrie"; 694b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(exec) { 695b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return new BytesTriePackageLookup(*this); 696b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 697b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho break; 698b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho default: 699b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho name=""; 700b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho break; 701b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 702b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 703b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return NULL; 704b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho} 705b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 706b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoint main(int argc, const char *argv[]) { 707b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho IcuToolErrorCode errorCode("dicttrieperf main()"); 708b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho DictionaryTriePerfTest test(argc, argv, errorCode); 709b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(errorCode.isFailure()) { 710b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho fprintf(stderr, "DictionaryTriePerfTest() failed: %s\n", errorCode.errorName()); 711b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho test.usage(); 712b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return errorCode.reset(); 713b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 714b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(!test.run()) { 715b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho fprintf(stderr, "FAILED: Tests could not be run, please check the arguments.\n"); 716b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return -1; 717b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 718b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return 0; 719b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho} 720