16f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/* 26f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ********************************************************************** 36f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Copyright (C) 2010-2012, International Business Machines 46f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Corporation and others. All Rights Reserved. 56f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ********************************************************************** 66f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * file name: dicttrieperf.cpp 76f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * encoding: US-ASCII 86f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * tab size: 8 (not used) 96f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * indentation:4 106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * 116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * created on: 2010dec09 126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * created by: Markus W. Scherer 136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * 146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Performance test program for dictionary-type tries. 156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * 166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Usage from within <ICU build tree>/test/perf/dicttrieperf/ : 176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * (Linux) 186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * make 196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * export LD_LIBRARY_PATH=../../../lib:../../../stubdata:../../../tools/ctestfw 206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * ./dicttrieperf --sourcedir <ICU build tree>/data/out/tmp --passes 3 --iterations 1000 216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * or 226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * ./dicttrieperf -f <ICU source tree>/source/data/brkitr/thaidict.txt --passes 3 --iterations 250 236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include <stdio.h> 266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include <stdlib.h> 276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/bytestrie.h" 286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/bytestriebuilder.h" 296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/localpointer.h" 306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/ucharstrie.h" 316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/ucharstriebuilder.h" 326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/uperf.h" 336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/utext.h" 346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "charstr.h" 356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "package.h" 366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "toolutil.h" 376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "ucbuf.h" // struct ULine 386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "uoptions.h" 396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "uvectr32.h" 406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// Test object. 446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgclass DictionaryTriePerfTest : public UPerfTest { 456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgpublic: 466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org DictionaryTriePerfTest(int32_t argc, const char *argv[], UErrorCode &status) 476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org : UPerfTest(argc, argv, NULL, 0, "", status), numTextLines(0) { 486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(hasFile()) { 496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org getLines(status); 506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for(int32_t i=0; i<numLines; ++i) { 516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Skip comment lines (start with a character below 'A'). 526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(lines[i].name[0]>=0x41) { 536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ++numTextLines; 546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Remove trailing CR LF. 556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t len=lines[i].len; 566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UChar c; 576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org while(len>0 && ((c=lines[i].name[len-1])==0xa || c==0xd)) { 586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org --len; 596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org lines[i].len=len; 616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org virtual UPerfFunction *runIndexedTest(int32_t index, UBool exec, const char *&name, char *par=NULL); 676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const char *getSourceDir() const { return sourceDir; } 696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UBool hasFile() const { return ucharBuf!=NULL; } 716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const ULine *getCachedLines() const { return lines; } 726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t getNumLines() const { return numLines; } 736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t numTextLines; // excluding comment lines 746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org}; 756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// Performance test function object. 776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// Loads icudt46l.dat (or whatever its current versioned filename) 786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// from the -s or --sourcedir path. 796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgclass PackageLookup : public UPerfFunction { 806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgprotected: 816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org PackageLookup(const DictionaryTriePerfTest &perf) { 826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org IcuToolErrorCode errorCode("PackageLookup()"); 836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org CharString filename(perf.getSourceDir(), errorCode); 846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t filenameLength=filename.length(); 856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(filenameLength>0 && filename[filenameLength-1]!=U_FILE_SEP_CHAR && 866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org filename[filenameLength-1]!=U_FILE_ALT_SEP_CHAR) { 876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org filename.append(U_FILE_SEP_CHAR, errorCode); 886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org filename.append(U_ICUDATA_NAME, errorCode); 906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org filename.append(".dat", errorCode); 916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org pkg.readPackage(filename.data()); 926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgpublic: 956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org virtual ~PackageLookup() {} 966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // virtual void call(UErrorCode* pErrorCode) { ... } 986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org virtual long getOperationsPerIteration() { 1006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return pkg.getItemCount(); 1016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // virtual long getEventsPerIteration(); 1046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgprotected: 1066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org Package pkg; 1076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org}; 1086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgstruct TOCEntry { 1106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t nameOffset, dataOffset; 1116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org}; 1126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// Similar to ICU 4.6 offsetTOCLookupFn() (in ucmndata.c). 1146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgstatic int32_t simpleBinarySearch(const char *s, const char *names, const TOCEntry *toc, int32_t count) { 1156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t start=0; 1166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t limit=count; 1176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t lastNumber=limit; 1186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for(;;) { 1196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t number=(start+limit)/2; 1206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(lastNumber==number) { // have we moved? 1216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return -1; // not found 1226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org lastNumber=number; 1246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t cmp=strcmp(s, names+toc[number].nameOffset); 1256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(cmp<0) { 1266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org limit=number; 1276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else if(cmp>0) { 1286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org start=number; 1296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else { // found s 1306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return number; 1316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 1346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgclass BinarySearchPackageLookup : public PackageLookup { 1366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgpublic: 1376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org BinarySearchPackageLookup(const DictionaryTriePerfTest &perf) 1386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org : PackageLookup(perf) { 1396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org IcuToolErrorCode errorCode("BinarySearchPackageLookup()"); 1406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t count=pkg.getItemCount(); 1416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org toc=new TOCEntry[count]; 1426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for(int32_t i=0; i<count; ++i) { 1436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org toc[i].nameOffset=itemNames.length(); 1446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org toc[i].dataOffset=i; // arbitrary value, see toc comment below 1456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // The Package class removes the "icudt46l/" prefix. 1466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // We restore that here for a fair performance test. 1476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const char *name=pkg.getItem(i)->name; 1486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org itemNames.append("icudt46l/", errorCode); 1496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org itemNames.append(name, strlen(name)+1, errorCode); 1506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org printf("size of item names: %6ld\n", (long)itemNames.length()); 1526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org printf("size of TOC: %6ld\n", (long)(count*8)); 1536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org printf("total index size: %6ld\n", (long)(itemNames.length()+count*8)); 1546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org virtual ~BinarySearchPackageLookup() { 1566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org delete[] toc; 1576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org virtual void call(UErrorCode * /*pErrorCode*/) { 1606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t count=pkg.getItemCount(); 1616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const char *itemNameChars=itemNames.data(); 1626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const char *name=itemNameChars; 1636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for(int32_t i=0; i<count; ++i) { 1646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(simpleBinarySearch(name, itemNameChars, toc, count)<0) { 1656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org fprintf(stderr, "item not found: %s\n", name); 1666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org name=strchr(name, 0)+1; 1686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgprotected: 1726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org CharString itemNames; 1736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // toc imitates a .dat file's array of UDataOffsetTOCEntry 1746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // with nameOffset and dataOffset. 1756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // We don't need the dataOffsets, but we want to imitate the real 1766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // memory density, to measure equivalent CPU cache usage. 1776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org TOCEntry *toc; 1786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org}; 1796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#ifndef MIN 1816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#define MIN(a,b) (((a)<(b)) ? (a) : (b)) 1826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#endif 1836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// Compare strings where we know the shared prefix length, 1856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// and advance the prefix length as we find that the strings share even more characters. 1866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgstatic int32_t strcmpAfterPrefix(const char *s1, const char *s2, int32_t &prefixLength) { 1876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t pl=prefixLength; 1886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org s1+=pl; 1896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org s2+=pl; 1906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t cmp=0; 1916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for(;;) { 1926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t c1=(uint8_t)*s1++; 1936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t c2=(uint8_t)*s2++; 1946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org cmp=c1-c2; 1956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(cmp!=0 || c1==0) { // different or done 1966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 1976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ++pl; // increment shared same-prefix length 1996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org prefixLength=pl; 2016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return cmp; 2026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 2036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgstatic int32_t prefixBinarySearch(const char *s, const char *names, const TOCEntry *toc, int32_t count) { 2056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(count==0) { 2066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return -1; 2076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t start=0; 2096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t limit=count; 2106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Remember the shared prefix between s, start and limit, 2116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // and don't compare that shared prefix again. 2126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // The shared prefix should get longer as we narrow the [start, limit[ range. 2136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t startPrefixLength=0; 2146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t limitPrefixLength=0; 2156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Prime the prefix lengths so that we don't keep prefixLength at 0 until 2166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // both the start and limit indexes have moved. 2176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // At the same time, we find if s is one of the start and (limit-1) names, 2186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // and if not, exclude them from the actual binary search. 2196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(0==strcmpAfterPrefix(s, names+toc[0].nameOffset, startPrefixLength)) { 2206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return 0; 2216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ++start; 2236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org --limit; 2246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(0==strcmpAfterPrefix(s, names+toc[limit].nameOffset, limitPrefixLength)) { 2256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return limit; 2266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org while(start<limit) { 2286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t i=(start+limit)/2; 2296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t prefixLength=MIN(startPrefixLength, limitPrefixLength); 2306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t cmp=strcmpAfterPrefix(s, names+toc[i].nameOffset, prefixLength); 2316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(cmp<0) { 2326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org limit=i; 2336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org limitPrefixLength=prefixLength; 2346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else if(cmp==0) { 2356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return i; 2366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else { 2376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org start=i+1; 2386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org startPrefixLength=prefixLength; 2396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return -1; 2426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 2436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgclass PrefixBinarySearchPackageLookup : public BinarySearchPackageLookup { 2456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgpublic: 2466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org PrefixBinarySearchPackageLookup(const DictionaryTriePerfTest &perf) 2476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org : BinarySearchPackageLookup(perf) {} 2486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org virtual void call(UErrorCode * /*pErrorCode*/) { 2506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t count=pkg.getItemCount(); 2516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const char *itemNameChars=itemNames.data(); 2526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const char *name=itemNameChars; 2536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for(int32_t i=0; i<count; ++i) { 2546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(prefixBinarySearch(name, itemNameChars, toc, count)<0) { 2556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org fprintf(stderr, "item not found: %s\n", name); 2566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org name=strchr(name, 0)+1; 2586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org}; 2616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgstatic int32_t bytesTrieLookup(const char *s, const char *nameTrieBytes) { 2636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org BytesTrie trie(nameTrieBytes); 2646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(USTRINGTRIE_HAS_VALUE(trie.next(s, -1))) { 2656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return trie.getValue(); 2666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else { 2676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return -1; 2686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 2706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgclass BytesTriePackageLookup : public PackageLookup { 2726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgpublic: 2736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org BytesTriePackageLookup(const DictionaryTriePerfTest &perf) 2746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org : PackageLookup(perf) { 2756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org IcuToolErrorCode errorCode("BinarySearchPackageLookup()"); 2766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org builder=new BytesTrieBuilder(errorCode); 2776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t count=pkg.getItemCount(); 2786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for(int32_t i=0; i<count; ++i) { 2796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // The Package class removes the "icudt46l/" prefix. 2806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // We restore that here for a fair performance test. 2816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // We store all full names so that we do not have to reconstruct them 2826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // in the call() function. 2836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const char *name=pkg.getItem(i)->name; 2846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t offset=itemNames.length(); 2856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org itemNames.append("icudt46l/", errorCode); 2866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org itemNames.append(name, -1, errorCode); 2876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // As value, set the data item index. 2886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // In a real implementation, we would use that to get the 2896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // start and limit offset of the data item. 2906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org StringPiece fullName(itemNames.toStringPiece()); 2916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org fullName.remove_prefix(offset); 2926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org builder->add(fullName, i, errorCode); 2936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // NUL-terminate the name for call() to find the next one. 2946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org itemNames.append(0, errorCode); 2956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t length=builder->buildStringPiece(USTRINGTRIE_BUILD_SMALL, errorCode).length(); 2976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org printf("size of BytesTrie: %6ld\n", (long)length); 2986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // count+1: +1 for the last-item limit offset which we should have always had 2996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org printf("size of dataOffsets:%6ld\n", (long)((count+1)*4)); 3006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org printf("total index size: %6ld\n", (long)(length+(count+1)*4)); 3016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org virtual ~BytesTriePackageLookup() { 3036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org delete builder; 3046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org virtual void call(UErrorCode *pErrorCode) { 3076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t count=pkg.getItemCount(); 3086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const char *nameTrieBytes=builder->buildStringPiece(USTRINGTRIE_BUILD_SMALL, *pErrorCode).data(); 3096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const char *name=itemNames.data(); 3106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for(int32_t i=0; i<count; ++i) { 3116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(bytesTrieLookup(name, nameTrieBytes)<0) { 3126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org fprintf(stderr, "item not found: %s\n", name); 3136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org name=strchr(name, 0)+1; 3156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgprotected: 3196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org BytesTrieBuilder *builder; 3206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org CharString itemNames; 3216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org}; 3226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// Performance test function object. 3246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// Each subclass loads a dictionary text file 3256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// from the -s or --sourcedir path plus -f or --file-name. 3266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// For example, <ICU source dir>/source/data/brkitr/thaidict.txt. 3276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgclass DictLookup : public UPerfFunction { 3286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgpublic: 3296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org DictLookup(const DictionaryTriePerfTest &perfTest) : perf(perfTest) {} 3306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org virtual long getOperationsPerIteration() { 3326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return perf.numTextLines; 3336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgprotected: 3366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const DictionaryTriePerfTest &perf; 3376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org}; 3386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// Closely imitate CompactTrieDictionary::matches(). 3406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// Note: CompactTrieDictionary::matches() is part of its trie implementation, 3416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// and while it loops over the text, it knows the current state. 3426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// By contrast, this implementation uses UCharsTrie API functions that have to 3436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// check the trie state each time and load/store state in the object. 3446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// (Whether it hasNext() and whether it is in the middle of a linear-match node.) 3456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgstatic int32_t 3466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgucharsTrieMatches(UCharsTrie &trie, 3476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UText *text, int32_t textLimit, 3486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t *lengths, int &count, int limit ) { 3496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UChar32 c=utext_next32(text); 3506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Notes: 3516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // a) CompactTrieDictionary::matches() does not check for U_SENTINEL. 3526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // b) It also ignores non-BMP code points by casting to UChar! 3536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(c<0) { 3546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return 0; 3556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Should be firstForCodePoint() but CompactTrieDictionary 3576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // handles only code units. 3586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UStringTrieResult result=trie.first(c); 3596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t numChars=1; 3606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org count=0; 3616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for(;;) { 3626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(USTRINGTRIE_HAS_VALUE(result)) { 3636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(count<limit) { 3646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // lengths[count++]=(int32_t)utext_getNativeIndex(text); 3656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org lengths[count++]=numChars; // CompactTrieDictionary just counts chars too. 3666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(result==USTRINGTRIE_FINAL_VALUE) { 3686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 3696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else if(result==USTRINGTRIE_NO_MATCH) { 3716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 3726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(numChars>=textLimit) { 3746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Note: Why do we have both a text limit and a UText that knows its length? 3756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 3766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UChar32 c=utext_next32(text); 3786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Notes: 3796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // a) CompactTrieDictionary::matches() does not check for U_SENTINEL. 3806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // b) It also ignores non-BMP code points by casting to UChar! 3816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(c<0) { 3826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 3836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ++numChars; 3856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Should be nextForCodePoint() but CompactTrieDictionary 3866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // handles only code units. 3876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org result=trie.next(c); 3886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#if 0 3906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Note: CompactTrieDictionary::matches() comments say that it leaves the UText 3916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // after the longest prefix match and returns the number of characters 3926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // that were matched. 3936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(index!=lastMatch) { 3946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org utext_setNativeIndex(text, lastMatch); 3956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return lastMatch-start; 3976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // However, it does not do either of these, so I am not trying to 3986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // imitate it (or its docs) 100%. 3996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#endif 4006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return numChars; 4016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 4026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgclass UCharsTrieDictLookup : public DictLookup { 4046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgpublic: 4056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UCharsTrieDictLookup(const DictionaryTriePerfTest &perfTest) 4066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org : DictLookup(perfTest), trie(NULL) { 4076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org IcuToolErrorCode errorCode("UCharsTrieDictLookup()"); 4086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org builder=new UCharsTrieBuilder(errorCode); 4096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const ULine *lines=perf.getCachedLines(); 4106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t numLines=perf.getNumLines(); 4116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for(int32_t i=0; i<numLines; ++i) { 4126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Skip comment lines (start with a character below 'A'). 4136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(lines[i].name[0]<0x41) { 4146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org continue; 4156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org builder->add(UnicodeString(FALSE, lines[i].name, lines[i].len), 0, errorCode); 4176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UnicodeString trieUChars; 4196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t length=builder->buildUnicodeString(USTRINGTRIE_BUILD_SMALL, trieUChars, errorCode).length(); 4206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org printf("size of UCharsTrie: %6ld bytes\n", (long)length*2); 4216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org trie=builder->build(USTRINGTRIE_BUILD_SMALL, errorCode); 4226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org virtual ~UCharsTrieDictLookup() { 4256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org delete builder; 4266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org delete trie; 4276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgprotected: 4306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UCharsTrieBuilder *builder; 4316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UCharsTrie *trie; 4326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org}; 4336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgclass UCharsTrieDictMatches : public UCharsTrieDictLookup { 4356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgpublic: 4366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UCharsTrieDictMatches(const DictionaryTriePerfTest &perfTest) 4376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org : UCharsTrieDictLookup(perfTest) {} 4386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org virtual void call(UErrorCode *pErrorCode) { 4406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UText text=UTEXT_INITIALIZER; 4416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t lengths[20]; 4426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const ULine *lines=perf.getCachedLines(); 4436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t numLines=perf.getNumLines(); 4446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for(int32_t i=0; i<numLines; ++i) { 4456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Skip comment lines (start with a character below 'A'). 4466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(lines[i].name[0]<0x41) { 4476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org continue; 4486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org utext_openUChars(&text, lines[i].name, lines[i].len, pErrorCode); 4506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t count=0; 4516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ucharsTrieMatches(*trie, &text, lines[i].len, 4526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org lengths, count, LENGTHOF(lengths)); 4536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(count==0 || lengths[count-1]!=lines[i].len) { 4546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org fprintf(stderr, "word %ld (0-based) not found\n", (long)i); 4556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org}; 4596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgclass UCharsTrieDictContains : public UCharsTrieDictLookup { 4616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgpublic: 4626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UCharsTrieDictContains(const DictionaryTriePerfTest &perfTest) 4636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org : UCharsTrieDictLookup(perfTest) {} 4646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org virtual void call(UErrorCode * /*pErrorCode*/) { 4666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const ULine *lines=perf.getCachedLines(); 4676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t numLines=perf.getNumLines(); 4686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for(int32_t i=0; i<numLines; ++i) { 4696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Skip comment lines (which start with a character below 'A'). 4706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(lines[i].name[0]<0x41) { 4716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org continue; 4726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(!USTRINGTRIE_HAS_VALUE(trie->reset().next(lines[i].name, lines[i].len))) { 4746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org fprintf(stderr, "word %ld (0-based) not found\n", (long)i); 4756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org}; 4796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgstatic inline int32_t thaiCharToByte(UChar32 c) { 4816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(0xe00<=c && c<=0xefe) { 4826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return c&0xff; 4836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else if(c==0x2e) { 4846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return 0xff; 4856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else { 4866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return -1; 4876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 4896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgstatic UBool thaiWordToBytes(const UChar *s, int32_t length, 4916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org CharString &str, UErrorCode &errorCode) { 4926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for(int32_t i=0; i<length; ++i) { 4936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UChar c=s[i]; 4946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t b=thaiCharToByte(c); 4956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(b>=0) { 4966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org str.append((char)b, errorCode); 4976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else { 4986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org fprintf(stderr, "thaiWordToBytes(): unable to encode U+%04X as a byte\n", c); 4996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return FALSE; 5006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 5016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 5026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return TRUE; 5036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 5046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgclass BytesTrieDictLookup : public DictLookup { 5066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgpublic: 5076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org BytesTrieDictLookup(const DictionaryTriePerfTest &perfTest) 5086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org : DictLookup(perfTest), trie(NULL), noDict(FALSE) { 5096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org IcuToolErrorCode errorCode("BytesTrieDictLookup()"); 5106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org builder=new BytesTrieBuilder(errorCode); 5116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org CharString str; 5126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const ULine *lines=perf.getCachedLines(); 5136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t numLines=perf.getNumLines(); 5146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for(int32_t i=0; i<numLines; ++i) { 5156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Skip comment lines (start with a character below 'A'). 5166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(lines[i].name[0]<0x41) { 5176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org continue; 5186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 5196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(!thaiWordToBytes(lines[i].name, lines[i].len, str.clear(), errorCode)) { 5206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org fprintf(stderr, "thaiWordToBytes(): failed for word %ld (0-based)\n", (long)i); 5216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org noDict=TRUE; 5226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 5236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 5246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org builder->add(str.toStringPiece(), 0, errorCode); 5256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 5266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(!noDict) { 5276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t length=builder->buildStringPiece(USTRINGTRIE_BUILD_SMALL, errorCode).length(); 5286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org printf("size of BytesTrie: %6ld bytes\n", (long)length); 5296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org trie=builder->build(USTRINGTRIE_BUILD_SMALL, errorCode); 5306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 5316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 5326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org virtual ~BytesTrieDictLookup() { 5346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org delete builder; 5356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org delete trie; 5366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 5376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgprotected: 5396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org BytesTrieBuilder *builder; 5406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org BytesTrie *trie; 5416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UBool noDict; 5426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org}; 5436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgstatic int32_t 5456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgbytesTrieMatches(BytesTrie &trie, 5466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UText *text, int32_t textLimit, 5476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t *lengths, int &count, int limit ) { 5486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UChar32 c=utext_next32(text); 5496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(c<0) { 5506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return 0; 5516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 5526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UStringTrieResult result=trie.first(thaiCharToByte(c)); 5536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t numChars=1; 5546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org count=0; 5556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for(;;) { 5566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(USTRINGTRIE_HAS_VALUE(result)) { 5576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(count<limit) { 5586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // lengths[count++]=(int32_t)utext_getNativeIndex(text); 5596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org lengths[count++]=numChars; // CompactTrieDictionary just counts chars too. 5606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 5616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(result==USTRINGTRIE_FINAL_VALUE) { 5626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 5636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 5646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else if(result==USTRINGTRIE_NO_MATCH) { 5656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 5666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 5676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(numChars>=textLimit) { 5686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 5696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 5706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UChar32 c=utext_next32(text); 5716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(c<0) { 5726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 5736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 5746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ++numChars; 5756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org result=trie.next(thaiCharToByte(c)); 5766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 5776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return numChars; 5786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 5796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgclass BytesTrieDictMatches : public BytesTrieDictLookup { 5816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgpublic: 5826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org BytesTrieDictMatches(const DictionaryTriePerfTest &perfTest) 5836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org : BytesTrieDictLookup(perfTest) {} 5846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org virtual void call(UErrorCode *pErrorCode) { 5866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(noDict) { 5876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return; 5886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 5896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UText text=UTEXT_INITIALIZER; 5906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t lengths[20]; 5916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const ULine *lines=perf.getCachedLines(); 5926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t numLines=perf.getNumLines(); 5936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for(int32_t i=0; i<numLines; ++i) { 5946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Skip comment lines (start with a character below 'A'). 5956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(lines[i].name[0]<0x41) { 5966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org continue; 5976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 5986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org utext_openUChars(&text, lines[i].name, lines[i].len, pErrorCode); 5996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t count=0; 6006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org bytesTrieMatches(*trie, &text, lines[i].len, 6016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org lengths, count, LENGTHOF(lengths)); 6026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(count==0 || lengths[count-1]!=lines[i].len) { 6036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org fprintf(stderr, "word %ld (0-based) not found\n", (long)i); 6046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 6056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 6066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 6076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org}; 6086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 6096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgclass BytesTrieDictContains : public BytesTrieDictLookup { 6106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgpublic: 6116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org BytesTrieDictContains(const DictionaryTriePerfTest &perfTest) 6126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org : BytesTrieDictLookup(perfTest) {} 6136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 6146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org virtual void call(UErrorCode * /*pErrorCode*/) { 6156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(noDict) { 6166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return; 6176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 6186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const ULine *lines=perf.getCachedLines(); 6196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t numLines=perf.getNumLines(); 6206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for(int32_t i=0; i<numLines; ++i) { 6216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const UChar *line=lines[i].name; 6226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Skip comment lines (start with a character below 'A'). 6236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(line[0]<0x41) { 6246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org continue; 6256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 6266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UStringTrieResult result=trie->first(thaiCharToByte(line[0])); 6276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t lineLength=lines[i].len; 6286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for(int32_t j=1; j<lineLength; ++j) { 6296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(!USTRINGTRIE_HAS_NEXT(result)) { 6306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org fprintf(stderr, "word %ld (0-based) not found\n", (long)i); 6316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 6326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 6336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org result=trie->next(thaiCharToByte(line[j])); 6346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 6356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(!USTRINGTRIE_HAS_VALUE(result)) { 6366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org fprintf(stderr, "word %ld (0-based) not found\n", (long)i); 6376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 6386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 6396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 6406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org}; 6416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 6426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgUPerfFunction *DictionaryTriePerfTest::runIndexedTest(int32_t index, UBool exec, 6436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const char *&name, char * /*par*/) { 6446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(hasFile()) { 6456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org switch(index) { 6466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org case 0: 6476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org name="ucharstriematches"; 6486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(exec) { 6496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return new UCharsTrieDictMatches(*this); 6506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 6516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 6526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org case 1: 6536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org name="ucharstriecontains"; 6546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(exec) { 6556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return new UCharsTrieDictContains(*this); 6566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 6576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 6586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org case 2: 6596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org name="bytestriematches"; 6606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(exec) { 6616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return new BytesTrieDictMatches(*this); 6626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 6636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 6646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org case 3: 6656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org name="bytestriecontains"; 6666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(exec) { 6676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return new BytesTrieDictContains(*this); 6686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 6696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 6706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org default: 6716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org name=""; 6726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 6736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 6746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else { 6756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(index==0 && exec) { 6766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org puts("Running BytesTrie perf tests on the .dat package file from the --sourcedir.\n" 6776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org "For UCharsTrie perf tests on a dictionary text file, specify the -f or --file-name.\n"); 6786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 6796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org switch(index) { 6806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org case 0: 6816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org name="simplebinarysearch"; 6826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(exec) { 6836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return new BinarySearchPackageLookup(*this); 6846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 6856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 6866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org case 1: 6876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org name="prefixbinarysearch"; 6886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(exec) { 6896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return new PrefixBinarySearchPackageLookup(*this); 6906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 6916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 6926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org case 2: 6936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org name="bytestrie"; 6946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(exec) { 6956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return new BytesTriePackageLookup(*this); 6966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 6976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 6986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org default: 6996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org name=""; 7006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 7016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 7026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 7036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return NULL; 7046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 7056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 7066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgint main(int argc, const char *argv[]) { 7076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org IcuToolErrorCode errorCode("dicttrieperf main()"); 7086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org DictionaryTriePerfTest test(argc, argv, errorCode); 7096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(errorCode.isFailure()) { 7106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org fprintf(stderr, "DictionaryTriePerfTest() failed: %s\n", errorCode.errorName()); 7116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org test.usage(); 7126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return errorCode.reset(); 7136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 7146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(!test.run()) { 7156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org fprintf(stderr, "FAILED: Tests could not be run, please check the arguments.\n"); 7166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return -1; 7176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 7186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return 0; 7196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 720