1b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho/*
2b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho **********************************************************************
354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius *   Copyright (C) 2010-2012, International Business Machines
4b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho *   Corporation and others.  All Rights Reserved.
5b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho **********************************************************************
6b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho *  file name:  dicttrieperf.cpp
7b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho *  encoding:   US-ASCII
8b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho *  tab size:   8 (not used)
9b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho *  indentation:4
10b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho *
11b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho *  created on: 2010dec09
12b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho *  created by: Markus W. Scherer
13b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho *
14b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho *  Performance test program for dictionary-type tries.
15b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho *
16b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * Usage from within <ICU build tree>/test/perf/dicttrieperf/ :
17b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * (Linux)
18b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho *  make
19b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho *  export LD_LIBRARY_PATH=../../../lib:../../../stubdata:../../../tools/ctestfw
20b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho *  ./dicttrieperf --sourcedir <ICU build tree>/data/out/tmp --passes 3 --iterations 1000
21b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * or
22b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho *  ./dicttrieperf -f <ICU source tree>/source/data/brkitr/thaidict.txt --passes 3 --iterations 250
23b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho */
24b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
25b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#include <stdio.h>
26b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#include <stdlib.h>
27b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#include "unicode/bytestrie.h"
28b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#include "unicode/bytestriebuilder.h"
29b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#include "unicode/localpointer.h"
30b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#include "unicode/ucharstrie.h"
31b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#include "unicode/ucharstriebuilder.h"
32b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#include "unicode/uperf.h"
33b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#include "unicode/utext.h"
34b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#include "charstr.h"
35b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#include "package.h"
36b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#include "toolutil.h"
37b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#include "ucbuf.h"  // struct ULine
38b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#include "uoptions.h"
39b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#include "uvectr32.h"
40b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
41b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
42b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
43b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// Test object.
44b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoclass DictionaryTriePerfTest : public UPerfTest {
45b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehopublic:
46b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    DictionaryTriePerfTest(int32_t argc, const char *argv[], UErrorCode &status)
47b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            : UPerfTest(argc, argv, NULL, 0, "", status), numTextLines(0) {
48b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        if(hasFile()) {
49b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            getLines(status);
50b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            for(int32_t i=0; i<numLines; ++i) {
51b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                // Skip comment lines (start with a character below 'A').
52b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                if(lines[i].name[0]>=0x41) {
53b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                    ++numTextLines;
54b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                    // Remove trailing CR LF.
55b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                    int32_t len=lines[i].len;
56b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                    UChar c;
57b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                    while(len>0 && ((c=lines[i].name[len-1])==0xa || c==0xd)) {
58b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                        --len;
59b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                    }
60b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                    lines[i].len=len;
61b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                }
62b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            }
63b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        }
64b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    }
65b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
66b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    virtual UPerfFunction *runIndexedTest(int32_t index, UBool exec, const char *&name, char *par=NULL);
67b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
68b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    const char *getSourceDir() const { return sourceDir; }
69b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
70b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    UBool hasFile() const { return ucharBuf!=NULL; }
71b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    const ULine *getCachedLines() const { return lines; }
72b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    int32_t getNumLines() const { return numLines; }
73b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    int32_t numTextLines;  // excluding comment lines
74b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho};
75b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
76b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// Performance test function object.
77b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// Loads icudt46l.dat (or whatever its current versioned filename)
78b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// from the -s or --sourcedir path.
79b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoclass PackageLookup : public UPerfFunction {
80b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoprotected:
81b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    PackageLookup(const DictionaryTriePerfTest &perf) {
82b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        IcuToolErrorCode errorCode("PackageLookup()");
83b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        CharString filename(perf.getSourceDir(), errorCode);
84b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        int32_t filenameLength=filename.length();
85b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        if(filenameLength>0 && filename[filenameLength-1]!=U_FILE_SEP_CHAR &&
86b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                               filename[filenameLength-1]!=U_FILE_ALT_SEP_CHAR) {
87b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            filename.append(U_FILE_SEP_CHAR, errorCode);
88b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        }
89b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        filename.append(U_ICUDATA_NAME, errorCode);
90b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        filename.append(".dat", errorCode);
91b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        pkg.readPackage(filename.data());
92b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    }
93b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
94b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehopublic:
95b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    virtual ~PackageLookup() {}
96b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
97b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    // virtual void call(UErrorCode* pErrorCode) { ... }
98b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
99b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    virtual long getOperationsPerIteration() {
100b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        return pkg.getItemCount();
101b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    }
102b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
103b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    // virtual long getEventsPerIteration();
104b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
105b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoprotected:
106b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    Package pkg;
107b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho};
108b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
109b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehostruct TOCEntry {
110b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    int32_t nameOffset, dataOffset;
111b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho};
112b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
113b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// Similar to ICU 4.6 offsetTOCLookupFn() (in ucmndata.c).
114b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehostatic int32_t simpleBinarySearch(const char *s, const char *names, const TOCEntry *toc, int32_t count) {
115b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    int32_t start=0;
116b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    int32_t limit=count;
117b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    int32_t lastNumber=limit;
118b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    for(;;) {
119b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        int32_t number=(start+limit)/2;
120b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        if(lastNumber==number) {  // have we moved?
121b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            return -1;  // not found
122b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        }
123b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        lastNumber=number;
124b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        int32_t cmp=strcmp(s, names+toc[number].nameOffset);
125b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        if(cmp<0) {
126b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            limit=number;
127b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        } else if(cmp>0) {
128b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            start=number;
129b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        } else {  // found s
130b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            return number;
131b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        }
132b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    }
133b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho}
134b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
135b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoclass BinarySearchPackageLookup : public PackageLookup {
136b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehopublic:
137b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    BinarySearchPackageLookup(const DictionaryTriePerfTest &perf)
138b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            : PackageLookup(perf) {
139b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        IcuToolErrorCode errorCode("BinarySearchPackageLookup()");
140b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        int32_t count=pkg.getItemCount();
141b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        toc=new TOCEntry[count];
142b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        for(int32_t i=0; i<count; ++i) {
143b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            toc[i].nameOffset=itemNames.length();
144b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            toc[i].dataOffset=i;  // arbitrary value, see toc comment below
145b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            // The Package class removes the "icudt46l/" prefix.
146b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            // We restore that here for a fair performance test.
147b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            const char *name=pkg.getItem(i)->name;
148b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            itemNames.append("icudt46l/", errorCode);
149b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            itemNames.append(name, strlen(name)+1, errorCode);
150b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        }
151b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        printf("size of item names: %6ld\n", (long)itemNames.length());
152b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        printf("size of TOC:        %6ld\n", (long)(count*8));
153b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        printf("total index size:   %6ld\n", (long)(itemNames.length()+count*8));
154b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    }
155b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    virtual ~BinarySearchPackageLookup() {
156b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        delete[] toc;
157b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    }
158b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
159b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    virtual void call(UErrorCode * /*pErrorCode*/) {
160b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        int32_t count=pkg.getItemCount();
161b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        const char *itemNameChars=itemNames.data();
162b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        const char *name=itemNameChars;
163b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        for(int32_t i=0; i<count; ++i) {
164b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            if(simpleBinarySearch(name, itemNameChars, toc, count)<0) {
165b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                fprintf(stderr, "item not found: %s\n", name);
166b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            }
167b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            name=strchr(name, 0)+1;
168b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        }
169b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    }
170b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
171b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoprotected:
172b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    CharString itemNames;
173b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    // toc imitates a .dat file's array of UDataOffsetTOCEntry
174b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    // with nameOffset and dataOffset.
175b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    // We don't need the dataOffsets, but we want to imitate the real
176b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    // memory density, to measure equivalent CPU cache usage.
177b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    TOCEntry *toc;
178b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho};
179b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
180b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#ifndef MIN
181b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#define MIN(a,b) (((a)<(b)) ? (a) : (b))
182b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#endif
183b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
184b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// Compare strings where we know the shared prefix length,
185b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// and advance the prefix length as we find that the strings share even more characters.
186b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehostatic int32_t strcmpAfterPrefix(const char *s1, const char *s2, int32_t &prefixLength) {
187b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    int32_t pl=prefixLength;
188b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    s1+=pl;
189b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    s2+=pl;
190b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    int32_t cmp=0;
191b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    for(;;) {
192b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        int32_t c1=(uint8_t)*s1++;
193b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        int32_t c2=(uint8_t)*s2++;
194b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        cmp=c1-c2;
195b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        if(cmp!=0 || c1==0) {  // different or done
196b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            break;
197b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        }
198b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        ++pl;  // increment shared same-prefix length
199b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    }
200b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    prefixLength=pl;
201b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    return cmp;
202b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho}
203b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
204b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehostatic int32_t prefixBinarySearch(const char *s, const char *names, const TOCEntry *toc, int32_t count) {
205b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    if(count==0) {
206b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        return -1;
207b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    }
208b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    int32_t start=0;
209b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    int32_t limit=count;
210b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    // Remember the shared prefix between s, start and limit,
211b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    // and don't compare that shared prefix again.
212b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    // The shared prefix should get longer as we narrow the [start, limit[ range.
213b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    int32_t startPrefixLength=0;
214b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    int32_t limitPrefixLength=0;
215b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    // Prime the prefix lengths so that we don't keep prefixLength at 0 until
216b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    // both the start and limit indexes have moved.
217b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    // At the same time, we find if s is one of the start and (limit-1) names,
218b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    // and if not, exclude them from the actual binary search.
219b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    if(0==strcmpAfterPrefix(s, names+toc[0].nameOffset, startPrefixLength)) {
220b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        return 0;
221b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    }
222b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    ++start;
223b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    --limit;
224b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    if(0==strcmpAfterPrefix(s, names+toc[limit].nameOffset, limitPrefixLength)) {
225b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        return limit;
226b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    }
227b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    while(start<limit) {
228b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        int32_t i=(start+limit)/2;
229b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        int32_t prefixLength=MIN(startPrefixLength, limitPrefixLength);
230b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        int32_t cmp=strcmpAfterPrefix(s, names+toc[i].nameOffset, prefixLength);
231b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        if(cmp<0) {
232b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            limit=i;
233b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            limitPrefixLength=prefixLength;
234b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        } else if(cmp==0) {
235b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            return i;
236b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        } else {
237b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            start=i+1;
238b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            startPrefixLength=prefixLength;
239b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        }
240b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    }
241b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    return -1;
242b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho}
243b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
244b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoclass PrefixBinarySearchPackageLookup : public BinarySearchPackageLookup {
245b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehopublic:
246b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    PrefixBinarySearchPackageLookup(const DictionaryTriePerfTest &perf)
247b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            : BinarySearchPackageLookup(perf) {}
248b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
249b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    virtual void call(UErrorCode * /*pErrorCode*/) {
250b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        int32_t count=pkg.getItemCount();
251b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        const char *itemNameChars=itemNames.data();
252b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        const char *name=itemNameChars;
253b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        for(int32_t i=0; i<count; ++i) {
254b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            if(prefixBinarySearch(name, itemNameChars, toc, count)<0) {
255b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                fprintf(stderr, "item not found: %s\n", name);
256b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            }
257b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            name=strchr(name, 0)+1;
258b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        }
259b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    }
260b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho};
261b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
262b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehostatic int32_t bytesTrieLookup(const char *s, const char *nameTrieBytes) {
263b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    BytesTrie trie(nameTrieBytes);
264b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    if(USTRINGTRIE_HAS_VALUE(trie.next(s, -1))) {
265b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        return trie.getValue();
266b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    } else {
267b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        return -1;
268b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    }
269b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho}
270b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
271b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoclass BytesTriePackageLookup : public PackageLookup {
272b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehopublic:
273b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    BytesTriePackageLookup(const DictionaryTriePerfTest &perf)
274b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            : PackageLookup(perf) {
275b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        IcuToolErrorCode errorCode("BinarySearchPackageLookup()");
276b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        builder=new BytesTrieBuilder(errorCode);
277b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        int32_t count=pkg.getItemCount();
278b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        for(int32_t i=0; i<count; ++i) {
279b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            // The Package class removes the "icudt46l/" prefix.
280b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            // We restore that here for a fair performance test.
281b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            // We store all full names so that we do not have to reconstruct them
282b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            // in the call() function.
283b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            const char *name=pkg.getItem(i)->name;
284b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            int32_t offset=itemNames.length();
285b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            itemNames.append("icudt46l/", errorCode);
286b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            itemNames.append(name, -1, errorCode);
287b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            // As value, set the data item index.
288b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            // In a real implementation, we would use that to get the
289b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            // start and limit offset of the data item.
290b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            StringPiece fullName(itemNames.toStringPiece());
291b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            fullName.remove_prefix(offset);
292b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            builder->add(fullName, i, errorCode);
293b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            // NUL-terminate the name for call() to find the next one.
294b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            itemNames.append(0, errorCode);
295b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        }
296b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        int32_t length=builder->buildStringPiece(USTRINGTRIE_BUILD_SMALL, errorCode).length();
297b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        printf("size of BytesTrie:   %6ld\n", (long)length);
298b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        // count+1: +1 for the last-item limit offset which we should have always had
299b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        printf("size of dataOffsets:%6ld\n", (long)((count+1)*4));
300b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        printf("total index size:   %6ld\n", (long)(length+(count+1)*4));
301b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    }
302b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    virtual ~BytesTriePackageLookup() {
303b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        delete builder;
304b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    }
305b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
306b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    virtual void call(UErrorCode *pErrorCode) {
307b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        int32_t count=pkg.getItemCount();
308b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        const char *nameTrieBytes=builder->buildStringPiece(USTRINGTRIE_BUILD_SMALL, *pErrorCode).data();
309b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        const char *name=itemNames.data();
310b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        for(int32_t i=0; i<count; ++i) {
311b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            if(bytesTrieLookup(name, nameTrieBytes)<0) {
312b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                fprintf(stderr, "item not found: %s\n", name);
313b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            }
314b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            name=strchr(name, 0)+1;
315b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        }
316b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    }
317b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
318b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoprotected:
319b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    BytesTrieBuilder *builder;
320b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    CharString itemNames;
321b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho};
322b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
323b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// Performance test function object.
324b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// Each subclass loads a dictionary text file
325b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// from the -s or --sourcedir path plus -f or --file-name.
326b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// For example, <ICU source dir>/source/data/brkitr/thaidict.txt.
327b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoclass DictLookup : public UPerfFunction {
328b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehopublic:
329b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    DictLookup(const DictionaryTriePerfTest &perfTest) : perf(perfTest) {}
330b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
331b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    virtual long getOperationsPerIteration() {
332b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        return perf.numTextLines;
333b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    }
334b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
335b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoprotected:
336b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    const DictionaryTriePerfTest &perf;
337b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho};
338b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
339b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// Closely imitate CompactTrieDictionary::matches().
340b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// Note: CompactTrieDictionary::matches() is part of its trie implementation,
341b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// and while it loops over the text, it knows the current state.
342b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// By contrast, this implementation uses UCharsTrie API functions that have to
343b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// check the trie state each time and load/store state in the object.
344b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// (Whether it hasNext() and whether it is in the middle of a linear-match node.)
345b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehostatic int32_t
346b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoucharsTrieMatches(UCharsTrie &trie,
347b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                  UText *text, int32_t textLimit,
348b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                  int32_t *lengths, int &count, int limit ) {
349b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    UChar32 c=utext_next32(text);
350b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    // Notes:
351b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    // a) CompactTrieDictionary::matches() does not check for U_SENTINEL.
352b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    // b) It also ignores non-BMP code points by casting to UChar!
353b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    if(c<0) {
354b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        return 0;
355b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    }
356b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    // Should be firstForCodePoint() but CompactTrieDictionary
357b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    // handles only code units.
358b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    UStringTrieResult result=trie.first(c);
359b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    int32_t numChars=1;
360b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    count=0;
361b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    for(;;) {
362b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        if(USTRINGTRIE_HAS_VALUE(result)) {
363b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            if(count<limit) {
364b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                // lengths[count++]=(int32_t)utext_getNativeIndex(text);
365b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                lengths[count++]=numChars;  // CompactTrieDictionary just counts chars too.
366b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            }
367b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            if(result==USTRINGTRIE_FINAL_VALUE) {
368b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                break;
369b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            }
370b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        } else if(result==USTRINGTRIE_NO_MATCH) {
371b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            break;
372b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        }
373b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        if(numChars>=textLimit) {
374b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            // Note: Why do we have both a text limit and a UText that knows its length?
375b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            break;
376b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        }
377b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        UChar32 c=utext_next32(text);
378b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        // Notes:
379b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        // a) CompactTrieDictionary::matches() does not check for U_SENTINEL.
380b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        // b) It also ignores non-BMP code points by casting to UChar!
381b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        if(c<0) {
382b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            break;
383b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        }
384b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        ++numChars;
385b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        // Should be nextForCodePoint() but CompactTrieDictionary
386b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        // handles only code units.
387b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        result=trie.next(c);
388b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    }
389b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#if 0
390b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    // Note: CompactTrieDictionary::matches() comments say that it leaves the UText
391b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    // after the longest prefix match and returns the number of characters
392b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    // that were matched.
393b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    if(index!=lastMatch) {
394b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        utext_setNativeIndex(text, lastMatch);
395b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    }
396b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    return lastMatch-start;
397b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    // However, it does not do either of these, so I am not trying to
398b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    // imitate it (or its docs) 100%.
399b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#endif
400b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    return numChars;
401b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho}
402b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
403b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoclass UCharsTrieDictLookup : public DictLookup {
404b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehopublic:
405b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    UCharsTrieDictLookup(const DictionaryTriePerfTest &perfTest)
406b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            : DictLookup(perfTest), trie(NULL) {
407b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        IcuToolErrorCode errorCode("UCharsTrieDictLookup()");
408b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        builder=new UCharsTrieBuilder(errorCode);
409b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        const ULine *lines=perf.getCachedLines();
410b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        int32_t numLines=perf.getNumLines();
411b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        for(int32_t i=0; i<numLines; ++i) {
412b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            // Skip comment lines (start with a character below 'A').
413b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            if(lines[i].name[0]<0x41) {
414b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                continue;
415b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            }
416b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            builder->add(UnicodeString(FALSE, lines[i].name, lines[i].len), 0, errorCode);
417b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        }
418b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        UnicodeString trieUChars;
419b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        int32_t length=builder->buildUnicodeString(USTRINGTRIE_BUILD_SMALL, trieUChars, errorCode).length();
420b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        printf("size of UCharsTrie:          %6ld bytes\n", (long)length*2);
421b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        trie=builder->build(USTRINGTRIE_BUILD_SMALL, errorCode);
422b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    }
423b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
424b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    virtual ~UCharsTrieDictLookup() {
425b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        delete builder;
426b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        delete trie;
427b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    }
428b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
429b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoprotected:
430b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    UCharsTrieBuilder *builder;
431b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    UCharsTrie *trie;
432b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho};
433b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
434b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoclass UCharsTrieDictMatches : public UCharsTrieDictLookup {
435b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehopublic:
436b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    UCharsTrieDictMatches(const DictionaryTriePerfTest &perfTest)
437b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            : UCharsTrieDictLookup(perfTest) {}
438b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
439b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    virtual void call(UErrorCode *pErrorCode) {
440b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        UText text=UTEXT_INITIALIZER;
441b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        int32_t lengths[20];
442b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        const ULine *lines=perf.getCachedLines();
443b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        int32_t numLines=perf.getNumLines();
444b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        for(int32_t i=0; i<numLines; ++i) {
445b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            // Skip comment lines (start with a character below 'A').
446b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            if(lines[i].name[0]<0x41) {
447b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                continue;
448b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            }
449b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            utext_openUChars(&text, lines[i].name, lines[i].len, pErrorCode);
450b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            int32_t count=0;
451b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            ucharsTrieMatches(*trie, &text, lines[i].len,
452b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                              lengths, count, LENGTHOF(lengths));
453b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            if(count==0 || lengths[count-1]!=lines[i].len) {
454b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                fprintf(stderr, "word %ld (0-based) not found\n", (long)i);
455b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            }
456b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        }
457b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    }
458b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho};
459b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
460b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoclass UCharsTrieDictContains : public UCharsTrieDictLookup {
461b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehopublic:
462b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    UCharsTrieDictContains(const DictionaryTriePerfTest &perfTest)
463b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            : UCharsTrieDictLookup(perfTest) {}
464b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
465b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    virtual void call(UErrorCode * /*pErrorCode*/) {
466b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        const ULine *lines=perf.getCachedLines();
467b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        int32_t numLines=perf.getNumLines();
468b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        for(int32_t i=0; i<numLines; ++i) {
469b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            // Skip comment lines (which start with a character below 'A').
470b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            if(lines[i].name[0]<0x41) {
471b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                continue;
472b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            }
473b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            if(!USTRINGTRIE_HAS_VALUE(trie->reset().next(lines[i].name, lines[i].len))) {
474b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                fprintf(stderr, "word %ld (0-based) not found\n", (long)i);
475b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            }
476b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        }
477b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    }
478b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho};
479b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
480b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehostatic inline int32_t thaiCharToByte(UChar32 c) {
481b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    if(0xe00<=c && c<=0xefe) {
482b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        return c&0xff;
483b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    } else if(c==0x2e) {
484b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        return 0xff;
485b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    } else {
486b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        return -1;
487b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    }
488b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho}
489b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
490b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehostatic UBool thaiWordToBytes(const UChar *s, int32_t length,
491b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                             CharString &str, UErrorCode &errorCode) {
492b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    for(int32_t i=0; i<length; ++i) {
493b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        UChar c=s[i];
494b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        int32_t b=thaiCharToByte(c);
495b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        if(b>=0) {
496b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            str.append((char)b, errorCode);
497b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        } else {
498b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            fprintf(stderr, "thaiWordToBytes(): unable to encode U+%04X as a byte\n", c);
499b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            return FALSE;
500b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        }
501b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    }
502b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    return TRUE;
503b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho}
504b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
505b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoclass BytesTrieDictLookup : public DictLookup {
506b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehopublic:
507b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    BytesTrieDictLookup(const DictionaryTriePerfTest &perfTest)
508b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            : DictLookup(perfTest), trie(NULL), noDict(FALSE) {
509b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        IcuToolErrorCode errorCode("BytesTrieDictLookup()");
510b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        builder=new BytesTrieBuilder(errorCode);
511b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        CharString str;
512b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        const ULine *lines=perf.getCachedLines();
513b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        int32_t numLines=perf.getNumLines();
514b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        for(int32_t i=0; i<numLines; ++i) {
515b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            // Skip comment lines (start with a character below 'A').
516b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            if(lines[i].name[0]<0x41) {
517b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                continue;
518b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            }
519b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            if(!thaiWordToBytes(lines[i].name, lines[i].len, str.clear(), errorCode)) {
520b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                fprintf(stderr, "thaiWordToBytes(): failed for word %ld (0-based)\n", (long)i);
521b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                noDict=TRUE;
522b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                break;
523b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            }
524b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            builder->add(str.toStringPiece(), 0, errorCode);
525b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        }
526b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        if(!noDict) {
527b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            int32_t length=builder->buildStringPiece(USTRINGTRIE_BUILD_SMALL, errorCode).length();
528b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            printf("size of BytesTrie:           %6ld bytes\n", (long)length);
529b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            trie=builder->build(USTRINGTRIE_BUILD_SMALL, errorCode);
530b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        }
531b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    }
532b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
533b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    virtual ~BytesTrieDictLookup() {
534b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        delete builder;
535b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        delete trie;
536b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    }
537b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
538b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoprotected:
539b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    BytesTrieBuilder *builder;
540b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    BytesTrie *trie;
541b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    UBool noDict;
542b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho};
543b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
544b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehostatic int32_t
545b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehobytesTrieMatches(BytesTrie &trie,
546b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                 UText *text, int32_t textLimit,
547b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                 int32_t *lengths, int &count, int limit ) {
548b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    UChar32 c=utext_next32(text);
549b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    if(c<0) {
550b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        return 0;
551b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    }
552b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    UStringTrieResult result=trie.first(thaiCharToByte(c));
553b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    int32_t numChars=1;
554b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    count=0;
555b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    for(;;) {
556b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        if(USTRINGTRIE_HAS_VALUE(result)) {
557b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            if(count<limit) {
558b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                // lengths[count++]=(int32_t)utext_getNativeIndex(text);
559b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                lengths[count++]=numChars;  // CompactTrieDictionary just counts chars too.
560b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            }
561b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            if(result==USTRINGTRIE_FINAL_VALUE) {
562b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                break;
563b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            }
564b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        } else if(result==USTRINGTRIE_NO_MATCH) {
565b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            break;
566b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        }
567b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        if(numChars>=textLimit) {
568b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            break;
569b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        }
570b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        UChar32 c=utext_next32(text);
571b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        if(c<0) {
572b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            break;
573b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        }
574b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        ++numChars;
575b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        result=trie.next(thaiCharToByte(c));
576b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    }
577b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    return numChars;
578b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho}
579b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
580b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoclass BytesTrieDictMatches : public BytesTrieDictLookup {
581b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehopublic:
582b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    BytesTrieDictMatches(const DictionaryTriePerfTest &perfTest)
583b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            : BytesTrieDictLookup(perfTest) {}
584b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
585b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    virtual void call(UErrorCode *pErrorCode) {
586b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        if(noDict) {
587b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            return;
588b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        }
589b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        UText text=UTEXT_INITIALIZER;
590b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        int32_t lengths[20];
591b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        const ULine *lines=perf.getCachedLines();
592b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        int32_t numLines=perf.getNumLines();
593b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        for(int32_t i=0; i<numLines; ++i) {
594b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            // Skip comment lines (start with a character below 'A').
595b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            if(lines[i].name[0]<0x41) {
596b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                continue;
597b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            }
598b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            utext_openUChars(&text, lines[i].name, lines[i].len, pErrorCode);
599b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            int32_t count=0;
600b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            bytesTrieMatches(*trie, &text, lines[i].len,
601b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                             lengths, count, LENGTHOF(lengths));
602b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            if(count==0 || lengths[count-1]!=lines[i].len) {
603b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                fprintf(stderr, "word %ld (0-based) not found\n", (long)i);
604b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            }
605b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        }
606b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    }
607b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho};
608b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
609b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoclass BytesTrieDictContains : public BytesTrieDictLookup {
610b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehopublic:
611b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    BytesTrieDictContains(const DictionaryTriePerfTest &perfTest)
612b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            : BytesTrieDictLookup(perfTest) {}
613b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
614b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    virtual void call(UErrorCode * /*pErrorCode*/) {
615b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        if(noDict) {
616b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            return;
617b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        }
618b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        const ULine *lines=perf.getCachedLines();
619b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        int32_t numLines=perf.getNumLines();
620b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        for(int32_t i=0; i<numLines; ++i) {
621b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            const UChar *line=lines[i].name;
622b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            // Skip comment lines (start with a character below 'A').
623b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            if(line[0]<0x41) {
624b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                continue;
625b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            }
626b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            UStringTrieResult result=trie->first(thaiCharToByte(line[0]));
627b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            int32_t lineLength=lines[i].len;
628b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            for(int32_t j=1; j<lineLength; ++j) {
629b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                if(!USTRINGTRIE_HAS_NEXT(result)) {
630b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                    fprintf(stderr, "word %ld (0-based) not found\n", (long)i);
631b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                    break;
632b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                }
633b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                result=trie->next(thaiCharToByte(line[j]));
634b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            }
635b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            if(!USTRINGTRIE_HAS_VALUE(result)) {
636b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                fprintf(stderr, "word %ld (0-based) not found\n", (long)i);
637b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            }
638b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        }
639b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    }
640b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho};
641b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
642b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoUPerfFunction *DictionaryTriePerfTest::runIndexedTest(int32_t index, UBool exec,
643b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                                                      const char *&name, char * /*par*/) {
644b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    if(hasFile()) {
645b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        switch(index) {
646b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        case 0:
647b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            name="ucharstriematches";
648b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            if(exec) {
649b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                return new UCharsTrieDictMatches(*this);
650b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            }
651b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            break;
65254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        case 1:
653b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            name="ucharstriecontains";
654b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            if(exec) {
655b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                return new UCharsTrieDictContains(*this);
656b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            }
657b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            break;
65854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        case 2:
659b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            name="bytestriematches";
660b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            if(exec) {
661b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                return new BytesTrieDictMatches(*this);
662b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            }
663b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            break;
66454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        case 3:
665b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            name="bytestriecontains";
666b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            if(exec) {
667b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                return new BytesTrieDictContains(*this);
668b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            }
669b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            break;
670b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        default:
671b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            name="";
672b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            break;
673b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        }
674b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    } else {
675b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        if(index==0 && exec) {
676b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            puts("Running BytesTrie perf tests on the .dat package file from the --sourcedir.\n"
677b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                 "For UCharsTrie perf tests on a dictionary text file, specify the -f or --file-name.\n");
678b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        }
679b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        switch(index) {
680b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        case 0:
681b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            name="simplebinarysearch";
682b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            if(exec) {
683b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                return new BinarySearchPackageLookup(*this);
684b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            }
685b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            break;
686b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        case 1:
687b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            name="prefixbinarysearch";
688b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            if(exec) {
689b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                return new PrefixBinarySearchPackageLookup(*this);
690b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            }
691b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            break;
692b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        case 2:
693b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            name="bytestrie";
694b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            if(exec) {
695b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                return new BytesTriePackageLookup(*this);
696b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            }
697b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            break;
698b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        default:
699b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            name="";
700b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            break;
701b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        }
702b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    }
703b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    return NULL;
704b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho}
705b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
706b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoint main(int argc, const char *argv[]) {
707b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    IcuToolErrorCode errorCode("dicttrieperf main()");
708b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    DictionaryTriePerfTest test(argc, argv, errorCode);
709b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    if(errorCode.isFailure()) {
710b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        fprintf(stderr, "DictionaryTriePerfTest() failed: %s\n", errorCode.errorName());
711b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        test.usage();
712b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        return errorCode.reset();
713b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    }
714b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    if(!test.run()) {
715b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        fprintf(stderr, "FAILED: Tests could not be run, please check the arguments.\n");
716b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        return -1;
717b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    }
718b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    return 0;
719b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho}
720