1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html#License
3/*
4 **********************************************************************
5 * Copyright (c) 2002-2008, International Business Machines           *
6 * Corporation and others.  All Rights Reserved.                      *
7 **********************************************************************
8 */
9package com.ibm.icu.dev.test.perf;
10
11import java.io.FileInputStream;
12import java.util.ArrayList;
13
14public class BreakIteratorPerformanceTest extends PerfTest {
15
16    String fileContents;
17
18    com.ibm.icu.text.BreakIterator iSentenceIter;
19    com.ibm.icu.text.BreakIterator iWordIter;
20    com.ibm.icu.text.BreakIterator iLineIter;
21    com.ibm.icu.text.BreakIterator iCharacterIter;
22    java.text.BreakIterator jSentenceIter;
23    java.text.BreakIterator jWordIter;
24    java.text.BreakIterator jLineIter;
25    java.text.BreakIterator jCharacterIter;
26    String[] iSentences;
27    String[] iWords;
28    String[] iLines;
29    String[] iCharacters;
30    String[] jSentences;
31    String[] jWords;
32    String[] jLines;
33    String[] jCharacters;
34
35    public static void main(String[] args) throws Exception {
36        new BreakIteratorPerformanceTest().run(args);
37    }
38
39    protected void setup(String[] args) {
40        try {
41            // read in the input file, being careful with a possible BOM
42            FileInputStream in = new FileInputStream(fileName);
43            BOMFreeReader reader = new BOMFreeReader(in, encoding);
44            fileContents = new String(readToEOS(reader));
45
46            // // get rid of any characters that may cause differences between ICU4J and Java BreakIterator
47            // // fileContents = fileContents.replaceAll("[\t\f\r\n\\-/ ]+", " ");
48            // String res = "";
49            // StringTokenizer tokenizer = new StringTokenizer(fileContents, "\t\f\r\n-/ ");
50            // while (tokenizer.hasMoreTokens())
51            // res += tokenizer.nextToken() + " ";
52            // fileContents = res.trim();
53
54            // create the break iterators with respect to locale
55            if (locale == null) {
56                iSentenceIter = com.ibm.icu.text.BreakIterator.getSentenceInstance();
57                iWordIter = com.ibm.icu.text.BreakIterator.getWordInstance();
58                iLineIter = com.ibm.icu.text.BreakIterator.getLineInstance();
59                iCharacterIter = com.ibm.icu.text.BreakIterator.getCharacterInstance();
60
61                jSentenceIter = java.text.BreakIterator.getSentenceInstance();
62                jWordIter = java.text.BreakIterator.getWordInstance();
63                jLineIter = java.text.BreakIterator.getLineInstance();
64                jCharacterIter = java.text.BreakIterator.getCharacterInstance();
65            } else {
66                iSentenceIter = com.ibm.icu.text.BreakIterator.getSentenceInstance(locale);
67                iWordIter = com.ibm.icu.text.BreakIterator.getWordInstance(locale);
68                iLineIter = com.ibm.icu.text.BreakIterator.getLineInstance(locale);
69                iCharacterIter = com.ibm.icu.text.BreakIterator.getCharacterInstance(locale);
70
71                jSentenceIter = java.text.BreakIterator.getSentenceInstance(locale);
72                jWordIter = java.text.BreakIterator.getWordInstance(locale);
73                jLineIter = java.text.BreakIterator.getLineInstance(locale);
74                jCharacterIter = java.text.BreakIterator.getCharacterInstance(locale);
75            }
76
77            iSentences = init(iSentenceIter);
78            iWords = init(iWordIter);
79            iLines = init(iLineIter);
80            iCharacters = init(iCharacterIter);
81            jSentences = init(jSentenceIter);
82            jWords = init(jWordIter);
83            jLines = init(jLineIter);
84            jCharacters = init(jCharacterIter);
85
86        } catch (Exception ex) {
87            ex.printStackTrace();
88            throw new RuntimeException(ex.getMessage());
89        }
90
91        // we created some heavy objects, so lets try to clean up a little before running the tests
92        gc();
93    }
94
95    private String[] init(com.ibm.icu.text.BreakIterator iter) {
96        // set the string to iterate on
97        iter.setText(fileContents);
98
99        // produce a token list
100        ArrayList tokenList = new ArrayList();
101        int start = iter.first();
102        for (int end = iter.next(); end != com.ibm.icu.text.BreakIterator.DONE; start = end, end = iter.next())
103            tokenList.add(fileContents.substring(start, end));
104
105        // return the token list as a string array
106        return (String[]) tokenList.toArray(new String[0]);
107    }
108
109    private String[] init(java.text.BreakIterator iter) {
110        // set the string to iterate on
111        iter.setText(fileContents);
112
113        // produce a token list
114        ArrayList tokenList = new ArrayList();
115        int start = iter.first();
116        for (int end = iter.next(); end != com.ibm.icu.text.BreakIterator.DONE; start = end, end = iter.next())
117            tokenList.add(fileContents.substring(start, end));
118
119        // return the token list as a string array
120        return (String[]) tokenList.toArray(new String[0]);
121    }
122
123    PerfTest.Function createTestICU(final com.ibm.icu.text.BreakIterator iIter, final String[] correct,
124            final String breakType) {
125        return new PerfTest.Function() {
126            public void call() {
127                int k = 0;
128                int start = iIter.first();
129                for (int end = iIter.next(); end != com.ibm.icu.text.BreakIterator.DONE; start = end, end = iIter
130                        .next())
131                    if (!correct[k++].equals(fileContents.substring(start, end)))
132                        throw new RuntimeException("ICU4J BreakIterator gave the wrong answer for " + breakType + " "
133                                + (k - 1) + " during the performance test. Cannot continue the performance test.");
134                if (k != correct.length)
135                    throw new RuntimeException("ICU4J BreakIterator gave the wrong number of " + breakType
136                            + "s during the performance test. Cannot continue the performance test.");
137            }
138
139            public long getOperationsPerIteration() {
140                return fileContents.length();
141            }
142        };
143    }
144
145    PerfTest.Function createTestJava(final java.text.BreakIterator jIter, final String[] correct, final String breakType) {
146        return new PerfTest.Function() {
147            public void call() {
148                int k = 0;
149                int start = jIter.first();
150                for (int end = jIter.next(); end != java.text.BreakIterator.DONE; start = end, end = jIter.next())
151                    if (!correct[k++].equals(fileContents.substring(start, end)))
152                        throw new RuntimeException("Java BreakIterator gave the wrong answer for " + breakType + " "
153                                + (k - 1) + " during the performance test. Cannot continue the performance test.");
154                if (k != correct.length)
155                    throw new RuntimeException("Java BreakIterator gave the wrong number of " + breakType
156                            + "s during the performance test. Cannot continue the performance test.");
157            }
158
159            public long getOperationsPerIteration() {
160                return fileContents.length();
161            }
162        };
163    }
164
165    PerfTest.Function TestICUSentences() {
166        return createTestICU(iSentenceIter, iSentences, "sentence");
167    }
168
169    PerfTest.Function TestICUWords() {
170        return createTestICU(iWordIter, iWords, "word");
171    }
172
173    PerfTest.Function TestICULines() {
174        return createTestICU(iLineIter, iLines, "line");
175    }
176
177    PerfTest.Function TestICUCharacters() {
178        return createTestICU(iCharacterIter, iCharacters, "character");
179    }
180
181    PerfTest.Function TestJavaSentences() {
182        return createTestJava(jSentenceIter, jSentences, "sentence");
183    }
184
185    PerfTest.Function TestJavaWords() {
186        return createTestJava(jWordIter, jWords, "word");
187    }
188
189    PerfTest.Function TestJavaLines() {
190        return createTestJava(jLineIter, jLines, "line");
191    }
192
193    PerfTest.Function TestJavaCharacters() {
194        return createTestJava(jCharacterIter, jCharacters, "character");
195    }
196}
197