10596faeddefbf198de137d5e893708495ab1584cFredrik Roubert// © 2016 and later: Unicode, Inc. and others.
264339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert// License & terms of use: http://www.unicode.org/copyright.html
3ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/*
4ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ******************************************************************************
5ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Copyright (C) 1998-2003, 2006, International Business Machines Corporation *
6ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * and others. All Rights Reserved.                                           *
7ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ******************************************************************************
8ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */
9ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
10ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include <errno.h>
11ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include <stdio.h>
12ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include <string.h>
13ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
14ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/utypes.h"
15ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/uchar.h"
16ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/uchriter.h"
17ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/brkiter.h"
18ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/locid.h"
19ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/unistr.h"
20ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/uniset.h"
21ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/ustring.h"
22ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
23ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/*
24ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * This program takes a Unicode text file containing Thai text with
25ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * spaces inserted where the word breaks are. It computes a copy of
26ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * the text without spaces and uses a word instance of a Thai BreakIterator
27ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * to compute the word breaks. The program reports any differences in the
28ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * breaks.
29ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *
 * NOTE: by its very nature, Thai word breaking is not exact, so it is
 * expected that this program will always report some differences.
32ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */
33ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
34ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/*
35ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * This class is a break iterator that counts words and spaces.
36ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */
37ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruclass SpaceBreakIterator
38ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
39ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querupublic:
40ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // The constructor:
41ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // text  - pointer to an array of UChars to iterate over
42ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // count - the number of UChars in text
43ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    SpaceBreakIterator(const UChar *text, int32_t count);
44ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
45ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // the destructor
46ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    ~SpaceBreakIterator();
47ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
48ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // return next break position
49ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t next();
50ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
51ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // return current word count
52ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t getWordCount();
53ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
54ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // return current space count
55ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t getSpaceCount();
56ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
57ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruprivate:
58ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // No arg constructor: private so clients can't call it.
59ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    SpaceBreakIterator();
60ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
61ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // The underlying BreakIterator
62ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    BreakIterator *fBreakIter;
63ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
64ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // address of the UChar array
65ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    const UChar *fText;
66ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
67ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // number of UChars in fText
68ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t fTextCount;
69ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
70ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // current word count
71ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t fWordCount;
72ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
73ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // current space count
74ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t fSpaceCount;
75ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
76ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // UnicodeSet of SA characters
77ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UnicodeSet fComplexContext;
78ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
79ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // true when fBreakIter has returned DONE
80ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UBool fDone;
81ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru};
82ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
83ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/*
84ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * This is the main class. It compares word breaks and reports the differences.
85ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */
86ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruclass ThaiWordbreakTest
87ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
88ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querupublic:
89ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // The main constructor:
90ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // spaces       - pointer to a UChar array for the text with spaces
91ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // spaceCount   - the number of characters in the spaces array
92ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // noSpaces     - pointer to a UChar array for the text without spaces
93ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // noSpaceCount - the number of characters in the noSpaces array
94ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // verbose      - report all breaks if true, otherwise just report differences
95ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    ThaiWordbreakTest(const UChar *spaces, int32_t spaceCount, const UChar *noSpaces, int32_t noSpaceCount, UBool verbose);
96ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    ~ThaiWordbreakTest();
97ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
98ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // returns the number of breaks that are in the spaces array
99ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // but aren't found in the noSpaces array
100ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t getBreaksNotFound();
101ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
102ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // returns the number of breaks which are found in the noSpaces
103ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // array but aren't in the spaces array
104ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t getInvalidBreaks();
105ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
106ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // returns the number of words found in the spaces array
107ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t getWordCount();
108ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
109ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // reads the input Unicode text file:
110ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // fileName  - the path name of the file
111ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // charCount - set to the number of UChars read from the file
112ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // returns   - the address of the UChar array containing the characters
113ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    static const UChar *readFile(char *fileName, int32_t &charCount);
114ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
115ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // removes spaces form the input UChar array:
116ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // spaces        - pointer to the input UChar array
117ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // count         - number of UChars in the spaces array
118ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // nonSpaceCount - the number of UChars in the result array
119ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // returns       - the address of the UChar array with spaces removed
120ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    static const UChar *crunchSpaces(const UChar *spaces, int32_t count, int32_t &nonSpaceCount);
121ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
122ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruprivate:
123ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // The no arg constructor - private so clients can't call it
124ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    ThaiWordbreakTest();
125ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
126ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // This does the actual comparison:
127ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // spaces - the address of the UChar array for the text with spaces
128ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // spaceCount - the number of UChars in the spaces array
129ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // noSpaces   - the address of the UChar array for the text without spaces
130ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // noSpaceCount - the number of UChars in the noSpaces array
131ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // returns      - true if all breaks match, FALSE otherwise
132ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UBool compareWordBreaks(const UChar *spaces, int32_t spaceCount,
133ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                            const UChar *noSpaces, int32_t noSpaceCount);
134ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
135ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // helper method to report a break in the spaces
136ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // array that's not found in the noSpaces array
137ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    void breakNotFound(int32_t br);
138ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
139ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // helper method to report a break that's found in
140ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // the noSpaces array that's not in the spaces array
141ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    void foundInvalidBreak(int32_t br);
142ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
143ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // count of breaks in the spaces array that
144ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // aren't found in the noSpaces array
145ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t fBreaksNotFound;
146ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
147ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // count of breaks found in the noSpaces array
148ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // that aren't in the spaces array
149ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t fInvalidBreaks;
150ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
151ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // number of words found in the spaces array
152ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t fWordCount;
153ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
154ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // report all breaks if true, otherwise just report differences
155ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UBool fVerbose;
156ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru};
157ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
158ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/*
159ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * The main constructor: it calls compareWordBreaks and reports any differences
160ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */
161ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruThaiWordbreakTest::ThaiWordbreakTest(const UChar *spaces, int32_t spaceCount,
162ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                                     const UChar *noSpaces, int32_t noSpaceCount, UBool verbose)
163ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru: fBreaksNotFound(0), fInvalidBreaks(0), fWordCount(0), fVerbose(verbose)
164ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
165ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    compareWordBreaks(spaces, spaceCount, noSpaces, noSpaceCount);
166ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
167ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
168ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/*
169ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * The no arg constructor
170ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */
171ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruThaiWordbreakTest::ThaiWordbreakTest()
172ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
173ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // nothing
174ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
175ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
176ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/*
177ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * The destructor
178ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */
179ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruThaiWordbreakTest::~ThaiWordbreakTest()
180ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
181ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // nothing?
182ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
183ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
184ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/*
185ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * returns the number of breaks in the spaces array
186ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * that aren't found in the noSpaces array
187ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */
188ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruinline int32_t ThaiWordbreakTest::getBreaksNotFound()
189ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
190ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return fBreaksNotFound;
191ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
192ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
193ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/*
194ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Returns the number of breaks found in the noSpaces
195ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * array that aren't in the spaces array
196ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */
197ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruinline int32_t ThaiWordbreakTest::getInvalidBreaks()
198ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
199ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return fInvalidBreaks;
200ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
201ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
202ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/*
203ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Returns the number of words found in the spaces array
204ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */
205ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruinline int32_t ThaiWordbreakTest::getWordCount()
206ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
207ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return fWordCount;
208ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
209ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
210ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/*
211ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * This method does the acutal break comparison and reports the results.
212ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * It uses a SpaceBreakIterator to iterate over the text with spaces,
213ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * and a word instance of a Thai BreakIterator to iterate over the text
214ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * without spaces.
215ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */
216ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUBool ThaiWordbreakTest::compareWordBreaks(const UChar *spaces, int32_t spaceCount,
217ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                                           const UChar *noSpaces, int32_t noSpaceCount)
218ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
219ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UBool result = TRUE;
220ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    Locale thai("th");
221ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UCharCharacterIterator *noSpaceIter = new UCharCharacterIterator(noSpaces, noSpaceCount);
222ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UErrorCode status = U_ZERO_ERROR;
223ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
224ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    BreakIterator *breakIter = BreakIterator::createWordInstance(thai, status);
225ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    breakIter->adoptText(noSpaceIter);
226ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
227ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    SpaceBreakIterator spaceIter(spaces, spaceCount);
228ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
229ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t nextBreak = 0;
230ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t nextSpaceBreak = 0;
231ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t iterCount = 0;
232ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
233ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    while (TRUE) {
234ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        nextSpaceBreak = spaceIter.next();
235ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        nextBreak = breakIter->next();
236ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
237ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if (nextSpaceBreak == BreakIterator::DONE || nextBreak == BreakIterator::DONE) {
238ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if (nextBreak != BreakIterator::DONE) {
239ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                fprintf(stderr, "break iterator didn't end.\n");
240ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            } else if (nextSpaceBreak != BreakIterator::DONE) {
241ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                fprintf(stderr, "premature break iterator end.\n");
242ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
243ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
244ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            break;
245ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
246ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
247ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        while (nextSpaceBreak != nextBreak &&
248ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru               nextSpaceBreak != BreakIterator::DONE && nextBreak != BreakIterator::DONE) {
249ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if (nextSpaceBreak < nextBreak) {
250ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                breakNotFound(nextSpaceBreak);
251ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                result = FALSE;
252ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                nextSpaceBreak = spaceIter.next();
253ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            } else if (nextSpaceBreak > nextBreak) {
254ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                foundInvalidBreak(nextBreak);
255ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                result = FALSE;
256ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                nextBreak = breakIter->next();
257ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
258ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
259ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
260ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if (fVerbose) {
261ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            printf("%d   %d\n", nextSpaceBreak, nextBreak);
262ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
263ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
264ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
265ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
266ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fWordCount = spaceIter.getWordCount();
267ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
268ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    delete breakIter;
269ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
270ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return result;
271ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
272ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
273ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/*
274ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Report a break that's in the text with spaces but
275ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * not found in the text without spaces.
276ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */
277ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruvoid ThaiWordbreakTest::breakNotFound(int32_t br)
278ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
279ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (fVerbose) {
280ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        printf("%d   ****\n", br);
281ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    } else {
282ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        fprintf(stderr, "break not found: %d\n", br);
283ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
284ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
285ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fBreaksNotFound += 1;
286ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
287ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
288ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/*
289ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Report a break that's found in the text without spaces
290ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * that isn't in the text with spaces.
291ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */
292ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruvoid ThaiWordbreakTest::foundInvalidBreak(int32_t br)
293ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
294ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (fVerbose) {
295ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        printf("****   %d\n", br);
296ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    } else {
297ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        fprintf(stderr, "found invalid break: %d\n", br);
298ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
299ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
300ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fInvalidBreaks += 1;
301ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
302ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
303ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/*
304ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Read the text from a file. The text must start with a Unicode Byte
305ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Order Mark (BOM) so that we know what order to read the bytes in.
306ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */
307ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruconst UChar *ThaiWordbreakTest::readFile(char *fileName, int32_t &charCount)
308ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
309ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    FILE *f;
310ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t fileSize;
311ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
312ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UChar *buffer;
313ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    char *bufferChars;
314ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
315ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    f = fopen(fileName, "rb");
316ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
317ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if( f == NULL ) {
318ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        fprintf(stderr,"Couldn't open %s reason: %s \n", fileName, strerror(errno));
319ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return 0;
320ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
321ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
322ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fseek(f, 0, SEEK_END);
323ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fileSize = ftell(f);
324ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
325ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fseek(f, 0, SEEK_SET);
326ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    bufferChars = new char[fileSize];
327ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
328ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(bufferChars == 0) {
329ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        fprintf(stderr,"Couldn't get memory for reading %s reason: %s \n", fileName, strerror(errno));
330ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        fclose(f);
331ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return 0;
332ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
333ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
334ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fread(bufferChars, sizeof(char), fileSize, f);
335ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if( ferror(f) ) {
336ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        fprintf(stderr,"Couldn't read %s reason: %s \n", fileName, strerror(errno));
337ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        fclose(f);
338ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        delete[] bufferChars;
339ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return 0;
340ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
341ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fclose(f);
342ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
343ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UnicodeString myText(bufferChars, fileSize, "UTF-8");
344ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
345ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    delete[] bufferChars;
346ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
347ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    charCount = myText.length();
348ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    buffer = new UChar[charCount];
349ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(buffer == 0) {
350ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        fprintf(stderr,"Couldn't get memory for reading %s reason: %s \n", fileName, strerror(errno));
351ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return 0;
352ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
353ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
354ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    myText.extract(1, myText.length(), buffer);
355ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    charCount--;  // skip the BOM
356ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    buffer[charCount] = 0;    // NULL terminate for easier reading in the debugger
357ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
358ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return buffer;
359ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
360ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
361ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/*
362ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Remove spaces from the input UChar array.
363ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *
364ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * We check explicitly for a Unicode code value of 0x0020
365ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * because Unicode::isSpaceChar returns true for CR, LF, etc.
366ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *
367ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */
368ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruconst UChar *ThaiWordbreakTest::crunchSpaces(const UChar *spaces, int32_t count, int32_t &nonSpaceCount)
369ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
370ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t i, out, spaceCount;
371ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
372ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    spaceCount = 0;
373ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    for (i = 0; i < count; i += 1) {
374ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if (spaces[i] == 0x0020 /*Unicode::isSpaceChar(spaces[i])*/) {
375ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            spaceCount += 1;
376ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
377ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
378ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
379ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    nonSpaceCount = count - spaceCount;
380ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UChar *noSpaces = new UChar[nonSpaceCount];
381ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
382ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (noSpaces == 0) {
383ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        fprintf(stderr, "Couldn't allocate memory for the space stripped text.\n");
384ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return 0;
385ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
386ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
387ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    for (out = 0, i = 0; i < count; i += 1) {
388ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if (spaces[i] != 0x0020 /*! Unicode::isSpaceChar(spaces[i])*/) {
389ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            noSpaces[out++] = spaces[i];
390ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
391ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
392ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
393ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return noSpaces;
394ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
395ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
396ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/*
397ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Generate a text file with spaces in it from a file without.
398ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */
399ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint generateFile(const UChar *chars, int32_t length) {
400ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    Locale root("");
401ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UCharCharacterIterator *noSpaceIter = new UCharCharacterIterator(chars, length);
402ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UErrorCode status = U_ZERO_ERROR;
403ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
404ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UnicodeSet complexContext(UNICODE_STRING_SIMPLE("[:LineBreak=SA:]"), status);
405ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    BreakIterator *breakIter = BreakIterator::createWordInstance(root, status);
406ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    breakIter->adoptText(noSpaceIter);
407ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    char outbuf[1024];
408ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t strlength;
409ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UChar bom = 0xFEFF;
410ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
411ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    printf("%s", u_strToUTF8(outbuf, sizeof(outbuf), &strlength, &bom, 1, &status));
412ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t prevbreak = 0;
413ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    while (U_SUCCESS(status)) {
414ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        int32_t nextbreak = breakIter->next();
415ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if (nextbreak == BreakIterator::DONE) {
416ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            break;
417ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
418ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        printf("%s", u_strToUTF8(outbuf, sizeof(outbuf), &strlength, &chars[prevbreak],
419ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                                    nextbreak-prevbreak, &status));
420ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if (nextbreak > 0 && complexContext.contains(chars[nextbreak-1])
421ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            && complexContext.contains(chars[nextbreak])) {
422ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            printf(" ");
423ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
424ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        prevbreak = nextbreak;
425ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
426ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
427ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (U_FAILURE(status)) {
428ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        fprintf(stderr, "generate failed: %s\n", u_errorName(status));
429ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return status;
430ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
431ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    else {
432ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return 0;
433ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
434ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
435ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
436ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/*
437ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * The main routine. Read the command line arguments, read the text file,
438ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * remove the spaces, do the comparison and report the final results
439ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */
440ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint main(int argc, char **argv)
441ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
442ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    char *fileName = "space.txt";
443ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int arg = 1;
444ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UBool verbose = FALSE;
445ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UBool generate = FALSE;
446ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
447ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (argc >= 2 && strcmp(argv[1], "-generate") == 0) {
448ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        generate = TRUE;
449ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        arg += 1;
450ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
451ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
452ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (argc >= 2 && strcmp(argv[1], "-verbose") == 0) {
453ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        verbose = TRUE;
454ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        arg += 1;
455ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
456ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
457ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (arg == argc - 1) {
458ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        fileName = argv[arg++];
459ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
460ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
461ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (arg != argc) {
462ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        fprintf(stderr, "Usage: %s [-verbose] [<file>]\n", argv[0]);
463ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return 1;
464ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
465ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
466ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t spaceCount, nonSpaceCount;
467ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    const UChar *spaces, *noSpaces;
468ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
469ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    spaces = ThaiWordbreakTest::readFile(fileName, spaceCount);
470ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
471ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (spaces == 0) {
472ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return 1;
473ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
474ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
475ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (generate) {
476ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return generateFile(spaces, spaceCount);
477ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
478ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
479ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    noSpaces = ThaiWordbreakTest::crunchSpaces(spaces, spaceCount, nonSpaceCount);
480ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
481ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (noSpaces == 0) {
482ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return 1;
483ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
484ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
485ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    ThaiWordbreakTest test(spaces, spaceCount, noSpaces, nonSpaceCount, verbose);
486ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
487ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    printf("word count: %d\n", test.getWordCount());
488ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    printf("breaks not found: %d\n", test.getBreaksNotFound());
489ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    printf("invalid breaks found: %d\n", test.getInvalidBreaks());
490ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
491ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return 0;
492ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
493ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
494ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/*
495ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * The main constructor. Clear all the counts and construct a default
496ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * word instance of a BreakIterator.
497ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */
498ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruSpaceBreakIterator::SpaceBreakIterator(const UChar *text, int32_t count)
499ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  : fBreakIter(0), fText(text), fTextCount(count), fWordCount(0), fSpaceCount(0), fDone(FALSE)
500ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
501ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UCharCharacterIterator *iter = new UCharCharacterIterator(text, count);
502ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UErrorCode status = U_ZERO_ERROR;
503ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fComplexContext.applyPattern(UNICODE_STRING_SIMPLE("[:LineBreak=SA:]"), status);
504ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    Locale root("");
505ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
506ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fBreakIter = BreakIterator::createWordInstance(root, status);
507ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fBreakIter->adoptText(iter);
508ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
509ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
510ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruSpaceBreakIterator::SpaceBreakIterator()
511ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
512ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // nothing
513ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
514ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
515ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/*
516ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * The destructor. delete the underlying BreakIterator
517ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */
518ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruSpaceBreakIterator::~SpaceBreakIterator()
519ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
520ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    delete fBreakIter;
521ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
522ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
523ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/*
524ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Return the next break, counting words and spaces.
525ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */
526ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint32_t SpaceBreakIterator::next()
527ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
528ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (fDone) {
529ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return BreakIterator::DONE;
530ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
531ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
532ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t nextBreak;
533ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    do {
534ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        nextBreak = fBreakIter->next();
535ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
536ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if (nextBreak == BreakIterator::DONE) {
537ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            fDone = TRUE;
538ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            return BreakIterator::DONE;
539ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
540ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
541ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    while(nextBreak > 0 && fComplexContext.contains(fText[nextBreak-1])
542ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            && fComplexContext.contains(fText[nextBreak]));
543ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
544ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru   int32_t result = nextBreak - fSpaceCount;
545ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
546ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (nextBreak < fTextCount) {
547ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if (fText[nextBreak] == 0x0020 /*Unicode::isSpaceChar(fText[nextBreak])*/) {
548ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            fSpaceCount += fBreakIter->next() - nextBreak;
549ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
550ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
551ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
552ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fWordCount += 1;
553ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
554ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return result;
555ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
556ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
557ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/*
558ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Returns the current space count
559ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */
560ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint32_t SpaceBreakIterator::getSpaceCount()
561ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
562ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return fSpaceCount;
563ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
564ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
565ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/*
566ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Returns the current word count
567ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */
568ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint32_t SpaceBreakIterator::getWordCount()
569ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
570ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return fWordCount;
571ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
572ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
573ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
574