10596faeddefbf198de137d5e893708495ab1584cFredrik Roubert// © 2016 and later: Unicode, Inc. and others. 264339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert// License & terms of use: http://www.unicode.org/copyright.html 3ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 4ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ****************************************************************************** 5ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Copyright (C) 1998-2003, 2006, International Business Machines Corporation * 6ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * and others. All Rights Reserved. * 7ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ****************************************************************************** 8ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 9ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 10ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include <errno.h> 11ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include <stdio.h> 12ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include <string.h> 13ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 14ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/utypes.h" 15ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/uchar.h" 16ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/uchriter.h" 17ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/brkiter.h" 18ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/locid.h" 19ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/unistr.h" 20ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/uniset.h" 21ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/ustring.h" 22ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 
/*
 * This program takes a Unicode text file containing Thai text with
 * spaces inserted where the word breaks are. It computes a copy of
 * the text without spaces and uses a word instance of a Thai BreakIterator
 * to compute the word breaks. The program reports any differences in the
 * breaks.
 *
 * NOTE: by its very nature, Thai word breaking is not exact, so it is
 * expected that this program will always report some differences.
 */

/*
 * This class is a break iterator that counts words and spaces.
36ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 37ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruclass SpaceBreakIterator 38ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 39ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querupublic: 40ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // The constructor: 41ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // text - pointer to an array of UChars to iterate over 42ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // count - the number of UChars in text 43ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru SpaceBreakIterator(const UChar *text, int32_t count); 44ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 45ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // the destructor 46ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ~SpaceBreakIterator(); 47ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 48ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // return next break position 49ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t next(); 50ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 51ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // return current word count 52ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t getWordCount(); 53ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 54ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // return current space count 55ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t getSpaceCount(); 56ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 57ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruprivate: 58ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // No arg constructor: private so clients can't call it. 
59ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru SpaceBreakIterator(); 60ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 61ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // The underlying BreakIterator 62ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru BreakIterator *fBreakIter; 63ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 64ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // address of the UChar array 65ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const UChar *fText; 66ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 67ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // number of UChars in fText 68ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t fTextCount; 69ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 70ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // current word count 71ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t fWordCount; 72ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 73ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // current space count 74ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t fSpaceCount; 75ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 76ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // UnicodeSet of SA characters 77ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UnicodeSet fComplexContext; 78ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 79ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // true when fBreakIter has returned DONE 80ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UBool fDone; 81ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}; 82ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 83ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 84ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * This is the main class. 
It compares word breaks and reports the differences. 85ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 86ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruclass ThaiWordbreakTest 87ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 88ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querupublic: 89ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // The main constructor: 90ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // spaces - pointer to a UChar array for the text with spaces 91ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // spaceCount - the number of characters in the spaces array 92ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // noSpaces - pointer to a UChar array for the text without spaces 93ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // noSpaceCount - the number of characters in the noSpaces array 94ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // verbose - report all breaks if true, otherwise just report differences 95ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ThaiWordbreakTest(const UChar *spaces, int32_t spaceCount, const UChar *noSpaces, int32_t noSpaceCount, UBool verbose); 96ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ~ThaiWordbreakTest(); 97ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 98ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // returns the number of breaks that are in the spaces array 99ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // but aren't found in the noSpaces array 100ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t getBreaksNotFound(); 101ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 102ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // returns the number of breaks which are found in the noSpaces 103ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // array but aren't in the spaces array 
104ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t getInvalidBreaks(); 105ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 106ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // returns the number of words found in the spaces array 107ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t getWordCount(); 108ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 109ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // reads the input Unicode text file: 110ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // fileName - the path name of the file 111ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // charCount - set to the number of UChars read from the file 112ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // returns - the address of the UChar array containing the characters 113ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru static const UChar *readFile(char *fileName, int32_t &charCount); 114ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 115ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // removes spaces form the input UChar array: 116ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // spaces - pointer to the input UChar array 117ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // count - number of UChars in the spaces array 118ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // nonSpaceCount - the number of UChars in the result array 119ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // returns - the address of the UChar array with spaces removed 120ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru static const UChar *crunchSpaces(const UChar *spaces, int32_t count, int32_t &nonSpaceCount); 121ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 122ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruprivate: 123ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // The no 
arg constructor - private so clients can't call it 124ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ThaiWordbreakTest(); 125ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 126ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // This does the actual comparison: 127ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // spaces - the address of the UChar array for the text with spaces 128ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // spaceCount - the number of UChars in the spaces array 129ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // noSpaces - the address of the UChar array for the text without spaces 130ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // noSpaceCount - the number of UChars in the noSpaces array 131ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // returns - true if all breaks match, FALSE otherwise 132ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UBool compareWordBreaks(const UChar *spaces, int32_t spaceCount, 133ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const UChar *noSpaces, int32_t noSpaceCount); 134ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 135ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // helper method to report a break in the spaces 136ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // array that's not found in the noSpaces array 137ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru void breakNotFound(int32_t br); 138ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 139ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // helper method to report a break that's found in 140ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // the noSpaces array that's not in the spaces array 141ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru void foundInvalidBreak(int32_t br); 142ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 
143ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // count of breaks in the spaces array that 144ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // aren't found in the noSpaces array 145ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t fBreaksNotFound; 146ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 147ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // count of breaks found in the noSpaces array 148ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // that aren't in the spaces array 149ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t fInvalidBreaks; 150ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 151ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // number of words found in the spaces array 152ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t fWordCount; 153ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 154ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // report all breaks if true, otherwise just report differences 155ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UBool fVerbose; 156ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}; 157ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 158ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 159ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * The main constructor: it calls compareWordBreaks and reports any differences 160ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 161ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruThaiWordbreakTest::ThaiWordbreakTest(const UChar *spaces, int32_t spaceCount, 162ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const UChar *noSpaces, int32_t noSpaceCount, UBool verbose) 163ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru: fBreaksNotFound(0), fInvalidBreaks(0), fWordCount(0), fVerbose(verbose) 
164ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 165ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru compareWordBreaks(spaces, spaceCount, noSpaces, noSpaceCount); 166ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 167ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 168ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 169ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * The no arg constructor 170ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 171ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruThaiWordbreakTest::ThaiWordbreakTest() 172ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 173ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // nothing 174ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 175ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 176ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 177ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * The destructor 178ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 179ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruThaiWordbreakTest::~ThaiWordbreakTest() 180ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 181ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // nothing? 
182ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 183ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 184ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 185ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * returns the number of breaks in the spaces array 186ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * that aren't found in the noSpaces array 187ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 188ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruinline int32_t ThaiWordbreakTest::getBreaksNotFound() 189ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 190ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return fBreaksNotFound; 191ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 192ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 193ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 194ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Returns the number of breaks found in the noSpaces 195ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * array that aren't in the spaces array 196ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 197ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruinline int32_t ThaiWordbreakTest::getInvalidBreaks() 198ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 199ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return fInvalidBreaks; 200ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 201ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 202ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 203ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Returns the number of words found in the spaces array 204ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 205ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruinline int32_t ThaiWordbreakTest::getWordCount() 
206ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 207ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return fWordCount; 208ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 209ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 210ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 211ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * This method does the acutal break comparison and reports the results. 212ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * It uses a SpaceBreakIterator to iterate over the text with spaces, 213ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * and a word instance of a Thai BreakIterator to iterate over the text 214ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * without spaces. 215ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 216ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUBool ThaiWordbreakTest::compareWordBreaks(const UChar *spaces, int32_t spaceCount, 217ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const UChar *noSpaces, int32_t noSpaceCount) 218ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 219ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UBool result = TRUE; 220ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru Locale thai("th"); 221ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UCharCharacterIterator *noSpaceIter = new UCharCharacterIterator(noSpaces, noSpaceCount); 222ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 223ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 224ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru BreakIterator *breakIter = BreakIterator::createWordInstance(thai, status); 225ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru breakIter->adoptText(noSpaceIter); 226ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 
227ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru SpaceBreakIterator spaceIter(spaces, spaceCount); 228ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 229ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t nextBreak = 0; 230ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t nextSpaceBreak = 0; 231ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t iterCount = 0; 232ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 233ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru while (TRUE) { 234ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru nextSpaceBreak = spaceIter.next(); 235ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru nextBreak = breakIter->next(); 236ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 237ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (nextSpaceBreak == BreakIterator::DONE || nextBreak == BreakIterator::DONE) { 238ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (nextBreak != BreakIterator::DONE) { 239ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fprintf(stderr, "break iterator didn't end.\n"); 240ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else if (nextSpaceBreak != BreakIterator::DONE) { 241ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fprintf(stderr, "premature break iterator end.\n"); 242ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 243ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 244ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 245ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 246ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 247ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru while (nextSpaceBreak != nextBreak && 248ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru nextSpaceBreak != BreakIterator::DONE && nextBreak != BreakIterator::DONE) { 
249ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (nextSpaceBreak < nextBreak) { 250ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru breakNotFound(nextSpaceBreak); 251ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru result = FALSE; 252ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru nextSpaceBreak = spaceIter.next(); 253ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else if (nextSpaceBreak > nextBreak) { 254ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru foundInvalidBreak(nextBreak); 255ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru result = FALSE; 256ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru nextBreak = breakIter->next(); 257ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 258ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 259ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 260ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (fVerbose) { 261ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru printf("%d %d\n", nextSpaceBreak, nextBreak); 262ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 263ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 264ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 265ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 266ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fWordCount = spaceIter.getWordCount(); 267ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 268ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru delete breakIter; 269ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 270ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return result; 271ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 272ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 273ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 274ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Report 
a break that's in the text with spaces but 275ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * not found in the text without spaces. 276ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 277ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruvoid ThaiWordbreakTest::breakNotFound(int32_t br) 278ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 279ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (fVerbose) { 280ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru printf("%d ****\n", br); 281ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 282ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fprintf(stderr, "break not found: %d\n", br); 283ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 284ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 285ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fBreaksNotFound += 1; 286ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 287ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 288ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 289ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Report a break that's found in the text without spaces 290ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * that isn't in the text with spaces. 
291ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 292ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruvoid ThaiWordbreakTest::foundInvalidBreak(int32_t br) 293ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 294ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (fVerbose) { 295ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru printf("**** %d\n", br); 296ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 297ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fprintf(stderr, "found invalid break: %d\n", br); 298ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 299ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 300ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fInvalidBreaks += 1; 301ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 302ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 303ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 304ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Read the text from a file. The text must start with a Unicode Byte 305ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Order Mark (BOM) so that we know what order to read the bytes in. 
306ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 307ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruconst UChar *ThaiWordbreakTest::readFile(char *fileName, int32_t &charCount) 308ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 309ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru FILE *f; 310ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t fileSize; 311ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 312ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UChar *buffer; 313ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru char *bufferChars; 314ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 315ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru f = fopen(fileName, "rb"); 316ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 317ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if( f == NULL ) { 318ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fprintf(stderr,"Couldn't open %s reason: %s \n", fileName, strerror(errno)); 319ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return 0; 320ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 321ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 322ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fseek(f, 0, SEEK_END); 323ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fileSize = ftell(f); 324ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 325ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fseek(f, 0, SEEK_SET); 326ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru bufferChars = new char[fileSize]; 327ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 328ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(bufferChars == 0) { 329ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fprintf(stderr,"Couldn't get memory for reading %s reason: %s \n", fileName, strerror(errno)); 
330ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fclose(f); 331ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return 0; 332ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 333ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 334ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fread(bufferChars, sizeof(char), fileSize, f); 335ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if( ferror(f) ) { 336ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fprintf(stderr,"Couldn't read %s reason: %s \n", fileName, strerror(errno)); 337ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fclose(f); 338ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru delete[] bufferChars; 339ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return 0; 340ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 341ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fclose(f); 342ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 343ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UnicodeString myText(bufferChars, fileSize, "UTF-8"); 344ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 345ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru delete[] bufferChars; 346ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 347ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru charCount = myText.length(); 348ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru buffer = new UChar[charCount]; 349ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(buffer == 0) { 350ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fprintf(stderr,"Couldn't get memory for reading %s reason: %s \n", fileName, strerror(errno)); 351ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return 0; 352ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 353ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 
354ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru myText.extract(1, myText.length(), buffer); 355ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru charCount--; // skip the BOM 356ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru buffer[charCount] = 0; // NULL terminate for easier reading in the debugger 357ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 358ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return buffer; 359ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 360ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 361ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 362ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Remove spaces from the input UChar array. 363ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 364ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * We check explicitly for a Unicode code value of 0x0020 365ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * because Unicode::isSpaceChar returns true for CR, LF, etc. 
366ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 367ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 368ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruconst UChar *ThaiWordbreakTest::crunchSpaces(const UChar *spaces, int32_t count, int32_t &nonSpaceCount) 369ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 370ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t i, out, spaceCount; 371ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 372ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spaceCount = 0; 373ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for (i = 0; i < count; i += 1) { 374ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (spaces[i] == 0x0020 /*Unicode::isSpaceChar(spaces[i])*/) { 375ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spaceCount += 1; 376ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 377ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 378ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 379ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru nonSpaceCount = count - spaceCount; 380ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UChar *noSpaces = new UChar[nonSpaceCount]; 381ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 382ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (noSpaces == 0) { 383ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fprintf(stderr, "Couldn't allocate memory for the space stripped text.\n"); 384ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return 0; 385ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 386ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 387ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for (out = 0, i = 0; i < count; i += 1) { 388ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (spaces[i] != 0x0020 /*! 
Unicode::isSpaceChar(spaces[i])*/) { 389ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru noSpaces[out++] = spaces[i]; 390ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 391ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 392ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 393ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return noSpaces; 394ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 395ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 396ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 397ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Generate a text file with spaces in it from a file without. 398ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 399ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint generateFile(const UChar *chars, int32_t length) { 400ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru Locale root(""); 401ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UCharCharacterIterator *noSpaceIter = new UCharCharacterIterator(chars, length); 402ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 403ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 404ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UnicodeSet complexContext(UNICODE_STRING_SIMPLE("[:LineBreak=SA:]"), status); 405ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru BreakIterator *breakIter = BreakIterator::createWordInstance(root, status); 406ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru breakIter->adoptText(noSpaceIter); 407ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru char outbuf[1024]; 408ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t strlength; 409ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UChar bom = 0xFEFF; 410ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 
411ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru printf("%s", u_strToUTF8(outbuf, sizeof(outbuf), &strlength, &bom, 1, &status)); 412ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t prevbreak = 0; 413ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru while (U_SUCCESS(status)) { 414ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t nextbreak = breakIter->next(); 415ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (nextbreak == BreakIterator::DONE) { 416ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 417ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 418ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru printf("%s", u_strToUTF8(outbuf, sizeof(outbuf), &strlength, &chars[prevbreak], 419ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru nextbreak-prevbreak, &status)); 420ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (nextbreak > 0 && complexContext.contains(chars[nextbreak-1]) 421ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru && complexContext.contains(chars[nextbreak])) { 422ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru printf(" "); 423ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 424ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru prevbreak = nextbreak; 425ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 426ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 427ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (U_FAILURE(status)) { 428ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fprintf(stderr, "generate failed: %s\n", u_errorName(status)); 429ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return status; 430ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 431ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru else { 432ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return 0; 
433ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 434ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 435ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 436ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 437ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * The main routine. Read the command line arguments, read the text file, 438ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * remove the spaces, do the comparison and report the final results 439ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 440ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint main(int argc, char **argv) 441ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 442ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru char *fileName = "space.txt"; 443ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int arg = 1; 444ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UBool verbose = FALSE; 445ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UBool generate = FALSE; 446ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 447ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (argc >= 2 && strcmp(argv[1], "-generate") == 0) { 448ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru generate = TRUE; 449ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru arg += 1; 450ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 451ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 452ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (argc >= 2 && strcmp(argv[1], "-verbose") == 0) { 453ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru verbose = TRUE; 454ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru arg += 1; 455ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 456ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 457ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (arg == 
argc - 1) { 458ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fileName = argv[arg++]; 459ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 460ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 461ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (arg != argc) { 462ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fprintf(stderr, "Usage: %s [-verbose] [<file>]\n", argv[0]); 463ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return 1; 464ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 465ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 466ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t spaceCount, nonSpaceCount; 467ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const UChar *spaces, *noSpaces; 468ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 469ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spaces = ThaiWordbreakTest::readFile(fileName, spaceCount); 470ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 471ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (spaces == 0) { 472ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return 1; 473ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 474ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 475ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (generate) { 476ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return generateFile(spaces, spaceCount); 477ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 478ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 479ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru noSpaces = ThaiWordbreakTest::crunchSpaces(spaces, spaceCount, nonSpaceCount); 480ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 481ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (noSpaces == 0) { 482ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 
return 1; 483ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 484ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 485ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ThaiWordbreakTest test(spaces, spaceCount, noSpaces, nonSpaceCount, verbose); 486ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 487ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru printf("word count: %d\n", test.getWordCount()); 488ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru printf("breaks not found: %d\n", test.getBreaksNotFound()); 489ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru printf("invalid breaks found: %d\n", test.getInvalidBreaks()); 490ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 491ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return 0; 492ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 493ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 494ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 495ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * The main constructor. Clear all the counts and construct a default 496ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * word instance of a BreakIterator. 
497ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 498ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruSpaceBreakIterator::SpaceBreakIterator(const UChar *text, int32_t count) 499ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru : fBreakIter(0), fText(text), fTextCount(count), fWordCount(0), fSpaceCount(0), fDone(FALSE) 500ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 501ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UCharCharacterIterator *iter = new UCharCharacterIterator(text, count); 502ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 503ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fComplexContext.applyPattern(UNICODE_STRING_SIMPLE("[:LineBreak=SA:]"), status); 504ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru Locale root(""); 505ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 506ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fBreakIter = BreakIterator::createWordInstance(root, status); 507ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fBreakIter->adoptText(iter); 508ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 509ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 510ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruSpaceBreakIterator::SpaceBreakIterator() 511ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 512ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // nothing 513ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 514ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 515ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 516ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * The destructor. 
delete the underlying BreakIterator 517ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 518ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruSpaceBreakIterator::~SpaceBreakIterator() 519ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 520ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru delete fBreakIter; 521ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 522ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 523ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 524ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Return the next break, counting words and spaces. 525ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 526ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint32_t SpaceBreakIterator::next() 527ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 528ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (fDone) { 529ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return BreakIterator::DONE; 530ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 531ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 532ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t nextBreak; 533ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru do { 534ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru nextBreak = fBreakIter->next(); 535ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 536ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (nextBreak == BreakIterator::DONE) { 537ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fDone = TRUE; 538ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return BreakIterator::DONE; 539ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 540ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 541ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru while(nextBreak > 0 && fComplexContext.contains(fText[nextBreak-1]) 
542ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru && fComplexContext.contains(fText[nextBreak])); 543ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 544ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t result = nextBreak - fSpaceCount; 545ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 546ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (nextBreak < fTextCount) { 547ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (fText[nextBreak] == 0x0020 /*Unicode::isSpaceChar(fText[nextBreak])*/) { 548ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fSpaceCount += fBreakIter->next() - nextBreak; 549ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 550ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 551ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 552ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fWordCount += 1; 553ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 554ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return result; 555ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 556ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 557ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 558ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Returns the current space count 559ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 560ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint32_t SpaceBreakIterator::getSpaceCount() 561ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 562ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return fSpaceCount; 563ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 564ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 565ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 566ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Returns the current word count 
567ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 568ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint32_t SpaceBreakIterator::getWordCount() 569ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 570ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return fWordCount; 571ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 572ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 573ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 574