1ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 2ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ****************************************************************************** 3ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Copyright (C) 1998-2003, 2006, International Business Machines Corporation * 4ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * and others. All Rights Reserved. * 5ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ****************************************************************************** 6ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 7ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 8ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include <errno.h> 9ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include <stdio.h> 10ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include <string.h> 11ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 12ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/utypes.h" 13ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/uchar.h" 14ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/uchriter.h" 15ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/brkiter.h" 16ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/locid.h" 17ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/unistr.h" 18ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/uniset.h" 19ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/ustring.h" 20ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 21ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 22ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * This program takes a Unicode text file containing Thai text with 
 * spaces inserted where the word breaks are. It computes a copy of
 * the text without spaces and uses a word instance of a Thai BreakIterator
 * to compute the word breaks. The program reports any differences in the
 * breaks.
 *
 * NOTE: by its very nature, Thai word breaking is not exact, so it is
 * expected that this program will always report some differences.
 */

/*
 * This class is a break iterator that counts words and spaces.
*/
class SpaceBreakIterator
{
public:
    // The constructor:
    //   text  - pointer to an array of UChars to iterate over
    //   count - the number of UChars in text
    SpaceBreakIterator(const UChar *text, int32_t count);

    // the destructor
    // NOTE(review): the definition is not visible in this part of the file;
    // presumably it releases fBreakIter - confirm. If it does, copying a
    // SpaceBreakIterator would free the iterator twice, since copying is
    // not disabled here.
    ~SpaceBreakIterator();

    // return next break position
    // (ThaiWordbreakTest::compareWordBreaks compares the result against
    // BreakIterator::DONE to detect the end of the text.)
    int32_t next();

    // return current word count
    int32_t getWordCount();

    // return current space count
    int32_t getSpaceCount();

private:
    // No arg constructor: private so clients can't call it.
    SpaceBreakIterator();

    // The underlying BreakIterator
    BreakIterator *fBreakIter;

    // address of the UChar array
    const UChar *fText;

    // number of UChars in fText
    int32_t fTextCount;

    // current word count
    int32_t fWordCount;

    // current space count
    int32_t fSpaceCount;

    // UnicodeSet of SA characters
    // (presumably the [:LineBreak=SA:] set, as built in generateFile - confirm)
    UnicodeSet fComplexContext;

    // true when fBreakIter has returned DONE
    UBool fDone;
};

/*
 * This is the main class.
It compares word breaks and reports the differences. 83ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 84ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruclass ThaiWordbreakTest 85ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 86ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querupublic: 87ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // The main constructor: 88ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // spaces - pointer to a UChar array for the text with spaces 89ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // spaceCount - the number of characters in the spaces array 90ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // noSpaces - pointer to a UChar array for the text without spaces 91ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // noSpaceCount - the number of characters in the noSpaces array 92ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // verbose - report all breaks if true, otherwise just report differences 93ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ThaiWordbreakTest(const UChar *spaces, int32_t spaceCount, const UChar *noSpaces, int32_t noSpaceCount, UBool verbose); 94ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ~ThaiWordbreakTest(); 95ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 96ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // returns the number of breaks that are in the spaces array 97ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // but aren't found in the noSpaces array 98ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t getBreaksNotFound(); 99ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 100ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // returns the number of breaks which are found in the noSpaces 101ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // array but aren't in the spaces array 
102ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t getInvalidBreaks(); 103ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 104ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // returns the number of words found in the spaces array 105ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t getWordCount(); 106ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 107ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // reads the input Unicode text file: 108ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // fileName - the path name of the file 109ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // charCount - set to the number of UChars read from the file 110ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // returns - the address of the UChar array containing the characters 111ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru static const UChar *readFile(char *fileName, int32_t &charCount); 112ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 113ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // removes spaces form the input UChar array: 114ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // spaces - pointer to the input UChar array 115ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // count - number of UChars in the spaces array 116ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // nonSpaceCount - the number of UChars in the result array 117ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // returns - the address of the UChar array with spaces removed 118ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru static const UChar *crunchSpaces(const UChar *spaces, int32_t count, int32_t &nonSpaceCount); 119ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 120ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruprivate: 121ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // The no 
arg constructor - private so clients can't call it 122ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ThaiWordbreakTest(); 123ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 124ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // This does the actual comparison: 125ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // spaces - the address of the UChar array for the text with spaces 126ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // spaceCount - the number of UChars in the spaces array 127ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // noSpaces - the address of the UChar array for the text without spaces 128ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // noSpaceCount - the number of UChars in the noSpaces array 129ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // returns - true if all breaks match, FALSE otherwise 130ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UBool compareWordBreaks(const UChar *spaces, int32_t spaceCount, 131ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const UChar *noSpaces, int32_t noSpaceCount); 132ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 133ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // helper method to report a break in the spaces 134ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // array that's not found in the noSpaces array 135ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru void breakNotFound(int32_t br); 136ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 137ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // helper method to report a break that's found in 138ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // the noSpaces array that's not in the spaces array 139ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru void foundInvalidBreak(int32_t br); 140ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 
141ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // count of breaks in the spaces array that 142ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // aren't found in the noSpaces array 143ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t fBreaksNotFound; 144ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 145ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // count of breaks found in the noSpaces array 146ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // that aren't in the spaces array 147ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t fInvalidBreaks; 148ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 149ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // number of words found in the spaces array 150ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t fWordCount; 151ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 152ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // report all breaks if true, otherwise just report differences 153ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UBool fVerbose; 154ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}; 155ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 156ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 157ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * The main constructor: it calls compareWordBreaks and reports any differences 158ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 159ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruThaiWordbreakTest::ThaiWordbreakTest(const UChar *spaces, int32_t spaceCount, 160ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const UChar *noSpaces, int32_t noSpaceCount, UBool verbose) 161ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru: fBreaksNotFound(0), fInvalidBreaks(0), fWordCount(0), fVerbose(verbose) 
162ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 163ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru compareWordBreaks(spaces, spaceCount, noSpaces, noSpaceCount); 164ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 165ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 166ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 167ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * The no arg constructor 168ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 169ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruThaiWordbreakTest::ThaiWordbreakTest() 170ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 171ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // nothing 172ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 173ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 174ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 175ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * The destructor 176ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 177ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruThaiWordbreakTest::~ThaiWordbreakTest() 178ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 179ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // nothing? 
180ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 181ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 182ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 183ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * returns the number of breaks in the spaces array 184ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * that aren't found in the noSpaces array 185ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 186ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruinline int32_t ThaiWordbreakTest::getBreaksNotFound() 187ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 188ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return fBreaksNotFound; 189ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 190ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 191ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 192ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Returns the number of breaks found in the noSpaces 193ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * array that aren't in the spaces array 194ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 195ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruinline int32_t ThaiWordbreakTest::getInvalidBreaks() 196ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 197ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return fInvalidBreaks; 198ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 199ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 200ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 201ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Returns the number of words found in the spaces array 202ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 203ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruinline int32_t ThaiWordbreakTest::getWordCount() 
204ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 205ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return fWordCount; 206ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 207ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 208ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 209ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * This method does the acutal break comparison and reports the results. 210ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * It uses a SpaceBreakIterator to iterate over the text with spaces, 211ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * and a word instance of a Thai BreakIterator to iterate over the text 212ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * without spaces. 213ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 214ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUBool ThaiWordbreakTest::compareWordBreaks(const UChar *spaces, int32_t spaceCount, 215ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const UChar *noSpaces, int32_t noSpaceCount) 216ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 217ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UBool result = TRUE; 218ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru Locale thai("th"); 219ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UCharCharacterIterator *noSpaceIter = new UCharCharacterIterator(noSpaces, noSpaceCount); 220ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 221ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 222ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru BreakIterator *breakIter = BreakIterator::createWordInstance(thai, status); 223ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru breakIter->adoptText(noSpaceIter); 224ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 
225ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru SpaceBreakIterator spaceIter(spaces, spaceCount); 226ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 227ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t nextBreak = 0; 228ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t nextSpaceBreak = 0; 229ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t iterCount = 0; 230ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 231ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru while (TRUE) { 232ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru nextSpaceBreak = spaceIter.next(); 233ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru nextBreak = breakIter->next(); 234ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 235ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (nextSpaceBreak == BreakIterator::DONE || nextBreak == BreakIterator::DONE) { 236ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (nextBreak != BreakIterator::DONE) { 237ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fprintf(stderr, "break iterator didn't end.\n"); 238ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else if (nextSpaceBreak != BreakIterator::DONE) { 239ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fprintf(stderr, "premature break iterator end.\n"); 240ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 241ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 242ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 243ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 244ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 245ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru while (nextSpaceBreak != nextBreak && 246ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru nextSpaceBreak != BreakIterator::DONE && nextBreak != BreakIterator::DONE) { 
247ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (nextSpaceBreak < nextBreak) { 248ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru breakNotFound(nextSpaceBreak); 249ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru result = FALSE; 250ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru nextSpaceBreak = spaceIter.next(); 251ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else if (nextSpaceBreak > nextBreak) { 252ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru foundInvalidBreak(nextBreak); 253ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru result = FALSE; 254ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru nextBreak = breakIter->next(); 255ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 256ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 257ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 258ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (fVerbose) { 259ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru printf("%d %d\n", nextSpaceBreak, nextBreak); 260ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 261ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 262ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 263ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 264ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fWordCount = spaceIter.getWordCount(); 265ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 266ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru delete breakIter; 267ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 268ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return result; 269ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 270ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 271ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 272ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Report 
a break that's in the text with spaces but 273ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * not found in the text without spaces. 274ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 275ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruvoid ThaiWordbreakTest::breakNotFound(int32_t br) 276ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 277ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (fVerbose) { 278ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru printf("%d ****\n", br); 279ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 280ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fprintf(stderr, "break not found: %d\n", br); 281ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 282ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 283ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fBreaksNotFound += 1; 284ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 285ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 286ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 287ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Report a break that's found in the text without spaces 288ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * that isn't in the text with spaces. 
289ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 290ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruvoid ThaiWordbreakTest::foundInvalidBreak(int32_t br) 291ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 292ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (fVerbose) { 293ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru printf("**** %d\n", br); 294ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 295ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fprintf(stderr, "found invalid break: %d\n", br); 296ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 297ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 298ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fInvalidBreaks += 1; 299ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 300ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 301ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 302ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Read the text from a file. The text must start with a Unicode Byte 303ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Order Mark (BOM) so that we know what order to read the bytes in. 
304ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 305ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruconst UChar *ThaiWordbreakTest::readFile(char *fileName, int32_t &charCount) 306ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 307ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru FILE *f; 308ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t fileSize; 309ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 310ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UChar *buffer; 311ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru char *bufferChars; 312ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 313ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru f = fopen(fileName, "rb"); 314ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 315ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if( f == NULL ) { 316ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fprintf(stderr,"Couldn't open %s reason: %s \n", fileName, strerror(errno)); 317ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return 0; 318ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 319ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 320ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fseek(f, 0, SEEK_END); 321ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fileSize = ftell(f); 322ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 323ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fseek(f, 0, SEEK_SET); 324ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru bufferChars = new char[fileSize]; 325ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 326ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(bufferChars == 0) { 327ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fprintf(stderr,"Couldn't get memory for reading %s reason: %s \n", fileName, strerror(errno)); 
328ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fclose(f); 329ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return 0; 330ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 331ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 332ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fread(bufferChars, sizeof(char), fileSize, f); 333ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if( ferror(f) ) { 334ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fprintf(stderr,"Couldn't read %s reason: %s \n", fileName, strerror(errno)); 335ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fclose(f); 336ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru delete[] bufferChars; 337ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return 0; 338ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 339ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fclose(f); 340ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 341ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UnicodeString myText(bufferChars, fileSize, "UTF-8"); 342ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 343ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru delete[] bufferChars; 344ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 345ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru charCount = myText.length(); 346ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru buffer = new UChar[charCount]; 347ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(buffer == 0) { 348ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fprintf(stderr,"Couldn't get memory for reading %s reason: %s \n", fileName, strerror(errno)); 349ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return 0; 350ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 351ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 
352ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru myText.extract(1, myText.length(), buffer); 353ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru charCount--; // skip the BOM 354ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru buffer[charCount] = 0; // NULL terminate for easier reading in the debugger 355ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 356ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return buffer; 357ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 358ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 359ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 360ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Remove spaces from the input UChar array. 361ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 362ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * We check explicitly for a Unicode code value of 0x0020 363ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * because Unicode::isSpaceChar returns true for CR, LF, etc. 
364ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 365ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 366ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruconst UChar *ThaiWordbreakTest::crunchSpaces(const UChar *spaces, int32_t count, int32_t &nonSpaceCount) 367ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 368ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t i, out, spaceCount; 369ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 370ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spaceCount = 0; 371ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for (i = 0; i < count; i += 1) { 372ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (spaces[i] == 0x0020 /*Unicode::isSpaceChar(spaces[i])*/) { 373ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spaceCount += 1; 374ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 375ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 376ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 377ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru nonSpaceCount = count - spaceCount; 378ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UChar *noSpaces = new UChar[nonSpaceCount]; 379ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 380ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (noSpaces == 0) { 381ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fprintf(stderr, "Couldn't allocate memory for the space stripped text.\n"); 382ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return 0; 383ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 384ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 385ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for (out = 0, i = 0; i < count; i += 1) { 386ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (spaces[i] != 0x0020 /*! 
Unicode::isSpaceChar(spaces[i])*/) { 387ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru noSpaces[out++] = spaces[i]; 388ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 389ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 390ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 391ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return noSpaces; 392ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 393ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 394ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 395ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Generate a text file with spaces in it from a file without. 396ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 397ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint generateFile(const UChar *chars, int32_t length) { 398ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru Locale root(""); 399ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UCharCharacterIterator *noSpaceIter = new UCharCharacterIterator(chars, length); 400ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 401ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 402ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UnicodeSet complexContext(UNICODE_STRING_SIMPLE("[:LineBreak=SA:]"), status); 403ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru BreakIterator *breakIter = BreakIterator::createWordInstance(root, status); 404ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru breakIter->adoptText(noSpaceIter); 405ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru char outbuf[1024]; 406ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t strlength; 407ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UChar bom = 0xFEFF; 408ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 
409ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru printf("%s", u_strToUTF8(outbuf, sizeof(outbuf), &strlength, &bom, 1, &status)); 410ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t prevbreak = 0; 411ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru while (U_SUCCESS(status)) { 412ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t nextbreak = breakIter->next(); 413ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (nextbreak == BreakIterator::DONE) { 414ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 415ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 416ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru printf("%s", u_strToUTF8(outbuf, sizeof(outbuf), &strlength, &chars[prevbreak], 417ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru nextbreak-prevbreak, &status)); 418ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (nextbreak > 0 && complexContext.contains(chars[nextbreak-1]) 419ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru && complexContext.contains(chars[nextbreak])) { 420ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru printf(" "); 421ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 422ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru prevbreak = nextbreak; 423ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 424ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 425ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (U_FAILURE(status)) { 426ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fprintf(stderr, "generate failed: %s\n", u_errorName(status)); 427ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return status; 428ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 429ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru else { 430ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return 0; 
431ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 432ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 433ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 434ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 435ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * The main routine. Read the command line arguments, read the text file, 436ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * remove the spaces, do the comparison and report the final results 437ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 438ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint main(int argc, char **argv) 439ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 440ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru char *fileName = "space.txt"; 441ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int arg = 1; 442ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UBool verbose = FALSE; 443ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UBool generate = FALSE; 444ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 445ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (argc >= 2 && strcmp(argv[1], "-generate") == 0) { 446ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru generate = TRUE; 447ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru arg += 1; 448ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 449ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 450ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (argc >= 2 && strcmp(argv[1], "-verbose") == 0) { 451ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru verbose = TRUE; 452ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru arg += 1; 453ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 454ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 455ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (arg == 
argc - 1) { 456ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fileName = argv[arg++]; 457ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 458ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 459ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (arg != argc) { 460ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fprintf(stderr, "Usage: %s [-verbose] [<file>]\n", argv[0]); 461ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return 1; 462ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 463ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 464ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t spaceCount, nonSpaceCount; 465ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const UChar *spaces, *noSpaces; 466ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 467ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spaces = ThaiWordbreakTest::readFile(fileName, spaceCount); 468ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 469ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (spaces == 0) { 470ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return 1; 471ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 472ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 473ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (generate) { 474ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return generateFile(spaces, spaceCount); 475ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 476ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 477ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru noSpaces = ThaiWordbreakTest::crunchSpaces(spaces, spaceCount, nonSpaceCount); 478ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 479ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (noSpaces == 0) { 480ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 
return 1; 481ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 482ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 483ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ThaiWordbreakTest test(spaces, spaceCount, noSpaces, nonSpaceCount, verbose); 484ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 485ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru printf("word count: %d\n", test.getWordCount()); 486ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru printf("breaks not found: %d\n", test.getBreaksNotFound()); 487ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru printf("invalid breaks found: %d\n", test.getInvalidBreaks()); 488ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 489ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return 0; 490ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 491ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 492ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 493ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * The main constructor. Clear all the counts and construct a default 494ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * word instance of a BreakIterator. 
495ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 496ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruSpaceBreakIterator::SpaceBreakIterator(const UChar *text, int32_t count) 497ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru : fBreakIter(0), fText(text), fTextCount(count), fWordCount(0), fSpaceCount(0), fDone(FALSE) 498ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 499ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UCharCharacterIterator *iter = new UCharCharacterIterator(text, count); 500ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 501ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fComplexContext.applyPattern(UNICODE_STRING_SIMPLE("[:LineBreak=SA:]"), status); 502ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru Locale root(""); 503ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 504ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fBreakIter = BreakIterator::createWordInstance(root, status); 505ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fBreakIter->adoptText(iter); 506ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 507ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 508ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruSpaceBreakIterator::SpaceBreakIterator() 509ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 510ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // nothing 511ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 512ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 513ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 514ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * The destructor. 
delete the underlying BreakIterator 515ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 516ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruSpaceBreakIterator::~SpaceBreakIterator() 517ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 518ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru delete fBreakIter; 519ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 520ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 521ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 522ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Return the next break, counting words and spaces. 523ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 524ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint32_t SpaceBreakIterator::next() 525ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 526ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (fDone) { 527ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return BreakIterator::DONE; 528ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 529ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 530ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t nextBreak; 531ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru do { 532ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru nextBreak = fBreakIter->next(); 533ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 534ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (nextBreak == BreakIterator::DONE) { 535ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fDone = TRUE; 536ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return BreakIterator::DONE; 537ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 538ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 539ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru while(nextBreak > 0 && fComplexContext.contains(fText[nextBreak-1]) 
540ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru && fComplexContext.contains(fText[nextBreak])); 541ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 542ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t result = nextBreak - fSpaceCount; 543ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 544ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (nextBreak < fTextCount) { 545ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (fText[nextBreak] == 0x0020 /*Unicode::isSpaceChar(fText[nextBreak])*/) { 546ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fSpaceCount += fBreakIter->next() - nextBreak; 547ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 548ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 549ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 550ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fWordCount += 1; 551ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 552ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return result; 553ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 554ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 555ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 556ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Returns the current space count 557ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 558ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint32_t SpaceBreakIterator::getSpaceCount() 559ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 560ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return fSpaceCount; 561ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 562ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 563ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 564ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Returns the current word count 
565ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 566ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint32_t SpaceBreakIterator::getWordCount() 567ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 568ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return fWordCount; 569ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 570ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 571ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 572