1/* 2 ****************************************************************************** 3 * Copyright (C) 1998-2003, 2006, International Business Machines Corporation * 4 * and others. All Rights Reserved. * 5 ****************************************************************************** 6 */ 7 8#include <errno.h> 9#include <stdio.h> 10#include <string.h> 11 12#include "unicode/utypes.h" 13#include "unicode/uchar.h" 14#include "unicode/uchriter.h" 15#include "unicode/brkiter.h" 16#include "unicode/locid.h" 17#include "unicode/unistr.h" 18#include "unicode/uniset.h" 19#include "unicode/ustring.h" 20 21/* 22 * This program takes a Unicode text file containing Thai text with 23 * spaces inserted where the word breaks are. It computes a copy of 24 * the text without spaces and uses a word instance of a Thai BreakIterator 25 * to compute the word breaks. The program reports any differences in the 26 * breaks. 27 * 28 * NOTE: by it's very nature, Thai word breaking is not exact, so it is 29 * exptected that this program will always report some differences. 30 */ 31 32/* 33 * This class is a break iterator that counts words and spaces. 34 */ 35class SpaceBreakIterator 36{ 37public: 38 // The constructor: 39 // text - pointer to an array of UChars to iterate over 40 // count - the number of UChars in text 41 SpaceBreakIterator(const UChar *text, int32_t count); 42 43 // the destructor 44 ~SpaceBreakIterator(); 45 46 // return next break position 47 int32_t next(); 48 49 // return current word count 50 int32_t getWordCount(); 51 52 // return current space count 53 int32_t getSpaceCount(); 54 55private: 56 // No arg constructor: private so clients can't call it. 57 SpaceBreakIterator(); 58 59 // The underlying BreakIterator 60 BreakIterator *fBreakIter; 61 62 // address of the UChar array 63 const UChar *fText; 64 65 // number of UChars in fText 66 int32_t fTextCount; 67 68 // current word count 69 int32_t fWordCount; 70 71 // current space count 72 int32_t fSpaceCount; 73 74 // UnicodeSet of SA characters 75 UnicodeSet fComplexContext; 76 77 // true when fBreakIter has returned DONE 78 UBool fDone; 79}; 80 81/* 82 * This is the main class. It compares word breaks and reports the differences. 83 */ 84class ThaiWordbreakTest 85{ 86public: 87 // The main constructor: 88 // spaces - pointer to a UChar array for the text with spaces 89 // spaceCount - the number of characters in the spaces array 90 // noSpaces - pointer to a UChar array for the text without spaces 91 // noSpaceCount - the number of characters in the noSpaces array 92 // verbose - report all breaks if true, otherwise just report differences 93 ThaiWordbreakTest(const UChar *spaces, int32_t spaceCount, const UChar *noSpaces, int32_t noSpaceCount, UBool verbose); 94 ~ThaiWordbreakTest(); 95 96 // returns the number of breaks that are in the spaces array 97 // but aren't found in the noSpaces array 98 int32_t getBreaksNotFound(); 99 100 // returns the number of breaks which are found in the noSpaces 101 // array but aren't in the spaces array 102 int32_t getInvalidBreaks(); 103 104 // returns the number of words found in the spaces array 105 int32_t getWordCount(); 106 107 // reads the input Unicode text file: 108 // fileName - the path name of the file 109 // charCount - set to the number of UChars read from the file 110 // returns - the address of the UChar array containing the characters 111 static const UChar *readFile(char *fileName, int32_t &charCount); 112 113 // removes spaces form the input UChar array: 114 // spaces - pointer to the input UChar array 115 // count - number of UChars in the spaces array 116 // nonSpaceCount - the number of UChars in the result array 117 // returns - the address of the UChar array with spaces removed 118 static const UChar *crunchSpaces(const UChar *spaces, int32_t count, int32_t &nonSpaceCount); 119 120private: 121 // The no arg constructor - private so clients can't call it 122 ThaiWordbreakTest(); 123 124 // This does the actual comparison: 125 // spaces - the address of the UChar array for the text with spaces 126 // spaceCount - the number of UChars in the spaces array 127 // noSpaces - the address of the UChar array for the text without spaces 128 // noSpaceCount - the number of UChars in the noSpaces array 129 // returns - true if all breaks match, FALSE otherwise 130 UBool compareWordBreaks(const UChar *spaces, int32_t spaceCount, 131 const UChar *noSpaces, int32_t noSpaceCount); 132 133 // helper method to report a break in the spaces 134 // array that's not found in the noSpaces array 135 void breakNotFound(int32_t br); 136 137 // helper method to report a break that's found in 138 // the noSpaces array that's not in the spaces array 139 void foundInvalidBreak(int32_t br); 140 141 // count of breaks in the spaces array that 142 // aren't found in the noSpaces array 143 int32_t fBreaksNotFound; 144 145 // count of breaks found in the noSpaces array 146 // that aren't in the spaces array 147 int32_t fInvalidBreaks; 148 149 // number of words found in the spaces array 150 int32_t fWordCount; 151 152 // report all breaks if true, otherwise just report differences 153 UBool fVerbose; 154}; 155 156/* 157 * The main constructor: it calls compareWordBreaks and reports any differences 158 */ 159ThaiWordbreakTest::ThaiWordbreakTest(const UChar *spaces, int32_t spaceCount, 160 const UChar *noSpaces, int32_t noSpaceCount, UBool verbose) 161: fBreaksNotFound(0), fInvalidBreaks(0), fWordCount(0), fVerbose(verbose) 162{ 163 compareWordBreaks(spaces, spaceCount, noSpaces, noSpaceCount); 164} 165 166/* 167 * The no arg constructor 168 */ 169ThaiWordbreakTest::ThaiWordbreakTest() 170{ 171 // nothing 172} 173 174/* 175 * The destructor 176 */ 177ThaiWordbreakTest::~ThaiWordbreakTest() 178{ 179 // nothing? 180} 181 182/* 183 * returns the number of breaks in the spaces array 184 * that aren't found in the noSpaces array 185 */ 186inline int32_t ThaiWordbreakTest::getBreaksNotFound() 187{ 188 return fBreaksNotFound; 189} 190 191/* 192 * Returns the number of breaks found in the noSpaces 193 * array that aren't in the spaces array 194 */ 195inline int32_t ThaiWordbreakTest::getInvalidBreaks() 196{ 197 return fInvalidBreaks; 198} 199 200/* 201 * Returns the number of words found in the spaces array 202 */ 203inline int32_t ThaiWordbreakTest::getWordCount() 204{ 205 return fWordCount; 206} 207 208/* 209 * This method does the acutal break comparison and reports the results. 210 * It uses a SpaceBreakIterator to iterate over the text with spaces, 211 * and a word instance of a Thai BreakIterator to iterate over the text 212 * without spaces. 213 */ 214UBool ThaiWordbreakTest::compareWordBreaks(const UChar *spaces, int32_t spaceCount, 215 const UChar *noSpaces, int32_t noSpaceCount) 216{ 217 UBool result = TRUE; 218 Locale thai("th"); 219 UCharCharacterIterator *noSpaceIter = new UCharCharacterIterator(noSpaces, noSpaceCount); 220 UErrorCode status = U_ZERO_ERROR; 221 222 BreakIterator *breakIter = BreakIterator::createWordInstance(thai, status); 223 breakIter->adoptText(noSpaceIter); 224 225 SpaceBreakIterator spaceIter(spaces, spaceCount); 226 227 int32_t nextBreak = 0; 228 int32_t nextSpaceBreak = 0; 229 int32_t iterCount = 0; 230 231 while (TRUE) { 232 nextSpaceBreak = spaceIter.next(); 233 nextBreak = breakIter->next(); 234 235 if (nextSpaceBreak == BreakIterator::DONE || nextBreak == BreakIterator::DONE) { 236 if (nextBreak != BreakIterator::DONE) { 237 fprintf(stderr, "break iterator didn't end.\n"); 238 } else if (nextSpaceBreak != BreakIterator::DONE) { 239 fprintf(stderr, "premature break iterator end.\n"); 240 } 241 242 break; 243 } 244 245 while (nextSpaceBreak != nextBreak && 246 nextSpaceBreak != BreakIterator::DONE && nextBreak != BreakIterator::DONE) { 247 if (nextSpaceBreak < nextBreak) { 248 breakNotFound(nextSpaceBreak); 249 result = FALSE; 250 nextSpaceBreak = spaceIter.next(); 251 } else if (nextSpaceBreak > nextBreak) { 252 foundInvalidBreak(nextBreak); 253 result = FALSE; 254 nextBreak = breakIter->next(); 255 } 256 } 257 258 if (fVerbose) { 259 printf("%d %d\n", nextSpaceBreak, nextBreak); 260 } 261 } 262 263 264 fWordCount = spaceIter.getWordCount(); 265 266 delete breakIter; 267 268 return result; 269} 270 271/* 272 * Report a break that's in the text with spaces but 273 * not found in the text without spaces. 274 */ 275void ThaiWordbreakTest::breakNotFound(int32_t br) 276{ 277 if (fVerbose) { 278 printf("%d ****\n", br); 279 } else { 280 fprintf(stderr, "break not found: %d\n", br); 281 } 282 283 fBreaksNotFound += 1; 284} 285 286/* 287 * Report a break that's found in the text without spaces 288 * that isn't in the text with spaces. 289 */ 290void ThaiWordbreakTest::foundInvalidBreak(int32_t br) 291{ 292 if (fVerbose) { 293 printf("**** %d\n", br); 294 } else { 295 fprintf(stderr, "found invalid break: %d\n", br); 296 } 297 298 fInvalidBreaks += 1; 299} 300 301/* 302 * Read the text from a file. The text must start with a Unicode Byte 303 * Order Mark (BOM) so that we know what order to read the bytes in. 304 */ 305const UChar *ThaiWordbreakTest::readFile(char *fileName, int32_t &charCount) 306{ 307 FILE *f; 308 int32_t fileSize; 309 310 UChar *buffer; 311 char *bufferChars; 312 313 f = fopen(fileName, "rb"); 314 315 if( f == NULL ) { 316 fprintf(stderr,"Couldn't open %s reason: %s \n", fileName, strerror(errno)); 317 return 0; 318 } 319 320 fseek(f, 0, SEEK_END); 321 fileSize = ftell(f); 322 323 fseek(f, 0, SEEK_SET); 324 bufferChars = new char[fileSize]; 325 326 if(bufferChars == 0) { 327 fprintf(stderr,"Couldn't get memory for reading %s reason: %s \n", fileName, strerror(errno)); 328 fclose(f); 329 return 0; 330 } 331 332 fread(bufferChars, sizeof(char), fileSize, f); 333 if( ferror(f) ) { 334 fprintf(stderr,"Couldn't read %s reason: %s \n", fileName, strerror(errno)); 335 fclose(f); 336 delete[] bufferChars; 337 return 0; 338 } 339 fclose(f); 340 341 UnicodeString myText(bufferChars, fileSize, "UTF-8"); 342 343 delete[] bufferChars; 344 345 charCount = myText.length(); 346 buffer = new UChar[charCount]; 347 if(buffer == 0) { 348 fprintf(stderr,"Couldn't get memory for reading %s reason: %s \n", fileName, strerror(errno)); 349 return 0; 350 } 351 352 myText.extract(1, myText.length(), buffer); 353 charCount--; // skip the BOM 354 buffer[charCount] = 0; // NULL terminate for easier reading in the debugger 355 356 return buffer; 357} 358 359/* 360 * Remove spaces from the input UChar array. 361 * 362 * We check explicitly for a Unicode code value of 0x0020 363 * because Unicode::isSpaceChar returns true for CR, LF, etc. 364 * 365 */ 366const UChar *ThaiWordbreakTest::crunchSpaces(const UChar *spaces, int32_t count, int32_t &nonSpaceCount) 367{ 368 int32_t i, out, spaceCount; 369 370 spaceCount = 0; 371 for (i = 0; i < count; i += 1) { 372 if (spaces[i] == 0x0020 /*Unicode::isSpaceChar(spaces[i])*/) { 373 spaceCount += 1; 374 } 375 } 376 377 nonSpaceCount = count - spaceCount; 378 UChar *noSpaces = new UChar[nonSpaceCount]; 379 380 if (noSpaces == 0) { 381 fprintf(stderr, "Couldn't allocate memory for the space stripped text.\n"); 382 return 0; 383 } 384 385 for (out = 0, i = 0; i < count; i += 1) { 386 if (spaces[i] != 0x0020 /*! Unicode::isSpaceChar(spaces[i])*/) { 387 noSpaces[out++] = spaces[i]; 388 } 389 } 390 391 return noSpaces; 392} 393 394/* 395 * Generate a text file with spaces in it from a file without. 396 */ 397int generateFile(const UChar *chars, int32_t length) { 398 Locale root(""); 399 UCharCharacterIterator *noSpaceIter = new UCharCharacterIterator(chars, length); 400 UErrorCode status = U_ZERO_ERROR; 401 402 UnicodeSet complexContext(UNICODE_STRING_SIMPLE("[:LineBreak=SA:]"), status); 403 BreakIterator *breakIter = BreakIterator::createWordInstance(root, status); 404 breakIter->adoptText(noSpaceIter); 405 char outbuf[1024]; 406 int32_t strlength; 407 UChar bom = 0xFEFF; 408 409 printf("%s", u_strToUTF8(outbuf, sizeof(outbuf), &strlength, &bom, 1, &status)); 410 int32_t prevbreak = 0; 411 while (U_SUCCESS(status)) { 412 int32_t nextbreak = breakIter->next(); 413 if (nextbreak == BreakIterator::DONE) { 414 break; 415 } 416 printf("%s", u_strToUTF8(outbuf, sizeof(outbuf), &strlength, &chars[prevbreak], 417 nextbreak-prevbreak, &status)); 418 if (nextbreak > 0 && complexContext.contains(chars[nextbreak-1]) 419 && complexContext.contains(chars[nextbreak])) { 420 printf(" "); 421 } 422 prevbreak = nextbreak; 423 } 424 425 if (U_FAILURE(status)) { 426 fprintf(stderr, "generate failed: %s\n", u_errorName(status)); 427 return status; 428 } 429 else { 430 return 0; 431 } 432} 433 434/* 435 * The main routine. Read the command line arguments, read the text file, 436 * remove the spaces, do the comparison and report the final results 437 */ 438int main(int argc, char **argv) 439{ 440 char *fileName = "space.txt"; 441 int arg = 1; 442 UBool verbose = FALSE; 443 UBool generate = FALSE; 444 445 if (argc >= 2 && strcmp(argv[1], "-generate") == 0) { 446 generate = TRUE; 447 arg += 1; 448 } 449 450 if (argc >= 2 && strcmp(argv[1], "-verbose") == 0) { 451 verbose = TRUE; 452 arg += 1; 453 } 454 455 if (arg == argc - 1) { 456 fileName = argv[arg++]; 457 } 458 459 if (arg != argc) { 460 fprintf(stderr, "Usage: %s [-verbose] [<file>]\n", argv[0]); 461 return 1; 462 } 463 464 int32_t spaceCount, nonSpaceCount; 465 const UChar *spaces, *noSpaces; 466 467 spaces = ThaiWordbreakTest::readFile(fileName, spaceCount); 468 469 if (spaces == 0) { 470 return 1; 471 } 472 473 if (generate) { 474 return generateFile(spaces, spaceCount); 475 } 476 477 noSpaces = ThaiWordbreakTest::crunchSpaces(spaces, spaceCount, nonSpaceCount); 478 479 if (noSpaces == 0) { 480 return 1; 481 } 482 483 ThaiWordbreakTest test(spaces, spaceCount, noSpaces, nonSpaceCount, verbose); 484 485 printf("word count: %d\n", test.getWordCount()); 486 printf("breaks not found: %d\n", test.getBreaksNotFound()); 487 printf("invalid breaks found: %d\n", test.getInvalidBreaks()); 488 489 return 0; 490} 491 492/* 493 * The main constructor. Clear all the counts and construct a default 494 * word instance of a BreakIterator. 495 */ 496SpaceBreakIterator::SpaceBreakIterator(const UChar *text, int32_t count) 497 : fBreakIter(0), fText(text), fTextCount(count), fWordCount(0), fSpaceCount(0), fDone(FALSE) 498{ 499 UCharCharacterIterator *iter = new UCharCharacterIterator(text, count); 500 UErrorCode status = U_ZERO_ERROR; 501 fComplexContext.applyPattern(UNICODE_STRING_SIMPLE("[:LineBreak=SA:]"), status); 502 Locale root(""); 503 504 fBreakIter = BreakIterator::createWordInstance(root, status); 505 fBreakIter->adoptText(iter); 506} 507 508SpaceBreakIterator::SpaceBreakIterator() 509{ 510 // nothing 511} 512 513/* 514 * The destructor. delete the underlying BreakIterator 515 */ 516SpaceBreakIterator::~SpaceBreakIterator() 517{ 518 delete fBreakIter; 519} 520 521/* 522 * Return the next break, counting words and spaces. 523 */ 524int32_t SpaceBreakIterator::next() 525{ 526 if (fDone) { 527 return BreakIterator::DONE; 528 } 529 530 int32_t nextBreak; 531 do { 532 nextBreak = fBreakIter->next(); 533 534 if (nextBreak == BreakIterator::DONE) { 535 fDone = TRUE; 536 return BreakIterator::DONE; 537 } 538 } 539 while(nextBreak > 0 && fComplexContext.contains(fText[nextBreak-1]) 540 && fComplexContext.contains(fText[nextBreak])); 541 542 int32_t result = nextBreak - fSpaceCount; 543 544 if (nextBreak < fTextCount) { 545 if (fText[nextBreak] == 0x0020 /*Unicode::isSpaceChar(fText[nextBreak])*/) { 546 fSpaceCount += fBreakIter->next() - nextBreak; 547 } 548 } 549 550 fWordCount += 1; 551 552 return result; 553} 554 555/* 556 * Returns the current space count 557 */ 558int32_t SpaceBreakIterator::getSpaceCount() 559{ 560 return fSpaceCount; 561} 562 563/* 564 * Returns the current word count 565 */ 566int32_t SpaceBreakIterator::getWordCount() 567{ 568 return fWordCount; 569} 570 571 572