thaitest.cpp revision c73f511526464f8e56c242df80552e9b0d94ae3d
1/*
2 ******************************************************************************
3 * Copyright (C) 1998-2003, 2006, International Business Machines Corporation *
4 * and others. All Rights Reserved.                                           *
5 ******************************************************************************
6 */
7
8#include <errno.h>
9#include <stdio.h>
10#include <string.h>
11
12#include "unicode/utypes.h"
13#include "unicode/uchar.h"
14#include "unicode/uchriter.h"
15#include "unicode/brkiter.h"
16#include "unicode/locid.h"
17#include "unicode/unistr.h"
18#include "unicode/uniset.h"
19#include "unicode/ustring.h"
20
/*
 * This program takes a Unicode text file containing Thai text with
 * spaces inserted where the word breaks are. It computes a copy of
 * the text without spaces and uses a word instance of a Thai BreakIterator
 * to compute the word breaks. The program reports any differences in the
 * breaks.
 *
 * NOTE: by its very nature, Thai word breaking is not exact, so it is
 * expected that this program will always report some differences.
 */
31
32/*
33 * This class is a break iterator that counts words and spaces.
34 */
35class SpaceBreakIterator
36{
37public:
38    // The constructor:
39    // text  - pointer to an array of UChars to iterate over
40    // count - the number of UChars in text
41    SpaceBreakIterator(const UChar *text, int32_t count);
42
43    // the destructor
44    ~SpaceBreakIterator();
45
46    // return next break position
47    int32_t next();
48
49    // return current word count
50    int32_t getWordCount();
51
52    // return current space count
53    int32_t getSpaceCount();
54
55private:
56    // No arg constructor: private so clients can't call it.
57    SpaceBreakIterator();
58
59    // The underlying BreakIterator
60    BreakIterator *fBreakIter;
61
62    // address of the UChar array
63    const UChar *fText;
64
65    // number of UChars in fText
66    int32_t fTextCount;
67
68    // current word count
69    int32_t fWordCount;
70
71    // current space count
72    int32_t fSpaceCount;
73
74    // UnicodeSet of SA characters
75    UnicodeSet fComplexContext;
76
77    // true when fBreakIter has returned DONE
78    UBool fDone;
79};
80
81/*
82 * This is the main class. It compares word breaks and reports the differences.
83 */
84class ThaiWordbreakTest
85{
86public:
87    // The main constructor:
88    // spaces       - pointer to a UChar array for the text with spaces
89    // spaceCount   - the number of characters in the spaces array
90    // noSpaces     - pointer to a UChar array for the text without spaces
91    // noSpaceCount - the number of characters in the noSpaces array
92    // verbose      - report all breaks if true, otherwise just report differences
93    ThaiWordbreakTest(const UChar *spaces, int32_t spaceCount, const UChar *noSpaces, int32_t noSpaceCount, UBool verbose);
94    ~ThaiWordbreakTest();
95
96    // returns the number of breaks that are in the spaces array
97    // but aren't found in the noSpaces array
98    int32_t getBreaksNotFound();
99
100    // returns the number of breaks which are found in the noSpaces
101    // array but aren't in the spaces array
102    int32_t getInvalidBreaks();
103
104    // returns the number of words found in the spaces array
105    int32_t getWordCount();
106
107    // reads the input Unicode text file:
108    // fileName  - the path name of the file
109    // charCount - set to the number of UChars read from the file
110    // returns   - the address of the UChar array containing the characters
111    static const UChar *readFile(char *fileName, int32_t &charCount);
112
113    // removes spaces form the input UChar array:
114    // spaces        - pointer to the input UChar array
115    // count         - number of UChars in the spaces array
116    // nonSpaceCount - the number of UChars in the result array
117    // returns       - the address of the UChar array with spaces removed
118    static const UChar *crunchSpaces(const UChar *spaces, int32_t count, int32_t &nonSpaceCount);
119
120private:
121    // The no arg constructor - private so clients can't call it
122    ThaiWordbreakTest();
123
124    // This does the actual comparison:
125    // spaces - the address of the UChar array for the text with spaces
126    // spaceCount - the number of UChars in the spaces array
127    // noSpaces   - the address of the UChar array for the text without spaces
128    // noSpaceCount - the number of UChars in the noSpaces array
129    // returns      - true if all breaks match, FALSE otherwise
130    UBool compareWordBreaks(const UChar *spaces, int32_t spaceCount,
131                            const UChar *noSpaces, int32_t noSpaceCount);
132
133    // helper method to report a break in the spaces
134    // array that's not found in the noSpaces array
135    void breakNotFound(int32_t br);
136
137    // helper method to report a break that's found in
138    // the noSpaces array that's not in the spaces array
139    void foundInvalidBreak(int32_t br);
140
141    // count of breaks in the spaces array that
142    // aren't found in the noSpaces array
143    int32_t fBreaksNotFound;
144
145    // count of breaks found in the noSpaces array
146    // that aren't in the spaces array
147    int32_t fInvalidBreaks;
148
149    // number of words found in the spaces array
150    int32_t fWordCount;
151
152    // report all breaks if true, otherwise just report differences
153    UBool fVerbose;
154};
155
156/*
157 * The main constructor: it calls compareWordBreaks and reports any differences
158 */
159ThaiWordbreakTest::ThaiWordbreakTest(const UChar *spaces, int32_t spaceCount,
160                                     const UChar *noSpaces, int32_t noSpaceCount, UBool verbose)
161: fBreaksNotFound(0), fInvalidBreaks(0), fWordCount(0), fVerbose(verbose)
162{
163    compareWordBreaks(spaces, spaceCount, noSpaces, noSpaceCount);
164}
165
166/*
167 * The no arg constructor
168 */
169ThaiWordbreakTest::ThaiWordbreakTest()
170{
171    // nothing
172}
173
174/*
175 * The destructor
176 */
177ThaiWordbreakTest::~ThaiWordbreakTest()
178{
179    // nothing?
180}
181
182/*
183 * returns the number of breaks in the spaces array
184 * that aren't found in the noSpaces array
185 */
186inline int32_t ThaiWordbreakTest::getBreaksNotFound()
187{
188    return fBreaksNotFound;
189}
190
191/*
192 * Returns the number of breaks found in the noSpaces
193 * array that aren't in the spaces array
194 */
195inline int32_t ThaiWordbreakTest::getInvalidBreaks()
196{
197    return fInvalidBreaks;
198}
199
200/*
201 * Returns the number of words found in the spaces array
202 */
203inline int32_t ThaiWordbreakTest::getWordCount()
204{
205    return fWordCount;
206}
207
208/*
209 * This method does the acutal break comparison and reports the results.
210 * It uses a SpaceBreakIterator to iterate over the text with spaces,
211 * and a word instance of a Thai BreakIterator to iterate over the text
212 * without spaces.
213 */
214UBool ThaiWordbreakTest::compareWordBreaks(const UChar *spaces, int32_t spaceCount,
215                                           const UChar *noSpaces, int32_t noSpaceCount)
216{
217    UBool result = TRUE;
218    Locale thai("th");
219    UCharCharacterIterator *noSpaceIter = new UCharCharacterIterator(noSpaces, noSpaceCount);
220    UErrorCode status = U_ZERO_ERROR;
221
222    BreakIterator *breakIter = BreakIterator::createWordInstance(thai, status);
223    breakIter->adoptText(noSpaceIter);
224
225    SpaceBreakIterator spaceIter(spaces, spaceCount);
226
227    int32_t nextBreak = 0;
228    int32_t nextSpaceBreak = 0;
229    int32_t iterCount = 0;
230
231    while (TRUE) {
232        nextSpaceBreak = spaceIter.next();
233        nextBreak = breakIter->next();
234
235        if (nextSpaceBreak == BreakIterator::DONE || nextBreak == BreakIterator::DONE) {
236            if (nextBreak != BreakIterator::DONE) {
237                fprintf(stderr, "break iterator didn't end.\n");
238            } else if (nextSpaceBreak != BreakIterator::DONE) {
239                fprintf(stderr, "premature break iterator end.\n");
240            }
241
242            break;
243        }
244
245        while (nextSpaceBreak != nextBreak &&
246               nextSpaceBreak != BreakIterator::DONE && nextBreak != BreakIterator::DONE) {
247            if (nextSpaceBreak < nextBreak) {
248                breakNotFound(nextSpaceBreak);
249                result = FALSE;
250                nextSpaceBreak = spaceIter.next();
251            } else if (nextSpaceBreak > nextBreak) {
252                foundInvalidBreak(nextBreak);
253                result = FALSE;
254                nextBreak = breakIter->next();
255            }
256        }
257
258        if (fVerbose) {
259            printf("%d   %d\n", nextSpaceBreak, nextBreak);
260        }
261    }
262
263
264    fWordCount = spaceIter.getWordCount();
265
266    delete breakIter;
267
268    return result;
269}
270
271/*
272 * Report a break that's in the text with spaces but
273 * not found in the text without spaces.
274 */
275void ThaiWordbreakTest::breakNotFound(int32_t br)
276{
277    if (fVerbose) {
278        printf("%d   ****\n", br);
279    } else {
280        fprintf(stderr, "break not found: %d\n", br);
281    }
282
283    fBreaksNotFound += 1;
284}
285
286/*
287 * Report a break that's found in the text without spaces
288 * that isn't in the text with spaces.
289 */
290void ThaiWordbreakTest::foundInvalidBreak(int32_t br)
291{
292    if (fVerbose) {
293        printf("****   %d\n", br);
294    } else {
295        fprintf(stderr, "found invalid break: %d\n", br);
296    }
297
298    fInvalidBreaks += 1;
299}
300
301/*
302 * Read the text from a file. The text must start with a Unicode Byte
303 * Order Mark (BOM) so that we know what order to read the bytes in.
304 */
305const UChar *ThaiWordbreakTest::readFile(char *fileName, int32_t &charCount)
306{
307    FILE *f;
308    int32_t fileSize;
309
310    UChar *buffer;
311    char *bufferChars;
312
313    f = fopen(fileName, "rb");
314
315    if( f == NULL ) {
316        fprintf(stderr,"Couldn't open %s reason: %s \n", fileName, strerror(errno));
317        return 0;
318    }
319
320    fseek(f, 0, SEEK_END);
321    fileSize = ftell(f);
322
323    fseek(f, 0, SEEK_SET);
324    bufferChars = new char[fileSize];
325
326    if(bufferChars == 0) {
327        fprintf(stderr,"Couldn't get memory for reading %s reason: %s \n", fileName, strerror(errno));
328        fclose(f);
329        return 0;
330    }
331
332    fread(bufferChars, sizeof(char), fileSize, f);
333    if( ferror(f) ) {
334        fprintf(stderr,"Couldn't read %s reason: %s \n", fileName, strerror(errno));
335        fclose(f);
336        delete[] bufferChars;
337        return 0;
338    }
339    fclose(f);
340
341    UnicodeString myText(bufferChars, fileSize, "UTF-8");
342
343    delete[] bufferChars;
344
345    charCount = myText.length();
346    buffer = new UChar[charCount];
347    if(buffer == 0) {
348        fprintf(stderr,"Couldn't get memory for reading %s reason: %s \n", fileName, strerror(errno));
349        return 0;
350    }
351
352    myText.extract(1, myText.length(), buffer);
353    charCount--;  // skip the BOM
354    buffer[charCount] = 0;    // NULL terminate for easier reading in the debugger
355
356    return buffer;
357}
358
359/*
360 * Remove spaces from the input UChar array.
361 *
362 * We check explicitly for a Unicode code value of 0x0020
363 * because Unicode::isSpaceChar returns true for CR, LF, etc.
364 *
365 */
366const UChar *ThaiWordbreakTest::crunchSpaces(const UChar *spaces, int32_t count, int32_t &nonSpaceCount)
367{
368    int32_t i, out, spaceCount;
369
370    spaceCount = 0;
371    for (i = 0; i < count; i += 1) {
372        if (spaces[i] == 0x0020 /*Unicode::isSpaceChar(spaces[i])*/) {
373            spaceCount += 1;
374        }
375    }
376
377    nonSpaceCount = count - spaceCount;
378    UChar *noSpaces = new UChar[nonSpaceCount];
379
380    if (noSpaces == 0) {
381        fprintf(stderr, "Couldn't allocate memory for the space stripped text.\n");
382        return 0;
383    }
384
385    for (out = 0, i = 0; i < count; i += 1) {
386        if (spaces[i] != 0x0020 /*! Unicode::isSpaceChar(spaces[i])*/) {
387            noSpaces[out++] = spaces[i];
388        }
389    }
390
391    return noSpaces;
392}
393
394/*
395 * Generate a text file with spaces in it from a file without.
396 */
397int generateFile(const UChar *chars, int32_t length) {
398    Locale root("");
399    UCharCharacterIterator *noSpaceIter = new UCharCharacterIterator(chars, length);
400    UErrorCode status = U_ZERO_ERROR;
401
402    UnicodeSet complexContext(UNICODE_STRING_SIMPLE("[:LineBreak=SA:]"), status);
403    BreakIterator *breakIter = BreakIterator::createWordInstance(root, status);
404    breakIter->adoptText(noSpaceIter);
405    char outbuf[1024];
406    int32_t strlength;
407    UChar bom = 0xFEFF;
408
409    printf("%s", u_strToUTF8(outbuf, sizeof(outbuf), &strlength, &bom, 1, &status));
410    int32_t prevbreak = 0;
411    while (U_SUCCESS(status)) {
412        int32_t nextbreak = breakIter->next();
413        if (nextbreak == BreakIterator::DONE) {
414            break;
415        }
416        printf("%s", u_strToUTF8(outbuf, sizeof(outbuf), &strlength, &chars[prevbreak],
417                                    nextbreak-prevbreak, &status));
418        if (nextbreak > 0 && complexContext.contains(chars[nextbreak-1])
419            && complexContext.contains(chars[nextbreak])) {
420            printf(" ");
421        }
422        prevbreak = nextbreak;
423    }
424
425    if (U_FAILURE(status)) {
426        fprintf(stderr, "generate failed: %s\n", u_errorName(status));
427        return status;
428    }
429    else {
430        return 0;
431    }
432}
433
434/*
435 * The main routine. Read the command line arguments, read the text file,
436 * remove the spaces, do the comparison and report the final results
437 */
438int main(int argc, char **argv)
439{
440    char *fileName = "space.txt";
441    int arg = 1;
442    UBool verbose = FALSE;
443    UBool generate = FALSE;
444
445    if (argc >= 2 && strcmp(argv[1], "-generate") == 0) {
446        generate = TRUE;
447        arg += 1;
448    }
449
450    if (argc >= 2 && strcmp(argv[1], "-verbose") == 0) {
451        verbose = TRUE;
452        arg += 1;
453    }
454
455    if (arg == argc - 1) {
456        fileName = argv[arg++];
457    }
458
459    if (arg != argc) {
460        fprintf(stderr, "Usage: %s [-verbose] [<file>]\n", argv[0]);
461        return 1;
462    }
463
464    int32_t spaceCount, nonSpaceCount;
465    const UChar *spaces, *noSpaces;
466
467    spaces = ThaiWordbreakTest::readFile(fileName, spaceCount);
468
469    if (spaces == 0) {
470        return 1;
471    }
472
473    if (generate) {
474        return generateFile(spaces, spaceCount);
475    }
476
477    noSpaces = ThaiWordbreakTest::crunchSpaces(spaces, spaceCount, nonSpaceCount);
478
479    if (noSpaces == 0) {
480        return 1;
481    }
482
483    ThaiWordbreakTest test(spaces, spaceCount, noSpaces, nonSpaceCount, verbose);
484
485    printf("word count: %d\n", test.getWordCount());
486    printf("breaks not found: %d\n", test.getBreaksNotFound());
487    printf("invalid breaks found: %d\n", test.getInvalidBreaks());
488
489    return 0;
490}
491
492/*
493 * The main constructor. Clear all the counts and construct a default
494 * word instance of a BreakIterator.
495 */
496SpaceBreakIterator::SpaceBreakIterator(const UChar *text, int32_t count)
497  : fBreakIter(0), fText(text), fTextCount(count), fWordCount(0), fSpaceCount(0), fDone(FALSE)
498{
499    UCharCharacterIterator *iter = new UCharCharacterIterator(text, count);
500    UErrorCode status = U_ZERO_ERROR;
501    fComplexContext.applyPattern(UNICODE_STRING_SIMPLE("[:LineBreak=SA:]"), status);
502    Locale root("");
503
504    fBreakIter = BreakIterator::createWordInstance(root, status);
505    fBreakIter->adoptText(iter);
506}
507
508SpaceBreakIterator::SpaceBreakIterator()
509{
510    // nothing
511}
512
513/*
514 * The destructor. delete the underlying BreakIterator
515 */
516SpaceBreakIterator::~SpaceBreakIterator()
517{
518    delete fBreakIter;
519}
520
521/*
522 * Return the next break, counting words and spaces.
523 */
524int32_t SpaceBreakIterator::next()
525{
526    if (fDone) {
527        return BreakIterator::DONE;
528    }
529
530    int32_t nextBreak;
531    do {
532        nextBreak = fBreakIter->next();
533
534        if (nextBreak == BreakIterator::DONE) {
535            fDone = TRUE;
536            return BreakIterator::DONE;
537        }
538    }
539    while(nextBreak > 0 && fComplexContext.contains(fText[nextBreak-1])
540            && fComplexContext.contains(fText[nextBreak]));
541
542   int32_t result = nextBreak - fSpaceCount;
543
544    if (nextBreak < fTextCount) {
545        if (fText[nextBreak] == 0x0020 /*Unicode::isSpaceChar(fText[nextBreak])*/) {
546            fSpaceCount += fBreakIter->next() - nextBreak;
547        }
548    }
549
550    fWordCount += 1;
551
552    return result;
553}
554
555/*
556 * Returns the current space count
557 */
558int32_t SpaceBreakIterator::getSpaceCount()
559{
560    return fSpaceCount;
561}
562
563/*
564 * Returns the current word count
565 */
566int32_t SpaceBreakIterator::getWordCount()
567{
568    return fWordCount;
569}
570
571
572