1b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho/*
2b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho**********************************************************************
3b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho* Copyright (C) 2011-2011, International Business Machines Corporation
4b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho* and others.  All Rights Reserved.
5b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho**********************************************************************
6b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho************************************************************************
7b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho*   Date          Name        Description
8b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho*   05/14/2011    grhoten     Creation.
9b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho************************************************************************/
10b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
11b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#include "unicode/utypes.h"
12b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
13b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#if !UCONFIG_NO_BREAK_ITERATION
14b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
15b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#include "dicttest.h"
16b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#include "textfile.h"
17b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#include "uvector.h"
18b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#include "unicode/rbbi.h"
19b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
20b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehovoid DictionaryWordTest::TestThaiBreaks() {
21b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    UErrorCode status=U_ZERO_ERROR;
22b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    BreakIterator* b;
23b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    Locale locale = Locale("th");
24b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    int32_t p, index;
25b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    UChar c[]= {
26b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            0x0E01, 0x0E39, 0x0020, 0x0E01, 0x0E34, 0x0E19, 0x0E01, 0x0E38, 0x0E49, 0x0E07, 0x0020, 0x0E1B,
27b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            0x0E34, 0x0E49, 0x0E48, 0x0E07, 0x0E2D, 0x0E22, 0x0E39, 0x0E48, 0x0E43, 0x0E19,
28b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            0x0E16, 0x0E49, 0x0E33, 0x0000
29b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    };
30b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    int32_t expectedWordResult[] = {
31b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            2, 3, 6, 10, 11, 15, 17, 20, 22
32b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    };
33b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    int32_t expectedLineResult[] = {
34b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            3, 6, 11, 15, 17, 20, 22
35b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    };
36b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
37b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    int32_t size = u_strlen(c);
38b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    UnicodeString text=UnicodeString(c);
39b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
40b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    b = BreakIterator::createWordInstance(locale, status);
41b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    if (U_FAILURE(status)) {
42b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        errcheckln(status, "Unable to create thai word break iterator. - %s", u_errorName(status));
43b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        return;
44b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    }
45b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    b->setText(text);
46b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    p = index = 0;
47b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    while ((p=b->next())!=BreakIterator::DONE && p < size) {
48b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        if (p != expectedWordResult[index++]) {
49b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            errln("Incorrect break given by thai word break iterator. Expected: %d  Got: %d", expectedWordResult[index-1], p);
50b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        }
51b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    }
52b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    delete b;
53b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
54b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    b = BreakIterator::createLineInstance(locale, status);
55b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    if (U_FAILURE(status)) {
56b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        errln("Unable to create thai line break iterator.");
57b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        return;
58b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    }
59b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    b->setText(text);
60b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    p = index = 0;
61b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    while ((p=b->next())!=BreakIterator::DONE && p < size) {
62b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        if (p != expectedLineResult[index++]) {
63b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            errln("Incorrect break given by thai line break iterator. Expected: %d  Got: %d", expectedLineResult[index-1], p);
64b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        }
65b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    }
66b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
67b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    delete b;
68b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho}
69b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
70b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#define DICTIONARY_TEST_FILE "wordsegments.txt"
71b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
72b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehovoid DictionaryWordTest::TestWordBoundaries() {
73b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    UErrorCode      status  = U_ZERO_ERROR;
74b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
75b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    TextFile phrases(DICTIONARY_TEST_FILE, "UTF8", status);
76b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    if (U_FAILURE(status)) {
77b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        dataerrln("Can't open "DICTIONARY_TEST_FILE": %s; skipping test",
78b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho              u_errorName(status));
79b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        return;
80b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    }
81b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
82b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    // Due to how the word break iterator works,
83b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    // scripts for languages that use no spaces should use the correct dictionary by default.
84b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    BreakIterator *wb = BreakIterator::createWordInstance("en", status);
85b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    if (U_FAILURE(status)) {
86b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        dataerrln("Word break iterator can not be opened: %s; skipping test",
87b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho              u_errorName(status));
88b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        return;
89b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    }
90b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
91b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    int32_t pos, pIdx;
92b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    int32_t testLines = 0;
93b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    UnicodeString phrase;
94b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    while (phrases.readLineSkippingComments(phrase, status, FALSE) && U_SUCCESS(status)) {
95b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        UVector breaks(status);
96b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
97b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        for (pIdx = 0; pIdx < phrase.length(); pIdx++) {
98b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            if (phrase.charAt(pIdx) == 0x007C /* | */) {
99b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                breaks.addElement(pIdx, status);
100b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                phrase.remove(pIdx, 1);
101b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            }
102b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        }
103b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        breaks.addElement(pIdx, status);
104b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
105b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        wb->setText(phrase);
106b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        int32_t brkArrPos = 0;
107b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        while ((pos=wb->next())!=BreakIterator::DONE) {
108b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            int32_t expectedPos = breaks.elementAti(brkArrPos);
109b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            if (expectedPos != pos) {
110b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                errln("Incorrect forward word break on line %d. Expected: %d  Got: %d",
111b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                    phrases.getLineNumber(), breaks.elementAt(brkArrPos), pos);
112b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            }
113b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            brkArrPos++;
114b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        }
115b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        brkArrPos = breaks.size() - 1;
116b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        while ((pos=wb->previous())!=BreakIterator::DONE) {
117b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            brkArrPos--;
118b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            int32_t expectedPos = breaks.elementAti(brkArrPos);
119b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            if (expectedPos != pos) {
120b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                errln("Incorrect backward word break on line %d. Expected: %d  Got: %d",
121b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                    phrases.getLineNumber(), breaks.elementAt(brkArrPos), pos);
122b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            }
123b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        }
124b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        testLines++;
125b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    }
126b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    delete wb;
127b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    logln("%d tests were run.", testLines);
128b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho}
129b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
130b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehovoid DictionaryWordTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par */)
131b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho{
132b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    if (exec) logln("TestSuite DictionaryWordTest: ");
133b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    TESTCASE_AUTO_BEGIN;
134b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    TESTCASE_AUTO(TestThaiBreaks);
135b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    TESTCASE_AUTO(TestWordBoundaries);
136b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    TESTCASE_AUTO_END;
137b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho}
138b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
139b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
140b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#endif
141