1b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho/* 2b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho********************************************************************** 3b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho* Copyright (C) 2011-2011, International Business Machines Corporation 4b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho* and others. All Rights Reserved. 5b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho********************************************************************** 6b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho************************************************************************ 7b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho* Date Name Description 8b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho* 05/14/2011 grhoten Creation. 9b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho************************************************************************/ 10b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 11b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#include "unicode/utypes.h" 12b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 13b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#if !UCONFIG_NO_BREAK_ITERATION 14b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 15b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#include "dicttest.h" 16b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#include "textfile.h" 17b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#include "uvector.h" 18b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#include "unicode/rbbi.h" 19b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 20b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehovoid DictionaryWordTest::TestThaiBreaks() { 21b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UErrorCode status=U_ZERO_ERROR; 22b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho BreakIterator* b; 23b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho Locale locale = Locale("th"); 24b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t p, index; 25b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UChar c[]= { 26b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 0x0E01, 0x0E39, 0x0020, 0x0E01, 0x0E34, 0x0E19, 0x0E01, 0x0E38, 0x0E49, 0x0E07, 0x0020, 0x0E1B, 27b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 0x0E34, 0x0E49, 0x0E48, 0x0E07, 0x0E2D, 0x0E22, 0x0E39, 0x0E48, 0x0E43, 0x0E19, 28b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 0x0E16, 0x0E49, 0x0E33, 0x0000 29b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho }; 30b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t expectedWordResult[] = { 31b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 2, 3, 6, 10, 11, 15, 17, 20, 22 32b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho }; 33b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t expectedLineResult[] = { 34b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 3, 6, 11, 15, 17, 20, 22 35b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho }; 36b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 37b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t size = u_strlen(c); 38b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UnicodeString text=UnicodeString(c); 39b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 40b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho b = BreakIterator::createWordInstance(locale, status); 41b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (U_FAILURE(status)) { 42b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho errcheckln(status, "Unable to create thai word break iterator. - %s", u_errorName(status)); 43b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return; 44b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 45b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho b->setText(text); 46b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho p = index = 0; 47b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho while ((p=b->next())!=BreakIterator::DONE && p < size) { 48b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (p != expectedWordResult[index++]) { 49b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho errln("Incorrect break given by thai word break iterator. Expected: %d Got: %d", expectedWordResult[index-1], p); 50b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 51b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 52b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho delete b; 53b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 54b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho b = BreakIterator::createLineInstance(locale, status); 55b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (U_FAILURE(status)) { 56b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho errln("Unable to create thai line break iterator."); 57b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return; 58b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 59b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho b->setText(text); 60b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho p = index = 0; 61b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho while ((p=b->next())!=BreakIterator::DONE && p < size) { 62b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (p != expectedLineResult[index++]) { 63b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho errln("Incorrect break given by thai line break iterator. Expected: %d Got: %d", expectedLineResult[index-1], p); 64b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 65b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 66b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 67b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho delete b; 68b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho} 69b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 70b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#define DICTIONARY_TEST_FILE "wordsegments.txt" 71b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 72b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehovoid DictionaryWordTest::TestWordBoundaries() { 73b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UErrorCode status = U_ZERO_ERROR; 74b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 75b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho TextFile phrases(DICTIONARY_TEST_FILE, "UTF8", status); 76b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (U_FAILURE(status)) { 77b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho dataerrln("Can't open "DICTIONARY_TEST_FILE": %s; skipping test", 78b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho u_errorName(status)); 79b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return; 80b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 81b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 82b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Due to how the word break iterator works, 83b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // scripts for languages that use no spaces should use the correct dictionary by default. 84b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho BreakIterator *wb = BreakIterator::createWordInstance("en", status); 85b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (U_FAILURE(status)) { 86b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho dataerrln("Word break iterator can not be opened: %s; skipping test", 87b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho u_errorName(status)); 88b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return; 89b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 90b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 91b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t pos, pIdx; 92b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t testLines = 0; 93b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UnicodeString phrase; 94b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho while (phrases.readLineSkippingComments(phrase, status, FALSE) && U_SUCCESS(status)) { 95b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UVector breaks(status); 96b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 97b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho for (pIdx = 0; pIdx < phrase.length(); pIdx++) { 98b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (phrase.charAt(pIdx) == 0x007C /* | */) { 99b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho breaks.addElement(pIdx, status); 100b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho phrase.remove(pIdx, 1); 101b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 102b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 103b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho breaks.addElement(pIdx, status); 104b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 105b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho wb->setText(phrase); 106b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t brkArrPos = 0; 107b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho while ((pos=wb->next())!=BreakIterator::DONE) { 108b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t expectedPos = breaks.elementAti(brkArrPos); 109b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (expectedPos != pos) { 110b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho errln("Incorrect forward word break on line %d. Expected: %d Got: %d", 111b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho phrases.getLineNumber(), breaks.elementAt(brkArrPos), pos); 112b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 113b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho brkArrPos++; 114b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 115b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho brkArrPos = breaks.size() - 1; 116b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho while ((pos=wb->previous())!=BreakIterator::DONE) { 117b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho brkArrPos--; 118b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t expectedPos = breaks.elementAti(brkArrPos); 119b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (expectedPos != pos) { 120b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho errln("Incorrect backward word break on line %d. Expected: %d Got: %d", 121b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho phrases.getLineNumber(), breaks.elementAt(brkArrPos), pos); 122b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 123b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 124b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho testLines++; 125b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 126b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho delete wb; 127b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho logln("%d tests were run.", testLines); 128b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho} 129b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 130b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehovoid DictionaryWordTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par */) 131b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho{ 132b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (exec) logln("TestSuite DictionaryWordTest: "); 133b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho TESTCASE_AUTO_BEGIN; 134b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho TESTCASE_AUTO(TestThaiBreaks); 135b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho TESTCASE_AUTO(TestWordBoundaries); 136b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho TESTCASE_AUTO_END; 137b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho} 138b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 139b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 140b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#endif 141